def poll_task_jobs(self, suite, itasks, poll_succ=True, msg=None):
    """Poll jobs of specified tasks.

    Any job that is or was submitted or running can be polled, except for
    retrying tasks - which would poll (correctly) as failed. And don't poll
    succeeded tasks by default.

    This method uses _poll_task_jobs_callback() and
    _manip_task_jobs_callback() as help/callback methods.

    _poll_task_job_callback() executes one specific job.
    """
    # Statuses whose jobs can meaningfully be polled.
    pollable_statuses = {
        TASK_STATUS_SUBMITTED, TASK_STATUS_RUNNING, TASK_STATUS_FAILED}
    if poll_succ:
        pollable_statuses.add(TASK_STATUS_SUCCEEDED)
    to_poll_tasks = []
    for itask in itasks:
        if itask.state.status not in pollable_statuses:
            LOG.debug("skipping %s: not pollable, "
                      "or skipping 'succeeded' tasks" % itask.identity)
            continue
        to_poll_tasks.append(itask)
    if not to_poll_tasks:
        return
    if msg is not None:
        LOG.info(msg)
    self._run_job_cmd(
        self.JOBS_POLL, suite, to_poll_tasks,
        self._poll_task_jobs_callback)
def _prep_submit_task_job_error(self, suite, itask, dry_run, action, exc):
    """Helper for self._prep_submit_task_job. On error.

    Log the failure, record it in the task job activity log and (unless a
    dry run) persist a job entry and raise a submit-failed task event.
    """
    LOG.debug("submit_num %s" % itask.submit_num)
    LOG.debug(traceback.format_exc())
    LOG.error(exc)
    # Record the failed preparation action in the job activity log.
    log_task_job_activity(SubProcContext(self.JOBS_SUBMIT, action, err=exc,
                                         ret_code=1),
                          suite, itask.point, itask.tdef.name,
                          submit_num=itask.submit_num)
    if not dry_run:
        # Persist
        self.suite_db_mgr.put_insert_task_jobs(
            itask, {
                'is_manual_submit': itask.is_manual_submit,
                'try_num': itask.get_try_num(),
                'time_submit': get_current_time_string(),
                'batch_sys_name': itask.summary.get('batch_sys_name'),
            })
        # The manual-submit flag is consumed by this (failed) attempt.
        itask.is_manual_submit = False
        self.task_events_mgr.process_message(
            itask, CRITICAL, self.task_events_mgr.EVENT_SUBMIT_FAILED)
def _set_state(self, status):
    """Set, log and record task status (normal change, not forced - don't
    update task_events table).

    hold_swap remembers the status a held task would otherwise have (or,
    for an active task, that it should return to the held pool); the
    branches below maintain that invariant across status changes.

    Returns (prev_status, prev_hold_swap), or None if nothing changed.
    """
    if self.status == self.hold_swap:
        # Degenerate pair (e.g. held "held"): drop the swap status.
        self.hold_swap = None
    if status == self.status and self.hold_swap is None:
        # No-op: nothing to change, nothing to log.
        return
    prev_status, prev_hold_swap = self.status, self.hold_swap
    if status == TASK_STATUS_HELD:
        # Becoming held: remember what we were doing before.
        self.hold_swap = self.status
    elif status in TASK_STATUSES_ACTIVE:
        if self.status == TASK_STATUS_HELD:
            # Held task went active anyway: flag return-to-held on finish.
            self.hold_swap = TASK_STATUS_HELD
    elif (self.hold_swap == TASK_STATUS_HELD and
            status not in TASK_STATUSES_FINAL):
        # Active-and-to-be-held task reached a non-final status:
        # actually hold it now, remembering the target status.
        self.hold_swap = status
        status = TASK_STATUS_HELD
    elif self.hold_swap:
        # Any other transition invalidates the remembered swap status.
        self.hold_swap = None
    self.status = status
    self.time_updated = get_current_time_string()
    self.is_updated = True
    # Log
    message = str(prev_status)
    if prev_hold_swap:
        message += " (%s)" % prev_hold_swap
    message += " => %s" % self.status
    if self.hold_swap:
        message += " (%s)" % self.hold_swap
    LOG.debug("[%s] -%s", self.identity, message)
    return (prev_status, prev_hold_swap)
def _check_access_priv_and_report(
        self, required_privilege_level, log_info=True):
    """Check access privilege and log requests with identifying info.

    In debug mode log all requests including task messages. Otherwise log
    all user commands, and just the first info command from each client.

    Return:
        dict: containing the client session
    """
    # Raises on insufficient privilege (behaviour of _check_access_priv).
    self._check_access_priv(required_privilege_level)
    # Name of the calling server method, used as the command name in logs.
    command = inspect.currentframe().f_back.f_code.co_name
    auth_user, prog_name, user, host, uuid = _get_client_info()
    priv_level = self._get_priv_level(auth_user)
    LOG.debug(self.LOG_CONNECT_ALLOWED_TMPL, user, host, prog_name,
              priv_level, uuid)
    # NOTE(review): 'and' binds tighter than 'or', so this reads as
    # "debug or (new client and log_info)" - i.e. in debug mode requests
    # are logged even when log_info=False. Confirm this is intended.
    if cylc.flags.debug or uuid not in self.clients and log_info:
        LOG.info(self.LOG_COMMAND_TMPL, command, user, host, prog_name,
                 uuid)
    # Create/refresh the client session entry.
    self.clients.setdefault(uuid, {})
    self.clients[uuid]['time'] = time()
    self._housekeep()
    return self.clients[uuid]
def _forget_client(self, uuid):
    """Forget a client.

    Remove its session entry and detach any per-client error log handler.
    Return True if the client was known, else False.
    """
    info = self.clients.pop(uuid, None)
    if info is None:
        return False
    handler = info.get('err_log_handler')
    if handler is not None:
        LOG.removeHandler(handler)
    LOG.debug(self.LOG_FORGET_TMPL, uuid)
    return True
def _manip_task_jobs_callback(
        self, ctx, suite, itasks, summary_callback, more_callbacks=None):
    """Callback when submit/poll/kill tasks command exits.

    Parse the command output line by line, dispatching each prefixed line
    to its handler; any task present in the command but absent from the
    output is reported as a failure via summary_callback.
    """
    if ctx.ret_code:
        LOG.error(ctx)
    else:
        LOG.debug(ctx)
    # A dict for easy reference of (CYCLE, NAME, SUBMIT_NUM) -> TaskProxy
    #
    # Note for "reload": A TaskProxy instance may be replaced on reload, so
    # the "itasks" list may not reference the TaskProxy objects that
    # replace the old ones. The .reload_successor attribute provides the
    # link(s) for us to get to the latest replacement.
    #
    # Note for "kill": It is possible for a job to trigger its trap and
    # report back to the suite back this logic is called. If so, the task
    # will no longer be TASK_STATUS_SUBMITTED or TASK_STATUS_RUNNING, and
    # its output line will be ignored here.
    tasks = {}
    for itask in itasks:
        while itask.reload_successor is not None:
            itask = itask.reload_successor
        if itask.point is not None and itask.submit_num:
            submit_num = "%02d" % (itask.submit_num)
            tasks[(str(itask.point), itask.tdef.name, submit_num)] = itask
    handlers = [(self.batch_sys_mgr.OUT_PREFIX_SUMMARY, summary_callback)]
    if more_callbacks:
        for prefix, callback in more_callbacks.items():
            handlers.append((prefix, callback))
    out = ctx.out
    if not out:
        out = ""
    # Tasks that have not (yet) had a summary line in the output.
    bad_tasks = dict(tasks)
    for line in out.splitlines(True):
        for prefix, callback in handlers:
            if line.startswith(prefix):
                line = line[len(prefix):].strip()
                try:
                    path = line.split("|", 2)[1]  # timestamp, path, status
                    point, name, submit_num = path.split(os.sep, 2)
                    if prefix == self.batch_sys_mgr.OUT_PREFIX_SUMMARY:
                        del bad_tasks[(point, name, submit_num)]
                    itask = tasks[(point, name, submit_num)]
                    callback(suite, itask, ctx, line)
                # KeyError is a LookupError subclass, so LookupError
                # already covers the dict lookups above.
                except (LookupError, ValueError) as exc:
                    LOG.warning(
                        'Unhandled %s output: %s', ctx.cmd_key, line)
                    LOG.exception(exc)
    # Task jobs that are in the original command but did not get a status
    # in the output. Handle as failures.
    for key, itask in sorted(bad_tasks.items()):
        line = (
            "|".join([ctx.timestamp, os.sep.join(key), "1"]) + "\n")
        summary_callback(suite, itask, ctx, line)
def _remote_host_select_callback(self, proc_ctx, cmd_str):
    """Callback when host select command exits"""
    self.ready = True
    if proc_ctx.ret_code == 0 and proc_ctx.out:
        # Good status: first line of output is the selected host name.
        LOG.debug(proc_ctx)
        selected = proc_ctx.out.splitlines()[0]
        self.remote_host_str_map[cmd_str] = selected
        return
    # Bad status: record an error object against the command string.
    LOG.error(proc_ctx)
    self.remote_host_str_map[cmd_str] = TaskRemoteMgmtError(
        TaskRemoteMgmtError.MSG_SELECT, (cmd_str, None), cmd_str,
        proc_ctx.ret_code, proc_ctx.out, proc_ctx.err)
async def async_request(self, command, args=None, timeout=None):
    """Send an asynchronous request using asyncio.

    Has the same arguments and return values as ``serial_request``.
    """
    if timeout:
        timeout = float(timeout)
    # Convert seconds to milliseconds (zmq poller units); fall back to the
    # client default when no timeout was given.
    timeout = (timeout * 1000 if timeout else None) or self.timeout
    if not args:
        args = {}
    # get secret for this request
    # assumes secret won't change during the request
    try:
        secret = self.secret()
    except cylc.suite_srv_files_mgr.SuiteServiceFileError:
        raise ClientError('could not read suite passphrase')
    # send message
    msg = {'command': command, 'args': args}
    msg.update(self.header)
    LOG.debug('zmq:send %s' % msg)
    message = encrypt(msg, secret)
    self.socket.send_string(message)
    # receive response; poll so we can time out instead of blocking
    if self.poller.poll(timeout):
        res = await self.socket.recv_string()
    else:
        if self.timeout_handler:
            self.timeout_handler()
        raise ClientTimeout('Timeout waiting for server response.')
    try:
        response = decrypt(res, secret)
        LOG.debug('zmq:recv %s' % response)
    except jose.exceptions.JWTError:
        raise ClientError(
            'Could not decrypt response. Has the passphrase changed?')
    # A well-formed reply carries either 'data' (success) or 'error'.
    try:
        return response['data']
    except KeyError:
        error = response['error']
        raise ClientError(error['message'], error.get('traceback'))
def callback(self, ctx):
    """Callback for asynchronous xtrigger functions.

    Record satisfaction status and function results dict.
    """
    LOG.debug(ctx)
    sig = ctx.get_signature()
    # The call has returned, so it is no longer active.
    self.active.remove(sig)
    try:
        satisfied, results = json.loads(ctx.out)
    except (ValueError, TypeError):
        # Unparseable or absent output (e.g. ctx.out is None):
        # treat as not satisfied this time round.
        return
    LOG.debug('%s: returned %s' % (sig, results))
    if satisfied:
        # pflag: presumably prompts re-evaluation of dependencies -
        # confirm against the consumer of this flag.
        self.pflag = True
        self.sat_xtrig[sig] = results
def _dump_item(path, item, value):
    """Dump "value" to a file called "item" in the directory "path".

    "value" may be text (encoded as UTF-8 before writing) or bytes.

    1. File permission should already be user-read-write-only on creation
       by NamedTemporaryFile.
    2. The combination of os.fsync and os.rename should guarantee that we
       don't end up with an incomplete file.
    """
    mkdir_p(path)
    from tempfile import NamedTemporaryFile
    handle = NamedTemporaryFile(prefix=item, dir=path, delete=False)
    try:
        # NamedTemporaryFile is opened in binary mode by default, so text
        # values must be encoded first (writing str raises TypeError).
        handle.write(value.encode())
    except AttributeError:
        # Already bytes.
        handle.write(value)
    os.fsync(handle.fileno())
    handle.close()
    fname = os.path.join(path, item)
    # Atomic replace of any previous version of the file.
    os.rename(handle.name, fname)
    LOG.debug('Generated %s', fname)
def callback(self, ctx):
    """Callback for asynchronous xtrigger functions.

    Record satisfaction status and function results dict.
    """
    LOG.debug(ctx)
    sig = ctx.get_signature()
    # The call has returned, so it is no longer active.
    self.active.remove(sig)
    try:
        satisfied, results = json.loads(ctx.out)
    except (ValueError, TypeError):
        # Also catch TypeError: ctx.out may be None when the function
        # produced no output, and json.loads(None) raises TypeError.
        # Treat as not satisfied this time round.
        return
    LOG.debug('%s: returned %s' % (sig, results))
    if satisfied:
        self.pflag = True
        self.sat_xtrig[sig] = results
def _run_command_init(cls, ctx, callback=None, callback_args=None):
    """Prepare and launch shell command in ctx.

    Build a stdin file object from ctx.cmd_kwargs ('stdin_files' or
    'stdin_str'), then launch the command. Return the process object, or
    None (after invoking the exit callback) if the launch fails.
    """
    try:
        if ctx.cmd_kwargs.get('stdin_files'):
            if len(ctx.cmd_kwargs['stdin_files']) > 1:
                # Concatenate multiple inputs into one temporary file.
                stdin_file = TemporaryFile()
                for file_ in ctx.cmd_kwargs['stdin_files']:
                    if hasattr(file_, 'read'):
                        stdin_file.write(file_.read())
                    else:
                        stdin_file.write(open(file_, 'rb').read())
                stdin_file.seek(0)
            elif hasattr(ctx.cmd_kwargs['stdin_files'][0], 'read'):
                # Already a file-like object; use it directly.
                stdin_file = ctx.cmd_kwargs['stdin_files'][0]
            else:
                stdin_file = open(
                    ctx.cmd_kwargs['stdin_files'][0], 'rb')
        elif ctx.cmd_kwargs.get('stdin_str'):
            # Text input: write encoded bytes to a binary temporary file.
            stdin_file = TemporaryFile('bw+')
            stdin_file.write(ctx.cmd_kwargs.get('stdin_str').encode())
            stdin_file.seek(0)
        else:
            stdin_file = open(os.devnull)
        proc = procopen(
            ctx.cmd, stdin=stdin_file, stdoutpipe=True, stderrpipe=True,
            # Execute command as a process group leader,
            # so we can use "os.killpg" to kill the whole group.
            preexec_fn=os.setpgrp,
            env=ctx.cmd_kwargs.get('env'),
            usesh=ctx.cmd_kwargs.get('shell'))
        # calls to open a shell are aggregated in cylc_subproc.procopen()
        # with logging for what is calling it and the commands given
    except (IOError, OSError) as exc:
        if exc.filename is None:
            exc.filename = ctx.cmd[0]
        LOG.exception(exc)
        ctx.ret_code = 1
        ctx.err = str(exc)
        # Report the failure through the normal exit path.
        cls._run_command_exit(ctx, callback, callback_args)
        return None
    else:
        LOG.debug(ctx.cmd)
        return proc
def _manip_task_jobs_callback(
        self, ctx, suite, itasks, summary_callback, more_callbacks=None):
    """Callback when submit/poll/kill tasks command exits.

    Parse command output line by line, dispatching each prefixed line to
    its handler. If the command produced no output at all, synthesise
    failure summary lines from the "job_log_dirs" command argument.
    """
    if ctx.ret_code:
        LOG.error(ctx)
    else:
        LOG.debug(ctx)
    tasks = {}
    # Note for "kill": It is possible for a job to trigger its trap and
    # report back to the suite back this logic is called. If so, the task
    # will no longer be TASK_STATUS_SUBMITTED or TASK_STATUS_RUNNING, and
    # its output line will be ignored here.
    for itask in itasks:
        if itask.point is not None and itask.submit_num:
            submit_num = "%02d" % (itask.submit_num)
            tasks[(str(itask.point), itask.tdef.name, submit_num)] = itask
    handlers = [(self.batch_sys_mgr.OUT_PREFIX_SUMMARY, summary_callback)]
    if more_callbacks:
        for prefix, callback in more_callbacks.items():
            handlers.append((prefix, callback))
    out = ctx.out
    if not out:
        out = ""
        # Something is very wrong here
        # Fallback to use "job_log_dirs" list to report the problem
        job_log_dirs = ctx.cmd_kwargs.get("job_log_dirs", [])
        for job_log_dir in job_log_dirs:
            point, name, submit_num = job_log_dir.split(os.sep, 2)
            # NOTE(review): this lookup is not guarded - a job_log_dir
            # with no matching task would raise KeyError here; the bound
            # itask is also unused (the synthesised line is re-parsed
            # below). Confirm intended.
            itask = tasks[(point, name, submit_num)]
            out += (self.batch_sys_mgr.OUT_PREFIX_SUMMARY +
                    "|".join([ctx.timestamp, job_log_dir, "1"]) + "\n")
    for line in out.splitlines(True):
        for prefix, callback in handlers:
            if line.startswith(prefix):
                line = line[len(prefix):].strip()
                try:
                    path = line.split("|", 2)[1]  # timestamp, path, status
                    point, name, submit_num = path.split(os.sep, 2)
                    itask = tasks[(point, name, submit_num)]
                    callback(suite, itask, ctx, line)
                except (LookupError, ValueError) as exc:
                    LOG.warning(
                        'Unhandled %s output: %s', ctx.cmd_key, line)
                    LOG.exception(exc)
def _prep_submit_task_job_error(self, suite, itask, dry_run, action, exc):
    """Helper for self._prep_submit_task_job. On error."""
    LOG.debug("submit_num %s" % itask.submit_num)
    LOG.debug(traceback.format_exc())
    LOG.error(exc)
    # Record the failed preparation action in the job activity log.
    err_ctx = SubProcContext(self.JOBS_SUBMIT, action, err=exc, ret_code=1)
    log_task_job_activity(
        err_ctx, suite, itask.point, itask.tdef.name,
        submit_num=itask.submit_num)
    if dry_run:
        return
    # Persist
    job_row = {
        'is_manual_submit': itask.is_manual_submit,
        'try_num': itask.get_try_num(),
        'time_submit': get_current_time_string(),
        'batch_sys_name': itask.summary.get('batch_sys_name'),
    }
    self.suite_db_mgr.put_insert_task_jobs(itask, job_row)
    itask.is_manual_submit = False
    self.task_events_mgr.process_message(
        itask, CRITICAL, self.task_events_mgr.EVENT_SUBMIT_FAILED)
def _report_id_requests(self):
    """Report the frequency of identification (scan) requests."""
    self._num_id_requests += 1
    now = time()
    interval = now - self._id_start_time
    if interval > self.CLIENT_ID_REPORT_SECONDS:
        # Warn if the request rate over the window exceeds the threshold,
        # otherwise record it at debug level, then reset the window.
        rate = float(self._num_id_requests) / interval
        log = (
            LOG.warning if rate > self.CLIENT_ID_MIN_REPORT_RATE
            else LOG.debug)
        log(self.LOG_IDENTIFY_TMPL, self._num_id_requests, interval)
        self._id_start_time = now
        self._num_id_requests = 0
    uuid = _get_client_info()[4]
    self.clients.setdefault(uuid, {})
    self.clients[uuid]['time'] = now
    self._housekeep()
def load(self): """Load or reload configuration from files.""" self.sparse.clear() self.dense.clear() LOG.debug("Loading site/user global config files") conf_path_str = os.getenv("CYLC_CONF_PATH") if conf_path_str is None: # CYLC_CONF_PATH not defined, use default locations. for conf_dir_1, conf_dir_2, conf_type in [ (self.SITE_CONF_DIR, self.SITE_CONF_DIR_OLD, upgrader.SITE_CONFIG), (self.USER_CONF_DIR_1, self.USER_CONF_DIR_2, upgrader.USER_CONFIG) ]: fname1 = os.path.join(conf_dir_1, self.CONF_BASE) fname2 = os.path.join(conf_dir_2, self.CONF_BASE) if os.access(fname1, os.F_OK | os.R_OK): fname = fname1 elif os.access(fname2, os.F_OK | os.R_OK): fname = fname2 else: continue try: self.loadcfg(fname, conf_type) except ParsecError as exc: if conf_type == upgrader.SITE_CONFIG: # Warn on bad site file (users can't fix it). LOG.warning('ignoring bad %s %s:\n%s', conf_type, fname, exc) else: # Abort on bad user file (users can fix it). LOG.error('bad %s %s', conf_type, fname) raise break elif conf_path_str: # CYLC_CONF_PATH defined with a value for path in conf_path_str.split(os.pathsep): fname = os.path.join(path, self.CONF_BASE) if os.access(fname, os.F_OK | os.R_OK): self.loadcfg(fname, upgrader.USER_CONFIG) # (OK if no global.rc is found, just use system defaults). self.transform()
def _dump_item(path, item, value):
    """Dump "value" to a file called "item" in the directory "path".

    1. File permission should already be user-read-write-only on creation
       by mkstemp.
    2. The combination of os.fsync and os.rename should guarantee that we
       don't end up with an incomplete file.
    """
    os.makedirs(path, exist_ok=True)
    from tempfile import NamedTemporaryFile
    tmp = NamedTemporaryFile(prefix=item, dir=path, delete=False)
    # Accept text (encode first) or bytes (write as-is).
    try:
        data = value.encode()
    except AttributeError:
        data = value
    tmp.write(data)
    os.fsync(tmp.fileno())
    tmp.close()
    target = os.path.join(path, item)
    # Atomic replace of any previous version of the file.
    os.rename(tmp.name, target)
    LOG.debug('Generated %s', target)
def load(self): """Load or reload configuration from files.""" self.sparse.clear() self.dense.clear() LOG.debug("Loading site/user global config files") conf_path_str = os.getenv("CYLC_CONF_PATH") if conf_path_str is None: # CYLC_CONF_PATH not defined, use default locations. for conf_dir_1, conf_dir_2, conf_type in [ (self.SITE_CONF_DIR, self.SITE_CONF_DIR_OLD, upgrader.SITE_CONFIG), (self.USER_CONF_DIR_1, self.USER_CONF_DIR_2, upgrader.USER_CONFIG)]: fname1 = os.path.join(conf_dir_1, self.CONF_BASE) fname2 = os.path.join(conf_dir_2, self.CONF_BASE) if os.access(fname1, os.F_OK | os.R_OK): fname = fname1 elif os.access(fname2, os.F_OK | os.R_OK): fname = fname2 else: continue try: self.loadcfg(fname, conf_type) except ParsecError as exc: if conf_type == upgrader.SITE_CONFIG: # Warn on bad site file (users can't fix it). LOG.warning( 'ignoring bad %s %s:\n%s', conf_type, fname, exc) else: # Abort on bad user file (users can fix it). LOG.error('bad %s %s', conf_type, fname) raise break elif conf_path_str: # CYLC_CONF_PATH defined with a value for path in conf_path_str.split(os.pathsep): fname = os.path.join(path, self.CONF_BASE) if os.access(fname, os.F_OK | os.R_OK): self.loadcfg(fname, upgrader.USER_CONFIG) # (OK if no global.rc is found, just use system defaults). self.transform()
def _remote_init_callback(self, proc_ctx, host, owner, tmphandle):
    """Callback when "cylc remote-init" exits.

    Record the remote initialisation outcome for (host, owner) in
    self.remote_init_map.
    """
    self.ready = True
    try:
        # The temporary tar handle is no longer needed.
        tmphandle.close()
    except OSError:  # E.g. ignore bad unlink, etc
        pass
    if proc_ctx.ret_code == 0:
        # The command reports its status on stdout.
        for status in (REMOTE_INIT_DONE, REMOTE_INIT_NOT_REQUIRED):
            if status in proc_ctx.out:
                # Good status
                LOG.debug(proc_ctx)
                self.remote_init_map[(host, owner)] = status
                return
    # Bad status (non-zero return code, or no recognised status string).
    LOG.error(TaskRemoteMgmtError(
        TaskRemoteMgmtError.MSG_INIT,
        (host, owner), ' '.join(quote(item) for item in proc_ctx.cmd),
        proc_ctx.ret_code, proc_ctx.out, proc_ctx.err))
    LOG.error(proc_ctx)
    self.remote_init_map[(host, owner)] = REMOTE_INIT_FAILED
def _remote_init_callback(self, proc_ctx, host, owner, tmphandle):
    """Callback when "cylc remote-init" exits"""
    self.ready = True
    try:
        tmphandle.close()
    except OSError:  # E.g. ignore bad unlink, etc
        pass
    if proc_ctx.ret_code == 0:
        # The command reports its status on stdout; pick the first
        # recognised status string, if any.
        status = next(
            (item for item in (REMOTE_INIT_DONE, REMOTE_INIT_NOT_REQUIRED)
             if item in proc_ctx.out),
            None)
        if status is not None:
            # Good status
            LOG.debug(proc_ctx)
            self.remote_init_map[(host, owner)] = status
            return
    # Bad status
    LOG.error(TaskRemoteMgmtError(
        TaskRemoteMgmtError.MSG_INIT,
        (host, owner), ' '.join(quote(item) for item in proc_ctx.cmd),
        proc_ctx.ret_code, proc_ctx.out, proc_ctx.err))
    LOG.error(proc_ctx)
    self.remote_init_map[(host, owner)] = REMOTE_INIT_FAILED
def _rank_good_hosts(self, all_host_stats):
    """Rank, by specified method, 'good' hosts to return the most suitable.

    Take a dictionary of hosts considered 'good' with the corresponding
    metric data, and rank them via the method specified in the global
    configuration, returning the lowest-ranked (taken as best) host.
    """
    # Reduce each host's full metrics structure to the single value used
    # by the configured rank method.
    rank_values = {
        host: metric[self.rank_method]
        for host, metric in all_host_stats.items()}
    LOG.debug(
        "INFO: host %s values extracted are: %s",
        self.rank_method,
        "\n".join(" %s: %s" % item for item in rank_values.items()))
    # Hosts in ascending order of their metric value.
    ranked_hosts = sorted(rank_values, key=rank_values.get)
    base_msg = ("good (metric-returning) hosts were ranked in the "
                "following order, from most to least suitable: %s")
    if self.rank_method in ("memory", "disk-space:" + self.USE_DISK_PATH):
        # Want 'most free' i.e. highest => best is last of ascending list.
        LOG.debug(base_msg, ', '.join(reversed(ranked_hosts)))
        return ranked_hosts[-1]
    # A load av. is only poss. left; 'random' dealt with earlier.
    # Want lowest => ranking given by asc. list.
    LOG.debug(base_msg, ', '.join(ranked_hosts))
    return ranked_hosts[0]
def _listener(self): """The server main loop, listen for and serve requests.""" while True: # process any commands passed to the listener by its parent process if self.queue.qsize(): command = self.queue.get() if command == 'STOP': break else: raise ValueError('Unknown command "%s"' % command) try: # wait RECV_TIMEOUT for a message msg = self.socket.recv_string() except zmq.error.Again: # timeout, continue with the loop, this allows the listener # thread to stop continue # attempt to decode the message, authenticating the user in the # process try: message = self.decode(msg, self.secret()) except Exception as exc: # purposefully catch generic exception # failed to decode message, possibly resulting from failed # authentication import traceback return {'error': { 'message': str(exc), 'traceback': traceback.format_exc()}} else: # success case - serve the request LOG.debug('zmq:recv %s', message) res = self._receiver(message) response = self.encode(res, self.secret()) LOG.debug('zmq:send %s', res) # send back the response self.socket.send_string(response) sleep(0) # yield control to other threads
def _run_command_init(cls, ctx, callback=None, callback_args=None):
    """Prepare and launch shell command in ctx.

    Build a stdin file object from ctx.cmd_kwargs ('stdin_file_paths' or
    'stdin_str'), then launch the command. Return the process object, or
    None (after invoking the exit callback) if the launch fails.
    """
    try:
        if ctx.cmd_kwargs.get('stdin_file_paths'):
            if len(ctx.cmd_kwargs['stdin_file_paths']) > 1:
                # Concatenate multiple inputs into one temporary file.
                stdin_file = TemporaryFile()
                for file_path in ctx.cmd_kwargs['stdin_file_paths']:
                    stdin_file.write(open(file_path, 'rb').read())
                stdin_file.seek(0)
            else:
                stdin_file = open(
                    ctx.cmd_kwargs['stdin_file_paths'][0], 'rb')
        elif ctx.cmd_kwargs.get('stdin_str'):
            # NOTE(review): TemporaryFile defaults to binary mode, so
            # writing a str here fails on Python 3 - presumably
            # Python 2 era code; compare the variant that encodes first.
            stdin_file = TemporaryFile()
            stdin_file.write(ctx.cmd_kwargs.get('stdin_str'))
            stdin_file.seek(0)
        else:
            stdin_file = open(os.devnull)
        proc = Popen(
            ctx.cmd, stdin=stdin_file, stdout=PIPE, stderr=PIPE,
            # Execute command as a process group leader,
            # so we can use "os.killpg" to kill the whole group.
            preexec_fn=os.setpgrp,
            env=ctx.cmd_kwargs.get('env'),
            shell=ctx.cmd_kwargs.get('shell'))
    except (IOError, OSError) as exc:
        if exc.filename is None:
            exc.filename = ctx.cmd[0]
        LOG.exception(exc)
        ctx.ret_code = 1
        ctx.err = str(exc)
        # Report the failure through the normal exit path.
        cls._run_command_exit(ctx, callback, callback_args)
        return None
    else:
        LOG.debug(ctx.cmd)
        return proc
def log_task_job_activity(ctx, suite, point, name, submit_num=None):
    """Log an activity for a task job."""
    ctx_str = str(ctx)
    if not ctx_str:
        # Nothing to record.
        return
    if isinstance(ctx.cmd_key, tuple):
        # An event handler: submit number is embedded in the command key.
        submit_num = ctx.cmd_key[-1]
    job_activity_log = get_task_job_activity_log(
        suite, point, name, submit_num)
    record = (ctx_str + '\n').encode()
    try:
        with open(job_activity_log, "ab") as handle:
            handle.write(record)
    except IOError as exc:
        # This happens when there is no job directory, e.g. if job host
        # selection command causes an submission failure, there will be no
        # job directory. In this case, just send the information to the
        # suite log.
        LOG.exception(exc)
        LOG.info(ctx_str)
    if ctx.cmd:
        if ctx.ret_code:
            LOG.error(ctx_str)
        else:
            LOG.debug(ctx_str)
def _listener(self): """The server main loop, listen for and serve requests.""" while True: # process any commands passed to the listner by its parent process if self.queue.qsize(): command = self.queue.get() if command == 'STOP': break else: raise ValueError('Unknown command "%s"' % command) try: # wait RECV_TIMEOUT for a message msg = self.socket.recv_string() except zmq.error.Again: # timeout, continue with the loop, this allows the listener # thread to stop continue # attempt to decode the message, authenticating the user in the # process try: message = self.decode(msg, self.secret()) except Exception as exc: # purposefully catch generic exception # failed to decode message, possibly resulting from failed # authentication response = self.encode({'error': { 'message': str(exc) }}, self.secret()) else: # success case - serve the request LOG.debug('zmq:recv %s', message) res = self._receiver(message) response = self.encode(res, self.secret()) LOG.debug('zmq:send %s', res) # send back the response self.socket.send_string(response) sleep(0) # yield control to other threads
def log_task_job_activity(ctx, suite, point, name, submit_num=None):
    """Log an activity for a task job."""
    ctx_str = str(ctx)
    if not ctx_str:
        # Nothing to record.
        return
    if isinstance(ctx.cmd_key, tuple):
        # An event handler: submit number is embedded in the command key.
        submit_num = ctx.cmd_key[-1]
    job_activity_log = get_task_job_activity_log(suite, point, name,
                                                 submit_num)
    try:
        with open(job_activity_log, "ab") as handle:
            # The log is opened in binary mode, so the record must be
            # encoded: writing str here raises TypeError on Python 3,
            # which the IOError handler below would not catch.
            handle.write((ctx_str + '\n').encode())
    except IOError as exc:
        # This happens when there is no job directory, e.g. if job host
        # selection command causes an submission failure, there will be no
        # job directory. In this case, just send the information to the
        # suite log.
        LOG.exception(exc)
        LOG.info(ctx_str)
    if ctx.cmd and ctx.ret_code:
        LOG.error(ctx_str)
    elif ctx.cmd:
        LOG.debug(ctx_str)
def create_cylc_run_tree(self, suite):
    """Create all top-level cylc-run output dirs on the suite host."""
    config = self.get()
    label = 'suite run directory'
    run_dir = self.get_derived_host_item(suite, label)
    LOG.debug('creating %s: %s', label, run_dir)
    if config['enable run directory housekeeping']:
        # Rotate old run directories rather than reusing them.
        self.roll_directory(
            run_dir, label,
            config['run directory rolling archive length'])
    for label in ('suite log directory', 'suite job log directory',
                  'suite config log directory', 'suite work directory',
                  'suite share directory'):
        dir_path = self.get_derived_host_item(suite, label)
        LOG.debug('creating %s: %s', label, dir_path)
        self.create_directory(dir_path, label)
    label = 'temporary directory'
    tmp_dir = config[label]
    if tmp_dir:
        self.create_directory(tmp_dir, label)
def _job_logs_retrieval_callback(self, proc_ctx, schd_ctx):
    """Call back when log job retrieval completes.

    For each job in the retrieval batch, check whether the expected log
    files now exist locally, record the outcome in the job activity log,
    and clear or re-queue the corresponding event timer.
    """
    if proc_ctx.ret_code:
        LOG.error(proc_ctx)
    else:
        LOG.debug(proc_ctx)
    for id_key in proc_ctx.cmd_kwargs["id_keys"]:
        key1, point, name, submit_num = id_key
        try:
            # All completed jobs are expected to have a "job.out".
            fnames = [JOB_LOG_OUT]
            try:
                # NOTE(review): 'not in' on a str is a substring test,
                # not equality (e.g. 'eed' would match 'succeeded') -
                # presumably intended as an equality check; confirm.
                if key1[1] not in 'succeeded':
                    fnames.append(JOB_LOG_ERR)
            except TypeError:
                # key1 not subscriptable (or item not comparable): only
                # expect "job.out".
                pass
            fname_oks = {}
            for fname in fnames:
                fname_oks[fname] = os.path.exists(
                    get_task_job_log(schd_ctx.suite, point, name,
                                     submit_num, fname))
            # All expected paths must exist to record a good attempt
            log_ctx = SubProcContext((key1, submit_num), None)
            if all(fname_oks.values()):
                log_ctx.ret_code = 0
                # Retrieval done: drop the event timer.
                del self.event_timers[id_key]
            else:
                log_ctx.ret_code = 1
                log_ctx.err = "File(s) not retrieved:"
                for fname, exist_ok in sorted(fname_oks.items()):
                    if not exist_ok:
                        log_ctx.err += " %s" % fname
                # Allow the timer to be requeued for a retry.
                self.event_timers[id_key].unset_waiting()
            log_task_job_activity(log_ctx, schd_ctx.suite, point, name,
                                  submit_num)
        except KeyError as exc:
            LOG.exception(exc)
def _job_logs_retrieval_callback(self, proc_ctx, schd_ctx):
    """Call back when log job retrieval completes.

    For each job in the retrieval batch, check whether the expected log
    files now exist locally, record the outcome in the job activity log,
    and clear or re-queue the corresponding event timer.
    """
    if proc_ctx.ret_code:
        LOG.error(proc_ctx)
    else:
        LOG.debug(proc_ctx)
    for id_key in proc_ctx.cmd_kwargs["id_keys"]:
        key1, point, name, submit_num = id_key
        try:
            # All completed jobs are expected to have a "job.out".
            fnames = [JOB_LOG_OUT]
            try:
                # NOTE(review): 'not in' on a str is a substring test,
                # not equality (e.g. 'eed' would match 'succeeded') -
                # presumably intended as an equality check; confirm.
                if key1[1] not in 'succeeded':
                    fnames.append(JOB_LOG_ERR)
            except TypeError:
                # key1 not subscriptable (or item not comparable): only
                # expect "job.out".
                pass
            fname_oks = {}
            for fname in fnames:
                fname_oks[fname] = os.path.exists(get_task_job_log(
                    schd_ctx.suite, point, name, submit_num, fname))
            # All expected paths must exist to record a good attempt
            log_ctx = SubProcContext((key1, submit_num), None)
            if all(fname_oks.values()):
                log_ctx.ret_code = 0
                # Retrieval done: drop the event timer.
                del self.event_timers[id_key]
            else:
                log_ctx.ret_code = 1
                log_ctx.err = "File(s) not retrieved:"
                for fname, exist_ok in sorted(fname_oks.items()):
                    if not exist_ok:
                        log_ctx.err += " %s" % fname
                # Allow the timer to be requeued for a retry.
                self.event_timers[id_key].unset_waiting()
            log_task_job_activity(
                log_ctx, schd_ctx.suite, point, name, submit_num)
        except KeyError as exc:
            LOG.exception(exc)
def create_cylc_run_tree(self, suite):
    """Create all top-level cylc-run output dirs on the suite host."""
    cfg = self.get()
    item = 'suite run directory'
    idir = self.get_derived_host_item(suite, item)
    LOG.debug('creating %s: %s', item, idir)
    if cfg['enable run directory housekeeping']:
        # Rotate old run directories rather than reusing them.
        self.roll_directory(
            idir, item, cfg['run directory rolling archive length'])
    # Standard per-suite output directories.
    for item in [
            'suite log directory', 'suite job log directory',
            'suite config log directory', 'suite work directory',
            'suite share directory']:
        idir = self.get_derived_host_item(suite, item)
        LOG.debug('creating %s: %s', item, idir)
        self.create_directory(idir, item)
    # Optional configured temporary directory.
    item = 'temporary directory'
    value = cfg[item]
    if value:
        self.create_directory(value, item)
def _prep_submit_task_job(self, suite, itask, dry_run, check_syntax=True):
    """Prepare a task job submission.

    Return itask on a good preparation; None while waiting for host
    select; False on a preparation error (already reported).
    """
    if itask.local_job_file_path and not dry_run:
        # Job file already written for this submission.
        return itask
    # Handle broadcasts
    overrides = self.task_events_mgr.broadcast_mgr.get_broadcast(
        itask.identity)
    if overrides:
        # Apply broadcast settings over a copy of the task runtime config.
        rtconfig = pdeepcopy(itask.tdef.rtconfig)
        poverride(rtconfig, overrides, prepend=True)
    else:
        rtconfig = itask.tdef.rtconfig
    # Determine task host settings now, just before job submission,
    # because dynamic host selection may be used.
    try:
        task_host = self.task_remote_mgr.remote_host_select(
            rtconfig['remote']['host'])
    except TaskRemoteMgmtError as exc:
        # Submit number not yet incremented
        itask.submit_num += 1
        itask.summary['job_hosts'][itask.submit_num] = ''
        # Retry delays, needed for the try_num
        self._set_retry_timers(itask, rtconfig)
        self._prep_submit_task_job_error(
            suite, itask, dry_run, '(remote host select)', exc)
        return False
    else:
        if task_host is None:  # host select not ready
            itask.set_summary_message(self.REMOTE_SELECT_MSG)
            return
        itask.task_host = task_host
        # Submit number not yet incremented
        itask.submit_num += 1
        # Retry delays, needed for the try_num
        self._set_retry_timers(itask, rtconfig)
    try:
        job_conf = self._prep_submit_task_job_impl(suite, itask, rtconfig)
        local_job_file_path = get_task_job_job_log(
            suite, itask.point, itask.tdef.name, itask.submit_num)
        self.job_file_writer.write(local_job_file_path, job_conf,
                                   check_syntax=check_syntax)
    except Exception as exc:
        # Could be a bad command template, IOError, etc
        self._prep_submit_task_job_error(
            suite, itask, dry_run, '(prepare job file)', exc)
        return False
    itask.local_job_file_path = local_job_file_path
    if dry_run:
        itask.set_summary_message('job file written (edit/dry-run)')
        LOG.debug('[%s] -%s', itask, itask.summary['latest_message'])
    # Return value used by "cylc submit" and "cylc jobscript":
    return itask
def submit_task_jobs(self, suite, itasks, is_simulation=False):
    """Prepare and submit task jobs.

    Submit tasks where possible. Ignore tasks that are waiting for host
    select command to complete, or tasks that are waiting for remote
    initialisation. Bad host select command, error writing to a job file
    or bad remote initialisation will cause a bad task - leading to
    submission failure.

    This method uses prep_submit_task_job() as helper.

    Return (list): list of tasks that attempted submission.
    """
    if is_simulation:
        return self._simulation_submit_task_jobs(itasks)
    # Prepare tasks for job submission
    prepared_tasks, bad_tasks = self.prep_submit_task_jobs(suite, itasks)
    # Reset consumed host selection results
    self.task_remote_mgr.remote_host_select_reset()
    if not prepared_tasks:
        return bad_tasks
    # Group task jobs by (host, owner)
    auth_itasks = {}  # {(host, owner): [itask, ...], ...}
    for itask in prepared_tasks:
        auth_itasks.setdefault((itask.task_host, itask.task_owner), [])
        auth_itasks[(itask.task_host, itask.task_owner)].append(itask)
    # Submit task jobs for each (host, owner) group
    done_tasks = bad_tasks
    for (host, owner), itasks in sorted(auth_itasks.items()):
        is_init = self.task_remote_mgr.remote_init(host, owner)
        if is_init is None:
            # Remote is waiting to be initialised
            for itask in itasks:
                itask.set_summary_message(self.REMOTE_INIT_MSG)
            continue
        # Ensure that localhost background/at jobs are recorded as running
        # on the host name of the current suite host, rather than just
        # "localhost". On suite restart on a different suite host, this
        # allows the restart logic to correctly poll the status of the
        # background/at jobs that may still be running on the previous
        # suite host.
        # NOTE(review): 'itask' here is the leftover loop variable from
        # the preceding for loop, i.e. only one task's batch_sys_name is
        # inspected for the whole group - confirm intended.
        if (
            self.batch_sys_mgr.is_job_local_to_host(
                itask.summary['batch_sys_name'])
            and not is_remote_host(host)
        ):
            owner_at_host = get_host()
        else:
            owner_at_host = host
        # Persist
        if owner:
            owner_at_host = owner + '@' + owner_at_host
        now_str = get_current_time_string()
        done_tasks.extend(itasks)
        for itask in itasks:
            # Log and persist
            LOG.info(
                '[%s] -submit-num=%d, owner@host=%s',
                itask, itask.submit_num, owner_at_host)
            self.suite_db_mgr.put_insert_task_jobs(itask, {
                'is_manual_submit': itask.is_manual_submit,
                'try_num': itask.get_try_num(),
                'time_submit': now_str,
                'user_at_host': owner_at_host,
                'batch_sys_name': itask.summary['batch_sys_name'],
            })
            itask.is_manual_submit = False
        if is_init == REMOTE_INIT_FAILED:
            # Remote has failed to initialise
            # Set submit-failed for all affected tasks
            for itask in itasks:
                itask.local_job_file_path = None  # reset for retry
                log_task_job_activity(
                    SubProcContext(
                        self.JOBS_SUBMIT,
                        '(init %s)' % owner_at_host,
                        err=REMOTE_INIT_FAILED,
                        ret_code=1),
                    suite, itask.point, itask.tdef.name)
                self.task_events_mgr.process_message(
                    itask, CRITICAL,
                    self.task_events_mgr.EVENT_SUBMIT_FAILED)
            continue
        # Build the "cylc jobs-submit" command
        cmd = ['cylc', self.JOBS_SUBMIT]
        if LOG.isEnabledFor(DEBUG):
            cmd.append('--debug')
        if get_utc_mode():
            cmd.append('--utc-mode')
        remote_mode = False
        kwargs = {}
        # Pass --host/--user only where the target is actually remote.
        for key, value, test_func in [
                ('host', host, is_remote_host),
                ('user', owner, is_remote_user)]:
            if test_func(value):
                cmd.append('--%s=%s' % (key, value))
                remote_mode = True
                kwargs[key] = value
        if remote_mode:
            cmd.append('--remote-mode')
        cmd.append('--')
        cmd.append(glbl_cfg().get_derived_host_item(
            suite, 'suite job log directory', host, owner))
        # Chop itasks into a series of shorter lists if it's very big
        # to prevent overloading of stdout and stderr pipes.
        itasks = sorted(itasks, key=lambda itask: itask.identity)
        # At most ~100 tasks per batch.
        chunk_size = len(itasks) // ((len(itasks) // 100) + 1) + 1
        itasks_batches = [
            itasks[i:i + chunk_size]
            for i in range(0, len(itasks), chunk_size)]
        LOG.debug(
            '%s ... # will invoke in batches, sizes=%s',
            cmd, [len(b) for b in itasks_batches])
        for i, itasks_batch in enumerate(itasks_batches):
            stdin_files = []
            job_log_dirs = []
            for itask in itasks_batch:
                if remote_mode:
                    # Job scripts are piped to the remote via stdin.
                    stdin_files.append(
                        get_task_job_job_log(
                            suite, itask.point, itask.tdef.name,
                            itask.submit_num))
                job_log_dirs.append(get_task_job_id(
                    itask.point, itask.tdef.name, itask.submit_num))
                # The job file is now (about to be) used: reset the file
                # write flag so that subsequent manual retrigger will
                # generate a new job file.
                itask.local_job_file_path = None
                itask.state.reset_state(TASK_STATUS_READY)
                if itask.state.outputs.has_custom_triggers():
                    self.suite_db_mgr.put_update_task_outputs(itask)
            self.proc_pool.put_command(
                SubProcContext(
                    self.JOBS_SUBMIT,
                    cmd + job_log_dirs,
                    stdin_files=stdin_files,
                    job_log_dirs=job_log_dirs,
                    **kwargs
                ),
                self._submit_task_jobs_callback, [suite, itasks_batch])
    return done_tasks
def _load_remote_item(self, item, reg, owner, host):
    """Load content of service item from remote [owner@]host via SSH.

    Args:
        item (str): base name of the service file to fetch
            (e.g. contact file or passphrase).
        reg (str): suite name.
        owner (str): remote user name, or None for the current user.
        host (str): remote host name, or None for localhost.

    Returns:
        str: content of the item, or None if it cannot be read.
    """
    # Nothing to do unless the target really is remote.
    if not is_remote(host, owner):
        return
    if host is None:
        host = 'localhost'
    if owner is None:
        owner = get_user()
    if item == self.FILE_BASE_CONTACT and not is_remote_host(host):
        # Attempt to read suite contact file via the local filesystem.
        path = r'%(run_d)s/%(srv_base)s' % {
            'run_d': glbl_cfg().get_derived_host_item(
                reg, 'suite run directory', 'localhost', owner,
                replace_home=False),
            'srv_base': self.DIR_BASE_SRV,
        }
        content = self._load_local_item(item, path)
        if content is not None:
            return content
        # Else drop through and attempt via ssh to the suite account.
    # Prefix STDOUT to ensure returned content is relevant
    prefix = r'[CYLC-AUTH] %(suite)s' % {'suite': reg}
    # Attempt to cat passphrase file under suite service directory
    script = (
        r"""echo '%(prefix)s'; """
        r'''cat "%(run_d)s/%(srv_base)s/%(item)s"'''
    ) % {
        'prefix': prefix,
        'run_d': glbl_cfg().get_derived_host_item(
            reg, 'suite run directory', host, owner),
        'srv_base': self.DIR_BASE_SRV,
        'item': item
    }
    import shlex
    command = shlex.split(
        glbl_cfg().get_host_item('ssh command', host, owner))
    # '-n' prevents ssh from consuming this process's stdin.
    command += ['-n', owner + '@' + host, script]
    from subprocess import Popen, PIPE
    try:
        proc = Popen(
            command, stdin=open(os.devnull), stdout=PIPE, stderr=PIPE)
    except OSError:
        # ssh executable missing or not runnable: treat as "not found".
        if cylc.flags.debug:
            import traceback
            traceback.print_exc()
        return
    out, err = (f.decode() for f in proc.communicate())
    ret_code = proc.wait()
    # Extract passphrase from STDOUT.
    # It should live in the line(s) after the one with the correct prefix;
    # anything before the prefix line (login banners, etc.) is discarded.
    content = ""
    can_read = False
    for line in out.splitlines(True):
        if can_read:
            content += line
        elif line.strip() == prefix:
            can_read = True
    if not content or ret_code:
        LOG.debug(
            '$ %(command)s  # code=%(ret_code)s\n%(err)s',
            {
                'command': command,
                # STDOUT may contain passphrase, so not safe to print
                # 'out': out,
                'err': err,
                'ret_code': ret_code,
            })
        return
    return content
def cylc_kafka_consumer(kafka_server, kafka_topic, group_id, message, debug):
    r"""Look for a matching message in a Kafka topic.

    ARGUMENTS:
     * kafka_server - Kafka server URL, e.g. "localhost:9092".
     * kafka_topic - the Kafka topic to check, e.g. "data-avail".
     * group_id - determines Kafka offset ownership (see below).
     * message - string-ified dict with optional pattern elements (see below).
     * debug - boolean; set by daemon debug mode; prints to suite err log.

    The topic is consumed from the previous committed offset (initially from
    the beginning).  If no match is found by end of topic, the offset is
    committed and (False, {}) returned so the caller can try again later.

    Kafka commits offsets per "consumer group", so group_id must be unique
    per distinct trigger in the suite - otherwise one trigger could move the
    shared offset beyond the messages of another.  The same applies to
    successive instances of an external-triggered cycling task, so a typical
    value is:
       group_id=x%(id)s  # id ID of the dependent task
    where "x" is an arbitrary string that can be changed to re-consume the
    topic from the start without re-running the producer suite.

    The "message" argument is a stringified dict, e.g.:
        {'system': 'prod', 'point': '2025', 'data': '<nwp.*\.nc>'}
    represented as:
        "system:prod point:2025 data:<nwp.*\.nc>"

    A match occurs if all message dict items match a Kafka message, and the
    returned sub-dict holds the actual values of any items whose patterns
    were angle-bracket-delineated regexes, e.g. above {'data': 'nwp-2025.nc'}.
    """
    consumer = KafkaConsumer(
        kafka_topic,
        bootstrap_servers=[kafka_server],
        value_deserializer=json.loads,
        consumer_timeout_ms=CONSUMER_TIMEOUT_MS,
        auto_offset_reset='earliest',
        group_id=group_id)
    # Parse "key1:val1 key2:val2 ..." into a dict of items to match.
    cylc_msg = dict(pair.split(':') for pair in message.split())
    result = (False, {})
    n_cons = 0
    for kafka_msg in consumer:
        n_cons += 1
        matched = _match_msg(cylc_msg, kafka_msg)
        if not matched:
            # Consume and compare the next message.
            continue
        result = (True, matched)
        break
    consumer.commit()
    # Unsubscribe before exit, otherwise next call will be slow while
    # Kafka times out waiting for this original consumer connection.
    consumer.unsubscribe()
    if debug:
        if result[0]:
            res = "\n MATCHED: %s" % result[1]
        else:
            res = "no match."
        LOG.debug('Kafka: "%s" (consumed %d) ... %s', message, n_cons, res)
    return result
def detect_old_contact_file(self, reg, check_host_port=None):
    """Detect old suite contact file.

    If an old contact file does not exist, do nothing. If one does exist
    but the suite process is definitely not alive, remove it. If one exists
    and the suite process is still alive, raise SuiteServiceFileError.

    If check_host_port is specified and does not match the (host, port)
    value in the old contact file, raise AssertionError.

    Args:
        reg (str): suite name
        check_host_port (tuple): (host, port) to check against

    Raise:
        AssertionError:
            If old contact file exists but does not have matching
            (host, port) with value of check_host_port.
        SuiteServiceFileError:
            If old contact file exists and the suite process still alive.
    """
    # An old suite of the same name may be running if a contact file exists
    # and can be loaded.
    try:
        data = self.load_contact_file(reg)
        old_host = data[self.KEY_HOST]
        old_port = data[self.KEY_PORT]
        old_proc_str = data[self.KEY_PROCESS]
    except (IOError, ValueError, SuiteServiceFileError):
        # Contact file does not exist or corrupted, should be OK to proceed
        return
    if check_host_port and check_host_port != (old_host, int(old_port)):
        raise AssertionError("%s != (%s, %s)" % (
            check_host_port, old_host, old_port))
    # Run the "ps" command to see if the process is still running or not.
    # If the old suite process is still running, it should show up with the
    # same command line as before.
    # Terminate command after 10 seconds to prevent hanging, etc.
    old_pid_str = old_proc_str.split(None, 1)[0].strip()
    cmd = ["timeout", "10", "ps", self.PS_OPTS, str(old_pid_str)]
    if is_remote_host(old_host):
        import shlex
        ssh_str = str(glbl_cfg().get_host_item("ssh command", old_host))
        cmd = shlex.split(ssh_str) + ["-n", old_host] + cmd
    from subprocess import Popen, PIPE
    from time import sleep, time
    proc = Popen(cmd, stdin=open(os.devnull), stdout=PIPE, stderr=PIPE)
    # Terminate command after 10 seconds to prevent hanging SSH, etc.
    timeout = time() + 10.0
    terminated = False
    while proc.poll() is None:
        if time() > timeout and not terminated:
            # Only send SIGTERM once.
            proc.terminate()
            terminated = True
        sleep(0.1)
    fname = self.get_contact_file(reg)
    # BUG FIX: drain the PIPEs with communicate() *before* reading the
    # return code.  The old code called proc.wait() first, which can
    # deadlock if the child fills a pipe buffer before exiting (see the
    # subprocess module documentation).
    out, err = (f.decode() for f in proc.communicate())
    ret_code = proc.returncode
    if ret_code:
        LOG.debug("$ %s  # return %d\n%s", ' '.join(cmd), ret_code, err)
    for line in reversed(out.splitlines()):
        if line.strip() == old_proc_str:
            # Suite definitely still running
            break
        elif line.split(None, 1)[0].strip() == "PID":
            # Only "ps" header - "ps" has run, but no matching results.
            # Suite not running. Attempt to remove suite contact file.
            try:
                os.unlink(fname)
                return
            except OSError:
                break
    raise SuiteServiceFileError(
        (
            r"""suite contact file exists: %(fname)s

Suite "%(suite)s" is already running, and listening at "%(host)s:%(port)s".

To start a new run, stop the old one first with one or more of these:
* cylc stop %(suite)s              # wait for active tasks/event handlers
* cylc stop --kill %(suite)s       # kill active tasks and wait
* cylc stop --now %(suite)s        # don't wait for active tasks
* cylc stop --now --now %(suite)s  # don't wait
* ssh -n "%(host)s" kill %(pid)s   # final brute force!
"""
        ) % {
            "host": old_host,
            "port": old_port,
            "pid": old_pid_str,
            "fname": fname,
            "suite": reg,
        }
    )
async def async_request(self, command, args=None, timeout=None):
    """Send a request.

    For convenience use __call__ to call this method.

    Args:
        command (str): The name of the endpoint to call.
        args (dict): Arguments to pass to the endpoint function.
        timeout (float): Override the default timeout (seconds).

    Raises:
        ClientTimeout: If a response takes longer than timeout to arrive.
        ClientError: Coverall for all other issues including failed auth.

    Returns:
        object: The data exactly as returned from the endpoint function,
            nothing more, nothing less.
    """
    if timeout:
        timeout = float(timeout)
    # zmq poller timeouts are in milliseconds; fall back to the default.
    timeout = (timeout * 1000 if timeout else None) or self.timeout
    if not args:
        args = {}

    # get secret for this request
    # assumes secret won't change during the request
    try:
        secret = self.secret()
    except cylc.suite_srv_files_mgr.SuiteServiceFileError:
        raise ClientError({'message': 'could not read suite passphrase'})

    # send message
    msg = {'command': command, 'args': args}
    msg.update(self.header)
    LOG.debug('zmq:send %s' % msg)
    message = encrypt(msg, secret)
    # NOTE(review): on a zmq.asyncio socket, send_string() returns an
    # awaitable; it is not awaited here - confirm the send is guaranteed
    # to complete before the poll below.
    self.socket.send_string(message)

    # receive response
    # NOTE(review): likewise, poller.poll() may need to be awaited when
    # using the asyncio poller - confirm against the pyzmq API in use.
    if self.poller.poll(timeout):
        res = await self.socket.recv_string()
    else:
        if self.timeout_handler:
            self.timeout_handler()
        raise ClientTimeout('Timeout waiting for server response.')

    try:
        response = decrypt(res, secret)
        LOG.debug('zmq:recv %s' % response)
    except jose.exceptions.JWTError:
        # Wrong/stale passphrase: the JWT layer cannot decode the payload.
        raise ClientError({
            'message': 'Could not decrypt response. Has the passphrase ' +
                       'changed?'
        })

    # return data or handle error
    if 'data' in response:
        return response['data']
    else:  # if else to avoid complicating the traceback stack
        raise ClientError(response['error'])
def process_message(self, itask, severity, message, event_time=None,
                    flag='', submit_num=None):
    """Parse an incoming task message and update task state.

    Incoming, e.g. "succeeded at <TIME>", may be from task job or polling.

    It is possible for my current state to be inconsistent with an incoming
    message (whether normal or polled) e.g. due to a late poll result, or a
    network outage, or manual state reset. To handle this, if a message
    would take the task state backward, issue a poll to confirm instead of
    changing state - then always believe the next message. Note that the
    next message might not be the result of this confirmation poll, in the
    unlikely event that a job emits a succession of messages very quickly,
    but this is the best we can do without somehow uniquely associating
    each poll with its result message.

    Arguments:
        itask (cylc.task_proxy.TaskProxy):
            The task proxy object relevant for the message.
        severity (str or int):
            Message severity, should be a recognised logging level.
        message (str):
            Message content.
        event_time (str):
            Event time stamp. Expect ISO8601 date time string.
            If not specified, use current time.
        flag (str):
            If specified, can be INCOMING_FLAG to indicate an incoming
            message, POLLED_FLAG to indicate a message resulted from a
            poll. Otherwise, the message is assumed to be generated by the
            logic in the suite server program.
        submit_num (int):
            The submit number of the task relevant for the message.
            If not specified, use latest submit number.

    Return:
        None: in normal circumstances.
        True: if polling is required to confirm a reversal of status.

    """
    # Log incoming messages
    if event_time is None:
        event_time = get_current_time_string()
    if submit_num is None:
        submit_num = itask.submit_num
    # Messages for a superseded submit are ignored (e.g. a late message
    # from a job that has since been retriggered).
    if flag == self.INCOMING_FLAG and submit_num != itask.submit_num:
        flag = self.IGNORED_INCOMING_FLAG
    LOG.log(
        self.LEVELS.get(severity, INFO),
        r'[%s] -(current:%s)%s %s at %s',
        itask, itask.state.status, flag, message, event_time)
    if flag == self.IGNORED_INCOMING_FLAG:
        LOG.warning(
            '[%s] -submit-num=%02d: ignore message from job(%02d)',
            itask, itask.submit_num, submit_num)
        return

    # always update the suite state summary for latest message
    if flag == self.POLLED_FLAG:
        itask.set_summary_message('%s %s' % (message, self.POLLED_FLAG))
    else:
        itask.set_summary_message(message)

    # Satisfy my output, if possible, and record the result.
    completed_trigger = itask.state.outputs.set_msg_trg_completion(
        message=message, is_completed=True)

    # For each status-changing message: if the message is incoming and the
    # task is already past that status, return True to request a
    # confirmation poll instead of moving the state backward.
    if message == TASK_OUTPUT_STARTED:
        if (flag == self.INCOMING_FLAG
                and itask.state.is_gt(TASK_STATUS_RUNNING)):
            return True
        self._process_message_started(itask, event_time)
    elif message == TASK_OUTPUT_SUCCEEDED:
        # "succeeded" is never a backward transition - no poll check.
        self._process_message_succeeded(itask, event_time)
    elif message == TASK_OUTPUT_FAILED:
        if (flag == self.INCOMING_FLAG
                and itask.state.is_gt(TASK_STATUS_FAILED)):
            return True
        self._process_message_failed(itask, event_time, self.JOB_FAILED)
    elif message == self.EVENT_SUBMIT_FAILED:
        if (flag == self.INCOMING_FLAG
                and itask.state.is_gt(TASK_STATUS_SUBMIT_FAILED)):
            return True
        self._process_message_submit_failed(itask, event_time)
    elif message == TASK_OUTPUT_SUBMITTED:
        if (flag == self.INCOMING_FLAG
                and itask.state.is_gt(TASK_STATUS_SUBMITTED)):
            return True
        self._process_message_submitted(itask, event_time)
    elif message.startswith(FAIL_MESSAGE_PREFIX):
        # Task received signal.
        if (flag == self.INCOMING_FLAG
                and itask.state.is_gt(TASK_STATUS_FAILED)):
            return True
        signal = message[len(FAIL_MESSAGE_PREFIX):]
        self._db_events_insert(itask, "signaled", signal)
        self.suite_db_mgr.put_update_task_jobs(
            itask, {"run_signal": signal})
        self._process_message_failed(itask, event_time, self.JOB_FAILED)
    elif message.startswith(ABORT_MESSAGE_PREFIX):
        # Task aborted with message
        if (flag == self.INCOMING_FLAG
                and itask.state.is_gt(TASK_STATUS_FAILED)):
            return True
        aborted_with = message[len(ABORT_MESSAGE_PREFIX):]
        self._db_events_insert(itask, "aborted", message)
        self.suite_db_mgr.put_update_task_jobs(
            itask, {"run_signal": aborted_with})
        self._process_message_failed(itask, event_time, aborted_with)
    elif message.startswith(VACATION_MESSAGE_PREFIX):
        # Task job pre-empted into a vacation state
        self._db_events_insert(itask, "vacated", message)
        itask.set_summary_time('started')  # unset
        if TASK_STATUS_SUBMIT_RETRYING in itask.try_timers:
            itask.try_timers[TASK_STATUS_SUBMIT_RETRYING].num = 0
        itask.job_vacated = True
        # Believe this and change state without polling (could poll?).
        self.pflag = True
        itask.state.reset_state(TASK_STATUS_SUBMITTED)
        self._reset_job_timers(itask)
        # We should really have a special 'vacated' handler, but given that
        # this feature can only be used on the deprecated loadleveler
        # system, we should probably aim to remove support for job vacation
        # instead. Otherwise, we should have:
        # self.setup_event_handlers(itask, 'vacated', message)
    elif completed_trigger:
        # Message of an as-yet unreported custom task output.
        # No state change.
        self.pflag = True
        self.suite_db_mgr.put_update_task_outputs(itask)
        self.setup_event_handlers(itask, completed_trigger, message)
    else:
        # Unhandled messages. These include:
        #  * general non-output/progress messages
        #  * poll messages that repeat previous results
        # Note that all messages are logged already at the top.
        # No state change.
        LOG.debug(
            '[%s] -(current:%s) unhandled: %s',
            itask, itask.state.status, message)
        if severity in [CRITICAL, ERROR, WARNING, INFO, DEBUG]:
            # Numeric logging level -> its name, for the event record.
            severity = getLevelName(severity)
        self._db_events_insert(
            itask, ("message %s" % str(severity).lower()), message)
        lseverity = str(severity).lower()
        if lseverity in self.NON_UNIQUE_EVENTS:
            # Count repeatable events (e.g. warnings) and run handlers.
            itask.non_unique_events.setdefault(lseverity, 0)
            itask.non_unique_events[lseverity] += 1
            self.setup_event_handlers(itask, lseverity, message)
def stop(self):
    """Finish serving the current request then stop the server."""
    LOG.debug('stopping zmq server...')
    # Signal the listener loop to exit, then block until its thread
    # actually returns.
    self.queue.put('STOP')
    self.thread.join()
    LOG.debug('...stopped')
def _setup_custom_event_handlers(self, itask, event, message):
    """Set up custom task event handlers.

    Queue a TaskActionTimer per configured handler command for "event",
    unless one is already queued for the same (handler, task, submit)
    key.  Handler commands may be %-format templates, or classic
    4-argument commands (event, suite, task-id, message).
    """
    handlers = self._get_events_conf(itask, event + ' handler')
    # Fall back to the generic "handlers" list if this event is listed in
    # "handler events" but has no specific handler.
    if (handlers is None and
            event in self._get_events_conf(itask, 'handler events', [])):
        handlers = self._get_events_conf(itask, 'handlers')
    if handlers is None:
        return
    retry_delays = self._get_events_conf(
        itask,
        'handler retry delays',
        self.get_host_conf(itask, "task event handler retry delays"))
    if not retry_delays:
        retry_delays = [0]
    # There can be multiple custom event handlers
    for i, handler in enumerate(handlers):
        if event in self.NON_UNIQUE_EVENTS:
            # Repeatable events get a per-occurrence key suffix so each
            # occurrence runs its handlers.
            key1 = (
                '%s-%02d' % (self.HANDLER_CUSTOM, i),
                '%s-%d' % (event, itask.non_unique_events.get(event, 1)))
        else:
            key1 = ('%s-%02d' % (self.HANDLER_CUSTOM, i), event)
        id_key = (
            key1, str(itask.point), itask.tdef.name, itask.submit_num)
        if id_key in self.event_timers:
            continue
        # Note: user@host may not always be set for a submit number, e.g.
        # on late event or if host select command fails. Use null string to
        # prevent issues in this case.
        user_at_host = itask.summary['job_hosts'].get(itask.submit_num, '')
        if user_at_host and '@' not in user_at_host:
            # (only has 'user@' on the front if user is not suite owner).
            user_at_host = '%s@%s' % (get_user(), user_at_host)
        # Custom event handler can be a command template string
        # or a command that takes 4 arguments (classic interface)
        # Note quote() fails on None, need str(None).
        try:
            handler_data = {
                "event": quote(event),
                "suite": quote(self.suite),
                'suite_uuid': quote(str(self.uuid_str)),
                "point": quote(str(itask.point)),
                "name": quote(itask.tdef.name),
                "submit_num": itask.submit_num,
                "try_num": itask.get_try_num(),
                "id": quote(itask.identity),
                "message": quote(message),
                "batch_sys_name": quote(
                    str(itask.summary['batch_sys_name'])),
                "batch_sys_job_id": quote(
                    str(itask.summary['submit_method_id'])),
                "submit_time": quote(
                    str(itask.summary['submitted_time_string'])),
                "start_time": quote(
                    str(itask.summary['started_time_string'])),
                "finish_time": quote(
                    str(itask.summary['finished_time_string'])),
                "user@host": quote(user_at_host)
            }
            if self.suite_cfg:
                for key, value in self.suite_cfg.items():
                    if key == "URL":
                        handler_data["suite_url"] = quote(value)
                    else:
                        handler_data["suite_" + key] = quote(value)
            if itask.tdef.rtconfig['meta']:
                for key, value in itask.tdef.rtconfig['meta'].items():
                    if key == "URL":
                        handler_data["task_url"] = quote(value)
                    # Task meta items are exposed under their bare names.
                    handler_data[key] = quote(value)
            cmd = handler % (handler_data)
        except KeyError as exc:
            # Bad template key: skip this handler.
            # NOTE(review): this reassigns the "message" parameter, so any
            # remaining handlers in this loop would receive the error text
            # instead of the original task message - confirm intended.
            message = "%s/%s/%02d %s bad template: %s" % (
                itask.point, itask.tdef.name, itask.submit_num, key1, exc)
            LOG.error(message)
            continue
        if cmd == handler:
            # Nothing substituted, assume classic interface
            cmd = "%s '%s' '%s' '%s' '%s'" % (
                handler, event, self.suite, itask.identity, message)
        LOG.debug("[%s] -Queueing %s handler: %s", itask, event, cmd)
        self.event_timers[id_key] = (
            TaskActionTimer(
                CustomTaskEventHandlerContext(
                    key1,
                    self.HANDLER_CUSTOM,
                    cmd,
                ),
                retry_delays))
def cylc_kafka_consumer(kafka_server, kafka_topic, group_id, message, debug):
    # BUG FIX: docstring made a raw string - it contains "\." which is an
    # invalid escape sequence in a normal string literal (DeprecationWarning,
    # and a SyntaxError in future Python versions).
    r"""Look for a matching message in a Kafka topic.

    ARGUMENTS:
     * kafka_server - Kafka server URL, e.g. "localhost:9092".
     * kafka_topic - the Kafka topic to check, e.g. "data-avail".
     * group_id - determines Kafka offset ownership (see below).
     * message - string-ified dict with optional pattern elements (see below).
     * debug - boolean; set by daemon debug mode; prints to suite err log.

    The topic is first consumed from the beginning, then from the previous
    committed offset. If the message is not found by end of topic, commit the
    offset and return (to will try again later). If found, return the result.

    Kafka commits offsets per "consumer group" so the group_id argument must
    be unique per distinct trigger in the suite - this allows each trigger to
    separately consume the topic from the beginning, looking for its own
    messages (otherwise, with shared offsets, one trigger could move the
    offset beyond the messages of another trigger). This goes for successive
    instances of an external-triggered cycling task too, because out-of-order
    triggering could be required sometimes. So this argument should typically
    be, e.g.:

    group_id=x%(id)s  # id ID of the dependent task

    where "x" is an arbitrary string you can use to change the group name if
    you need to re-run the suite, and the messages, from the start again,
    without re-running the producer suite. Note this also serves to make the
    function signature cycle-point-specific for Cylc even if the message does
    not contain the cycle point (although it probably should).

    The "message" argument is a stringified dict, e.g.:
        {'system': 'prod', 'point': '2025', 'data': '<nwp.*\.nc>'}
    should be represented as:
        "system:prod point:2025 data:<nwp.*\.nc>"

    A match occurs Kafka if all message dict items match, and the result
    returned is the sub-dict of the actual values of items containing
    angle-bracket-delineated regex patterns. E.g. above {'data':
    'nwp-2025.nc'}.

    """
    consumer = KafkaConsumer(kafka_topic, bootstrap_servers=[kafka_server],
                             value_deserializer=json.loads,
                             consumer_timeout_ms=CONSUMER_TIMEOUT_MS,
                             auto_offset_reset='earliest',
                             group_id=group_id)
    # Construct a dict from the message argument "key1:val1 key2:val2 ...".
    # (BUG FIX: the comment previously said "key1=val1" but the separator
    # is a colon, as split(':') below and the docstring show.)
    cylc_msg = dict(m.split(':') for m in message.split())
    result = (False, {})
    n_cons = 0
    for kafka_msg in consumer:
        n_cons += 1
        m = _match_msg(cylc_msg, kafka_msg)
        if m:
            result = (True, m)
            break
        # (else consume and compare next message)
    consumer.commit()
    # Unsubscribe before exit, otherwise next call will be slow while
    # Kafka times out waiting for this original consumer connection.
    consumer.unsubscribe()
    if debug:
        if result[0]:
            res = "\n MATCHED: %s" % result[1]
        else:
            res = "no match."
        LOG.debug('Kafka: "%s" (consumed %d) ... %s', message, n_cons, res)
    return result
def construct_ssh_cmd(raw_cmd, user=None, host=None, forward_x11=False,
                      stdin=False, ssh_login_shell=None, ssh_cylc=None,
                      set_UTC=False, allow_flag_opts=False):
    """Append a bare command with further options required to run via ssh.

    Arguments:
        raw_cmd (list): primitive command to run remotely.
        user (string): user ID for the remote login.
        host (string): remote host name. Use 'localhost' if not specified.
        forward_x11 (boolean):
            If True, use 'ssh -Y' to enable X11 forwarding, else just 'ssh'.
        stdin:
            If None, the `-n` option will be added to the SSH command line.
        ssh_login_shell (boolean):
            If True, launch remote command with `bash -l -c 'exec "$0" "$@"'`.
        ssh_cylc (string): Location of the remote cylc executable.
        set_UTC (boolean):
            If True, check UTC mode and specify if set to True (non-default).
        allow_flag_opts (boolean):
            If True, check CYLC_DEBUG and CYLC_VERBOSE and if non-default,
            specify debug and/or verbosity as options to the 'raw cmd'.

    Return:
        A list containing a chosen command including all arguments and
        options necessary to directly execute the bare command on a given
        host via ssh.
    """
    command = shlex.split(glbl_cfg().get_host_item('ssh command', host, user))
    if forward_x11:
        command.append('-Y')
    if stdin is None:
        command.append('-n')

    # Destination: [user@]host, defaulting to localhost.
    destination = host if host else 'localhost'
    if user:
        destination = user + '@' + destination
    command.append(destination)

    # Pass CYLC_VERSION and optionally, CYLC_CONF_PATH & CYLC_UTC through.
    command += ['env', quote(r'CYLC_VERSION=%s' % CYLC_VERSION)]
    conf_path = os.environ.get('CYLC_CONF_PATH')
    if conf_path is not None:
        command.append(quote(r'CYLC_CONF_PATH=%s' % conf_path))
    if set_UTC and os.getenv('CYLC_UTC') in ["True", "true"]:
        command.append(quote(r'CYLC_UTC=True'))
        command.append(quote(r'TZ=UTC'))

    # Use bash -l?
    if ssh_login_shell is None:
        ssh_login_shell = glbl_cfg().get_host_item(
            'use login shell', host, user)
    if ssh_login_shell:
        # A login shell will always source /etc/profile and the user's bash
        # profile file. To avoid having to quote the entire remote command
        # it is passed as arguments to the bash script.
        command += ['bash', '--login', '-c', quote(r'exec "$0" "$@"')]

    # 'cylc' on the remote host
    if ssh_cylc:
        command.append(ssh_cylc)
    else:
        ssh_cylc = glbl_cfg().get_host_item('cylc executable', host, user)
        if not ssh_cylc.endswith('cylc'):
            raise ValueError(
                r'ERROR: bad cylc executable in global config: %s' % ssh_cylc)
        command.append(ssh_cylc)

    # Insert core raw command after ssh, but before its own, command options.
    command += raw_cmd

    if allow_flag_opts:
        if (cylc.flags.verbose or
                os.getenv('CYLC_VERBOSE') in ["True", "true"]):
            command.append(r'--verbose')
        if cylc.flags.debug or os.getenv('CYLC_DEBUG') in ["True", "true"]:
            command.append(r'--debug')

    if LOG.handlers:
        LOG.debug("$ %s", ' '.join(quote(c) for c in command))
    elif cylc.flags.debug:
        sys.stderr.write("$ %s\n" % ' '.join(quote(c) for c in command))
    return command
def process_events(self, schd_ctx):
    """Process task events that were created by "setup_event_handlers".

    schd_ctx is an instance of "Scheduler" in "cylc.scheduler".

    For each queued event timer: expire it on failure, report retries,
    then either run a custom handler on its own or group built-in
    handlers (mail, job log retrieval) into batched contexts.
    """
    ctx_groups = {}
    now = time()
    # Iterate over a copy: timers may be deleted during iteration.
    for id_key, timer in self.event_timers.copy().items():
        key1, point, name, submit_num = id_key
        if timer.is_waiting:
            # Handler already dispatched; awaiting its callback.
            continue
        # Set timer if timeout is None.
        if not timer.is_timeout_set():
            if timer.next() is None:
                # Retries exhausted: give up on this event.
                LOG.warning("%s/%s/%02d %s failed" % (
                    point, name, submit_num, key1))
                del self.event_timers[id_key]
                continue
            # Report retries and delayed 1st try
            tmpl = None
            if timer.num > 1:
                tmpl = "%s/%s/%02d %s failed, retrying in %s"
            elif timer.delay:
                tmpl = "%s/%s/%02d %s will run after %s"
            if tmpl:
                LOG.debug(tmpl % (
                    point, name, submit_num, key1,
                    timer.delay_timeout_as_str()))
        # Ready to run?
        if not timer.is_delay_done() or (
            # Avoid flooding user's mail box with mail notification.
            # Group together as many notifications as possible within a
            # given interval.
            timer.ctx.ctx_type == self.HANDLER_MAIL and
            not schd_ctx.stop_mode and
            self.next_mail_time is not None and
            self.next_mail_time > now
        ):
            continue

        timer.set_waiting()
        if timer.ctx.ctx_type == self.HANDLER_CUSTOM:
            # Run custom event handlers on their own
            self.proc_pool.put_command(
                SubProcContext(
                    (key1, submit_num),
                    timer.ctx.cmd, env=os.environ, shell=True,
                ),
                self._custom_handler_callback, [schd_ctx, id_key])
        else:
            # Group together built-in event handlers, where possible
            if timer.ctx not in ctx_groups:
                ctx_groups[timer.ctx] = []
            ctx_groups[timer.ctx].append(id_key)

    next_mail_time = now + self.mail_interval
    for ctx, id_keys in ctx_groups.items():
        if ctx.ctx_type == self.HANDLER_MAIL:
            # Set next_mail_time if any mail sent
            self.next_mail_time = next_mail_time
            self._process_event_email(schd_ctx, ctx, id_keys)
        elif ctx.ctx_type == self.HANDLER_JOB_LOGS_RETRIEVE:
            self._process_job_logs_retrieval(schd_ctx, ctx, id_keys)
def remote_init(self, host, owner):
    """Initialise a remote [owner@]host if necessary.

    Create UUID file on suite host ".service/uuid" for remotes to identify
    shared file system with suite host.

    Call "cylc remote-init" to install suite items to remote:
        ".service/contact": HTTP(S) and SSH+HTTP(S) task comm
        ".service/passphrase": HTTP(S) task comm
        ".service/ssl.cert": HTTPS task comm
        "python/": if source exists

    Return:
        REMOTE_INIT_NOT_REQUIRED:
            If remote init is not required, e.g. not remote
        REMOTE_INIT_DONE: If remote init done.
        REMOTE_INIT_FAILED:
            If init of the remote failed.
            Note: this will reset to None to allow retry.
        None: If waiting for remote init command to complete

    """
    if self.single_task_mode or not is_remote(host, owner):
        return REMOTE_INIT_NOT_REQUIRED
    try:
        status = self.remote_init_map[(host, owner)]
    except KeyError:
        pass  # Not yet initialised
    else:
        if status == REMOTE_INIT_FAILED:
            del self.remote_init_map[(host, owner)]  # reset to allow retry
        return status
    # Determine what items to install
    comm_meth = glbl_cfg().get_host_item(
        'task communication method', host, owner)
    owner_at_host = 'localhost'
    if host:
        owner_at_host = host
    if owner:
        owner_at_host = owner + '@' + owner_at_host
    LOG.debug('comm_meth[%s]=%s' % (owner_at_host, comm_meth))
    items = self._remote_init_items(comm_meth)
    # No item to install
    if not items:
        self.remote_init_map[(host, owner)] = REMOTE_INIT_NOT_REQUIRED
        return self.remote_init_map[(host, owner)]
    # Create "stdin_file_paths" file, with "items" in it.
    tmphandle = NamedTemporaryFile()
    tarhandle = tarfile.open(fileobj=tmphandle, mode='w')
    for path, arcname in items:
        tarhandle.add(path, arcname=arcname)
    tarhandle.close()
    tmphandle.seek(0)
    # UUID file - for remote to identify shared file system with suite host
    uuid_fname = os.path.join(
        self.suite_srv_files_mgr.get_suite_srv_dir(self.suite),
        FILE_BASE_UUID)
    if not os.path.exists(uuid_fname):
        # BUG FIX: the old code did open(uuid_fname, 'wb').write(uuid_str),
        # which raises TypeError on Python 3 (str written to a binary-mode
        # file) and leaked the file handle.  Write text via a context
        # manager instead.  (uuid_str is assumed to be a str - str() keeps
        # this safe either way.)
        with open(uuid_fname, 'w') as uuid_file:
            uuid_file.write(str(self.uuid_str))
    # Build the command
    cmd = ['cylc', 'remote-init']
    if is_remote_host(host):
        cmd.append('--host=%s' % host)
    if is_remote_user(owner):
        cmd.append('--user=%s' % owner)
    if cylc.flags.debug:
        cmd.append('--debug')
    if comm_meth in ['ssh']:
        cmd.append('--indirect-comm=%s' % comm_meth)
    cmd.append(self.uuid_str)
    cmd.append(glbl_cfg().get_derived_host_item(
        self.suite, 'suite run directory', host, owner))
    self.proc_pool.put_command(
        SubProcContext(
            'remote-init', cmd, stdin_file_paths=[tmphandle.name]),
        self._remote_init_callback,
        [host, owner, tmphandle])
    # None status: Waiting for command to finish
    self.remote_init_map[(host, owner)] = None
    return self.remote_init_map[(host, owner)]
def _load_remote_item(self, item, reg, owner, host):
    """Load content of service item from remote [owner@]host via SSH.

    Returns the item content as a str, or None if it cannot be read.
    """
    if not is_remote(host, owner):
        return
    if host is None:
        host = 'localhost'
    if owner is None:
        owner = get_user()
    if item == self.FILE_BASE_CONTACT and not is_remote_host(host):
        # Attempt to read suite contact file via the local filesystem.
        path = r'%(run_d)s/%(srv_base)s' % {
            'run_d': glbl_cfg().get_derived_host_item(
                reg, 'suite run directory', 'localhost', owner,
                replace_home=False),
            'srv_base': self.DIR_BASE_SRV,
        }
        content = self._load_local_item(item, path)
        if content is not None:
            return content
        # Else drop through and attempt via ssh to the suite account.
    # Prefix STDOUT to ensure returned content is relevant
    prefix = r'[CYLC-AUTH] %(suite)s' % {'suite': reg}
    # Attempt to cat passphrase file under suite service directory
    script = (
        r"""echo '%(prefix)s'; """
        r'''cat "%(run_d)s/%(srv_base)s/%(item)s"'''
    ) % {
        'prefix': prefix,
        'run_d': glbl_cfg().get_derived_host_item(
            reg, 'suite run directory', host, owner),
        'srv_base': self.DIR_BASE_SRV,
        'item': item
    }
    import shlex
    command = shlex.split(
        glbl_cfg().get_host_item('ssh command', host, owner))
    command += ['-n', owner + '@' + host, script]
    from subprocess import Popen, PIPE
    try:
        proc = Popen(
            command, stdin=open(os.devnull), stdout=PIPE, stderr=PIPE)
    except OSError:
        if cylc.flags.debug:
            import traceback
            traceback.print_exc()
        return
    # BUG FIX: communicate() returns bytes on Python 3; without decoding,
    # the "line.strip() == prefix" comparison below (bytes vs str) could
    # never match, so the item was never found.  (Also makes this copy
    # consistent with the other _load_remote_item implementation.)
    out, err = (f.decode() for f in proc.communicate())
    ret_code = proc.wait()
    # Extract passphrase from STDOUT
    # It should live in the line with the correct prefix
    content = ""
    can_read = False
    for line in out.splitlines(True):
        if can_read:
            content += line
        elif line.strip() == prefix:
            can_read = True
    if not content or ret_code:
        LOG.debug(
            '$ %(command)s  # code=%(ret_code)s\n%(err)s',
            {
                'command': command,
                # STDOUT may contain passphrase, so not safe to print
                # 'out': out,
                'err': err,
                'ret_code': ret_code,
            })
        return
    return content
def _setup_custom_event_handlers(self, itask, event, message):
    """Set up custom task event handlers.

    Queue a TaskActionTimer for each configured handler of this event,
    substituting task/suite info into command-template handlers, or
    falling back to the classic 4-argument interface when the handler
    string contains no template fields.

    Args:
        itask (TaskProxy): the task the event belongs to.
        event (str): event name (e.g. "failed", "warning-1").
        message (str): the event message, passed through to handlers.
    """
    handlers = self._get_events_conf(itask, event + ' handler')
    if (handlers is None and
            event in self._get_events_conf(itask, 'handler events', [])):
        handlers = self._get_events_conf(itask, 'handlers')
    if handlers is None:
        return
    retry_delays = self._get_events_conf(
        itask,
        'handler retry delays',
        self.get_host_conf(itask, "task event handler retry delays"))
    if not retry_delays:
        retry_delays = [0]
    # There can be multiple custom event handlers
    for i, handler in enumerate(handlers):
        if event in self.NON_UNIQUE_EVENTS:
            # Non-unique events (e.g. warnings) get an occurrence count
            # in the key so repeats are handled separately.
            key1 = (
                '%s-%02d' % (self.HANDLER_CUSTOM, i),
                '%s-%d' % (event, itask.non_unique_events.get(event, 1)))
        else:
            key1 = ('%s-%02d' % (self.HANDLER_CUSTOM, i), event)
        id_key = (
            key1, str(itask.point), itask.tdef.name, itask.submit_num)
        if id_key in self.event_timers:
            continue  # already queued
        # Note: user@host may not always be set for a submit number, e.g.
        # on late event or if host select command fails. Use null string to
        # prevent issues in this case.
        user_at_host = itask.summary['job_hosts'].get(itask.submit_num, '')
        if user_at_host and '@' not in user_at_host:
            # (only has 'user@' on the front if user is not suite owner).
            user_at_host = '%s@%s' % (get_user(), user_at_host)
        # Custom event handler can be a command template string
        # or a command that takes 4 arguments (classic interface)
        # Note quote() fails on None, need str(None).
        try:
            handler_data = {
                "event": quote(event),
                "suite": quote(self.suite),
                'suite_uuid': quote(str(self.uuid_str)),
                "point": quote(str(itask.point)),
                "name": quote(itask.tdef.name),
                "submit_num": itask.submit_num,
                "try_num": itask.get_try_num(),
                "id": quote(itask.identity),
                "message": quote(message),
                "batch_sys_name": quote(
                    str(itask.summary['batch_sys_name'])),
                "batch_sys_job_id": quote(
                    str(itask.summary['submit_method_id'])),
                "submit_time": quote(
                    str(itask.summary['submitted_time_string'])),
                "start_time": quote(
                    str(itask.summary['started_time_string'])),
                "finish_time": quote(
                    str(itask.summary['finished_time_string'])),
                "user@host": quote(user_at_host)
            }
            if self.suite_cfg:
                for key, value in self.suite_cfg.items():
                    if key == "URL":
                        handler_data["suite_url"] = quote(value)
                    else:
                        handler_data["suite_" + key] = quote(value)
            if itask.tdef.rtconfig['meta']:
                for key, value in itask.tdef.rtconfig['meta'].items():
                    if key == "URL":
                        handler_data["task_url"] = quote(value)
                    handler_data[key] = quote(value)
            cmd = handler % (handler_data)
        except KeyError as exc:
            # BUG FIX: log without rebinding the "message" parameter.
            # Previously this branch did `message = "... bad template ..."`,
            # so every later handler in this loop (and the classic
            # interface fallback below) received the error text instead of
            # the original event message.
            LOG.error("%s/%s/%02d %s bad template: %s" % (
                itask.point, itask.tdef.name, itask.submit_num, key1, exc))
            continue
        if cmd == handler:
            # Nothing substituted, assume classic interface
            cmd = "%s '%s' '%s' '%s' '%s'" % (
                handler, event, self.suite, itask.identity, message)
        LOG.debug("[%s] -Queueing %s handler: %s", itask, event, cmd)
        self.event_timers[id_key] = (
            TaskActionTimer(
                CustomTaskEventHandlerContext(
                    key1,
                    self.HANDLER_CUSTOM,
                    cmd,
                ),
                retry_delays))
def detect_old_contact_file(self, reg, check_host_port=None):
    """Detect old suite contact file.

    If an old contact file does not exist, do nothing. If one does exist
    but the suite process is definitely not alive, remove it. If one
    exists and the suite process is still alive, raise
    SuiteServiceFileError.

    If check_host_port is specified and does not match the (host, port)
    value in the old contact file, raise AssertionError.

    Args:
        reg (str): suite name
        check_host_port (tuple): (host, port) to check against

    Raise:
        AssertionError:
            If old contact file exists but does not have matching
            (host, port) with value of check_host_port.
        SuiteServiceFileError:
            If old contact file exists and the suite process still alive.
    """
    # An old suite of the same name may be running if a contact file exists
    # and can be loaded.
    try:
        data = self.load_contact_file(reg)
        old_host = data[self.KEY_HOST]
        old_port = data[self.KEY_PORT]
        old_proc_str = data[self.KEY_PROCESS]
    except (IOError, ValueError, SuiteServiceFileError):
        # Contact file does not exist or corrupted, should be OK to proceed
        return
    if check_host_port and check_host_port != (old_host, int(old_port)):
        raise AssertionError("%s != (%s, %s)" % (
            check_host_port, old_host, old_port))
    # Run the "ps" command to see if the process is still running or not.
    # If the old suite process is still running, it should show up with the
    # same command line as before.
    # Terminate command after 10 seconds to prevent hanging, etc.
    old_pid_str = old_proc_str.split(None, 1)[0].strip()
    cmd = ["timeout", "10", "ps", self.PS_OPTS, str(old_pid_str)]
    if is_remote_host(old_host):
        import shlex
        ssh_str = str(glbl_cfg().get_host_item("ssh command", old_host))
        cmd = shlex.split(ssh_str) + ["-n", old_host] + cmd
    from subprocess import Popen, PIPE
    from time import sleep, time
    proc = Popen(cmd, stdin=open(os.devnull), stdout=PIPE, stderr=PIPE)
    # Terminate command after 10 seconds to prevent hanging SSH, etc.
    timeout = time() + 10.0
    while proc.poll() is None:
        if time() > timeout:
            proc.terminate()
        sleep(0.1)
    fname = self.get_contact_file(reg)
    # BUG FIX: drain the pipes with communicate() *before* wait().
    # Calling wait() first while stdout/stderr are PIPEs can deadlock if
    # the child fills a pipe buffer; communicate() reads both streams and
    # reaps the process, so the subsequent wait() returns immediately.
    out, err = proc.communicate()
    ret_code = proc.wait()
    if ret_code:
        LOG.debug("$ %s  # return %d\n%s", ' '.join(cmd), ret_code, err)
    # NOTE(review): under Python 3 `out` is bytes while `old_proc_str` is
    # str, so these comparisons would never match without decoding —
    # confirm the target Python version / upstream decoding.
    for line in reversed(out.splitlines()):
        if line.strip() == old_proc_str:
            # Suite definitely still running
            break
        elif line.split(None, 1)[0].strip() == "PID":
            # Only "ps" header - "ps" has run, but no matching results.
            # Suite not running. Attempt to remove suite contact file.
            try:
                os.unlink(fname)
                return
            except OSError:
                break
    raise SuiteServiceFileError(
        (
            r"""ERROR, suite contact file exists: %(fname)s

Suite "%(suite)s" is already running, and listening at "%(host)s:%(port)s".

To start a new run, stop the old one first with one or more of these:
* cylc stop %(suite)s              # wait for active tasks/event handlers
* cylc stop --kill %(suite)s       # kill active tasks and wait
* cylc stop --now %(suite)s        # don't wait for active tasks
* cylc stop --now --now %(suite)s  # don't wait
* ssh -n "%(host)s" kill %(pid)s   # final brute force!
""") % {
            "host": old_host,
            "port": old_port,
            "pid": old_pid_str,
            "fname": fname,
            "suite": reg,
        })
def remote_init(self, host, owner):
    """Initialise a remote [owner@]host if necessary.

    Create UUID file on suite host ".service/uuid" for remotes to
    identify shared file system with suite host.

    Call "cylc remote-init" to install suite items to remote:
        ".service/contact": For TCP task communication
        ".service/passphrase": For TCP task communication
        "python/": if source exists

    Return:
        REMOTE_INIT_NOT_REQUIRED: If remote init is not required, e.g. not
            remote
        REMOTE_INIT_DONE: If remote init done.
        REMOTE_INIT_FAILED: If init of the remote failed.
            Note: this will reset to None to allow retry.
        None: If waiting for remote init command to complete
    """
    if self.single_task_mode or not is_remote(host, owner):
        return REMOTE_INIT_NOT_REQUIRED
    try:
        status = self.remote_init_map[(host, owner)]
    except KeyError:
        pass  # Not yet initialised
    else:
        if status == REMOTE_INIT_FAILED:
            del self.remote_init_map[(host, owner)]  # reset to allow retry
        return status
    # Determine what items to install
    comm_meth = glbl_cfg().get_host_item(
        'task communication method', host, owner)
    # Build "[owner@]host" purely for the debug message below.
    owner_at_host = 'localhost'
    if host:
        owner_at_host = host
    if owner:
        owner_at_host = owner + '@' + owner_at_host
    LOG.debug('comm_meth[%s]=%s' % (owner_at_host, comm_meth))
    items = self._remote_init_items(comm_meth)
    # No item to install
    if not items:
        self.remote_init_map[(host, owner)] = REMOTE_INIT_NOT_REQUIRED
        return self.remote_init_map[(host, owner)]
    # Create a TAR archive with the service files,
    # so they can be sent later via SSH's STDIN to the task remote.
    tmphandle = self.proc_pool.get_temporary_file()
    tarhandle = tarfile.open(fileobj=tmphandle, mode='w')
    for path, arcname in items:
        tarhandle.add(path, arcname=arcname)
    tarhandle.close()
    tmphandle.seek(0)
    # UUID file - for remote to identify shared file system with suite host
    uuid_fname = os.path.join(
        self.suite_srv_files_mgr.get_suite_srv_dir(self.suite),
        FILE_BASE_UUID)
    if not os.path.exists(uuid_fname):
        # BUG FIX: close the file handle deterministically - previously
        # `open(...).write(...)` left the handle to be closed (and the
        # write flushed) only when the garbage collector got to it.
        with open(uuid_fname, 'wb') as uuid_file:
            uuid_file.write(str(self.uuid_str).encode())
    # Build the command
    cmd = ['cylc', 'remote-init']
    if is_remote_host(host):
        cmd.append('--host=%s' % host)
    if is_remote_user(owner):
        cmd.append('--user=%s' % owner)
    if cylc.flags.debug:
        cmd.append('--debug')
    if comm_meth in ['ssh']:
        cmd.append('--indirect-comm=%s' % comm_meth)
    cmd.append(str(self.uuid_str))
    cmd.append(glbl_cfg().get_derived_host_item(
        self.suite, 'suite run directory', host, owner))
    self.proc_pool.put_command(
        SubProcContext('remote-init', cmd, stdin_files=[tmphandle]),
        self._remote_init_callback,
        [host, owner, tmphandle])
    # None status: Waiting for command to finish
    self.remote_init_map[(host, owner)] = None
    return self.remote_init_map[(host, owner)]
def process_message(
        self, itask, severity, message, event_time=None, flag='',
        submit_num=None):
    """Parse an incoming task message and update task state.

    Incoming, e.g. "succeeded at <TIME>", may be from task job or polling.

    It is possible for my current state to be inconsistent with an
    incoming message (whether normal or polled) e.g. due to a late poll
    result, or a network outage, or manual state reset. To handle this,
    if a message would take the task state backward, issue a poll to
    confirm instead of changing state - then always believe the next
    message. Note that the next message might not be the result of this
    confirmation poll, in the unlikely event that a job emits a
    succession of messages very quickly, but this is the best we can do
    without somehow uniquely associating each poll with its result
    message.

    Arguments:
        itask (cylc.task_proxy.TaskProxy):
            The task proxy object relevant for the message.
        severity (str or int):
            Message severity, should be a recognised logging level.
        message (str):
            Message content.
        event_time (str):
            Event time stamp. Expect ISO8601 date time string.
            If not specified, use current time.
        flag (str):
            If specified, can be INCOMING_FLAG to indicate an incoming
            message, POLLED_FLAG to indicate a message resulted from a
            poll. Otherwise, the message is assumed to be generated by the
            logic in the suite server program.
        submit_num (int):
            The submit number of the task relevant for the message.
            If not specified, use latest submit number.

    Return:
        None: in normal circumstances.
        True: if polling is required to confirm a reversal of status.
    """
    # Log incoming messages
    if event_time is None:
        event_time = get_current_time_string()
    if submit_num is None:
        submit_num = itask.submit_num
    # A message for a job other than the latest submit is stale: flag it
    # so it is logged but otherwise ignored.
    if flag == self.INCOMING_FLAG and submit_num != itask.submit_num:
        flag = self.IGNORED_INCOMING_FLAG
    LOG.log(
        self.LEVELS.get(severity, INFO),
        r'[%s] -(current:%s)%s %s at %s',
        itask, itask.state.status, flag, message, event_time)
    if flag == self.IGNORED_INCOMING_FLAG:
        LOG.warning(
            '[%s] -submit-num=%02d: ignore message from job(%02d)',
            itask, itask.submit_num, submit_num)
        return

    # always update the suite state summary for latest message
    if flag == self.POLLED_FLAG:
        itask.set_summary_message('%s %s' % (message, self.POLLED_FLAG))
    else:
        itask.set_summary_message(message)

    # Satisfy my output, if possible, and record the result.
    completed_trigger = itask.state.outputs.set_msg_trg_completion(
        message=message, is_completed=True)

    # Dispatch on message content. For incoming messages that would move
    # the state *backward* (is_gt current status), return True to request
    # a confirmation poll instead of changing state (see docstring).
    if message == TASK_OUTPUT_STARTED:
        if (flag == self.INCOMING_FLAG
                and itask.state.is_gt(TASK_STATUS_RUNNING)):
            return True
        self._process_message_started(itask, event_time)
    elif message == TASK_OUTPUT_SUCCEEDED:
        self._process_message_succeeded(itask, event_time)
    elif message == TASK_OUTPUT_FAILED:
        if (flag == self.INCOMING_FLAG
                and itask.state.is_gt(TASK_STATUS_FAILED)):
            return True
        self._process_message_failed(itask, event_time, self.JOB_FAILED)
    elif message == self.EVENT_SUBMIT_FAILED:
        if (flag == self.INCOMING_FLAG
                and itask.state.is_gt(TASK_STATUS_SUBMIT_FAILED)):
            return True
        self._process_message_submit_failed(itask, event_time)
    elif message == TASK_OUTPUT_SUBMITTED:
        if (flag == self.INCOMING_FLAG
                and itask.state.is_gt(TASK_STATUS_SUBMITTED)):
            return True
        self._process_message_submitted(itask, event_time)
    elif message.startswith(FAIL_MESSAGE_PREFIX):
        # Task received signal.
        if (flag == self.INCOMING_FLAG
                and itask.state.is_gt(TASK_STATUS_FAILED)):
            return True
        signal = message[len(FAIL_MESSAGE_PREFIX):]
        self._db_events_insert(itask, "signaled", signal)
        self.suite_db_mgr.put_update_task_jobs(
            itask, {"run_signal": signal})
        self._process_message_failed(itask, event_time, self.JOB_FAILED)
    elif message.startswith(ABORT_MESSAGE_PREFIX):
        # Task aborted with message
        if (flag == self.INCOMING_FLAG
                and itask.state.is_gt(TASK_STATUS_FAILED)):
            return True
        aborted_with = message[len(ABORT_MESSAGE_PREFIX):]
        self._db_events_insert(itask, "aborted", message)
        self.suite_db_mgr.put_update_task_jobs(
            itask, {"run_signal": aborted_with})
        self._process_message_failed(itask, event_time, aborted_with)
    elif message.startswith(VACATION_MESSAGE_PREFIX):
        # Task job pre-empted into a vacation state
        self._db_events_insert(itask, "vacated", message)
        itask.set_summary_time('started')  # unset
        if TASK_STATUS_SUBMIT_RETRYING in itask.try_timers:
            # Reset submit-retry count: vacation is not a failure.
            itask.try_timers[TASK_STATUS_SUBMIT_RETRYING].num = 0
        itask.job_vacated = True
        # Believe this and change state without polling (could poll?).
        self.pflag = True
        itask.state.reset_state(TASK_STATUS_SUBMITTED)
        self._reset_job_timers(itask)
        # We should really have a special 'vacated' handler, but given that
        # this feature can only be used on the deprecated loadleveler
        # system, we should probably aim to remove support for job vacation
        # instead. Otherwise, we should have:
        # self.setup_event_handlers(itask, 'vacated', message)
    elif completed_trigger:
        # Message of an as-yet unreported custom task output.
        # No state change.
        self.pflag = True
        self.suite_db_mgr.put_update_task_outputs(itask)
        self.setup_event_handlers(itask, completed_trigger, message)
    else:
        # Unhandled messages. These include:
        #  * general non-output/progress messages
        #  * poll messages that repeat previous results
        # Note that all messages are logged already at the top.
        # No state change.
        LOG.debug(
            '[%s] -(current:%s) unhandled: %s',
            itask, itask.state.status, message)
        if severity in [CRITICAL, ERROR, WARNING, INFO, DEBUG]:
            # Convert numeric logging level to its name for the DB record.
            severity = getLevelName(severity)
        self._db_events_insert(
            itask, ("message %s" % str(severity).lower()), message)
    # Count non-unique events (e.g. warnings) and fire their handlers.
    lseverity = str(severity).lower()
    if lseverity in self.NON_UNIQUE_EVENTS:
        itask.non_unique_events.setdefault(lseverity, 0)
        itask.non_unique_events[lseverity] += 1
        self.setup_event_handlers(itask, lseverity, message)