def _kill_task_job_callback(self, suite, itask, cmd_ctx, line):
    """Helper for _kill_task_jobs_callback, on one task job.

    Parse a single "jobs-kill" output line, record the activity, and
    update the task state according to the kill result.
    """
    ctx = SubProcContext(self.JOBS_KILL, None)
    ctx.out = line
    try:
        ctx.timestamp, _, ctx.ret_code = line.split("|", 2)
    except ValueError:
        # Line does not have the expected "timestamp|...|ret_code" shape.
        ctx.ret_code = 1
        ctx.cmd = cmd_ctx.cmd  # print original command on failure
    else:
        ctx.ret_code = int(ctx.ret_code)
        if ctx.ret_code:
            ctx.cmd = cmd_ctx.cmd  # print original command on failure
    log_task_job_activity(ctx, suite, itask.point, itask.tdef.name)

    level, summary = INFO, 'killed'
    if ctx.ret_code:  # non-zero exit status
        level, summary = WARNING, 'kill failed'
        itask.state.kill_failed = True
    elif itask.state.status == TASK_STATUS_SUBMITTED:
        self.task_events_mgr.process_message(
            itask, CRITICAL, self.task_events_mgr.EVENT_SUBMIT_FAILED,
            ctx.timestamp)
    elif itask.state.status == TASK_STATUS_RUNNING:
        self.task_events_mgr.process_message(
            itask, CRITICAL, TASK_OUTPUT_FAILED)
    else:
        level = DEBUG
        summary = (
            'ignoring job kill result, unexpected task state: %s'
            % itask.state.status)
    itask.set_summary_message(summary)
    LOG.log(level, "[%s] -job(%02d) %s" % (
        itask.identity, itask.submit_num, summary))
def _kill_task_job_callback(self, suite, itask, cmd_ctx, line):
    """Helper for _kill_task_jobs_callback, on one task job.

    Args:
        suite: suite name, passed through to the activity log.
        itask: task proxy whose job was targeted by the kill.
        cmd_ctx: context of the original kill command (used for
            error reporting on failure).
        line: one "timestamp|...|ret_code" output line from the
            kill command.
    """
    ctx = SubProcContext(self.JOBS_KILL, None)
    ctx.out = line
    try:
        # Expected shape: "timestamp|<middle field>|ret_code".
        ctx.timestamp, _, ctx.ret_code = line.split("|", 2)
    except ValueError:
        # Malformed line: treat the kill attempt as failed.
        ctx.ret_code = 1
        ctx.cmd = cmd_ctx.cmd  # print original command on failure
    else:
        ctx.ret_code = int(ctx.ret_code)
        if ctx.ret_code:
            ctx.cmd = cmd_ctx.cmd  # print original command on failure
    log_task_job_activity(ctx, suite, itask.point, itask.tdef.name)
    log_lvl = INFO
    log_msg = 'killed'
    if ctx.ret_code:  # non-zero exit status
        log_lvl = WARNING
        log_msg = 'kill failed'
        itask.state.kill_failed = True
    elif itask.state.status == TASK_STATUS_SUBMITTED:
        # Kill succeeded before the job started: report submit-failed.
        self.task_events_mgr.process_message(
            itask, CRITICAL, self.task_events_mgr.EVENT_SUBMIT_FAILED,
            ctx.timestamp)
    elif itask.state.status == TASK_STATUS_RUNNING:
        # Kill succeeded on a running job: report job failure.
        self.task_events_mgr.process_message(
            itask, CRITICAL, TASK_OUTPUT_FAILED)
    else:
        # Task no longer in a killable state; record but take no action.
        log_lvl = DEBUG
        log_msg = (
            'ignoring job kill result, unexpected task state: %s'
            % itask.state.status)
    itask.set_summary_message(log_msg)
    LOG.log(
        log_lvl,
        "[%s] -job(%02d) %s" % (
            itask.identity, itask.submit_num, log_msg))
def _process_message_check(
        self, itask, severity, message, event_time, flag, submit_num,
):
    """Helper for `.process_message`.

    See `.process_message` for argument list
    Check whether to process/skip message.
    Return True if `.process_message` should continue, False otherwise.
    """
    logfmt = r'[%s] status=%s: %s%s at %s for job(%02d)'
    if flag == self.FLAG_RECEIVED and submit_num != itask.submit_num:
        # Ignore received messages from old jobs
        LOG.warning(
            logfmt + r' != current job(%02d)',
            itask, itask.state, self.FLAG_RECEIVED_IGNORED, message,
            event_time, submit_num, itask.submit_num)
        return False
    if itask.state.status in (
            TASK_STATUS_SUBMIT_RETRYING, TASK_STATUS_RETRYING):
        # Ignore polled messages if task is already in retrying statuses
        LOG.warning(
            logfmt,
            itask, itask.state, self.FLAG_POLLED_IGNORED, message,
            event_time, submit_num)
        return False
    # Message accepted: log it at the level mapped from its severity
    # (default INFO for unrecognised severities).
    LOG.log(
        self.LEVELS.get(severity, INFO),
        logfmt,
        itask, itask.state, flag, message, event_time, submit_num)
    return True
def _process_message_check(
        self, itask, severity, message, event_time, flag, submit_num,
):
    """Helper for `.process_message`.

    See `.process_message` for argument list
    Check whether to process/skip message.
    Return True if `.process_message` should continue, False otherwise.
    """
    # Include the event time in log lines only when configured to.
    if self.timestamp:
        timestamp = " at %s " % event_time
    else:
        timestamp = ""
    logfmt = r'[%s] status=%s: %s%s%s for job(%02d) flow(%s)'
    if flag == self.FLAG_RECEIVED and submit_num != itask.submit_num:
        # Ignore received messages from old jobs
        LOG.warning(
            logfmt + r' != current job(%02d)',
            itask, itask.state, self.FLAG_RECEIVED_IGNORED, message,
            timestamp, submit_num, itask.flow_label, itask.submit_num)
        return False
    if (
            itask.state(TASK_STATUS_WAITING)
            and
            (
                (
                    # task has a submit-retry lined up
                    TimerFlags.SUBMISSION_RETRY in itask.try_timers
                    and itask.try_timers[
                        TimerFlags.SUBMISSION_RETRY].num > 0
                )
                or
                (
                    # task has an execution-retry lined up
                    TimerFlags.EXECUTION_RETRY in itask.try_timers
                    and itask.try_timers[
                        TimerFlags.EXECUTION_RETRY].num > 0
                )
            )

    ):
        # Ignore polled messages if task has a retry lined up
        LOG.warning(
            logfmt,
            itask, itask.state, self.FLAG_POLLED_IGNORED, message,
            timestamp, submit_num, itask.flow_label)
        return False
    # Message accepted: log it at the level mapped from its severity
    # (default INFO for unrecognised severities).
    LOG.log(
        LOG_LEVELS.get(severity, INFO),
        logfmt,
        itask, itask.state, flag, message, timestamp, submit_num,
        itask.flow_label)
    return True
def upgrade(self):
    """Apply all registered upgrades to the configuration.

    For each version's registered upgrade operations: move/convert
    deprecated items to their new locations, collect human-readable
    messages describing each change, and log them (at DEBUG for site
    configuration, WARNING otherwise).

    Raises:
        UpgradeError: if a deprecated item would be upgraded onto a
            key that already exists in the configuration.
    """
    warnings = OrderedDict()
    for vn, upgs in self.upgrades.items():
        for u in upgs:
            try:
                exp = self.expand(u)
            except (KeyError, UpgradeError):
                # Expansion failed for this upgrade spec; skip it.
                continue
            for upg in exp:
                try:
                    old = self.get_item(upg['old'])
                except KeyError:
                    # OK: deprecated item not found
                    pass
                else:
                    # Build the "old -> new - description" message.
                    msg = self.show_keys(upg['old'])
                    if upg['new']:
                        msg += ' -> ' + self.show_keys(upg['new'])
                    else:
                        # In-place conversion: same key, new value.
                        upg['new'] = upg['old']
                    msg += " - " + upg['cvt'].describe()
                    if not upg['silent']:
                        warnings.setdefault(vn, [])
                        warnings[vn].append(msg)
                    self.del_item(upg['old'])
                    if upg['cvt'].describe() != "DELETED (OBSOLETE)":
                        # check self.cfg does not already contain a
                        # non-deprecated item matching upg['new']:
                        try:
                            self.get_item(upg['new'])
                        except KeyError:
                            self.put_item(upg['new'], upg['cvt'].convert(old))
                        else:
                            raise UpgradeError(
                                'ERROR: Cannot upgrade deprecated '
                                f'item "{msg}" because the upgraded '
                                'item already exists'
                            )
    if warnings:
        level = WARNING
        if self.descr == self.SITE_CONFIG:
            # Site level configuration, user cannot easily fix.
            # Only log at debug level.
            level = DEBUG
        else:
            # User level configuration, user should be able to fix.
            # Log at warning level.
            level = WARNING
        LOG.log(
            level,
            'deprecated items were automatically upgraded in '
            f'"{self.descr}"')
        for vn, msgs in warnings.items():
            for msg in msgs:
                LOG.log(level, ' * (%s) %s', vn, msg)
def upgrade(self):
    """Apply all registered upgrades to the configuration.

    For each version's registered upgrade operations: move/convert
    deprecated items to their new locations, collect messages
    describing each change, and log them (at DEBUG for site
    configuration, WARNING otherwise).
    """
    collected = OrderedDict()
    for version, changes in self.upgrades.items():
        for change in changes:
            try:
                expanded = self.expand(change)
            except (KeyError, UpgradeError):
                # Expansion failed for this upgrade spec; skip it.
                continue
            for item in expanded:
                try:
                    old_value = self.get_item(item['old'])
                except KeyError:
                    # OK: deprecated item not found
                    continue
                # Build the "old -> new - description" message.
                description = self.show_keys(item['old'])
                if item['new']:
                    description += ' -> ' + self.show_keys(item['new'])
                else:
                    # In-place conversion: same key, new value.
                    item['new'] = item['old']
                description += " - " + item['cvt'].describe()
                if not item['silent']:
                    collected.setdefault(version, []).append(description)
                self.del_item(item['old'])
                if item['cvt'].describe() != "DELETED (OBSOLETE)":
                    self.put_item(
                        item['new'], item['cvt'].convert(old_value))
    if not collected:
        return
    if self.descr == self.SITE_CONFIG:
        # Site level configuration, user cannot easily fix.
        # Only log at debug level.
        level = DEBUG
    else:
        # User level configuration, user should be able to fix.
        # Log at warning level.
        level = WARNING
    LOG.log(
        level,
        "deprecated items were automatically upgraded in '%s':",
        self.descr)
    for version, descriptions in collected.items():
        for description in descriptions:
            LOG.log(level, ' * (%s) %s', version, description)
def process_message(
        self, itask, severity, message, event_time=None,
        flag=FLAG_INTERNAL, submit_num=None,
):
    """Parse a task message and update task state.

    Incoming, e.g. "succeeded at <TIME>", may be from task job or polling.

    It is possible for the current state of a task to be inconsistent with
    a message (whether internal, received or polled) e.g. due to a late
    poll result, or a network outage, or manual state reset. To handle
    this, if a message would take the task state backward, issue a poll to
    confirm instead of changing state - then always believe the next
    message. Note that the next message might not be the result of this
    confirmation poll, in the unlikely event that a job emits a succession
    of messages very quickly, but this is the best we can do without
    somehow uniquely associating each poll with its result message.

    Arguments:
        itask (cylc.flow.task_proxy.TaskProxy):
            The task proxy object relevant for the message.
        severity (str or int):
            Message severity, should be a recognised logging level.
        message (str):
            Message content.
        event_time (str):
            Event time stamp. Expect ISO8601 date time string.
            If not specified, use current time.
        flag (str):
            If specified, can be:
                FLAG_INTERNAL (default):
                    To indicate an internal message.
                FLAG_RECEIVED:
                    To indicate a message received from a job or an
                    external source.
                FLAG_POLLED:
                    To indicate a message resulted from a poll.
        submit_num (int):
            The submit number of the task relevant for the message.
            If not specified, use latest submit number.

    Return:
        None: in normal circumstances.
        True: if polling is required to confirm a reversal of status.

    """
    # Log messages
    if event_time is None:
        event_time = get_current_time_string()
    if submit_num is None:
        submit_num = itask.submit_num
    logfmt = r'[%s] status=%s: %s%s at %s for job(%02d)'
    if flag == self.FLAG_RECEIVED and submit_num != itask.submit_num:
        # Ignore received messages from old jobs.
        LOG.warning(
            logfmt + r' != current job(%02d)',
            itask, itask.state, self.FLAG_RECEIVED_IGNORED, message,
            event_time, submit_num, itask.submit_num)
        return
    LOG.log(
        self.LEVELS.get(severity, INFO),
        logfmt, itask, itask.state, flag, message, event_time,
        submit_num)

    # always update the suite state summary for latest message
    if flag == self.FLAG_POLLED:
        itask.set_summary_message('%s %s' % (message, self.FLAG_POLLED))
    else:
        itask.set_summary_message(message)

    # Satisfy my output, if possible, and record the result.
    completed_trigger = itask.state.outputs.set_msg_trg_completion(
        message=message, is_completed=True)

    # Dispatch on message content. For received messages, a message that
    # would take the task state backward returns True to request a
    # confirmation poll instead of changing state.
    if message == TASK_OUTPUT_STARTED:
        if (flag == self.FLAG_RECEIVED
                and itask.state.is_gt(TASK_STATUS_RUNNING)):
            return True
        self._process_message_started(itask, event_time)
    elif message == TASK_OUTPUT_SUCCEEDED:
        self._process_message_succeeded(itask, event_time)
    elif message == TASK_OUTPUT_FAILED:
        if (flag == self.FLAG_RECEIVED
                and itask.state.is_gt(TASK_STATUS_FAILED)):
            return True
        self._process_message_failed(itask, event_time, self.JOB_FAILED)
    elif message == self.EVENT_SUBMIT_FAILED:
        if (flag == self.FLAG_RECEIVED
                and itask.state.is_gt(TASK_STATUS_SUBMIT_FAILED)):
            return True
        self._process_message_submit_failed(itask, event_time)
    elif message == TASK_OUTPUT_SUBMITTED:
        if (flag == self.FLAG_RECEIVED
                and itask.state.is_gt(TASK_STATUS_SUBMITTED)):
            return True
        self._process_message_submitted(itask, event_time)
    elif message.startswith(FAIL_MESSAGE_PREFIX):
        # Task received signal.
        if (flag == self.FLAG_RECEIVED
                and itask.state.is_gt(TASK_STATUS_FAILED)):
            return True
        signal = message[len(FAIL_MESSAGE_PREFIX):]
        self._db_events_insert(itask, "signaled", signal)
        self.suite_db_mgr.put_update_task_jobs(
            itask, {"run_signal": signal})
        self._process_message_failed(itask, event_time, self.JOB_FAILED)
    elif message.startswith(ABORT_MESSAGE_PREFIX):
        # Task aborted with message
        if (flag == self.FLAG_RECEIVED
                and itask.state.is_gt(TASK_STATUS_FAILED)):
            return True
        aborted_with = message[len(ABORT_MESSAGE_PREFIX):]
        self._db_events_insert(itask, "aborted", message)
        self.suite_db_mgr.put_update_task_jobs(
            itask, {"run_signal": aborted_with})
        self._process_message_failed(itask, event_time, aborted_with)
    elif message.startswith(VACATION_MESSAGE_PREFIX):
        # Task job pre-empted into a vacation state
        self._db_events_insert(itask, "vacated", message)
        itask.set_summary_time('started')  # unset
        if TASK_STATUS_SUBMIT_RETRYING in itask.try_timers:
            # Reset any pending submit-retry count.
            itask.try_timers[TASK_STATUS_SUBMIT_RETRYING].num = 0
        itask.job_vacated = True
        # Believe this and change state without polling (could poll?).
        self.pflag = True
        itask.state.reset(TASK_STATUS_SUBMITTED)
        self._reset_job_timers(itask)
        # We should really have a special 'vacated' handler, but given
        # that this feature can only be used on the deprecated
        # loadleveler system, we should probably aim to remove support
        # for job vacation instead. Otherwise, we should have:
        # self.setup_event_handlers(itask, 'vacated', message)
    elif completed_trigger:
        # Message of an as-yet unreported custom task output.
        # No state change.
        self.pflag = True
        self.suite_db_mgr.put_update_task_outputs(itask)
        self.setup_event_handlers(itask, completed_trigger, message)
    else:
        # Unhandled messages. These include:
        # * general non-output/progress messages
        # * poll messages that repeat previous results
        # Note that all messages are logged already at the top.
        # No state change.
        LOG.debug(
            '[%s] status=%s: unhandled: %s',
            itask, itask.state.status, message)
        if severity in [CRITICAL, ERROR, WARNING, INFO, DEBUG]:
            # Numeric logging level: convert to its name for the event.
            severity = getLevelName(severity)
        self._db_events_insert(
            itask, ("message %s" % str(severity).lower()), message)
        lseverity = str(severity).lower()
        if lseverity in self.NON_UNIQUE_EVENTS:
            # Count repeatable events and run their handlers each time.
            itask.non_unique_events.setdefault(lseverity, 0)
            itask.non_unique_events[lseverity] += 1
            self.setup_event_handlers(itask, lseverity, message)