def _kill_task_job_callback(self, suite, itask, cmd_ctx, line):
    """Handle the job kill result for one task job.

    Helper for _kill_task_jobs_callback.

    Arguments:
        suite: Passed through to log_task_job_activity.
        itask: Task proxy whose job was the target of the kill.
        cmd_ctx: Context of the kill command. Its "cmd" attribute is
            copied to the per-job context when the kill failed, so that
            the original command gets reported.
        line (str): One result line. Expected to hold three "|"
            separated fields: time stamp, an ignored field, return code.
    """
    ctx = SuiteProcContext(self.JOBS_KILL, None)
    ctx.out = line
    try:
        ctx.timestamp, _, ret_code = line.split("|", 2)
        # Convert inside the try block: a return code field that is not
        # an integer is as malformed as a line with too few fields, and
        # must not escape from this callback as a ValueError.
        ctx.ret_code = int(ret_code)
    except ValueError:
        ctx.ret_code = 1
    if ctx.ret_code:
        ctx.cmd = cmd_ctx.cmd  # print original command on failure
    self.task_events_mgr.log_task_job_activity(
        ctx, suite, itask.point, itask.tdef.name)

    log_lvl = INFO
    log_msg = 'killed'
    if ctx.ret_code:  # non-zero exit status
        log_lvl = WARNING
        log_msg = 'kill failed'
        itask.state.kill_failed = True
    elif itask.state.status == TASK_STATUS_SUBMITTED:
        # Job killed before it started running: submission failure.
        self.task_events_mgr.process_message(
            itask, CRITICAL, "%s at %s" % (
                self.task_events_mgr.EVENT_SUBMIT_FAILED, ctx.timestamp),
            self.poll_task_jobs)
        cylc.flags.iflag = True
    elif itask.state.status == TASK_STATUS_RUNNING:
        # Job killed while running: job failure.
        self.task_events_mgr.process_message(
            itask, CRITICAL, TASK_OUTPUT_FAILED, self.poll_task_jobs)
        cylc.flags.iflag = True
    else:
        # Kill succeeded, but the task is in neither of the states in
        # which a kill result is acted on.
        log_lvl = WARNING
        log_msg = (
            'ignoring job kill result, unexpected task state: %s' %
            itask.state.status)
    itask.summary['latest_message'] = log_msg
    LOG.log(log_lvl, "[%s] -job(%02d) %s" % (
        itask.identity, itask.submit_num, log_msg))
def _kill_task_job_callback(self, suite, itask, cmd_ctx, line):
    """Handle the job kill result for one task job.

    Helper for _kill_task_jobs_callback.

    Arguments:
        suite: Passed through to log_task_job_activity.
        itask: Task proxy whose job was the target of the kill.
        cmd_ctx: Context of the kill command. Its "cmd" attribute is
            copied to the per-job context when the kill failed, so that
            the original command gets reported.
        line (str): One result line. Expected to hold three "|"
            separated fields: time stamp, an ignored field, return code.
    """
    ctx = SuiteProcContext(self.JOBS_KILL, None)
    ctx.out = line
    try:
        ctx.timestamp, _, ret_code = line.split("|", 2)
        # Convert inside the try block: a return code field that is not
        # an integer is as malformed as a line with too few fields, and
        # must not escape from this callback as a ValueError.
        ctx.ret_code = int(ret_code)
    except ValueError:
        ctx.ret_code = 1
    if ctx.ret_code:
        ctx.cmd = cmd_ctx.cmd  # print original command on failure
    self.task_events_mgr.log_task_job_activity(
        ctx, suite, itask.point, itask.tdef.name)

    log_lvl = INFO
    log_msg = 'killed'
    if ctx.ret_code:  # non-zero exit status
        log_lvl = WARNING
        log_msg = 'kill failed'
        itask.state.kill_failed = True
    elif itask.state.status == TASK_STATUS_SUBMITTED:
        # Job killed before it started running: submission failure.
        self.task_events_mgr.process_message(
            itask, CRITICAL, "%s at %s" % (
                self.task_events_mgr.EVENT_SUBMIT_FAILED, ctx.timestamp))
        cylc.flags.iflag = True
    elif itask.state.status == TASK_STATUS_RUNNING:
        # Job killed while running: job failure.
        self.task_events_mgr.process_message(
            itask, CRITICAL, TASK_OUTPUT_FAILED)
        cylc.flags.iflag = True
    else:
        # Kill succeeded, but the task is in neither of the states in
        # which a kill result is acted on.
        log_lvl = WARNING
        log_msg = (
            'ignoring job kill result, unexpected task state: %s' %
            itask.state.status)
    itask.summary['latest_message'] = log_msg
    LOG.log(log_lvl, "[%s] -job(%02d) %s" % (
        itask.identity, itask.submit_num, log_msg))
def process_message(self, itask, severity, message, event_time=None,
                    flag='', submit_num=None):
    """Interpret a task message and update the task state to match.

    A message such as "succeeded" may come straight from the task job
    or from a poll. The current task state can disagree with the
    message, e.g. after a late poll result, a network outage or a
    manual state reset. A message from the job that would move the
    task state backward is therefore not acted on: True is returned so
    that the caller can poll to confirm, and the next message is
    believed. That next message is not guaranteed to be the result of
    the confirmation poll (a job may emit several messages in quick
    succession), but polls cannot be tied to their result messages.

    Arguments:
        itask (cylc.task_proxy.TaskProxy):
            The task proxy object the message is about.
        severity (str or int):
            Message severity, should be a recognised logging level.
        message (str):
            Message content.
        event_time (str):
            Event time stamp, expected to be an ISO8601 date time
            string. Defaults to the current time.
        flag (str):
            INCOMING_FLAG for a message received from the job,
            POLLED_FLAG for a message that resulted from a poll.
            Any other value means the message was generated by the
            suite server program itself.
        submit_num (int):
            Submit number of the job the message is about. Defaults
            to the latest submit number of the task.

    Return:
        None: in normal circumstances.
        True: if polling is required to confirm a reversal of status.

    """
    def would_reverse(status):
        """Tell if a message from the job arrived after status was passed."""
        return flag == self.INCOMING_FLAG and itask.state.is_gt(status)

    # Fill in the defaults.
    if event_time is None:
        event_time = get_current_time_string()
    if submit_num is None:
        submit_num = itask.submit_num
    # A job message for a submit other than the latest one is ignored.
    if flag == self.INCOMING_FLAG and submit_num != itask.submit_num:
        flag = self.IGNORED_INCOMING_FLAG

    # Every message gets logged, including one about to be ignored.
    text = '(current:%s)%s %s at %s' % (
        itask.state.status, flag, message, event_time)
    LOG.log(self.LEVELS.get(severity, INFO), text, itask=itask)
    if flag == self.IGNORED_INCOMING_FLAG:
        LOG.warning(
            'submit-num=%d: ignore message from job submit-num=%d' % (
                itask.submit_num, submit_num),
            itask=itask)
        return

    # The suite state summary always shows the latest message.
    itask.summary['latest_message'] = message
    if flag == self.POLLED_FLAG:
        itask.summary['latest_message'] += ' %s' % self.POLLED_FLAG
    cylc.flags.iflag = True

    # Mark the matching task output (if any) as completed.
    completed_an_output = itask.state.outputs.set_msg_trg_completion(
        message=message, is_completed=True)

    if message == TASK_OUTPUT_STARTED:
        if would_reverse(TASK_STATUS_RUNNING):
            return True
        self._process_message_started(itask, event_time)
    elif message == TASK_OUTPUT_SUCCEEDED:
        self._process_message_succeeded(itask, event_time)
    elif message == TASK_OUTPUT_FAILED:
        if would_reverse(TASK_STATUS_FAILED):
            return True
        self._process_message_failed(itask, event_time, self.JOB_FAILED)
    elif message == self.EVENT_SUBMIT_FAILED:
        if would_reverse(TASK_STATUS_SUBMIT_FAILED):
            return True
        self._process_message_submit_failed(itask, event_time)
    elif message == TASK_OUTPUT_SUBMITTED:
        if would_reverse(TASK_STATUS_SUBMITTED):
            return True
        self._process_message_submitted(itask, event_time)
    elif message.startswith(FAIL_MESSAGE_PREFIX):
        # The job was stopped by a signal.
        if would_reverse(TASK_STATUS_FAILED):
            return True
        signal = message[len(FAIL_MESSAGE_PREFIX):]
        self._db_events_insert(itask, "signaled", signal)
        self.suite_db_mgr.put_update_task_jobs(
            itask, {"run_signal": signal})
        self._process_message_failed(itask, event_time, self.JOB_FAILED)
    elif message.startswith(ABORT_MESSAGE_PREFIX):
        # The job aborted itself with a message.
        if would_reverse(TASK_STATUS_FAILED):
            return True
        abort_text = message[len(ABORT_MESSAGE_PREFIX):]
        self._db_events_insert(itask, "aborted", message)
        self.suite_db_mgr.put_update_task_jobs(
            itask, {"run_signal": abort_text})
        self._process_message_failed(itask, event_time, abort_text)
    elif message.startswith(VACATION_MESSAGE_PREFIX):
        # The job was pre-empted into a vacation state.
        self._db_events_insert(itask, "vacated", message)
        itask.set_summary_time('started')  # unset
        if TASK_STATUS_SUBMIT_RETRYING in itask.try_timers:
            itask.try_timers[TASK_STATUS_SUBMIT_RETRYING].num = 0
        itask.job_vacated = True
        # Believed as is: the state changes without a confirming poll.
        self.pflag = True
        itask.state.reset_state(TASK_STATUS_SUBMITTED)
        self._reset_job_timers(itask)
    elif completed_an_output:
        # A custom task output reported for the first time.
        # The task state does not change.
        self.pflag = True
        self.suite_db_mgr.put_update_task_outputs(itask)
    else:
        # Anything else, which includes:
        #  * general non-output/progress messages
        #  * poll messages that repeat previous results
        # The message has been logged above already.
        # The task state does not change.
        LOG.debug(
            '(current:%s) unhandled: %s' % (itask.state.status, message),
            itask=itask)
        if severity in [CRITICAL, ERROR, WARNING, INFO, DEBUG]:
            severity = getLevelName(severity)
        self._db_events_insert(
            itask, ("message %s" % str(severity).lower()), message)

    if severity in ['WARNING', 'CRITICAL', 'CUSTOM']:
        self.setup_event_handlers(itask, severity.lower(), message)
def process_message(self, itask, severity, message, poll_func,
                    poll_event_time=None, is_incoming=False):
    """Parse an incoming task message and update task state.

    Incoming, e.g. "succeeded at <TIME>", may be from task job or
    polling.

    It is possible for my current state to be inconsistent with an
    incoming message (whether normal or polled) e.g. due to a late poll
    result, or a network outage, or manual state reset. To handle this,
    if a message would take the task state backward, issue a poll to
    confirm instead of changing state - then always believe the next
    message. Note that the next message might not be the result of this
    confirmation poll, in the unlikely event that a job emits a
    succession of messages very quickly, but this is the best we can do
    without somehow uniquely associating each poll with its result
    message.

    Arguments:
        itask: The task proxy object relevant for the message.
        severity (str or int): Message severity. Looked up in
            self.LEVELS to get the log level, INFO if not found.
        message (str): Message content. An "at TIME" suffix matched by
            self.RE_MESSAGE_TIME is stripped off.
        poll_func: Handed to self._poll_to_confirm, presumably the
            callable that issues the confirmation poll.
        poll_event_time (str): Event time of the poll, for a message
            that resulted from a poll. None otherwise.
        is_incoming (bool): If True, log the message with
            self.INCOMING_FLAG.

    Return:
        None, in all cases.
    """
    is_polled = poll_event_time is not None
    # Log incoming messages with a flag, to distinguish them from
    # non-message log entries
    message_flag = self.INCOMING_FLAG if is_incoming else ""
    log_message = '(current:%s)%s %s' % (
        itask.state.status, message_flag, message)
    if is_polled:
        log_message += ' %s' % self.POLLED_INDICATOR
    LOG.log(self.LEVELS.get(severity, INFO), log_message, itask=itask)

    # Strip the "at TIME" suffix.
    # Event time is, in order of preference: time of the poll, time
    # given in the message, current time.
    event_time = poll_event_time
    if not event_time:
        match = self.RE_MESSAGE_TIME.match(message)
        if match:
            message, event_time = match.groups()
    if not event_time:
        event_time = get_current_time_string()

    # always update the suite state summary for latest message
    itask.summary['latest_message'] = message
    if is_polled:
        itask.summary['latest_message'] += " %s" % self.POLLED_INDICATOR
    cylc.flags.iflag = True

    # Satisfy my output, if possible, and record the result.
    an_output_was_satisfied = itask.state.outputs.set_msg_trg_completion(
        message=message, is_completed=True)

    if message == TASK_OUTPUT_STARTED:
        if self._poll_to_confirm(itask, TASK_STATUS_RUNNING, poll_func):
            return
        self._process_message_started(itask, event_time)
    elif message == TASK_OUTPUT_SUCCEEDED:
        if self._poll_to_confirm(itask, TASK_STATUS_SUCCEEDED, poll_func):
            return
        self._process_message_succeeded(itask, event_time)
    elif message == TASK_OUTPUT_FAILED:
        if self._poll_to_confirm(itask, TASK_STATUS_FAILED, poll_func):
            return
        self._process_message_failed(itask, event_time, self.JOB_FAILED)
    elif message == self.EVENT_SUBMIT_FAILED:
        if self._poll_to_confirm(
                itask, TASK_STATUS_SUBMIT_FAILED, poll_func):
            return
        self._process_message_submit_failed(itask, event_time)
    elif message == TASK_OUTPUT_SUBMITTED:
        if self._poll_to_confirm(itask, TASK_STATUS_SUBMITTED, poll_func):
            return
        self._process_message_submitted(itask, event_time)
    elif message.startswith(TaskMessage.FAIL_MESSAGE_PREFIX):
        # Task received signal.
        # Confirm before recording anything, so that a message that is
        # not believed leaves no trace in the database.
        if self._poll_to_confirm(itask, TASK_STATUS_FAILED, poll_func):
            return
        signal = message[len(TaskMessage.FAIL_MESSAGE_PREFIX):]
        self._db_events_insert(itask, "signaled", signal)
        self.suite_db_mgr.put_update_task_jobs(
            itask, {"run_signal": signal})
        self._process_message_failed(itask, event_time, self.JOB_FAILED)
    elif message.startswith(TaskMessage.ABORT_MESSAGE_PREFIX):
        # Task aborted with message
        # Confirm before recording anything, as for a signal.
        if self._poll_to_confirm(itask, TASK_STATUS_FAILED, poll_func):
            return
        aborted_with = message[len(TaskMessage.ABORT_MESSAGE_PREFIX):]
        self._db_events_insert(itask, "aborted", message)
        self.suite_db_mgr.put_update_task_jobs(
            itask, {"run_signal": aborted_with})
        self._process_message_failed(itask, event_time, aborted_with)
    elif message.startswith(TaskMessage.VACATION_MESSAGE_PREFIX):
        # Task job pre-empted into a vacation state
        self._db_events_insert(itask, "vacated", message)
        itask.set_event_time('started')  # reset
        if TASK_STATUS_SUBMIT_RETRYING in itask.try_timers:
            itask.try_timers[TASK_STATUS_SUBMIT_RETRYING].num = 0
        itask.job_vacated = True
        # Restart the submission timeout from the submitted time, or
        # turn it off if the configured value is not usable as a number.
        try:
            itask.timeout_timers[TASK_STATUS_SUBMITTED] = (
                itask.summary['submitted_time'] +
                float(self._get_events_conf(itask, 'submission timeout')))
        except (TypeError, ValueError):
            itask.timeout_timers[TASK_STATUS_SUBMITTED] = None
        # Believe this and change state without polling (could poll?).
        cylc.flags.pflag = True
        itask.state.reset_state(TASK_STATUS_SUBMITTED)
    elif an_output_was_satisfied:
        # Message of an as-yet unreported custom task output.
        # No state change.
        cylc.flags.pflag = True
        self.suite_db_mgr.put_update_task_outputs(itask)
    else:
        # Unhandled messages. These include:
        #  * general non-output/progress messages
        #  * poll messages that repeat previous results
        # Note that all messages are logged already at the top.
        # No state change.
        LOG.debug(
            '(current: %s) unhandled: %s' % (itask.state.status, message),
            itask=itask)
        if severity in [CRITICAL, ERROR, WARNING, INFO, DEBUG]:
            severity = getLevelName(severity)
        self._db_events_insert(
            itask, ("message %s" % str(severity).lower()), message)

    if severity == "CUSTOM":
        self.setup_event_handlers(itask, "custom", message)
    elif severity in [TaskMessage.WARNING, TaskMessage.CRITICAL]:
        self.setup_event_handlers(itask, severity.lower(), message)
def process_message(
        self, itask, severity, message, event_time=None, flag='',
        submit_num=None):
    """Work out what a task message means and update the task state.

    The message, e.g. "succeeded", may originate from the task job or
    from a poll. Because of late poll results, network outages or
    manual state resets, the task state held here may not agree with
    it. When a message from the job would take the task state
    backward, the state is left alone and True is returned to ask for
    a poll to confirm; the message after that is always believed. The
    message after that may not be the result of the confirmation poll
    if the job emits messages in very quick succession, but a poll
    cannot be associated with its result message.

    Arguments:
        itask (cylc.task_proxy.TaskProxy):
            The task proxy object relevant for the message.
        severity (str or int):
            Message severity, should be a recognised logging level.
        message (str):
            Message content.
        event_time (str):
            Event time stamp. Expect ISO8601 date time string.
            If not specified, use current time.
        flag (str):
            If specified, can be INCOMING_FLAG to indicate an incoming
            message, POLLED_FLAG to indicate a message resulted from a
            poll. Otherwise, the message is assumed to be generated by
            the logic in the suite server program.
        submit_num (int):
            The submit number of the task relevant for the message.
            If not specified, use latest submit number.

    Return:
        None: in normal circumstances.
        True: if polling is required to confirm a reversal of status.

    """
    if event_time is None:
        event_time = get_current_time_string()
    if submit_num is None:
        submit_num = itask.submit_num
    # Message from the job of an earlier submit: log it, then drop it.
    if flag == self.INCOMING_FLAG and submit_num != itask.submit_num:
        flag = self.IGNORED_INCOMING_FLAG
    entry = '(current:%s)%s %s at %s' % (
        itask.state.status, flag, message, event_time)
    LOG.log(self.LEVELS.get(severity, INFO), entry, itask=itask)
    if flag == self.IGNORED_INCOMING_FLAG:
        LOG.warning(
            'submit-num=%d: ignore message from job submit-num=%d' % (
                itask.submit_num, submit_num),
            itask=itask)
        return

    # Latest message in the suite state summary, marked if polled.
    itask.summary['latest_message'] = message
    if flag == self.POLLED_FLAG:
        itask.summary['latest_message'] += ' %s' % self.POLLED_FLAG
    cylc.flags.iflag = True

    # Complete the task output that matches the message, if there is one.
    output_done = itask.state.outputs.set_msg_trg_completion(
        message=message, is_completed=True)

    # Only a message from the job itself can be held back for a poll.
    # The flag is not modified from here on.
    from_job = flag == self.INCOMING_FLAG

    if message == TASK_OUTPUT_STARTED:
        if from_job and itask.state.is_gt(TASK_STATUS_RUNNING):
            return True
        self._process_message_started(itask, event_time)
    elif message == TASK_OUTPUT_SUCCEEDED:
        self._process_message_succeeded(itask, event_time)
    elif message == TASK_OUTPUT_FAILED:
        if from_job and itask.state.is_gt(TASK_STATUS_FAILED):
            return True
        self._process_message_failed(itask, event_time, self.JOB_FAILED)
    elif message == self.EVENT_SUBMIT_FAILED:
        if from_job and itask.state.is_gt(TASK_STATUS_SUBMIT_FAILED):
            return True
        self._process_message_submit_failed(itask, event_time)
    elif message == TASK_OUTPUT_SUBMITTED:
        if from_job and itask.state.is_gt(TASK_STATUS_SUBMITTED):
            return True
        self._process_message_submitted(itask, event_time)
    elif message.startswith(FAIL_MESSAGE_PREFIX):
        # Job received a signal.
        if from_job and itask.state.is_gt(TASK_STATUS_FAILED):
            return True
        signal = message[len(FAIL_MESSAGE_PREFIX):]
        self._db_events_insert(itask, "signaled", signal)
        self.suite_db_mgr.put_update_task_jobs(
            itask, {"run_signal": signal})
        self._process_message_failed(itask, event_time, self.JOB_FAILED)
    elif message.startswith(ABORT_MESSAGE_PREFIX):
        # Job aborted with a message.
        if from_job and itask.state.is_gt(TASK_STATUS_FAILED):
            return True
        reason = message[len(ABORT_MESSAGE_PREFIX):]
        self._db_events_insert(itask, "aborted", message)
        self.suite_db_mgr.put_update_task_jobs(
            itask, {"run_signal": reason})
        self._process_message_failed(itask, event_time, reason)
    elif message.startswith(VACATION_MESSAGE_PREFIX):
        # Job pre-empted into a vacation state.
        self._db_events_insert(itask, "vacated", message)
        itask.set_summary_time('started')  # unset
        if TASK_STATUS_SUBMIT_RETRYING in itask.try_timers:
            itask.try_timers[TASK_STATUS_SUBMIT_RETRYING].num = 0
        itask.job_vacated = True
        # Taken at face value: no poll before the state is changed.
        self.pflag = True
        itask.state.reset_state(TASK_STATUS_SUBMITTED)
        self._reset_job_timers(itask)
    elif output_done:
        # Custom task output that has not been reported before.
        # Task state stays as it is.
        self.pflag = True
        self.suite_db_mgr.put_update_task_outputs(itask)
    else:
        # Not handled above, for instance:
        #  * general non-output/progress messages
        #  * poll messages that repeat previous results
        # Logged at the top like every message.
        # Task state stays as it is.
        LOG.debug(
            '(current:%s) unhandled: %s' % (itask.state.status, message),
            itask=itask)
        if severity in [CRITICAL, ERROR, WARNING, INFO, DEBUG]:
            severity = getLevelName(severity)
        self._db_events_insert(
            itask, ("message %s" % str(severity).lower()), message)

    if severity in ['WARNING', 'CRITICAL', 'CUSTOM']:
        self.setup_event_handlers(itask, severity.lower(), message)
def process_message(self, itask, severity, message, poll_func,
                    poll_event_time=None, incoming_event_time=None,
                    submit_num=None):
    """Parse an incoming task message and update task state.

    Incoming, e.g. "succeeded at <TIME>", may be from task job or
    polling.

    It is possible for my current state to be inconsistent with an
    incoming message (whether normal or polled) e.g. due to a late poll
    result, or a network outage, or manual state reset. To handle this,
    if a message would take the task state backward, issue a poll to
    confirm instead of changing state - then always believe the next
    message. Note that the next message might not be the result of this
    confirmation poll, in the unlikely event that a job emits a
    succession of messages very quickly, but this is the best we can do
    without somehow uniquely associating each poll with its result
    message.

    Arguments:
        itask: The task proxy object relevant for the message.
        severity (str or int): Message severity. Looked up in
            self.LEVELS to get the log level, INFO if not found.
        message (str): Message content.
        poll_func: Handed to self._poll_to_confirm, presumably the
            callable that issues the confirmation poll.
        poll_event_time (str): Event time, for a message that resulted
            from a poll. Only used if incoming_event_time is not set.
        incoming_event_time (str): Event time, for a message received
            from the job. Takes precedence over poll_event_time.
        submit_num (int): Submit number of the job that sent an
            incoming message. If it is given and differs from the
            latest submit number of the task, the message is logged
            and ignored.

    Return:
        None, in all cases.
    """
    # Log incoming messages with a flag, to distinguish them from
    # non-message log entries
    if incoming_event_time:
        if submit_num is None or submit_num == itask.submit_num:
            message_flag = self.INCOMING_FLAG
        else:
            message_flag = self.IGNORED_INCOMING_FLAG
        event_time = incoming_event_time
    elif poll_event_time:
        message_flag = self.POLLED_FLAG
        event_time = poll_event_time
    else:
        message_flag = ''
        event_time = get_current_time_string()
    log_message = '(current:%s)%s %s at %s' % (
        itask.state.status, message_flag, message, event_time)
    LOG.log(self.LEVELS.get(severity, INFO), log_message, itask=itask)
    if message_flag == self.IGNORED_INCOMING_FLAG:
        LOG.warning(
            'submit-num=%d: ignore message from job submit-num=%d' % (
                itask.submit_num, submit_num),
            itask=itask)
        return

    # always update the suite state summary for latest message
    itask.summary['latest_message'] = message
    # Use the flag chosen above, so that the summary marks a message as
    # polled if and only if the log entry does.
    if message_flag == self.POLLED_FLAG:
        itask.summary['latest_message'] += ' %s' % self.POLLED_FLAG
    cylc.flags.iflag = True

    # Satisfy my output, if possible, and record the result.
    an_output_was_satisfied = itask.state.outputs.set_msg_trg_completion(
        message=message, is_completed=True)

    if message == TASK_OUTPUT_STARTED:
        if self._poll_to_confirm(itask, TASK_STATUS_RUNNING, poll_func):
            return
        self._process_message_started(itask, event_time)
    elif message == TASK_OUTPUT_SUCCEEDED:
        if self._poll_to_confirm(itask, TASK_STATUS_SUCCEEDED, poll_func):
            return
        self._process_message_succeeded(itask, event_time)
    elif message == TASK_OUTPUT_FAILED:
        if self._poll_to_confirm(itask, TASK_STATUS_FAILED, poll_func):
            return
        self._process_message_failed(itask, event_time, self.JOB_FAILED)
    elif message == self.EVENT_SUBMIT_FAILED:
        if self._poll_to_confirm(
                itask, TASK_STATUS_SUBMIT_FAILED, poll_func):
            return
        self._process_message_submit_failed(itask, event_time)
    elif message == TASK_OUTPUT_SUBMITTED:
        if self._poll_to_confirm(itask, TASK_STATUS_SUBMITTED, poll_func):
            return
        self._process_message_submitted(itask, event_time)
    elif message.startswith(FAIL_MESSAGE_PREFIX):
        # Task received signal.
        # Confirm before recording anything, so that a message that is
        # not believed leaves no trace in the database.
        if self._poll_to_confirm(itask, TASK_STATUS_FAILED, poll_func):
            return
        signal = message[len(FAIL_MESSAGE_PREFIX):]
        self._db_events_insert(itask, "signaled", signal)
        self.suite_db_mgr.put_update_task_jobs(
            itask, {"run_signal": signal})
        self._process_message_failed(itask, event_time, self.JOB_FAILED)
    elif message.startswith(ABORT_MESSAGE_PREFIX):
        # Task aborted with message
        # Confirm before recording anything, as for a signal.
        if self._poll_to_confirm(itask, TASK_STATUS_FAILED, poll_func):
            return
        aborted_with = message[len(ABORT_MESSAGE_PREFIX):]
        self._db_events_insert(itask, "aborted", message)
        self.suite_db_mgr.put_update_task_jobs(
            itask, {"run_signal": aborted_with})
        self._process_message_failed(itask, event_time, aborted_with)
    elif message.startswith(VACATION_MESSAGE_PREFIX):
        # Task job pre-empted into a vacation state
        self._db_events_insert(itask, "vacated", message)
        itask.set_summary_time('started')  # unset
        if TASK_STATUS_SUBMIT_RETRYING in itask.try_timers:
            itask.try_timers[TASK_STATUS_SUBMIT_RETRYING].num = 0
        itask.job_vacated = True
        # Restart the submission timeout from the submitted time, or
        # turn it off if the configured value is not usable as a number.
        try:
            itask.timeout_timers[TASK_STATUS_SUBMITTED] = (
                itask.summary['submitted_time'] +
                float(self._get_events_conf(itask, 'submission timeout')))
        except (TypeError, ValueError):
            itask.timeout_timers[TASK_STATUS_SUBMITTED] = None
        # Believe this and change state without polling (could poll?).
        self.pflag = True
        itask.state.reset_state(TASK_STATUS_SUBMITTED)
    elif an_output_was_satisfied:
        # Message of an as-yet unreported custom task output.
        # No state change.
        self.pflag = True
        self.suite_db_mgr.put_update_task_outputs(itask)
    else:
        # Unhandled messages. These include:
        #  * general non-output/progress messages
        #  * poll messages that repeat previous results
        # Note that all messages are logged already at the top.
        # No state change.
        LOG.debug(
            '(current:%s) unhandled: %s' % (itask.state.status, message),
            itask=itask)
        if severity in [CRITICAL, ERROR, WARNING, INFO, DEBUG]:
            severity = getLevelName(severity)
        self._db_events_insert(
            itask, ("message %s" % str(severity).lower()), message)

    if severity in ['WARNING', 'CRITICAL', 'CUSTOM']:
        self.setup_event_handlers(itask, severity.lower(), message)