Ejemplo n.º 1
0
    def _submit_task_job_callback(self, suite, itask, cmd_ctx, line):
        """Handle one line of job submission output for one task job.

        Helper for _submit_task_jobs_callback.

        The line is split on "|". The first three fields are read as the
        timestamp, an ignored field and the return code; an optional fourth
        field is read as the submit method ID. A line with fewer than three
        fields, or with a return code that is not an integer, is treated as
        a failure with return code 1.

        Args:
            suite: passed through to log_task_job_activity.
            itask: task proxy; its summary is updated and the submitted or
                submit-failed message is processed for it.
            cmd_ctx: context of the original command; its "cmd" is copied
                to the logged context on failure.
            line: one "|"-delimited line of output.
        """
        ctx = SubProcContext(self.JOBS_SUBMIT, None)
        ctx.out = line
        items = line.split("|")
        try:
            # Parse into locals first so that "ctx" is only updated once the
            # whole line is known to be good. A missing field and a
            # non-integer return code both raise ValueError here.
            timestamp, _, ret_code_str = items[0:3]
            ret_code = int(ret_code_str)
        except ValueError:
            ctx.ret_code = 1
            ctx.cmd = cmd_ctx.cmd  # print original command on failure
        else:
            ctx.timestamp = timestamp
            ctx.ret_code = ret_code
            if ret_code:
                ctx.cmd = cmd_ctx.cmd  # print original command on failure
        log_task_job_activity(ctx, suite, itask.point, itask.tdef.name)

        # Nothing more to do for this return code.
        if ctx.ret_code == SubProcPool.RET_CODE_SUITE_STOPPING:
            return

        try:
            submit_method_id = items[3]
        except IndexError:
            submit_method_id = None
        # The literal string "None" means "no ID".
        if submit_method_id == "None":
            submit_method_id = None
        itask.summary['submit_method_id'] = submit_method_id

        # Submission only counts as good with both an ID and a zero return
        # code.
        if submit_method_id and ctx.ret_code == 0:
            self.task_events_mgr.process_message(
                itask, INFO, TASK_OUTPUT_SUBMITTED, ctx.timestamp)
        else:
            self.task_events_mgr.process_message(
                itask, CRITICAL, self.task_events_mgr.EVENT_SUBMIT_FAILED,
                ctx.timestamp)
Ejemplo n.º 2
0
    def _submit_task_job_callback(self, suite, itask, cmd_ctx, line):
        """Handle one line of job submission output for one task job.

        Helper for _submit_task_jobs_callback.

        The line is split on "|". The first three fields are read as the
        timestamp, an ignored field and the return code; an optional fourth
        field is read as the submit method ID. A line with fewer than three
        fields, or with a return code that is not an integer, is treated as
        a failure with return code 1.

        Args:
            suite: passed through to log_task_job_activity.
            itask: task proxy; its summary is updated and the submitted or
                submit-failed message is processed for it.
            cmd_ctx: context of the original command; its "cmd" is copied
                to the logged context on failure.
            line: one "|"-delimited line of output.
        """
        ctx = SubProcContext(self.JOBS_SUBMIT, None)
        ctx.out = line
        items = line.split("|")
        try:
            # Parse into locals first so that "ctx" is only updated once the
            # whole line is known to be good. A missing field and a
            # non-integer return code both raise ValueError here.
            timestamp, _, ret_code_str = items[0:3]
            ret_code = int(ret_code_str)
        except ValueError:
            ctx.ret_code = 1
            ctx.cmd = cmd_ctx.cmd  # print original command on failure
        else:
            ctx.timestamp = timestamp
            ctx.ret_code = ret_code
            if ret_code:
                ctx.cmd = cmd_ctx.cmd  # print original command on failure
        log_task_job_activity(ctx, suite, itask.point, itask.tdef.name)

        # Nothing more to do for this return code.
        if ctx.ret_code == SubProcPool.RET_CODE_SUITE_STOPPING:
            return

        try:
            submit_method_id = items[3]
        except IndexError:
            submit_method_id = None
        # The literal string "None" means "no ID".
        if submit_method_id == "None":
            submit_method_id = None
        itask.summary['submit_method_id'] = submit_method_id

        # Submission only counts as good with both an ID and a zero return
        # code.
        if submit_method_id and ctx.ret_code == 0:
            self.task_events_mgr.process_message(
                itask, INFO, TASK_OUTPUT_SUBMITTED, ctx.timestamp)
        else:
            self.task_events_mgr.process_message(
                itask, CRITICAL, self.task_events_mgr.EVENT_SUBMIT_FAILED,
                ctx.timestamp)
Ejemplo n.º 3
0
 def _kill_task_job_callback(self, suite, itask, cmd_ctx, line):
     """Handle one line of job kill output for one task job.

     Helper for _kill_task_jobs_callback.

     The line is split on "|" into a timestamp, an ignored field and the
     return code. A line that does not have exactly that shape, including
     one whose return code is not an integer, is treated as a failed kill
     with return code 1.

     Args:
         suite: passed through to log_task_job_activity.
         itask: task proxy whose state, summary message and events are
             updated according to the kill result.
         cmd_ctx: context of the original command; its "cmd" is copied to
             the logged context on failure.
         line: one "|"-delimited line of output.
     """
     ctx = SubProcContext(self.JOBS_KILL, None)
     ctx.out = line
     try:
         # Parse into locals first so that "ctx" is only updated once the
         # whole line is known to be good. Too few fields and a
         # non-integer return code both raise ValueError here.
         timestamp, _, ret_code_str = line.split("|", 2)
         ret_code = int(ret_code_str)
     except ValueError:
         ctx.ret_code = 1
         ctx.cmd = cmd_ctx.cmd  # print original command on failure
     else:
         ctx.timestamp = timestamp
         ctx.ret_code = ret_code
         if ret_code:
             ctx.cmd = cmd_ctx.cmd  # print original command on failure
     log_task_job_activity(ctx, suite, itask.point, itask.tdef.name)
     log_lvl = INFO
     log_msg = 'killed'
     if ctx.ret_code:  # non-zero exit status
         log_lvl = WARNING
         log_msg = 'kill failed'
         itask.state.kill_failed = True
     elif itask.state.status == TASK_STATUS_SUBMITTED:
         # Killed before it started running: report as submit-failed.
         self.task_events_mgr.process_message(
             itask, CRITICAL, self.task_events_mgr.EVENT_SUBMIT_FAILED,
             ctx.timestamp)
     elif itask.state.status == TASK_STATUS_RUNNING:
         # Killed while running: report as failed.
         self.task_events_mgr.process_message(
             itask, CRITICAL, TASK_OUTPUT_FAILED)
     else:
         log_lvl = DEBUG
         log_msg = (
             'ignoring job kill result, unexpected task state: %s' %
             itask.state.status)
     itask.set_summary_message(log_msg)
     LOG.log(log_lvl, "[%s] -job(%02d) %s" % (
         itask.identity, itask.submit_num, log_msg))
Ejemplo n.º 4
0
 def _kill_task_job_callback(self, suite, itask, cmd_ctx, line):
     """Handle one line of job kill output for one task job.

     Helper for _kill_task_jobs_callback.

     The line is split on "|" into a timestamp, an ignored field and the
     return code. A line that does not have exactly that shape, including
     one whose return code is not an integer, is treated as a failed kill
     with return code 1.

     Args:
         suite: passed through to log_task_job_activity.
         itask: task proxy whose state, summary message and events are
             updated according to the kill result.
         cmd_ctx: context of the original command; its "cmd" is copied to
             the logged context on failure.
         line: one "|"-delimited line of output.
     """
     ctx = SubProcContext(self.JOBS_KILL, None)
     ctx.out = line
     try:
         # Parse into locals first so that "ctx" is only updated once the
         # whole line is known to be good. Too few fields and a
         # non-integer return code both raise ValueError here.
         timestamp, _, ret_code_str = line.split("|", 2)
         ret_code = int(ret_code_str)
     except ValueError:
         ctx.ret_code = 1
         ctx.cmd = cmd_ctx.cmd  # print original command on failure
     else:
         ctx.timestamp = timestamp
         ctx.ret_code = ret_code
         if ret_code:
             ctx.cmd = cmd_ctx.cmd  # print original command on failure
     log_task_job_activity(ctx, suite, itask.point, itask.tdef.name)
     log_lvl = INFO
     log_msg = 'killed'
     if ctx.ret_code:  # non-zero exit status
         log_lvl = WARNING
         log_msg = 'kill failed'
         itask.state.kill_failed = True
     elif itask.state.status == TASK_STATUS_SUBMITTED:
         # Killed before it started running: report as submit-failed.
         self.task_events_mgr.process_message(
             itask, CRITICAL, self.task_events_mgr.EVENT_SUBMIT_FAILED,
             ctx.timestamp)
     elif itask.state.status == TASK_STATUS_RUNNING:
         # Killed while running: report as failed.
         self.task_events_mgr.process_message(
             itask, CRITICAL, TASK_OUTPUT_FAILED)
     else:
         log_lvl = DEBUG
         log_msg = (
             'ignoring job kill result, unexpected task state: %s' %
             itask.state.status)
     itask.set_summary_message(log_msg)
     LOG.log(log_lvl, "[%s] -job(%02d) %s" % (
         itask.identity, itask.submit_num, log_msg))
Ejemplo n.º 5
0
 def _job_logs_retrieval_callback(self, proc_ctx, schd_ctx):
     """Record the outcome of a job log retrieval command.

     For each ID key in the command's "id_keys" keyword argument, check
     that every expected job log file exists. If all exist, the event
     timer for the key is deleted; otherwise the missing file names are
     put in the error text and unset_waiting() is called on the timer.
     The outcome is then passed to log_task_job_activity.

     A KeyError raised while handling a key (e.g. no event timer for it)
     is logged and the loop carries on with the next key.
     """
     for id_key in proc_ctx.cmd_kwargs["id_keys"]:
         key1, point, name, submit_num = id_key
         try:
             # Every completed job should have a "job.out".
             expected = [JOB_LOG_OUT]
             try:
                 # NOTE(review): this is a substring test against
                 # 'succeeded', not an equality test - confirm intent.
                 if key1[1] not in 'succeeded':
                     expected.append(JOB_LOG_ERR)
             except TypeError:
                 pass
             found = dict(
                 (fname, os.path.exists(get_task_job_log(
                     schd_ctx.suite, point, name, submit_num, fname)))
                 for fname in expected)
             missing = sorted(
                 fname for fname, is_found in found.items()
                 if not is_found)
             # A good attempt needs every expected path to exist.
             log_ctx = SubProcContext((key1, submit_num), None)
             if missing:
                 log_ctx.ret_code = 1
                 log_ctx.err = "File(s) not retrieved:" + "".join(
                     " %s" % fname for fname in missing)
                 self.event_timers[id_key].unset_waiting()
             else:
                 log_ctx.ret_code = 0
                 del self.event_timers[id_key]
             log_task_job_activity(
                 log_ctx, schd_ctx.suite, point, name, submit_num)
         except KeyError as exc:
             LOG.exception(exc)
Ejemplo n.º 6
0
 def _poll_task_job_message_callback(self, suite, itask, cmd_ctx, line):
     """Handle one polled task message line for one task job.

     Helper for _poll_task_jobs_callback. The third to fifth fields of
     the "|"-delimited line are taken as the event time, severity and
     message and passed to the task events manager, flagged as polled.
     A line with too few fields is recorded with return code 1 together
     with the original command.
     """
     ctx = SubProcContext(self.JOBS_POLL, None)
     ctx.out = line
     fields = line.split("|")[2:5]
     if len(fields) == 3:
         event_time, severity, message = fields
         ctx.ret_code = 0
         self.task_events_mgr.process_message(
             itask, severity, message, event_time,
             self.task_events_mgr.POLLED_FLAG)
     else:
         # Too few fields: print original command on failure.
         ctx.ret_code = 1
         ctx.cmd = cmd_ctx.cmd
     log_task_job_activity(ctx, suite, itask.point, itask.tdef.name)
Ejemplo n.º 7
0
 def _poll_task_job_message_callback(self, suite, itask, cmd_ctx, line):
     """Handle one polled task message line for one task job.

     Helper for _poll_task_jobs_callback. The third to fifth fields of
     the "|"-delimited line are taken as the event time, severity and
     message and passed to the task events manager, flagged as polled.
     A line with too few fields is recorded with return code 1 together
     with the original command.
     """
     ctx = SubProcContext(self.JOBS_POLL, None)
     ctx.out = line
     fields = line.split("|")[2:5]
     if len(fields) == 3:
         event_time, severity, message = fields
         ctx.ret_code = 0
         self.task_events_mgr.process_message(
             itask, severity, message, event_time,
             self.task_events_mgr.POLLED_FLAG)
     else:
         # Too few fields: print original command on failure.
         ctx.ret_code = 1
         ctx.cmd = cmd_ctx.cmd
     log_task_job_activity(ctx, suite, itask.point, itask.tdef.name)
Ejemplo n.º 8
0
 def _job_logs_retrieval_callback(self, proc_ctx, schd_ctx):
     """Record the outcome of a job log retrieval command.

     The command context is logged first: as an error if its return code
     is non-zero, otherwise at debug level.

     Then, for each ID key in the command's "id_keys" keyword argument,
     check that every expected job log file exists. If all exist, the
     event timer for the key is deleted; otherwise the missing file names
     are put in the error text and unset_waiting() is called on the
     timer. The outcome is then passed to log_task_job_activity.

     A KeyError raised while handling a key (e.g. no event timer for it)
     is logged and the loop carries on with the next key.
     """
     log_func = LOG.error if proc_ctx.ret_code else LOG.debug
     log_func(proc_ctx)
     for id_key in proc_ctx.cmd_kwargs["id_keys"]:
         key1, point, name, submit_num = id_key
         try:
             # Every completed job should have a "job.out".
             expected = [JOB_LOG_OUT]
             try:
                 # NOTE(review): this is a substring test against
                 # 'succeeded', not an equality test - confirm intent.
                 if key1[1] not in 'succeeded':
                     expected.append(JOB_LOG_ERR)
             except TypeError:
                 pass
             found = dict(
                 (fname, os.path.exists(get_task_job_log(
                     schd_ctx.suite, point, name, submit_num, fname)))
                 for fname in expected)
             missing = sorted(
                 fname for fname, is_found in found.items()
                 if not is_found)
             # A good attempt needs every expected path to exist.
             log_ctx = SubProcContext((key1, submit_num), None)
             if missing:
                 log_ctx.ret_code = 1
                 log_ctx.err = "File(s) not retrieved:" + "".join(
                     " %s" % fname for fname in missing)
                 self.event_timers[id_key].unset_waiting()
             else:
                 log_ctx.ret_code = 0
                 del self.event_timers[id_key]
             log_task_job_activity(
                 log_ctx, schd_ctx.suite, point, name, submit_num)
         except KeyError as exc:
             LOG.exception(exc)
Ejemplo n.º 9
0
 def _event_email_callback(self, proc_ctx, schd_ctx):
     """Record the outcome of an email notification command.

     For each ID key in the command's "id_keys" keyword argument: if the
     command returned zero, delete the key's event timer and write a
     success entry via log_task_job_activity; otherwise call
     unset_waiting() on the timer. A KeyError (e.g. no event timer for
     the key) is logged and the loop carries on with the next key.
     """
     for id_key in proc_ctx.cmd_kwargs["id_keys"]:
         key1, point, name, submit_num = id_key
         try:
             if proc_ctx.ret_code != 0:
                 # Command failed: only clear the timer's waiting state.
                 self.event_timers[id_key].unset_waiting()
                 continue
             del self.event_timers[id_key]
             log_ctx = SubProcContext((key1, submit_num), None)
             log_ctx.ret_code = 0
             log_task_job_activity(
                 log_ctx, schd_ctx.suite, point, name, submit_num)
         except KeyError as exc:
             LOG.exception(exc)
Ejemplo n.º 10
0
 def _event_email_callback(self, proc_ctx, schd_ctx):
     """Record the outcome of an email notification command.

     For each ID key in the command's "id_keys" keyword argument: if the
     command returned zero, delete the key's event timer and write a
     success entry via log_task_job_activity; otherwise call
     unset_waiting() on the timer. A KeyError (e.g. no event timer for
     the key) is logged and the loop carries on with the next key.
     """
     for id_key in proc_ctx.cmd_kwargs["id_keys"]:
         key1, point, name, submit_num = id_key
         try:
             if proc_ctx.ret_code != 0:
                 # Command failed: only clear the timer's waiting state.
                 self.event_timers[id_key].unset_waiting()
                 continue
             del self.event_timers[id_key]
             log_ctx = SubProcContext((key1, submit_num), None)
             log_ctx.ret_code = 0
             log_task_job_activity(
                 log_ctx, schd_ctx.suite, point, name, submit_num)
         except KeyError as exc:
             LOG.exception(exc)
Ejemplo n.º 11
0
    def _poll_task_job_callback(self, suite, itask, cmd_ctx, line):
        """Handle one line of job poll output for one task job.

        Helper for _poll_task_jobs_callback.

        The second and third fields of the "|"-delimited line are read as
        the job log directory and a JSON-encoded mapping of JobPollContext
        keyword arguments. If that raises ValueError, the line is re-read
        in the older layout, where the "|"-delimited fields map in order
        onto JobPollContext.CONTEXT_ATTRIBUTES. If neither layout works,
        the task summary message is set to POLL_FAIL and no task message
        is processed. The context is passed to log_task_job_activity in
        every case.

        On success the polled job state is translated into one or more
        task messages for the task events manager, flagged as polled.

        Args:
            suite: passed through to log_task_job_activity.
            itask: task proxy the poll result applies to.
            cmd_ctx: context of the original command; its "cmd" is copied
                to the logged context on failure.
            line: one "|"-delimited line of output.
        """
        ctx = SubProcContext(self.JOBS_POLL, None)
        ctx.out = line
        ctx.ret_code = 0

        # See cylc.batch_sys_manager.JobPollContext
        try:
            job_log_dir, context = line.split('|')[1:3]
            items = json.loads(context)
            jp_ctx = JobPollContext(job_log_dir, **items)
        except TypeError:
            itask.set_summary_message(self.POLL_FAIL)
            ctx.cmd = cmd_ctx.cmd  # print original command on failure
            return
        except ValueError:
            # back compat for cylc 7.7.1 and previous
            try:
                values = line.split('|')
                items = dict(  # done this way to ensure IndexError is raised
                    (key, values[x]) for
                    x, key in enumerate(JobPollContext.CONTEXT_ATTRIBUTES))
                job_log_dir = items.pop('job_log_dir')
                # Build the poll context on this path too, otherwise
                # "jp_ctx" is unbound when the status checks below run.
                # NOTE(review): values here are the raw strings from the
                # split, not JSON-decoded - confirm that the comparisons
                # below behave as intended for old-style output.
                jp_ctx = JobPollContext(job_log_dir, **items)
            except (ValueError, IndexError, TypeError):
                itask.set_summary_message(self.POLL_FAIL)
                ctx.cmd = cmd_ctx.cmd  # print original command on failure
                return
        finally:
            # Runs on every exit from the "try", including early returns.
            log_task_job_activity(ctx, suite, itask.point, itask.tdef.name)

        flag = self.task_events_mgr.POLLED_FLAG
        if jp_ctx.run_status == 1 and jp_ctx.run_signal in ["ERR", "EXIT"]:
            # Failed normally
            self.task_events_mgr.process_message(
                itask, INFO, TASK_OUTPUT_FAILED, jp_ctx.time_run_exit, flag)
        elif jp_ctx.run_status == 1 and jp_ctx.batch_sys_exit_polled == 1:
            # Failed by a signal, and no longer in batch system
            self.task_events_mgr.process_message(
                itask, INFO, TASK_OUTPUT_FAILED, jp_ctx.time_run_exit, flag)
            self.task_events_mgr.process_message(
                itask, INFO, FAIL_MESSAGE_PREFIX + jp_ctx.run_signal,
                jp_ctx.time_run_exit,
                flag)
        elif jp_ctx.run_status == 1:
            # The job has terminated, but is still managed by batch system.
            # Some batch system may restart a job in this state, so don't
            # mark as failed yet.
            self.task_events_mgr.process_message(
                itask, INFO, TASK_OUTPUT_STARTED, jp_ctx.time_run, flag)
        elif jp_ctx.run_status == 0:
            # The job succeeded
            self.task_events_mgr.process_message(
                itask, INFO, TASK_OUTPUT_SUCCEEDED, jp_ctx.time_run_exit,
                flag)
        elif jp_ctx.time_run and jp_ctx.batch_sys_exit_polled == 1:
            # The job has terminated without executing the error trap
            self.task_events_mgr.process_message(
                itask, INFO, TASK_OUTPUT_FAILED, get_current_time_string(),
                flag)
        elif jp_ctx.time_run:
            # The job has started, and is still managed by batch system
            self.task_events_mgr.process_message(
                itask, INFO, TASK_OUTPUT_STARTED, jp_ctx.time_run, flag)
        elif jp_ctx.batch_sys_exit_polled == 1:
            # The job never ran, and no longer in batch system
            self.task_events_mgr.process_message(
                itask, INFO, self.task_events_mgr.EVENT_SUBMIT_FAILED,
                jp_ctx.time_submit_exit, flag)
        else:
            # The job never ran, and is in batch system
            self.task_events_mgr.process_message(
                itask, INFO, TASK_STATUS_SUBMITTED, jp_ctx.time_submit_exit,
                flag)
Ejemplo n.º 12
0
    def _poll_task_job_callback(self, suite, itask, cmd_ctx, line):
        """Handle one line of job poll output for one task job.

        Helper for _poll_task_jobs_callback.

        The second and third fields of the "|"-delimited line are read as
        the job log directory and a JSON-encoded mapping of JobPollContext
        keyword arguments. If that raises ValueError, the line is re-read
        in the older layout, where the "|"-delimited fields map in order
        onto JobPollContext.CONTEXT_ATTRIBUTES. If neither layout works,
        the task summary message is set to POLL_FAIL and no task message
        is processed. The context is passed to log_task_job_activity in
        every case.

        On success the polled job state is translated into one or more
        task messages for the task events manager, flagged as polled.

        Args:
            suite: passed through to log_task_job_activity.
            itask: task proxy the poll result applies to.
            cmd_ctx: context of the original command; its "cmd" is copied
                to the logged context on failure.
            line: one "|"-delimited line of output.
        """
        ctx = SubProcContext(self.JOBS_POLL, None)
        ctx.out = line
        ctx.ret_code = 0

        # See cylc.batch_sys_manager.JobPollContext
        try:
            job_log_dir, context = line.split('|')[1:3]
            items = json.loads(context)
            jp_ctx = JobPollContext(job_log_dir, **items)
        except TypeError:
            itask.set_summary_message(self.POLL_FAIL)
            ctx.cmd = cmd_ctx.cmd  # print original command on failure
            return
        except ValueError:
            # back compat for cylc 7.7.1 and previous
            try:
                values = line.split('|')
                items = dict(  # done this way to ensure IndexError is raised
                    (key, values[x]) for
                    x, key in enumerate(JobPollContext.CONTEXT_ATTRIBUTES))
                job_log_dir = items.pop('job_log_dir')
                # Build the poll context on this path too, otherwise
                # "jp_ctx" is unbound when the status checks below run.
                # NOTE(review): values here are the raw strings from the
                # split, not JSON-decoded - confirm that the comparisons
                # below behave as intended for old-style output.
                jp_ctx = JobPollContext(job_log_dir, **items)
            except (ValueError, IndexError, TypeError):
                itask.set_summary_message(self.POLL_FAIL)
                ctx.cmd = cmd_ctx.cmd  # print original command on failure
                return
        finally:
            # Runs on every exit from the "try", including early returns.
            log_task_job_activity(ctx, suite, itask.point, itask.tdef.name)

        flag = self.task_events_mgr.POLLED_FLAG
        if jp_ctx.run_status == 1 and jp_ctx.run_signal in ["ERR", "EXIT"]:
            # Failed normally
            self.task_events_mgr.process_message(
                itask, INFO, TASK_OUTPUT_FAILED, jp_ctx.time_run_exit, flag)
        elif jp_ctx.run_status == 1 and jp_ctx.batch_sys_exit_polled == 1:
            # Failed by a signal, and no longer in batch system
            self.task_events_mgr.process_message(
                itask, INFO, TASK_OUTPUT_FAILED, jp_ctx.time_run_exit, flag)
            self.task_events_mgr.process_message(
                itask, INFO, FAIL_MESSAGE_PREFIX + jp_ctx.run_signal,
                jp_ctx.time_run_exit,
                flag)
        elif jp_ctx.run_status == 1:
            # The job has terminated, but is still managed by batch system.
            # Some batch system may restart a job in this state, so don't
            # mark as failed yet.
            self.task_events_mgr.process_message(
                itask, INFO, TASK_OUTPUT_STARTED, jp_ctx.time_run, flag)
        elif jp_ctx.run_status == 0:
            # The job succeeded
            self.task_events_mgr.process_message(
                itask, INFO, TASK_OUTPUT_SUCCEEDED, jp_ctx.time_run_exit,
                flag)
        elif jp_ctx.time_run and jp_ctx.batch_sys_exit_polled == 1:
            # The job has terminated without executing the error trap
            self.task_events_mgr.process_message(
                itask, INFO, TASK_OUTPUT_FAILED, get_current_time_string(),
                flag)
        elif jp_ctx.time_run:
            # The job has started, and is still managed by batch system
            self.task_events_mgr.process_message(
                itask, INFO, TASK_OUTPUT_STARTED, jp_ctx.time_run, flag)
        elif jp_ctx.batch_sys_exit_polled == 1:
            # The job never ran, and no longer in batch system
            self.task_events_mgr.process_message(
                itask, INFO, self.task_events_mgr.EVENT_SUBMIT_FAILED,
                jp_ctx.time_submit_exit, flag)
        else:
            # The job never ran, and is in batch system
            self.task_events_mgr.process_message(
                itask, INFO, TASK_STATUS_SUBMITTED, jp_ctx.time_submit_exit,
                flag)