Example #1
    def _process_job_logs_retrieval(self, schd_ctx, ctx, id_keys):
        """Process retrieval of task job logs from remote user@host."""
        if ctx.user_at_host and "@" in ctx.user_at_host:
            s_user, s_host = ctx.user_at_host.split("@", 1)
        else:
            s_user, s_host = (None, ctx.user_at_host)
        ssh_str = str(glbl_cfg().get_host_item("ssh command", s_host, s_user))
        rsync_str = str(glbl_cfg().get_host_item("retrieve job logs command",
                                                 s_host, s_user))

        cmd = shlex.split(rsync_str) + ["--rsh=" + ssh_str]
        if LOG.isEnabledFor(DEBUG):
            cmd.append("-v")
        if ctx.max_size:
            cmd.append("--max-size=%s" % (ctx.max_size, ))
        # Includes and excludes
        includes = set()
        for _, point, name, submit_num in id_keys:
            # Include relevant directories, all levels needed
            includes.add("/%s" % (point))
            includes.add("/%s/%s" % (point, name))
            includes.add("/%s/%s/%02d" % (point, name, submit_num))
            includes.add("/%s/%s/%02d/**" % (point, name, submit_num))
        cmd += ["--include=%s" % (include) for include in sorted(includes)]
        cmd.append("--exclude=/**")  # exclude everything else
        # Remote source
        cmd.append(ctx.user_at_host + ":" + glbl_cfg().get_derived_host_item(
            schd_ctx.suite, "suite job log directory", s_host, s_user) + "/")
        # Local target
        cmd.append(glbl_cfg().get_derived_host_item(
            schd_ctx.suite, "suite job log directory") + "/")
        self.proc_pool.put_command(
            SubProcContext(ctx, cmd, env=dict(os.environ), id_keys=id_keys),
            self._job_logs_retrieval_callback, [schd_ctx])
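
The command built above restricts rsync to the job directories named in id_keys via --include/--exclude filters. A minimal sketch of the filter list produced for one hypothetical id_key (the tuple layout follows the loop above; the cycle point, task name and submit number are illustrative):

    # Sketch only: the include/exclude filters built above, for a single
    # hypothetical id_key (the first element is unused in this loop).
    includes = set()
    for _, point, name, submit_num in [(None, "20200101T00Z", "foo", 1)]:
        includes.add("/%s" % (point,))
        includes.add("/%s/%s" % (point, name))
        includes.add("/%s/%s/%02d" % (point, name, submit_num))
        includes.add("/%s/%s/%02d/**" % (point, name, submit_num))
    filters = ["--include=%s" % i for i in sorted(includes)] + ["--exclude=/**"]
    # ['--include=/20200101T00Z', '--include=/20200101T00Z/foo',
    #  '--include=/20200101T00Z/foo/01', '--include=/20200101T00Z/foo/01/**',
    #  '--exclude=/**']
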
Example #2
    def _run_job_cmd(self, cmd_key, suite, itasks, callback):
        """Run job commands, e.g. poll, kill, etc.

        Group itasks with their user@host.
        Put a job command for each user@host to the multiprocess pool.

        """
        if not itasks:
            return
        auth_itasks = {}
        for itask in itasks:
            if (itask.task_host, itask.task_owner) not in auth_itasks:
                auth_itasks[(itask.task_host, itask.task_owner)] = []
            auth_itasks[(itask.task_host, itask.task_owner)].append(itask)
        for (host, owner), itasks in sorted(auth_itasks.items()):
            cmd = ["cylc", cmd_key]
            if LOG.isEnabledFor(DEBUG):
                cmd.append("--debug")
            if is_remote_host(host):
                cmd.append("--host=%s" % (host))
            if is_remote_user(owner):
                cmd.append("--user=%s" % (owner))
            cmd.append("--")
            cmd.append(glbl_cfg().get_derived_host_item(
                suite, "suite job log directory", host, owner))
            job_log_dirs = []
            for itask in sorted(itasks, key=lambda itask: itask.identity):
                job_log_dirs.append(get_task_job_id(
                    itask.point, itask.tdef.name, itask.submit_num))
            cmd += job_log_dirs
            self.proc_pool.put_command(
                SubProcContext(cmd_key, cmd), callback, [suite, itasks])
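
Before queuing one command per user@host, the loop above groups itasks by their (host, owner) pair. A minimal runnable sketch of that grouping step using collections.defaultdict, with illustrative stand-in task objects (the Task namedtuple is not part of cylc):

    from collections import defaultdict, namedtuple

    # Sketch only: group stand-in tasks by (host, owner), as _run_job_cmd does.
    Task = namedtuple("Task", "task_host task_owner identity")
    itasks = [Task("hpc1", "alice", "foo.1"), Task("hpc1", "alice", "bar.1"),
              Task("localhost", None, "baz.1")]
    auth_itasks = defaultdict(list)
    for itask in itasks:
        auth_itasks[(itask.task_host, itask.task_owner)].append(itask)
    for (host, owner), group in sorted(auth_itasks.items()):
        # each (host, owner) group would get its own "cylc <cmd_key>" call
        print(host, owner, [t.identity for t in group])
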
Example #3
 def _kill_task_job_callback(self, suite, itask, cmd_ctx, line):
     """Helper for _kill_task_jobs_callback, on one task job."""
     ctx = SubProcContext(self.JOBS_KILL, None)
     ctx.out = line
     try:
         ctx.timestamp, _, ctx.ret_code = line.split("|", 2)
     except ValueError:
         ctx.ret_code = 1
         ctx.cmd = cmd_ctx.cmd  # print original command on failure
     else:
         ctx.ret_code = int(ctx.ret_code)
         if ctx.ret_code:
             ctx.cmd = cmd_ctx.cmd  # print original command on failure
     log_task_job_activity(ctx, suite, itask.point, itask.tdef.name)
     log_lvl = INFO
     log_msg = 'killed'
     if ctx.ret_code:  # non-zero exit status
         log_lvl = WARNING
         log_msg = 'kill failed'
         itask.state.kill_failed = True
     elif itask.state.status == TASK_STATUS_SUBMITTED:
         self.task_events_mgr.process_message(
             itask, CRITICAL, self.task_events_mgr.EVENT_SUBMIT_FAILED,
             ctx.timestamp)
     elif itask.state.status == TASK_STATUS_RUNNING:
         self.task_events_mgr.process_message(
             itask, CRITICAL, TASK_OUTPUT_FAILED)
     else:
         log_lvl = DEBUG
         log_msg = (
             'ignoring job kill result, unexpected task state: %s' %
             itask.state.status)
     itask.set_summary_message(log_msg)
     LOG.log(log_lvl, "[%s] -job(%02d) %s" % (
         itask.identity, itask.submit_num, log_msg))
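
The callback above parses one pipe-delimited line of "cylc jobs-kill" output: a timestamp, a middle field it ignores, and a return code. A sketch of that parse on a made-up line (the middle field's content is an assumption; only the three-way split is taken from the code):

    # Sketch only: the split performed above, on a hypothetical output line.
    line = "2019-01-01T00:00:00Z|1/foo/01|0"   # timestamp|<ignored>|ret_code
    timestamp, _, ret_code = line.split("|", 2)
    ret_code = int(ret_code)   # 0 -> "killed"; non-zero -> "kill failed"
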
Example #4
 def _job_logs_retrieval_callback(self, proc_ctx, schd_ctx):
     """Call back when log job retrieval completes."""
     for id_key in proc_ctx.cmd_kwargs["id_keys"]:
         key1, point, name, submit_num = id_key
         try:
             # All completed jobs are expected to have a "job.out".
             fnames = [JOB_LOG_OUT]
             try:
                 if key1[1] not in 'succeeded':
                     fnames.append(JOB_LOG_ERR)
             except TypeError:
                 pass
             fname_oks = {}
             for fname in fnames:
                 fname_oks[fname] = os.path.exists(get_task_job_log(
                     schd_ctx.suite, point, name, submit_num, fname))
             # All expected paths must exist to record a good attempt
             log_ctx = SubProcContext((key1, submit_num), None)
             if all(fname_oks.values()):
                 log_ctx.ret_code = 0
                 del self.event_timers[id_key]
             else:
                 log_ctx.ret_code = 1
                 log_ctx.err = "File(s) not retrieved:"
                 for fname, exist_ok in sorted(fname_oks.items()):
                     if not exist_ok:
                         log_ctx.err += " %s" % fname
                 self.event_timers[id_key].unset_waiting()
             log_task_job_activity(
                 log_ctx, schd_ctx.suite, point, name, submit_num)
         except KeyError as exc:
             LOG.exception(exc)
Example #5
 def test_run_command_returns_0(self):
     """Test basic usage, command returns 0"""
     ctx = SubProcContext('truth', ['true'])
     SubProcPool.run_command(ctx)
     self.assertEqual(ctx.err, '')
     self.assertEqual(ctx.out, '')
     self.assertEqual(ctx.ret_code, 0)
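
The tests in this and the following examples all exercise the same pattern: build a SubProcContext from a key and a command list, run it synchronously with SubProcPool.run_command, then read out, err and ret_code off the context. A minimal standalone sketch of that pattern, assuming the same imports as the test module above:

    # Sketch only: the SubProcContext / run_command pattern the tests exercise.
    ctx = SubProcContext('hello', ['echo', 'hello'])
    SubProcPool.run_command(ctx)   # fills ctx.out, ctx.err and ctx.ret_code
    print(ctx.ret_code, repr(ctx.out), repr(ctx.err))   # 0 'hello\n' ''
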
Example #6
 def _prep_submit_task_job_error(self, suite, itask, dry_run, action, exc):
     """Helper for self._prep_submit_task_job. On error."""
     LOG.debug("submit_num %s" % itask.submit_num)
     LOG.debug(traceback.format_exc())
     LOG.error(exc)
     log_task_job_activity(SubProcContext(self.JOBS_SUBMIT,
                                          action,
                                          err=exc,
                                          ret_code=1),
                           suite,
                           itask.point,
                           itask.tdef.name,
                           submit_num=itask.submit_num)
     if not dry_run:
         # Persist
         self.suite_db_mgr.put_insert_task_jobs(
             itask, {
                 'is_manual_submit': itask.is_manual_submit,
                 'try_num': itask.get_try_num(),
                 'time_submit': get_current_time_string(),
                 'batch_sys_name': itask.summary.get('batch_sys_name'),
             })
         itask.is_manual_submit = False
         self.task_events_mgr.process_message(
             itask, CRITICAL, self.task_events_mgr.EVENT_SUBMIT_FAILED)
Example #7
 def test_run_command_returns_1(self):
     """Test basic usage, command returns 1"""
     ctx = SubProcContext('lies', ['false'])
     SubProcPool.run_command(ctx)
     self.assertEqual(ctx.err, '')
     self.assertEqual(ctx.out, '')
     self.assertEqual(ctx.ret_code, 1)
Example #8
 def test_run_command_writes_to_out(self):
     """Test basic usage, command writes to STDOUT"""
     ctx = SubProcContext('parrot', ['echo', 'pirate', 'urrrr'])
     SubProcPool.run_command(ctx)
     self.assertEqual(ctx.err, '')
     self.assertEqual(ctx.out, 'pirate urrrr\n')
     self.assertEqual(ctx.ret_code, 0)
Example #9
 def test_run_command_with_stdin_from_str(self):
     """Test STDIN from string"""
     ctx = SubProcContext('meow', ['cat'], stdin_str='catches mice.\n')
     SubProcPool.run_command(ctx)
     self.assertEqual(ctx.err, '')
     self.assertEqual(ctx.out, 'catches mice.\n')
     self.assertEqual(ctx.ret_code, 0)
Example #10
    def _submit_task_job_callback(self, suite, itask, cmd_ctx, line):
        """Helper for _submit_task_jobs_callback, on one task job."""
        ctx = SubProcContext(self.JOBS_SUBMIT, None)
        ctx.out = line
        items = line.split("|")
        try:
            ctx.timestamp, _, ctx.ret_code = items[0:3]
        except ValueError:
            ctx.ret_code = 1
            ctx.cmd = cmd_ctx.cmd  # print original command on failure
        else:
            ctx.ret_code = int(ctx.ret_code)
            if ctx.ret_code:
                ctx.cmd = cmd_ctx.cmd  # print original command on failure
        log_task_job_activity(ctx, suite, itask.point, itask.tdef.name)

        if ctx.ret_code == SubProcPool.RET_CODE_SUITE_STOPPING:
            return

        try:
            itask.summary['submit_method_id'] = items[3]
        except IndexError:
            itask.summary['submit_method_id'] = None
        if itask.summary['submit_method_id'] == "None":
            itask.summary['submit_method_id'] = None
        if itask.summary['submit_method_id'] and ctx.ret_code == 0:
            self.task_events_mgr.process_message(
                itask, INFO, TASK_OUTPUT_SUBMITTED, ctx.timestamp)
        else:
            self.task_events_mgr.process_message(
                itask, CRITICAL, self.task_events_mgr.EVENT_SUBMIT_FAILED,
                ctx.timestamp)
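
Each summary line from "cylc jobs-submit" is split on "|": the first three fields are a timestamp, a middle field and the return code, and the optional fourth field is the batch system job ID. A sketch of that parse on a hypothetical line (the field contents are illustrative; only the positions come from the code above):

    # Sketch only: parsing a hypothetical jobs-submit summary line.
    line = "2019-01-01T00:00:00Z|1/foo/01|0|12345"
    items = line.split("|")
    timestamp, _, ret_code = items[0:3]
    ret_code = int(ret_code)
    submit_method_id = items[3] if len(items) > 3 else None
    if submit_method_id == "None":
        submit_method_id = None
    # ret_code == 0 with a real submit_method_id -> "submitted";
    # otherwise -> "submit-failed"
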
Example #11
 def test_run_command_writes_to_err(self):
     """Test basic usage, command writes to STDERR"""
     ctx = SubProcContext('parrot2',
                          ['bash', '-c', 'echo pirate errrr >&2'])
     SubProcPool.run_command(ctx)
     self.assertEqual(ctx.err, 'pirate errrr\n')
     self.assertEqual(ctx.out, '')
     self.assertEqual(ctx.ret_code, 0)
Example #12
    def remote_host_select(self, host_str):
        """Evaluate a task host string.

        Arguments:
            host_str (str):
                An explicit host name, a command in back-tick or $(command)
                format, or an environment variable holding a hostname.

        Return (str):
            None if evaluation of host_str is still taking place.
            'localhost' if host_str is not defined or if the evaluated host
            name is equivalent to 'localhost'.
            Otherwise, return the evaluated host name on success.

        Raise TaskRemoteMgmtError on error.

        """
        if not host_str:
            return 'localhost'

        # Host selection command: $(command) or `command`
        match = REC_COMMAND.match(host_str)
        if match:
            cmd_str = match.groups()[1]
            if cmd_str in self.remote_host_str_map:
                # Command recently launched
                value = self.remote_host_str_map[cmd_str]
                if isinstance(value, TaskRemoteMgmtError):
                    raise value  # command failed
                elif value is None:
                    return  # command not yet ready
                else:
                    host_str = value  # command succeeded
            else:
                # Command not launched (or already reset)
                timeout = glbl_cfg().get(['task host select command timeout'])
                if timeout:
                    cmd = ['timeout', str(int(timeout)), 'bash', '-c', cmd_str]
                else:
                    cmd = ['bash', '-c', cmd_str]
                self.proc_pool.put_command(
                    SubProcContext('remote-host-select',
                                   cmd,
                                   env=dict(os.environ)),
                    self._remote_host_select_callback, [cmd_str])
                self.remote_host_str_map[cmd_str] = None
                return self.remote_host_str_map[cmd_str]

        # Environment variable substitution
        host_str = os.path.expandvars(host_str)
        # Remote?
        if is_remote_host(host_str):
            return host_str
        else:
            return 'localhost'
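
REC_COMMAND is not shown in this snippet; it matches host strings given as a command, i.e. $(command) or back-tick form, and the command itself is read from the second capture group. A hypothetical pattern of that shape (an assumption, not necessarily the actual cylc definition), together with the plain-string cases handled afterwards:

    import re

    # Assumption: a pattern roughly like this, since match.groups()[1] above
    # must yield the inner command.
    REC_COMMAND = re.compile(r'(`|\$\()\s*(.*)\s*([`)])$')

    print(REC_COMMAND.match('$(rose host-select hpc)').groups()[1])  # rose host-select hpc
    print(REC_COMMAND.match('`hostname`').groups()[1])               # hostname
    print(REC_COMMAND.match('my-host.example.com'))                  # None: plain host name
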
Example #13
 def _process_event_email(self, schd_ctx, ctx, id_keys):
     """Process event notification, by email."""
     if len(id_keys) == 1:
         # 1 event from 1 task
         (_, event), point, name, submit_num = id_keys[0]
         subject = "[%s/%s/%02d %s] %s" % (point, name, submit_num, event,
                                           schd_ctx.suite)
     else:
         event_set = set(id_key[0][1] for id_key in id_keys)
         if len(event_set) == 1:
             # 1 event from n tasks
             subject = "[%d tasks %s] %s" % (len(id_keys), event_set.pop(),
                                             schd_ctx.suite)
         else:
             # n events from n tasks
             subject = "[%d task events] %s" % (len(id_keys),
                                                schd_ctx.suite)
     cmd = ["mail", "-s", subject]
     # From: and To:
     cmd.append("-r")
     cmd.append(ctx.mail_from)
     cmd.append(ctx.mail_to)
     # STDIN for mail, tasks
     stdin_str = ""
     for id_key in sorted(id_keys):
         (_, event), point, name, submit_num = id_key
         stdin_str += "%s: %s/%s/%02d\n" % (event, point, name, submit_num)
     # STDIN for mail, event info + suite detail
     stdin_str += "\n"
     for label, value in [('suite', schd_ctx.suite),
                          ("host", schd_ctx.host), ("port", schd_ctx.port),
                          ("owner", schd_ctx.owner)]:
         if value:
             stdin_str += "%s: %s\n" % (label, value)
     if self.mail_footer:
         stdin_str += (self.mail_footer + "\n") % {
             "host": schd_ctx.host,
             "port": schd_ctx.port,
             "owner": schd_ctx.owner,
             "suite": schd_ctx.suite
         }
     # SMTP server
     env = dict(os.environ)
     mail_smtp = ctx.mail_smtp
     if mail_smtp:
         env["smtp"] = mail_smtp
     self.proc_pool.put_command(
         SubProcContext(
             ctx,
             cmd,
             env=env,
             stdin_str=stdin_str,
             id_keys=id_keys,
         ), self._event_email_callback, [schd_ctx])
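
The mail subject and body are plain text: the subject summarises how many tasks and events are involved, and the body carries one "event: point/name/submit_num" line per id_key followed by suite details. A sketch of what gets built for two hypothetical id_keys sharing one event (values are illustrative; the format strings are taken from the code above):

    # Sketch only: subject and mail body for two illustrative id_keys.
    id_keys = [(("event-mail", "failed"), "20200101T00Z", "foo", 1),
               (("event-mail", "failed"), "20200101T00Z", "bar", 2)]
    suite = "my.suite"

    event_set = set(id_key[0][1] for id_key in id_keys)
    subject = "[%d tasks %s] %s" % (len(id_keys), event_set.pop(), suite)
    # "[2 tasks failed] my.suite"

    stdin_str = ""
    for (_, event), point, name, submit_num in sorted(id_keys):
        stdin_str += "%s: %s/%s/%02d\n" % (event, point, name, submit_num)
    # "failed: 20200101T00Z/bar/02\nfailed: 20200101T00Z/foo/01\n"
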
Example #14
 def test_run_command_with_stdin_from_path(self):
     """Test STDIN from a single file path"""
     handle = NamedTemporaryFile()
     handle.write('catches mice.\n'.encode('UTF-8'))
     handle.seek(0)
     ctx = SubProcContext('meow', ['cat'], stdin_files=[handle.name])
     SubProcPool.run_command(ctx)
     self.assertEqual(ctx.err, '')
     self.assertEqual(ctx.out, 'catches mice.\n')
     self.assertEqual(ctx.ret_code, 0)
     handle.close()
Example #15
 def _run_event_custom_handlers(self, config, ctx):
     """Helper for "run_event_handlers", custom event handlers."""
     # Look for event handlers
     # 1. Handlers for specific event
     # 2. General handlers
     handlers = self.get_events_conf(config, '%s handler' % ctx.event)
     if not handlers and (ctx.event in self.get_events_conf(
             config, 'handler events', [])):
         handlers = self.get_events_conf(config, 'handlers')
     if not handlers:
         return
     for i, handler in enumerate(handlers):
         cmd_key = ('%s-%02d' % (self.SUITE_EVENT_HANDLER, i), ctx.event)
         # Handler command may be a string for substitution
         abort_on_error = self.get_events_conf(
             config, 'abort if %s handler fails' % ctx.event)
         try:
             handler_data = {
                 'event': quote(ctx.event),
                 'message': quote(ctx.reason),
                 'suite': quote(ctx.suite),
                 'suite_uuid': quote(str(ctx.uuid_str)),
             }
             if config.cfg['meta']:
                 for key, value in config.cfg['meta'].items():
                     if key == "URL":
                         handler_data["suite_url"] = quote(value)
                     handler_data[key] = quote(value)
             cmd = handler % (handler_data)
         except KeyError as exc:
             message = "%s bad template: %s" % (cmd_key, exc)
             LOG.error(message)
             if abort_on_error:
                 raise SuiteEventError(message)
             continue
         if cmd == handler:
             # Nothing substituted, assume classic interface
             cmd = "%s '%s' '%s' '%s'" % (handler, ctx.event, ctx.suite,
                                          ctx.reason)
         proc_ctx = SubProcContext(cmd_key,
                                   cmd,
                                   env=dict(os.environ),
                                   shell=True)
         if abort_on_error or self.proc_pool.closed:
             # Run command in foreground if abort on failure is set or if
             # process pool is closed
             self.proc_pool.run_command(proc_ctx)
             self._run_event_handlers_callback(
                 proc_ctx, abort_on_error=abort_on_error)
         else:
             # Run command using process pool otherwise
             self.proc_pool.put_command(proc_ctx,
                                        self._run_event_handlers_callback)
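
A handler entry may be a template containing %(key)s placeholders, filled from handler_data (event, message, suite, suite_uuid, plus any suite meta items); if nothing is substituted, the classic positional interface "handler '<event>' '<suite>' '<reason>'" is used instead. A sketch of both cases with hypothetical handler strings (the quoting mirrors the quote() calls above):

    # Sketch only: template substitution versus the classic interface.
    handler_data = {"event": "'stalled'", "message": "'suite stalled'",
                    "suite": "'my.suite'", "suite_uuid": "'abc-123'"}

    templated = "notify.sh --event=%(event)s --suite=%(suite)s" % handler_data
    # "notify.sh --event='stalled' --suite='my.suite'"

    classic = "notify.sh"
    if classic % handler_data == classic:
        # Nothing substituted: fall back to positional arguments.
        cmd = "%s '%s' '%s' '%s'" % (classic, "stalled", "my.suite",
                                     "suite stalled")
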
Example #16
 def _poll_task_job_message_callback(self, suite, itask, cmd_ctx, line):
     """Helper for _poll_task_jobs_callback, on message of one task job."""
     ctx = SubProcContext(self.JOBS_POLL, None)
     ctx.out = line
     try:
         event_time, severity, message = line.split("|")[2:5]
     except ValueError:
         ctx.ret_code = 1
         ctx.cmd = cmd_ctx.cmd  # print original command on failure
     else:
         ctx.ret_code = 0
         self.task_events_mgr.process_message(
             itask, severity, message, event_time,
             self.task_events_mgr.POLLED_FLAG)
     log_task_job_activity(ctx, suite, itask.point, itask.tdef.name)
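
Each polled task message arrives as a pipe-delimited line; fields 2-4 carry the event time, severity and message, which are forwarded to the task events manager with the POLLED flag. A sketch of that slice on a made-up line (the leading fields are an assumption; only the [2:5] slice comes from the code above):

    # Sketch only: extracting fields 2-4 from a hypothetical poll-message line.
    line = "poll|1/foo/01|2019-01-01T00:12:00Z|INFO|started"
    event_time, severity, message = line.split("|")[2:5]
    # ("2019-01-01T00:12:00Z", "INFO", "started")
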
Example #17
 def _event_email_callback(self, proc_ctx, schd_ctx):
     """Call back when email notification command exits."""
     for id_key in proc_ctx.cmd_kwargs["id_keys"]:
         key1, point, name, submit_num = id_key
         try:
             if proc_ctx.ret_code == 0:
                 del self.event_timers[id_key]
                 log_ctx = SubProcContext((key1, submit_num), None)
                 log_ctx.ret_code = 0
                 log_task_job_activity(log_ctx, schd_ctx.suite, point, name,
                                       submit_num)
             else:
                 self.event_timers[id_key].unset_waiting()
         except KeyError as exc:
             LOG.exception(exc)
Example #18
 def test_run_command_with_stdin_from_handles(self):
     """Test STDIN from multiple file handles"""
     handles = []
     for txt in ['catches mice.\n', 'eat fish.\n']:
         handle = TemporaryFile()
         handle.write(txt.encode('UTF-8'))
         handle.seek(0)
         handles.append(handle)
     ctx = SubProcContext('meow', ['cat'], stdin_files=handles)
     SubProcPool.run_command(ctx)
     self.assertEqual(ctx.err, '')
     self.assertEqual(ctx.out, 'catches mice.\neat fish.\n')
     self.assertEqual(ctx.ret_code, 0)
     for handle in handles:
         handle.close()
Example #19
    def test_log_debug_on_noerror_exit_code(self, cylc_log):
        """Test that a debug log is emitted when the log retrieval command
        exited with a non-error code (i.e. 0).

        :param cylc_log: mocked cylc logger
        :type cylc_log: mock.MagicMock
        """
        task_events_manager = TaskEventsManager(None, None, None, None)
        proc_ctx = SubProcContext(cmd_key=None,
                                  cmd="ls /tmp/123",
                                  ret_code=0,
                                  err="",
                                  id_keys=[])
        task_events_manager._job_logs_retrieval_callback(proc_ctx, None)
        self.assertEqual(1, cylc_log.debug.call_count)
        self.assertTrue(any("ls /tmp/123" in str(arg)
                            for arg in cylc_log.debug.call_args[0]))
Example #20
    def test_log_error_on_error_exit_code(self, cylc_log):
        """Test that an error log is emitted when the log retrieval command
        exited with a non-zero code.

        :param cylc_log: mocked cylc logger
        :type cylc_log: mock.MagicMock
        """
        task_events_manager = TaskEventsManager(None, None, None, None)
        proc_ctx = SubProcContext(cmd_key=None,
                                  cmd="error",
                                  ret_code=1,
                                  err="Error!",
                                  id_keys=[])
        task_events_manager._job_logs_retrieval_callback(proc_ctx, None)
        self.assertEqual(1, cylc_log.error.call_count)
        self.assertTrue(any("Error!" in str(arg)
                            for arg in cylc_log.error.call_args[0]))
Example #21
 def _run_event_mail(self, config, ctx):
     """Helper for "run_event_handlers", do mail notification."""
     if ctx.event in self.get_events_conf(config, 'mail events', []):
         # SMTP server
         env = dict(os.environ)
         mail_smtp = self.get_events_conf(config, 'mail smtp')
         if mail_smtp:
             env['smtp'] = mail_smtp
         subject = '[suite %(event)s] %(suite)s' % {
             'suite': ctx.suite,
             'event': ctx.event
         }
         stdin_str = ''
         for name, value in [('suite event', ctx.event),
                             ('reason', ctx.reason), ('suite', ctx.suite),
                             ('host', ctx.host), ('port', ctx.port),
                             ('owner', ctx.owner)]:
             if value:
                 stdin_str += '%s: %s\n' % (name, value)
         mail_footer_tmpl = self.get_events_conf(config, 'mail footer')
         if mail_footer_tmpl:
             stdin_str += (mail_footer_tmpl + '\n') % {
                 'host': ctx.host,
                 'port': ctx.port,
                 'owner': ctx.owner,
                 'suite': ctx.suite
             }
         proc_ctx = SubProcContext((self.SUITE_EVENT_HANDLER, ctx.event), [
             'mail',
             '-s',
             subject,
             '-r',
             self.get_events_conf(config, 'mail from',
                                  'notifications@' + get_host()),
             self.get_events_conf(config, 'mail to', get_user()),
         ],
                                   env=env,
                                   stdin_str=stdin_str)
         if self.proc_pool.closed:
             # Run command in foreground if process pool is closed
             self.proc_pool.run_command(proc_ctx)
             self._run_event_handlers_callback(proc_ctx)
         else:
             # Run command using process pool otherwise
             self.proc_pool.put_command(proc_ctx,
                                        self._run_event_mail_callback)
Example #22
    def submit_task_jobs(self, suite, itasks, is_simulation=False):
        """Prepare and submit task jobs.

        Submit tasks where possible. Ignore tasks that are waiting for a host
        select command to complete, or that are waiting for remote
        initialisation. A bad host select command, an error writing the job
        file, or failed remote initialisation marks a task as bad, leading to
        submission failure.

        This method uses prep_submit_task_jobs() as a helper.

        Return (list): list of tasks that attempted submission.
        """
        if is_simulation:
            return self._simulation_submit_task_jobs(itasks)

        # Prepare tasks for job submission
        prepared_tasks, bad_tasks = self.prep_submit_task_jobs(suite, itasks)

        # Reset consumed host selection results
        self.task_remote_mgr.remote_host_select_reset()

        if not prepared_tasks:
            return bad_tasks

        # Group task jobs by (host, owner)
        auth_itasks = {}  # {(host, owner): [itask, ...], ...}
        for itask in prepared_tasks:
            auth_itasks.setdefault((itask.task_host, itask.task_owner), [])
            auth_itasks[(itask.task_host, itask.task_owner)].append(itask)
        # Submit task jobs for each (host, owner) group
        done_tasks = bad_tasks
        for (host, owner), itasks in sorted(auth_itasks.items()):
            is_init = self.task_remote_mgr.remote_init(host, owner)
            if is_init is None:
                # Remote is waiting to be initialised
                for itask in itasks:
                    itask.set_summary_message(self.REMOTE_INIT_MSG)
                continue
            # Ensure that localhost background/at jobs are recorded as running
            # on the host name of the current suite host, rather than just
            # "localhost". On suite restart on a different suite host, this
            # allows the restart logic to correctly poll the status of the
            # background/at jobs that may still be running on the previous
            # suite host.
            if (
                self.batch_sys_mgr.is_job_local_to_host(
                    itask.summary['batch_sys_name']) and
                not is_remote_host(host)
            ):
                owner_at_host = get_host()
            else:
                owner_at_host = host
            # Persist
            if owner:
                owner_at_host = owner + '@' + owner_at_host
            now_str = get_current_time_string()
            done_tasks.extend(itasks)
            for itask in itasks:
                # Log and persist
                LOG.info(
                    '[%s] -submit-num=%d, owner@host=%s',
                    itask, itask.submit_num, owner_at_host)
                self.suite_db_mgr.put_insert_task_jobs(itask, {
                    'is_manual_submit': itask.is_manual_submit,
                    'try_num': itask.get_try_num(),
                    'time_submit': now_str,
                    'user_at_host': owner_at_host,
                    'batch_sys_name': itask.summary['batch_sys_name'],
                })
                itask.is_manual_submit = False
            if is_init == REMOTE_INIT_FAILED:
                # Remote has failed to initialise
                # Set submit-failed for all affected tasks
                for itask in itasks:
                    itask.local_job_file_path = None  # reset for retry
                    log_task_job_activity(
                        SubProcContext(
                            self.JOBS_SUBMIT,
                            '(init %s)' % owner_at_host,
                            err=REMOTE_INIT_FAILED,
                            ret_code=1),
                        suite, itask.point, itask.tdef.name)
                    self.task_events_mgr.process_message(
                        itask, CRITICAL,
                        self.task_events_mgr.EVENT_SUBMIT_FAILED)
                continue
            # Build the "cylc jobs-submit" command
            cmd = ['cylc', self.JOBS_SUBMIT]
            if LOG.isEnabledFor(DEBUG):
                cmd.append('--debug')
            if get_utc_mode():
                cmd.append('--utc-mode')
            remote_mode = False
            kwargs = {}
            for key, value, test_func in [
                    ('host', host, is_remote_host),
                    ('user', owner, is_remote_user)]:
                if test_func(value):
                    cmd.append('--%s=%s' % (key, value))
                    remote_mode = True
                    kwargs[key] = value
            if remote_mode:
                cmd.append('--remote-mode')
            cmd.append('--')
            cmd.append(glbl_cfg().get_derived_host_item(
                suite, 'suite job log directory', host, owner))
            # Chop itasks into a series of shorter lists if it's very big
            # to prevent overloading of stdout and stderr pipes.
            itasks = sorted(itasks, key=lambda itask: itask.identity)
            chunk_size = len(itasks) // ((len(itasks) // 100) + 1) + 1
            itasks_batches = [
                itasks[i:i + chunk_size] for i in range(0,
                                                        len(itasks),
                                                        chunk_size)]
            LOG.debug(
                '%s ... # will invoke in batches, sizes=%s',
                cmd, [len(b) for b in itasks_batches])
            for i, itasks_batch in enumerate(itasks_batches):
                stdin_files = []
                job_log_dirs = []
                for itask in itasks_batch:
                    if remote_mode:
                        stdin_files.append(
                            get_task_job_job_log(
                                suite, itask.point, itask.tdef.name,
                                itask.submit_num))
                    job_log_dirs.append(get_task_job_id(
                        itask.point, itask.tdef.name, itask.submit_num))
                    # The job file is now (about to be) used: reset the file
                    # write flag so that subsequent manual retrigger will
                    # generate a new job file.
                    itask.local_job_file_path = None
                    itask.state.reset_state(TASK_STATUS_READY)
                    if itask.state.outputs.has_custom_triggers():
                        self.suite_db_mgr.put_update_task_outputs(itask)
                self.proc_pool.put_command(
                    SubProcContext(
                        self.JOBS_SUBMIT,
                        cmd + job_log_dirs,
                        stdin_files=stdin_files,
                        job_log_dirs=job_log_dirs,
                        **kwargs
                    ),
                    self._submit_task_jobs_callback, [suite, itasks_batch])
        return done_tasks
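
The batching arithmetic near the end keeps each "cylc jobs-submit" invocation down to roughly 100 job log dirs, to avoid overloading the stdout and stderr pipes. A worked sketch of the chunking for an illustrative 250 prepared tasks:

    # Sketch only: the chunking arithmetic above, for 250 stand-in tasks.
    itasks = list(range(250))
    chunk_size = len(itasks) // ((len(itasks) // 100) + 1) + 1   # 250 // 3 + 1 == 84
    itasks_batches = [itasks[i:i + chunk_size]
                      for i in range(0, len(itasks), chunk_size)]
    print([len(b) for b in itasks_batches])   # [84, 84, 82]
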
Example #23
    def process_events(self, schd_ctx):
        """Process task events that were created by "setup_event_handlers".

        schd_ctx is an instance of "Scheduler" in "cylc.scheduler".
        """
        ctx_groups = {}
        now = time()
        for id_key, timer in self.event_timers.copy().items():
            key1, point, name, submit_num = id_key
            if timer.is_waiting:
                continue
            # Set timer if timeout is None.
            if not timer.is_timeout_set():
                if timer.next() is None:
                    LOG.warning("%s/%s/%02d %s failed" %
                                (point, name, submit_num, key1))
                    del self.event_timers[id_key]
                    continue
                # Report retries and delayed 1st try
                tmpl = None
                if timer.num > 1:
                    tmpl = "%s/%s/%02d %s failed, retrying in %s"
                elif timer.delay:
                    tmpl = "%s/%s/%02d %s will run after %s"
                if tmpl:
                    LOG.debug(tmpl % (point, name, submit_num, key1,
                                      timer.delay_timeout_as_str()))
            # Ready to run?
            if not timer.is_delay_done() or (
                    # Avoid flooding user's mail box with mail notification.
                    # Group together as many notifications as possible within a
                    # given interval.
                    timer.ctx.ctx_type == self.HANDLER_MAIL
                    and not schd_ctx.stop_mode
                    and self.next_mail_time is not None
                    and self.next_mail_time > now):
                continue

            timer.set_waiting()
            if timer.ctx.ctx_type == self.HANDLER_CUSTOM:
                # Run custom event handlers on their own
                self.proc_pool.put_command(
                    SubProcContext(
                        (key1, submit_num),
                        timer.ctx.cmd,
                        env=os.environ,
                        shell=True,
                    ), self._custom_handler_callback, [schd_ctx, id_key])
            else:
                # Group together built-in event handlers, where possible
                if timer.ctx not in ctx_groups:
                    ctx_groups[timer.ctx] = []
                ctx_groups[timer.ctx].append(id_key)

        next_mail_time = now + self.mail_interval
        for ctx, id_keys in ctx_groups.items():
            if ctx.ctx_type == self.HANDLER_MAIL:
                # Set next_mail_time if any mail sent
                self.next_mail_time = next_mail_time
                self._process_event_email(schd_ctx, ctx, id_keys)
            elif ctx.ctx_type == self.HANDLER_JOB_LOGS_RETRIEVE:
                self._process_job_logs_retrieval(schd_ctx, ctx, id_keys)
Example #24
    def remote_init(self, host, owner):
        """Initialise a remote [owner@]host if necessary.

        Create UUID file on suite host ".service/uuid" for remotes to identify
        shared file system with suite host.

        Call "cylc remote-init" to install suite items to remote:
            ".service/contact": HTTP(S) and SSH+HTTP(S) task comm
            ".service/passphrase": HTTP(S) task comm
            ".service/ssl.cert": HTTPS task comm
            "python/": if source exists

        Return:
            REMOTE_INIT_NOT_REQUIRED:
                If remote init is not required, e.g. not remote
            REMOTE_INIT_DONE:
                If remote init done.
            REMOTE_INIT_FAILED:
                If init of the remote failed.
                Note: this will reset to None to allow retry.
            None:
                If waiting for remote init command to complete

        """
        if self.single_task_mode or not is_remote(host, owner):
            return REMOTE_INIT_NOT_REQUIRED
        try:
            status = self.remote_init_map[(host, owner)]
        except KeyError:
            pass  # Not yet initialised
        else:
            if status == REMOTE_INIT_FAILED:
                del self.remote_init_map[(host, owner)]  # reset to allow retry
            return status

        # Determine what items to install
        comm_meth = glbl_cfg().get_host_item('task communication method', host,
                                             owner)
        owner_at_host = 'localhost'
        if host:
            owner_at_host = host
        if owner:
            owner_at_host = owner + '@' + owner_at_host
        LOG.debug('comm_meth[%s]=%s' % (owner_at_host, comm_meth))
        items = self._remote_init_items(comm_meth)
        # No item to install
        if not items:
            self.remote_init_map[(host, owner)] = REMOTE_INIT_NOT_REQUIRED
            return self.remote_init_map[(host, owner)]

        # Create "stdin_file_paths" file, with "items" in it.
        tmphandle = NamedTemporaryFile()
        tarhandle = tarfile.open(fileobj=tmphandle, mode='w')
        for path, arcname in items:
            tarhandle.add(path, arcname=arcname)
        tarhandle.close()
        tmphandle.seek(0)
        # UUID file - for remote to identify shared file system with suite host
        uuid_fname = os.path.join(
            self.suite_srv_files_mgr.get_suite_srv_dir(self.suite),
            FILE_BASE_UUID)
        if not os.path.exists(uuid_fname):
            open(uuid_fname, 'wb').write(self.uuid_str)
        # Build the command
        cmd = ['cylc', 'remote-init']
        if is_remote_host(host):
            cmd.append('--host=%s' % host)
        if is_remote_user(owner):
            cmd.append('--user=%s' % owner)
        if cylc.flags.debug:
            cmd.append('--debug')
        if comm_meth in ['ssh']:
            cmd.append('--indirect-comm=%s' % comm_meth)
        cmd.append(self.uuid_str)
        cmd.append(glbl_cfg().get_derived_host_item(self.suite,
                                                    'suite run directory',
                                                    host, owner))
        self.proc_pool.put_command(
            SubProcContext('remote-init',
                           cmd,
                           stdin_file_paths=[tmphandle.name]),
            self._remote_init_callback, [host, owner, tmphandle])
        # None status: Waiting for command to finish
        self.remote_init_map[(host, owner)] = None
        return self.remote_init_map[(host, owner)]
Example #25
    def _poll_task_job_callback(self, suite, itask, cmd_ctx, line):
        """Helper for _poll_task_jobs_callback, on one task job."""
        ctx = SubProcContext(self.JOBS_POLL, None)
        ctx.out = line
        ctx.ret_code = 0

        # See cylc.batch_sys_manager.JobPollContext
        try:
            job_log_dir, context = line.split('|')[1:3]
            items = json.loads(context)
            jp_ctx = JobPollContext(job_log_dir, **items)
        except TypeError:
            itask.set_summary_message(self.POLL_FAIL)
            ctx.cmd = cmd_ctx.cmd  # print original command on failure
            return
        except ValueError:
            # back compat for cylc 7.7.1 and previous
            try:
                values = line.split('|')
                items = dict(  # done this way to ensure IndexError is raised
                    (key, values[x]) for
                    x, key in enumerate(JobPollContext.CONTEXT_ATTRIBUTES))
                job_log_dir = items.pop('job_log_dir')
            except (ValueError, IndexError):
                itask.set_summary_message(self.POLL_FAIL)
                ctx.cmd = cmd_ctx.cmd  # print original command on failure
                return
        finally:
            log_task_job_activity(ctx, suite, itask.point, itask.tdef.name)

        flag = self.task_events_mgr.POLLED_FLAG
        if jp_ctx.run_status == 1 and jp_ctx.run_signal in ["ERR", "EXIT"]:
            # Failed normally
            self.task_events_mgr.process_message(
                itask, INFO, TASK_OUTPUT_FAILED, jp_ctx.time_run_exit, flag)
        elif jp_ctx.run_status == 1 and jp_ctx.batch_sys_exit_polled == 1:
            # Failed by a signal, and no longer in batch system
            self.task_events_mgr.process_message(
                itask, INFO, TASK_OUTPUT_FAILED, jp_ctx.time_run_exit, flag)
            self.task_events_mgr.process_message(
                itask, INFO, FAIL_MESSAGE_PREFIX + jp_ctx.run_signal,
                jp_ctx.time_run_exit,
                flag)
        elif jp_ctx.run_status == 1:
            # The job has terminated, but is still managed by batch system.
            # Some batch system may restart a job in this state, so don't
            # mark as failed yet.
            self.task_events_mgr.process_message(
                itask, INFO, TASK_OUTPUT_STARTED, jp_ctx.time_run, flag)
        elif jp_ctx.run_status == 0:
            # The job succeeded
            self.task_events_mgr.process_message(
                itask, INFO, TASK_OUTPUT_SUCCEEDED, jp_ctx.time_run_exit,
                flag)
        elif jp_ctx.time_run and jp_ctx.batch_sys_exit_polled == 1:
            # The job has terminated without executing the error trap
            self.task_events_mgr.process_message(
                itask, INFO, TASK_OUTPUT_FAILED, get_current_time_string(),
                flag)
        elif jp_ctx.time_run:
            # The job has started, and is still managed by batch system
            self.task_events_mgr.process_message(
                itask, INFO, TASK_OUTPUT_STARTED, jp_ctx.time_run, flag)
        elif jp_ctx.batch_sys_exit_polled == 1:
            # The job never ran, and no longer in batch system
            self.task_events_mgr.process_message(
                itask, INFO, self.task_events_mgr.EVENT_SUBMIT_FAILED,
                jp_ctx.time_submit_exit, flag)
        else:
            # The job never ran, and is in batch system
            self.task_events_mgr.process_message(
                itask, INFO, TASK_STATUS_SUBMITTED, jp_ctx.time_submit_exit,
                flag)
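
In the current output format, each poll result line carries the job log dir and a JSON-encoded context in fields 1 and 2; the decoded keys become JobPollContext attributes such as run_status, run_signal and time_run_exit. A sketch of just the parsing step on a hypothetical line (the key set shown is limited to attributes used above and is otherwise an assumption):

    import json

    # Sketch only: splitting and decoding a hypothetical poll result line.
    line = ('poll|1/foo/01|{"run_status": 0, "run_signal": null, '
            '"time_run": "2019-01-01T00:10:00Z", '
            '"time_run_exit": "2019-01-01T00:12:00Z"}')
    job_log_dir, context = line.split('|')[1:3]
    items = json.loads(context)
    print(job_log_dir, items["run_status"], items["time_run_exit"])
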