Example #1
0
    def poll_task_jobs(self, suite, itasks, poll_succ=True, msg=None):
        """Poll jobs of specified tasks.

        Any job that is or was submitted or running can be polled, except
        for retrying tasks, which would (correctly) poll as failed.
        Succeeded tasks are not polled by default.

        This method uses _poll_task_jobs_callback() and
        _manip_task_jobs_callback() as helper/callback methods.

        _poll_task_job_callback() handles one specific job.
        """
        to_poll_tasks = []
        pollable_statuses = set([
            TASK_STATUS_SUBMITTED, TASK_STATUS_RUNNING, TASK_STATUS_FAILED])
        if poll_succ:
            pollable_statuses.add(TASK_STATUS_SUCCEEDED)
        for itask in itasks:
            if itask.state.status in pollable_statuses:
                to_poll_tasks.append(itask)
            else:
                LOG.debug("skipping %s: not pollable, "
                          "or skipping 'succeeded' tasks" % itask.identity)
        if to_poll_tasks:
            if msg is not None:
                LOG.info(msg)
            self._run_job_cmd(
                self.JOBS_POLL, suite, to_poll_tasks,
                self._poll_task_jobs_callback)
Example #2
0
 def _prep_submit_task_job_error(self, suite, itask, dry_run, action, exc):
     """Helper for self._prep_submit_task_job. On error."""
     LOG.debug("submit_num %s" % itask.submit_num)
     LOG.debug(traceback.format_exc())
     LOG.error(exc)
     log_task_job_activity(SubProcContext(self.JOBS_SUBMIT,
                                          action,
                                          err=exc,
                                          ret_code=1),
                           suite,
                           itask.point,
                           itask.tdef.name,
                           submit_num=itask.submit_num)
     if not dry_run:
         # Persist
         self.suite_db_mgr.put_insert_task_jobs(
             itask, {
                 'is_manual_submit': itask.is_manual_submit,
                 'try_num': itask.get_try_num(),
                 'time_submit': get_current_time_string(),
                 'batch_sys_name': itask.summary.get('batch_sys_name'),
             })
         itask.is_manual_submit = False
         self.task_events_mgr.process_message(
             itask, CRITICAL, self.task_events_mgr.EVENT_SUBMIT_FAILED)
Example #3
0
 def _set_state(self, status):
     """Set, log and record task status (normal change, not forced - don't
     update task_events table)."""
     if self.status == self.hold_swap:
         self.hold_swap = None
     if status == self.status and self.hold_swap is None:
         return
     prev_status, prev_hold_swap = self.status, self.hold_swap
     if status == TASK_STATUS_HELD:
         self.hold_swap = self.status
     elif status in TASK_STATUSES_ACTIVE:
         if self.status == TASK_STATUS_HELD:
             self.hold_swap = TASK_STATUS_HELD
     elif (self.hold_swap == TASK_STATUS_HELD and
             status not in TASK_STATUSES_FINAL):
         self.hold_swap = status
         status = TASK_STATUS_HELD
     elif self.hold_swap:
         self.hold_swap = None
     self.status = status
     self.time_updated = get_current_time_string()
     self.is_updated = True
     # Log
     message = str(prev_status)
     if prev_hold_swap:
         message += " (%s)" % prev_hold_swap
     message += " => %s" % self.status
     if self.hold_swap:
         message += " (%s)" % self.hold_swap
     LOG.debug("[%s] -%s", self.identity, message)
     return (prev_status, prev_hold_swap)
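
The hold_swap bookkeeping above is the subtle part: holding a task remembers the status it displaces, so releasing can restore it. A toy sketch of that idea (hypothetical statuses, not the real TaskState class):

# Holding remembers the displaced status; releasing restores it.
HELD = 'held'

class ToyState:
    def __init__(self, status):
        self.status = status
        self.hold_swap = None  # status hidden behind 'held', if any

    def hold(self):
        if self.status != HELD:
            self.status, self.hold_swap = HELD, self.status

    def release(self):
        if self.status == HELD and self.hold_swap is not None:
            self.status, self.hold_swap = self.hold_swap, None

state = ToyState('waiting')
state.hold()
print(state.status, state.hold_swap)  # held waiting
state.release()
print(state.status, state.hold_swap)  # waiting None
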
Example #5
0
    def _check_access_priv_and_report(self,
                                      required_privilege_level,
                                      log_info=True):
        """Check access privilege and log requests with identifying info.

        In debug mode log all requests including task messages. Otherwise log
        all user commands, and just the first info command from each client.

        Return:
            dict: containing the client session

        """
        self._check_access_priv(required_privilege_level)
        command = inspect.currentframe().f_back.f_code.co_name
        auth_user, prog_name, user, host, uuid = _get_client_info()
        priv_level = self._get_priv_level(auth_user)
        LOG.debug(self.LOG_CONNECT_ALLOWED_TMPL, user, host, prog_name,
                  priv_level, uuid)
        if cylc.flags.debug or (uuid not in self.clients and log_info):
            LOG.info(self.LOG_COMMAND_TMPL, command, user, host, prog_name,
                     uuid)
        self.clients.setdefault(uuid, {})
        self.clients[uuid]['time'] = time()
        self._housekeep()
        return self.clients[uuid]
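
The command variable above is recovered from the caller's stack frame rather than passed in. A standalone sketch of that introspection idiom (made-up function names):

import inspect

def _calling_function_name():
    # f_back is the caller's frame; co_name is its function name
    return inspect.currentframe().f_back.f_code.co_name

def put_suite_stop():
    return _calling_function_name()

print(put_suite_stop())  # -> 'put_suite_stop'
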
Example #6
0
 def _set_state(self, status):
     """Set, log and record task status (normal change, not forced - don't
     update task_events table)."""
     if self.status == self.hold_swap:
         self.hold_swap = None
     if status == self.status and self.hold_swap is None:
         return
     prev_status, prev_hold_swap = self.status, self.hold_swap
     if status == TASK_STATUS_HELD:
         self.hold_swap = self.status
     elif status in TASK_STATUSES_ACTIVE:
         if self.status == TASK_STATUS_HELD:
             self.hold_swap = TASK_STATUS_HELD
     elif (self.hold_swap == TASK_STATUS_HELD
           and status not in TASK_STATUSES_FINAL):
         self.hold_swap = status
         status = TASK_STATUS_HELD
     elif self.hold_swap:
         self.hold_swap = None
     self.status = status
     self.time_updated = get_current_time_string()
     self.is_updated = True
     # Log
     message = str(prev_status)
     if prev_hold_swap:
         message += " (%s)" % prev_hold_swap
     message += " => %s" % self.status
     if self.hold_swap:
         message += " (%s)" % self.hold_swap
     LOG.debug("[%s] -%s", self.identity, message)
     return (prev_status, prev_hold_swap)
Example #7
0
 def _forget_client(self, uuid):
     """Forget a client."""
     try:
         client_info = self.clients.pop(uuid)
     except KeyError:
         return False
     if client_info.get('err_log_handler') is not None:
         LOG.removeHandler(client_info.get('err_log_handler'))
     LOG.debug(self.LOG_FORGET_TMPL, uuid)
     return True
Example #8
0
 def _manip_task_jobs_callback(
         self, ctx, suite, itasks, summary_callback, more_callbacks=None):
     """Callback when submit/poll/kill tasks command exits."""
     if ctx.ret_code:
         LOG.error(ctx)
     else:
         LOG.debug(ctx)
     # A dict for easy reference of (CYCLE, NAME, SUBMIT_NUM) -> TaskProxy
     #
     # Note for "reload": A TaskProxy instance may be replaced on reload, so
     # the "itasks" list may not reference the TaskProxy objects that
     # replace the old ones. The .reload_successor attribute provides the
     # link(s) for us to get to the latest replacement.
     #
     # Note for "kill": It is possible for a job to trigger its trap and
     # report back to the suite back this logic is called. If so, the task
     # will no longer be TASK_STATUS_SUBMITTED or TASK_STATUS_RUNNING, and
     # its output line will be ignored here.
     tasks = {}
     for itask in itasks:
         while itask.reload_successor is not None:
             itask = itask.reload_successor
         if itask.point is not None and itask.submit_num:
             submit_num = "%02d" % (itask.submit_num)
             tasks[(str(itask.point), itask.tdef.name, submit_num)] = itask
     handlers = [(self.batch_sys_mgr.OUT_PREFIX_SUMMARY, summary_callback)]
     if more_callbacks:
         for prefix, callback in more_callbacks.items():
             handlers.append((prefix, callback))
     out = ctx.out
     if not out:
         out = ""
     bad_tasks = dict(tasks)
     for line in out.splitlines(True):
         for prefix, callback in handlers:
             if line.startswith(prefix):
                 line = line[len(prefix):].strip()
                 try:
                     path = line.split("|", 2)[1]  # timestamp, path, status
                     point, name, submit_num = path.split(os.sep, 2)
                     if prefix == self.batch_sys_mgr.OUT_PREFIX_SUMMARY:
                         del bad_tasks[(point, name, submit_num)]
                     itask = tasks[(point, name, submit_num)]
                     callback(suite, itask, ctx, line)
                 except (LookupError, ValueError) as exc:
                     LOG.warning(
                         'Unhandled %s output: %s', ctx.cmd_key, line)
                     LOG.exception(exc)
     # Task jobs that are in the original command but did not get a status
     # in the output. Handle as failures.
     for key, itask in sorted(bad_tasks.items()):
         line = (
             "|".join([ctx.timestamp, os.sep.join(key), "1"]) + "\n")
         summary_callback(suite, itask, ctx, line)
Example #10
0
 def _remote_host_select_callback(self, proc_ctx, cmd_str):
     """Callback when host select command exits"""
     self.ready = True
     if proc_ctx.ret_code == 0 and proc_ctx.out:
         # Good status
         LOG.debug(proc_ctx)
         self.remote_host_str_map[cmd_str] = proc_ctx.out.splitlines()[0]
     else:
         # Bad status
         LOG.error(proc_ctx)
         self.remote_host_str_map[cmd_str] = TaskRemoteMgmtError(
             TaskRemoteMgmtError.MSG_SELECT, (cmd_str, None), cmd_str,
             proc_ctx.ret_code, proc_ctx.out, proc_ctx.err)
Example #12
0
    async def async_request(self, command, args=None, timeout=None):
        """Send an asynchronous request using asyncio.

        Has the same arguments and return values as ``serial_request``.

        """
        if timeout:
            timeout = float(timeout)
        timeout = (timeout * 1000 if timeout else None) or self.timeout
        if not args:
            args = {}

        # get secret for this request
        # assumes secret won't change during the request
        try:
            secret = self.secret()
        except cylc.suite_srv_files_mgr.SuiteServiceFileError:
            raise ClientError('could not read suite passphrase')

        # send message
        msg = {'command': command, 'args': args}
        msg.update(self.header)
        LOG.debug('zmq:send %s' % msg)
        message = encrypt(msg, secret)
        self.socket.send_string(message)

        # receive response
        if self.poller.poll(timeout):
            res = await self.socket.recv_string()
        else:
            if self.timeout_handler:
                self.timeout_handler()
            raise ClientTimeout('Timeout waiting for server response.')

        try:
            response = decrypt(res, secret)
            LOG.debug('zmq:recv %s' % response)
        except jose.exceptions.JWTError:
            raise ClientError(
                'Could not decrypt response. Has the passphrase changed?')

        try:
            return response['data']
        except KeyError:
            error = response['error']
            raise ClientError(error['message'], error.get('traceback'))
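
The timeout juggling at the top of async_request is easy to misread: seconds are converted to milliseconds, and any falsy value (None, or zero) falls back to the client default. A runnable restatement (DEFAULT_TIMEOUT_MS stands in for self.timeout):

DEFAULT_TIMEOUT_MS = 5000  # stand-in for self.timeout

def effective_timeout(timeout):
    # seconds -> milliseconds, falling back to the client default
    if timeout:
        timeout = float(timeout)
    return (timeout * 1000 if timeout else None) or DEFAULT_TIMEOUT_MS

print(effective_timeout(10))    # 10000.0 ms
print(effective_timeout(None))  # 5000 ms (default)
print(effective_timeout(0))     # 5000 ms: zero also falls back
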
Example #13
0
    def callback(self, ctx):
        """Callback for asynchronous xtrigger functions.

        Record satisfaction status and function results dict.

        """
        LOG.debug(ctx)
        sig = ctx.get_signature()
        self.active.remove(sig)
        try:
            satisfied, results = json.loads(ctx.out)
        except (ValueError, TypeError):
            return
        LOG.debug('%s: returned %s' % (sig, results))
        if satisfied:
            self.pflag = True
            self.sat_xtrig[sig] = results
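
The json.loads(ctx.out) above implies a contract: the runner of the xtrigger function emits a (satisfied, results) pair as JSON on stdout. A hypothetical xtrigger sketch of that contract (function and field names are illustrative only):

import json

def my_xtrigger(point):
    # Hypothetical check; an xtrigger returns (satisfied, results-dict)
    results = {'point': point, 'file': 'data-%s.nc' % point}
    return True, results

# The function runner serialises the returned pair; callback() later
# recovers it with json.loads(ctx.out):
out = json.dumps(my_xtrigger('2025'))
satisfied, results = json.loads(out)
print(satisfied, results)
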
Example #14
0
    def _dump_item(path, item, value):
        """Dump "value" to a file called "item" in the directory "path".

        1. File permission should already be user-read-write-only on
           creation by mkstemp.
        2. The combination of os.fsync and os.rename should guarantee
           that we don't end up with an incomplete file.
        """
        mkdir_p(path)
        from tempfile import NamedTemporaryFile
        handle = NamedTemporaryFile(prefix=item, dir=path, delete=False)
        handle.write(value)
        handle.flush()  # flush Python's buffer before fsync
        os.fsync(handle.fileno())
        handle.close()
        fname = os.path.join(path, item)
        os.rename(handle.name, fname)
        LOG.debug('Generated %s', fname)
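
The write-fsync-rename sequence above is a general atomic-update pattern: the target name only ever points at a complete, flushed file. A minimal standalone sketch (hypothetical path and content):

import os
from tempfile import NamedTemporaryFile

def atomic_write(path, item, data):
    os.makedirs(path, exist_ok=True)
    handle = NamedTemporaryFile(prefix=item, dir=path, delete=False)
    handle.write(data)
    handle.flush()
    os.fsync(handle.fileno())  # force content to disk first
    handle.close()
    # rename within one filesystem is atomic on POSIX
    os.rename(handle.name, os.path.join(path, item))

atomic_write('/tmp/demo-dir', 'item', b'content\n')
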
Example #15
0
    def callback(self, ctx):
        """Callback for asynchronous xtrigger functions.

        Record satisfaction status and function results dict.

        """
        LOG.debug(ctx)
        sig = ctx.get_signature()
        self.active.remove(sig)
        try:
            satisfied, results = json.loads(ctx.out)
        except ValueError:
            return
        LOG.debug('%s: returned %s' % (sig, results))
        if satisfied:
            self.pflag = True
            self.sat_xtrig[sig] = results
Example #16
0
 def _run_command_init(cls, ctx, callback=None, callback_args=None):
     """Prepare and launch shell command in ctx."""
     try:
         if ctx.cmd_kwargs.get('stdin_files'):
             if len(ctx.cmd_kwargs['stdin_files']) > 1:
                 stdin_file = TemporaryFile()
                 for file_ in ctx.cmd_kwargs['stdin_files']:
                     if hasattr(file_, 'read'):
                         stdin_file.write(file_.read())
                     else:
                         with open(file_, 'rb') as file_handle:
                             stdin_file.write(file_handle.read())
                 stdin_file.seek(0)
             elif hasattr(ctx.cmd_kwargs['stdin_files'][0], 'read'):
                 stdin_file = ctx.cmd_kwargs['stdin_files'][0]
             else:
                 stdin_file = open(ctx.cmd_kwargs['stdin_files'][0], 'rb')
         elif ctx.cmd_kwargs.get('stdin_str'):
             stdin_file = TemporaryFile('bw+')
             stdin_file.write(ctx.cmd_kwargs.get('stdin_str').encode())
             stdin_file.seek(0)
         else:
             stdin_file = open(os.devnull)
         proc = procopen(
             ctx.cmd,
             stdin=stdin_file,
             stdoutpipe=True,
             stderrpipe=True,
             # Execute command as a process group leader,
             # so we can use "os.killpg" to kill the whole group.
             preexec_fn=os.setpgrp,
             env=ctx.cmd_kwargs.get('env'),
             usesh=ctx.cmd_kwargs.get('shell'))
         # calls to open a shell are aggregated in cylc_subproc.procopen()
         # with logging for what is calling it and the commands given
     except (IOError, OSError) as exc:
         if exc.filename is None:
             exc.filename = ctx.cmd[0]
         LOG.exception(exc)
         ctx.ret_code = 1
         ctx.err = str(exc)
         cls._run_command_exit(ctx, callback, callback_args)
         return None
     else:
         LOG.debug(ctx.cmd)
         return proc
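
The preexec_fn=os.setpgrp detail above is what later makes it possible to kill the command and everything it spawned in one call. A minimal sketch of that pairing (POSIX only):

import os
import signal
import subprocess

# Start the command as a new process group leader...
proc = subprocess.Popen(['sleep', '60'], preexec_fn=os.setpgrp)
# ...so the group leader's PID doubles as the process group ID,
# and the whole group can be terminated together:
os.killpg(proc.pid, signal.SIGTERM)
proc.wait()
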
Example #17
0
 def _manip_task_jobs_callback(
         self, ctx, suite, itasks, summary_callback, more_callbacks=None):
     """Callback when submit/poll/kill tasks command exits."""
     if ctx.ret_code:
         LOG.error(ctx)
     else:
         LOG.debug(ctx)
     tasks = {}
     # Note for "kill": It is possible for a job to trigger its trap and
     # report back to the suite before this logic is called. If so, the task
     # will no longer be TASK_STATUS_SUBMITTED or TASK_STATUS_RUNNING, and
     # its output line will be ignored here.
     for itask in itasks:
         if itask.point is not None and itask.submit_num:
             submit_num = "%02d" % (itask.submit_num)
             tasks[(str(itask.point), itask.tdef.name, submit_num)] = itask
     handlers = [(self.batch_sys_mgr.OUT_PREFIX_SUMMARY, summary_callback)]
     if more_callbacks:
         for prefix, callback in more_callbacks.items():
             handlers.append((prefix, callback))
     out = ctx.out
     if not out:
         out = ""
         # Something is very wrong here
         # Fallback to use "job_log_dirs" list to report the problem
         job_log_dirs = ctx.cmd_kwargs.get("job_log_dirs", [])
         for job_log_dir in job_log_dirs:
             point, name, submit_num = job_log_dir.split(os.sep, 2)
             itask = tasks[(point, name, submit_num)]
             out += (self.batch_sys_mgr.OUT_PREFIX_SUMMARY +
                     "|".join([ctx.timestamp, job_log_dir, "1"]) + "\n")
     for line in out.splitlines(True):
         for prefix, callback in handlers:
             if line.startswith(prefix):
                 line = line[len(prefix):].strip()
                 try:
                     path = line.split("|", 2)[1]  # timestamp, path, status
                     point, name, submit_num = path.split(os.sep, 2)
                     itask = tasks[(point, name, submit_num)]
                     callback(suite, itask, ctx, line)
                 except (LookupError, ValueError) as exc:
                     LOG.warning(
                         'Unhandled %s output: %s', ctx.cmd_key, line)
                     LOG.exception(exc)
Example #18
0
 def _prep_submit_task_job_error(self, suite, itask, dry_run, action, exc):
     """Helper for self._prep_submit_task_job. On error."""
     LOG.debug("submit_num %s" % itask.submit_num)
     LOG.debug(traceback.format_exc())
     LOG.error(exc)
     log_task_job_activity(
         SubProcContext(self.JOBS_SUBMIT, action, err=exc, ret_code=1),
         suite, itask.point, itask.tdef.name, submit_num=itask.submit_num)
     if not dry_run:
         # Persist
         self.suite_db_mgr.put_insert_task_jobs(itask, {
             'is_manual_submit': itask.is_manual_submit,
             'try_num': itask.get_try_num(),
             'time_submit': get_current_time_string(),
             'batch_sys_name': itask.summary.get('batch_sys_name'),
         })
         itask.is_manual_submit = False
         self.task_events_mgr.process_message(
             itask, CRITICAL, self.task_events_mgr.EVENT_SUBMIT_FAILED)
Example #19
0
 def _report_id_requests(self):
     """Report the frequency of identification (scan) requests."""
     self._num_id_requests += 1
     now = time()
     interval = now - self._id_start_time
     if interval > self.CLIENT_ID_REPORT_SECONDS:
         rate = float(self._num_id_requests) / interval
         if rate > self.CLIENT_ID_MIN_REPORT_RATE:
             LOG.warning(self.LOG_IDENTIFY_TMPL, self._num_id_requests,
                         interval)
         else:
             LOG.debug(self.LOG_IDENTIFY_TMPL, self._num_id_requests,
                       interval)
         self._id_start_time = now
         self._num_id_requests = 0
     uuid = _get_client_info()[4]
     self.clients.setdefault(uuid, {})
     self.clients[uuid]['time'] = now
     self._housekeep()
Example #20
0
 def load(self):
     """Load or reload configuration from files."""
     self.sparse.clear()
     self.dense.clear()
     LOG.debug("Loading site/user global config files")
     conf_path_str = os.getenv("CYLC_CONF_PATH")
     if conf_path_str is None:
         # CYLC_CONF_PATH not defined, use default locations.
         for conf_dir_1, conf_dir_2, conf_type in [
             (self.SITE_CONF_DIR, self.SITE_CONF_DIR_OLD,
              upgrader.SITE_CONFIG),
             (self.USER_CONF_DIR_1, self.USER_CONF_DIR_2,
              upgrader.USER_CONFIG)
         ]:
             fname1 = os.path.join(conf_dir_1, self.CONF_BASE)
             fname2 = os.path.join(conf_dir_2, self.CONF_BASE)
             if os.access(fname1, os.F_OK | os.R_OK):
                 fname = fname1
             elif os.access(fname2, os.F_OK | os.R_OK):
                 fname = fname2
             else:
                 continue
             try:
                 self.loadcfg(fname, conf_type)
             except ParsecError as exc:
                 if conf_type == upgrader.SITE_CONFIG:
                     # Warn on bad site file (users can't fix it).
                     LOG.warning('ignoring bad %s %s:\n%s', conf_type,
                                 fname, exc)
                 else:
                     # Abort on bad user file (users can fix it).
                     LOG.error('bad %s %s', conf_type, fname)
                     raise
                 break
     elif conf_path_str:
         # CYLC_CONF_PATH defined with a value
         for path in conf_path_str.split(os.pathsep):
             fname = os.path.join(path, self.CONF_BASE)
             if os.access(fname, os.F_OK | os.R_OK):
                 self.loadcfg(fname, upgrader.USER_CONFIG)
     # (OK if no global.rc is found, just use system defaults).
     self.transform()
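
The branches above give CYLC_CONF_PATH three behaviours: unset means search the default site/user locations, a non-empty value means search only the listed directories, and a defined-but-empty value disables site/user config entirely. A quick sketch (hypothetical directories):

import os

# Search only these directories for the config file, in order:
os.environ['CYLC_CONF_PATH'] = os.pathsep.join(
    ['/etc/cylc', os.path.expanduser('~/.cylc')])

# Defined but empty: skip site/user config, keep system defaults:
os.environ['CYLC_CONF_PATH'] = ''
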
Example #21
0
    def _dump_item(path, item, value):
        """Dump "value" to a file called "item" in the directory "path".

        1. File permission should already be user-read-write-only on
           creation by mkstemp.
        2. The combination of os.fsync and os.rename should guarantee
           that we don't end up with an incomplete file.
        """
        os.makedirs(path, exist_ok=True)
        from tempfile import NamedTemporaryFile
        handle = NamedTemporaryFile(prefix=item, dir=path, delete=False)
        try:
            handle.write(value.encode())
        except AttributeError:
            handle.write(value)
        handle.flush()  # flush Python's buffer before fsync
        os.fsync(handle.fileno())
        handle.close()
        fname = os.path.join(path, item)
        os.rename(handle.name, fname)
        LOG.debug('Generated %s', fname)
Example #22
0
 def load(self):
     """Load or reload configuration from files."""
     self.sparse.clear()
     self.dense.clear()
     LOG.debug("Loading site/user global config files")
     conf_path_str = os.getenv("CYLC_CONF_PATH")
     if conf_path_str is None:
         # CYLC_CONF_PATH not defined, use default locations.
         for conf_dir_1, conf_dir_2, conf_type in [
                 (self.SITE_CONF_DIR, self.SITE_CONF_DIR_OLD,
                  upgrader.SITE_CONFIG),
                 (self.USER_CONF_DIR_1, self.USER_CONF_DIR_2,
                  upgrader.USER_CONFIG)]:
             fname1 = os.path.join(conf_dir_1, self.CONF_BASE)
             fname2 = os.path.join(conf_dir_2, self.CONF_BASE)
             if os.access(fname1, os.F_OK | os.R_OK):
                 fname = fname1
             elif os.access(fname2, os.F_OK | os.R_OK):
                 fname = fname2
             else:
                 continue
             try:
                 self.loadcfg(fname, conf_type)
             except ParsecError as exc:
                 if conf_type == upgrader.SITE_CONFIG:
                     # Warn on bad site file (users can't fix it).
                     LOG.warning(
                         'ignoring bad %s %s:\n%s', conf_type, fname, exc)
                 else:
                     # Abort on bad user file (users can fix it).
                     LOG.error('bad %s %s', conf_type, fname)
                     raise
                 break
     elif conf_path_str:
         # CYLC_CONF_PATH defined with a value
         for path in conf_path_str.split(os.pathsep):
             fname = os.path.join(path, self.CONF_BASE)
             if os.access(fname, os.F_OK | os.R_OK):
                 self.loadcfg(fname, upgrader.USER_CONFIG)
     # (OK if no global.rc is found, just use system defaults).
     self.transform()
Example #23
0
 def _remote_init_callback(self, proc_ctx, host, owner, tmphandle):
     """Callback when "cylc remote-init" exits"""
     self.ready = True
     try:
         tmphandle.close()
     except OSError:  # e.g. ignore bad unlink
         pass
     if proc_ctx.ret_code == 0:
         for status in (REMOTE_INIT_DONE, REMOTE_INIT_NOT_REQUIRED):
             if status in proc_ctx.out:
                 # Good status
                 LOG.debug(proc_ctx)
                 self.remote_init_map[(host, owner)] = status
                 return
     # Bad status
     LOG.error(
         TaskRemoteMgmtError(TaskRemoteMgmtError.MSG_INIT, (host, owner),
                             ' '.join(quote(item) for item in proc_ctx.cmd),
                             proc_ctx.ret_code, proc_ctx.out, proc_ctx.err))
     LOG.error(proc_ctx)
     self.remote_init_map[(host, owner)] = REMOTE_INIT_FAILED
Example #24
0
 def _remote_init_callback(self, proc_ctx, host, owner, tmphandle):
     """Callback when "cylc remote-init" exits"""
     self.ready = True
     try:
         tmphandle.close()
     except OSError:  # e.g. ignore bad unlink
         pass
     if proc_ctx.ret_code == 0:
         for status in (REMOTE_INIT_DONE, REMOTE_INIT_NOT_REQUIRED):
             if status in proc_ctx.out:
                 # Good status
                 LOG.debug(proc_ctx)
                 self.remote_init_map[(host, owner)] = status
                 return
     # Bad status
     LOG.error(TaskRemoteMgmtError(
         TaskRemoteMgmtError.MSG_INIT,
         (host, owner), ' '.join(quote(item) for item in proc_ctx.cmd),
         proc_ctx.ret_code, proc_ctx.out, proc_ctx.err))
     LOG.error(proc_ctx)
     self.remote_init_map[(host, owner)] = REMOTE_INIT_FAILED
Example #25
0
    def _rank_good_hosts(self, all_host_stats):
        """Rank, by specified method, 'good' hosts to return the most suitable.

        Take a dictionary of hosts considered 'good' with the corresponding
        metric data, and rank them via the method specified in the global
        configuration, returning the lowest-ranked (taken as best) host.
        """
        # Convert all dict values from full metrics structures to single
        # metric data values corresponding to the rank method to rank with.
        hosts_with_vals_to_rank = dict(
            (host, metric[self.rank_method])
            for host, metric in all_host_stats.items())
        LOG.debug(
            "INFO: host %s values extracted are: %s", self.rank_method,
            "\n".join("  %s: %s" % item
                      for item in hosts_with_vals_to_rank.items()))

        # Sort new dict by value to return ascending-value ordered host list.
        sort_asc_hosts = sorted(hosts_with_vals_to_rank,
                                key=hosts_with_vals_to_rank.get)
        base_msg = ("good (metric-returning) hosts were ranked in the "
                    "following order, from most to least suitable: %s")
        if self.rank_method in ("memory", "disk-space:" + self.USE_DISK_PATH):
            # Want 'most free' i.e. highest => reverse asc. list for ranking.
            LOG.debug(base_msg, ', '.join(sort_asc_hosts[::-1]))
            return sort_asc_hosts[-1]
        else:  # A load av. is only poss. left; 'random' dealt with earlier.
            # Want lowest => ranking given by asc. list.
            LOG.debug(base_msg, ', '.join(sort_asc_hosts))
            return sort_asc_hosts[0]
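
The reverse-or-not decision above reduces to whether a bigger metric value is better. A toy version of the ranking (made-up hosts and metric values):

# Ascending sort by metric value; which end is 'best' depends on the metric.
host_stats = {'hostA': 0.80, 'hostB': 0.25, 'hostC': 1.50}
ranked = sorted(host_stats, key=host_stats.get)
print(ranked[0])   # hostB: lowest wins for e.g. load average
print(ranked[-1])  # hostC: highest wins for e.g. free memory
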
Example #26
0
    def _listener(self):
        """The server main loop, listen for and serve requests."""
        while True:
            # process any commands passed to the listener by its parent process
            if self.queue.qsize():
                command = self.queue.get()
                if command == 'STOP':
                    break
                else:
                    raise ValueError('Unknown command "%s"' % command)

            try:
                # wait RECV_TIMEOUT for a message
                msg = self.socket.recv_string()
            except zmq.error.Again:
                # timeout, continue with the loop, this allows the listener
                # thread to stop
                continue

            # attempt to decode the message, authenticating the user in the
            # process
            try:
                message = self.decode(msg, self.secret())
            except Exception as exc:  # purposefully catch generic exception
                # failed to decode message, possibly resulting from failed
                # authentication; reply with the error rather than returning,
                # so the client is not left waiting for a response
                import traceback
                response = self.encode({'error': {
                    'message': str(exc),
                    'traceback': traceback.format_exc(),
                }}, self.secret())
            else:
                # success case - serve the request
                LOG.debug('zmq:recv %s', message)
                res = self._receiver(message)
                response = self.encode(res, self.secret())
                LOG.debug('zmq:send %s', res)

            # send back the response
            self.socket.send_string(response)
            sleep(0)  # yield control to other threads
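
The zmq.error.Again branch above presumes a receive timeout is set on the socket; without one, recv_string() would block forever and the STOP command could never be seen. A minimal sketch of that stoppable REP loop (hypothetical port, standalone):

import zmq

context = zmq.Context()
socket = context.socket(zmq.REP)
socket.setsockopt(zmq.RCVTIMEO, 1000)  # ms; recv raises Again on timeout
socket.bind('tcp://127.0.0.1:5555')

while True:
    try:
        msg = socket.recv_string()
    except zmq.error.Again:
        continue  # timed out: loop again so a stop flag can be checked
    socket.send_string('ack: ' + msg)
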
Example #27
0
 def _run_command_init(cls, ctx, callback=None, callback_args=None):
     """Prepare and launch shell command in ctx."""
     try:
         if ctx.cmd_kwargs.get('stdin_file_paths'):
             if len(ctx.cmd_kwargs['stdin_file_paths']) > 1:
                 stdin_file = TemporaryFile()
                 for file_path in ctx.cmd_kwargs['stdin_file_paths']:
                     with open(file_path, 'rb') as file_handle:
                         stdin_file.write(file_handle.read())
                 stdin_file.seek(0)
             else:
                 stdin_file = open(ctx.cmd_kwargs['stdin_file_paths'][0],
                                   'rb')
         elif ctx.cmd_kwargs.get('stdin_str'):
             stdin_file = TemporaryFile()
             stdin_file.write(ctx.cmd_kwargs.get('stdin_str').encode())
             stdin_file.seek(0)
         else:
             stdin_file = open(os.devnull)
         proc = Popen(
             ctx.cmd,
             stdin=stdin_file,
             stdout=PIPE,
             stderr=PIPE,
             # Execute command as a process group leader,
             # so we can use "os.killpg" to kill the whole group.
             preexec_fn=os.setpgrp,
             env=ctx.cmd_kwargs.get('env'),
             shell=ctx.cmd_kwargs.get('shell'))
     except (IOError, OSError) as exc:
         if exc.filename is None:
             exc.filename = ctx.cmd[0]
         LOG.exception(exc)
         ctx.ret_code = 1
         ctx.err = str(exc)
         cls._run_command_exit(ctx, callback, callback_args)
         return None
     else:
         LOG.debug(ctx.cmd)
         return proc
Example #28
0
def log_task_job_activity(ctx, suite, point, name, submit_num=None):
    """Log an activity for a task job."""
    ctx_str = str(ctx)
    if not ctx_str:
        return
    if isinstance(ctx.cmd_key, tuple):  # An event handler
        submit_num = ctx.cmd_key[-1]
    job_activity_log = get_task_job_activity_log(
        suite, point, name, submit_num)
    try:
        with open(job_activity_log, "ab") as handle:
            handle.write((ctx_str + '\n').encode())
    except IOError as exc:
        # This happens when there is no job directory, e.g. if the job host
        # selection command causes a submission failure; in that case, just
        # send the information to the suite log.
        LOG.exception(exc)
        LOG.info(ctx_str)
    if ctx.cmd and ctx.ret_code:
        LOG.error(ctx_str)
    elif ctx.cmd:
        LOG.debug(ctx_str)
Example #29
0
    def _listener(self):
        """The server main loop, listen for and serve requests."""
        while True:
            # process any commands passed to the listener by its parent process
            if self.queue.qsize():
                command = self.queue.get()
                if command == 'STOP':
                    break
                else:
                    raise ValueError('Unknown command "%s"' % command)

            try:
                # wait RECV_TIMEOUT for a message
                msg = self.socket.recv_string()
            except zmq.error.Again:
                # timeout, continue with the loop, this allows the listener
                # thread to stop
                continue

            # attempt to decode the message, authenticating the user in the
            # process
            try:
                message = self.decode(msg, self.secret())
            except Exception as exc:  # purposefully catch generic exception
                # failed to decode message, possibly resulting from failed
                # authentication
                response = self.encode({'error': {
                    'message': str(exc)
                }}, self.secret())
            else:
                # success case - serve the request
                LOG.debug('zmq:recv %s', message)
                res = self._receiver(message)
                response = self.encode(res, self.secret())
                LOG.debug('zmq:send %s', res)

            # send back the response
            self.socket.send_string(response)
            sleep(0)  # yield control to other threads
Example #30
0
def log_task_job_activity(ctx, suite, point, name, submit_num=None):
    """Log an activity for a task job."""
    ctx_str = str(ctx)
    if not ctx_str:
        return
    if isinstance(ctx.cmd_key, tuple):  # An event handler
        submit_num = ctx.cmd_key[-1]
    job_activity_log = get_task_job_activity_log(suite, point, name,
                                                 submit_num)
    try:
        with open(job_activity_log, "ab") as handle:
            handle.write((ctx_str + '\n').encode())
    except IOError as exc:
        # This happens when there is no job directory, e.g. if the job host
        # selection command causes a submission failure; in that case, just
        # send the information to the suite log.
        LOG.exception(exc)
        LOG.info(ctx_str)
    if ctx.cmd and ctx.ret_code:
        LOG.error(ctx_str)
    elif ctx.cmd:
        LOG.debug(ctx_str)
Example #31
0
    def create_cylc_run_tree(self, suite):
        """Create all top-level cylc-run output dirs on the suite host."""
        cfg = self.get()
        item = 'suite run directory'
        idir = self.get_derived_host_item(suite, item)
        LOG.debug('creating %s: %s', item, idir)
        if cfg['enable run directory housekeeping']:
            self.roll_directory(idir, item,
                                cfg['run directory rolling archive length'])

        for item in [
                'suite log directory', 'suite job log directory',
                'suite config log directory', 'suite work directory',
                'suite share directory'
        ]:
            idir = self.get_derived_host_item(suite, item)
            LOG.debug('creating %s: %s', item, idir)
            self.create_directory(idir, item)

        item = 'temporary directory'
        value = cfg[item]
        if value:
            self.create_directory(value, item)
Example #32
0
 def _job_logs_retrieval_callback(self, proc_ctx, schd_ctx):
     """Call back when log job retrieval completes."""
     if proc_ctx.ret_code:
         LOG.error(proc_ctx)
     else:
         LOG.debug(proc_ctx)
     for id_key in proc_ctx.cmd_kwargs["id_keys"]:
         key1, point, name, submit_num = id_key
         try:
             # All completed jobs are expected to have a "job.out".
             fnames = [JOB_LOG_OUT]
             try:
                 if key1[1] not in 'succeeded':
                     fnames.append(JOB_LOG_ERR)
             except TypeError:
                 pass
             fname_oks = {}
             for fname in fnames:
                 fname_oks[fname] = os.path.exists(
                     get_task_job_log(schd_ctx.suite, point, name,
                                      submit_num, fname))
             # All expected paths must exist to record a good attempt
             log_ctx = SubProcContext((key1, submit_num), None)
             if all(fname_oks.values()):
                 log_ctx.ret_code = 0
                 del self.event_timers[id_key]
             else:
                 log_ctx.ret_code = 1
                 log_ctx.err = "File(s) not retrieved:"
                 for fname, exist_ok in sorted(fname_oks.items()):
                     if not exist_ok:
                         log_ctx.err += " %s" % fname
                 self.event_timers[id_key].unset_waiting()
             log_task_job_activity(log_ctx, schd_ctx.suite, point, name,
                                   submit_num)
         except KeyError as exc:
             LOG.exception(exc)
Example #33
0
 def _job_logs_retrieval_callback(self, proc_ctx, schd_ctx):
     """Call back when log job retrieval completes."""
     if proc_ctx.ret_code:
         LOG.error(proc_ctx)
     else:
         LOG.debug(proc_ctx)
     for id_key in proc_ctx.cmd_kwargs["id_keys"]:
         key1, point, name, submit_num = id_key
         try:
             # All completed jobs are expected to have a "job.out".
             fnames = [JOB_LOG_OUT]
             try:
                 if key1[1] not in 'succeeded':
                     fnames.append(JOB_LOG_ERR)
             except TypeError:
                 pass
             fname_oks = {}
             for fname in fnames:
                 fname_oks[fname] = os.path.exists(get_task_job_log(
                     schd_ctx.suite, point, name, submit_num, fname))
             # All expected paths must exist to record a good attempt
             log_ctx = SubProcContext((key1, submit_num), None)
             if all(fname_oks.values()):
                 log_ctx.ret_code = 0
                 del self.event_timers[id_key]
             else:
                 log_ctx.ret_code = 1
                 log_ctx.err = "File(s) not retrieved:"
                 for fname, exist_ok in sorted(fname_oks.items()):
                     if not exist_ok:
                         log_ctx.err += " %s" % fname
                 self.event_timers[id_key].unset_waiting()
             log_task_job_activity(
                 log_ctx, schd_ctx.suite, point, name, submit_num)
         except KeyError as exc:
             LOG.exception(exc)
Example #34
0
    def create_cylc_run_tree(self, suite):
        """Create all top-level cylc-run output dirs on the suite host."""
        cfg = self.get()
        item = 'suite run directory'
        idir = self.get_derived_host_item(suite, item)
        LOG.debug('creating %s: %s', item, idir)
        if cfg['enable run directory housekeeping']:
            self.roll_directory(
                idir, item, cfg['run directory rolling archive length'])

        for item in [
                'suite log directory',
                'suite job log directory',
                'suite config log directory',
                'suite work directory',
                'suite share directory']:
            idir = self.get_derived_host_item(suite, item)
            LOG.debug('creating %s: %s', item, idir)
            self.create_directory(idir, item)

        item = 'temporary directory'
        value = cfg[item]
        if value:
            self.create_directory(value, item)
Example #35
0
    def _prep_submit_task_job(self, suite, itask, dry_run, check_syntax=True):
        """Prepare a task job submission.

        Return itask on a good preparation.

        """
        if itask.local_job_file_path and not dry_run:
            return itask

        # Handle broadcasts
        overrides = self.task_events_mgr.broadcast_mgr.get_broadcast(
            itask.identity)
        if overrides:
            rtconfig = pdeepcopy(itask.tdef.rtconfig)
            poverride(rtconfig, overrides, prepend=True)
        else:
            rtconfig = itask.tdef.rtconfig

        # Determine task host settings now, just before job submission,
        # because dynamic host selection may be used.
        try:
            task_host = self.task_remote_mgr.remote_host_select(
                rtconfig['remote']['host'])
        except TaskRemoteMgmtError as exc:
            # Submit number not yet incremented
            itask.submit_num += 1
            itask.summary['job_hosts'][itask.submit_num] = ''
            # Retry delays, needed for the try_num
            self._set_retry_timers(itask, rtconfig)
            self._prep_submit_task_job_error(
                suite, itask, dry_run, '(remote host select)', exc)
            return False
        else:
            if task_host is None:  # host select not ready
                itask.set_summary_message(self.REMOTE_SELECT_MSG)
                return
            itask.task_host = task_host
            # Submit number not yet incremented
            itask.submit_num += 1
            # Retry delays, needed for the try_num
            self._set_retry_timers(itask, rtconfig)

        try:
            job_conf = self._prep_submit_task_job_impl(suite, itask, rtconfig)
            local_job_file_path = get_task_job_job_log(
                suite, itask.point, itask.tdef.name, itask.submit_num)
            self.job_file_writer.write(local_job_file_path, job_conf,
                                       check_syntax=check_syntax)
        except Exception as exc:
            # Could be a bad command template, IOError, etc
            self._prep_submit_task_job_error(
                suite, itask, dry_run, '(prepare job file)', exc)
            return False
        itask.local_job_file_path = local_job_file_path

        if dry_run:
            itask.set_summary_message('job file written (edit/dry-run)')
            LOG.debug('[%s] -%s', itask, itask.summary['latest_message'])

        # Return value used by "cylc submit" and "cylc jobscript":
        return itask
Example #36
0
    def submit_task_jobs(self, suite, itasks, is_simulation=False):
        """Prepare and submit task jobs.

        Submit tasks where possible. Ignore tasks that are waiting for the
        host select command to complete, or that are waiting for remote
        initialisation. A bad host select command, an error writing the job
        file, or failed remote initialisation makes a task bad, leading to
        submission failure.

        This method uses prep_submit_task_jobs() as a helper.

        Return (list): list of tasks that attempted submission.
        """
        if is_simulation:
            return self._simulation_submit_task_jobs(itasks)

        # Prepare tasks for job submission
        prepared_tasks, bad_tasks = self.prep_submit_task_jobs(suite, itasks)

        # Reset consumed host selection results
        self.task_remote_mgr.remote_host_select_reset()

        if not prepared_tasks:
            return bad_tasks

        # Group task jobs by (host, owner)
        auth_itasks = {}  # {(host, owner): [itask, ...], ...}
        for itask in prepared_tasks:
            auth_itasks.setdefault((itask.task_host, itask.task_owner), [])
            auth_itasks[(itask.task_host, itask.task_owner)].append(itask)
        # Submit task jobs for each (host, owner) group
        done_tasks = bad_tasks
        for (host, owner), itasks in sorted(auth_itasks.items()):
            is_init = self.task_remote_mgr.remote_init(host, owner)
            if is_init is None:
                # Remote is waiting to be initialised
                for itask in itasks:
                    itask.set_summary_message(self.REMOTE_INIT_MSG)
                continue
            # Ensure that localhost background/at jobs are recorded as running
            # on the host name of the current suite host, rather than just
            # "localhost". On suite restart on a different suite host, this
            # allows the restart logic to correctly poll the status of the
            # background/at jobs that may still be running on the previous
            # suite host.
            if (
                self.batch_sys_mgr.is_job_local_to_host(
                    itask.summary['batch_sys_name']) and
                not is_remote_host(host)
            ):
                owner_at_host = get_host()
            else:
                owner_at_host = host
            # Persist
            if owner:
                owner_at_host = owner + '@' + owner_at_host
            now_str = get_current_time_string()
            done_tasks.extend(itasks)
            for itask in itasks:
                # Log and persist
                LOG.info(
                    '[%s] -submit-num=%d, owner@host=%s',
                    itask, itask.submit_num, owner_at_host)
                self.suite_db_mgr.put_insert_task_jobs(itask, {
                    'is_manual_submit': itask.is_manual_submit,
                    'try_num': itask.get_try_num(),
                    'time_submit': now_str,
                    'user_at_host': owner_at_host,
                    'batch_sys_name': itask.summary['batch_sys_name'],
                })
                itask.is_manual_submit = False
            if is_init == REMOTE_INIT_FAILED:
                # Remote has failed to initialise
                # Set submit-failed for all affected tasks
                for itask in itasks:
                    itask.local_job_file_path = None  # reset for retry
                    log_task_job_activity(
                        SubProcContext(
                            self.JOBS_SUBMIT,
                            '(init %s)' % owner_at_host,
                            err=REMOTE_INIT_FAILED,
                            ret_code=1),
                        suite, itask.point, itask.tdef.name)
                    self.task_events_mgr.process_message(
                        itask, CRITICAL,
                        self.task_events_mgr.EVENT_SUBMIT_FAILED)
                continue
            # Build the "cylc jobs-submit" command
            cmd = ['cylc', self.JOBS_SUBMIT]
            if LOG.isEnabledFor(DEBUG):
                cmd.append('--debug')
            if get_utc_mode():
                cmd.append('--utc-mode')
            remote_mode = False
            kwargs = {}
            for key, value, test_func in [
                    ('host', host, is_remote_host),
                    ('user', owner, is_remote_user)]:
                if test_func(value):
                    cmd.append('--%s=%s' % (key, value))
                    remote_mode = True
                    kwargs[key] = value
            if remote_mode:
                cmd.append('--remote-mode')
            cmd.append('--')
            cmd.append(glbl_cfg().get_derived_host_item(
                suite, 'suite job log directory', host, owner))
            # Chop itasks into a series of shorter lists if it's very big
            # to prevent overloading of stdout and stderr pipes.
            itasks = sorted(itasks, key=lambda itask: itask.identity)
            chunk_size = len(itasks) // ((len(itasks) // 100) + 1) + 1
            itasks_batches = [
                itasks[i:i + chunk_size] for i in range(0,
                                                        len(itasks),
                                                        chunk_size)]
            LOG.debug(
                '%s ... # will invoke in batches, sizes=%s',
                cmd, [len(b) for b in itasks_batches])
            for i, itasks_batch in enumerate(itasks_batches):
                stdin_files = []
                job_log_dirs = []
                for itask in itasks_batch:
                    if remote_mode:
                        stdin_files.append(
                            get_task_job_job_log(
                                suite, itask.point, itask.tdef.name,
                                itask.submit_num))
                    job_log_dirs.append(get_task_job_id(
                        itask.point, itask.tdef.name, itask.submit_num))
                    # The job file is now (about to be) used: reset the file
                    # write flag so that subsequent manual retrigger will
                    # generate a new job file.
                    itask.local_job_file_path = None
                    itask.state.reset_state(TASK_STATUS_READY)
                    if itask.state.outputs.has_custom_triggers():
                        self.suite_db_mgr.put_update_task_outputs(itask)
                self.proc_pool.put_command(
                    SubProcContext(
                        self.JOBS_SUBMIT,
                        cmd + job_log_dirs,
                        stdin_files=stdin_files,
                        job_log_dirs=job_log_dirs,
                        **kwargs
                    ),
                    self._submit_task_jobs_callback, [suite, itasks_batch])
        return done_tasks
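
The chunk_size arithmetic above splits a large task list into batches of at most about 100, keeping batch sizes roughly even. A quick check of the formula (batch_sizes is a made-up name for illustration):

def batch_sizes(n):
    """Sizes of the batches the submit loop above would use for n tasks."""
    chunk_size = n // ((n // 100) + 1) + 1
    return [min(chunk_size, n - i) for i in range(0, n, chunk_size)]

print(batch_sizes(99))   # [99]          -> one batch, no splitting
print(batch_sizes(250))  # [84, 84, 82]  -> three roughly even batches
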
Example #37
0
 def _load_remote_item(self, item, reg, owner, host):
     """Load content of service item from remote [owner@]host via SSH."""
     if not is_remote(host, owner):
         return
     if host is None:
         host = 'localhost'
     if owner is None:
         owner = get_user()
     if item == self.FILE_BASE_CONTACT and not is_remote_host(host):
         # Attempt to read suite contact file via the local filesystem.
         path = r'%(run_d)s/%(srv_base)s' % {
             'run_d': glbl_cfg().get_derived_host_item(
                 reg, 'suite run directory', 'localhost', owner,
                 replace_home=False),
             'srv_base': self.DIR_BASE_SRV,
         }
         content = self._load_local_item(item, path)
         if content is not None:
             return content
         # Else drop through and attempt via ssh to the suite account.
     # Prefix STDOUT to ensure returned content is relevant
     prefix = r'[CYLC-AUTH] %(suite)s' % {'suite': reg}
     # Attempt to cat passphrase file under suite service directory
     script = (
         r"""echo '%(prefix)s'; """
         r'''cat "%(run_d)s/%(srv_base)s/%(item)s"'''
     ) % {
         'prefix': prefix,
         'run_d': glbl_cfg().get_derived_host_item(
             reg, 'suite run directory', host, owner),
         'srv_base': self.DIR_BASE_SRV,
         'item': item
     }
     import shlex
     command = shlex.split(
         glbl_cfg().get_host_item('ssh command', host, owner))
     command += ['-n', owner + '@' + host, script]
     from subprocess import Popen, PIPE
     try:
         proc = Popen(
             command, stdin=open(os.devnull), stdout=PIPE, stderr=PIPE)
     except OSError:
         if cylc.flags.debug:
             import traceback
             traceback.print_exc()
         return
     out, err = (f.decode() for f in proc.communicate())
     ret_code = proc.wait()
     # Extract passphrase from STDOUT
     # It should live in the line with the correct prefix
     content = ""
     can_read = False
     for line in out.splitlines(True):
         if can_read:
             content += line
         elif line.strip() == prefix:
             can_read = True
     if not content or ret_code:
         LOG.debug(
             '$ %(command)s  # code=%(ret_code)s\n%(err)s',
             {
                 'command': command,
                 # STDOUT may contain passphrase, so not safe to print
                 # 'out': out,
                 'err': err,
                 'ret_code': ret_code,
             })
         return
     return content
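
The prefix scan above separates the wanted file content from any ssh noise (motd, shell banners) printed before it. A standalone sketch of that scan, with a made-up suite name and output:

prefix = '[CYLC-AUTH] my-suite'
out = 'Welcome to hostX\n%s\nline-1\nline-2\n' % prefix

content = ''
can_read = False
for line in out.splitlines(True):
    if can_read:
        content += line       # everything after the marker is content
    elif line.strip() == prefix:
        can_read = True       # found the marker line; start reading

print(repr(content))  # 'line-1\nline-2\n'
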
Example #38
0
def cylc_kafka_consumer(kafka_server, kafka_topic, group_id, message, debug):
    r"""Look for a matching message in a Kafka topic.

    ARGUMENTS:
     * kafka_server - Kafka server URL, e.g. "localhost:9092".
     * kafka_topic - the Kafka topic to check, e.g. "data-avail".
     * group_id - determines Kafka offset ownership (see below).
     * message - string-ified dict with optional pattern elements (see below).
     * debug - boolean; set by daemon debug mode; prints to suite err log.

    The topic is first consumed from the beginning, then from the previously
    committed offset. If the message is not found by the end of the topic,
    commit the offset and return (the next call will try again). If found,
    return the result.

    Kafka commits offsets per "consumer group" so the group_id argument
    must be unique per distinct trigger in the suite - this allows each trigger
    to separately consume the topic from the beginning, looking for its own
    messages (otherwise, with shared offsets, one trigger could move the offset
    beyond the messages of another trigger). This goes for successive instances
    of an externally-triggered cycling task too, because out-of-order
    triggering may sometimes be required. So this argument should typically
    be, e.g.:

        group_id=x%(id)s  # where %(id)s is the ID of the dependent task

    where "x" is an arbitrary string you can use to change the group name if
    you need to re-run the suite, and the messages, from the start again,
    without re-running the producer suite. Note this also serves to make the
    function signature cycle-point-specific for Cylc even if the message does
    not contain the cycle point (although it probably should).

    The "message" argument is a stringified dict, e.g.:
        {'system': 'prod', 'point': '2025', 'data': '<nwp.*\.nc>'}
    should be represented as:
        "system:prod point:2025 data:<nwp.*\.nc>"

    A match occurs Kafka if all message dict items match, and the result
    returned is the sub-dict of the actual values of items containing
    angle-bracket-delineated regex patterns. E.g. above {'data': 'nwp-2025.nc'}

    """

    consumer = KafkaConsumer(kafka_topic, bootstrap_servers=[kafka_server],
                             value_deserializer=json.loads,
                             consumer_timeout_ms=CONSUMER_TIMEOUT_MS,
                             auto_offset_reset='earliest',
                             group_id=group_id)

    # Construct a dict from the message argument "key1:val1 key2:val2 ...".
    cylc_msg = dict(m.split(':') for m in message.split())

    result = (False, {})
    n_cons = 0
    for kafka_msg in consumer:
        n_cons += 1
        m = _match_msg(cylc_msg, kafka_msg)
        if m:
            result = (True, m)
            break
        # (else consume and compare next message)
    consumer.commit()
    # Unsubscribe before exit, otherwise next call will be slow while
    # Kafka times out waiting for this original consumer connection.
    consumer.unsubscribe()
    if debug:
        if result[0]:
            res = "\n  MATCHED: %s" % result[1]
        else:
            res = "no match."
        LOG.debug('Kafka: "%s" (consumed %d) ... %s', message, n_cons, res)
    return result
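
_match_msg() is used above but not shown in this excerpt. A minimal sketch of the matching behaviour the docstring describes follows; the record shape, the '$' anchoring and the function body are assumptions, not the actual implementation.

import re
from collections import namedtuple

# Stand-in for a kafka-python record; only .value is used (a dict, since
# the consumer above deserializes with json.loads).
FakeRecord = namedtuple('FakeRecord', 'value')

def _match_msg(cylc_msg, kafka_msg):
    """Return the values captured by <regex> items if every item in
    cylc_msg matches kafka_msg.value, else None."""
    result = {}
    for key, val in cylc_msg.items():
        if key not in kafka_msg.value:
            return None
        if val.startswith('<') and val.endswith('>'):
            # Angle-bracket-delineated regex pattern: match and capture.
            if re.match(val[1:-1] + '$', str(kafka_msg.value[key])):
                result[key] = kafka_msg.value[key]
            else:
                return None
        elif str(kafka_msg.value[key]) != val:
            return None
    return result

cylc_msg = dict(
    m.split(':') for m in r"system:prod point:2025 data:<nwp.*\.nc>".split())
record = FakeRecord(
    value={'system': 'prod', 'point': '2025', 'data': 'nwp-2025.nc'})
print(_match_msg(cylc_msg, record))  # -> {'data': 'nwp-2025.nc'}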
Example #39
0
    def detect_old_contact_file(self, reg, check_host_port=None):
        """Detect old suite contact file.

        If an old contact file does not exist, do nothing. If one does exist
        but the suite process is definitely not alive, remove it. If one exists
        and the suite process is still alive, raise SuiteServiceFileError.

        If check_host_port is specified and does not match the (host, port)
        value in the old contact file, raise AssertionError.

        Args:
            reg (str): suite name
            check_host_port (tuple): (host, port) to check against

        Raise:
            AssertionError:
                If old contact file exists but does not have matching
                (host, port) with value of check_host_port.
            SuiteServiceFileError:
                If old contact file exists and the suite process still alive.
        """
        # An old suite of the same name may be running if a contact file exists
        # and can be loaded.
        try:
            data = self.load_contact_file(reg)
            old_host = data[self.KEY_HOST]
            old_port = data[self.KEY_PORT]
            old_proc_str = data[self.KEY_PROCESS]
        except (IOError, ValueError, SuiteServiceFileError):
            # Contact file does not exist or is corrupted; OK to proceed.
            return
        if check_host_port and check_host_port != (old_host, int(old_port)):
            raise AssertionError("%s != (%s, %s)" % (
                check_host_port, old_host, old_port))
        # Run the "ps" command to see if the process is still running or not.
        # If the old suite process is still running, it should show up with the
        # same command line as before.
        # Terminate command after 10 seconds to prevent hanging, etc.
        old_pid_str = old_proc_str.split(None, 1)[0].strip()
        cmd = ["timeout", "10", "ps", self.PS_OPTS, str(old_pid_str)]
        if is_remote_host(old_host):
            import shlex
            ssh_str = str(glbl_cfg().get_host_item("ssh command", old_host))
            cmd = shlex.split(ssh_str) + ["-n", old_host] + cmd
        from subprocess import Popen, PIPE
        from time import sleep, time
        proc = Popen(cmd, stdin=open(os.devnull), stdout=PIPE, stderr=PIPE)
        # Terminate command after 10 seconds to prevent hanging SSH, etc.
        timeout = time() + 10.0
        while proc.poll() is None:
            if time() > timeout:
                proc.terminate()
            sleep(0.1)
        fname = self.get_contact_file(reg)
        ret_code = proc.wait()
        out, err = (f.decode() for f in proc.communicate())
        if ret_code:
            LOG.debug("$ %s  # return %d\n%s", ' '.join(cmd), ret_code, err)
        for line in reversed(out.splitlines()):
            if line.strip() == old_proc_str:
                # Suite definitely still running
                break
            elif line.split(None, 1)[0].strip() == "PID":
                # Only "ps" header - "ps" has run, but no matching results.
                # Suite not running. Attempt to remove suite contact file.
                try:
                    os.unlink(fname)
                    return
                except OSError:
                    break

        raise SuiteServiceFileError(
            (
                r"""suite contact file exists: %(fname)s

Suite "%(suite)s" is already running, and listening at "%(host)s:%(port)s".

To start a new run, stop the old one first with one or more of these:
* cylc stop %(suite)s              # wait for active tasks/event handlers
* cylc stop --kill %(suite)s       # kill active tasks and wait
* cylc stop --now %(suite)s        # don't wait for active tasks
* cylc stop --now --now %(suite)s  # don't wait
* ssh -n "%(host)s" kill %(pid)s   # final brute force!
"""
            ) % {
                "host": old_host,
                "port": old_port,
                "pid": old_pid_str,
                "fname": fname,
                "suite": reg,
            }
        )
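
The core of the liveness test above can be tried in isolation. PS_OPTS is not shown in this excerpt, so "-opid,args" below is an assumption, and the recorded process string is made up.

import subprocess

# Process string recorded in the contact file at start-up (illustrative).
old_proc_str = '12345 python3 /opt/cylc/bin/cylc-run my.suite'
old_pid_str = old_proc_str.split(None, 1)[0].strip()
proc = subprocess.run(
    ['timeout', '10', 'ps', '-opid,args', old_pid_str],
    stdout=subprocess.PIPE, stderr=subprocess.PIPE)
out = proc.stdout.decode()
# Alive only if ps reports the same PID with the same command line; a
# header-only listing means the PID is gone (or has been recycled).
alive = any(line.strip() == old_proc_str for line in out.splitlines())
print(alive)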
Example #40
0
    async def async_request(self, command, args=None, timeout=None):
        """Send a request.

        For convenience use __call__ to call this method.

        Args:
            command (str): The name of the endpoint to call.
            args (dict): Arguments to pass to the endpoint function.
            timeout (float): Override the default timeout (seconds).

        Raises:
            ClientTimeout: If a response takes longer than timeout to arrive.
            ClientError: Coverall for all other issues including failed
                authentication.

        Returns:
            object: The data exactly as returned from the endpoint function,
                nothing more, nothing less.

        """
        if timeout:
            timeout = float(timeout)
        timeout = (timeout * 1000 if timeout else None) or self.timeout
        if not args:
            args = {}

        # get secret for this request
        # assumes secret won't change during the request
        try:
            secret = self.secret()
        except cylc.suite_srv_files_mgr.SuiteServiceFileError:
            raise ClientError({'message': 'could not read suite passphrase'})

        # send message
        msg = {'command': command, 'args': args}
        msg.update(self.header)
        LOG.debug('zmq:send %s' % msg)
        message = encrypt(msg, secret)
        self.socket.send_string(message)

        # receive response
        if self.poller.poll(timeout):
            res = await self.socket.recv_string()
        else:
            if self.timeout_handler:
                self.timeout_handler()
            raise ClientTimeout('Timeout waiting for server response.')

        try:
            response = decrypt(res, secret)
            LOG.debug('zmq:recv %s' % response)
        except jose.exceptions.JWTError:
            raise ClientError({
                'message': 'Could not decrypt response. '
                           'Has the passphrase changed?'
            })

        # return data or handle error
        if 'data' in response:
            return response['data']
        else:  # if else to avoid complicating the traceback stack
            raise ClientError(response['error'])
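
The request/response core of the method above is plain pyzmq. A self-contained sketch of the poll-then-receive pattern follows; the endpoint, payload and timeout are illustrative, and encryption is omitted.

import zmq
import zmq.asyncio

async def request(endpoint='tcp://localhost:5555', timeout_ms=3000):
    ctx = zmq.asyncio.Context()
    socket = ctx.socket(zmq.REQ)
    socket.connect(endpoint)
    poller = zmq.asyncio.Poller()
    poller.register(socket, zmq.POLLIN)
    await socket.send_string('{"command": "ping", "args": {}}')
    # Poll with a timeout so a dead server cannot hang the client forever.
    if await poller.poll(timeout_ms):
        return await socket.recv_string()
    raise TimeoutError('Timeout waiting for server response.')

# asyncio.run(request())  # with a REP server listening on the endpoint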
Example #41
0
    def process_message(self,
                        itask,
                        severity,
                        message,
                        event_time=None,
                        flag='',
                        submit_num=None):
        """Parse an incoming task message and update task state.

        Incoming, e.g. "succeeded at <TIME>", may be from task job or polling.

        It is possible for my current state to be inconsistent with an incoming
        message (whether normal or polled) e.g. due to a late poll result, or a
        network outage, or manual state reset. To handle this, if a message
        would take the task state backward, issue a poll to confirm instead of
        changing state - then always believe the next message. Note that the
        next message might not be the result of this confirmation poll, in the
        unlikely event that a job emits a succession of messages very quickly,
        but this is the best we can do without somehow uniquely associating
        each poll with its result message.

        Arguments:
            itask (cylc.task_proxy.TaskProxy):
                The task proxy object relevant for the message.
            severity (str or int):
                Message severity, should be a recognised logging level.
            message (str):
                Message content.
            event_time (str):
                Event time stamp. Expect ISO8601 date time string.
                If not specified, use current time.
            flag (str):
                If specified, can be INCOMING_FLAG to indicate an incoming
                message, POLLED_FLAG to indicate a message resulted from a
                poll. Otherwise, the message is assumed to be generated by the
                logic in the suite server program.
            submit_num (int):
                The submit number of the task relevant for the message.
                If not specified, use latest submit number.

        Return:
            None: in normal circumstances.
            True: if polling is required to confirm a reversal of status.

        """
        # Log incoming messages
        if event_time is None:
            event_time = get_current_time_string()
        if submit_num is None:
            submit_num = itask.submit_num
        if flag == self.INCOMING_FLAG and submit_num != itask.submit_num:
            flag = self.IGNORED_INCOMING_FLAG
        LOG.log(
            self.LEVELS.get(severity, INFO),
            r'[%s] -(current:%s)%s %s at %s',
            itask, itask.state.status, flag, message, event_time)
        if flag == self.IGNORED_INCOMING_FLAG:
            LOG.warning('[%s] -submit-num=%02d: ignore message from job(%02d)',
                        itask, itask.submit_num, submit_num)
            return

        # always update the suite state summary for latest message
        if flag == self.POLLED_FLAG:
            itask.set_summary_message('%s %s' % (message, self.POLLED_FLAG))
        else:
            itask.set_summary_message(message)

        # Satisfy my output, if possible, and record the result.
        completed_trigger = itask.state.outputs.set_msg_trg_completion(
            message=message, is_completed=True)

        if message == TASK_OUTPUT_STARTED:
            if (flag == self.INCOMING_FLAG
                    and itask.state.is_gt(TASK_STATUS_RUNNING)):
                return True
            self._process_message_started(itask, event_time)
        elif message == TASK_OUTPUT_SUCCEEDED:
            self._process_message_succeeded(itask, event_time)
        elif message == TASK_OUTPUT_FAILED:
            if (flag == self.INCOMING_FLAG
                    and itask.state.is_gt(TASK_STATUS_FAILED)):
                return True
            self._process_message_failed(itask, event_time, self.JOB_FAILED)
        elif message == self.EVENT_SUBMIT_FAILED:
            if (flag == self.INCOMING_FLAG
                    and itask.state.is_gt(TASK_STATUS_SUBMIT_FAILED)):
                return True
            self._process_message_submit_failed(itask, event_time)
        elif message == TASK_OUTPUT_SUBMITTED:
            if (flag == self.INCOMING_FLAG
                    and itask.state.is_gt(TASK_STATUS_SUBMITTED)):
                return True
            self._process_message_submitted(itask, event_time)
        elif message.startswith(FAIL_MESSAGE_PREFIX):
            # Task received signal.
            if (flag == self.INCOMING_FLAG
                    and itask.state.is_gt(TASK_STATUS_FAILED)):
                return True
            signal = message[len(FAIL_MESSAGE_PREFIX):]
            self._db_events_insert(itask, "signaled", signal)
            self.suite_db_mgr.put_update_task_jobs(itask,
                                                   {"run_signal": signal})
            self._process_message_failed(itask, event_time, self.JOB_FAILED)
        elif message.startswith(ABORT_MESSAGE_PREFIX):
            # Task aborted with message
            if (flag == self.INCOMING_FLAG
                    and itask.state.is_gt(TASK_STATUS_FAILED)):
                return True
            aborted_with = message[len(ABORT_MESSAGE_PREFIX):]
            self._db_events_insert(itask, "aborted", message)
            self.suite_db_mgr.put_update_task_jobs(
                itask, {"run_signal": aborted_with})
            self._process_message_failed(itask, event_time, aborted_with)
        elif message.startswith(VACATION_MESSAGE_PREFIX):
            # Task job pre-empted into a vacation state
            self._db_events_insert(itask, "vacated", message)
            itask.set_summary_time('started')  # unset
            if TASK_STATUS_SUBMIT_RETRYING in itask.try_timers:
                itask.try_timers[TASK_STATUS_SUBMIT_RETRYING].num = 0
            itask.job_vacated = True
            # Believe this and change state without polling (could poll?).
            self.pflag = True
            itask.state.reset_state(TASK_STATUS_SUBMITTED)
            self._reset_job_timers(itask)
            # We should really have a special 'vacated' handler, but given that
            # this feature can only be used on the deprecated loadleveler
            # system, we should probably aim to remove support for job vacation
            # instead. Otherwise, we should have:
            # self.setup_event_handlers(itask, 'vacated', message)
        elif completed_trigger:
            # Message of an as-yet unreported custom task output.
            # No state change.
            self.pflag = True
            self.suite_db_mgr.put_update_task_outputs(itask)
            self.setup_event_handlers(itask, completed_trigger, message)
        else:
            # Unhandled messages. These include:
            #  * general non-output/progress messages
            #  * poll messages that repeat previous results
            # Note that all messages are logged already at the top.
            # No state change.
            LOG.debug('[%s] -(current:%s) unhandled: %s', itask,
                      itask.state.status, message)
            if severity in [CRITICAL, ERROR, WARNING, INFO, DEBUG]:
                severity = getLevelName(severity)
            self._db_events_insert(itask,
                                   ("message %s" % str(severity).lower()),
                                   message)
        lseverity = str(severity).lower()
        if lseverity in self.NON_UNIQUE_EVENTS:
            itask.non_unique_events.setdefault(lseverity, 0)
            itask.non_unique_events[lseverity] += 1
            self.setup_event_handlers(itask, lseverity, message)
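
The is_gt() checks above rely on a linear ordering of task statuses. Here is a minimal sketch of the idea; the ordering below is illustrative, not the authoritative TASK_STATUSES list.

TASK_STATUS_ORDER = [
    'waiting', 'ready', 'submitted', 'running', 'succeeded', 'failed',
]

def is_gt(current_status, message_status):
    """True if the current status is later in the lifecycle than the one
    implied by an incoming message - i.e. the message would move the task
    backward, so it should be confirmed by a poll before being believed."""
    return (TASK_STATUS_ORDER.index(current_status)
            > TASK_STATUS_ORDER.index(message_status))

print(is_gt('succeeded', 'running'))  # True -> poll to confirm first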
Example #42
0
 def stop(self):
     """Finish serving the current request then stop the server."""
     LOG.debug('stopping zmq server...')
     self.queue.put('STOP')
     self.thread.join()  # wait for the listener to return
     LOG.debug('...stopped')
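
For context, here is a minimal sketch of the listener thread that stop() shuts down: the serving loop checks the queue for the 'STOP' sentinel between requests. The loop body is a stand-in, not the actual server.

from queue import Queue, Empty
from threading import Thread
import time

def listener(queue):
    while True:
        try:
            if queue.get(block=False) == 'STOP':
                break  # finish up and return
        except Empty:
            pass
        time.sleep(0.1)  # stand-in for serving one request

queue = Queue()
thread = Thread(target=listener, args=(queue,))
thread.start()
queue.put('STOP')
thread.join()  # wait for the listener to return, as in stop() above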
Example #43
0
    def _setup_custom_event_handlers(self, itask, event, message):
        """Set up custom task event handlers."""
        handlers = self._get_events_conf(itask, event + ' handler')
        if (handlers is None and event in self._get_events_conf(
                itask, 'handler events', [])):
            handlers = self._get_events_conf(itask, 'handlers')
        if handlers is None:
            return
        retry_delays = self._get_events_conf(
            itask, 'handler retry delays',
            self.get_host_conf(itask, "task event handler retry delays"))
        if not retry_delays:
            retry_delays = [0]
        # There can be multiple custom event handlers
        for i, handler in enumerate(handlers):
            if event in self.NON_UNIQUE_EVENTS:
                key1 = (
                    '%s-%02d' % (self.HANDLER_CUSTOM, i),
                    '%s-%d' % (event, itask.non_unique_events.get(event, 1)))
            else:
                key1 = ('%s-%02d' % (self.HANDLER_CUSTOM, i), event)
            id_key = (key1, str(itask.point), itask.tdef.name,
                      itask.submit_num)
            if id_key in self.event_timers:
                continue
            # Note: user@host may not always be set for a submit number, e.g.
            # on late event or if host select command fails. Use null string to
            # prevent issues in this case.
            user_at_host = itask.summary['job_hosts'].get(itask.submit_num, '')
            if user_at_host and '@' not in user_at_host:
                # (only has 'user@' on the front if user is not suite owner).
                user_at_host = '%s@%s' % (get_user(), user_at_host)
            # Custom event handler can be a command template string
            # or a command that takes 4 arguments (classic interface)
            # Note quote() fails on None, need str(None).
            try:
                handler_data = {
                    "event": quote(event),
                    "suite": quote(self.suite),
                    'suite_uuid': quote(str(self.uuid_str)),
                    "point": quote(str(itask.point)),
                    "name": quote(itask.tdef.name),
                    "submit_num": itask.submit_num,
                    "try_num": itask.get_try_num(),
                    "id": quote(itask.identity),
                    "message": quote(message),
                    "batch_sys_name": quote(
                        str(itask.summary['batch_sys_name'])),
                    "batch_sys_job_id": quote(
                        str(itask.summary['submit_method_id'])),
                    "submit_time": quote(
                        str(itask.summary['submitted_time_string'])),
                    "start_time": quote(
                        str(itask.summary['started_time_string'])),
                    "finish_time": quote(
                        str(itask.summary['finished_time_string'])),
                    "user@host": quote(user_at_host)
                }

                if self.suite_cfg:
                    for key, value in self.suite_cfg.items():
                        if key == "URL":
                            handler_data["suite_url"] = quote(value)
                        else:
                            handler_data["suite_" + key] = quote(value)

                if itask.tdef.rtconfig['meta']:
                    for key, value in itask.tdef.rtconfig['meta'].items():
                        if key == "URL":
                            handler_data["task_url"] = quote(value)
                        handler_data[key] = quote(value)

                cmd = handler % (handler_data)
            except KeyError as exc:
                message = "%s/%s/%02d %s bad template: %s" % (
                    itask.point, itask.tdef.name, itask.submit_num, key1, exc)
                LOG.error(message)
                continue

            if cmd == handler:
                # Nothing substituted, assume classic interface
                cmd = "%s '%s' '%s' '%s' '%s'" % (handler, event, self.suite,
                                                  itask.identity, message)
            LOG.debug("[%s] -Queueing %s handler: %s", itask, event, cmd)
            self.event_timers[id_key] = (TaskActionTimer(
                CustomTaskEventHandlerContext(
                    key1,
                    self.HANDLER_CUSTOM,
                    cmd,
                ), retry_delays))
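
The two handler interfaces supported above can be demonstrated standalone: %-template substitution, falling back to the classic four-argument form when nothing is substituted. The handler strings and values are made up.

from shlex import quote

handler_data = {
    'event': quote('failed'),
    'suite': quote('my.suite'),
    'id': quote('foo.20250101T00'),
    'message': quote('job failed'),
}
for handler in ['notify --event=%(event)s %(id)s', 'my-handler']:
    cmd = handler % handler_data
    if cmd == handler:
        # Nothing substituted: classic interface, append the 4 arguments.
        cmd = "%s '%s' '%s' '%s' '%s'" % (
            handler, 'failed', 'my.suite', 'foo.20250101T00', 'job failed')
    print(cmd)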
Example #44
0
def cylc_kafka_consumer(kafka_server, kafka_topic, group_id, message, debug):
    """Look for a matching message in a Kafka topic.

    ARGUMENTS:
     * kafka_server - Kafka server URL, e.g. "localhost:9092".
     * kafka_topic - the Kafka topic to check, e.g. "data-avail".
     * group_id - determines Kafka offset ownership (see below).
     * message - string-ified dict with optional pattern elements (see below).
     * debug - boolean; set by daemon debug mode; prints to suite err log.

    The topic is first consumed from the beginning, then from the previous
    committed offset. If the message is not found by the end of the topic,
    commit the offset and return (the next call will try again from there).
    If found, return the result.

    Kafka commits offsets per "consumer group" so the group_id argument
    must be unique per distinct trigger in the suite - this allows each trigger
    to separately consume the topic from the beginning, looking for its own
    messages (otherwise, with shared offsets, one trigger could move the offset
    beyond the messages of another trigger). This goes for successive instances
    of an external-triggered cycling task too, because out-of-order triggering
    could be required sometimes. So this argument should typically be, e.g.:

        group_id=x%(id)s  # %(id)s is the ID of the dependent task

    where "x" is an arbitrary string you can use to change the group name if
    you need to re-run the suite, and the messages, from the start again,
    without re-running the producer suite. Note this also serves to make the
    function signature cycle-point-specific for Cylc even if the message does
    not contain the cycle point (although it probably should).

    The "message" argument is a stringified dict, e.g.:
        {'system': 'prod', 'point': '2025', 'data': '<nwp.*\.nc>'}
    should be represented as:
        "system:prod point:2025 data:<nwp.*\.nc>"

    A match occurs Kafka if all message dict items match, and the result returned
    is the sub-dict of the actual values of items containing
    angle-bracket-delineated regex patterns. E.g. above {'data': 'nwp-2025.nc'}.

    """

    consumer = KafkaConsumer(kafka_topic, bootstrap_servers=[kafka_server],
                             value_deserializer=json.loads,
                             consumer_timeout_ms=CONSUMER_TIMEOUT_MS,
                             auto_offset_reset='earliest',
                             group_id=group_id)

    # Construct a dict from the message argument "key1:val1 key2:val2 ...".
    cylc_msg = dict(m.split(':') for m in message.split())

    result = (False, {})
    n_cons = 0
    for kafka_msg in consumer:
        n_cons += 1
        m = _match_msg(cylc_msg, kafka_msg)
        if m:
            result = (True, m)
            break
        # (else consume and compare next message)
    consumer.commit()
    # Unsubscribe before exit, otherwise next call will be slow while
    # Kafka times out waiting for this original consumer connection.
    consumer.unsubscribe()
    if debug:
        if result[0]:
            res = "\n  MATCHED: %s" % result[1]
        else:
            res = "no match."
        LOG.debug('Kafka: "%s" (consumed %d) ... %s', message, n_cons, res)
    return result
Example #45
0
def construct_ssh_cmd(raw_cmd,
                      user=None,
                      host=None,
                      forward_x11=False,
                      stdin=False,
                      ssh_login_shell=None,
                      ssh_cylc=None,
                      set_UTC=False,
                      allow_flag_opts=False):
    """Append a bare command with further options required to run via ssh.

    Arguments:
        raw_cmd (list): primitive command to run remotely.
        user (string): user ID for the remote login.
        host (string): remote host name. Use 'localhost' if not specified.
        forward_x11 (boolean):
            If True, use 'ssh -Y' to enable X11 forwarding, else just 'ssh'.
        stdin:
            If None, the `-n` option will be added to the SSH command line.
        ssh_login_shell (boolean):
            If True, launch remote command with `bash -l -c 'exec "$0" "$@"'`.
        ssh_cylc (string):
            Location of the remote cylc executable.
        set_UTC (boolean):
            If True, check UTC mode and specify if set to True (non-default).
        allow_flag_opts (boolean):
            If True, check CYLC_DEBUG and CYLC_VERBOSE and if non-default,
            specify debug and/or verbosity as options to the 'raw cmd'.

    Return:
        A list containing a chosen command including all arguments and options
        necessary to directly execute the bare command on a given host via ssh.
    """
    command = shlex.split(glbl_cfg().get_host_item('ssh command', host, user))

    if forward_x11:
        command.append('-Y')
    if stdin is None:
        command.append('-n')

    user_at_host = ''
    if user:
        user_at_host = user + '@'
    if host:
        user_at_host += host
    else:
        user_at_host += 'localhost'
    command.append(user_at_host)

    # Pass CYLC_VERSION and optionally, CYLC_CONF_PATH & CYLC_UTC through.
    command += ['env', quote(r'CYLC_VERSION=%s' % CYLC_VERSION)]
    try:
        command.append(
            quote(r'CYLC_CONF_PATH=%s' % os.environ['CYLC_CONF_PATH']))
    except KeyError:
        pass
    if set_UTC and os.getenv('CYLC_UTC') in ["True", "true"]:
        command.append(quote(r'CYLC_UTC=True'))
        command.append(quote(r'TZ=UTC'))

    # Use bash -l?
    if ssh_login_shell is None:
        ssh_login_shell = glbl_cfg().get_host_item('use login shell', host,
                                                   user)
    if ssh_login_shell:
        # A login shell will always source /etc/profile and the user's bash
        # profile file. To avoid having to quote the entire remote command
        # it is passed as arguments to the bash script.
        command += ['bash', '--login', '-c', quote(r'exec "$0" "$@"')]

    # 'cylc' on the remote host
    if ssh_cylc:
        command.append(ssh_cylc)
    else:
        ssh_cylc = glbl_cfg().get_host_item('cylc executable', host, user)
        if ssh_cylc.endswith('cylc'):
            command.append(ssh_cylc)
        else:
            raise ValueError(
                r'ERROR: bad cylc executable in global config: %s' % ssh_cylc)

    # Insert the core raw command after the ssh options, but before its own
    # command options.
    command += raw_cmd

    if allow_flag_opts:
        if (cylc.flags.verbose
                or os.getenv('CYLC_VERBOSE') in ["True", "true"]):
            command.append(r'--verbose')
        if cylc.flags.debug or os.getenv('CYLC_DEBUG') in ["True", "true"]:
            command.append(r'--debug')
    if LOG.handlers:
        LOG.debug("$ %s", ' '.join(quote(c) for c in command))
    elif cylc.flags.debug:
        sys.stderr.write("$ %s\n" % ' '.join(quote(c) for c in command))

    return command
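
A self-contained sketch of the command assembly above; the ssh options, remote cylc path and host are illustrative stand-ins for values normally read from the global config.

import shlex
from shlex import quote

ssh_command = 'ssh -oBatchMode=yes -oConnectTimeout=10'  # per-host config
command = shlex.split(ssh_command)
command.append('-n')                                   # no stdin
command.append('hpc-login1')                           # [user@]host
command += ['env', quote('CYLC_VERSION=7.8.1')]        # pass version through
command += ['bash', '--login', '-c', quote('exec "$0" "$@"')]  # login shell
command.append('/opt/cylc/bin/cylc')                   # remote executable
command.append('version')                              # the bare command
print(' '.join(quote(c) for c in command))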
Example #46
0
    def process_events(self, schd_ctx):
        """Process task events that were created by "setup_event_handlers".

        schd_ctx is an instance of "Scheduler" in "cylc.scheduler".
        """
        ctx_groups = {}
        now = time()
        for id_key, timer in self.event_timers.copy().items():
            key1, point, name, submit_num = id_key
            if timer.is_waiting:
                continue
            # Set timer if timeout is None.
            if not timer.is_timeout_set():
                if timer.next() is None:
                    LOG.warning("%s/%s/%02d %s failed" % (
                        point, name, submit_num, key1))
                    del self.event_timers[id_key]
                    continue
                # Report retries and delayed 1st try
                tmpl = None
                if timer.num > 1:
                    tmpl = "%s/%s/%02d %s failed, retrying in %s"
                elif timer.delay:
                    tmpl = "%s/%s/%02d %s will run after %s"
                if tmpl:
                    LOG.debug(tmpl % (
                        point, name, submit_num, key1,
                        timer.delay_timeout_as_str()))
            # Ready to run?
            if not timer.is_delay_done() or (
                # Avoid flooding user's mail box with mail notification.
                # Group together as many notifications as possible within a
                # given interval.
                timer.ctx.ctx_type == self.HANDLER_MAIL and
                not schd_ctx.stop_mode and
                self.next_mail_time is not None and
                self.next_mail_time > now
            ):
                continue

            timer.set_waiting()
            if timer.ctx.ctx_type == self.HANDLER_CUSTOM:
                # Run custom event handlers on their own
                self.proc_pool.put_command(
                    SubProcContext(
                        (key1, submit_num),
                        timer.ctx.cmd, env=os.environ, shell=True,
                    ),
                    self._custom_handler_callback, [schd_ctx, id_key])
            else:
                # Group together built-in event handlers, where possible
                if timer.ctx not in ctx_groups:
                    ctx_groups[timer.ctx] = []
                ctx_groups[timer.ctx].append(id_key)

        next_mail_time = now + self.mail_interval
        for ctx, id_keys in ctx_groups.items():
            if ctx.ctx_type == self.HANDLER_MAIL:
                # Set next_mail_time if any mail sent
                self.next_mail_time = next_mail_time
                self._process_event_email(schd_ctx, ctx, id_keys)
            elif ctx.ctx_type == self.HANDLER_JOB_LOGS_RETRIEVE:
                self._process_job_logs_retrieval(schd_ctx, ctx, id_keys)
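
The mail-grouping logic above throttles on a simple deadline so that many notifications collapse into one message per interval. A sketch of that idea in isolation; the interval is illustrative.

from time import time

mail_interval = 300.0   # seconds between digest emails (illustrative)
next_mail_time = None   # unset until the first mail is sent

def mail_due(now):
    # Mail-type handlers wait until the current interval has elapsed.
    return next_mail_time is None or now >= next_mail_time

now = time()
if mail_due(now):
    # ... group all pending notifications into one email here ...
    next_mail_time = now + mail_interval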
Example #47
0
    def remote_init(self, host, owner):
        """Initialise a remote [owner@]host if necessary.

        Create UUID file on suite host ".service/uuid" for remotes to identify
        shared file system with suite host.

        Call "cylc remote-init" to install suite items to remote:
            ".service/contact": HTTP(S) and SSH+HTTP(S) task comm
            ".service/passphrase": HTTP(S) task comm
            ".service/ssl.cert": HTTPS task comm
            "python/": if source exists

        Return:
            REMOTE_INIT_NOT_REQUIRED:
                If remote init is not required, e.g. not remote
            REMOTE_INIT_DONE:
                If remote init done.
            REMOTE_INIT_FAILED:
                If init of the remote failed.
                Note: this will reset to None to allow retry.
            None:
                If waiting for remote init command to complete

        """
        if self.single_task_mode or not is_remote(host, owner):
            return REMOTE_INIT_NOT_REQUIRED
        try:
            status = self.remote_init_map[(host, owner)]
        except KeyError:
            pass  # Not yet initialised
        else:
            if status == REMOTE_INIT_FAILED:
                del self.remote_init_map[(host, owner)]  # reset to allow retry
            return status

        # Determine what items to install
        comm_meth = glbl_cfg().get_host_item('task communication method', host,
                                             owner)
        owner_at_host = 'localhost'
        if host:
            owner_at_host = host
        if owner:
            owner_at_host = owner + '@' + owner_at_host
        LOG.debug('comm_meth[%s]=%s' % (owner_at_host, comm_meth))
        items = self._remote_init_items(comm_meth)
        # No item to install
        if not items:
            self.remote_init_map[(host, owner)] = REMOTE_INIT_NOT_REQUIRED
            return self.remote_init_map[(host, owner)]

        # Create "stdin_file_paths" file, with "items" in it.
        tmphandle = NamedTemporaryFile()
        tarhandle = tarfile.open(fileobj=tmphandle, mode='w')
        for path, arcname in items:
            tarhandle.add(path, arcname=arcname)
        tarhandle.close()
        tmphandle.seek(0)
        # UUID file - for remote to identify shared file system with suite host
        uuid_fname = os.path.join(
            self.suite_srv_files_mgr.get_suite_srv_dir(self.suite),
            FILE_BASE_UUID)
        if not os.path.exists(uuid_fname):
            open(uuid_fname, 'wb').write(str(self.uuid_str).encode())
        # Build the command
        cmd = ['cylc', 'remote-init']
        if is_remote_host(host):
            cmd.append('--host=%s' % host)
        if is_remote_user(owner):
            cmd.append('--user=%s' % owner)
        if cylc.flags.debug:
            cmd.append('--debug')
        if comm_meth in ['ssh']:
            cmd.append('--indirect-comm=%s' % comm_meth)
        cmd.append(self.uuid_str)
        cmd.append(glbl_cfg().get_derived_host_item(self.suite,
                                                    'suite run directory',
                                                    host, owner))
        self.proc_pool.put_command(
            SubProcContext('remote-init',
                           cmd,
                           stdin_file_paths=[tmphandle.name]),
            self._remote_init_callback, [host, owner, tmphandle])
        # None status: Waiting for command to finish
        self.remote_init_map[(host, owner)] = None
        return self.remote_init_map[(host, owner)]
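
The transfer mechanism above (a TAR archive fed to the remote command via SSH's STDIN) can be sketched standalone. The host and paths are illustrative, and a plain remote "tar -x" stands in for what "cylc remote-init" does with the same stream.

import os
import subprocess
import tarfile
from tempfile import NamedTemporaryFile

# Illustrative service item, created here so the sketch is self-contained.
with open('passphrase', 'w') as handle:
    handle.write('secret\n')

with NamedTemporaryFile() as tmphandle:
    # Pack the service items into an uncompressed TAR in a temp file.
    with tarfile.open(fileobj=tmphandle, mode='w') as tarhandle:
        tarhandle.add('passphrase', arcname='.service/passphrase')
    tmphandle.seek(0)
    # The remote end unpacks the archive from its stdin.
    subprocess.call(
        ['ssh', 'hpc-login1', 'tar', '-xf', '-', '-C', 'cylc-run/my.suite'],
        stdin=tmphandle)
os.remove('passphrase')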
Example #48
0
 def _load_remote_item(self, item, reg, owner, host):
     """Load content of service item from remote [owner@]host via SSH."""
     if not is_remote(host, owner):
         return
     if host is None:
         host = 'localhost'
     if owner is None:
         owner = get_user()
     if item == self.FILE_BASE_CONTACT and not is_remote_host(host):
         # Attempt to read suite contact file via the local filesystem.
          path = r'%(run_d)s/%(srv_base)s' % {
              'run_d': glbl_cfg().get_derived_host_item(
                  reg, 'suite run directory', 'localhost', owner,
                  replace_home=False),
              'srv_base': self.DIR_BASE_SRV,
          }
         content = self._load_local_item(item, path)
         if content is not None:
             return content
         # Else drop through and attempt via ssh to the suite account.
     # Prefix STDOUT to ensure returned content is relevant
     prefix = r'[CYLC-AUTH] %(suite)s' % {'suite': reg}
     # Attempt to cat passphrase file under suite service directory
     script = (r"""echo '%(prefix)s'; """
               r'''cat "%(run_d)s/%(srv_base)s/%(item)s"''') % {
                   'prefix':
                   prefix,
                   'run_d':
                   glbl_cfg().get_derived_host_item(
                       reg, 'suite run directory', host, owner),
                   'srv_base':
                   self.DIR_BASE_SRV,
                   'item':
                   item
               }
     import shlex
      command = shlex.split(
          glbl_cfg().get_host_item('ssh command', host, owner))
      command += ['-n', owner + '@' + host, script]
      from subprocess import Popen, PIPE
      try:
          proc = Popen(
              command, stdin=open(os.devnull), stdout=PIPE, stderr=PIPE)
      except OSError:
          if cylc.flags.debug:
              import traceback
              traceback.print_exc()
          return
      out, err = (f.decode() for f in proc.communicate())
     ret_code = proc.wait()
     # Extract passphrase from STDOUT
     # It should live in the line with the correct prefix
     content = ""
     can_read = False
     for line in out.splitlines(True):
         if can_read:
             content += line
         elif line.strip() == prefix:
             can_read = True
     if not content or ret_code:
         LOG.debug(
             '$ %(command)s  # code=%(ret_code)s\n%(err)s',
             {
                 'command': command,
                 # STDOUT may contain passphrase, so not safe to print
                 # 'out': out,
                 'err': err,
                 'ret_code': ret_code,
             })
         return
     return content
Example #49
0
    def _setup_custom_event_handlers(self, itask, event, message):
        """Set up custom task event handlers."""
        handlers = self._get_events_conf(itask, event + ' handler')
        if (handlers is None and
                event in self._get_events_conf(itask, 'handler events', [])):
            handlers = self._get_events_conf(itask, 'handlers')
        if handlers is None:
            return
        retry_delays = self._get_events_conf(
            itask,
            'handler retry delays',
            self.get_host_conf(itask, "task event handler retry delays"))
        if not retry_delays:
            retry_delays = [0]
        # There can be multiple custom event handlers
        for i, handler in enumerate(handlers):
            if event in self.NON_UNIQUE_EVENTS:
                key1 = (
                    '%s-%02d' % (self.HANDLER_CUSTOM, i),
                    '%s-%d' % (event, itask.non_unique_events.get(event, 1)))
            else:
                key1 = ('%s-%02d' % (self.HANDLER_CUSTOM, i), event)
            id_key = (
                key1, str(itask.point), itask.tdef.name, itask.submit_num)
            if id_key in self.event_timers:
                continue
            # Note: user@host may not always be set for a submit number, e.g.
            # on late event or if host select command fails. Use null string to
            # prevent issues in this case.
            user_at_host = itask.summary['job_hosts'].get(itask.submit_num, '')
            if user_at_host and '@' not in user_at_host:
                # (only has 'user@' on the front if user is not suite owner).
                user_at_host = '%s@%s' % (get_user(), user_at_host)
            # Custom event handler can be a command template string
            # or a command that takes 4 arguments (classic interface)
            # Note quote() fails on None, need str(None).
            try:
                handler_data = {
                    "event": quote(event),
                    "suite": quote(self.suite),
                    'suite_uuid': quote(str(self.uuid_str)),
                    "point": quote(str(itask.point)),
                    "name": quote(itask.tdef.name),
                    "submit_num": itask.submit_num,
                    "try_num": itask.get_try_num(),
                    "id": quote(itask.identity),
                    "message": quote(message),
                    "batch_sys_name": quote(
                        str(itask.summary['batch_sys_name'])),
                    "batch_sys_job_id": quote(
                        str(itask.summary['submit_method_id'])),
                    "submit_time": quote(
                        str(itask.summary['submitted_time_string'])),
                    "start_time": quote(
                        str(itask.summary['started_time_string'])),
                    "finish_time": quote(
                        str(itask.summary['finished_time_string'])),
                    "user@host": quote(user_at_host)
                }

                if self.suite_cfg:
                    for key, value in self.suite_cfg.items():
                        if key == "URL":
                            handler_data["suite_url"] = quote(value)
                        else:
                            handler_data["suite_" + key] = quote(value)

                if itask.tdef.rtconfig['meta']:
                    for key, value in itask.tdef.rtconfig['meta'].items():
                        if key == "URL":
                            handler_data["task_url"] = quote(value)
                        handler_data[key] = quote(value)

                cmd = handler % (handler_data)
            except KeyError as exc:
                message = "%s/%s/%02d %s bad template: %s" % (
                    itask.point, itask.tdef.name, itask.submit_num, key1, exc)
                LOG.error(message)
                continue

            if cmd == handler:
                # Nothing substituted, assume classic interface
                cmd = "%s '%s' '%s' '%s' '%s'" % (
                    handler, event, self.suite, itask.identity, message)
            LOG.debug("[%s] -Queueing %s handler: %s", itask, event, cmd)
            self.event_timers[id_key] = (
                TaskActionTimer(
                    CustomTaskEventHandlerContext(
                        key1,
                        self.HANDLER_CUSTOM,
                        cmd,
                    ),
                    retry_delays))
Example #50
0
    def submit_task_jobs(self, suite, itasks, is_simulation=False):
        """Prepare and submit task jobs.

        Submit tasks where possible. Ignore tasks that are waiting for host
        select command to complete, or tasks that are waiting for remote
        initialisation. Bad host select command, error writing to a job file or
        bad remote initialisation will cause a bad task - leading to submission
        failure.

        This method uses prep_submit_task_job() as helper.

        Return (list): list of tasks that attempted submission.
        """
        if is_simulation:
            return self._simulation_submit_task_jobs(itasks)

        # Prepare tasks for job submission
        prepared_tasks, bad_tasks = self.prep_submit_task_jobs(suite, itasks)

        # Reset consumed host selection results
        self.task_remote_mgr.remote_host_select_reset()

        if not prepared_tasks:
            return bad_tasks

        # Group task jobs by (host, owner)
        auth_itasks = {}  # {(host, owner): [itask, ...], ...}
        for itask in prepared_tasks:
            auth_itasks.setdefault((itask.task_host, itask.task_owner), [])
            auth_itasks[(itask.task_host, itask.task_owner)].append(itask)
        # Submit task jobs for each (host, owner) group
        done_tasks = bad_tasks
        for (host, owner), itasks in sorted(auth_itasks.items()):
            is_init = self.task_remote_mgr.remote_init(host, owner)
            if is_init is None:
                # Remote is waiting to be initialised
                for itask in itasks:
                    itask.set_summary_message(self.REMOTE_INIT_MSG)
                continue
            # Ensure that localhost background/at jobs are recorded as running
            # on the host name of the current suite host, rather than just
            # "localhost". On suite restart on a different suite host, this
            # allows the restart logic to correctly poll the status of the
            # background/at jobs that may still be running on the previous
            # suite host.
            if (
                self.batch_sys_mgr.is_job_local_to_host(
                    itask.summary['batch_sys_name']) and
                not is_remote_host(host)
            ):
                owner_at_host = get_host()
            else:
                owner_at_host = host
            # Persist
            if owner:
                owner_at_host = owner + '@' + owner_at_host
            now_str = get_current_time_string()
            done_tasks.extend(itasks)
            for itask in itasks:
                # Log and persist
                LOG.info(
                    '[%s] -submit-num=%d, owner@host=%s',
                    itask, itask.submit_num, owner_at_host)
                self.suite_db_mgr.put_insert_task_jobs(itask, {
                    'is_manual_submit': itask.is_manual_submit,
                    'try_num': itask.get_try_num(),
                    'time_submit': now_str,
                    'user_at_host': owner_at_host,
                    'batch_sys_name': itask.summary['batch_sys_name'],
                })
                itask.is_manual_submit = False
            if is_init == REMOTE_INIT_FAILED:
                # Remote has failed to initialise
                # Set submit-failed for all affected tasks
                for itask in itasks:
                    itask.local_job_file_path = None  # reset for retry
                    log_task_job_activity(
                        SubProcContext(
                            self.JOBS_SUBMIT,
                            '(init %s)' % owner_at_host,
                            err=REMOTE_INIT_FAILED,
                            ret_code=1),
                        suite, itask.point, itask.tdef.name)
                    self.task_events_mgr.process_message(
                        itask, CRITICAL,
                        self.task_events_mgr.EVENT_SUBMIT_FAILED)
                continue
            # Build the "cylc jobs-submit" command
            cmd = ['cylc', self.JOBS_SUBMIT]
            if LOG.isEnabledFor(DEBUG):
                cmd.append('--debug')
            if get_utc_mode():
                cmd.append('--utc-mode')
            remote_mode = False
            kwargs = {}
            for key, value, test_func in [
                    ('host', host, is_remote_host),
                    ('user', owner, is_remote_user)]:
                if test_func(value):
                    cmd.append('--%s=%s' % (key, value))
                    remote_mode = True
                    kwargs[key] = value
            if remote_mode:
                cmd.append('--remote-mode')
            cmd.append('--')
            cmd.append(glbl_cfg().get_derived_host_item(
                suite, 'suite job log directory', host, owner))
            # Chop itasks into a series of shorter lists if it's very big
            # to prevent overloading of stdout and stderr pipes.
            itasks = sorted(itasks, key=lambda itask: itask.identity)
            chunk_size = len(itasks) // ((len(itasks) // 100) + 1) + 1
            itasks_batches = [
                itasks[i:i + chunk_size]
                for i in range(0, len(itasks), chunk_size)]
            LOG.debug(
                '%s ... # will invoke in batches, sizes=%s',
                cmd, [len(b) for b in itasks_batches])
            for i, itasks_batch in enumerate(itasks_batches):
                stdin_files = []
                job_log_dirs = []
                for itask in itasks_batch:
                    if remote_mode:
                        stdin_files.append(
                            get_task_job_job_log(
                                suite, itask.point, itask.tdef.name,
                                itask.submit_num))
                    job_log_dirs.append(get_task_job_id(
                        itask.point, itask.tdef.name, itask.submit_num))
                    # The job file is now (about to be) used: reset the file
                    # write flag so that subsequent manual retrigger will
                    # generate a new job file.
                    itask.local_job_file_path = None
                    itask.state.reset_state(TASK_STATUS_READY)
                    if itask.state.outputs.has_custom_triggers():
                        self.suite_db_mgr.put_update_task_outputs(itask)
                self.proc_pool.put_command(
                    SubProcContext(
                        self.JOBS_SUBMIT,
                        cmd + job_log_dirs,
                        stdin_files=stdin_files,
                        job_log_dirs=job_log_dirs,
                        **kwargs
                    ),
                    self._submit_task_jobs_callback, [suite, itasks_batch])
        return done_tasks
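
The chunking arithmetic above aims for batches of roughly equal size, none much larger than about 100 tasks. A standalone check of the formula:

def batch(items, target=100):
    # Number of chunks is len//target + 1; chunk_size rounds up so no
    # chunk exceeds the target by more than the rounding.
    chunk_size = len(items) // ((len(items) // target) + 1) + 1
    return [items[i:i + chunk_size]
            for i in range(0, len(items), chunk_size)]

print([len(b) for b in batch(list(range(250)))])  # -> [84, 84, 82]
print([len(b) for b in batch(list(range(99)))])   # -> [99]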
Example #51
0
    def detect_old_contact_file(self, reg, check_host_port=None):
        """Detect old suite contact file.

        If an old contact file does not exist, do nothing. If one does exist
        but the suite process is definitely not alive, remove it. If one exists
        and the suite process is still alive, raise SuiteServiceFileError.

        If check_host_port is specified and does not match the (host, port)
        value in the old contact file, raise AssertionError.

        Args:
            reg (str): suite name
            check_host_port (tuple): (host, port) to check against

        Raise:
            AssertionError:
                If old contact file exists but does not have matching
                (host, port) with value of check_host_port.
            SuiteServiceFileError:
                If old contact file exists and the suite process still alive.
        """
        # An old suite of the same name may be running if a contact file exists
        # and can be loaded.
        try:
            data = self.load_contact_file(reg)
            old_host = data[self.KEY_HOST]
            old_port = data[self.KEY_PORT]
            old_proc_str = data[self.KEY_PROCESS]
        except (IOError, ValueError, SuiteServiceFileError):
            # Contact file does not exist or is corrupted; OK to proceed.
            return
        if check_host_port and check_host_port != (old_host, int(old_port)):
            raise AssertionError("%s != (%s, %s)" %
                                 (check_host_port, old_host, old_port))
        # Run the "ps" command to see if the process is still running or not.
        # If the old suite process is still running, it should show up with the
        # same command line as before.
        # Terminate command after 10 seconds to prevent hanging, etc.
        old_pid_str = old_proc_str.split(None, 1)[0].strip()
        cmd = ["timeout", "10", "ps", self.PS_OPTS, str(old_pid_str)]
        if is_remote_host(old_host):
            import shlex
            ssh_str = str(glbl_cfg().get_host_item("ssh command", old_host))
            cmd = shlex.split(ssh_str) + ["-n", old_host] + cmd
        from subprocess import Popen, PIPE
        from time import sleep, time
        proc = Popen(cmd, stdin=open(os.devnull), stdout=PIPE, stderr=PIPE)
        # Terminate command after 10 seconds to prevent hanging SSH, etc.
        timeout = time() + 10.0
        while proc.poll() is None:
            if time() > timeout:
                proc.terminate()
            sleep(0.1)
        fname = self.get_contact_file(reg)
        ret_code = proc.wait()
        out, err = (f.decode() for f in proc.communicate())
        if ret_code:
            LOG.debug("$ %s  # return %d\n%s", ' '.join(cmd), ret_code, err)
        for line in reversed(out.splitlines()):
            if line.strip() == old_proc_str:
                # Suite definitely still running
                break
            elif line.split(None, 1)[0].strip() == "PID":
                # Only "ps" header - "ps" has run, but no matching results.
                # Suite not running. Attempt to remove suite contact file.
                try:
                    os.unlink(fname)
                    return
                except OSError:
                    break

        raise SuiteServiceFileError(
            (r"""ERROR, suite contact file exists: %(fname)s

Suite "%(suite)s" is already running, and listening at "%(host)s:%(port)s".

To start a new run, stop the old one first with one or more of these:
* cylc stop %(suite)s              # wait for active tasks/event handlers
* cylc stop --kill %(suite)s       # kill active tasks and wait
* cylc stop --now %(suite)s        # don't wait for active tasks
* cylc stop --now --now %(suite)s  # don't wait
* ssh -n "%(host)s" kill %(pid)s   # final brute force!
""") % {
                "host": old_host,
                "port": old_port,
                "pid": old_pid_str,
                "fname": fname,
                "suite": reg,
            })
Example #52
0
    def remote_init(self, host, owner):
        """Initialise a remote [owner@]host if necessary.

        Create UUID file on suite host ".service/uuid" for remotes to identify
        shared file system with suite host.

        Call "cylc remote-init" to install suite items to remote:
            ".service/contact": For TCP task communication
            ".service/passphrase": For TCP task communication
            "python/": if source exists

        Return:
            REMOTE_INIT_NOT_REQUIRED:
                If remote init is not required, e.g. not remote
            REMOTE_INIT_DONE:
                If remote init done.
            REMOTE_INIT_FAILED:
                If init of the remote failed.
                Note: this will reset to None to allow retry.
            None:
                If waiting for remote init command to complete

        """
        if self.single_task_mode or not is_remote(host, owner):
            return REMOTE_INIT_NOT_REQUIRED
        try:
            status = self.remote_init_map[(host, owner)]
        except KeyError:
            pass  # Not yet initialised
        else:
            if status == REMOTE_INIT_FAILED:
                del self.remote_init_map[(host, owner)]  # reset to allow retry
            return status

        # Determine what items to install
        comm_meth = glbl_cfg().get_host_item(
            'task communication method', host, owner)
        owner_at_host = 'localhost'
        if host:
            owner_at_host = host
        if owner:
            owner_at_host = owner + '@' + owner_at_host
        LOG.debug('comm_meth[%s]=%s' % (owner_at_host, comm_meth))
        items = self._remote_init_items(comm_meth)
        # No item to install
        if not items:
            self.remote_init_map[(host, owner)] = REMOTE_INIT_NOT_REQUIRED
            return self.remote_init_map[(host, owner)]

        # Create a TAR archive with the service files,
        # so they can be sent later via SSH's STDIN to the task remote.
        tmphandle = self.proc_pool.get_temporary_file()
        tarhandle = tarfile.open(fileobj=tmphandle, mode='w')
        for path, arcname in items:
            tarhandle.add(path, arcname=arcname)
        tarhandle.close()
        tmphandle.seek(0)
        # UUID file - for remote to identify shared file system with suite host
        uuid_fname = os.path.join(
            self.suite_srv_files_mgr.get_suite_srv_dir(self.suite),
            FILE_BASE_UUID)
        if not os.path.exists(uuid_fname):
            open(uuid_fname, 'wb').write(str(self.uuid_str).encode())
        # Build the command
        cmd = ['cylc', 'remote-init']
        if is_remote_host(host):
            cmd.append('--host=%s' % host)
        if is_remote_user(owner):
            cmd.append('--user=%s' % owner)
        if cylc.flags.debug:
            cmd.append('--debug')
        if comm_meth in ['ssh']:
            cmd.append('--indirect-comm=%s' % comm_meth)
        cmd.append(str(self.uuid_str))
        cmd.append(glbl_cfg().get_derived_host_item(
            self.suite, 'suite run directory', host, owner))
        self.proc_pool.put_command(
            SubProcContext('remote-init', cmd, stdin_files=[tmphandle]),
            self._remote_init_callback,
            [host, owner, tmphandle])
        # None status: Waiting for command to finish
        self.remote_init_map[(host, owner)] = None
        return self.remote_init_map[(host, owner)]
Example #53
0
    def process_message(
            self, itask, severity, message, event_time=None, flag='',
            submit_num=None):
        """Parse an incoming task message and update task state.

        Incoming, e.g. "succeeded at <TIME>", may be from task job or polling.

        It is possible for my current state to be inconsistent with an incoming
        message (whether normal or polled) e.g. due to a late poll result, or a
        network outage, or manual state reset. To handle this, if a message
        would take the task state backward, issue a poll to confirm instead of
        changing state - then always believe the next message. Note that the
        next message might not be the result of this confirmation poll, in the
        unlikely event that a job emits a succession of messages very quickly,
        but this is the best we can do without somehow uniquely associating
        each poll with its result message.

        Arguments:
            itask (cylc.task_proxy.TaskProxy):
                The task proxy object relevant for the message.
            severity (str or int):
                Message severity, should be a recognised logging level.
            message (str):
                Message content.
            event_time (str):
                Event time stamp. Expect ISO8601 date time string.
                If not specified, use current time.
            flag (str):
                If specified, can be INCOMING_FLAG to indicate an incoming
                message, POLLED_FLAG to indicate a message resulted from a
                poll. Otherwise, the message is assumed to be generated by the
                logic in the suite server program.
            submit_num (int):
                The submit number of the task relevant for the message.
                If not specified, use latest submit number.

        Return:
            None: in normal circumstances.
            True: if polling is required to confirm a reversal of status.

        """
        # Log incoming messages
        if event_time is None:
            event_time = get_current_time_string()
        if submit_num is None:
            submit_num = itask.submit_num
        if flag == self.INCOMING_FLAG and submit_num != itask.submit_num:
            flag = self.IGNORED_INCOMING_FLAG
        LOG.log(
            self.LEVELS.get(severity, INFO),
            r'[%s] -(current:%s)%s %s at %s',
            itask, itask.state.status, flag, message, event_time)
        if flag == self.IGNORED_INCOMING_FLAG:
            LOG.warning(
                '[%s] -submit-num=%02d: ignore message from job(%02d)',
                itask, itask.submit_num, submit_num)
            return

        # Always update the suite state summary with the latest message
        if flag == self.POLLED_FLAG:
            itask.set_summary_message('%s %s' % (message, self.POLLED_FLAG))
        else:
            itask.set_summary_message(message)

        # Satisfy the task's output, if possible, and record the result.
        completed_trigger = itask.state.outputs.set_msg_trg_completion(
            message=message, is_completed=True)

        if message == TASK_OUTPUT_STARTED:
            if (flag == self.INCOMING_FLAG
                    and itask.state.is_gt(TASK_STATUS_RUNNING)):
                return True
            self._process_message_started(itask, event_time)
        elif message == TASK_OUTPUT_SUCCEEDED:
            self._process_message_succeeded(itask, event_time)
        elif message == TASK_OUTPUT_FAILED:
            if (flag == self.INCOMING_FLAG
                    and itask.state.is_gt(TASK_STATUS_FAILED)):
                return True
            self._process_message_failed(itask, event_time, self.JOB_FAILED)
        elif message == self.EVENT_SUBMIT_FAILED:
            if (flag == self.INCOMING_FLAG
                    and itask.state.is_gt(TASK_STATUS_SUBMIT_FAILED)):
                return True
            self._process_message_submit_failed(itask, event_time)
        elif message == TASK_OUTPUT_SUBMITTED:
            if (flag == self.INCOMING_FLAG
                    and itask.state.is_gt(TASK_STATUS_SUBMITTED)):
                return True
            self._process_message_submitted(itask, event_time)
        elif message.startswith(FAIL_MESSAGE_PREFIX):
            # Task job received a signal.
            if (flag == self.INCOMING_FLAG
                    and itask.state.is_gt(TASK_STATUS_FAILED)):
                return True
            signal = message[len(FAIL_MESSAGE_PREFIX):]
            self._db_events_insert(itask, "signaled", signal)
            self.suite_db_mgr.put_update_task_jobs(
                itask, {"run_signal": signal})
            self._process_message_failed(itask, event_time, self.JOB_FAILED)
        elif message.startswith(ABORT_MESSAGE_PREFIX):
            # Task aborted with message
            if (flag == self.INCOMING_FLAG
                    and itask.state.is_gt(TASK_STATUS_FAILED)):
                return True
            aborted_with = message[len(ABORT_MESSAGE_PREFIX):]
            self._db_events_insert(itask, "aborted", message)
            self.suite_db_mgr.put_update_task_jobs(
                itask, {"run_signal": aborted_with})
            self._process_message_failed(itask, event_time, aborted_with)
        elif message.startswith(VACATION_MESSAGE_PREFIX):
            # Task job pre-empted into a vacation state
            self._db_events_insert(itask, "vacated", message)
            itask.set_summary_time('started')  # unset
            if TASK_STATUS_SUBMIT_RETRYING in itask.try_timers:
                itask.try_timers[TASK_STATUS_SUBMIT_RETRYING].num = 0
            itask.job_vacated = True
            # Believe this and change state without polling (could poll?).
            self.pflag = True
            itask.state.reset_state(TASK_STATUS_SUBMITTED)
            self._reset_job_timers(itask)
            # We should really have a special 'vacated' handler, but given that
            # this feature can only be used on the deprecated loadleveler
            # system, we should probably aim to remove support for job vacation
            # instead. Otherwise, we should have:
            # self.setup_event_handlers(itask, 'vacated', message)
        elif completed_trigger:
            # Message of an as-yet unreported custom task output.
            # No state change.
            self.pflag = True
            self.suite_db_mgr.put_update_task_outputs(itask)
            self.setup_event_handlers(itask, completed_trigger, message)
        else:
            # Unhandled messages. These include:
            #  * general non-output/progress messages
            #  * poll messages that repeat previous results
            # Note that all messages are logged already at the top.
            # No state change.
            LOG.debug(
                '[%s] -(current:%s) unhandled: %s',
                itask, itask.state.status, message)
            if severity in [CRITICAL, ERROR, WARNING, INFO, DEBUG]:
                severity = getLevelName(severity)
            self._db_events_insert(
                itask, ("message %s" % str(severity).lower()), message)
        lseverity = str(severity).lower()
        if lseverity in self.NON_UNIQUE_EVENTS:
            itask.non_unique_events.setdefault(lseverity, 0)
            itask.non_unique_events[lseverity] += 1
            self.setup_event_handlers(itask, lseverity, message)
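
The backward-transition guards in process_message all hinge on an ordered comparison of statuses: itask.state.is_gt(X) asks whether the task's current status sorts after X, in which case the incoming message is suspect and a confirming poll is requested by returning True. A minimal sketch of such an ordering check, using an assumed, simplified status order rather than the real ordered-status list from the source:

# Minimal sketch of an is_gt-style status comparison. The ordering below
# is a simplified assumption for illustration; the real list of ordered
# statuses in the source is longer and may differ.
SKETCH_STATUS_ORDER = [
    'waiting', 'ready', 'submitted', 'running', 'succeeded', 'failed']


def status_is_gt(current, other):
    """Return True if `current` sorts after `other` in the status order."""
    return (SKETCH_STATUS_ORDER.index(current) >
            SKETCH_STATUS_ORDER.index(other))


# A late "started" message arriving for a task already recorded as
# succeeded would take the state backward, so process_message returns
# True and a confirming poll is issued instead of changing state:
assert status_is_gt('succeeded', 'running')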
Example #54
0
    def _prep_submit_task_job(self, suite, itask, dry_run, check_syntax=True):
        """Prepare a task job submission.

        Return itask on successful preparation, False on a preparation
        error, or None if dynamic host selection is not yet complete.

        """
        if itask.local_job_file_path and not dry_run:
            return itask

        # Handle broadcasts
        overrides = self.task_events_mgr.broadcast_mgr.get_broadcast(
            itask.identity)
        if overrides:
            rtconfig = pdeepcopy(itask.tdef.rtconfig)
            poverride(rtconfig, overrides, prepend=True)
        else:
            rtconfig = itask.tdef.rtconfig

        # Determine task host settings now, just before job submission,
        # because dynamic host selection may be used.
        try:
            task_host = self.task_remote_mgr.remote_host_select(
                rtconfig['remote']['host'])
        except TaskRemoteMgmtError as exc:
            # Submit number not yet incremented
            itask.submit_num += 1
            itask.summary['job_hosts'][itask.submit_num] = ''
            # Retry delays, needed for the try_num
            self._set_retry_timers(itask, rtconfig)
            self._prep_submit_task_job_error(
                suite, itask, dry_run, '(remote host select)', exc)
            return False
        else:
            if task_host is None:  # host select not ready
                itask.set_summary_message(self.REMOTE_SELECT_MSG)
                return
            itask.task_host = task_host
            # Submit number not yet incremented
            itask.submit_num += 1
            # Retry delays, needed for the try_num
            self._set_retry_timers(itask, rtconfig)

        try:
            job_conf = self._prep_submit_task_job_impl(suite, itask, rtconfig)
            local_job_file_path = get_task_job_job_log(
                suite, itask.point, itask.tdef.name, itask.submit_num)
            self.job_file_writer.write(local_job_file_path, job_conf,
                                       check_syntax=check_syntax)
        except Exception as exc:
            # Could be a bad command template, an IOError, etc.
            self._prep_submit_task_job_error(
                suite, itask, dry_run, '(prepare job file)', exc)
            return False
        itask.local_job_file_path = local_job_file_path

        if dry_run:
            itask.set_summary_message('job file written (edit/dry-run)')
            LOG.debug('[%s] -%s', itask, itask.summary['latest_message'])

        # Return value used by "cylc submit" and "cylc jobscript":
        return itask
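
Callers therefore have to distinguish three outcomes from this method: the prepared itask, False after a logged preparation error, and None while dynamic host selection is still pending. A hypothetical caller branching on these outcomes (the surrounding names are illustrative, not from the source):

# Hypothetical caller sketch - submit_prepared_jobs is not a real method.
def submit_prepared_jobs(mgr, suite, itasks, dry_run=False):
    prepared = []
    for itask in itasks:
        result = mgr._prep_submit_task_job(suite, itask, dry_run)
        if result is None:
            # Host selection not yet complete; leave the task for a
            # later pass.
            continue
        if result is False:
            # Preparation failed; the error has already been handled.
            continue
        prepared.append(result)  # ready for submission
    return prepared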