Example #1
0
    def _process_message_started(self, itask, event_time):
        """Helper for process_message, handle a started message."""
        if itask.job_vacated:
            itask.job_vacated = False
            LOG.warning("Vacated job restarted", itask=itask)
        self.pflag = True
        itask.state.reset_state(TASK_STATUS_RUNNING)
        itask.set_event_time('started', event_time)
        self.suite_db_mgr.put_update_task_jobs(itask, {
            "time_run": itask.summary['started_time_string']})
        if itask.summary['execution_time_limit']:
            execution_timeout = itask.summary['execution_time_limit']
        else:
            execution_timeout = self._get_events_conf(
                itask, 'execution timeout')
        try:
            itask.timeout_timers[TASK_STATUS_RUNNING] = (
                itask.summary['started_time'] + float(execution_timeout))
        except (TypeError, ValueError):
            itask.timeout_timers[TASK_STATUS_RUNNING] = None

        # submission was successful so reset submission try number
        if TASK_STATUS_SUBMIT_RETRYING in itask.try_timers:
            itask.try_timers[TASK_STATUS_SUBMIT_RETRYING].num = 0
        self.setup_event_handlers(itask, 'started', 'job started')
        self.set_poll_time(itask)
Example #2
0
 def check_job_time(self, itask, now):
     """Check/handle job timeout and poll timer"""
     can_poll = self.check_poll_time(itask, now)
     if itask.timeout is None or now <= itask.timeout:
         return can_poll
     # Timeout reached for task, emit event and reset itask.timeout
     if itask.state.status == TASK_STATUS_RUNNING:
         time_ref = itask.summary['started_time']
         event = 'execution timeout'
     elif itask.state.status == TASK_STATUS_SUBMITTED:
         time_ref = itask.summary['submitted_time']
         event = 'submission timeout'
     msg = event
     try:
         msg += ' after %s' % intvl_as_str(itask.timeout - time_ref)
     except (TypeError, ValueError):
         # Badness in time_ref?
         pass
     itask.timeout = None  # emit event only once
     if msg and event:
         LOG.warning(msg, itask=itask)
         self.setup_event_handlers(itask, event, msg)
         return True
     else:
         return can_poll
Example #3
0
 def _report_connection_if_denied(self):
     """Log an (un?)successful connection attempt."""
     prog_name, user, host, uuid = _get_client_info()[1:]
     connection_denied = self._get_client_connection_denied()
     if connection_denied:
         LOG.warning(self.__class__.LOG_CONNECT_DENIED_TMPL % (
             user, host, prog_name, uuid))
Example #4
0
    def _execute_stmt(self, stmt, stmt_args_list):
        """Helper for "self.execute_queued_items".

        Execute a statement. If this is the public database, return True on
        success and False on failure. If this is the private database, return
        True on success, and raise on failure.
        """
        try:
            self.connect()
            self.conn.executemany(stmt, stmt_args_list)
        except sqlite3.Error:
            if not self.is_public:
                raise
            if cylc.flags.debug:
                traceback.print_exc()
            err_log = ("cannot execute database statement:\n"
                       "file=%(file)s:\nstmt=%(stmt)s") % {
                           "file": self.db_file_name,
                           "stmt": stmt
                       }
            for i, stmt_args in enumerate(stmt_args_list):
                err_log += ("\nstmt_args[%(i)d]=%(stmt_args)s" % {
                    "i": i,
                    "stmt_args": stmt_args
                })
            LOG.warning(err_log)
            raise
 def _job_cmd_out_callback(self, suite, itask, cmd_ctx, line):
     """Callback on job command STDOUT/STDERR."""
     if cmd_ctx.cmd_kwargs.get("host") and cmd_ctx.cmd_kwargs.get("user"):
         owner_at_host = "(%(user)s@%(host)s) " % cmd_ctx.cmd_kwargs
     elif cmd_ctx.cmd_kwargs.get("host"):
         owner_at_host = "(%(host)s) " % cmd_ctx.cmd_kwargs
     elif cmd_ctx.cmd_kwargs.get("user"):
         owner_at_host = "(%(user)s@localhost) " % cmd_ctx.cmd_kwargs
     else:
         owner_at_host = ""
     try:
         timestamp, _, content = line.split("|")
     except ValueError:
         pass
     else:
         line = "%s %s" % (timestamp, content)
     job_activity_log = self.task_events_mgr.get_task_job_activity_log(
         suite, itask.point, itask.tdef.name)
     try:
         with open(job_activity_log, "ab") as handle:
             if not line.endswith("\n"):
                 line += "\n"
             handle.write(owner_at_host + line)
     except IOError as exc:
         LOG.warning("%s: write failed\n%s" % (job_activity_log, exc))
         LOG.warning(owner_at_host + line, itask=itask)
Example #6
0
    def _process_message_started(self, itask, event_time):
        """Helper for process_message, handle a started message."""
        if itask.job_vacated:
            itask.job_vacated = False
            LOG.warning("Vacated job restarted", itask=itask)
        self.pflag = True
        itask.state.reset_state(TASK_STATUS_RUNNING)
        itask.set_summary_time('started', event_time)
        self.suite_db_mgr.put_update_task_jobs(itask, {
            "time_run": itask.summary['started_time_string']})
        if itask.summary['execution_time_limit']:
            execution_timeout = itask.summary['execution_time_limit']
        else:
            execution_timeout = self._get_events_conf(
                itask, 'execution timeout')
        try:
            itask.timeout_timers[TASK_STATUS_RUNNING] = (
                itask.summary['started_time'] + float(execution_timeout))
        except (TypeError, ValueError):
            itask.timeout_timers[TASK_STATUS_RUNNING] = None

        # submission was successful so reset submission try number
        if TASK_STATUS_SUBMIT_RETRYING in itask.try_timers:
            itask.try_timers[TASK_STATUS_SUBMIT_RETRYING].num = 0
        self.setup_event_handlers(itask, 'started', 'job started')
        self.set_poll_time(itask)
Example #7
0
 def check_job_time(self, itask, now):
     """Check/handle job timeout and poll timer"""
     can_poll = self.check_poll_time(itask, now)
     if itask.timeout is None or now <= itask.timeout:
         return can_poll
     # Timeout reached for task, emit event and reset itask.timeout
     if itask.state.status == TASK_STATUS_RUNNING:
         time_ref = itask.summary['started_time']
         event = 'execution timeout'
     elif itask.state.status == TASK_STATUS_SUBMITTED:
         time_ref = itask.summary['submitted_time']
         event = 'submission timeout'
     msg = event
     try:
         msg += ' after %s' % intvl_as_str(itask.timeout - time_ref)
     except (TypeError, ValueError):
         # Badness in time_ref?
         pass
     itask.timeout = None  # emit event only once
     if msg and event:
         LOG.warning(msg, itask=itask)
         self.setup_event_handlers(itask, event, msg)
         return True
     else:
         return can_poll
Example #8
0
 def _job_cmd_out_callback(self, suite, itask, cmd_ctx, line):
     """Callback on job command STDOUT/STDERR."""
     if cmd_ctx.cmd_kwargs.get("host") and cmd_ctx.cmd_kwargs.get("user"):
         user_at_host = "(%(user)s@%(host)s) " % cmd_ctx.cmd_kwargs
     elif cmd_ctx.cmd_kwargs.get("host"):
         user_at_host = "(%(host)s) " % cmd_ctx.cmd_kwargs
     elif cmd_ctx.cmd_kwargs.get("user"):
         user_at_host = "(%(user)s@localhost) " % cmd_ctx.cmd_kwargs
     else:
         user_at_host = ""
     try:
         timestamp, _, content = line.split("|")
     except ValueError:
         pass
     else:
         line = "%s %s" % (timestamp, content)
     job_activity_log = self.task_events_mgr.get_task_job_activity_log(
         suite, itask.point, itask.tdef.name)
     try:
         with open(job_activity_log, "ab") as handle:
             if not line.endswith("\n"):
                 line += "\n"
             handle.write(user_at_host + line)
     except IOError as exc:
         LOG.warning("%s: write failed\n%s" % (job_activity_log, exc))
         LOG.warning(user_at_host + line, itask=itask)
Example #9
0
 def _report_connection_if_denied(self):
     """Log an (un?)successful connection attempt."""
     prog_name, user, host, uuid = _get_client_info()[1:]
     connection_denied = self._get_client_connection_denied()
     if connection_denied:
         LOG.warning(self.__class__.LOG_CONNECT_DENIED_TMPL % (
             user, host, prog_name, uuid))
Example #10
0
    def remote_tidy(self):
        """Remove suite contact files from initialised remotes.

        Call "cylc remote-tidy".
        This method is called on suite shutdown, so we want nothing to hang.
        Timeout any incomplete commands after 10 seconds.

        Also remove UUID file on suite host ".service/uuid".
        """
        # Remove UUID file
        uuid_fname = os.path.join(
            self.suite_srv_files_mgr.get_suite_srv_dir(self.suite),
            FILE_BASE_UUID)
        try:
            os.unlink(uuid_fname)
        except OSError:
            pass
        # Issue all SSH commands in parallel
        procs = {}
        for (host, owner), init_with_contact in self.remote_init_map.items():
            if init_with_contact != REMOTE_INIT_DONE:
                continue
            cmd = ['timeout', '10', 'cylc', 'remote-tidy']
            if is_remote_host(host):
                cmd.append('--host=%s' % host)
            if is_remote_user(owner):
                cmd.append('--user=%s' % owner)
            if cylc.flags.debug:
                cmd.append('--debug')
            cmd.append(os.path.join(glbl_cfg().get_derived_host_item(
                self.suite, 'suite run directory', host, owner)))
            procs[(host, owner)] = (
                cmd,
                Popen(cmd, stdout=PIPE, stderr=PIPE, stdin=open(os.devnull)))
        # Wait for commands to complete for a max of 10 seconds
        timeout = time() + 10.0
        while procs and time() < timeout:
            for (host, owner), (cmd, proc) in procs.copy().items():
                if proc.poll() is None:
                    continue
                del procs[(host, owner)]
                out, err = proc.communicate()
                if proc.wait():
                    LOG.warning(TaskRemoteMgmtError(
                        TaskRemoteMgmtError.MSG_TIDY,
                        (host, owner), ' '.join(quote(item) for item in cmd),
                        proc.ret_code, out, err))
        # Terminate any remaining commands
        for (host, owner), (cmd, proc) in procs.items():
            try:
                proc.terminate()
            except OSError:
                pass
            out, err = proc.communicate()
            if proc.wait():
                LOG.warning(TaskRemoteMgmtError(
                    TaskRemoteMgmtError.MSG_TIDY,
                    (host, owner), ' '.join(quote(item) for item in cmd),
                    proc.ret_code, out, err))
Example #11
0
 def recover_pub_from_pri(self):
     """Recover public database from private database."""
     if self.pub_dao.n_tries >= self.pub_dao.MAX_TRIES:
         self.copy_pri_to_pub()
         LOG.warning(
             "%(pub_db_name)s: recovered from %(pri_db_name)s" % {
                 "pub_db_name": self.pub_dao.db_file_name,
                 "pri_db_name": self.pri_dao.db_file_name})
         self.pub_dao.n_tries = 0
Example #12
0
 def recover_pub_from_pri(self):
     """Recover public database from private database."""
     if self.pub_dao.n_tries >= self.pub_dao.MAX_TRIES:
         self.copy_pri_to_pub()
         LOG.warning(
             "%(pub_db_name)s: recovered from %(pri_db_name)s" % {
                 "pub_db_name": self.pub_dao.db_file_name,
                 "pri_db_name": self.pri_dao.db_file_name})
         self.pub_dao.n_tries = 0
Example #13
0
 def put_command(self, ctx, callback, callback_args=None):
     """Queue a new shell command to execute."""
     try:
         result = self.pool.apply_async(_run_command, [ctx])
     except AssertionError as exc:
         LOG.warning("%s\n  %s\n %s" %
                     (str(exc), "Rejecting command (pool closed)", ctx.cmd))
     else:
         self.results[id(result)] = (result, callback, callback_args)
Example #14
0
 def execute_queued_items(self):
     """Execute queued items for each table."""
     try:
         for table in self.tables.values():
             # DELETE statements may have varying number of WHERE args so we
             # can only executemany for each identical template statement.
             for stmt, stmt_args_list in table.delete_queues.items():
                 self._execute_stmt(stmt, stmt_args_list)
             # INSERT statements are uniform for each table, so all INSERT
             # statements can be executed using a single "executemany" call.
             if table.insert_queue:
                 self._execute_stmt(table.get_insert_stmt(),
                                    table.insert_queue)
             # UPDATE statements can have varying number of SET and WHERE
             # args so we can only executemany for each identical template
             # statement.
             for stmt, stmt_args_list in table.update_queues.items():
                 self._execute_stmt(stmt, stmt_args_list)
         # Connection should only be opened if we have executed something.
         if self.conn is None:
             return
         self.conn.commit()
     except sqlite3.Error:
         if not self.is_public:
             raise
         self.n_tries += 1
         LOG.warning(
             "%(file)s: write attempt (%(attempt)d) did not complete\n" % {
                 "file": self.db_file_name,
                 "attempt": self.n_tries
             })
         if self.conn is not None:
             try:
                 self.conn.rollback()
             except sqlite3.Error:
                 pass
         return
     else:
         # Clear the queues
         for table in self.tables.values():
             table.delete_queues.clear()
             del table.insert_queue[:]  # list.clear avail from Python 3.3
             table.update_queues.clear()
         # Report public database retry recovery if necessary
         if self.n_tries:
             LOG.warning(
                 "%(file)s: recovered after (%(attempt)d) attempt(s)\n" % {
                     "file": self.db_file_name,
                     "attempt": self.n_tries
                 })
         self.n_tries = 0
     finally:
         # Note: This is not strictly necessary. However, if the suite run
         # directory is removed, a forced reconnection to the private
         # database will ensure that the suite dies.
         self.close()
Example #15
0
 def put_command(self, ctx, callback, callback_args=None):
     """Queue a new shell command to execute."""
     try:
         result = self.pool.apply_async(_run_command, [ctx])
     except AssertionError as exc:
         LOG.warning("%s\n  %s\n %s" % (
             str(exc),
             "Rejecting command (pool closed)",
             ctx.cmd))
     else:
         self.results[id(result)] = (result, callback, callback_args)
Example #16
0
 def execute_queued_items(self):
     """Execute queued items for each table."""
     try:
         for table in self.tables.values():
             # DELETE statements may have varying number of WHERE args so we
             # can only executemany for each identical template statement.
             for stmt, stmt_args_list in table.delete_queues.items():
                 self._execute_stmt(stmt, stmt_args_list)
             # INSERT statements are uniform for each table, so all INSERT
             # statements can be executed using a single "executemany" call.
             if table.insert_queue:
                 self._execute_stmt(table.get_insert_stmt(), table.insert_queue)
             # UPDATE statements can have varying number of SET and WHERE
             # args so we can only executemany for each identical template
             # statement.
             for stmt, stmt_args_list in table.update_queues.items():
                 self._execute_stmt(stmt, stmt_args_list)
         # Connection should only be opened if we have executed something.
         if self.conn is None:
             return
         self.conn.commit()
     except sqlite3.Error:
         if not self.is_public:
             raise
         self.n_tries += 1
         LOG.warning(
             "%(file)s: write attempt (%(attempt)d) did not complete\n"
             % {"file": self.db_file_name, "attempt": self.n_tries}
         )
         if self.conn is not None:
             try:
                 self.conn.rollback()
             except sqlite3.Error:
                 pass
         return
     else:
         # Clear the queues
         for table in self.tables.values():
             table.delete_queues.clear()
             del table.insert_queue[:]  # list.clear avail from Python 3.3
             table.update_queues.clear()
         # Report public database retry recovery if necessary
         if self.n_tries:
             LOG.warning(
                 "%(file)s: recovered after (%(attempt)d) attempt(s)\n"
                 % {"file": self.db_file_name, "attempt": self.n_tries}
             )
         self.n_tries = 0
     finally:
         # Note: This is not strictly necessary. However, if the suite run
         # directory is removed, a forced reconnection to the private
         # database will ensure that the suite dies.
         self.close()
Example #17
0
 def poll_task_jobs(self, suite, itasks, warn_skips=False):
     """Poll jobs of specified tasks."""
     active_itasks = []
     for itask in itasks:
         if itask.state.status in TASK_STATUSES_ACTIVE:
             active_itasks.append(itask)
         elif warn_skips:  # and not active
             LOG.warning(
                 '%s: skip poll, task not pollable' % itask.identity)
     self._run_job_cmd(
         self.JOBS_POLL, suite, active_itasks,
         self._poll_task_jobs_callback)
    def kill_task_jobs(self, suite, itasks):
        """Kill jobs of active tasks, and hold the tasks.

        If items is specified, kill active tasks matching given IDs.

        """
        active_itasks = []
        for itask in itasks:
            if itask.state.status in TASK_STATUSES_ACTIVE:
                itask.state.set_held()
                active_itasks.append(itask)
            else:
                LOG.warning('skipping %s: task not killable' % itask.identity)
        self._run_job_cmd(self.JOBS_KILL, suite, active_itasks,
                          self._kill_task_jobs_callback)
 def _manip_task_jobs_callback(ctx,
                               suite,
                               itasks,
                               summary_callback,
                               more_callbacks=None):
     """Callback when submit/poll/kill tasks command exits."""
     if ctx.ret_code:
         LOG.error(ctx)
     else:
         LOG.debug(ctx)
     tasks = {}
     # Note for "kill": It is possible for a job to trigger its trap and
     # report back to the suite back this logic is called. If so, the task
     # will no longer be TASK_STATUS_SUBMITTED or TASK_STATUS_RUNNING, and
     # its output line will be ignored here.
     for itask in itasks:
         if itask.point is not None and itask.submit_num:
             submit_num = "%02d" % (itask.submit_num)
             tasks[(str(itask.point), itask.tdef.name, submit_num)] = itask
     handlers = [(BatchSysManager.OUT_PREFIX_SUMMARY, summary_callback)]
     if more_callbacks:
         for prefix, callback in more_callbacks.items():
             handlers.append((prefix, callback))
     out = ctx.out
     if not out:
         out = ""
         # Something is very wrong here
         # Fallback to use "job_log_dirs" list to report the problem
         job_log_dirs = ctx.cmd_kwargs.get("job_log_dirs", [])
         for job_log_dir in job_log_dirs:
             point, name, submit_num = job_log_dir.split(os.sep, 2)
             itask = tasks[(point, name, submit_num)]
             out += (BatchSysManager.OUT_PREFIX_SUMMARY +
                     "|".join([ctx.timestamp, job_log_dir, "1"]) + "\n")
     for line in out.splitlines(True):
         for prefix, callback in handlers:
             if line.startswith(prefix):
                 line = line[len(prefix):].strip()
                 try:
                     path = line.split("|", 2)[1]  # timestamp, path, status
                     point, name, submit_num = path.split(os.sep, 2)
                     itask = tasks[(point, name, submit_num)]
                     callback(suite, itask, ctx, line)
                 except (KeyError, ValueError):
                     if cylc.flags.debug:
                         LOG.warning('Unhandled %s output: %s' %
                                     (ctx.cmd_key, line))
                         LOG.warning(traceback.format_exc())
Example #20
0
    def unlink_hosts_contacts(self, reg):
        """Remove suite contact files from initialised hosts.

        This is called on shutdown, so we don't want anything to hang.
        Terminate any incomplete SSH commands after 10 seconds.
        """
        # Issue all SSH commands in parallel
        procs = {}
        for (host, owner), should_unlink in self.init_host_map.items():
            if not should_unlink:
                continue
            user_at_host = host
            if owner:
                user_at_host = owner + '@' + host
            ssh_tmpl = GLOBAL_CFG.get_host_item('ssh command', host, owner)
            r_suite_contact_file = os.path.join(
                GLOBAL_CFG.get_derived_host_item(
                    reg, 'suite run directory', host, owner),
                self.suite_srv_files_mgr.DIR_BASE_SRV,
                self.suite_srv_files_mgr.FILE_BASE_CONTACT)
            cmd = shlex.split(ssh_tmpl) + [
                '-n', user_at_host, 'rm', '-f', r_suite_contact_file]
            procs[user_at_host] = (cmd, Popen(cmd, stdout=PIPE, stderr=PIPE))
        # Wait for commands to complete for a max of 10 seconds
        timeout = time() + 10.0
        while procs and time() < timeout:
            for user_at_host, (cmd, proc) in procs.copy().items():
                if proc.poll() is None:
                    continue
                del procs[user_at_host]
                out, err = proc.communicate()
                if proc.wait():
                    LOG.warning(RemoteJobHostInitError(
                        RemoteJobHostInitError.MSG_TIDY,
                        user_at_host, ' '.join(quote(item) for item in cmd),
                        proc.returncode, out, err))
        # Terminate any remaining commands
        for user_at_host, (cmd, proc) in procs.items():
            try:
                proc.terminate()
            except OSError:
                pass
            out, err = proc.communicate()
            if proc.wait():
                LOG.warning(RemoteJobHostInitError(
                    RemoteJobHostInitError.MSG_TIDY,
                    user_at_host, ' '.join(quote(item) for item in cmd),
                    proc.returncode, out, err))
Example #21
0
    def _process_message_started(self, itask, event_time):
        """Helper for process_message, handle a started message."""
        if itask.job_vacated:
            itask.job_vacated = False
            LOG.warning("Vacated job restarted", itask=itask)
        self.pflag = True
        if itask.state.reset_state(TASK_STATUS_RUNNING):
            self.setup_event_handlers(itask, 'started', 'job started')
        itask.set_summary_time('started', event_time)
        self._reset_job_timers(itask)
        self.suite_db_mgr.put_update_task_jobs(itask, {
            "time_run": itask.summary['started_time_string']})

        # submission was successful so reset submission try number
        if TASK_STATUS_SUBMIT_RETRYING in itask.try_timers:
            itask.try_timers[TASK_STATUS_SUBMIT_RETRYING].num = 0
Example #22
0
    def kill_task_jobs(self, suite, itasks):
        """Kill jobs of active tasks, and hold the tasks.

        If items is specified, kill active tasks matching given IDs.

        """
        to_kill_tasks = []
        for itask in itasks:
            if itask.state.status in TASK_STATUSES_ACTIVE:
                itask.state.set_held()
                to_kill_tasks.append(itask)
            else:
                LOG.warning('skipping %s: task not killable' % itask.identity)
        self._run_job_cmd(
            self.JOBS_KILL, suite, to_kill_tasks,
            self._kill_task_jobs_callback)
Example #23
0
    def _process_message_started(self, itask, event_time):
        """Helper for process_message, handle a started message."""
        if itask.job_vacated:
            itask.job_vacated = False
            LOG.warning("Vacated job restarted", itask=itask)
        self.pflag = True
        if itask.state.reset_state(TASK_STATUS_RUNNING):
            self.setup_event_handlers(itask, 'started', 'job started')
        itask.set_summary_time('started', event_time)
        self._reset_job_timers(itask)
        self.suite_db_mgr.put_update_task_jobs(
            itask, {"time_run": itask.summary['started_time_string']})

        # submission was successful so reset submission try number
        if TASK_STATUS_SUBMIT_RETRYING in itask.try_timers:
            itask.try_timers[TASK_STATUS_SUBMIT_RETRYING].num = 0
Example #24
0
    def _check_access_priv(self, required_privilege_level):
        """Raise an exception if client privilege is insufficient.

        (See the documentation above for the boolean version of this function).

        """
        auth_user, prog_name, user, host, uuid = _get_client_info()
        priv_level = self._get_priv_level(auth_user)
        if (PRIVILEGE_LEVELS.index(priv_level) <
                PRIVILEGE_LEVELS.index(required_privilege_level)):
            err = self.CONNECT_DENIED_PRIV_TMPL % (
                priv_level, required_privilege_level,
                user, host, prog_name, uuid)
            LOG.warning(err)
            # Raise an exception to be sent back to the client.
            raise cherrypy.HTTPError(403, err)
        return True
Example #25
0
 def report_connection_if_denied(self):
     """Log an (un?)successful connection attempt."""
     try:
         (auth_user, prog_name, user, host, uuid,
          priv_level) = get_client_info()
     except Exception:
         LOG.warning(
             self.__class__.LOG_CONNECT_DENIED_TMPL % (
                 "unknown", "unknown", "unknown", "unknown")
         )
         return
     connection_denied = get_client_connection_denied()
     if connection_denied:
         LOG.warning(
             self.__class__.LOG_CONNECT_DENIED_TMPL % (
                 user, host, prog_name, uuid)
         )
Example #26
0
    def _check_access_priv(self, required_privilege_level):
        """Raise an exception if client privilege is insufficient for server_obj.

        (See the documentation above for the boolean version of this function).

        """
        auth_user, prog_name, user, host, uuid = _get_client_info()
        priv_level = self._get_priv_level(auth_user)
        if (PRIVILEGE_LEVELS.index(priv_level) <
                PRIVILEGE_LEVELS.index(required_privilege_level)):
            err = self.CONNECT_DENIED_PRIV_TMPL % (
                priv_level, required_privilege_level,
                user, host, prog_name, uuid)
            LOG.warning(err)
            # Raise an exception to be sent back to the client.
            raise cherrypy.HTTPError(403, err)
        return True
Example #27
0
 def report_connection_if_denied(self):
     """Log an (un?)successful connection attempt."""
     try:
         (auth_user, prog_name, user, host, uuid,
          priv_level) = get_client_info()
     except Exception:
         LOG.warning(
             self.__class__.LOG_CONNECT_DENIED_TMPL % (
                 "unknown", "unknown", "unknown", "unknown")
         )
         return
     connection_denied = get_client_connection_denied()
     if connection_denied:
         LOG.warning(
             self.__class__.LOG_CONNECT_DENIED_TMPL % (
                 user, host, prog_name, uuid)
         )
Example #28
0
    def kill_task_jobs(self, suite, itasks, warn_skips=False):
        """Kill jobs of active tasks, and hold the tasks.

        If items is specified, kill active tasks matching given IDs.

        """
        active_itasks = []
        for itask in itasks:
            if itask.state.status in TASK_STATUSES_ACTIVE:
                itask.state.set_held()
                active_itasks.append(itask)
            elif warn_skips:  # and not active
                LOG.warning(
                    '%s: skip kill, task not killable' % itask.identity)
        self._run_job_cmd(
            self.JOBS_KILL, suite, active_itasks,
            self._kill_task_jobs_callback)
Example #29
0
 def _manip_task_jobs_callback(
         ctx, suite, itasks, summary_callback, more_callbacks=None):
     """Callback when submit/poll/kill tasks command exits."""
     if ctx.ret_code:
         LOG.error(ctx)
     else:
         LOG.debug(ctx)
     tasks = {}
     # Note for "kill": It is possible for a job to trigger its trap and
     # report back to the suite back this logic is called. If so, the task
     # will no longer be TASK_STATUS_SUBMITTED or TASK_STATUS_RUNNING, and
     # its output line will be ignored here.
     for itask in itasks:
         if itask.point is not None and itask.submit_num:
             submit_num = "%02d" % (itask.submit_num)
             tasks[(str(itask.point), itask.tdef.name, submit_num)] = itask
     handlers = [(BatchSysManager.OUT_PREFIX_SUMMARY, summary_callback)]
     if more_callbacks:
         for prefix, callback in more_callbacks.items():
             handlers.append((prefix, callback))
     out = ctx.out
     if not out:
         out = ""
         # Something is very wrong here
         # Fallback to use "job_log_dirs" list to report the problem
         job_log_dirs = ctx.cmd_kwargs.get("job_log_dirs", [])
         for job_log_dir in job_log_dirs:
             point, name, submit_num = job_log_dir.split(os.sep, 2)
             itask = tasks[(point, name, submit_num)]
             out += (BatchSysManager.OUT_PREFIX_SUMMARY +
                     "|".join([ctx.timestamp, job_log_dir, "1"]) + "\n")
     for line in out.splitlines(True):
         for prefix, callback in handlers:
             if line.startswith(prefix):
                 line = line[len(prefix):].strip()
                 try:
                     path = line.split("|", 2)[1]  # timestamp, path, status
                     point, name, submit_num = path.split(os.sep, 2)
                     itask = tasks[(point, name, submit_num)]
                     callback(suite, itask, ctx, line)
                 except (KeyError, ValueError):
                     if cylc.flags.debug:
                         LOG.warning('Unhandled %s output: %s' % (
                             ctx.cmd_key, line))
                         LOG.warning(traceback.format_exc())
Example #30
0
 def report_id_requests(self):
     """Report the frequency of identification (scan) requests."""
     current_time = time.time()
     interval = current_time - self._id_start_time
     if interval > self.CLIENT_ID_REPORT_SECONDS:
         rate = float(self._num_id_requests) / interval
         if rate > self.CLIENT_ID_MIN_REPORT_RATE:
             LOG.warning(
                 self.__class__.LOG_IDENTIFY_TMPL % (
                     self._num_id_requests, interval)
             )
         elif cylc.flags.debug:
             LOG.info(
                 self.__class__.LOG_IDENTIFY_TMPL % (
                     self._num_id_requests, interval)
             )
         self._id_start_time = current_time
         self._num_id_requests = 0
Example #31
0
 def log_task_job_activity(self, ctx, suite, point, name, submit_num=NN):
     """Log an activity for a task job."""
     ctx_str = str(ctx)
     if not ctx_str:
         return
     if isinstance(ctx.cmd_key, tuple):  # An event handler
         submit_num = ctx.cmd_key[-1]
     job_activity_log = self.get_task_job_activity_log(
         suite, point, name, submit_num)
     try:
         with open(job_activity_log, "ab") as handle:
             handle.write(ctx_str + '\n')
     except IOError as exc:
         LOG.warning("%s: write failed\n%s" % (job_activity_log, exc))
     if ctx.cmd and ctx.ret_code:
         LOG.error(ctx_str)
     elif ctx.cmd:
         LOG.debug(ctx_str)
Example #32
0
 def report_id_requests(self):
     """Report the frequency of identification (scan) requests."""
     current_time = time.time()
     interval = current_time - self._id_start_time
     if interval > self.CLIENT_ID_REPORT_SECONDS:
         rate = float(self._num_id_requests) / interval
         if rate > self.CLIENT_ID_MIN_REPORT_RATE:
             LOG.warning(
                 self.__class__.LOG_IDENTIFY_TMPL % (
                     self._num_id_requests, interval)
             )
         elif cylc.flags.debug:
             LOG.info(
                 self.__class__.LOG_IDENTIFY_TMPL % (
                     self._num_id_requests, interval)
             )
         self._id_start_time = current_time
         self._num_id_requests = 0
 def _check_timeout(self, itask, now):
     """Check/handle submission/execution timeouts."""
     if itask.state.status == TASK_STATUS_RUNNING:
         timer = itask.poll_timers.get(self.KEY_EXECUTE_TIME_LIMIT)
         if timer is not None:
             if not timer.is_timeout_set():
                 timer.next()
             if not timer.is_delay_done():
                 # Don't poll
                 return False
             if timer.next() is not None:
                 # Poll now, and more retries lined up
                 return True
             # No more retry lined up, can issue execution timeout event
     if itask.state.status in itask.timeout_timers:
         timeout = itask.timeout_timers[itask.state.status]
         if timeout is None or now <= timeout:
             return False
         itask.timeout_timers[itask.state.status] = None
         if (itask.state.status == TASK_STATUS_RUNNING
                 and itask.summary['started_time'] is not None):
             msg = 'job started %s ago, but has not finished' % (
                 get_seconds_as_interval_string(
                     timeout - itask.summary['started_time']))
             event = 'execution timeout'
         elif (itask.state.status == TASK_STATUS_SUBMITTED
               and itask.summary['submitted_time'] is not None):
             msg = 'job submitted %s ago, but has not started' % (
                 get_seconds_as_interval_string(
                     timeout - itask.summary['submitted_time']))
             event = 'submission timeout'
         else:
             return False
         LOG.warning(msg, itask=itask)
         self.task_events_mgr.setup_event_handlers(itask, event, msg)
         return True
Example #34
0
 def _check_timeout(self, itask, now):
     """Check/handle submission/execution timeouts."""
     if itask.state.status == TASK_STATUS_RUNNING:
         timer = itask.poll_timers.get(self.KEY_EXECUTE_TIME_LIMIT)
         if timer is not None:
             if not timer.is_timeout_set():
                 timer.next()
             if not timer.is_delay_done():
                 # Don't poll
                 return False
             if timer.next() is not None:
                 # Poll now, and more retries lined up
                 return True
             # No more retry lined up, can issue execution timeout event
     if itask.state.status in itask.timeout_timers:
         timeout = itask.timeout_timers[itask.state.status]
         if timeout is None or now <= timeout:
             return False
         itask.timeout_timers[itask.state.status] = None
         if (itask.state.status == TASK_STATUS_RUNNING and
                 itask.summary['started_time'] is not None):
             msg = 'job started %s ago, but has not finished' % (
                 get_seconds_as_interval_string(
                     timeout - itask.summary['started_time']))
             event = 'execution timeout'
         elif (itask.state.status == TASK_STATUS_SUBMITTED and
                 itask.summary['submitted_time'] is not None):
             msg = 'job submitted %s ago, but has not started' % (
                 get_seconds_as_interval_string(
                     timeout - itask.summary['submitted_time']))
             event = 'submission timeout'
         else:
             return False
         LOG.warning(msg, itask=itask)
         self.task_events_mgr.setup_event_handlers(itask, event, msg)
         return True
Example #35
0
    def _execute_stmt(self, stmt, stmt_args_list):
        """Helper for "self.execute_queued_items".

        Execute a statement. If this is the public database, return True on
        success and False on failure. If this is the private database, return
        True on success, and raise on failure.
        """
        try:
            self.connect()
            self.conn.executemany(stmt, stmt_args_list)
        except sqlite3.Error:
            if not self.is_public:
                raise
            if cylc.flags.debug:
                traceback.print_exc()
            err_log = (
                "cannot execute database statement:\n"
                "file=%(file)s:\nstmt=%(stmt)s"
            ) % {"file": self.db_file_name, "stmt": stmt}
            for i, stmt_args in enumerate(stmt_args_list):
                err_log += ("\nstmt_args[%(i)d]=%(stmt_args)s" % {
                    "i": i, "stmt_args": stmt_args})
            LOG.warning(err_log)
            raise
Example #36
0
    def process_message(self,
                        itask,
                        severity,
                        message,
                        event_time=None,
                        flag='',
                        submit_num=None):
        """Parse an incoming task message and update task state.

        Incoming, e.g. "succeeded at <TIME>", may be from task job or polling.

        It is possible for my current state to be inconsistent with an incoming
        message (whether normal or polled) e.g. due to a late poll result, or a
        network outage, or manual state reset. To handle this, if a message
        would take the task state backward, issue a poll to confirm instead of
        changing state - then always believe the next message. Note that the
        next message might not be the result of this confirmation poll, in the
        unlikely event that a job emits a succession of messages very quickly,
        but this is the best we can do without somehow uniquely associating
        each poll with its result message.

        Arguments:
            itask (cylc.task_proxy.TaskProxy):
                The task proxy object relevant for the message.
            severity (str or int):
                Message severity, should be a recognised logging level.
            message (str):
                Message content.
            event_time (str):
                Event time stamp. Expect ISO8601 date time string.
                If not specified, use current time.
            flag (str):
                If specified, can be INCOMING_FLAG to indicate an incoming
                message, POLLED_FLAG to indicate a message resulted from a
                poll. Otherwise, the message is assumed to be generated by the
                logic in the suite server program.
            submit_num (int):
                The submit number of the task relevant for the message.
                If not specified, use latest submit number.

        Return:
            None: in normal circumstances.
            True: if polling is required to confirm a reversal of status.

        """
        # Log incoming messages
        if event_time is None:
            event_time = get_current_time_string()
        if submit_num is None:
            submit_num = itask.submit_num
        if flag == self.INCOMING_FLAG and submit_num != itask.submit_num:
            flag = self.IGNORED_INCOMING_FLAG
        log_message = '(current:%s)%s %s at %s' % (itask.state.status, flag,
                                                   message, event_time)
        LOG.log(self.LEVELS.get(severity, INFO), log_message, itask=itask)
        if flag == self.IGNORED_INCOMING_FLAG:
            LOG.warning(
                'submit-num=%d: ignore message from job submit-num=%d' %
                (itask.submit_num, submit_num),
                itask=itask)
            return

        # always update the suite state summary for latest message
        itask.summary['latest_message'] = message
        if flag == self.POLLED_FLAG:
            itask.summary['latest_message'] += ' %s' % self.POLLED_FLAG
        cylc.flags.iflag = True

        # Satisfy my output, if possible, and record the result.
        an_output_was_satisfied = itask.state.outputs.set_msg_trg_completion(
            message=message, is_completed=True)

        if message == TASK_OUTPUT_STARTED:
            if (flag == self.INCOMING_FLAG
                    and itask.state.is_gt(TASK_STATUS_RUNNING)):
                return True
            self._process_message_started(itask, event_time)
        elif message == TASK_OUTPUT_SUCCEEDED:
            self._process_message_succeeded(itask, event_time)
        elif message == TASK_OUTPUT_FAILED:
            if (flag == self.INCOMING_FLAG
                    and itask.state.is_gt(TASK_STATUS_FAILED)):
                return True
            self._process_message_failed(itask, event_time, self.JOB_FAILED)
        elif message == self.EVENT_SUBMIT_FAILED:
            if (flag == self.INCOMING_FLAG
                    and itask.state.is_gt(TASK_STATUS_SUBMIT_FAILED)):
                return True
            self._process_message_submit_failed(itask, event_time)
        elif message == TASK_OUTPUT_SUBMITTED:
            if (flag == self.INCOMING_FLAG
                    and itask.state.is_gt(TASK_STATUS_SUBMITTED)):
                return True
            self._process_message_submitted(itask, event_time)
        elif message.startswith(FAIL_MESSAGE_PREFIX):
            # Task received signal.
            if (flag == self.INCOMING_FLAG
                    and itask.state.is_gt(TASK_STATUS_FAILED)):
                return True
            signal = message[len(FAIL_MESSAGE_PREFIX):]
            self._db_events_insert(itask, "signaled", signal)
            self.suite_db_mgr.put_update_task_jobs(itask,
                                                   {"run_signal": signal})
            self._process_message_failed(itask, event_time, self.JOB_FAILED)
        elif message.startswith(ABORT_MESSAGE_PREFIX):
            # Task aborted with message
            if (flag == self.INCOMING_FLAG
                    and itask.state.is_gt(TASK_STATUS_FAILED)):
                return True
            aborted_with = message[len(ABORT_MESSAGE_PREFIX):]
            self._db_events_insert(itask, "aborted", message)
            self.suite_db_mgr.put_update_task_jobs(
                itask, {"run_signal": aborted_with})
            self._process_message_failed(itask, event_time, aborted_with)
        elif message.startswith(VACATION_MESSAGE_PREFIX):
            # Task job pre-empted into a vacation state
            self._db_events_insert(itask, "vacated", message)
            itask.set_summary_time('started')  # unset
            if TASK_STATUS_SUBMIT_RETRYING in itask.try_timers:
                itask.try_timers[TASK_STATUS_SUBMIT_RETRYING].num = 0
            itask.job_vacated = True
            # Believe this and change state without polling (could poll?).
            self.pflag = True
            itask.state.reset_state(TASK_STATUS_SUBMITTED)
            self._reset_job_timers(itask)
        elif an_output_was_satisfied:
            # Message of an as-yet unreported custom task output.
            # No state change.
            self.pflag = True
            self.suite_db_mgr.put_update_task_outputs(itask)
        else:
            # Unhandled messages. These include:
            #  * general non-output/progress messages
            #  * poll messages that repeat previous results
            # Note that all messages are logged already at the top.
            # No state change.
            LOG.debug('(current:%s) unhandled: %s' %
                      (itask.state.status, message),
                      itask=itask)
            if severity in [CRITICAL, ERROR, WARNING, INFO, DEBUG]:
                severity = getLevelName(severity)
            self._db_events_insert(itask,
                                   ("message %s" % str(severity).lower()),
                                   message)
        if severity in ['WARNING', 'CRITICAL', 'CUSTOM']:
            self.setup_event_handlers(itask, severity.lower(), message)
Example #37
0
    def process_events(self, schd_ctx):
        """Process task events that were created by "setup_event_handlers".

        schd_ctx is an instance of "Schduler" in "cylc.scheduler".
        """
        ctx_groups = {}
        now = time()
        for id_key, timer in self.event_timers.copy().items():
            key1, point, name, submit_num = id_key
            if timer.is_waiting:
                continue
            # Set timer if timeout is None.
            if not timer.is_timeout_set():
                if timer.next() is None:
                    LOG.warning("%s/%s/%02d %s failed" %
                                (point, name, submit_num, key1))
                    del self.event_timers[id_key]
                    continue
                # Report retries and delayed 1st try
                tmpl = None
                if timer.num > 1:
                    tmpl = "%s/%s/%02d %s failed, retrying in %s"
                elif timer.delay:
                    tmpl = "%s/%s/%02d %s will run after %s"
                if tmpl:
                    LOG.debug(tmpl % (point, name, submit_num, key1,
                                      timer.delay_timeout_as_str()))
            # Ready to run?
            if not timer.is_delay_done() or (
                    # Avoid flooding user's mail box with mail notification.
                    # Group together as many notifications as possible within a
                    # given interval.
                    timer.ctx.ctx_type == self.HANDLER_MAIL
                    and not schd_ctx.stop_mode
                    and self.next_mail_time is not None
                    and self.next_mail_time > now):
                continue

            timer.set_waiting()
            if timer.ctx.ctx_type == self.HANDLER_CUSTOM:
                # Run custom event handlers on their own
                self.proc_pool.put_command(
                    SuiteProcContext(
                        (key1, submit_num),
                        timer.ctx.cmd,
                        env=os.environ,
                        shell=True,
                    ), self._custom_handler_callback, [schd_ctx, id_key])
            else:
                # Group together built-in event handlers, where possible
                if timer.ctx not in ctx_groups:
                    ctx_groups[timer.ctx] = []
                ctx_groups[timer.ctx].append(id_key)

        next_mail_time = now + self.mail_interval
        for ctx, id_keys in ctx_groups.items():
            if ctx.ctx_type == self.HANDLER_MAIL:
                # Set next_mail_time if any mail sent
                self.next_mail_time = next_mail_time
                self._process_event_email(schd_ctx, ctx, id_keys)
            elif ctx.ctx_type == self.HANDLER_JOB_LOGS_RETRIEVE:
                self._process_job_logs_retrieval(schd_ctx, ctx, id_keys)
Example #38
0
    def process_message(self, itask, severity, message, poll_func,
                        poll_event_time=None, incoming_event_time=None,
                        submit_num=None):
        """Parse an incoming task message and update task state.

        Incoming, e.g. "succeeded at <TIME>", may be from task job or polling.

        It is possible for my current state to be inconsistent with an incoming
        message (whether normal or polled) e.g. due to a late poll result, or a
        network outage, or manual state reset. To handle this, if a message
        would take the task state backward, issue a poll to confirm instead of
        changing state - then always believe the next message. Note that the
        next message might not be the result of this confirmation poll, in the
        unlikely event that a job emits a succession of messages very quickly,
        but this is the best we can do without somehow uniquely associating
        each poll with its result message.

        """

        # Log incoming messages with '>' to distinguish non-message log entries
        if incoming_event_time:
            if submit_num is None or submit_num == itask.submit_num:
                message_flag = self.INCOMING_FLAG
            else:
                message_flag = self.IGNORED_INCOMING_FLAG
            event_time = incoming_event_time
        elif poll_event_time:
            message_flag = self.POLLED_FLAG
            event_time = poll_event_time
        else:
            message_flag = ''
            event_time = get_current_time_string()
        log_message = '(current:%s)%s %s at %s' % (
            itask.state.status, message_flag, message, event_time)
        LOG.log(self.LEVELS.get(severity, INFO), log_message, itask=itask)
        if message_flag == self.IGNORED_INCOMING_FLAG:
            LOG.warning(
                'submit-num=%d: ignore message from job submit-num=%d' % (
                    itask.submit_num, submit_num),
                itask=itask)
            return

        # always update the suite state summary for latest message
        itask.summary['latest_message'] = message
        if poll_event_time is not None:
            itask.summary['latest_message'] += ' %s' % self.POLLED_FLAG
        cylc.flags.iflag = True

        # Satisfy my output, if possible, and record the result.
        an_output_was_satisfied = itask.state.outputs.set_msg_trg_completion(
            message=message, is_completed=True)

        if message == TASK_OUTPUT_STARTED:
            if self._poll_to_confirm(itask, TASK_STATUS_RUNNING, poll_func):
                return
            self._process_message_started(itask, event_time)
        elif message == TASK_OUTPUT_SUCCEEDED:
            if self._poll_to_confirm(itask, TASK_STATUS_SUCCEEDED, poll_func):
                return
            self._process_message_succeeded(itask, event_time)
        elif message == TASK_OUTPUT_FAILED:
            if self._poll_to_confirm(itask, TASK_STATUS_FAILED, poll_func):
                return
            self._process_message_failed(itask, event_time, self.JOB_FAILED)
        elif message == self.EVENT_SUBMIT_FAILED:
            if self._poll_to_confirm(itask,
                                     TASK_STATUS_SUBMIT_FAILED, poll_func):
                return
            self._process_message_submit_failed(itask, event_time)
        elif message == TASK_OUTPUT_SUBMITTED:
            if self._poll_to_confirm(itask, TASK_STATUS_SUBMITTED, poll_func):
                return
            self._process_message_submitted(itask, event_time)
        elif message.startswith(FAIL_MESSAGE_PREFIX):
            # Task received signal.
            signal = message[len(FAIL_MESSAGE_PREFIX):]
            self._db_events_insert(itask, "signaled", signal)
            self.suite_db_mgr.put_update_task_jobs(
                itask, {"run_signal": signal})
            if self._poll_to_confirm(itask, TASK_STATUS_FAILED, poll_func):
                return
            self._process_message_failed(itask, event_time, self.JOB_FAILED)
        elif message.startswith(ABORT_MESSAGE_PREFIX):
            # Task aborted with message
            aborted_with = message[len(ABORT_MESSAGE_PREFIX):]
            self._db_events_insert(itask, "aborted", message)
            self.suite_db_mgr.put_update_task_jobs(
                itask, {"run_signal": aborted_with})
            if self._poll_to_confirm(itask, TASK_STATUS_FAILED, poll_func):
                return
            self._process_message_failed(itask, event_time, aborted_with)
        elif message.startswith(VACATION_MESSAGE_PREFIX):
            # Task job pre-empted into a vacation state
            self._db_events_insert(itask, "vacated", message)
            itask.set_summary_time('started')  # unset
            if TASK_STATUS_SUBMIT_RETRYING in itask.try_timers:
                itask.try_timers[TASK_STATUS_SUBMIT_RETRYING].num = 0
            itask.job_vacated = True
            try:
                itask.timeout_timers[TASK_STATUS_SUBMITTED] = (
                    itask.summary['submitted_time'] +
                    float(self._get_events_conf(itask, 'submission timeout')))
            except (TypeError, ValueError):
                itask.timeout_timers[TASK_STATUS_SUBMITTED] = None
            # Believe this and change state without polling (could poll?).
            self.pflag = True
            itask.state.reset_state(TASK_STATUS_SUBMITTED)
        elif an_output_was_satisfied:
            # Message of an as-yet unreported custom task output.
            # No state change.
            self.pflag = True
            self.suite_db_mgr.put_update_task_outputs(itask)
        else:
            # Unhandled messages. These include:
            #  * general non-output/progress messages
            #  * poll messages that repeat previous results
            # Note that all messages are logged already at the top.
            # No state change.
            LOG.debug(
                '(current:%s) unhandled: %s' % (itask.state.status, message),
                itask=itask)
            if severity in [CRITICAL, ERROR, WARNING, INFO, DEBUG]:
                severity = getLevelName(severity)
            self._db_events_insert(
                itask, ("message %s" % str(severity).lower()), message)
        if severity in ['WARNING', 'CRITICAL', 'CUSTOM']:
            self.setup_event_handlers(itask, severity.lower(), message)
Example #39
0
    def process_events(self, schd_ctx):
        """Process task events that were created by "setup_event_handlers".

        schd_ctx is an instance of "Schduler" in "cylc.scheduler".
        """
        ctx_groups = {}
        now = time()
        for id_key, timer in self.event_timers.copy().items():
            key1, point, name, submit_num = id_key
            if timer.is_waiting:
                continue
            # Set timer if timeout is None.
            if not timer.is_timeout_set():
                if timer.next() is None:
                    LOG.warning("%s/%s/%02d %s failed" % (
                        point, name, submit_num, key1))
                    del self.event_timers[id_key]
                    continue
                # Report retries and delayed 1st try
                tmpl = None
                if timer.num > 1:
                    tmpl = "%s/%s/%02d %s failed, retrying in %s"
                elif timer.delay:
                    tmpl = "%s/%s/%02d %s will run after %s"
                if tmpl:
                    LOG.debug(tmpl % (
                        point, name, submit_num, key1,
                        timer.delay_timeout_as_str()))
            # Ready to run?
            if not timer.is_delay_done() or (
                # Avoid flooding user's mail box with mail notification.
                # Group together as many notifications as possible within a
                # given interval.
                timer.ctx.ctx_type == self.HANDLER_MAIL and
                not schd_ctx.stop_mode and
                self.next_mail_time is not None and
                self.next_mail_time > now
            ):
                continue

            timer.set_waiting()
            if timer.ctx.ctx_type == self.HANDLER_CUSTOM:
                # Run custom event handlers on their own
                self.proc_pool.put_command(
                    SuiteProcContext(
                        (key1, submit_num),
                        timer.ctx.cmd, env=os.environ, shell=True,
                    ),
                    self._custom_handler_callback, [schd_ctx, id_key])
            else:
                # Group together built-in event handlers, where possible
                if timer.ctx not in ctx_groups:
                    ctx_groups[timer.ctx] = []
                ctx_groups[timer.ctx].append(id_key)

        next_mail_time = now + self.mail_interval
        for ctx, id_keys in ctx_groups.items():
            if ctx.ctx_type == self.HANDLER_MAIL:
                # Set next_mail_time if any mail sent
                self.next_mail_time = next_mail_time
                self._process_event_email(schd_ctx, ctx, id_keys)
            elif ctx.ctx_type == self.HANDLER_JOB_LOGS_RETRIEVE:
                self._process_job_logs_retrieval(schd_ctx, ctx, id_keys)
Example #40
0
    def process_message(
            self, itask, severity, message, event_time=None, flag='',
            submit_num=None):
        """Parse an incoming task message and update task state.

        Incoming, e.g. "succeeded at <TIME>", may be from task job or polling.

        It is possible for my current state to be inconsistent with an incoming
        message (whether normal or polled) e.g. due to a late poll result, or a
        network outage, or manual state reset. To handle this, if a message
        would take the task state backward, issue a poll to confirm instead of
        changing state - then always believe the next message. Note that the
        next message might not be the result of this confirmation poll, in the
        unlikely event that a job emits a succession of messages very quickly,
        but this is the best we can do without somehow uniquely associating
        each poll with its result message.

        Arguments:
            itask (cylc.task_proxy.TaskProxy):
                The task proxy object relevant for the message.
            severity (str or int):
                Message severity, should be a recognised logging level.
            message (str):
                Message content.
            event_time (str):
                Event time stamp. Expect ISO8601 date time string.
                If not specified, use current time.
            flag (str):
                If specified, can be INCOMING_FLAG to indicate an incoming
                message, POLLED_FLAG to indicate a message resulted from a
                poll. Otherwise, the message is assumed to be generated by the
                logic in the suite server program.
            submit_num (int):
                The submit number of the task relevant for the message.
                If not specified, use latest submit number.

        Return:
            None: in normal circumstances.
            True: if polling is required to confirm a reversal of status.

        """
        # Log incoming messages
        if event_time is None:
            event_time = get_current_time_string()
        if submit_num is None:
            submit_num = itask.submit_num
        if flag == self.INCOMING_FLAG and submit_num != itask.submit_num:
            flag = self.IGNORED_INCOMING_FLAG
        log_message = '(current:%s)%s %s at %s' % (
            itask.state.status, flag, message, event_time)
        LOG.log(self.LEVELS.get(severity, INFO), log_message, itask=itask)
        if flag == self.IGNORED_INCOMING_FLAG:
            LOG.warning(
                'submit-num=%d: ignore message from job submit-num=%d' % (
                    itask.submit_num, submit_num),
                itask=itask)
            return

        # always update the suite state summary for latest message
        itask.summary['latest_message'] = message
        if flag == self.POLLED_FLAG:
            itask.summary['latest_message'] += ' %s' % self.POLLED_FLAG
        cylc.flags.iflag = True

        # Satisfy my output, if possible, and record the result.
        an_output_was_satisfied = itask.state.outputs.set_msg_trg_completion(
            message=message, is_completed=True)

        if message == TASK_OUTPUT_STARTED:
            if (flag == self.INCOMING_FLAG
                    and itask.state.is_gt(TASK_STATUS_RUNNING)):
                return True
            self._process_message_started(itask, event_time)
        elif message == TASK_OUTPUT_SUCCEEDED:
            self._process_message_succeeded(itask, event_time)
        elif message == TASK_OUTPUT_FAILED:
            if (flag == self.INCOMING_FLAG
                    and itask.state.is_gt(TASK_STATUS_FAILED)):
                return True
            self._process_message_failed(itask, event_time, self.JOB_FAILED)
        elif message == self.EVENT_SUBMIT_FAILED:
            if (flag == self.INCOMING_FLAG
                    and itask.state.is_gt(TASK_STATUS_SUBMIT_FAILED)):
                return True
            self._process_message_submit_failed(itask, event_time)
        elif message == TASK_OUTPUT_SUBMITTED:
            if (flag == self.INCOMING_FLAG
                    and itask.state.is_gt(TASK_STATUS_SUBMITTED)):
                return True
            self._process_message_submitted(itask, event_time)
        elif message.startswith(FAIL_MESSAGE_PREFIX):
            # Task received signal.
            if (flag == self.INCOMING_FLAG
                    and itask.state.is_gt(TASK_STATUS_FAILED)):
                return True
            signal = message[len(FAIL_MESSAGE_PREFIX):]
            self._db_events_insert(itask, "signaled", signal)
            self.suite_db_mgr.put_update_task_jobs(
                itask, {"run_signal": signal})
            self._process_message_failed(itask, event_time, self.JOB_FAILED)
        elif message.startswith(ABORT_MESSAGE_PREFIX):
            # Task aborted with message
            if (flag == self.INCOMING_FLAG
                    and itask.state.is_gt(TASK_STATUS_FAILED)):
                return True
            aborted_with = message[len(ABORT_MESSAGE_PREFIX):]
            self._db_events_insert(itask, "aborted", message)
            self.suite_db_mgr.put_update_task_jobs(
                itask, {"run_signal": aborted_with})
            self._process_message_failed(itask, event_time, aborted_with)
        elif message.startswith(VACATION_MESSAGE_PREFIX):
            # Task job pre-empted into a vacation state
            self._db_events_insert(itask, "vacated", message)
            itask.set_summary_time('started')  # unset
            if TASK_STATUS_SUBMIT_RETRYING in itask.try_timers:
                itask.try_timers[TASK_STATUS_SUBMIT_RETRYING].num = 0
            itask.job_vacated = True
            # Believe this and change state without polling (could poll?).
            self.pflag = True
            itask.state.reset_state(TASK_STATUS_SUBMITTED)
            self._reset_job_timers(itask)
        elif an_output_was_satisfied:
            # Message of an as-yet unreported custom task output.
            # No state change.
            self.pflag = True
            self.suite_db_mgr.put_update_task_outputs(itask)
        else:
            # Unhandled messages. These include:
            #  * general non-output/progress messages
            #  * poll messages that repeat previous results
            # Note that all messages are logged already at the top.
            # No state change.
            LOG.debug(
                '(current:%s) unhandled: %s' % (itask.state.status, message),
                itask=itask)
            if severity in [CRITICAL, ERROR, WARNING, INFO, DEBUG]:
                severity = getLevelName(severity)
            self._db_events_insert(
                itask, ("message %s" % str(severity).lower()), message)
        if severity in ['WARNING', 'CRITICAL', 'CUSTOM']:
            self.setup_event_handlers(itask, severity.lower(), message)
Example #41
0
 def _run_event_mail_callback(proc_ctx):
     """Callback the mail command for notification of a suite event."""
     if proc_ctx.ret_code:
         LOG.warning(str(proc_ctx))
     else:
         LOG.info(str(proc_ctx))
Example #42
0
 def _run_event_mail_callback(proc_ctx):
     """Callback the mail command for notification of a suite event."""
     if proc_ctx.ret_code:
         LOG.warning(str(proc_ctx))
     else:
         LOG.info(str(proc_ctx))
Example #43
0
    def remote_tidy(self):
        """Remove suite contact files from initialised remotes.

        Call "cylc remote-tidy".
        This method is called on suite shutdown, so we want nothing to hang.
        Timeout any incomplete commands after 10 seconds.

        Also remove UUID file on suite host ".service/uuid".
        """
        # Remove UUID file
        uuid_fname = os.path.join(
            self.suite_srv_files_mgr.get_suite_srv_dir(self.suite), 'uuid')
        try:
            os.unlink(uuid_fname)
        except OSError:
            pass
        # Issue all SSH commands in parallel
        procs = {}
        for (host, owner), init_with_contact in self.remote_init_map.items():
            if init_with_contact != REMOTE_INIT_DONE:
                continue
            cmd = ['timeout', '10', 'cylc', 'remote-tidy']
            if is_remote_host(host):
                cmd.append('--host=%s' % host)
            if is_remote_user(owner):
                cmd.append('--user=%s' % owner)
            if cylc.flags.debug:
                cmd.append('--debug')
            cmd.append(
                os.path.join(glbl_cfg().get_derived_host_item(
                    self.suite, 'suite run directory', host, owner)))
            procs[(host, owner)] = (cmd,
                                    Popen(cmd,
                                          stdout=PIPE,
                                          stderr=PIPE,
                                          stdin=open(os.devnull)))
        # Wait for commands to complete for a max of 10 seconds
        timeout = time() + 10.0
        while procs and time() < timeout:
            for (host, owner), (cmd, proc) in procs.copy().items():
                if proc.poll() is None:
                    continue
                del procs[(host, owner)]
                out, err = proc.communicate()
                if proc.wait():
                    LOG.warning(
                        TaskRemoteMgmtError(
                            TaskRemoteMgmtError.MSG_TIDY, (host, owner),
                            ' '.join(quote(item) for item in cmd),
                            proc.ret_code, out, err))
        # Terminate any remaining commands
        for (host, owner), (cmd, proc) in procs.items():
            try:
                proc.terminate()
            except OSError:
                pass
            out, err = proc.communicate()
            if proc.wait():
                LOG.warning(
                    TaskRemoteMgmtError(TaskRemoteMgmtError.MSG_TIDY,
                                        (host, owner),
                                        ' '.join(quote(item) for item in cmd),
                                        proc.ret_code, out, err))