def _process_message_started(self, itask, event_time):
    """Helper for process_message, handle a started message."""
    if itask.job_vacated:
        itask.job_vacated = False
        LOG.warning("Vacated job restarted", itask=itask)
    self.pflag = True
    itask.state.reset_state(TASK_STATUS_RUNNING)
    itask.set_event_time('started', event_time)
    self.suite_db_mgr.put_update_task_jobs(itask, {
        "time_run": itask.summary['started_time_string']})
    if itask.summary['execution_time_limit']:
        execution_timeout = itask.summary['execution_time_limit']
    else:
        execution_timeout = self._get_events_conf(
            itask, 'execution timeout')
    try:
        itask.timeout_timers[TASK_STATUS_RUNNING] = (
            itask.summary['started_time'] + float(execution_timeout))
    except (TypeError, ValueError):
        itask.timeout_timers[TASK_STATUS_RUNNING] = None

    # submission was successful so reset submission try number
    if TASK_STATUS_SUBMIT_RETRYING in itask.try_timers:
        itask.try_timers[TASK_STATUS_SUBMIT_RETRYING].num = 0
    self.setup_event_handlers(itask, 'started', 'job started')
    self.set_poll_time(itask)

def check_job_time(self, itask, now):
    """Check/handle job timeout and poll timer"""
    can_poll = self.check_poll_time(itask, now)
    if itask.timeout is None or now <= itask.timeout:
        return can_poll
    # Timeout reached for task, emit event and reset itask.timeout
    if itask.state.status == TASK_STATUS_RUNNING:
        time_ref = itask.summary['started_time']
        event = 'execution timeout'
    elif itask.state.status == TASK_STATUS_SUBMITTED:
        time_ref = itask.summary['submitted_time']
        event = 'submission timeout'
    msg = event
    try:
        msg += ' after %s' % intvl_as_str(itask.timeout - time_ref)
    except (TypeError, ValueError):
        # Badness in time_ref?
        pass
    itask.timeout = None  # emit event only once
    if msg and event:
        LOG.warning(msg, itask=itask)
        self.setup_event_handlers(itask, event, msg)
        return True
    else:
        return can_poll

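# The timeout message above appends an interval rendered by intvl_as_str.
# A minimal stand-alone sketch (not cylc code) of rendering a number of
# seconds as an ISO 8601 duration, assuming that is roughly what the real
# helper produces; the cylc implementation may differ in detail:
def seconds_as_interval(seconds):
    """Render a number of seconds as e.g. 'PT1H5M'."""
    seconds = int(seconds)
    hours, seconds = divmod(seconds, 3600)
    minutes, seconds = divmod(seconds, 60)
    parts = []
    if hours:
        parts.append('%dH' % hours)
    if minutes:
        parts.append('%dM' % minutes)
    if seconds or not parts:
        parts.append('%dS' % seconds)
    return 'PT' + ''.join(parts)

print(seconds_as_interval(3900))  # -> 'PT1H5M'
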
def _report_connection_if_denied(self):
    """Log a connection attempt if it was denied."""
    prog_name, user, host, uuid = _get_client_info()[1:]
    connection_denied = self._get_client_connection_denied()
    if connection_denied:
        LOG.warning(self.__class__.LOG_CONNECT_DENIED_TMPL % (
            user, host, prog_name, uuid))

def _execute_stmt(self, stmt, stmt_args_list):
    """Helper for "self.execute_queued_items".

    Execute a statement. If this is the public database, return True on
    success and False on failure. If this is the private database, return
    True on success, and raise on failure.
    """
    try:
        self.connect()
        self.conn.executemany(stmt, stmt_args_list)
    except sqlite3.Error:
        if not self.is_public:
            raise
        if cylc.flags.debug:
            traceback.print_exc()
        err_log = ("cannot execute database statement:\n"
                   "file=%(file)s:\nstmt=%(stmt)s") % {
            "file": self.db_file_name, "stmt": stmt}
        for i, stmt_args in enumerate(stmt_args_list):
            err_log += ("\nstmt_args[%(i)d]=%(stmt_args)s" % {
                "i": i, "stmt_args": stmt_args})
        LOG.warning(err_log)
        raise

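# A self-contained demonstration (illustrative table and values, not cylc
# code) of the "executemany" batching used by _execute_stmt above: one
# statement template executed against a whole list of argument tuples.
import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE task_jobs (name TEXT, submit_num INTEGER)")
stmt = "INSERT INTO task_jobs VALUES (?, ?)"
stmt_args_list = [("foo", 1), ("bar", 1), ("foo", 2)]
conn.executemany(stmt, stmt_args_list)  # one call inserts all three rows
conn.commit()
print(conn.execute("SELECT COUNT(*) FROM task_jobs").fetchone()[0])  # 3
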
def _job_cmd_out_callback(self, suite, itask, cmd_ctx, line):
    """Callback on job command STDOUT/STDERR."""
    if cmd_ctx.cmd_kwargs.get("host") and cmd_ctx.cmd_kwargs.get("user"):
        owner_at_host = "(%(user)s@%(host)s) " % cmd_ctx.cmd_kwargs
    elif cmd_ctx.cmd_kwargs.get("host"):
        owner_at_host = "(%(host)s) " % cmd_ctx.cmd_kwargs
    elif cmd_ctx.cmd_kwargs.get("user"):
        owner_at_host = "(%(user)s@localhost) " % cmd_ctx.cmd_kwargs
    else:
        owner_at_host = ""
    try:
        timestamp, _, content = line.split("|")
    except ValueError:
        pass
    else:
        line = "%s %s" % (timestamp, content)
    job_activity_log = self.task_events_mgr.get_task_job_activity_log(
        suite, itask.point, itask.tdef.name)
    try:
        with open(job_activity_log, "ab") as handle:
            if not line.endswith("\n"):
                line += "\n"
            handle.write(owner_at_host + line)
    except IOError as exc:
        LOG.warning("%s: write failed\n%s" % (job_activity_log, exc))
    LOG.warning(owner_at_host + line, itask=itask)

def _process_message_started(self, itask, event_time):
    """Helper for process_message, handle a started message."""
    if itask.job_vacated:
        itask.job_vacated = False
        LOG.warning("Vacated job restarted", itask=itask)
    self.pflag = True
    itask.state.reset_state(TASK_STATUS_RUNNING)
    itask.set_summary_time('started', event_time)
    self.suite_db_mgr.put_update_task_jobs(itask, {
        "time_run": itask.summary['started_time_string']})
    if itask.summary['execution_time_limit']:
        execution_timeout = itask.summary['execution_time_limit']
    else:
        execution_timeout = self._get_events_conf(
            itask, 'execution timeout')
    try:
        itask.timeout_timers[TASK_STATUS_RUNNING] = (
            itask.summary['started_time'] + float(execution_timeout))
    except (TypeError, ValueError):
        itask.timeout_timers[TASK_STATUS_RUNNING] = None

    # submission was successful so reset submission try number
    if TASK_STATUS_SUBMIT_RETRYING in itask.try_timers:
        itask.try_timers[TASK_STATUS_SUBMIT_RETRYING].num = 0
    self.setup_event_handlers(itask, 'started', 'job started')
    self.set_poll_time(itask)

def _job_cmd_out_callback(self, suite, itask, cmd_ctx, line):
    """Callback on job command STDOUT/STDERR."""
    if cmd_ctx.cmd_kwargs.get("host") and cmd_ctx.cmd_kwargs.get("user"):
        user_at_host = "(%(user)s@%(host)s) " % cmd_ctx.cmd_kwargs
    elif cmd_ctx.cmd_kwargs.get("host"):
        user_at_host = "(%(host)s) " % cmd_ctx.cmd_kwargs
    elif cmd_ctx.cmd_kwargs.get("user"):
        user_at_host = "(%(user)s@localhost) " % cmd_ctx.cmd_kwargs
    else:
        user_at_host = ""
    try:
        timestamp, _, content = line.split("|")
    except ValueError:
        pass
    else:
        line = "%s %s" % (timestamp, content)
    job_activity_log = self.task_events_mgr.get_task_job_activity_log(
        suite, itask.point, itask.tdef.name)
    try:
        with open(job_activity_log, "ab") as handle:
            if not line.endswith("\n"):
                line += "\n"
            handle.write(user_at_host + line)
    except IOError as exc:
        LOG.warning("%s: write failed\n%s" % (job_activity_log, exc))
    LOG.warning(user_at_host + line, itask=itask)

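# The two callbacks above assume job command output lines of the form
# "timestamp|stage|content". A stand-alone sketch (not cylc code) of that
# parse, with the same fall-through for lines that do not match:
def strip_stage(line):
    try:
        timestamp, _, content = line.split("|")
    except ValueError:
        return line  # not a three-field line; pass through unchanged
    return "%s %s" % (timestamp, content)

print(strip_stage("2008-08-08T08:08:08Z|OUT|hello"))  # timestamp + content
print(strip_stage("plain line"))                      # unchanged
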
def remote_tidy(self):
    """Remove suite contact files from initialised remotes.

    Call "cylc remote-tidy".
    This method is called on suite shutdown, so we want nothing to hang.
    Timeout any incomplete commands after 10 seconds.

    Also remove UUID file on suite host ".service/uuid".
    """
    # Remove UUID file
    uuid_fname = os.path.join(
        self.suite_srv_files_mgr.get_suite_srv_dir(self.suite),
        FILE_BASE_UUID)
    try:
        os.unlink(uuid_fname)
    except OSError:
        pass
    # Issue all SSH commands in parallel
    procs = {}
    for (host, owner), init_with_contact in self.remote_init_map.items():
        if init_with_contact != REMOTE_INIT_DONE:
            continue
        cmd = ['timeout', '10', 'cylc', 'remote-tidy']
        if is_remote_host(host):
            cmd.append('--host=%s' % host)
        if is_remote_user(owner):
            cmd.append('--user=%s' % owner)
        if cylc.flags.debug:
            cmd.append('--debug')
        cmd.append(os.path.join(glbl_cfg().get_derived_host_item(
            self.suite, 'suite run directory', host, owner)))
        procs[(host, owner)] = (
            cmd,
            Popen(cmd, stdout=PIPE, stderr=PIPE, stdin=open(os.devnull)))
    # Wait for commands to complete for a max of 10 seconds
    timeout = time() + 10.0
    while procs and time() < timeout:
        for (host, owner), (cmd, proc) in procs.copy().items():
            if proc.poll() is None:
                continue
            del procs[(host, owner)]
            out, err = proc.communicate()
            if proc.wait():
                LOG.warning(TaskRemoteMgmtError(
                    TaskRemoteMgmtError.MSG_TIDY, (host, owner),
                    ' '.join(quote(item) for item in cmd),
                    proc.returncode, out, err))
    # Terminate any remaining commands
    for (host, owner), (cmd, proc) in procs.items():
        try:
            proc.terminate()
        except OSError:
            pass
        out, err = proc.communicate()
        if proc.wait():
            LOG.warning(TaskRemoteMgmtError(
                TaskRemoteMgmtError.MSG_TIDY, (host, owner),
                ' '.join(quote(item) for item in cmd),
                proc.returncode, out, err))

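# The wait-with-deadline pattern used by remote_tidy, in isolation (a
# sketch, not the cylc code; POSIX commands assumed): poll a set of child
# processes until all finish or the deadline passes, then terminate any
# stragglers.
from subprocess import PIPE, Popen
from time import sleep, time

procs = {
    name: Popen(cmd, stdout=PIPE, stderr=PIPE)
    for name, cmd in [("quick", ["true"]), ("slow", ["sleep", "60"])]}
deadline = time() + 10.0
while procs and time() < deadline:
    for name, proc in list(procs.items()):
        if proc.poll() is None:
            continue  # still running, check again next pass
        del procs[name]
    sleep(0.1)
for name, proc in procs.items():  # anything still alive gets terminated
    proc.terminate()
    proc.wait()
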
def recover_pub_from_pri(self):
    """Recover public database from private database."""
    if self.pub_dao.n_tries >= self.pub_dao.MAX_TRIES:
        self.copy_pri_to_pub()
        LOG.warning(
            "%(pub_db_name)s: recovered from %(pri_db_name)s" % {
                "pub_db_name": self.pub_dao.db_file_name,
                "pri_db_name": self.pri_dao.db_file_name})
        self.pub_dao.n_tries = 0

def put_command(self, ctx, callback, callback_args=None):
    """Queue a new shell command to execute."""
    try:
        result = self.pool.apply_async(_run_command, [ctx])
    except AssertionError as exc:
        LOG.warning("%s\n %s\n %s" % (
            str(exc), "Rejecting command (pool closed)", ctx.cmd))
    else:
        self.results[id(result)] = (result, callback, callback_args)

def execute_queued_items(self):
    """Execute queued items for each table."""
    try:
        for table in self.tables.values():
            # DELETE statements may have varying number of WHERE args so
            # we can only executemany for each identical template
            # statement.
            for stmt, stmt_args_list in table.delete_queues.items():
                self._execute_stmt(stmt, stmt_args_list)
            # INSERT statements are uniform for each table, so all INSERT
            # statements can be executed using a single "executemany"
            # call.
            if table.insert_queue:
                self._execute_stmt(
                    table.get_insert_stmt(), table.insert_queue)
            # UPDATE statements can have varying number of SET and WHERE
            # args so we can only executemany for each identical template
            # statement.
            for stmt, stmt_args_list in table.update_queues.items():
                self._execute_stmt(stmt, stmt_args_list)
        # Connection should only be opened if we have executed something.
        if self.conn is None:
            return
        self.conn.commit()
    except sqlite3.Error:
        if not self.is_public:
            raise
        self.n_tries += 1
        LOG.warning(
            "%(file)s: write attempt (%(attempt)d) did not complete\n" % {
                "file": self.db_file_name, "attempt": self.n_tries})
        if self.conn is not None:
            try:
                self.conn.rollback()
            except sqlite3.Error:
                pass
        return
    else:
        # Clear the queues
        for table in self.tables.values():
            table.delete_queues.clear()
            del table.insert_queue[:]  # list.clear avail from Python 3.3
            table.update_queues.clear()
        # Report public database retry recovery if necessary
        if self.n_tries:
            LOG.warning(
                "%(file)s: recovered after (%(attempt)d) attempt(s)\n" % {
                    "file": self.db_file_name, "attempt": self.n_tries})
            self.n_tries = 0
    finally:
        # Note: This is not strictly necessary. However, if the suite run
        # directory is removed, a forced reconnection to the private
        # database will ensure that the suite dies.
        self.close()

def poll_task_jobs(self, suite, itasks, warn_skips=False):
    """Poll jobs of specified tasks."""
    active_itasks = []
    for itask in itasks:
        if itask.state.status in TASK_STATUSES_ACTIVE:
            active_itasks.append(itask)
        elif warn_skips:  # and not active
            LOG.warning(
                '%s: skip poll, task not pollable' % itask.identity)
    self._run_job_cmd(
        self.JOBS_POLL, suite, active_itasks,
        self._poll_task_jobs_callback)

def kill_task_jobs(self, suite, itasks):
    """Kill jobs of active tasks, and hold the tasks.

    Only tasks in an active state have jobs that can be killed; any
    other task is skipped with a warning.
    """
    active_itasks = []
    for itask in itasks:
        if itask.state.status in TASK_STATUSES_ACTIVE:
            itask.state.set_held()
            active_itasks.append(itask)
        else:
            LOG.warning('skipping %s: task not killable' % itask.identity)
    self._run_job_cmd(self.JOBS_KILL, suite, active_itasks,
                      self._kill_task_jobs_callback)

def _manip_task_jobs_callback(ctx, suite, itasks, summary_callback,
                              more_callbacks=None):
    """Callback when submit/poll/kill tasks command exits."""
    if ctx.ret_code:
        LOG.error(ctx)
    else:
        LOG.debug(ctx)
    tasks = {}
    # Note for "kill": It is possible for a job to trigger its trap and
    # report back to the suite before this logic is called. If so, the
    # task will no longer be TASK_STATUS_SUBMITTED or TASK_STATUS_RUNNING,
    # and its output line will be ignored here.
    for itask in itasks:
        if itask.point is not None and itask.submit_num:
            submit_num = "%02d" % (itask.submit_num)
            tasks[(str(itask.point), itask.tdef.name, submit_num)] = itask
    handlers = [(BatchSysManager.OUT_PREFIX_SUMMARY, summary_callback)]
    if more_callbacks:
        for prefix, callback in more_callbacks.items():
            handlers.append((prefix, callback))
    out = ctx.out
    if not out:
        out = ""
        # Something is very wrong here
        # Fallback to use "job_log_dirs" list to report the problem
        job_log_dirs = ctx.cmd_kwargs.get("job_log_dirs", [])
        for job_log_dir in job_log_dirs:
            point, name, submit_num = job_log_dir.split(os.sep, 2)
            itask = tasks[(point, name, submit_num)]
            out += (BatchSysManager.OUT_PREFIX_SUMMARY +
                    "|".join([ctx.timestamp, job_log_dir, "1"]) + "\n")
    for line in out.splitlines(True):
        for prefix, callback in handlers:
            if line.startswith(prefix):
                line = line[len(prefix):].strip()
                try:
                    path = line.split("|", 2)[1]  # timestamp, path, status
                    point, name, submit_num = path.split(os.sep, 2)
                    itask = tasks[(point, name, submit_num)]
                    callback(suite, itask, ctx, line)
                except (KeyError, ValueError):
                    if cylc.flags.debug:
                        LOG.warning('Unhandled %s output: %s' % (
                            ctx.cmd_key, line))
                        LOG.warning(traceback.format_exc())

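# A stand-alone sketch of the summary-line decomposition done in the
# callback above. The prefix value and job log path are hypothetical;
# the field layout is "timestamp|path|status" with the path being
# "point/name/submit_num" (split on os.sep as in the original, i.e. "/"
# on POSIX).
import os

OUT_PREFIX_SUMMARY = "[JOB SUMMARY]"  # illustrative prefix value only
line = OUT_PREFIX_SUMMARY + "2008-08-08T08:08:08Z|20080808T08/foo/01|0"
if line.startswith(OUT_PREFIX_SUMMARY):
    line = line[len(OUT_PREFIX_SUMMARY):].strip()
    path = line.split("|", 2)[1]  # timestamp, path, status
    point, name, submit_num = path.split(os.sep, 2)
    print("%s %s %s" % (point, name, submit_num))  # 20080808T08 foo 01
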
def unlink_hosts_contacts(self, reg):
    """Remove suite contact files from initialised hosts.

    This is called on shutdown, so we don't want anything to hang.
    Terminate any incomplete SSH commands after 10 seconds.
    """
    # Issue all SSH commands in parallel
    procs = {}
    for (host, owner), should_unlink in self.init_host_map.items():
        if not should_unlink:
            continue
        user_at_host = host
        if owner:
            user_at_host = owner + '@' + host
        ssh_tmpl = GLOBAL_CFG.get_host_item('ssh command', host, owner)
        r_suite_contact_file = os.path.join(
            GLOBAL_CFG.get_derived_host_item(
                reg, 'suite run directory', host, owner),
            self.suite_srv_files_mgr.DIR_BASE_SRV,
            self.suite_srv_files_mgr.FILE_BASE_CONTACT)
        cmd = shlex.split(ssh_tmpl) + [
            '-n', user_at_host, 'rm', '-f', r_suite_contact_file]
        procs[user_at_host] = (cmd, Popen(cmd, stdout=PIPE, stderr=PIPE))
    # Wait for commands to complete for a max of 10 seconds
    timeout = time() + 10.0
    while procs and time() < timeout:
        for user_at_host, (cmd, proc) in procs.copy().items():
            if proc.poll() is None:
                continue
            del procs[user_at_host]
            out, err = proc.communicate()
            if proc.wait():
                LOG.warning(RemoteJobHostInitError(
                    RemoteJobHostInitError.MSG_TIDY, user_at_host,
                    ' '.join(quote(item) for item in cmd),
                    proc.returncode, out, err))
    # Terminate any remaining commands
    for user_at_host, (cmd, proc) in procs.items():
        try:
            proc.terminate()
        except OSError:
            pass
        out, err = proc.communicate()
        if proc.wait():
            LOG.warning(RemoteJobHostInitError(
                RemoteJobHostInitError.MSG_TIDY, user_at_host,
                ' '.join(quote(item) for item in cmd),
                proc.returncode, out, err))

def _process_message_started(self, itask, event_time):
    """Helper for process_message, handle a started message."""
    if itask.job_vacated:
        itask.job_vacated = False
        LOG.warning("Vacated job restarted", itask=itask)
    self.pflag = True
    if itask.state.reset_state(TASK_STATUS_RUNNING):
        self.setup_event_handlers(itask, 'started', 'job started')
    itask.set_summary_time('started', event_time)
    self._reset_job_timers(itask)
    self.suite_db_mgr.put_update_task_jobs(itask, {
        "time_run": itask.summary['started_time_string']})

    # submission was successful so reset submission try number
    if TASK_STATUS_SUBMIT_RETRYING in itask.try_timers:
        itask.try_timers[TASK_STATUS_SUBMIT_RETRYING].num = 0

def kill_task_jobs(self, suite, itasks):
    """Kill jobs of active tasks, and hold the tasks.

    Only tasks in an active state have jobs that can be killed; any
    other task is skipped with a warning.
    """
    to_kill_tasks = []
    for itask in itasks:
        if itask.state.status in TASK_STATUSES_ACTIVE:
            itask.state.set_held()
            to_kill_tasks.append(itask)
        else:
            LOG.warning('skipping %s: task not killable' % itask.identity)
    self._run_job_cmd(
        self.JOBS_KILL, suite, to_kill_tasks,
        self._kill_task_jobs_callback)

def _check_access_priv(self, required_privilege_level):
    """Raise an exception if client privilege is insufficient.

    (See the documentation above for the boolean version of this
    function).
    """
    auth_user, prog_name, user, host, uuid = _get_client_info()
    priv_level = self._get_priv_level(auth_user)
    if (PRIVILEGE_LEVELS.index(priv_level) <
            PRIVILEGE_LEVELS.index(required_privilege_level)):
        err = self.CONNECT_DENIED_PRIV_TMPL % (
            priv_level, required_privilege_level,
            user, host, prog_name, uuid)
        LOG.warning(err)
        # Raise an exception to be sent back to the client.
        raise cherrypy.HTTPError(403, err)
    return True

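# The privilege test above is an index comparison over an ordered list of
# privilege levels. The same idea in isolation (level names are loosely
# based on cylc's but are illustrative, not guaranteed to match exactly):
PRIVILEGE_LEVELS = [
    "identity", "description", "state-totals", "full-read",
    "shutdown", "full-control"]

def has_privilege(priv_level, required_privilege_level):
    return (PRIVILEGE_LEVELS.index(priv_level) >=
            PRIVILEGE_LEVELS.index(required_privilege_level))

print(has_privilege("full-read", "state-totals"))  # True
print(has_privilege("identity", "full-control"))   # False
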
def report_connection_if_denied(self):
    """Log a connection attempt if it was denied."""
    try:
        (auth_user, prog_name, user, host, uuid,
         priv_level) = get_client_info()
    except Exception:
        LOG.warning(self.__class__.LOG_CONNECT_DENIED_TMPL % (
            "unknown", "unknown", "unknown", "unknown"))
        return
    connection_denied = get_client_connection_denied()
    if connection_denied:
        LOG.warning(self.__class__.LOG_CONNECT_DENIED_TMPL % (
            user, host, prog_name, uuid))

def _check_access_priv(self, required_privilege_level):
    """Raise an exception if client privilege is insufficient for
    server_obj.

    (See the documentation above for the boolean version of this
    function).
    """
    auth_user, prog_name, user, host, uuid = _get_client_info()
    priv_level = self._get_priv_level(auth_user)
    if (PRIVILEGE_LEVELS.index(priv_level) <
            PRIVILEGE_LEVELS.index(required_privilege_level)):
        err = self.CONNECT_DENIED_PRIV_TMPL % (
            priv_level, required_privilege_level,
            user, host, prog_name, uuid)
        LOG.warning(err)
        # Raise an exception to be sent back to the client.
        raise cherrypy.HTTPError(403, err)
    return True

def kill_task_jobs(self, suite, itasks, warn_skips=False):
    """Kill jobs of active tasks, and hold the tasks.

    Only tasks in an active state have jobs that can be killed; any
    other task is skipped (with a warning if warn_skips is set).
    """
    active_itasks = []
    for itask in itasks:
        if itask.state.status in TASK_STATUSES_ACTIVE:
            itask.state.set_held()
            active_itasks.append(itask)
        elif warn_skips:  # and not active
            LOG.warning(
                '%s: skip kill, task not killable' % itask.identity)
    self._run_job_cmd(
        self.JOBS_KILL, suite, active_itasks,
        self._kill_task_jobs_callback)

def report_id_requests(self):
    """Report the frequency of identification (scan) requests."""
    current_time = time.time()
    interval = current_time - self._id_start_time
    if interval > self.CLIENT_ID_REPORT_SECONDS:
        rate = float(self._num_id_requests) / interval
        if rate > self.CLIENT_ID_MIN_REPORT_RATE:
            LOG.warning(self.__class__.LOG_IDENTIFY_TMPL % (
                self._num_id_requests, interval))
        elif cylc.flags.debug:
            LOG.info(self.__class__.LOG_IDENTIFY_TMPL % (
                self._num_id_requests, interval))
        self._id_start_time = current_time
        self._num_id_requests = 0

def log_task_job_activity(self, ctx, suite, point, name, submit_num=NN):
    """Log an activity for a task job."""
    ctx_str = str(ctx)
    if not ctx_str:
        return
    if isinstance(ctx.cmd_key, tuple):  # An event handler
        submit_num = ctx.cmd_key[-1]
    job_activity_log = self.get_task_job_activity_log(
        suite, point, name, submit_num)
    try:
        with open(job_activity_log, "ab") as handle:
            handle.write(ctx_str + '\n')
    except IOError as exc:
        LOG.warning("%s: write failed\n%s" % (job_activity_log, exc))
    if ctx.cmd and ctx.ret_code:
        LOG.error(ctx_str)
    elif ctx.cmd:
        LOG.debug(ctx_str)

def _check_timeout(self, itask, now):
    """Check/handle submission/execution timeouts."""
    if itask.state.status == TASK_STATUS_RUNNING:
        timer = itask.poll_timers.get(self.KEY_EXECUTE_TIME_LIMIT)
        if timer is not None:
            if not timer.is_timeout_set():
                timer.next()
            if not timer.is_delay_done():
                # Don't poll
                return False
            if timer.next() is not None:
                # Poll now, and more retries lined up
                return True
    # No more retry lined up, can issue execution timeout event
    if itask.state.status in itask.timeout_timers:
        timeout = itask.timeout_timers[itask.state.status]
        if timeout is None or now <= timeout:
            return False
        itask.timeout_timers[itask.state.status] = None
        if (itask.state.status == TASK_STATUS_RUNNING and
                itask.summary['started_time'] is not None):
            msg = 'job started %s ago, but has not finished' % (
                get_seconds_as_interval_string(
                    timeout - itask.summary['started_time']))
            event = 'execution timeout'
        elif (itask.state.status == TASK_STATUS_SUBMITTED and
                itask.summary['submitted_time'] is not None):
            msg = 'job submitted %s ago, but has not started' % (
                get_seconds_as_interval_string(
                    timeout - itask.summary['submitted_time']))
            event = 'submission timeout'
        else:
            return False
        LOG.warning(msg, itask=itask)
        self.task_events_mgr.setup_event_handlers(itask, event, msg)
        return True

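# _check_timeout drives a timer object whose protocol is assumed to be:
# next() consumes the next delay and returns the new timeout (or None
# when the delays are exhausted), is_timeout_set() reports whether a
# timeout is pending, and is_delay_done() reports whether it has passed.
# A hypothetical minimal implementation of that protocol (not the real
# cylc class):
from time import time


class PollTimer(object):
    """Successive poll delays in seconds, e.g. PollTimer([30, 60, 120])."""

    def __init__(self, delays):
        self.delays = list(delays)
        self.timeout = None

    def is_timeout_set(self):
        return self.timeout is not None

    def next(self):
        if self.delays:
            self.timeout = time() + self.delays.pop(0)
        else:
            self.timeout = None
        return self.timeout

    def is_delay_done(self):
        return self.timeout is not None and time() > self.timeout
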
def process_message(self, itask, severity, message, event_time=None,
                    flag='', submit_num=None):
    """Parse an incoming task message and update task state.

    Incoming, e.g. "succeeded at <TIME>", may be from task job or polling.

    It is possible for my current state to be inconsistent with an
    incoming message (whether normal or polled) e.g. due to a late poll
    result, or a network outage, or manual state reset. To handle this,
    if a message would take the task state backward, issue a poll to
    confirm instead of changing state - then always believe the next
    message. Note that the next message might not be the result of this
    confirmation poll, in the unlikely event that a job emits a
    succession of messages very quickly, but this is the best we can do
    without somehow uniquely associating each poll with its result
    message.

    Arguments:
        itask (cylc.task_proxy.TaskProxy):
            The task proxy object relevant for the message.
        severity (str or int):
            Message severity, should be a recognised logging level.
        message (str):
            Message content.
        event_time (str):
            Event time stamp. Expect ISO8601 date time string.
            If not specified, use current time.
        flag (str):
            If specified, can be INCOMING_FLAG to indicate an incoming
            message, POLLED_FLAG to indicate a message resulted from a
            poll. Otherwise, the message is assumed to be generated by
            the logic in the suite server program.
        submit_num (int):
            The submit number of the task relevant for the message.
            If not specified, use latest submit number.

    Return:
        None: in normal circumstances.
        True: if polling is required to confirm a reversal of status.
    """
    # Log incoming messages
    if event_time is None:
        event_time = get_current_time_string()
    if submit_num is None:
        submit_num = itask.submit_num
    if flag == self.INCOMING_FLAG and submit_num != itask.submit_num:
        flag = self.IGNORED_INCOMING_FLAG
    log_message = '(current:%s)%s %s at %s' % (
        itask.state.status, flag, message, event_time)
    LOG.log(self.LEVELS.get(severity, INFO), log_message, itask=itask)
    if flag == self.IGNORED_INCOMING_FLAG:
        LOG.warning(
            'submit-num=%d: ignore message from job submit-num=%d' % (
                itask.submit_num, submit_num),
            itask=itask)
        return

    # always update the suite state summary for latest message
    itask.summary['latest_message'] = message
    if flag == self.POLLED_FLAG:
        itask.summary['latest_message'] += ' %s' % self.POLLED_FLAG
    cylc.flags.iflag = True

    # Satisfy my output, if possible, and record the result.
    an_output_was_satisfied = itask.state.outputs.set_msg_trg_completion(
        message=message, is_completed=True)

    if message == TASK_OUTPUT_STARTED:
        if (flag == self.INCOMING_FLAG
                and itask.state.is_gt(TASK_STATUS_RUNNING)):
            return True
        self._process_message_started(itask, event_time)
    elif message == TASK_OUTPUT_SUCCEEDED:
        self._process_message_succeeded(itask, event_time)
    elif message == TASK_OUTPUT_FAILED:
        if (flag == self.INCOMING_FLAG
                and itask.state.is_gt(TASK_STATUS_FAILED)):
            return True
        self._process_message_failed(itask, event_time, self.JOB_FAILED)
    elif message == self.EVENT_SUBMIT_FAILED:
        if (flag == self.INCOMING_FLAG
                and itask.state.is_gt(TASK_STATUS_SUBMIT_FAILED)):
            return True
        self._process_message_submit_failed(itask, event_time)
    elif message == TASK_OUTPUT_SUBMITTED:
        if (flag == self.INCOMING_FLAG
                and itask.state.is_gt(TASK_STATUS_SUBMITTED)):
            return True
        self._process_message_submitted(itask, event_time)
    elif message.startswith(FAIL_MESSAGE_PREFIX):
        # Task received signal.
        if (flag == self.INCOMING_FLAG
                and itask.state.is_gt(TASK_STATUS_FAILED)):
            return True
        signal = message[len(FAIL_MESSAGE_PREFIX):]
        self._db_events_insert(itask, "signaled", signal)
        self.suite_db_mgr.put_update_task_jobs(
            itask, {"run_signal": signal})
        self._process_message_failed(itask, event_time, self.JOB_FAILED)
    elif message.startswith(ABORT_MESSAGE_PREFIX):
        # Task aborted with message
        if (flag == self.INCOMING_FLAG
                and itask.state.is_gt(TASK_STATUS_FAILED)):
            return True
        aborted_with = message[len(ABORT_MESSAGE_PREFIX):]
        self._db_events_insert(itask, "aborted", message)
        self.suite_db_mgr.put_update_task_jobs(
            itask, {"run_signal": aborted_with})
        self._process_message_failed(itask, event_time, aborted_with)
    elif message.startswith(VACATION_MESSAGE_PREFIX):
        # Task job pre-empted into a vacation state
        self._db_events_insert(itask, "vacated", message)
        itask.set_summary_time('started')  # unset
        if TASK_STATUS_SUBMIT_RETRYING in itask.try_timers:
            itask.try_timers[TASK_STATUS_SUBMIT_RETRYING].num = 0
        itask.job_vacated = True
        # Believe this and change state without polling (could poll?).
        self.pflag = True
        itask.state.reset_state(TASK_STATUS_SUBMITTED)
        self._reset_job_timers(itask)
    elif an_output_was_satisfied:
        # Message of an as-yet unreported custom task output.
        # No state change.
        self.pflag = True
        self.suite_db_mgr.put_update_task_outputs(itask)
    else:
        # Unhandled messages. These include:
        #  * general non-output/progress messages
        #  * poll messages that repeat previous results
        # Note that all messages are logged already at the top.
        # No state change.
        LOG.debug(
            '(current:%s) unhandled: %s' % (itask.state.status, message),
            itask=itask)
        if severity in [CRITICAL, ERROR, WARNING, INFO, DEBUG]:
            severity = getLevelName(severity)
        self._db_events_insert(
            itask, ("message %s" % str(severity).lower()), message)
        if severity in ['WARNING', 'CRITICAL', 'CUSTOM']:
            self.setup_event_handlers(itask, severity.lower(), message)

def process_events(self, schd_ctx):
    """Process task events that were created by "setup_event_handlers".

    schd_ctx is an instance of "Scheduler" in "cylc.scheduler".
    """
    ctx_groups = {}
    now = time()
    for id_key, timer in self.event_timers.copy().items():
        key1, point, name, submit_num = id_key
        if timer.is_waiting:
            continue
        # Set timer if timeout is None.
        if not timer.is_timeout_set():
            if timer.next() is None:
                LOG.warning("%s/%s/%02d %s failed" % (
                    point, name, submit_num, key1))
                del self.event_timers[id_key]
                continue
            # Report retries and delayed 1st try
            tmpl = None
            if timer.num > 1:
                tmpl = "%s/%s/%02d %s failed, retrying in %s"
            elif timer.delay:
                tmpl = "%s/%s/%02d %s will run after %s"
            if tmpl:
                LOG.debug(tmpl % (
                    point, name, submit_num, key1,
                    timer.delay_timeout_as_str()))
        # Ready to run?
        if not timer.is_delay_done() or (
                # Avoid flooding user's mail box with mail notification.
                # Group together as many notifications as possible within
                # a given interval.
                timer.ctx.ctx_type == self.HANDLER_MAIL and
                not schd_ctx.stop_mode and
                self.next_mail_time is not None and
                self.next_mail_time > now):
            continue

        timer.set_waiting()
        if timer.ctx.ctx_type == self.HANDLER_CUSTOM:
            # Run custom event handlers on their own
            self.proc_pool.put_command(
                SuiteProcContext(
                    (key1, submit_num),
                    timer.ctx.cmd, env=os.environ, shell=True,
                ),
                self._custom_handler_callback, [schd_ctx, id_key])
        else:
            # Group together built-in event handlers, where possible
            if timer.ctx not in ctx_groups:
                ctx_groups[timer.ctx] = []
            ctx_groups[timer.ctx].append(id_key)

    next_mail_time = now + self.mail_interval
    for ctx, id_keys in ctx_groups.items():
        if ctx.ctx_type == self.HANDLER_MAIL:
            # Set next_mail_time if any mail sent
            self.next_mail_time = next_mail_time
            self._process_event_email(schd_ctx, ctx, id_keys)
        elif ctx.ctx_type == self.HANDLER_JOB_LOGS_RETRIEVE:
            self._process_job_logs_retrieval(schd_ctx, ctx, id_keys)

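# The grouping idiom in process_events, in isolation: timers that share
# an identical context are batched so one handler invocation (e.g. one
# email) can cover many events. Keys and context types are illustrative.
ctx_groups = {}
for id_key, ctx_type in [("a", "mail"), ("b", "mail"), ("c", "retrieve")]:
    ctx_groups.setdefault(ctx_type, []).append(id_key)
print(ctx_groups)  # {'mail': ['a', 'b'], 'retrieve': ['c']}
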
def process_message(self, itask, severity, message, poll_func,
                    poll_event_time=None, incoming_event_time=None,
                    submit_num=None):
    """Parse an incoming task message and update task state.

    Incoming, e.g. "succeeded at <TIME>", may be from task job or polling.

    It is possible for my current state to be inconsistent with an
    incoming message (whether normal or polled) e.g. due to a late poll
    result, or a network outage, or manual state reset. To handle this,
    if a message would take the task state backward, issue a poll to
    confirm instead of changing state - then always believe the next
    message. Note that the next message might not be the result of this
    confirmation poll, in the unlikely event that a job emits a
    succession of messages very quickly, but this is the best we can do
    without somehow uniquely associating each poll with its result
    message.
    """
    # Log incoming messages with '>' to distinguish non-message log
    # entries
    if incoming_event_time:
        if submit_num is None or submit_num == itask.submit_num:
            message_flag = self.INCOMING_FLAG
        else:
            message_flag = self.IGNORED_INCOMING_FLAG
        event_time = incoming_event_time
    elif poll_event_time:
        message_flag = self.POLLED_FLAG
        event_time = poll_event_time
    else:
        message_flag = ''
        event_time = get_current_time_string()
    log_message = '(current:%s)%s %s at %s' % (
        itask.state.status, message_flag, message, event_time)
    LOG.log(self.LEVELS.get(severity, INFO), log_message, itask=itask)
    if message_flag == self.IGNORED_INCOMING_FLAG:
        LOG.warning(
            'submit-num=%d: ignore message from job submit-num=%d' % (
                itask.submit_num, submit_num),
            itask=itask)
        return

    # always update the suite state summary for latest message
    itask.summary['latest_message'] = message
    if poll_event_time is not None:
        itask.summary['latest_message'] += ' %s' % self.POLLED_FLAG
    cylc.flags.iflag = True

    # Satisfy my output, if possible, and record the result.
    an_output_was_satisfied = itask.state.outputs.set_msg_trg_completion(
        message=message, is_completed=True)

    if message == TASK_OUTPUT_STARTED:
        if self._poll_to_confirm(itask, TASK_STATUS_RUNNING, poll_func):
            return
        self._process_message_started(itask, event_time)
    elif message == TASK_OUTPUT_SUCCEEDED:
        if self._poll_to_confirm(itask, TASK_STATUS_SUCCEEDED, poll_func):
            return
        self._process_message_succeeded(itask, event_time)
    elif message == TASK_OUTPUT_FAILED:
        if self._poll_to_confirm(itask, TASK_STATUS_FAILED, poll_func):
            return
        self._process_message_failed(itask, event_time, self.JOB_FAILED)
    elif message == self.EVENT_SUBMIT_FAILED:
        if self._poll_to_confirm(
                itask, TASK_STATUS_SUBMIT_FAILED, poll_func):
            return
        self._process_message_submit_failed(itask, event_time)
    elif message == TASK_OUTPUT_SUBMITTED:
        if self._poll_to_confirm(itask, TASK_STATUS_SUBMITTED, poll_func):
            return
        self._process_message_submitted(itask, event_time)
    elif message.startswith(FAIL_MESSAGE_PREFIX):
        # Task received signal.
        signal = message[len(FAIL_MESSAGE_PREFIX):]
        self._db_events_insert(itask, "signaled", signal)
        self.suite_db_mgr.put_update_task_jobs(
            itask, {"run_signal": signal})
        if self._poll_to_confirm(itask, TASK_STATUS_FAILED, poll_func):
            return
        self._process_message_failed(itask, event_time, self.JOB_FAILED)
    elif message.startswith(ABORT_MESSAGE_PREFIX):
        # Task aborted with message
        aborted_with = message[len(ABORT_MESSAGE_PREFIX):]
        self._db_events_insert(itask, "aborted", message)
        self.suite_db_mgr.put_update_task_jobs(
            itask, {"run_signal": aborted_with})
        if self._poll_to_confirm(itask, TASK_STATUS_FAILED, poll_func):
            return
        self._process_message_failed(itask, event_time, aborted_with)
    elif message.startswith(VACATION_MESSAGE_PREFIX):
        # Task job pre-empted into a vacation state
        self._db_events_insert(itask, "vacated", message)
        itask.set_summary_time('started')  # unset
        if TASK_STATUS_SUBMIT_RETRYING in itask.try_timers:
            itask.try_timers[TASK_STATUS_SUBMIT_RETRYING].num = 0
        itask.job_vacated = True
        try:
            itask.timeout_timers[TASK_STATUS_SUBMITTED] = (
                itask.summary['submitted_time'] +
                float(self._get_events_conf(itask, 'submission timeout')))
        except (TypeError, ValueError):
            itask.timeout_timers[TASK_STATUS_SUBMITTED] = None
        # Believe this and change state without polling (could poll?).
        self.pflag = True
        itask.state.reset_state(TASK_STATUS_SUBMITTED)
    elif an_output_was_satisfied:
        # Message of an as-yet unreported custom task output.
        # No state change.
        self.pflag = True
        self.suite_db_mgr.put_update_task_outputs(itask)
    else:
        # Unhandled messages. These include:
        #  * general non-output/progress messages
        #  * poll messages that repeat previous results
        # Note that all messages are logged already at the top.
        # No state change.
        LOG.debug(
            '(current:%s) unhandled: %s' % (itask.state.status, message),
            itask=itask)
        if severity in [CRITICAL, ERROR, WARNING, INFO, DEBUG]:
            severity = getLevelName(severity)
        self._db_events_insert(
            itask, ("message %s" % str(severity).lower()), message)
        if severity in ['WARNING', 'CRITICAL', 'CUSTOM']:
            self.setup_event_handlers(itask, severity.lower(), message)

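# The "poll to confirm" rule from the docstring above, reduced to its
# core: given a linear ordering of statuses, a message implying an
# earlier status than the current one is confirmed by polling rather than
# believed outright. Status names here are illustrative, not the full
# cylc set.
STATUS_ORDER = ["submitted", "running", "succeeded"]

def would_go_backward(current_status, incoming_status):
    return (STATUS_ORDER.index(incoming_status) <
            STATUS_ORDER.index(current_status))

# A late "running" message arriving after "succeeded" triggers a
# confirming poll instead of an immediate state change:
print(would_go_backward("succeeded", "running"))  # True
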
def _run_event_mail_callback(proc_ctx):
    """Callback the mail command for notification of a suite event."""
    if proc_ctx.ret_code:
        LOG.warning(str(proc_ctx))
    else:
        LOG.info(str(proc_ctx))

def remote_tidy(self):
    """Remove suite contact files from initialised remotes.

    Call "cylc remote-tidy".
    This method is called on suite shutdown, so we want nothing to hang.
    Timeout any incomplete commands after 10 seconds.

    Also remove UUID file on suite host ".service/uuid".
    """
    # Remove UUID file
    uuid_fname = os.path.join(
        self.suite_srv_files_mgr.get_suite_srv_dir(self.suite), 'uuid')
    try:
        os.unlink(uuid_fname)
    except OSError:
        pass
    # Issue all SSH commands in parallel
    procs = {}
    for (host, owner), init_with_contact in self.remote_init_map.items():
        if init_with_contact != REMOTE_INIT_DONE:
            continue
        cmd = ['timeout', '10', 'cylc', 'remote-tidy']
        if is_remote_host(host):
            cmd.append('--host=%s' % host)
        if is_remote_user(owner):
            cmd.append('--user=%s' % owner)
        if cylc.flags.debug:
            cmd.append('--debug')
        cmd.append(
            os.path.join(glbl_cfg().get_derived_host_item(
                self.suite, 'suite run directory', host, owner)))
        procs[(host, owner)] = (
            cmd,
            Popen(cmd, stdout=PIPE, stderr=PIPE, stdin=open(os.devnull)))
    # Wait for commands to complete for a max of 10 seconds
    timeout = time() + 10.0
    while procs and time() < timeout:
        for (host, owner), (cmd, proc) in procs.copy().items():
            if proc.poll() is None:
                continue
            del procs[(host, owner)]
            out, err = proc.communicate()
            if proc.wait():
                LOG.warning(
                    TaskRemoteMgmtError(
                        TaskRemoteMgmtError.MSG_TIDY, (host, owner),
                        ' '.join(quote(item) for item in cmd),
                        proc.returncode, out, err))
    # Terminate any remaining commands
    for (host, owner), (cmd, proc) in procs.items():
        try:
            proc.terminate()
        except OSError:
            pass
        out, err = proc.communicate()
        if proc.wait():
            LOG.warning(
                TaskRemoteMgmtError(
                    TaskRemoteMgmtError.MSG_TIDY, (host, owner),
                    ' '.join(quote(item) for item in cmd),
                    proc.returncode, out, err))
