class LogOutput:
    """
    Base class for log output handlers.

    Handler classes are added with register(); getOutputHandlers() then asks
    each registered class for a handler matching a given log configuration.
    Handler instances are kept in a class-level dictionary keyed by the name
    returned from getName().

    This class never assigns `self.handle`; write() expects it to be a
    file-like object supplied elsewhere (presumably by subclasses -- confirm).
    """

    name = None
    config_match = lambda c: False      # predicate applied to a config by matchesConfig()

    _cls_handlers = lazydict()          # handler instances, keyed by handler name
    _cls_reghandlers = []               # handler classes added through register()

    @classmethod
    def register(cls, handlercls):
        "Adds handlercls to the list of classes consulted by getOutputHandlers()."
        cls._cls_reghandlers.append(handlercls)

    @classmethod
    def getOutputHandlers(cls, config):
        "Returns a list of the (truthy) handlers registered classes provide for config."
        candidates = [regcls.getHandler(config) for regcls in cls._cls_reghandlers]
        return [handler for handler in candidates if handler]

    @classmethod
    def getName(cls, config):
        "Returns the key under which the handler for config is cached."
        return cls.name

    @classmethod
    def matchesConfig(cls, config):
        "Returns a true value only if config is enabled and config_match accepts it."
        enabled = config.enabled
        return cls.config_match(config) if enabled else enabled

    @classmethod
    def getHandler(cls, config):
        """
        Returns the handler for config, or None if this class does not match the
        configuration or has no name for it.
        """
        if cls.matchesConfig(config):
            key = cls.getName(config)
            if key is not None:
                # A factory is passed rather than an instance; presumably lazydict
                # only calls it when the key is absent -- confirm against lazydict.
                return cls._cls_handlers.setdefault(key, lambda: cls(config))
        return None

    def __init__(self, config):
        self.name = config.name
        self.config = config

    def close(self):
        "Does nothing in the base class."
        pass

    def writeLog(self, logattrs, priority, facility):
        """
        Builds one log line from the parsed record attributes in logattrs and
        passes it to write().
        """
        if logattrs.get('format_error'):
            line = "??" + logattrs['raw']
        else:
            stamp = logattrs['date']
            host = self.config.logrec_hostname or logattrs['host'] or _our_hostname
            # Note that 'rest' always starts with a ':', '[' or ' '.
            line = ' '.join((stamp, host, logattrs['tag'])) + logattrs['rest']

        if self.config.extended:
            line = get_syslog_info(facility, priority) + " " + line

        self.write(line)

    def write(self, data):
        "Writes data plus a newline to self.handle, then flushes it."
        out = self.handle
        out.write(data)
        out.write("\n")
        out.flush()
def __init__(self, *args, default = None, uid = None, extra_settings = None, disable_console_log = False):
    """
    Given one or more files, load our configuration.  If no configuration is provided,
    then use the configuration specified by the default.

    args                 -- paths of YAML files; files which do not exist are skipped and
                            the remainder are merged in the order given.
    default              -- YAML text used only if the files produced no configuration.
    uid                  -- initial uid; a 'uid' entry in the settings replaces it.
    extra_settings       -- if given, passed to update_settings() after validation.
    disable_console_log  -- if true, 'stdout' and 'stderr' entries are removed from every
                            '*.logging' section.
    """
    debug("CONFIG INPUT (uid={1}): '{0}'".format(args, uid))

    self.uid = uid
    self._conf = lazydict()

    # NOTE(review): yaml.Loader can construct arbitrary Python objects.  That is fine
    # only as long as configuration files are trusted input; consider yaml.SafeLoader
    # if they might not be.
    for fn in args:
        if os.path.exists(fn):
            # Close the file deterministically rather than waiting for garbage collection.
            with open(fn, 'r') as conffile:
                text = conffile.read()
            self._merge(yaml.load(text.expandtabs(), Loader=yaml.Loader))

    if not self._conf and default:
        # Name the loader explicitly, as for files above: PyYAML 6 and later refuse to
        # load without one.
        self._conf = lazydict(yaml.load(default, Loader=yaml.Loader))

    validator(self._conf)

    if extra_settings:
        self.update_settings(extra_settings)

    s = self.get_settings()
    self.uid = s.get('uid', self.uid)
    self.gid = s.get('gid', self.gid)

    # Special case used by --no-console-log.  It really was just easiest to do it this way
    # rather than try to build some special notion of "console logging" into the log services
    # backends.

    if disable_console_log:
        for k, v in self._conf.items():
            if k.endswith('.logging'):
                for stream in ('stdout', 'stderr'):
                    if stream in v:
                        del v[stream]
def __init__(self, *args, default = None, uid = None, extra_settings = None, disable_console_log = False):
    """
    Given one or more files, load our configuration.  If no configuration is provided,
    then use the configuration specified by the default.

    args                 -- paths of YAML files; files which do not exist are skipped and
                            the remainder are merged in the order given.
    default              -- YAML text used only if the files produced no configuration.
    uid                  -- initial uid; a 'uid' entry in the settings replaces it.
    extra_settings       -- if given, passed to update_settings() after validation.
    disable_console_log  -- if true, 'stdout' and 'stderr' entries are removed from every
                            '*.logging' section.
    """
    debug("CONFIG INPUT (uid={1}): '{0}'".format(args, uid))

    self.uid = uid
    self._conf = lazydict()

    # The loader is named explicitly because PyYAML 6 and later refuse to load without
    # one.  yaml.Loader is the loader older PyYAML releases used when none was given.
    # NOTE(review): yaml.Loader can construct arbitrary Python objects.  That is fine
    # only as long as configuration files are trusted input; consider yaml.SafeLoader
    # if they might not be.
    for fn in args:
        if os.path.exists(fn):
            # Close the file deterministically rather than waiting for garbage collection.
            with open(fn, 'r') as conffile:
                text = conffile.read()
            self._merge(yaml.load(text.expandtabs(), Loader=yaml.Loader))

    if not self._conf and default:
        self._conf = lazydict(yaml.load(default, Loader=yaml.Loader))

    validator(self._conf)

    if extra_settings:
        self.update_settings(extra_settings)

    s = self.get_settings()
    self.uid = s.get('uid', self.uid)
    self.gid = s.get('gid', self.gid)

    # Special case used by --no-console-log.  It really was just easiest to do it this way
    # rather than try to build some special notion of "console logging" into the log services
    # backends.

    if disable_console_log:
        for k, v in self._conf.items():
            if k.endswith('.logging'):
                for stream in ('stdout', 'stderr'):
                    if stream in v:
                        del v[stream]
def get_logconfigs(self):
    """
    Returns a lazydict with one LogConfig for every configuration section whose
    key ends in '.logging', keyed by that section key.
    """
    environment = self.get_environment()
    settings = self._conf.get('settings')

    logging_sections = (
        (section_name, LogConfig(section, section_name, environment, settings))
        for section_name, section in self._conf.items()
        if section_name.endswith('.logging')
    )
    return lazydict(logging_sections)
def get_startup_list(self):
    """
    Returns the list of start-up items in priority order by examining before: and after:
    attributes.

    The result is computed once and kept in self._ordered_startup.  The ordering work is
    done on a copy obtained from self.deepcopy(); the returned list holds the original
    items (self[name]), each given a 'prerequisites' set naming the services it must
    follow.  Raises Exception if the declarations are circular.
    """
    cached = self._ordered_startup
    if cached is not None:
        return cached

    services = self.deepcopy()

    # Index the services by each group they belong to.
    groups = lazydict()
    for svname, svc in services.items():
        for gname in svc.service_groups:
            groups.setdefault(gname, lambda: lazydict())[svname] = svc

    # The "IDLE" and "INIT" groups are special.  Any service in "IDLE" gets an implicit
    # "after: all-others", and any service in "INIT" gets an implicit "before: all-others",
    # where all-others is the explicit list of services NOT in the respective group.

    if 'IDLE' in groups:
        outside_idle = {svname for svname, svc in services.items()
                        if "IDLE" not in svc.service_groups}
        for member in groups['IDLE'].values():
            member.after.update(outside_idle)

    if 'INIT' in groups:
        outside_init = {svname for svname, svc in services.items()
                        if "INIT" not in svc.service_groups}
        for member in groups['INIT'].values():
            member.before.update(outside_init)

    # Only "after:" is examined from here on, so restate every "before:" as an "after:"
    # on the target (or on every member of the target, when the target is a group).

    for svc in services.values():
        for target in svc.before:
            if target in groups:
                for member in groups[target].values():
                    member.after.add(svc.name)
            elif target in services:
                services[target].after.add(svc.name)
        svc.before = None

    # Before is now gone.  Make sure every "after <group>" is also expressed as
    # "after <each service in that group>".

    for members in groups.values():
        combined_after = set()
        for member in members.values():
            combined_after.update(member.after)
        for ref in combined_after:
            if ref not in groups:
                continue
            group_service_names = groups[ref].keys()
            for member in members.values():
                member.after.update(group_service_names)

    # Now drop references to undefined services or groups and turn 'after' into a
    # definitive graph.
    #
    # Note: sorting occurs a couple of times below so that the results are deterministic
    # in cases where exact order is not defined.

    defined = set(services.keys())
    by_name = attrgetter('name')
    for svc in services.values():
        svc.refs = sorted((services[n] for n in svc.after.intersection(defined)), key=by_name)

    ordered = []        # this will be our final list, containing original items
    emitted = set()

    def add_nodes(items):
        for item in items:
            # 'active' marks the items on the current path; meeting one again is a cycle.
            if hasattr(item, 'active'):
                raise Exception("circular dependency in service declaration")
            item.active = True
            add_nodes(item.refs)
            del item.active
            if item.name in emitted:
                continue
            emitted.add(item.name)
            original = self[item.name]
            ordered.append(original)
            # set startup prerequisite dependencies
            original.prerequisites = {ref.name for ref in item.refs}

    add_nodes(sorted(services.values(), key=by_name))

    self._ordered_startup = ordered
    return ordered
class SubProcess(object):
    """
    Base class for a process run on behalf of one service definition.

    Instantiating SubProcess itself yields an instance of a process-type-specific
    subclass (see __new__).  Attributes which are not found on the instance are read
    from the service object (see __getattr__).
    """

    service = None              # service object
    family = None
    process_timeout = 30.0      # process_timeout will be set to this unless it is overridden by
                                # the service entry
    syslog_facility = None      # specifies any additional syslog facility to use when using
                                # logerror, logdebug, logwarn, etc...
    start_attempted = False     # used to determine if a service is truly dormant
    defer_exit_kills = False    # if true, then exit_kills will wait until a proper PID is returned
                                # from a subprocess, then will kill when the real process exits
    error_count = 0             # counts errors for informational purposes

    _proc = None
    _pid = None                 # the pid, often associated with _proc, but not necessarily in the
                                # case of notify processes
    _returncode = None          # an alternate returncode, set with returncode property
    _exit_event = None          # an event to be fired if an exit occurs, in the case of an
                                # attached PID
    _orig_executable = None     # original unexpanded exec_args[0]
    _pwrec = None               # the pwrec looked up for execution user/group
    _cond_starting = None       # a condition which, if present, indicates that this service is starting
    _cond_exception = None      # exception which was raised during startup (for other waiters)
    _started = False            # true if a start has occurred, either successful or not
    _restarts_allowed = None    # number of starts permitted before we give up (if None then
                                # restarts allowed according to service def)
    _prereq_cache = None
    _procenv = None             # process environment ready to be expanded
    _pending = None             # pending futures
    _note = None

    # Class variables
    _cls_ptdict = lazydict()    # dictionary of process types
    _cls_serial = 0             # serial number for process creation

    def __new__(cls, service, family=None):
        """
        New Subprocesses are managed by subclasses derived from SubProcess so that
        complex process behavior can be isolated and loaded only when needed.  That
        keeps this basic superclass logic less convoluted.

        The subclass is '<Type>Process' in module 'chaperone.cproc.pt.<type>', where
        <type> is service.type.  The class is cached after the first lookup.
        """
        # If we are trying to create a subclass, just inherit __new__ simply
        if cls is not SubProcess:
            return super(SubProcess, cls).__new__(cls)

        # Lookup and cache the class object used to create this type.
        stype = service.type
        ptcls = SubProcess._cls_ptdict.get(stype)
        if not ptcls:
            mod = importlib.import_module('chaperone.cproc.pt.' + stype)
            ptcls = SubProcess._cls_ptdict[stype] = getattr(mod, stype.capitalize() + 'Process')
            assert issubclass(ptcls, cls)

        return ptcls(service, family)

    def __init__(self, service, family=None):
        """
        Binds this process to its service definition and (optional) family.

        Raises ChParameterError if the service has no exec_args.  If the service is
        enabled, the executable is resolved immediately (see _try_to_enable), which may
        raise ChNotFoundError.
        """
        self.service = service
        self.family = family

        self._pending = set()

        if service.process_timeout is not None:
            self.process_timeout = service.process_timeout

        if not service.environment:
            self._procenv = Environment()
        else:
            self._procenv = service.environment

        if not service.exec_args:
            raise ChParameterError("No command or arguments provided for service")

        # If the service is enabled, assure we check for the presence of the executable now.  This is
        # to catch any start-up situations (such as cron jobs without their executables being present).
        # However, we don't check this if a service is disabled.

        self._orig_executable = service.exec_args[0]

        if service.enabled:
            self._try_to_enable()

    def __getattr__(self, name):
        "Proxies value from the service description if we don't override them."
        return getattr(self.service, name)

    def __setattr__(self, name, value):
        """
        Any service object attribute supersedes our own except for privates or those we
        keep separately, in which case there is a distinction.
        """
        # name[0:1] is the first character.  (An earlier form of this test sliced [0:0],
        # which is always the empty string, so private names were never recognized.)
        #
        # NOTE(review): hasattr(self, name) falls through to __getattr__, which proxies
        # to the service, so it appears to be true whenever hasattr(self.service, name)
        # is true.  If so, the first branch is never taken and every assignment is stored
        # locally.  Left as-is because callers may depend on that; confirm the intent.
        if name[0:1] != '_' and hasattr(self.service, name) and not hasattr(self, name):
            setattr(self.service, name, value)
        else:
            object.__setattr__(self, name, value)

    def _setup_subprocess(self):
        """
        Passed as preexec_fn when the subprocess is created: switches to the looked-up
        group and user, optionally starts a new process group, and changes to the user's
        home directory if the service names no directory.
        """
        # NOTE(review): indentation was reconstructed; the setpgrp and chdir steps are
        # assumed to apply only when a pwrec is present -- confirm.
        if self._pwrec:
            os.setgid(self._pwrec.pw_gid)
            os.setuid(self._pwrec.pw_uid)
            if self.setpgrp:
                os.setpgrp()
            if not self.directory:
                try:
                    os.chdir(self._pwrec.pw_dir)
                except Exception:
                    # Best effort only: the process still runs if the directory is unusable.
                    pass
        return

    def _get_states(self):
        "Returns a space-separated summary of the started/failed/ready/running flags."
        states = list()
        if self.started:
            states.append('started')
        if self.failed:
            states.append('failed')
        if self.ready:
            states.append('ready')
        if self.running:
            states.append('running')
        return ' '.join(states)

    # pid and returncode management

    @property
    def pid(self):
        return self._pid

    @pid.setter
    def pid(self, newpid):
        """
        Records newpid.  If a different PID was already recorded, the new PID is attached
        (see _attach_pid) so that its exit is noticed.

        Raises ChProcessError if the new PID does not refer to an existing process.
        """
        # PIDs are compared by value.  An identity test is unreliable for integers: an
        # equal PID obtained from another source (such as a pidfile) would look like a
        # change and be attached a second time.
        if self._pid is not None and newpid is not None and self._pid != newpid:
            self.logdebug("{0} changing PID to {1} (from {2})", self.name, newpid, self._pid)
            try:
                os.getpgid(newpid)      # used only to verify that the process exists
            except ProcessLookupError as ex:
                raise ChProcessError("{0} attempted to attach the process with PID={1} but there is no such process"
                                     .format(self.name, newpid), errno=ex.errno)
            self._attach_pid(newpid)
        self._pid = newpid

    @property
    def returncode(self):
        "The explicitly-set return code if there is one, otherwise that of the process (if any)."
        if self._returncode is not None:
            return self._returncode
        return self._proc and self._proc.returncode

    @returncode.setter
    def returncode(self, val):
        self._returncode = ProcStatus(val)
        self.logdebug("{0} got explicit return code '{1}'", self.name, self._returncode)

    # Logging methods which may do special things for this service

    def loginfo(self, *args, **kwargs):
        info(*args, facility=self.syslog_facility, **kwargs)

    def logerror(self, *args, **kwargs):
        self.error_count += 1
        error(*args, facility=self.syslog_facility, **kwargs)

    def logwarn(self, *args, **kwargs):
        warn(*args, facility=self.syslog_facility, **kwargs)

    def logdebug(self, *args, **kwargs):
        debug(*args, facility=self.syslog_facility, **kwargs)

    @property
    def note(self):
        return self._note

    @note.setter
    def note(self, value):
        self._note = value

    @property
    def status(self):
        """
        A brief status: "starting", "running", "started", the brief form of the return
        code (with any remaining restart count appended), "disabled", or else whatever
        default_status() returns.
        """
        serv = self.service
        proc = self._proc

        rs = ""
        if serv.restart and self._restarts_allowed is not None and self._restarts_allowed > 0:
            rs = "+r#" + str(self._restarts_allowed)

        if self._cond_starting:
            return "starting"

        if proc:
            rc = self._returncode if self._returncode is not None else proc.returncode
            if rc is None:
                return "running"
            elif rc.normal_exit and self._started:
                return "started"
            elif rc:
                return rc.briefly + rs

        if not serv.enabled:
            return "disabled"

        return self.default_status()

    def default_status(self):
        "Returns 'ready' if the process is ready, otherwise None."
        if self.ready:
            return 'ready'
        return None

    @property
    def enabled(self):
        return self.service.enabled

    @enabled.setter
    def enabled(self, val):
        """
        Disables the service for a false value.  For a true value, enables it (with the
        checks made by _try_to_enable) unless it is already enabled.
        """
        # An already-enabled service must stay enabled when asked to enable again; it
        # must not fall through to the disabling branch.
        if not val:
            self.service.enabled = False
        elif not self.service.enabled:
            self._try_to_enable()

    def _try_to_enable(self):
        """
        Resolves the service executable and user credentials, then marks the service
        enabled.

        If the executable is missing, an optional or ignore_failures service is left
        disabled; otherwise ChNotFoundError is raised.
        """
        service = self.service
        if self._orig_executable:
            try:
                service.exec_args[0] = executable_path(self._orig_executable, service.environment.expanded())
            except FileNotFoundError:
                if service.optional:
                    service.enabled = False
                    self.loginfo("optional service {0} disabled since '{1}' is not present"
                                 .format(self.name, self._orig_executable))
                    return
                elif service.ignore_failures:
                    service.enabled = False
                    self.logwarn("(ignored) service {0} executable '{1}' is not present"
                                 .format(self.name, self._orig_executable))
                    return
                raise ChNotFoundError("executable '{0}' not found".format(service.exec_args[0]))

        # Now we know this service is truly enabled, we need to assure its credentials
        # are correct.

        senv = service.environment
        if senv and senv.uid is not None and not self._pwrec:
            self._pwrec = lookup_user(senv.uid, senv.gid)

        service.enabled = True

    @property
    def scheduled(self):
        """
        True if this is a process which WILL fire up a process in the future.
        A "scheduled" process does not include one which will be started manually,
        nor does it include processes which will be started due to dependencies.
        Processes like "cron" and "inetd" return True if they are active and may
        start processes in the future.
        """
        return False

    @property
    def kill_signal(self):
        "The signal named by the service, or SIGTERM if it names none."
        ksig = self.service.kill_signal
        if ksig is not None:
            return ksig
        return signal.SIGTERM

    @property
    def running(self):
        "True if this process has started, is running, and has a pid"
        return self._proc and self._proc.returncode is None

    @property
    def started(self):
        """
        True if this process has started normally.  It may have forked, or executed, or
        is scheduled.
        """
        return self._started

    @property
    def stoppable(self):
        """
        True if this process can be stopped.  By default, returns True if the service is
        started, but some job types such as cron and inetd may be stoppable even when
        processes themselves are not running.
        """
        return self.started

    @property
    def failed(self):
        "True if this process has failed, either during startup or later."
        return ((self._returncode is not None and not self._returncode.normal_exit) or
                self._proc and (self._proc.returncode is not None and not self._proc.returncode.normal_exit))

    @property
    def ready(self):
        """
        True if this process is ready to run, or running.  If not running, then to be
        ready to run, all enabled prerequisites must also be ready.
        """
        if not self.enabled or self.failed:
            return False
        if self.started:
            return True
        if any(p.enabled and not p.ready for p in self.prerequisites):
            return False
        return True

    @property
    def prerequisites(self):
        """
        Return a tuple of prerequisite objects.  Right now, these must be within our
        family but this may change, so don't refer to the family or the prereq in
        services.  Use this instead.
        """
        if self._prereq_cache is None:
            prereq = (self.family and self.service.prerequisites) or ()
            # Prerequisite names which are not in the family are skipped.
            self._prereq_cache = tuple(self.family[p] for p in prereq if p in self.family)
        return self._prereq_cache

    @asyncio.coroutine
    def start(self):
        """
        Runs this service if it is enabled and has not already been started.  Starts
        prerequisite services first.  A service is considered started if
           a) It is enabled, and started up normally.
           b) It is disabled, and an attempt was made to start it.
           c) An error occurred, it did not start, but failures were an acceptable
              outcome and the service has not been reset since the errors occurred.
        """
        service = self.service

        if self._started:
            self.logdebug("service {0} already started. further starts ignored.", service.name)
            return

        if not service.enabled:
            self.logdebug("service {0} not enabled, will be skipped", service.name)
            return
        else:
            self.logdebug("service {0} enabled, queueing start request", service.name)

        # If this service is already starting, then just wait until it completes.

        cond_starting = self._cond_starting

        if cond_starting:
            yield from cond_starting.acquire()
            yield from cond_starting.wait()
            cond_starting.release()
            # This is an odd situation.  Since every waiter expects start() to succeed, or
            # raise an exception, we need to be sure we raise the exception that happened
            # in the original start() request.
            if self._cond_exception:
                raise self._cond_exception
            return

        cond_starting = self._cond_starting = asyncio.Condition()
        self._cond_exception = None

        # Now we can proceed

        self.start_attempted = True

        try:
            prereq = self.prerequisites
            if prereq:
                for p in prereq:
                    yield from p.start()
                self.logdebug("service {0} prerequisites satisfied", service.name)

            if self.family:
                # idle only makes sense for families
                if ("IDLE" in service.service_groups and service.idle_delay
                        and not hasattr(self.family, '_idle_hit')):
                    self.family._idle_hit = True
                    self.logdebug("IDLE transition hit. delaying for {0} seconds", service.idle_delay)
                    yield from asyncio.sleep(service.idle_delay)

                # STOP if the system is no longer alive because a prerequisite failed
                if not self.family.system_alive:
                    return

            try:
                yield from self.start_subprocess()
            except Exception as ex:
                if service.ignore_failures:
                    self.loginfo("service {0} ignoring failures. Exception: {1}", service.name, ex)
                else:
                    # Remember the exception so that other waiters can raise it too.
                    self._cond_exception = ex
                    self.logdebug("{0} received exception during attempted start. Exception: {1}",
                                  service.name, ex)
                    raise

        finally:
            self._started = True

            yield from cond_starting.acquire()
            cond_starting.notify_all()
            cond_starting.release()
            self._cond_starting = None
            self.logdebug("{0} notified waiters upon completion", service.name)

    def get_expanded_environment(self):
        "Stamps the process environment with a new serial number and the time, and returns it expanded."
        SubProcess._cls_serial += 1
        penv = self._procenv
        penv[ENV_SERIAL] = str(SubProcess._cls_serial)
        penv[ENV_SERVTIME] = str(int(time()))
        return penv.expanded()

    @asyncio.coroutine
    def start_subprocess(self):
        """
        Creates the subprocess for this service, records its PID, and queues the tasks
        which log its output and watch for its exit as the service settings require.
        """
        service = self.service

        self.logdebug("{0} attempting start '{1}'... ".format(service.name, " ".join(service.exec_args)))

        kwargs = dict()

        if service.stdout == 'log':
            kwargs['stdout'] = asyncio.subprocess.PIPE
        if service.stderr == 'log':
            kwargs['stderr'] = asyncio.subprocess.PIPE
        if service.directory:
            kwargs['cwd'] = service.directory

        env = self.get_expanded_environment()

        yield from self.process_prepare_co(env)

        if env:
            env = env.get_public_environment()

        if service.debug:
            if not env:
                self.logdebug("{0} environment is empty", service.name)
            else:
                self.logdebug("{0} environment:", service.name)
                for k, v in env.items():
                    self.logdebug(" {0} = '{1}'".format(k, v))

        create = asyncio.create_subprocess_exec(*service.exec_args, preexec_fn=self._setup_subprocess,
                                                env=env, **kwargs)
        if service.exit_kills:
            self.logwarn("system will be killed when '{0}' exits", service.exec_args[0])
            yield from asyncio.sleep(0.2)

        proc = self._proc = yield from create

        self.pid = proc.pid

        if service.stdout == 'log':
            self.add_pending(asyncio.ensure_future(_process_logger(proc.stdout, 'stdout', self)))
        if service.stderr == 'log':
            self.add_pending(asyncio.ensure_future(_process_logger(proc.stderr, 'stderr', self)))

        if service.exit_kills and not self.defer_exit_kills:
            self.add_pending(asyncio.ensure_future(self._wait_kill_on_exit()))

        yield from self.process_started_co()

        self.logdebug("{0} successfully started", service.name)

    @asyncio.coroutine
    def process_prepare_co(self, environment):
        "Awaited by start_subprocess() before the process is created.  Does nothing here."
        pass

    @asyncio.coroutine
    def process_started_co(self):
        "Awaited by start_subprocess() after the process is created.  Does nothing here."
        pass

    @asyncio.coroutine
    def wait_for_pidfile(self):
        """
        If the pidfile option was specified, then wait until we find a valid pidfile,
        and register the new PID.  This is not done automatically, but is implemented
        here as a utility for process types that need it.

        Raises ChProcessError if no usable pidfile appears within process_timeout.
        """
        if not self.pidfile:
            return

        self.logdebug("{0} waiting for PID file: {1}".format(self.name, self.pidfile))

        pidsleep = 0.02         # work incrementally up to no more than process_timeout
        minsleep = 3
        expires = time() + self.process_timeout
        last_ex = None

        while time() < expires:
            if not self.family.system_alive:
                return
            yield from asyncio.sleep(pidsleep)
            # ramp up until we hit the minsleep ceiling
            pidsleep = min(pidsleep * 2, minsleep)
            try:
                # The file is re-read on every pass, so close it promptly each time.
                with open(self.pidfile, 'r') as pidf:
                    newpid = int(pidf.read().strip())
            except FileNotFoundError:
                continue
            except Exception:
                # Don't raise this immediately.  The service may create the file before writing the PID.
                last_ex = ChProcessError("{0} found pid file '{1}' but contents did not contain an integer"
                                         .format(self.name, self.pidfile), errno=errno.EINVAL)
                continue
            self.pid = newpid
            return

        if last_ex is not None:
            raise last_ex

        raise ChProcessError("{0} did not find pid file '{1}' before {2}sec process_timeout expired"
                             .format(self.name, self.pidfile, self.process_timeout), errno=errno.ENOENT)

    @asyncio.coroutine
    def _wait_kill_on_exit(self):
        "Waits for the process to exit, then kills the system."
        yield from self.wait()
        self._kill_system()

    def _attach_pid(self, newpid):
        """
        Attach this process to a new PID, creating an event which will be set by the
        child watcher callback when the PID has exited.
        """
        with asyncio.get_child_watcher() as watcher:
            watcher.add_child_handler(newpid, self._child_watcher_callback)

        self._exit_event = asyncio.Event()

    def _child_watcher_callback(self, pid, returncode):
        "Hands the exit over to process_exit() on the event loop."
        asyncio.get_event_loop().call_soon_threadsafe(self.process_exit, returncode)

    def process_exit(self, code):
        """
        Records the exit of an attached process, releases anything waiting on it, and
        queues abnormal-exit handling unless the exit was normal or was caused by our
        own kill signal.
        """
        self.returncode = code

        if self._exit_event:
            self._exit_event.set()
            self._exit_event = None

        if self.exit_kills:
            self.logwarn("{0} terminated with exit_kills enabled", self.service.name)
            # Since we're dead, and the system is going away, disable any process management
            self._proc = None
            self.pid = None
            self._kill_system()

        if code.normal_exit or self.kill_signal == code.signal:
            return

        asyncio.ensure_future(self._abnormal_exit(code))

    @asyncio.coroutine
    def _abnormal_exit(self, code):
        """
        Decides what happens after an abnormal exit: queue a restart if the service
        permits one, quietly reset if failures are ignored, or else log an error.
        """
        service = self.service

        if service.exit_kills:
            self.logwarn("{0} terminated abnormally with {1}", service.name, code)
            return

        # A disabled service should not do recovery

        if not service.enabled:
            return

        if self._started and service.restart:
            if self._restarts_allowed is None:
                self._restarts_allowed = service.restart_limit
            if self._restarts_allowed > 0:
                self._restarts_allowed -= 1
                controller = self.family.controller
                if controller.system_alive:
                    if service.restart_delay:
                        self.loginfo("{0} pausing between restart retries ({1} left)",
                                     service.name, self._restarts_allowed)
                        yield from asyncio.sleep(service.restart_delay)
                # Checked again because the system may have gone down during the pause.
                if controller.system_alive:
                    yield from self.reset()
                    f = asyncio.ensure_future(self.start())     # queue it since we will just return here
                    f.add_done_callback(self._restart_callback)
                # NOTE(review): indentation was reconstructed; this return is assumed to
                # apply whenever a restart was still permitted -- confirm.
                return

        if service.ignore_failures:
            self.logdebug("{0} abnormal process exit ignored due to ignore_failures=true", service.name)
            yield from self.reset()
            return

        self.logerror("{0} terminated abnormally with {1}", service.name, code)

    def _restart_callback(self, fut):
        # Catches a restart result, reporting it as a warning, and either passing back to
        # _abnormal_exit or accepting glorious success.
        ex = fut.exception()
        if not ex:
            self.logdebug("{0} restart succeeded", self.name)
        else:
            self.logwarn("{0} restart failed: {1}", self.name, ex)
            asyncio.ensure_future(self._abnormal_exit(self._proc and self._proc.returncode))

    def _kill_system(self):
        self.family.controller.kill_system(self.returncode)

    def add_pending(self, future):
        "Tracks future until it completes, so that final_stop() can cancel it."
        self._pending.add(future)
        future.add_done_callback(lambda f: self._pending.discard(future))

    @asyncio.coroutine
    def reset(self, dependents=False, enable=False, restarts_ok=False):
        """
        Terminates the process if necessary and returns this object to its unstarted
        state.

        dependents   -- also reset prerequisites which are not ready.
        enable       -- enable the service as part of the reset.
        restarts_ok  -- forget the restart count so that restarts are permitted afresh.
        """
        self.logdebug("{0} received reset", self.name)

        if self._exit_event:
            self.terminate()
        elif self._proc:
            if self._proc.returncode is None:
                self.terminate()
                yield from self.wait()

        self.pid = None
        self._proc = None

        self._started = False

        if restarts_ok:
            self._restarts_allowed = None
        if enable:
            self.enabled = True

        # If there is a pidfile, then remove it

        if self.pidfile:
            try:
                os.remove(self.pidfile)
            except Exception:
                # Best effort only: a missing or unremovable pidfile is not an error here.
                pass

        # Reset any non-ready dependents

        if dependents:
            for p in self.prerequisites:
                if not p.ready and (enable or p.enabled):
                    yield from p.reset(dependents, enable, restarts_ok)

    @asyncio.coroutine
    def stop(self):
        "Resets the process, permitting restarts afterward."
        yield from self.reset(restarts_ok=True)

    @asyncio.coroutine
    def final_stop(self):
        "Called when the whole system is killed, but before drastic measures are taken."
        self._exit_event = None
        self.terminate()
        for p in list(self._pending):
            if not p.cancelled():
                p.cancel()

    def terminate(self):
        """
        Signals the process to terminate, along with the attached PID if that is a
        different process, then forgets the recorded PID.
        """
        proc = self._proc
        otherpid = self.pid

        if proc:
            if otherpid == proc.pid:
                otherpid = None
            if proc.returncode is None:
                if self.service.kill_signal is not None:    # explicitly check service
                    self.logdebug("using {0} to terminate {1}", get_signal_name(self.kill_signal), self.name)
                    proc.send_signal(self.kill_signal)
                else:
                    proc.terminate()

        if otherpid:
            self.logdebug("using {0} to terminate {1}", get_signal_name(self.kill_signal), self.name)
            try:
                os.kill(otherpid, self.kill_signal)
            except Exception as ex:
                # Name the service and PID, then the reason (previously the exception
                # text appeared where the service name belongs and the reason was blank).
                warn("{0} could not be killed using PID={1}: {2}".format(self.name, otherpid, ex))

        self._pid = None

    @asyncio.coroutine
    def do_startup_pause(self):
        """
        Wait a short time just to see if the process errors out immediately.  This avoids
        a retry loop and catches any immediate failures now.  Can be used by process
        implementations if needed.

        Raises ChProcessError if the process exits abnormally during the pause, unless
        failures are ignored.
        """
        if not self.startup_pause:
            return

        try:
            result = yield from self.timed_wait(self.startup_pause)
        except asyncio.TimeoutError:
            # Still running when the pause ended, which is the good outcome.
            result = None
        if result is not None and not result.normal_exit:
            if self.ignore_failures:
                warn("{0} (ignored) failure on start-up with result '{1}'".format(self.name, result))
            else:
                raise ChProcessError("{0} failed on start-up with result '{1}'".format(self.name, result),
                                     resultcode=result)

    @asyncio.coroutine
    def timed_wait(self, timeout, func=None):
        """
        Timed wait waits for process completion.  If process completion occurs normally,
        the returncode for process startup is returned.

        Upon timeout either:
        1.  asyncio.TimeoutError is raised if 'func' is not provided, or...
        2.  func is called and the result is returned from timed_wait().
        """
        try:
            if not timeout:
                raise asyncio.TimeoutError()    # funny situation, but settings can cause this if users attempt it
            result = yield from asyncio.wait_for(asyncio.shield(self.wait()), timeout)
        except asyncio.TimeoutError:
            if not func:
                raise
            result = func()
        except asyncio.CancelledError:
            result = self.returncode

        return result

    @asyncio.coroutine
    def wait(self):
        """
        Waits until the process (or the attached PID, if there is one) exits, logs the
        exit status, and returns the return code of the process.

        Raises Exception if there is neither a process nor an attached PID.
        """
        proc = self._proc

        if self._exit_event:
            yield from self._exit_event.wait()
        elif proc:
            yield from proc.wait()
        else:
            raise Exception("Process not running (or attached), can't wait")

        rc = proc.returncode
        message = "{2} exit status for pid={0} is '{1}'".format(proc.pid, rc, self.name)
        # Normal exits are only of interest when debugging; anything else is reported.
        if rc is not None and rc.normal_exit:
            self.logdebug(message)
        else:
            self.loginfo(message)

        return rc