def manage_watchers(self):
    """Reap dead processes, let every watcher manage its processes and
    make sure the flapping checker is running.

    Nothing happens when a previous run is still in progress
    (``self.busy``) or when the arbiter is no longer alive.  Exceptions
    raised while reaping or managing propagate to the caller.
    """
    if self.busy or not self.alive:
        return

    self.busy = True
    try:
        # manage and reap processes
        self.reap_processes()
        for watcher in self.iter_watchers():
            watcher.manage_processes()

        # relaunch the flapping checker when it is no longer alive
        if self.check_flapping and not self.flapping.is_alive():
            self.flapping = Flapping(self.context, self.endpoint,
                                     self.pubsub_endpoint,
                                     self.check_delay)
            self.flapping.start()
    finally:
        # always release the guard: a failing watcher must not leave the
        # arbiter "busy" forever, which would disable every later run
        self.busy = False
def __init__(self, name, cmd, num_flies=1, warmup_delay=0., working_dir=None,
             shell=False, uid=None, gid=None, send_hup=False, env=None,
             stopped=False, times=2, within=1., retry_in=7., max_retry=5):
    """Store the show options and create its flapping object.

    ``num_flies`` is converted with ``int()``.  When ``working_dir`` is
    empty, the value returned by ``util.get_working_dir()`` is used.
    ``times``, ``within``, ``retry_in`` and ``max_retry`` are handed to
    the ``Flapping`` constructor together with this instance.
    """
    # what to run, and how many copies of it
    self.name = name
    self.cmd = cmd
    self.num_flies = int(num_flies)
    self.warmup_delay = warmup_delay
    self.max_retry = max_retry
    self.stopped = stopped

    # bookkeeping for the flies spawned by this show
    self._fly_counter = 0
    self.flies = {}

    # execution environment of the flies
    # (fall back to the util default when no working dir has been set)
    self.working_dir = working_dir or util.get_working_dir()
    self.shell = shell
    self.uid = uid
    self.gid = gid
    self.env = env
    self.send_hup = send_hup

    self.optnames = (
        "num_flies", "warmup_delay", "working_dir", "uid", "gid",
        "send_hup", "shell", "env", "cmd", "times", "within", "retry_in",
        "max_retry",
    )

    # define flapping object (last, since it receives the instance itself)
    self.flapping = Flapping(self, times, within, retry_in, max_retry)
def initialize(self):
    """Set the process title, bind the event PUB socket, create the
    flapping object when enabled and initialize every watcher.
    """
    # set process title
    _setproctitle("circusd")

    # event pub socket
    publisher = self.context.socket(zmq.PUB)
    self.evpub_socket = publisher
    publisher.bind(self.pubsub_endpoint)
    publisher.linger = 0

    # initialize flapping
    if self.check_flapping:
        self.flapping = Flapping(
            self.context,
            self.endpoint,
            self.pubsub_endpoint,
            self.check_delay,
        )

    # initialize watchers, registering each one under its lowercased name
    for item in self.iter_watchers():
        key = item.name.lower()
        self._watchers_names[key] = item
        item.initialize(self.evpub_socket)
def manage_watchers(self):
    """Reap and manage the processes of every watcher and make sure the
    flapping checker is running.

    Nothing happens when a previous run is still in progress
    (``self.busy``) or when the arbiter is no longer alive.  Exceptions
    raised by a watcher propagate to the caller.
    """
    if self.busy or not self.alive:
        return

    self.busy = True
    try:
        # manage and reap processes
        for watcher in self.watchers:
            watcher.reap_processes()
            watcher.manage_processes()

        # relaunch the flapping checker when it is no longer alive
        if self.check_flapping and not self.flapping.is_alive():
            self.flapping = Flapping(self.context, self.endpoint,
                                     self.pubsub_endpoint,
                                     self.check_delay)
            self.flapping.start()
    finally:
        # always release the guard: a failing watcher must not leave the
        # arbiter "busy" forever, which would disable every later run
        self.busy = False
def initialize(self):
    """Bind the event PUB socket, create the flapping object when enabled
    and initialize every watcher.
    """
    # event pub socket
    publisher = self.context.socket(zmq.PUB)
    self.evpub_socket = publisher
    publisher.bind(self.pubsub_endpoint)
    publisher.linger = 0

    # initialize flapping
    if self.check_flapping:
        self.flapping = Flapping(
            self.context,
            self.endpoint,
            self.pubsub_endpoint,
            self.check_delay,
        )

    # initialize watchers, registering each one under its lowercased name
    for item in self.watchers:
        key = item.name.lower()
        self._watchers_names[key] = item
        item.initialize(self.evpub_socket)
class Arbiter(object):
    """Class used to control a list of watchers.

    Options:

    - **watchers** -- a list of Watcher objects
    - **endpoint** -- the controller ZMQ endpoint
    - **pubsub_endpoint** -- the pubsub endpoint
    - **stats_endpoint** -- the stats endpoint. If not provided,
      the *circusd-stats* process will not be launched.
    - **check_delay** -- the delay between two controller points
      (default: 1 s)
    - **prereload_fn** -- callable that will be executed on each reload
      (default: None)
    - **context** -- if provided, the zmq context to reuse.
      (default: None)
    - **loop** -- if provided, a :class:`zmq.eventloop.ioloop.IOLoop`
      instance to reuse. (default: None)
    - **check_flapping** -- when True, Circus will check for flapping
      processes and automatically restart them. (default: True)
    """
    def __init__(self, watchers, endpoint, pubsub_endpoint, check_delay=1.,
                 prereload_fn=None, context=None, loop=None,
                 check_flapping=True, stats_endpoint=None):
        self.watchers = watchers
        self.endpoint = endpoint
        self.check_delay = check_delay
        self.prereload_fn = prereload_fn
        self.pubsub_endpoint = pubsub_endpoint

        # initialize zmq context
        self.context = context or zmq.Context.instance()
        self.loop = loop or ioloop.IOLoop()
        self.ctrl = Controller(endpoint, self.context, self.loop, self,
                               check_delay)

        self.pid = os.getpid()
        self._watchers_names = {}
        self.alive = True
        self.busy = False
        self.check_flapping = check_flapping

        # initializing circusd-stats as a watcher when configured
        self.stats_endpoint = stats_endpoint
        if self.stats_endpoint is not None:
            cmd = "%s -c 'from circus import stats; stats.main()'" % \
                sys.executable
            cmd += ' --endpoint %s' % self.endpoint
            cmd += ' --pubsub %s' % self.pubsub_endpoint
            cmd += ' --statspoint %s' % self.stats_endpoint
            stats_watcher = Watcher('circusd-stats', cmd)
            self.watchers.append(stats_watcher)

    @classmethod
    def load_from_config(cls, config_file):
        """Build an arbiter from the configuration file *config_file*."""
        cfg = get_config(config_file)

        # hack reload ioloop to use the monkey patched version
        reload(ioloop)

        watchers = [Watcher.load_from_config(watcher)
                    for watcher in cfg.get('watchers', [])]

        # creating arbiter
        arbiter = cls(watchers, cfg['endpoint'], cfg['pubsub_endpoint'],
                      check_delay=cfg.get('check_delay', 1.),
                      prereload_fn=cfg.get('prereload_fn'),
                      stats_endpoint=cfg.get('stats_endpoint'))

        return arbiter

    def iter_watchers(self):
        """Yield the watchers, highest ``priority`` first.

        Only priorities are compared, never the Watcher objects
        themselves; watchers sharing a priority keep their relative order
        in ``self.watchers``.
        """
        ordered = sorted(self.watchers,
                         key=lambda watcher: watcher.priority,
                         reverse=True)
        for watcher in ordered:
            yield watcher

    @debuglog
    def initialize(self):
        """Set the process title, bind the event PUB socket, create the
        flapping object when enabled and initialize every watcher.
        """
        # set process title
        _setproctitle("circusd")

        # event pub socket
        self.evpub_socket = self.context.socket(zmq.PUB)
        self.evpub_socket.bind(self.pubsub_endpoint)
        self.evpub_socket.linger = 0

        # initialize flapping
        if self.check_flapping:
            self.flapping = Flapping(self.context, self.endpoint,
                                     self.pubsub_endpoint, self.check_delay)

        # initialize watchers
        for watcher in self.iter_watchers():
            self._watchers_names[watcher.name.lower()] = watcher
            watcher.initialize(self.evpub_socket)

    @debuglog
    def start(self):
        """Starts all the watchers.

        The start command is an infinite loop that waits
        for any command from a client and that watches all the
        processes and restarts them if needed.
        """
        logger.info("Starting master on pid %s", self.pid)

        self.initialize()

        # start controller
        self.ctrl.start()

        # start flapping
        if self.check_flapping:
            logger.debug('Starting flapping')
            self.flapping.start()

        # initialize processes
        logger.debug('Initializing watchers')
        for watcher in self.iter_watchers():
            watcher.start()

        logger.info('Arbiter now waiting for commands')
        while True:
            try:
                self.loop.start()
            except zmq.ZMQError as e:
                # an interrupted system call just restarts the loop
                if e.errno == errno.EINTR:
                    continue
                raise
            else:
                break

        if self.check_flapping:
            self.flapping.stop()

        self.ctrl.stop()
        self.evpub_socket.close()

    def stop(self):
        """Stop the watchers if the arbiter is alive, then stop the loop."""
        if self.alive:
            self.stop_watchers(stop_alive=True)

        self.loop.stop()

    def reap_processes(self):
        """Collect exited children and notify the watcher owning each one.

        Children whose pid does not belong to a non-stopped watcher are
        waited for but otherwise ignored.
        """
        # map watcher to pids
        watchers_pids = {}
        for watcher in self.iter_watchers():
            if not watcher.stopped:
                for pid, wid in watcher.pids.items():
                    watchers_pids[pid] = (watcher, wid)

        # detect dead children
        while True:
            try:
                pid, status = os.waitpid(-1, os.WNOHANG)
                if not pid:
                    break

                if pid in watchers_pids:
                    watcher, wid = watchers_pids[pid]
                    watcher.reap_process(wid, status)
            except OSError as e:
                if e.errno == errno.EAGAIN:
                    time.sleep(0.001)
                    continue
                elif e.errno == errno.ECHILD:
                    # process already reaped
                    return
                else:
                    raise

    def manage_watchers(self):
        """Reap dead processes, let every watcher manage its processes and
        make sure the flapping checker is running.

        Nothing happens when a previous run is still in progress
        (``self.busy``) or when the arbiter is no longer alive.
        """
        if self.busy or not self.alive:
            return

        self.busy = True
        try:
            # manage and reap processes
            self.reap_processes()
            for watcher in self.iter_watchers():
                watcher.manage_processes()

            if self.check_flapping and not self.flapping.is_alive():
                self.flapping = Flapping(self.context, self.endpoint,
                                         self.pubsub_endpoint,
                                         self.check_delay)
                self.flapping.start()
        finally:
            # always release the guard: a failing watcher must not leave
            # the arbiter "busy" forever
            self.busy = False

    @debuglog
    def reload(self, graceful=True):
        """Reloads everything.

        Run the :func:`prereload_fn` callable if any, then gracefully
        reload all watchers.
        """
        if self.prereload_fn is not None:
            self.prereload_fn(self)

        # reopen log files
        for handler in logger.handlers:
            if isinstance(handler, logging.FileHandler):
                handler.acquire()
                try:
                    handler.stream.close()
                    handler.stream = open(handler.baseFilename,
                                          handler.mode)
                finally:
                    # never keep the handler lock if reopening fails
                    handler.release()

        # gracefully reload watchers
        for watcher in self.iter_watchers():
            watcher.reload(graceful=graceful)

    def numprocesses(self):
        """Return the number of processes running across all watchers."""
        return sum(len(watcher) for watcher in self.watchers)

    def numwatchers(self):
        """Return the number of watchers."""
        return len(self.watchers)

    def get_watcher(self, name):
        """Return the watcher *name*."""
        return self._watchers_names[name]

    def statuses(self):
        """Return a dict mapping each watcher name to its status."""
        return dict((watcher.name, watcher.status())
                    for watcher in self.watchers)

    def add_watcher(self, name, cmd, **kw):
        """Adds a watcher.

        Options:

        - **name**: name of the watcher to add
        - **cmd**: command to run.
        - all other options defined in the Watcher constructor.

        Returns the new watcher.  Raises :class:`ValueError` when *name*
        is empty and :class:`AlreadyExist` when *name* is already
        registered.
        """
        if not name:
            raise ValueError("command name shouldn't be empty")

        if name in self._watchers_names:
            raise AlreadyExist("%r already exist" % name)

        watcher = Watcher(name, cmd, **kw)
        watcher.initialize(self.evpub_socket)
        self.watchers.append(watcher)
        self._watchers_names[watcher.name.lower()] = watcher
        return watcher

    def rm_watcher(self, name):
        """Deletes a watcher.

        Options:

        - **name**: name of the watcher to delete
        """
        logger.debug('Deleting %r watcher', name)

        # remove the watcher from the list
        watcher = self._watchers_names.pop(name)
        del self.watchers[self.watchers.index(watcher)]

        # stop the watcher
        watcher.stop()

    def start_watchers(self):
        """Start every watcher, highest priority first."""
        for watcher in self.iter_watchers():
            watcher.start()

    def stop_watchers(self, stop_alive=False):
        """Stop every watcher; with *stop_alive* also mark the arbiter as
        no longer alive.  Does nothing when the arbiter is not alive.
        """
        if not self.alive:
            return

        if stop_alive:
            logger.info('Arbiter exiting')
            self.alive = False

        for watcher in self.iter_watchers():
            watcher.stop()

    def restart(self):
        """Stop, then start, all the watchers."""
        self.stop_watchers()
        self.start_watchers()
class Arbiter(object):
    """Class used to control a list of watchers.

    Options:

    - **watchers**: a list of Watcher objects
    - **endpoint**: the controller ZMQ endpoint
    - **pubsub_endpoint**: the pubsub endpoint
    - **check_delay**: the delay between two controller points
      (default: 1 s)
    - **prereload_fn**: callable that will be executed on each reload
      (default: None)
    - **context**: if provided, the zmq context to reuse; otherwise
      ``zmq.Context.instance()`` is used (default: None)
    - **loop**: if provided, the loop to reuse; otherwise a new
      ``ioloop.IOLoop()`` is created (default: None)
    - **check_flapping**: when True, a Flapping object is created and
      started along with the arbiter (default: True)
    """

    def __init__(
        self,
        watchers,
        endpoint,
        pubsub_endpoint,
        check_delay=1.0,
        prereload_fn=None,
        context=None,
        loop=None,
        check_flapping=True,
    ):
        self.watchers = watchers
        self.endpoint = endpoint
        self.check_delay = check_delay
        self.prereload_fn = prereload_fn
        self.pubsub_endpoint = pubsub_endpoint

        # initialize zmq context
        self.context = context or zmq.Context.instance()
        self.loop = loop or ioloop.IOLoop()
        self.ctrl = Controller(endpoint, self.context, self.loop, self, check_delay)

        self.pid = os.getpid()
        self._watchers_names = {}
        self.alive = True
        self.busy = False
        self.check_flapping = check_flapping

    @debuglog
    def initialize(self):
        """Bind the event PUB socket, create the flapping object when
        enabled and initialize every watcher.
        """
        # event pub socket
        self.evpub_socket = self.context.socket(zmq.PUB)
        self.evpub_socket.bind(self.pubsub_endpoint)
        self.evpub_socket.linger = 0

        # initialize flapping
        if self.check_flapping:
            self.flapping = Flapping(
                self.context, self.endpoint, self.pubsub_endpoint, self.check_delay
            )

        # initialize watchers
        for watcher in self.watchers:
            self._watchers_names[watcher.name.lower()] = watcher
            watcher.initialize(self.evpub_socket)

    @debuglog
    def start(self):
        """Starts all the watchers.

        The start command is an infinite loop that waits
        for any command from a client and that watches all the
        processes and restarts them if needed.
        """
        logger.info("Starting master on pid %s", self.pid)

        self.initialize()

        # start controller
        self.ctrl.start()

        # start flapping
        if self.check_flapping:
            logger.debug("Starting flapping")
            self.flapping.start()

        # initialize processes
        logger.debug("Initializing watchers")
        for watcher in self.watchers:
            watcher.start()

        logger.info("Arbiter now waiting for commands")
        while True:
            try:
                self.loop.start()
            except zmq.ZMQError as e:
                # an interrupted system call just restarts the loop
                if e.errno == errno.EINTR:
                    continue
                raise
            else:
                break

        if self.check_flapping:
            self.flapping.stop()

        self.ctrl.stop()
        self.evpub_socket.close()

    def stop(self, graceful=False):
        """Stop the watchers if the arbiter is alive, then stop the loop."""
        if self.alive:
            self.stop_watchers(graceful=graceful, stop_alive=True)

        self.loop.stop()

    def manage_watchers(self):
        """Reap and manage the processes of every watcher and make sure
        the flapping checker is running.

        Nothing happens when a previous run is still in progress
        (``self.busy``) or when the arbiter is no longer alive.
        """
        if self.busy or not self.alive:
            return

        self.busy = True
        try:
            # manage and reap processes
            for watcher in self.watchers:
                watcher.reap_processes()
                watcher.manage_processes()

            if self.check_flapping and not self.flapping.is_alive():
                self.flapping = Flapping(
                    self.context,
                    self.endpoint,
                    self.pubsub_endpoint,
                    self.check_delay,
                )
                self.flapping.start()
        finally:
            # always release the guard: a failing watcher must not leave
            # the arbiter "busy" forever
            self.busy = False

    @debuglog
    def reload(self, graceful=True):
        """Reloads everything.

        Run the :func:`prereload_fn` callable if any, then gracefully
        reload all watchers.
        """
        if self.prereload_fn is not None:
            self.prereload_fn(self)

        # reopen log files
        for handler in logger.handlers:
            if isinstance(handler, logging.FileHandler):
                handler.acquire()
                try:
                    handler.stream.close()
                    handler.stream = open(handler.baseFilename, handler.mode)
                finally:
                    # never keep the handler lock if reopening fails
                    handler.release()

        # gracefully reload watchers
        for watcher in self.watchers:
            watcher.reload(graceful=graceful)

    def numprocesses(self):
        """Return the number of processes running across all watchers."""
        return sum(len(watcher) for watcher in self.watchers)

    def numwatchers(self):
        """Return the number of watchers."""
        return len(self.watchers)

    def get_watcher(self, name):
        """Return the watcher *name*."""
        return self._watchers_names[name]

    def statuses(self):
        """Return a dict mapping each watcher name to its status."""
        return dict((watcher.name, watcher.status()) for watcher in self.watchers)

    def add_watcher(self, name, cmd):
        """Adds a watcher.

        Options:

        - **name**: name of the watcher to add
        - **cmd**: command to run.

        The watcher is created stopped.  Raises :class:`ValueError` when
        *name* is empty and :class:`AlreadyExist` when *name* is already
        registered.
        """
        if not name:
            raise ValueError("command name shouldn't be empty")

        if name in self._watchers_names:
            raise AlreadyExist("%r already exist" % name)

        watcher = Watcher(name, cmd, stopped=True)
        watcher.initialize(self.evpub_socket)
        self.watchers.append(watcher)
        self._watchers_names[watcher.name.lower()] = watcher

    def rm_watcher(self, name):
        """Deletes a watcher.

        Options:

        - **name**: name of the watcher to delete
        """
        logger.debug("Deleting %r watcher", name)

        # remove the watcher from the list
        watcher = self._watchers_names.pop(name)
        del self.watchers[self.watchers.index(watcher)]

        # stop the watcher
        watcher.stop()

    def start_watchers(self):
        """Start every watcher."""
        for watcher in self.watchers:
            watcher.start()

    def stop_watchers(self, graceful=True, stop_alive=False):
        """Stop every watcher; with *stop_alive* also mark the arbiter as
        no longer alive.  Does nothing when the arbiter is not alive.

        ``alive`` is only cleared when *stop_alive* is set, so that
        stopping the watchers alone does not disable ``manage_watchers``
        for good.
        """
        if not self.alive:
            return

        if stop_alive:
            logger.info("Arbiter exiting")
            self.alive = False

        for watcher in self.watchers:
            watcher.stop(graceful=graceful)
class Show(object):
    """A command run as ``num_flies`` flies (processes), with flapping
    detection.
    """

    def __init__(self, name, cmd, num_flies=1, warmup_delay=0.,
                 working_dir=None, shell=False, uid=None, gid=None,
                 send_hup=False, env=None, stopped=False, times=2,
                 within=1., retry_in=7., max_retry=5):
        """Store the show options and create its flapping object.

        ``num_flies`` is converted with ``int()``.  When ``working_dir``
        is empty, the value returned by ``util.get_working_dir()`` is
        used.  ``times``, ``within``, ``retry_in`` and ``max_retry`` are
        handed to the ``Flapping`` constructor.
        """
        self.name = name
        self.num_flies = int(num_flies)
        self.warmup_delay = warmup_delay
        self.cmd = cmd
        self._fly_counter = 0
        self.stopped = stopped
        self.max_retry = max_retry

        self.optnames = ("num_flies", "warmup_delay", "working_dir",
                         "uid", "gid", "send_hup", "shell", "env",
                         "cmd", "times", "within", "retry_in",
                         "max_retry")

        if not working_dir:
            # working dir hasn't been set
            working_dir = util.get_working_dir()

        self.working_dir = working_dir

        self.flies = {}
        self.shell = shell
        self.uid = uid
        self.gid = gid
        self.env = env
        self.send_hup = send_hup

        # define flapping object
        self.flapping = Flapping(self, times, within, retry_in, max_retry)

    def __len__(self):
        """Return the number of flies currently tracked."""
        return len(self.flies)

    def reap_flies(self):
        """Forget the flies that have exited, notifying the flapping
        object for each of them.  Does nothing on a stopped show.
        """
        if self.stopped:
            return

        # iterate over a snapshot: entries are removed while looping
        for wid, fly in list(self.flies.items()):
            if fly.poll() is not None:
                self.flapping.notify()
                # re-check: the show may have been stopped meanwhile
                if self.stopped:
                    break
                self.flies.pop(wid)

    def manage_flies(self):
        """Spawn missing flies, then kill the flies in excess of
        ``num_flies``, lowest id first.  Does nothing on a stopped show.
        """
        if self.stopped:
            return

        if len(self.flies) < self.num_flies:
            self.spawn_flies()

        wids = sorted(self.flies)
        excess = max(len(wids) - self.num_flies, 0)
        for wid in wids[:excess]:
            fly = self.flies.pop(wid)
            self.kill_fly(fly)

    def reap_and_manage_flies(self):
        """Run :meth:`reap_flies` then :meth:`manage_flies`."""
        self.reap_flies()
        self.manage_flies()

    def spawn_flies(self):
        """Spawn the missing flies, sleeping ``warmup_delay`` seconds
        after each spawn.
        """
        missing = self.num_flies - len(self.flies)
        for _ in range(missing):
            self.spawn_fly()
            time.sleep(self.warmup_delay)

    def spawn_fly(self):
        """Spawn one fly, retrying up to ``max_retry`` times on
        ``OSError``; the show is stopped when every attempt fails.
        Does nothing on a stopped show.
        """
        if self.stopped:
            return

        self._fly_counter += 1
        nb_tries = 0
        while nb_tries < self.max_retry:
            fly = None
            try:
                fly = Fly(self._fly_counter, self.cmd,
                          wdir=self.working_dir, shell=self.shell,
                          uid=self.uid, gid=self.gid, env=self.env)
                self.flies[self._fly_counter] = fly
                logger.info('running %s fly [pid %d]', self.name, fly.pid)
            except OSError as e:
                logger.warning('error in %r: %s', self.name, str(e))

            if fly is None:
                nb_tries += 1
                continue
            else:
                return

        # every attempt failed
        self.stop()
def start_flapping(self):
    """Create a fresh ``Flapping`` object for this instance's endpoints
    and check delay, keep it as ``self.flapping`` and start it.
    """
    checker = Flapping(
        self.endpoint,
        self.pubsub_endpoint,
        self.check_delay,
    )
    self.flapping = checker
    checker.start()
class Trainer(object):
    """Class used to control a list of shows.

    Options:

    - **shows**: a list of Show objects
    - **endpoint**: the controller ZMQ endpoint
    - **pubsub_endpoint**: the pubsub endpoint
    - **check_delay**: the delay between two controller points
      (defaults: 1 s)
    - **prereload_fn**: callable that will be executed on each reload
      (defaults: None)
    """
    def __init__(self, shows, endpoint, pubsub_endpoint, check_delay=1.,
                 prereload_fn=None):
        self.shows = shows
        self.endpoint = endpoint
        self.check_delay = check_delay
        self.prereload_fn = prereload_fn
        self.pubsub_endpoint = pubsub_endpoint
        self.context = zmq.Context()
        self.ctrl = Controller(self.context, endpoint, self,
                               self.check_delay)
        self.pid = os.getpid()
        self._shows_names = {}
        self.alive = True
        self._lock = Lock()
        self._setup()
        logger.info("Starting master on pid %s", self.pid)

    def _setup(self):
        """Bind the pubsub socket and register every show under its
        lowercased name.
        """
        # set pubsub endpoint
        self.pubsub_io = self.context.socket(zmq.PUB)
        self.pubsub_io.bind(self.pubsub_endpoint)

        for show in self.shows:
            self._shows_names[show.name.lower()] = show
            show.pubsub_io = self.pubsub_io

    def start_flapping(self):
        """Create a new Flapping object and start it."""
        self.flapping = Flapping(self.endpoint, self.pubsub_endpoint,
                                 self.check_delay)
        self.flapping.start()

    @debuglog
    def start(self):
        """Starts all the shows.

        The start command is an infinite loop that waits
        for any command from a client and that watches all the
        flies and restarts them if needed.
        """
        # start flapping
        self.start_flapping()

        # launch flies
        for show in self.shows:
            show.manage_flies()

        while self.alive:
            # manage and reap flies
            for show in self.shows:
                show.reap_flies()
                show.manage_flies()

            if not self.flapping.is_alive():
                # flapping is dead, relaunch it.
                self.start_flapping()

            # wait for the controller
            self.ctrl.poll()

    @debuglog
    def stop(self, graceful=True):
        """Stops all shows and their flies.

        Options:

        - **graceful**: sends a SIGTERM to every fly and waits a bit
          before killing it (default: True)
        """
        if not self.alive:
            return

        self.alive = False

        self.flapping.stop()

        # kill flies
        for show in self.shows:
            show.stop(graceful=graceful)

        time.sleep(0.5)

        try:
            self.context.destroy(0)
        except zmq.ZMQError as e:
            # an interrupted system call is not an error here
            if e.errno != errno.EINTR:
                raise

    @debuglog
    def reload(self):
        """Reloads everything.

        Run the :func:`prereload_fn` callable if any, then gracefully
        reload all shows.
        """
        if self.prereload_fn is not None:
            self.prereload_fn(self)

        # reopen log files
        for handler in logger.handlers:
            if isinstance(handler, logging.FileHandler):
                handler.acquire()
                try:
                    handler.stream.close()
                    handler.stream = open(handler.baseFilename,
                                          handler.mode)
                finally:
                    # never keep the handler lock if reopening fails
                    handler.release()

        # gracefully reload shows
        for show in self.shows:
            show.reload()

    def numflies(self):
        """Return the number of flies running across all shows."""
        return sum(len(show) for show in self.shows)

    def num_shows(self):
        """Return the number of shows."""
        return len(self.shows)

    def get_show(self, name):
        """Return the show *name*."""
        return self._shows_names[name]

    def add_show(self, name, cmd):
        """Adds a show.

        Options:

        - **name**: name of the show to add
        - **cmd**: command to run.

        The show is created stopped.  Raises :class:`AlreadyExist` when
        *name* is already registered.
        """
        with self._lock:
            if name in self._shows_names:
                # report the requested name: no show object exists yet
                raise AlreadyExist("%r already exist" % name)
            show = Show(name, cmd, stopped=True)
            show.pubsub_io = self.pubsub_io
            self.shows.append(show)
            self._shows_names[show.name.lower()] = show

    def del_show(self, name):
        """Deletes a show.

        Options:

        - **name**: name of the show to delete
        """
        logger.debug('Deleting %r show', name)

        with self._lock:
            # remove the show from the list
            show = self._shows_names.pop(name)
            del self.shows[self.shows.index(show)]

            # stop the show
            show.stop()

    ###################
    # commands
    ###################

    @debuglog
    def handle_stop(self):
        self.stop()
    handle_quit = handle_stop

    @debuglog
    def handle_terminate(self):
        self.stop(graceful=False)

    @debuglog
    def handle_numflies(self):
        return str(self.numflies())

    @debuglog
    def handle_numshows(self):
        return str(self.num_shows())

    @debuglog
    def handle_shows(self):
        return ",".join(self._shows_names.keys())

    @debuglog
    def handle_flies(self):
        flies = ["%s: %s" % (show.name, show.handle_flies())
                 for show in self.shows]
        return buffer("\n".join(flies))

    @debuglog
    def handle_info_shows(self):
        infos = []
        for show in self.shows:
            infos.append("%s:\n" % show.name)
            infos.append("%s\n" % show.handle_info())
        return buffer("".join(infos))

    @debuglog
    def handle_reload(self):
        self.reload()
        return "ok"

    @debuglog
    def handle_add_show(self, name, cmd):
        self.add_show(name, cmd)
        return "ok"

    @debuglog
    def handle_del_show(self, name):
        self.del_show(name)
        return "ok"

    @debuglog
    def handle_stop_shows(self):
        for show in self.shows:
            show.stop()
        return "ok"

    @debuglog
    def handle_start_shows(self):
        for show in self.shows:
            show.start()
        return "ok"

    @debuglog
    def handle_restart_shows(self):
        for show in self.shows:
            show.restart()
        return "ok"