class RedirectorManager (object):
    """Maintain a pool of Redirector worker threads that consume HTTP
    headers from a shared queue and drive a spawned helper program
    speaking the squid redirector API."""

    def __init__ (self, configuration, poller):
        self.configuration = configuration
        self.low = configuration.redirector.minimum      # minimum number of workers at all time
        self.high = configuration.redirector.maximum     # maximum number of workers at all time
        self.program = configuration.redirector.program  # what program speaks the squid redirector API
        self.nextid = 1        # incremental number to make the name of the next worker
        self.queue = Queue()   # queue with HTTP headers to process
        self.poller = poller   # poller interface that checks for events on sockets
        self.worker = {}       # our workers threads
        self.closing = set()   # workers that are currently closing
        self.running = True    # we are running
        self.cache = {}        # stats gathered by storeStats(), keyed by timestamp
                               # (was missing: storeStats() raised AttributeError)
        self.log = Logger('manager', configuration.log.manager)

    def _getid (self):
        """Return a new unique (stringified) worker id."""
        wid = str(self.nextid)  # renamed from 'id' which shadowed the builtin
        self.nextid += 1
        return wid

    def _spawn (self):
        """add one worker to the pool"""
        wid = self._getid()
        worker = Redirector(self.configuration, wid, self.queue, self.program)
        self.poller.addReadSocket('read_workers', worker.response_box_read)
        self.worker[wid] = worker
        self.log.info("added a worker")
        self.log.info("we have %d workers. defined range is ( %d / %d )" % (len(self.worker), self.low, self.high))
        self.worker[wid].start()

    def spawn (self, number=1):
        """create the set number of worker"""
        self.log.info("spawning %d more worker" % number)
        for _ in range(number):
            self._spawn()

    def respawn (self):
        """make sure we reach the minimum number of workers"""
        # keep the current pool size, clamped into [low, high]
        number = max(min(len(self.worker), self.high), self.low)
        for wid in set(self.worker):
            self.reap(wid)
        self.spawn(number)

    def reap (self, wid):
        """Ask one worker to stop; it stays in self.worker until it
        reports 'hangup' through getDecision()."""
        self.log.info('we are killing worker %s' % wid)
        worker = self.worker[wid]
        self.closing.add(wid)
        worker.stop()  # will cause the worker to stop when it can

    def decrease (self):
        """Retire the oldest worker, never dropping below the minimum."""
        if self.low < len(self.worker):
            worker = self._oldest()
            if worker:
                self.reap(worker.wid)

    def increase (self):
        """Add one worker, never exceeding the maximum."""
        if len(self.worker) < self.high:
            self.spawn()

    def start (self):
        """spawn our minimum number of workers"""
        self.log.info("starting workers.")
        self.spawn(max(0, self.low - len(self.worker)))

    def stop (self):
        """tell all our worker to stop reading the queue and stop"""
        self.running = False
        threads = self.worker.values()
        if len(self.worker):
            self.log.info("stopping %d workers." % len(self.worker))
            for wid in set(self.worker):
                self.reap(wid)
            # push one dummy request per worker so each thread blocked on
            # the queue wakes up and notices it was asked to stop
            for thread in threads:
                self.request(None, None, None, 'nop')
            for thread in threads:
                thread.destroyProcess()
                thread.join()
        self.worker = {}

    def _oldest (self):
        """find the oldest worker"""
        oldest = None
        past = time.time()
        for wid in set(self.worker):
            creation = self.worker[wid].creation
            # ignore workers already being shut down
            if creation < past and wid not in self.closing:
                past = creation
                oldest = self.worker[wid]
        return oldest

    def provision (self):
        """manage our workers to make sure we have enough to consume the queue"""
        if not self.running:
            return
        num_workers = len(self.worker)
        # bad we are bleeding workers !
        if num_workers < self.low:
            self.log.info("we lost some workers, respawing %d new workers" % (self.low - num_workers))
            self.spawn(self.low - num_workers)
        size = self.queue.qsize()
        # we need more workers
        if size >= num_workers:
            # nothing we can do we have reach our limit
            if num_workers >= self.high:
                self.log.warning("help ! we need more workers but we reached our ceiling ! %d request are queued for %d processes" % (size, num_workers))
                return
            # try to figure a good number to add ..
            # no less than one, no more than to reach self.high, lower between self.low and a quarter of the allowed growth
            nb_to_add = int(min(max(1, min(self.low, (self.high - self.low) / 4)), self.high - num_workers))
            self.log.warning("we are low on workers adding a few (%d), the queue has %d unhandled url" % (nb_to_add, size))
            self.spawn(nb_to_add)

    def deprovision (self):
        """manage our workers to make sure we have enough to consume the queue"""
        if not self.running:
            return
        size = self.queue.qsize()
        num_workers = len(self.worker)
        # we are now overprovisioned
        if size < 2 and num_workers > self.low:
            self.log.info("we have too many workers (%d), stopping the oldest" % num_workers)
            # if we have to kill one, at least stop the one who had the most chance to memory leak :)
            worker = self._oldest()
            if worker:
                self.reap(worker.wid)

    def request (self, client_id, peer, request, source):
        """Queue one request for the worker pool (False = not tainted)."""
        return self.queue.put((client_id, peer, request, source, False))

    def getDecision (self, box):
        """Read one netstring-framed response from a worker pipe and act on it.

        Returns (client_id, command, decision); all three are None when the
        message was malformed or consumed internally (requeue/hangup/stats).
        """
        # NOTE: reads may block if we send badly formatted data
        try:
            r_buffer = box.read(3)
            # keep reading while we only have digits (the netstring length)
            while r_buffer.isdigit():
                r_buffer += box.read(1)
            if ':' in r_buffer:
                size, response = r_buffer.split(':', 1)
                if size.isdigit():
                    size = int(size)
                else:
                    size, response = None, None
            else:
                # not a netstring
                size, response = None, None
            if size is not None:
                # read the remaining payload plus the trailing comma
                required = size + 1 - len(response)
                response += box.read(required)
            if response is not None:
                if response.endswith(','):
                    response = response[:-1]
                else:
                    response = None
        except ValueError:
            # I/O operation on closed file
            worker = self.worker.get(box, None)
            if worker is not None:
                worker.destroyProcess()
            response = None
        except TypeError:
            response = None

        try:
            if response:
                client_id, command, decision = response.split('\0', 2)
            else:
                client_id, command, decision = None, None, None
        except (ValueError, TypeError):
            client_id, command, decision = None, None, None

        if command == 'requeue':
            # NOTE(review): the second field of this split is the literal
            # command name, not a peer address -- verify against the
            # worker's requeue message layout
            _client_id, _peer, _source, _header = response.split('\0', 3)
            self.queue.put((_client_id, _peer, _header, _source, True))
            client_id, command, decision = None, None, None

        elif command == 'hangup':
            wid = decision
            client_id, command, decision = None, None, None
            worker = self.worker.pop(wid, None)
            if worker:
                self.poller.removeReadSocket('read_workers', worker.response_box_read)
                if wid in self.closing:
                    self.closing.remove(wid)
                worker.shutdown()
                worker.join()

        elif command == 'stats':
            # was: wid, timestamp, stats = decision -- that unpacked the raw
            # string character by character instead of its NUL separated fields
            wid, timestamp, stats = decision.split('\0', 2)
            self.storeStats(timestamp, wid, stats)
            client_id, command, decision = None, None, None

        return client_id, command, decision

    def showInternalError (self):
        """Decision telling the client to serve the internal error page."""
        return 'file', '\0'.join(('200', 'internal_error.html'))

    def requestStats (self):
        """Ask every live worker to report its statistics."""
        # .values() instead of py2-only .iteritems() (the key was unused)
        for worker in self.worker.values():
            worker.requestStats()

    def storeStats (self, timestamp, wid, stats):
        """Accumulate the query-string part of a stats report under its timestamp."""
        # was: stats.split('?', 1).split('&') -- split() returns a list, so
        # the chained split() raised AttributeError; keep only the part
        # after the first '?' then break it into key=value pairs
        pairs = (d.split('=', 1) for d in stats.split('?', 1)[-1].split('&'))
        d = self.cache.setdefault(timestamp, {})
        for k, v in pairs:
            d.setdefault(k, []).append(v)
class Supervisor (object): alarm_time = 0.1 # regular backend work second_frequency = int(1/alarm_time) # when we record history minute_frequency = int(60/alarm_time) # when we want to average history increase_frequency = int(5/alarm_time) # when we add workers decrease_frequency = int(60/alarm_time) # when we remove workers saturation_frequency = int(20/alarm_time) # when we report connection saturation interface_frequency = int(300/alarm_time) # when we check for new interfaces # import os # clear = [hex(ord(c)) for c in os.popen('clear').read()] # clear = ''.join([chr(int(c,16)) for c in ['0x1b', '0x5b', '0x48', '0x1b', '0x5b', '0x32', '0x4a']]) def __init__ (self,configuration): configuration = load() self.configuration = configuration # Only here so the introspection code can find them self.log = Logger('supervisor', configuration.log.supervisor) self.log.error('Starting exaproxy version %s' % configuration.proxy.version) self.signal_log = Logger('signal', configuration.log.signal) self.log_writer = SysLogWriter('log', configuration.log.destination, configuration.log.enable, level=configuration.log.level) self.usage_writer = UsageWriter('usage', configuration.usage.destination, configuration.usage.enable) self.log_writer.setIdentifier(configuration.daemon.identifier) #self.usage_writer.setIdentifier(configuration.daemon.identifier) if configuration.debug.log: self.log_writer.toggleDebug() self.usage_writer.toggleDebug() self.log.error('python version %s' % sys.version.replace(os.linesep,' ')) self.log.debug('starting %s' % sys.argv[0]) self.pid = PID(self.configuration) self.daemon = Daemon(self.configuration) self.poller = Poller(self.configuration.daemon) self.poller.setupRead('read_proxy') # Listening proxy sockets self.poller.setupRead('read_web') # Listening webserver sockets self.poller.setupRead('read_icap') # Listening icap sockets self.poller.setupRead('read_workers') # Pipes carrying responses from the child processes 
self.poller.setupRead('read_resolver') # Sockets currently listening for DNS responses self.poller.setupRead('read_client') # Active clients self.poller.setupRead('opening_client') # Clients we have not yet read a request from self.poller.setupWrite('write_client') # Active clients with buffered data to send self.poller.setupWrite('write_resolver') # Active DNS requests with buffered data to send self.poller.setupRead('read_download') # Established connections self.poller.setupWrite('write_download') # Established connections we have buffered data to send to self.poller.setupWrite('opening_download') # Opening connections self.monitor = Monitor(self) self.page = Page(self) self.manager = RedirectorManager( self.configuration, self.poller, ) self.content = ContentManager(self,configuration) self.client = ClientManager(self.poller, configuration) self.resolver = ResolverManager(self.poller, self.configuration, configuration.dns.retries*10) self.proxy = Server('http proxy',self.poller,'read_proxy', configuration.http.connections) self.web = Server('web server',self.poller,'read_web', configuration.web.connections) self.icap = Server('icap server',self.poller,'read_icap', configuration.icap.connections) self.reactor = Reactor(self.configuration, self.web, self.proxy, self.icap, self.manager, self.content, self.client, self.resolver, self.log_writer, self.usage_writer, self.poller) self._shutdown = True if self.daemon.filemax == 0 else False # stop the program self._softstop = False # stop once all current connection have been dealt with self._reload = False # unimplemented self._toggle_debug = False # start logging a lot self._decrease_spawn_limit = 0 self._increase_spawn_limit = 0 self._refork = False # unimplemented self._pdb = False # turn on pdb debugging self._listen = None # listening change ? 
None: no, True: listen, False: stop listeing self.wait_time = 5.0 # how long do we wait at maximum once we have been soft-killed self.local = set() # what addresses are on our local interfaces self.interfaces() signal.signal(signal.SIGQUIT, self.sigquit) signal.signal(signal.SIGINT, self.sigterm) signal.signal(signal.SIGTERM, self.sigterm) # signal.signal(signal.SIGABRT, self.sigabrt) # signal.signal(signal.SIGHUP, self.sighup) signal.signal(signal.SIGTRAP, self.sigtrap) signal.signal(signal.SIGUSR1, self.sigusr1) signal.signal(signal.SIGUSR2, self.sigusr2) signal.signal(signal.SIGTTOU, self.sigttou) signal.signal(signal.SIGTTIN, self.sigttin) signal.signal(signal.SIGALRM, self.sigalrm) # make sure we always have data in history # (done in zero for dependencies reasons) self.monitor.zero() def sigquit (self,signum, frame): if self._softstop: self.signal_log.critical('multiple SIG INT received, shutdown') self._shutdown = True else: self.signal_log.critical('SIG INT received, soft-stop') self._softstop = True self._listen = False def sigterm (self,signum, frame): self.signal_log.critical('SIG TERM received, shutdown request') if os.environ.get('PDB',False): self._pdb = True else: self._shutdown = True # def sigabrt (self,signum, frame): # self.signal_log.info('SIG INFO received, refork request') # self._refork = True # def sighup (self,signum, frame): # self.signal_log.info('SIG HUP received, reload request') # self._reload = True def sigtrap (self,signum, frame): self.signal_log.critical('SIG TRAP received, toggle debug') self._toggle_debug = True def sigusr1 (self,signum, frame): self.signal_log.critical('SIG USR1 received, decrease worker number') self._decrease_spawn_limit += 1 def sigusr2 (self,signum, frame): self.signal_log.critical('SIG USR2 received, increase worker number') self._increase_spawn_limit += 1 def sigttou (self,signum, frame): self.signal_log.critical('SIG TTOU received, stop listening') self._listen = False def sigttin (self,signum, frame): 
self.signal_log.critical('SIG IN received, star listening') self._listen = True def sigalrm (self,signum, frame): self.signal_log.debug('SIG ALRM received, timed actions') self.reactor.running = False signal.setitimer(signal.ITIMER_REAL,self.alarm_time,self.alarm_time) def interfaces (self): local = set(['127.0.0.1','::1']) for interface in getifaddrs(): if interface.family not in (AF_INET,AF_INET6): continue if interface.address not in self.local: self.log.info('found new local ip %s (%s)' % (interface.address,interface.name)) local.add(interface.address) for ip in self.local: if ip not in local: self.log.info('removed local ip %s' % ip) if local == self.local: self.log.info('no ip change') else: self.local = local def run (self): if self.daemon.drop_privileges(): self.log.critical('Could not drop privileges to \'%s\'. Refusing to run as root' % self.daemon.user) self.log.critical('Set the environment value USER to change the unprivileged user') self._shutdown = True elif not self.initialise(): self._shutdown = True signal.setitimer(signal.ITIMER_REAL,self.alarm_time,self.alarm_time) count_second = 0 count_minute = 0 count_increase = 0 count_decrease = 0 count_saturation = 0 count_interface = 0 while True: count_second = (count_second + 1) % self.second_frequency count_minute = (count_minute + 1) % self.minute_frequency count_increase = (count_increase + 1) % self.increase_frequency count_decrease = (count_decrease + 1) % self.decrease_frequency count_saturation = (count_saturation + 1) % self.saturation_frequency count_interface = (count_interface + 1) % self.interface_frequency try: if self._pdb: self._pdb = False import pdb pdb.set_trace() # check for IO change with select self.reactor.run() # must follow the reactor so we are sure to go through the reactor at least once # and flush any logs if self._shutdown: self._shutdown = False self.shutdown() break elif self._reload: self._reload = False self.reload() elif self._refork: self._refork = False 
self.signal_log.warning('refork not implemented') # stop listening to new connections # refork the program (as we have been updated) # just handle current open connection if self._softstop: if self._listen == False: self.proxy.rejecting() self._listen = None if self.client.softstop(): self._shutdown = True # only change listening if we are not shutting down elif self._listen is not None: if self._listen: self._shutdown = not self.proxy.accepting() self._listen = None else: self.proxy.rejecting() self._listen = None if self._toggle_debug: self._toggle_debug = False self.log_writer.toggleDebug() if self._increase_spawn_limit: number = self._increase_spawn_limit self._increase_spawn_limit = 0 self.manager.low += number self.manager.high = max(self.manager.low,self.manager.high) for _ in range(number): self.manager.increase() if self._decrease_spawn_limit: number = self._decrease_spawn_limit self._decrease_spawn_limit = 0 self.manager.high = max(1,self.manager.high-number) self.manager.low = min(self.manager.high,self.manager.low) for _ in range(number): self.manager.decrease() # save our monitoring stats if count_second == 0: self.monitor.second() expired = self.reactor.client.expire() self.reactor.log.debug('events : ' + ', '.join('%s:%d' % (k,len(v)) for (k,v) in self.reactor.events.items())) else: expired = 0 if expired: self.proxy.notifyClose(None, count=expired) if count_minute == 0: self.monitor.minute() # make sure we have enough workers if count_increase == 0: self.manager.provision() # and every so often remove useless workers if count_decrease == 0: self.manager.deprovision() # report if we saw too many connections if count_saturation == 0: self.proxy.saturation() self.web.saturation() if self.configuration.daemon.poll_interfaces and count_interface == 0: self.interfaces() except KeyboardInterrupt: self.log.critical('^C received') self._shutdown = True except OSError,e: # This shoould never happen as we are limiting how many connections we accept if e.errno 
== 24: # Too many open files self.log.critical('Too many opened files, shutting down') for line in traceback.format_exc().split('\n'): self.log.critical(line) self._shutdown = True else: self.log.critical('unrecoverable io error') for line in traceback.format_exc().split('\n'): self.log.critical(line) self._shutdown = True finally:
class Supervisor (object): alarm_time = 0.1 # regular backend work second_frequency = int(1/alarm_time) # when we record history minute_frequency = int(60/alarm_time) # when we want to average history increase_frequency = int(5/alarm_time) # when we add workers decrease_frequency = int(60/alarm_time) # when we remove workers saturation_frequency = int(20/alarm_time) # when we report connection saturation interface_frequency = int(300/alarm_time) # when we check for new interfaces # import os # clear = [hex(ord(c)) for c in os.popen('clear').read()] # clear = ''.join([chr(int(c,16)) for c in ['0x1b', '0x5b', '0x48', '0x1b', '0x5b', '0x32', '0x4a']]) def __init__ (self,configuration): configuration = load() self.configuration = configuration # Only here so the introspection code can find them self.log = Logger('supervisor', configuration.log.supervisor) self.log.error('Starting exaproxy version %s' % configuration.proxy.version) self.signal_log = Logger('signal', configuration.log.signal) self.log_writer = SysLogWriter('log', configuration.log.destination, configuration.log.enable, level=configuration.log.level) self.usage_writer = UsageWriter('usage', configuration.usage.destination, configuration.usage.enable) sys.exitfunc = self.log_writer.writeMessages self.log_writer.setIdentifier(configuration.daemon.identifier) #self.usage_writer.setIdentifier(configuration.daemon.identifier) if configuration.debug.log: self.log_writer.toggleDebug() self.usage_writer.toggleDebug() self.log.error('python version %s' % sys.version.replace(os.linesep,' ')) self.log.debug('starting %s' % sys.argv[0]) self.pid = PID(self.configuration) self.daemon = Daemon(self.configuration) self.poller = Poller(self.configuration.daemon) self.poller.setupRead('read_proxy') # Listening proxy sockets self.poller.setupRead('read_web') # Listening webserver sockets self.poller.setupRead('read_icap') # Listening icap sockets self.poller.setupRead('read_redirector') # Pipes carrying responses from the 
redirector process self.poller.setupRead('read_resolver') # Sockets currently listening for DNS responses self.poller.setupRead('read_client') # Active clients self.poller.setupRead('opening_client') # Clients we have not yet read a request from self.poller.setupWrite('write_client') # Active clients with buffered data to send self.poller.setupWrite('write_resolver') # Active DNS requests with buffered data to send self.poller.setupRead('read_download') # Established connections self.poller.setupWrite('write_download') # Established connections we have buffered data to send to self.poller.setupWrite('opening_download') # Opening connections self.monitor = Monitor(self) self.page = Page(self) self.content = ContentManager(self,configuration) self.client = ClientManager(self.poller, configuration) self.resolver = ResolverManager(self.poller, self.configuration, configuration.dns.retries*10) self.proxy = Server('http proxy',self.poller,'read_proxy', configuration.http.connections) self.web = Server('web server',self.poller,'read_web', configuration.web.connections) self.icap = Server('icap server',self.poller,'read_icap', configuration.icap.connections) self._shutdown = True if self.daemon.filemax == 0 else False # stop the program self._softstop = False # stop once all current connection have been dealt with self._reload = False # unimplemented self._toggle_debug = False # start logging a lot self._decrease_spawn_limit = 0 self._increase_spawn_limit = 0 self._refork = False # unimplemented self._pdb = False # turn on pdb debugging self._listen = None # listening change ? None: no, True: listen, False: stop listeing self.wait_time = 5.0 # how long do we wait at maximum once we have been soft-killed self.local = set() # what addresses are on our local interfaces if not self.initialise(): self._shutdown = True elif self.daemon.drop_privileges(): self.log.critical('Could not drop privileges to \'%s\'. 
Refusing to run as root' % self.daemon.user) self.log.critical('Set the environment value USER to change the unprivileged user') self._shutdown = True # fork the redirector process before performing any further setup redirector = fork_redirector(self.poller, self.configuration) # create threads _after_ all forking is done self.redirector = redirector_message_thread(redirector) self.reactor = Reactor(self.configuration, self.web, self.proxy, self.icap, self.redirector, self.content, self.client, self.resolver, self.log_writer, self.usage_writer, self.poller) self.interfaces() signal.signal(signal.SIGQUIT, self.sigquit) signal.signal(signal.SIGINT, self.sigterm) signal.signal(signal.SIGTERM, self.sigterm) # signal.signal(signal.SIGABRT, self.sigabrt) # signal.signal(signal.SIGHUP, self.sighup) signal.signal(signal.SIGTRAP, self.sigtrap) signal.signal(signal.SIGUSR1, self.sigusr1) signal.signal(signal.SIGUSR2, self.sigusr2) signal.signal(signal.SIGTTOU, self.sigttou) signal.signal(signal.SIGTTIN, self.sigttin) signal.signal(signal.SIGALRM, self.sigalrm) # make sure we always have data in history # (done in zero for dependencies reasons) self.monitor.zero() def exit (self): sys.exit() def sigquit (self,signum, frame): if self._softstop: self.signal_log.critical('multiple SIG INT received, shutdown') self._shutdown = True else: self.signal_log.critical('SIG INT received, soft-stop') self._softstop = True self._listen = False def sigterm (self,signum, frame): self.signal_log.critical('SIG TERM received, shutdown request') if os.environ.get('PDB',False): self._pdb = True else: self._shutdown = True # def sigabrt (self,signum, frame): # self.signal_log.info('SIG INFO received, refork request') # self._refork = True # def sighup (self,signum, frame): # self.signal_log.info('SIG HUP received, reload request') # self._reload = True def sigtrap (self,signum, frame): self.signal_log.critical('SIG TRAP received, toggle debug') self._toggle_debug = True def sigusr1 (self,signum, 
frame): self.signal_log.critical('SIG USR1 received, decrease worker number') self._decrease_spawn_limit += 1 def sigusr2 (self,signum, frame): self.signal_log.critical('SIG USR2 received, increase worker number') self._increase_spawn_limit += 1 def sigttou (self,signum, frame): self.signal_log.critical('SIG TTOU received, stop listening') self._listen = False def sigttin (self,signum, frame): self.signal_log.critical('SIG IN received, star listening') self._listen = True def sigalrm (self,signum, frame): self.reactor.running = False signal.setitimer(signal.ITIMER_REAL,self.alarm_time,self.alarm_time) def interfaces (self): local = set(['127.0.0.1','::1']) for interface in getifaddrs(): if interface.family not in (AF_INET,AF_INET6): continue if interface.address not in self.local: self.log.info('found new local ip %s (%s)' % (interface.address,interface.name)) local.add(interface.address) for ip in self.local: if ip not in local: self.log.info('removed local ip %s' % ip) if local == self.local: self.log.info('no ip change') else: self.local = local def run (self): signal.setitimer(signal.ITIMER_REAL,self.alarm_time,self.alarm_time) count_second = 0 count_minute = 0 count_saturation = 0 count_interface = 0 while True: count_second = (count_second + 1) % self.second_frequency count_minute = (count_minute + 1) % self.minute_frequency count_saturation = (count_saturation + 1) % self.saturation_frequency count_interface = (count_interface + 1) % self.interface_frequency try: if self._pdb: self._pdb = False import pdb pdb.set_trace() # check for IO change with select status = self.reactor.run() if status is False: self._shutdown = True # must follow the reactor so we are sure to go through the reactor at least once # and flush any logs if self._shutdown: self._shutdown = False self.shutdown() break elif self._reload: self._reload = False self.reload() elif self._refork: self._refork = False self.signal_log.warning('refork not implemented') # stop listening to new 
connections # refork the program (as we have been updated) # just handle current open connection if self._softstop: if self._listen == False: self.proxy.rejecting() self._listen = None if self.client.softstop(): self._shutdown = True # only change listening if we are not shutting down elif self._listen is not None: if self._listen: self._shutdown = not self.proxy.accepting() self._listen = None else: self.proxy.rejecting() self._listen = None if self._toggle_debug: self._toggle_debug = False self.log_writer.toggleDebug() if self._decrease_spawn_limit: count = self._decrease_spawn_limit self.redirector.decreaseSpawnLimit(count) self._decrease_spawn_limit = 0 if self._increase_spawn_limit: count = self._increase_spawn_limit self.redirector.increaseSpawnLimit(count) self._increase_spawn_limit = 0 # save our monitoring stats if count_second == 0: self.monitor.second() expired = self.reactor.client.expire() else: expired = 0 if expired: self.proxy.notifyClose(None, count=expired) if count_minute == 0: self.monitor.minute() # report if we saw too many connections if count_saturation == 0: self.proxy.saturation() self.web.saturation() if self.configuration.daemon.poll_interfaces and count_interface == 0: self.interfaces() except KeyboardInterrupt: self.log.critical('^C received') self._shutdown = True except OSError,e: # This shoould never happen as we are limiting how many connections we accept if e.errno == 24: # Too many open files self.log.critical('Too many opened files, shutting down') for line in traceback.format_exc().split('\n'): self.log.critical(line) self._shutdown = True else: self.log.critical('unrecoverable io error') for line in traceback.format_exc().split('\n'): self.log.critical(line) self._shutdown = True finally:
class RedirectorManager (object): def __init__ (self, configuration, poller): self.low = configuration.redirector.minimum # minimum concurrent redirector workers self.high = configuration.redirector.maximum # maximum concurrent redirector workers self.poller = poller self.configuration = configuration self.queue = Queue() # store requests we do not immediately have the resources to process self.nextid = 1 # unique id to give to the next spawned worker self.worker = {} # worker tasks for each spawned child self.processes = {} # worker tasks indexed by file descriptors we can poll self.available = set() # workers that are currently available to handle new requests self.active = {} # workers that are currently busy waiting for a response from the spawned process self.stopping = set() # workers we want to stop as soon as they stop being active program = configuration.redirector.program protocol = configuration.redirector.protocol self.redirector_factory = RedirectorFactory(configuration, program, protocol) self.log = Logger('manager', configuration.log.manager) def _getid(self): wid = str(self.nextid) self.nextid += 1 return wid def _spawn (self): """add one worker to the pool""" wid = self._getid() worker = self.redirector_factory.create(wid) self.worker[wid] = worker self.available.add(wid) if worker.process is not None: identifier = worker.process.stdout self.processes[identifier] = worker self.poller.addReadSocket('read_workers', identifier) self.log.info("added a worker") self.log.info("we have %d workers. 
defined range is ( %d / %d )" % (len(self.worker), self.low, self.high)) def spawn (self, number=1): """create the request number of worker processes""" self.log.info("spawning %d more workers" % number) for _ in range(number): self._spawn() def respawn (self): """make sure we reach the minimum number of workers""" number = max(min(len(self.worker), self.high), self.low) for wid in set(self.worker): self.stopWorker(wid) self.spawn(number) def stopWorker (self, wid): self.log.info('want worker %s to go away' % wid) if wid not in self.active: self.reap(wid) else: self.stopping.add(wid) def reap (self, wid): self.log.info('we are killing worker %s' % wid) worker = self.worker[wid] if wid in self.active: self.log.error('reaping worker %s even though it is still active' % wid) self.active.pop(wid) if wid in self.stopping: self.stopping.remove(wid) if wid in self.available: self.available.remove(wid) if worker.process is not None: self.poller.removeReadSocket('read_workers', worker.process.stdout) self.processes.pop(worker.process.stdout) worker.shutdown() self.worker.pop(wid) def _decrease (self): if self.low < len(self.worker): wid = self._oldest() if wid: self.stopWorker(wid) def _increase (self): if len(self.worker) < self.high: self.spawn() def decrease (self, count=1): for _ in xrange(count): self._decrease() def increase (self, count=1): for _ in xrange(count): self._increase() def start (self): """spawn our minimum number of workers""" self.log.info("starting workers.") self.spawn(max(0,self.low-len(self.worker))) def stop (self): """tell all our worker to stop reading the queue and stop""" for wid in self.worker: self.reap(wid) self.worker = {} def _oldest (self): """find the oldest worker""" oldest = None past = time.time() for wid in set(self.worker): creation = self.worker[wid].creation if creation < past and wid not in self.stopping: past = creation oldest = wid return oldest def provision (self): """manage our workers to make sure we have enough to consume 
the queue""" size = self.queue.qsize() num_workers = len(self.worker) # bad we are bleeding workers ! if num_workers < self.low: self.log.info("we lost some workers, respawing %d new workers" % (self.low - num_workers)) self.spawn(self.low - num_workers) # we need more workers if size >= num_workers: # nothing we can do we have reach our limit if num_workers >= self.high: self.log.warning("help ! we need more workers but we reached our ceiling ! %d request are queued for %d processes" % (size,num_workers)) return # try to figure a good number to add .. # no less than one, no more than to reach self.high, lower between self.low and a quarter of the allowed growth nb_to_add = int(min(max(1,min(self.low,(self.high-self.low)/4)),self.high-num_workers)) self.log.warning("we are low on workers adding a few (%d), the queue has %d unhandled url" % (nb_to_add,size)) self.spawn(nb_to_add) def deprovision (self): """manage our workers to make sure we have enough to consume the queue""" size = self.queue.qsize() num_workers = len(self.worker) # we are now overprovisioned if size < 2 and num_workers > self.low: self.log.info("we have too many workers (%d), stopping the oldest" % num_workers) # if we have to kill one, at least stop the one who had the most chance to memory leak :) wid = self._oldest() if wid: self.stopWorker(wid) def acquire (self): if self.available: identifier = self.available.pop() worker = self.worker[identifier] else: worker = None return worker def release (self, wid): if wid not in self.stopping: self.available.add(wid) else: self.reap(wid) def persist (self, wid, client_id, peer, data, header, subheader, source, tainted): self.active[wid] = client_id, peer, data, header, subheader, source, tainted def progress (self, wid): return self.active.pop(wid) def doqueue (self): if self.available and not self.queue.isempty(): client_id, peer, header, subheader, source, tainted = self.queue.get() _, command, decision = self.request(client_id, peer, header, 
subheader, source, tainted=tainted) else: client_id, command, decision = None, None, None return client_id, command, decision def request (self, client_id, peer, header, subheader, source, tainted=False): worker = self.acquire() if worker is not None: try: _, command, decision = worker.decide(client_id, peer, header, subheader, source) except: command, decision = None, None if command is None: self.reap(worker.wid) if tainted is False: _, command, decision = self.request(client_id, peer, header, subheader, source, tainted=True) else: _, command, decision = Respond.close(client_id) else: command, decision = None, None self.queue.put((client_id, peer, header, subheader, source, tainted)) if command == 'defer': self.persist(worker.wid, client_id, peer, decision, header, subheader, source, tainted) command, decision = None, None elif worker is not None: self.release(worker.wid) return client_id, command, decision def getDecision (self, pipe_in): worker = self.processes.get(pipe_in, None) if worker is not None and worker.wid in self.active: client_id, peer, request, header, subheader, source, tainted = self.progress(worker.wid) try: _, command, decision = worker.progress(client_id, peer, request, header, subheader, source) except Exception, e: command, decision = None, None self.release(worker.wid) if command is None: self.reap(worker.wid) if tainted is False: _, command, decision = self.request(client_id, peer, header, subheader, source, tainted=True) else: _, command, decision = Respond.close(client_id) else:
def poll(self): try: res = self.master.control(None, self.max_events, self.speed) except EnvironmentError, e: if e.errno != errno.EINTR: log.critical('KQueue master poller - unexpected error') raise log.warning('KQueue master poller - got EINTR, ignoring it.') res = [] response = {} else: # response['poller1']=[] ; response['poller2']=[] etc. response = dict((name, []) for (name, _, _, _) in self.pollers.values()) if (len(res) == self.max_events): log.warning("polled max_events from master kqueue") for events in res: fd = events.ident name, poller, sockets, fdtosock = self.pollers[fd] events = poller.control(None, self.max_events, 0) if (len(events) == self.max_events): log.warning("polled max_events from queue %s" % (name)) for sock_events in events: sock_fd = sock_events.ident try: response[name].append(fdtosock[sock_fd]) except KeyError, e:
def poll(self): try: res = self.master.control(None, self.max_events, self.speed) except EnvironmentError, e: if e.errno != errno.EINTR: log.critical('KQueue master poller - unexpected error') raise log.warning('KQueue master poller - got EINTR, ignoring it.') res = [] response = {} else: # response['poller1']=[] ; response['poller2']=[] etc. response = dict((name, []) for (name, _, _, _) in self.pollers.values()) if len(res) == self.max_events: log.warning("polled max_events from master kqueue") for events in res: fd = events.ident name, poller, sockets, fdtosock = self.pollers[fd] events = poller.control(None, self.max_events, 0) if len(events) == self.max_events: log.warning("polled max_events from queue %s" % name) for sock_events in events: sock_fd = sock_events.ident try: response[name].append(fdtosock[sock_fd]) except KeyError:
class Supervisor(object):
	# how often (seconds) the reactor is interrupted for maintenance work
	alarm_time = 0.1

	second_frequency = int(1/alarm_time)			# when we record history
	minute_frequency = int(60/alarm_time)			# when we want to average history
	increase_frequency = int(5/alarm_time)			# when we add workers
	decrease_frequency = int(60/alarm_time)			# when we remove workers
	saturation_frequency = int(20/alarm_time)		# when we report connection saturation
	interface_frequency = int(300/alarm_time)		# when we check for new interfaces

	# import os
	# clear = [hex(ord(c)) for c in os.popen('clear').read()]
	# clear = ''.join([chr(int(c,16)) for c in ['0x1b', '0x5b', '0x48', '0x1b', '0x5b', '0x32', '0x4a']])

	def __init__ (self, configuration):
		"""set up logging, the daemon helpers and every poller category the proxy uses"""
		self.configuration = configuration

		# Only here so the introspection code can find them
		self.log = Logger('supervisor', configuration.log.supervisor)
		self.log.error('Starting exaproxy version %s' % configuration.proxy.version)

		self.signal_log = Logger('signal', configuration.log.signal)
		self.log_writer = SysLogWriter('log', configuration.log.destination, configuration.log.enable, level=configuration.log.level)
		self.usage_writer = UsageWriter('usage', configuration.usage.destination, configuration.usage.enable)

		# flush buffered log messages when the interpreter exits (python2 sys.exitfunc)
		sys.exitfunc = self.log_writer.writeMessages

		self.log_writer.setIdentifier(configuration.daemon.identifier)
		#self.usage_writer.setIdentifier(configuration.daemon.identifier)

		if configuration.debug.log:
			self.log_writer.toggleDebug()
			self.usage_writer.toggleDebug()

		self.log.error('python version %s' % sys.version.replace(os.linesep,' '))
		self.log.debug('starting %s' % sys.argv[0])

		self.pid = PID(self.configuration)
		self.daemon = Daemon(self.configuration)
		self.poller = Poller(self.configuration.daemon)

		self.poller.setupRead('read_proxy')			# Listening proxy sockets
		self.poller.setupRead('read_web')			# Listening webserver sockets
		self.poller.setupRead('read_icap')			# Listening icap sockets
		self.poller.setupRead('read_tls')			# Listening tls sockets
self.poller.setupRead('read_passthrough') # Listening raw data sockets self.poller.setupRead( 'read_redirector' ) # Pipes carrying responses from the redirector process self.poller.setupRead( 'read_resolver') # Sockets currently listening for DNS responses self.poller.setupRead('read_client') # Active clients self.poller.setupRead( 'opening_client') # Clients we have not yet read a request from self.poller.setupWrite( 'write_client') # Active clients with buffered data to send self.poller.setupWrite( 'write_resolver') # Active DNS requests with buffered data to send self.poller.setupRead('read_download') # Established connections self.poller.setupWrite( 'write_download' ) # Established connections we have buffered data to send to self.poller.setupWrite('opening_download') # Opening connections self.poller.setupRead('read_interrupt') # Scheduled events self.poller.setupRead( 'read_control' ) # Responses from commands sent to the redirector process self.monitor = Monitor(self) self.page = Page(self) self.content = ContentManager(self, configuration) self.client = ClientManager(self.poller, configuration) self.resolver = ResolverManager(self.poller, self.configuration, configuration.dns.retries * 10) self.proxy = Server('http proxy', self.poller, 'read_proxy', configuration.http) self.web = Server('web server', self.poller, 'read_web', configuration.web) self.icap = Server('icap server', self.poller, 'read_icap', configuration.icap) self.tls = Server('tls server', self.poller, 'read_tls', configuration.tls) self.passthrough = InterceptServer('passthrough server', self.poller, 'read_passthrough', configuration.passthrough) self._shutdown = True if self.daemon.filemax == 0 else False # stop the program self._softstop = False # stop once all current connection have been dealt with self._reload = False # unimplemented self._toggle_debug = False # start logging a lot self._decrease_spawn_limit = 0 self._increase_spawn_limit = 0 self._refork = False # unimplemented 
self._pdb = False # turn on pdb debugging self._listen = None # listening change ? None: no, True: listen, False: stop listeing self.wait_time = 5.0 # how long do we wait at maximum once we have been soft-killed self.local = set() # what addresses are on our local interfaces if not self.initialise(): self._shutdown = True elif self.daemon.drop_privileges(): self.log.critical( 'Could not drop privileges to \'%s\'. Refusing to run as root' % self.daemon.user) self.log.critical( 'Set the environment value USER to change the unprivileged user' ) self._shutdown = True # fork the redirector process before performing any further setup redirector = fork_redirector(self.poller, self.configuration) # use simple blocking IO for communication with the redirector process self.redirector = redirector_message_thread(redirector) # NOTE: create threads _after_ all forking is done # regularly interrupt the reactor for maintenance self.interrupt_scheduler = alarm_thread(self.poller, self.alarm_time) self.reactor = Reactor(self.configuration, self.web, self.proxy, self.passthrough, self.icap, self.tls, self.redirector, self.content, self.client, self.resolver, self.log_writer, self.usage_writer, self.poller) self.interfaces() signal.signal(signal.SIGQUIT, self.sigquit) signal.signal(signal.SIGINT, self.sigterm) signal.signal(signal.SIGTERM, self.sigterm) # signal.signal(signal.SIGABRT, self.sigabrt) # signal.signal(signal.SIGHUP, self.sighup) signal.signal(signal.SIGTRAP, self.sigtrap) signal.signal(signal.SIGUSR1, self.sigusr1) signal.signal(signal.SIGUSR2, self.sigusr2) signal.signal(signal.SIGTTOU, self.sigttou) signal.signal(signal.SIGTTIN, self.sigttin) # make sure we always have data in history # (done in zero for dependencies reasons) if self._shutdown is False: self.redirector.requestStats() command, control_data = self.redirector.readResponse() stats_data = control_data if command == 'STATS' else None stats = self.monitor.statistics(stats_data) ok = self.monitor.zero(stats) 
if ok: self.redirector.requestStats() else: self._shutdown = True def exit(self): sys.exit() def sigquit(self, signum, frame): if self._softstop: self.signal_log.critical('multiple SIG INT received, shutdown') self._shutdown = True else: self.signal_log.critical('SIG INT received, soft-stop') self._softstop = True self._listen = False def sigterm(self, signum, frame): self.signal_log.critical('SIG TERM received, shutdown request') if os.environ.get('PDB', False): self._pdb = True else: self._shutdown = True # def sigabrt (self,signum, frame): # self.signal_log.info('SIG INFO received, refork request') # self._refork = True # def sighup (self,signum, frame): # self.signal_log.info('SIG HUP received, reload request') # self._reload = True def sigtrap(self, signum, frame): self.signal_log.critical('SIG TRAP received, toggle debug') self._toggle_debug = True def sigusr1(self, signum, frame): self.signal_log.critical('SIG USR1 received, decrease worker number') self._decrease_spawn_limit += 1 def sigusr2(self, signum, frame): self.signal_log.critical('SIG USR2 received, increase worker number') self._increase_spawn_limit += 1 def sigttou(self, signum, frame): self.signal_log.critical('SIG TTOU received, stop listening') self._listen = False def sigttin(self, signum, frame): self.signal_log.critical('SIG IN received, star listening') self._listen = True def interfaces(self): local = {'127.0.0.1', '::1'} for interface in getifaddrs(): if interface.family not in (AF_INET, AF_INET6): continue if interface.address not in self.local: self.log.info('found new local ip %s (%s)' % (interface.address, interface.name)) local.add(interface.address) for ip in self.local: if ip not in local: self.log.info('removed local ip %s' % ip) if local == self.local: self.log.info('no ip change') else: self.local = local def run(self): count_second = 0 count_minute = 0 count_saturation = 0 count_interface = 0 events = {'read_interrupt'} while True: count_second = (count_second + 1) % 
self.second_frequency count_minute = (count_minute + 1) % self.minute_frequency count_saturation = (count_saturation + 1) % self.saturation_frequency count_interface = (count_interface + 1) % self.interface_frequency try: if self._pdb: self._pdb = False import pdb pdb.set_trace() # prime the alarm if 'read_interrupt' in events: self.interrupt_scheduler.setAlarm() # check for IO change with select status, events = self.reactor.run() # shut down the server if a child process disappears if status is False: self._shutdown = True # respond to control responses immediately if 'read_control' in events: command, control_data = self.redirector.readResponse() if command == 'STATS': ok = self.doStats(count_second, count_minute, control_data) if ok is False: self._shutdown = True # jump straight back into the reactor if we haven't yet received an # interrupt event if 'read_interrupt' not in events: continue # clear the alarm condition self.interrupt_scheduler.acknowledgeAlarm() # must follow the reactor so we are sure to go through the reactor at least once # and flush any logs if self._shutdown: self._shutdown = False self.shutdown() break elif self._reload: self._reload = False self.reload() elif self._refork: self._refork = False self.signal_log.warning('refork not implemented') # stop listening to new connections # refork the program (as we have been updated) # just handle current open connection # ask the redirector process for stats self.redirector.requestStats() if self._softstop: if self._listen == False: self.proxy.rejecting() self._listen = None if self.client.softstop(): self._shutdown = True # only change listening if we are not shutting down elif self._listen is not None: if self._listen: self._shutdown = not self.proxy.accepting() self._listen = None else: self.proxy.rejecting() self._listen = None if self._toggle_debug: self._toggle_debug = False self.log_writer.toggleDebug() if self._decrease_spawn_limit: count = self._decrease_spawn_limit 
self.redirector.decreaseSpawnLimit(count) self._decrease_spawn_limit = 0 if self._increase_spawn_limit: count = self._increase_spawn_limit self.redirector.increaseSpawnLimit(count) self._increase_spawn_limit = 0 # cleanup idle connections # TODO: track all idle connections, not just the ones that have never sent data expired = self.reactor.client.expire() for expire_source, expire_count in expired.items(): if expire_source == 'proxy': self.proxy.notifyClose(None, count=expire_count) elif expire_source == 'icap': self.icap.notifyClose(None, count=expire_count) elif expire_source == 'passthrough': self.passthrough.notifyClose(None, count=expire_count) elif expire_source == 'tls': self.tls.notifyClose(None, count=expire_count) elif expire_source == 'web': self.web.notifyClose(None, count=expire_count) # report if we saw too many connections if count_saturation == 0: self.proxy.saturation() self.web.saturation() if self.configuration.daemon.poll_interfaces and count_interface == 0: self.interfaces() except KeyboardInterrupt: self.log.critical('^C received') self._shutdown = True except OSError, e: # This shoould never happen as we are limiting how many connections we accept if e.errno == 24: # Too many open files self.log.critical('Too many opened files, shutting down') for line in traceback.format_exc().split('\n'): self.log.critical(line) self._shutdown = True else: self.log.critical('unrecoverable io error') for line in traceback.format_exc().split('\n'): self.log.critical(line) self._shutdown = True finally: