def kill_empty_zygote(self, zygote, sig=signal.SIGQUIT):
    """Signal a zygote to exit, but only if it has zero workers.

    A zygote whose ``worker_count`` is non-zero is left untouched.

    Args:
        zygote: object exposing ``worker_count`` and ``pid``.
        sig: signal passed on to ``safe_kill``; defaults to ``SIGQUIT``.
    """
    # The only valid time to kill a zygote is when it doesn't have any
    # workers left.
    if zygote.worker_count != 0:
        return

    # Hand the pid to the logger as an argument (like the other logging
    # calls in this file) so formatting only happens if the record is
    # actually emitted.
    self.logger.info("killing zygote with pid %d", zygote.pid)
    safe_kill(zygote.pid, sig)
def kill_workers(self, num_workers_to_kill):
    """Kill a randomly chosen subset of the current workers.

    If more workers are requested than currently exist, an error is
    logged and nothing is killed.

    Args:
        num_workers_to_kill: how many entries of ``self.children`` to
            pick and pass to ``safe_kill``.
    """
    current_count = len(self.children)
    if num_workers_to_kill > current_count:
        self.logger.error(
            "Request to kill %d workers out of %d current workers",
            num_workers_to_kill, current_count
        )
        return

    # random.sample() only accepts sequences on Python 3.11+ (sets are
    # rejected with TypeError), so sample from a list snapshot.
    # NOTE(review): the container type of self.children is not visible
    # here; list() keeps this working for lists, sets and dicts alike.
    worker_pids = random.sample(list(self.children), num_workers_to_kill)
    for pid in worker_pids:
        safe_kill(pid)
    wait_for_pids(worker_pids, self.WAIT_FOR_KILL_TIME, self.logger)
def kill_workers(self, num_workers_to_kill):
    """Pick ``num_workers_to_kill`` workers at random and kill them.

    A request for more workers than ``self.children`` currently holds
    is logged as an error and otherwise ignored.

    Args:
        num_workers_to_kill: number of entries of ``self.children`` to
            sample and pass to ``safe_kill``.
    """
    available = len(self.children)
    if num_workers_to_kill > available:
        self.logger.error(
            'Request to kill %d workers out of %d current workers',
            num_workers_to_kill, available
        )
        return

    # Since Python 3.11 random.sample() requires a sequence and raises
    # TypeError for sets, so take a list snapshot of the children first.
    # NOTE(review): the concrete type of self.children is not visible
    # in this block; list() is safe for list, set and dict containers.
    candidates = list(self.children)
    worker_pids = random.sample(candidates, num_workers_to_kill)
    for pid in worker_pids:
        safe_kill(pid)
    wait_for_pids(worker_pids, self.WAIT_FOR_KILL_TIME, self.logger)
def kill_all_workers(self):
    """Kill all workers, wait (synchronously) for them, then exit.

    Every pid in ``self.children`` is sent SIGQUIT through
    ``safe_kill``. Only the pids for which ``safe_kill`` returned a
    truthy value are handed to ``wait_for_pids``. The process then
    terminates via ``sys.exit(0)``, so this method never returns.
    """
    # Put SIGCHLD back to its default disposition so that we don't get
    # interrupted by SIGCHLDs while waiting below.
    signal.signal(signal.SIGCHLD, signal.SIG_DFL)

    self.logger.debug('zygote requesting kill on %d pids', len(self.children))
    pending = {
        child_pid
        for child_pid in self.children
        if safe_kill(child_pid, signal.SIGQUIT)
    }
    wait_for_pids(pending, self.WAIT_FOR_KILL_TIME, self.logger)

    self.logger.debug('zygote done killing children, terminating')
    sys.exit(0)
def transition_idle_workers(self):
    """Transition idle HTTP workers from old zygotes to the current one.

    Idle workers of every zygote returned by
    ``zygote_collection.other_zygotes(current_zygote)`` are signalled
    with SIGQUIT, or with SIGKILL once the transition has been running
    for more than ``WAIT_FOR_KILL_TIME``. While the current zygote is a
    canary, the previous zygote is excluded. The method reschedules
    itself on the IO loop for as long as any such zygote remains.
    """
    if not self.started_transition:
        self.started_transition = time.time()

    too_slow = (time.time() - self.started_transition) > self.WAIT_FOR_KILL_TIME
    if too_slow:
        self.logger.debug(
            "sending SIGKILL for transition because it was Too Damn Slow")
    sig = signal.SIGKILL if too_slow else signal.SIGQUIT

    stale = self.zygote_collection.other_zygotes(self.current_zygote)
    # During a canary run the previous zygote must keep its workers.
    if (self.current_zygote.canary and self.prev_zygote
            and self.prev_zygote in stale):
        stale.remove(self.prev_zygote)

    stale_count = len(stale)
    killed = 0
    for old_zygote in stale:
        for idle in old_zygote.idle_workers():
            self.logger.debug("killing worker %d with signal %d",
                              idle.pid, sig)
            killed += 1 if safe_kill(idle.pid, sig) else 0
    self.logger.info('Attempted to transition %d workers from %d zygotes',
                     killed, stale_count)

    if not stale_count:
        # Nothing left to transition; forget when we started.
        self.started_transition = None
    else:
        # At least one other zygote is still around, so poll again
        # later. The original author notes that this loop ends once
        # handle_protocol_msg has seen the final MessageWorkerExit for
        # the old zygotes.
        self.io_loop.add_timeout(time.time() + self.POLL_INTERVAL,
                                 self.transition_idle_workers)

    # Cleanup empty zygotes for the next iteration of the transition.
    for old_zygote in stale:
        if old_zygote.worker_count == 0:
            self.kill_empty_zygote(old_zygote, sig)
def transition_idle_workers(self):
    """Move idle HTTP workers off old zygotes and onto the current one.

    Each idle worker of the zygotes reported by
    ``zygote_collection.other_zygotes(current_zygote)`` receives
    SIGQUIT. Once more than ``WAIT_FOR_KILL_TIME`` has elapsed since the
    transition began, SIGKILL is used instead. The previous zygote is
    skipped while the current zygote is still a canary. As long as other
    zygotes exist, another run is scheduled on the IO loop.
    """
    if not self.started_transition:
        self.started_transition = time.time()

    elapsed = time.time() - self.started_transition
    if elapsed > self.WAIT_FOR_KILL_TIME:
        self.logger.debug("sending SIGKILL for transition because it was Too Damn Slow")
        sig = signal.SIGKILL
    else:
        sig = signal.SIGQUIT

    old_zygotes = self.zygote_collection.other_zygotes(self.current_zygote)
    canary_in_progress = self.current_zygote.canary and self.prev_zygote
    if canary_in_progress and self.prev_zygote in old_zygotes:
        # Keep the previous zygote's workers while the canary is unproven.
        old_zygotes.remove(self.prev_zygote)

    old_zygote_count = len(old_zygotes)
    signalled = 0
    for old in old_zygotes:
        for idle_worker in old.idle_workers():
            self.logger.debug("killing worker %d with signal %d", idle_worker.pid, sig)
            if not safe_kill(idle_worker.pid, sig):
                continue
            signalled += 1
    self.logger.info('Attempted to transition %d workers from %d zygotes', signalled, old_zygote_count)

    if old_zygote_count:
        # There was at least one other zygote, so schedule another pass.
        # Per the original author's note, handle_protocol_msg reacts to
        # the final MessageWorkerExit of a zygote, and that is how this
        # timeout loop eventually ends.
        self.io_loop.add_timeout(time.time() + self.POLL_INTERVAL, self.transition_idle_workers)
    else:
        self.started_transition = None

    # Cleanup empty zygotes for the next iteration of the transition.
    for old in old_zygotes:
        if old.worker_count == 0:
            self.kill_empty_zygote(old, sig)
def handle_protocol_msg(self, fd, events):
    """Callback for messages received on the master_socket.

    Reads one message from ``self.master_socket``, parses it with
    ``message.Message.parse`` and dispatches on the exact type of the
    parsed message. Unknown message types are logged as a warning.

    Args:
        fd: file descriptor the event fired on; must equal the master
            socket's ``fileno()``.
        events: event mask supplied by the IO loop (unused here).
    """
    assert fd == self.master_socket.fileno()
    raw = self.master_socket.recv(self.RECV_SIZE)
    msg = message.Message.parse(raw)
    kind = type(msg)
    self.logger.debug('received message of type %s from pid %d',
                      kind.__name__, msg.pid)

    if kind is message.MessageCanaryInit:
        self.logger.info(
            "Canary zygote initialized. Transitioning idle workers.")
        # The current zygote has proven itself; it is no longer a canary.
        self.current_zygote.canary = False
        # Drop our handle on the previous zygote. It is already in the
        # zygote_collection for accounting purposes, but we won't need
        # to keep track of it anymore.
        self.prev_zygote = None
        # Canary initialization was successful, so workers can move over.
        self.io_loop.add_callback(self.transition_idle_workers)
        return

    if kind is message.MessageWorkerStart:
        # A zygote spawned a new worker: record it, together with its
        # creation time, under its parent zygote.
        parent = self.zygote_collection[msg.worker_ppid]
        if parent:
            parent.add_worker(msg.pid, msg.time_created)
        return

    if kind is message.MessageWorkerExitInitFail:
        # Init failures are only fatal when we are not running a canary.
        if not self.current_zygote.canary:
            self.logger.error("A worker initialization failed, giving up")
            self.stop()
        return

    if kind is message.MessageWorkerExit:
        # A worker exited. Zygotes we still care about are asked to
        # spawn a replacement; any other zygote is asked to shut down.
        owner = self.zygote_collection[msg.pid]
        if not owner:
            return
        owner.remove_worker(msg.child_pid)

        if owner.shutting_down:
            self.logger.debug(
                'Removed a worker from shutting down zygote %d, %d left',
                msg.pid, len(owner.workers()))
            return
        self.logger.debug('Removed a worker from zygote %d, %d left',
                          msg.pid, len(owner.workers()))

        if self.stopped:
            return
        if owner not in (self.current_zygote, self.prev_zygote):
            # Not a zygote that we care about. Request shutdown.
            owner.request_shut_down()
        elif self.num_workers > owner.worker_count:
            # Only start a new worker if we're below quota. This is how
            # we scale down the number of workers.
            owner.request_spawn()
        return

    if kind is message.MessageHTTPBegin:
        # A worker started servicing an HTTP request.
        busy_worker = self.zygote_collection.get_worker(msg.pid)
        if busy_worker:
            busy_worker.start_request(msg.remote_ip, msg.http_line)
        return

    if kind is message.MessageHTTPEnd:
        # A worker finished servicing an HTTP request.
        done_worker = self.zygote_collection.get_worker(msg.pid)
        if not done_worker:
            return
        done_worker.end_request()
        if self.max_requests is not None:
            if done_worker.request_count >= self.max_requests:
                self.logger.info(
                    'Worker %d reached max_requests %d, killing it',
                    done_worker.pid, self.max_requests)
                safe_kill(done_worker.pid, signal.SIGQUIT)
        return

    self.logger.warning('master got unexpected message of type %s', kind)
def handle_protocol_msg(self, fd, events):
    """Callback for messages received on the master_socket.

    One message is received from ``self.master_socket``, parsed via
    ``message.Message.parse`` and handled according to its exact type.
    A message of any other type is logged as a warning.

    Args:
        fd: file descriptor that became readable; asserted to be the
            master socket's ``fileno()``.
        events: event mask from the IO loop (not used by this method).
    """
    assert fd == self.master_socket.fileno()
    payload = self.master_socket.recv(self.RECV_SIZE)
    msg = message.Message.parse(payload)
    received_type = type(msg)
    self.logger.debug('received message of type %s from pid %d', received_type.__name__, msg.pid)

    if received_type is message.MessageCanaryInit:
        self.logger.info("Canary zygote initialized. Transitioning idle workers.")
        # This is not the canary zygote anymore.
        self.current_zygote.canary = False
        # Release the handle on the previous zygote; it stays in the
        # zygote_collection for accounting purposes, but we no longer
        # need to track it here.
        self.prev_zygote = None
        # The canary came up fine, so start transitioning workers.
        self.io_loop.add_callback(self.transition_idle_workers)
        return

    if received_type is message.MessageWorkerStart:
        # One of our zygotes spawned a worker; register it with its
        # parent zygote along with the time it was created.
        spawning_zygote = self.zygote_collection[msg.worker_ppid]
        if spawning_zygote:
            spawning_zygote.add_worker(msg.pid, msg.time_created)
        return

    if received_type is message.MessageWorkerExitInitFail:
        # While a canary is running, an init failure is tolerated here.
        if self.current_zygote.canary:
            return
        self.logger.error("A worker initialization failed, giving up")
        self.stop()
        return

    if received_type is message.MessageWorkerExit:
        # A worker exited: ask a zygote we still care about for a
        # replacement, or ask any other zygote to shut down.
        exited_from = self.zygote_collection[msg.pid]
        if not exited_from:
            return
        exited_from.remove_worker(msg.child_pid)

        if exited_from.shutting_down:
            self.logger.debug('Removed a worker from shutting down zygote %d, %d left', msg.pid, len(exited_from.workers()))
            return
        self.logger.debug('Removed a worker from zygote %d, %d left', msg.pid, len(exited_from.workers()))

        if self.stopped:
            return
        tracked = exited_from in (self.current_zygote, self.prev_zygote)
        if not tracked:
            # Not a zygote that we care about. Request shutdown.
            exited_from.request_shut_down()
            return
        # Only start a new worker if we're below quota. This is how we
        # scale down the number of workers.
        if self.num_workers > exited_from.worker_count:
            exited_from.request_spawn()
        return

    if received_type is message.MessageHTTPBegin:
        # A worker started servicing an HTTP request.
        serving = self.zygote_collection.get_worker(msg.pid)
        if serving:
            serving.start_request(msg.remote_ip, msg.http_line)
        return

    if received_type is message.MessageHTTPEnd:
        # A worker finished servicing an HTTP request.
        finished = self.zygote_collection.get_worker(msg.pid)
        if not finished:
            return
        finished.end_request()
        if self.max_requests is None:
            return
        if finished.request_count >= self.max_requests:
            self.logger.info('Worker %d reached max_requests %d, killing it', finished.pid, self.max_requests)
            safe_kill(finished.pid, signal.SIGQUIT)
        return

    self.logger.warning('master got unexpected message of type %s', received_type)