def startWorker(self):
    """
    Bring the heartbeat monitor up.

    Reads the static config, records the check-in deadline, obtains the
    Redis manager and wraps it in a Storage, parses the monitored systems
    into in-memory state, starts consuming heartbeat messages and finally
    starts the periodic task.

    NOTE(review): this is a generator (it yields on the Redis manager
    creation and on the consumer setup); no decorator is visible on this
    definition, presumably it is meant to be driven by inlineCallbacks --
    confirm.
    """
    log.msg("Heartbeat monitor initializing")
    static_cfg = self.get_static_config()
    self.deadline = static_cfg.deadline

    # The manager is the result of yielding on from_config(); the same
    # object backs the Storage wrapper.
    redis_manager = yield TxRedisManager.from_config(static_cfg.redis_manager)
    self._redis = redis_manager
    self._storage = Storage(redis_manager)

    parsed_systems, parsed_workers = self.parse_config(
        static_cfg.monitored_systems)
    self._systems = parsed_systems
    self._workers = parsed_workers

    # Start consuming heartbeats
    yield self.consume(
        "heartbeat.inbound",
        self._consume_message,
        exchange_name='vumi.health',
        message_class=HeartBeatMessage,
    )

    self._start_task()
class HeartBeatMonitor(BaseWorker):
    """
    Worker that consumes heartbeat messages and periodically audits the
    configured workers, writing the resulting system state to storage
    backed by Redis.
    """

    class CONFIG_CLASS(BaseWorker.CONFIG_CLASS):
        # Also used as the interval of the periodic audit task and as the
        # maximum age (compared against time.time()) of accepted heartbeats.
        deadline = ConfigInt(
            "Check-in deadline for participating workers",
            required=True, static=True)
        redis_manager = ConfigDict(
            "Redis client configuration.",
            required=True, static=True)
        monitored_systems = ConfigDict(
            "Tree of systems and workers.",
            required=True, static=True)

    # LoopingCall driving _periodic_task; None while no task is active.
    _task = None

    @inlineCallbacks
    def startWorker(self):
        """
        Set up storage and in-memory state, start consuming heartbeats and
        start the periodic audit task.
        """
        log.msg("Heartbeat monitor initializing")
        config = self.get_static_config()

        self.deadline = config.deadline

        redis_config = config.redis_manager
        self._redis = yield TxRedisManager.from_config(redis_config)
        self._storage = Storage(self._redis)

        self._systems, self._workers = self.parse_config(
            config.monitored_systems)

        # Start consuming heartbeats
        yield self.consume("heartbeat.inbound", self._consume_message,
                           exchange_name='vumi.health',
                           message_class=HeartBeatMessage)

        self._start_task()

    @inlineCallbacks
    def stopWorker(self):
        """
        Stop the periodic task (if one was started), wait for it to finish
        and close the Redis manager.
        """
        log.msg("HeartBeat: stopping worker")
        task, self._task = self._task, None
        if task is not None:
            # If _periodic_task failed earlier the LoopingCall has already
            # stopped itself; LoopingCall.stop() asserts that the loop is
            # running, so calling it then would raise and the Redis manager
            # below would never be closed.
            if task.running:
                task.stop()
            yield self._task_done
        self._redis.close_manager()

    def parse_config(self, config):
        """
        Parse configuration and populate in-memory state.

        :param config:
            Mapping whose values each describe one system; every system
            must have 'system_id' and 'workers' fields, and every value of
            'workers' must have 'name' and 'min_procs' fields (checked with
            assert_field).
        :returns:
            A ``(systems, workers)`` tuple: a list of System objects and a
            dict mapping each worker's ``worker_id`` to its Worker object.
        """
        systems = []
        workers = {}
        # loop over each defined system
        for system_cfg in config.values():
            assert_field(system_cfg, 'workers')
            assert_field(system_cfg, 'system_id')
            system_id = system_cfg['system_id']
            system_workers = []
            # loop over each defined worker in the system
            for wkr_entry in system_cfg['workers'].values():
                assert_field(wkr_entry, 'name')
                assert_field(wkr_entry, 'min_procs')
                worker_name = wkr_entry['name']
                min_procs = wkr_entry['min_procs']
                wkr = Worker(system_id, worker_name, min_procs)
                workers[wkr.worker_id] = wkr
                system_workers.append(wkr)
            # The system id is passed for both of System's first two
            # arguments.
            systems.append(System(system_id, system_id, system_workers))
        return systems, workers

    def update(self, msg):
        """
        Process a heartbeat message.

        :param msg:
            Mapping with 'worker_id', 'timestamp', 'hostname' and 'pid'
            keys.
        """
        worker_id = msg['worker_id']
        timestamp = msg['timestamp']
        hostname = msg['hostname']
        pid = msg['pid']

        # A bunch of discard rules:
        # 1. Unknown worker (monitored workers need to be in the config)
        # 2. Messages which are too old.
        wkr = self._workers.get(worker_id, None)
        if wkr is None:
            log.msg("Discarding message. worker '%s' is unknown" % worker_id)
            return
        if timestamp < (time.time() - self.deadline):
            log.msg("Discarding heartbeat from '%s'. Too old" % worker_id)
            return

        wkr.record(hostname, pid)

    @inlineCallbacks
    def _sync_to_storage(self):
        """
        Write systems data to storage.
        """
        # write system ids
        system_ids = [system.system_id for system in self._systems]
        yield self._storage.add_system_ids(system_ids)
        # dump each system
        for system in self._systems:
            yield self._storage.write_system(system)

    @inlineCallbacks
    def _periodic_task(self):
        """
        Iterate over worker instance sets and check to see whether any have
        not checked-in on time.

        We call snapshot() first, since the execution of tasks here is
        interleaved with the processing of worker heartbeat messages.
        """
        # snapshot the set of checked-in instances
        for wkr in self._workers.values():
            wkr.snapshot()
        # run diagnostic audits on all workers
        for wkr in self._workers.values():
            yield wkr.audit(self._storage)
        # write everything to redis
        yield self._sync_to_storage()

    def _start_task(self):
        """Create a timer task to check for missing workers."""

        def log_task_failure(failure):
            # Log the failure that stopped the timer task.
            return log.err(failure, "Heartbeat verify: timer task died")

        self._task = LoopingCall(self._periodic_task)
        # now=False: the first run happens one deadline interval after start.
        self._task_done = self._task.start(self.deadline, now=False)
        self._task_done.addErrback(log_task_failure)

    def _consume_message(self, msg):
        """Log a received heartbeat message and process its payload."""
        log.msg("Received message: %s" % msg)
        self.update(msg.payload)