Example #1
0
    def startWorker(self):
        log.msg("Heartbeat monitor initializing")
        config = self.get_static_config()

        self.deadline = config.deadline

        redis_config = config.redis_manager
        self._redis = yield TxRedisManager.from_config(redis_config)
        self._storage = Storage(self._redis)

        self._systems, self._workers = self.parse_config(
            config.monitored_systems)

        # Start consuming heartbeats
        yield self.consume("heartbeat.inbound", self._consume_message,
                           exchange_name='vumi.health',
                           message_class=HeartBeatMessage)

        self._start_task()
Example #2
0
class HeartBeatMonitor(BaseWorker):

    class CONFIG_CLASS(BaseWorker.CONFIG_CLASS):
        deadline = ConfigInt(
            "Check-in deadline for participating workers",
            required=True, static=True)
        redis_manager = ConfigDict(
            "Redis client configuration.",
            required=True, static=True)
        monitored_systems = ConfigDict(
            "Tree of systems and workers.",
            required=True, static=True)

    _task = None

    @inlineCallbacks
    def startWorker(self):
        log.msg("Heartbeat monitor initializing")
        config = self.get_static_config()

        self.deadline = config.deadline

        redis_config = config.redis_manager
        self._redis = yield TxRedisManager.from_config(redis_config)
        self._storage = Storage(self._redis)

        self._systems, self._workers = self.parse_config(
            config.monitored_systems)

        # Start consuming heartbeats
        yield self.consume("heartbeat.inbound", self._consume_message,
                           exchange_name='vumi.health',
                           message_class=HeartBeatMessage)

        self._start_task()

    @inlineCallbacks
    def stopWorker(self):
        log.msg("HeartBeat: stopping worker")
        if self._task:
            self._task.stop()
            self._task = None
            yield self._task_done
        self._redis.close_manager()

    def parse_config(self, config):
        """
        Parse configuration and populate in-memory state
        """
        systems = []
        workers = {}
        # loop over each defined system
        for sys in config.values():
            assert_field(sys, 'workers')
            assert_field(sys, 'system_id')
            system_id = sys['system_id']
            system_workers = []
            # loop over each defined worker in the system
            for wkr_entry in sys['workers'].values():
                assert_field(wkr_entry, 'name')
                assert_field(wkr_entry, 'min_procs')
                worker_name = wkr_entry['name']
                min_procs = wkr_entry['min_procs']
                wkr = Worker(system_id,
                             worker_name,
                             min_procs)
                workers[wkr.worker_id] = wkr
                system_workers.append(wkr)
            systems.append(System(system_id, system_id, system_workers))
        return systems, workers

    def update(self, msg):
        """
        Process a heartbeat message.
        """
        worker_id = msg['worker_id']
        timestamp = msg['timestamp']
        hostname = msg['hostname']
        pid = msg['pid']

        # A bunch of discard rules:
        # 1. Unknown worker (Monitored workers need to be in the config)
        # 2. Message which are too old.
        wkr = self._workers.get(worker_id, None)
        if wkr is None:
            log.msg("Discarding message. worker '%s' is unknown" % worker_id)
            return
        if timestamp < (time.time() - self.deadline):
            log.msg("Discarding heartbeat from '%s'. Too old" % worker_id)
            return

        wkr.record(hostname, pid)

    @inlineCallbacks
    def _sync_to_storage(self):
        """
        Write systems data to storage
        """
        # write system ids
        system_ids = [sys.system_id for sys in self._systems]
        yield self._storage.add_system_ids(system_ids)
        # dump each system
        for sys in self._systems:
            yield self._storage.write_system(sys)

    @inlineCallbacks
    def _periodic_task(self):
        """
        Iterate over worker instance sets and check to see whether any have not
        checked-in on time.

        We call snapshot() first, since the execution of tasks here is
        interleaved with the processing of worker heartbeat messages.
        """
        # snapshot the the set of checked-in instances
        for wkr in self._workers.values():
            wkr.snapshot()
        # run diagnostic audits on all workers
        for wkr in self._workers.values():
            yield wkr.audit(self._storage)
        # write everything to redis
        yield self._sync_to_storage()

    def _start_task(self):
        """Create a timer task to check for missing worker"""
        self._task = LoopingCall(self._periodic_task)
        self._task_done = self._task.start(self.deadline, now=False)
        errfn = lambda failure: log.err(failure,
                                        "Heartbeat verify: timer task died")
        self._task_done.addErrback(errfn)

    def _consume_message(self, msg):
        log.msg("Received message: %s" % msg)
        self.update(msg.payload)