Example #1
import unittest
import uuid

from twisted.internet import defer

# FakeState, ControllerStore, HealthMonitor, InstanceStates, and
# InstanceHealthState come from the project under test; their import lines
# are omitted in this snippet.
class HeartbeatMonitorTests(unittest.TestCase):
    def setUp(self):
        self.state = FakeState(ControllerStore())
        self.monitor = None

    @defer.inlineCallbacks
    def test_recovery(self):

        self.monitor = HealthMonitor(self.state,
                                     boot_seconds=10,
                                     missing_seconds=5,
                                     zombie_seconds=10,
                                     init_time=100)
        nodes = ["n" + str(i + 1) for i in range(7)]
        n1, n2, n3, n4, n5, n6, n7 = nodes

        # set up some instances that reached their iaas_state before the
        # init time (100)

        # this one has been running for well longer than the missing timeout
        # and no heartbeat will arrive for it. It shouldn't be marked
        # MISSING until more than 5 seconds after the init_time
        self.state.new_fake_instance_state(n1, InstanceStates.RUNNING, 50,
                                           InstanceHealthState.OK)

        # this one had been running for 10 seconds before the init time, but
        # we have never received a heartbeat. It should be marked MISSING
        # once the boot timeout expires, counted from the init time.
        self.state.new_fake_instance_state(n2, InstanceStates.RUNNING, 90,
                                           InstanceHealthState.UNKNOWN)

        # this one is terminated, so nothing should happen to it
        self.state.new_fake_instance_state(n3, InstanceStates.TERMINATED, 90,
                                           InstanceHealthState.UNKNOWN)

        # this one will get a heartbeat at 110, just before it would be
        # marked MISSING
        self.state.new_fake_instance_state(n4, InstanceStates.RUNNING, 95,
                                           InstanceHealthState.UNKNOWN)

        # this one will get a heartbeat at 105, just before it would be
        # marked MISSING
        self.state.new_fake_instance_state(n5, InstanceStates.RUNNING, 95,
                                           InstanceHealthState.OK)

        # this instance was already marked as errored before the recovery
        self.state.new_fake_instance_state(n6, InstanceStates.RUNNING, 95,
                                           InstanceHealthState.PROCESS_ERROR)

        # this instance was a ZOMBIE; on recovery it should initially be
        # marked back as UNKNOWN, and if a heartbeat arrives it should be
        # ZOMBIE again
        self.state.new_fake_instance_state(n7, InstanceStates.TERMINATED, 80,
                                           InstanceHealthState.ZOMBIE)

        yield self.monitor.update(100)
        self.assertNodeState(InstanceHealthState.OK, n1, n5)
        self.assertNodeState(InstanceHealthState.UNKNOWN, n2, n3, n4, n7)
        self.assertNodeState(InstanceHealthState.PROCESS_ERROR, n6)

        yield self.monitor.update(105)
        self.assertNodeState(InstanceHealthState.OK, n1, n5)
        self.assertNodeState(InstanceHealthState.UNKNOWN, n2, n3, n4, n7)
        self.assertNodeState(InstanceHealthState.PROCESS_ERROR, n6)

        self.ok_heartbeat(n5, 105)
        self.ok_heartbeat(n7, 105)  # this one will be relabeled as a zombie

        self.err_heartbeat(n6, 105, procs=['a'])
        yield self.monitor.update(106)
        self.assertNodeState(InstanceHealthState.OK, n5)
        self.assertNodeState(InstanceHealthState.MISSING, n1)
        self.assertNodeState(InstanceHealthState.UNKNOWN, n2, n3, n4)
        self.assertNodeState(InstanceHealthState.PROCESS_ERROR, n6)
        self.assertNodeState(InstanceHealthState.ZOMBIE, n7)

        self.ok_heartbeat(n5, 110)
        yield self.monitor.update(110)
        self.assertNodeState(InstanceHealthState.OK, n5)
        self.assertNodeState(InstanceHealthState.MISSING, n1)
        self.assertNodeState(InstanceHealthState.UNKNOWN, n2, n3, n4)
        self.assertNodeState(InstanceHealthState.PROCESS_ERROR, n6)
        self.assertNodeState(InstanceHealthState.ZOMBIE, n7)

        self.ok_heartbeat(n4, 110)
        self.err_heartbeat(n6, 110, procs=['a'])
        yield self.monitor.update(111)
        self.assertNodeState(InstanceHealthState.OK, n5, n4)
        self.assertNodeState(InstanceHealthState.MISSING, n1, n2)
        self.assertNodeState(InstanceHealthState.UNKNOWN, n3)
        self.assertNodeState(InstanceHealthState.PROCESS_ERROR, n6)
        self.assertNodeState(InstanceHealthState.ZOMBIE, n7)

    @defer.inlineCallbacks
    def test_basic(self):
        self.monitor = HealthMonitor(self.state,
                                     boot_seconds=10,
                                     missing_seconds=5,
                                     zombie_seconds=10,
                                     init_time=0)

        nodes = [str(uuid.uuid4()) for _ in range(3)]
        n1, n2, n3 = nodes

        # not using real timestamps
        now = 0

        for n in nodes:
            self.state.new_fake_instance_state(n, InstanceStates.RUNNING, now)

        # all nodes are running but haven't been heard from
        self.assertNodeState(InstanceHealthState.UNKNOWN, *nodes)
        yield self.monitor.update(now)
        self.assertNodeState(InstanceHealthState.UNKNOWN, *nodes)

        now = 5
        yield self.monitor.update(now)
        self.assertNodeState(InstanceHealthState.UNKNOWN, *nodes)

        # first heartbeat arrives from n1
        yield self.ok_heartbeat(n1, now)
        self.assertNodeState(InstanceHealthState.OK, n1)

        now = 10
        yield self.monitor.update(now)

        self.assertNodeState(InstanceHealthState.OK, n1)
        self.assertNodeState(InstanceHealthState.UNKNOWN, n2, n3)

        yield self.ok_heartbeat(n1, now)  # n1 makes it in under the wire
        yield self.ok_heartbeat(n2, now)
        now = 11
        yield self.monitor.update(now)
        self.assertNodeState(InstanceHealthState.OK, n1, n2)
        self.assertNodeState(InstanceHealthState.MISSING, n3)

        yield self.ok_heartbeat(n3, now)
        self.assertNodeState(InstanceHealthState.OK, *nodes)

        # now we don't hear from n2 for a while; it should go MISSING
        now = 13
        yield self.ok_heartbeat(n1, now)

        now = 16
        yield self.monitor.update(now)
        self.assertNodeState(InstanceHealthState.OK, n1, n3)
        self.assertNodeState(InstanceHealthState.MISSING, n2)

        yield self.ok_heartbeat(n2, now)
        self.assertNodeState(InstanceHealthState.OK, *nodes)

        now = 20

        # roll all nodes to terminated in IaaS
        for n in nodes:
            self.state.new_fake_instance_state(n, InstanceStates.TERMINATED,
                                               now)

        # it's been longer than the missing window for n1, but that
        # shouldn't matter now that the nodes are terminated
        yield self.monitor.update(now)
        self.assertNodeState(InstanceHealthState.OK, *nodes)

        now = 30
        yield self.ok_heartbeat(n1, now)
        yield self.monitor.update(now)
        # not a zombie yet
        self.assertNodeState(InstanceHealthState.OK, *nodes)

        now = 31
        yield self.monitor.update(now)
        self.assertNodeState(InstanceHealthState.OK, n1)

        yield self.ok_heartbeat(n1, now)
        yield self.monitor.update(now)
        self.assertNodeState(InstanceHealthState.ZOMBIE, n1)

        now = 42
        yield self.monitor.update(now)
        self.assertNodeState(InstanceHealthState.UNKNOWN, n1)

    @defer.inlineCallbacks
    def test_error(self):
        self.monitor = HealthMonitor(self.state,
                                     boot_seconds=10,
                                     missing_seconds=5,
                                     zombie_seconds=10,
                                     init_time=0)

        node = str(uuid.uuid4())

        now = 1
        self.state.new_fake_instance_state(node, InstanceStates.RUNNING, now)
        yield self.ok_heartbeat(node, now)
        yield self.monitor.update(now)
        self.assertNodeState(InstanceHealthState.OK, node)

        now = 5
        yield self.err_heartbeat(node, now)
        self.assertNodeState(InstanceHealthState.MONITOR_ERROR, node)
        errors = self.state.instances[node].errors
        self.assertEqual(len(errors), 1)
        self.assertEqual(errors[0], 'faiiiill')

        yield self.monitor.update(now)
        self.assertNodeState(InstanceHealthState.MONITOR_ERROR, node)

    @defer.inlineCallbacks
    def test_process_error(self):
        self.monitor = HealthMonitor(self.state,
                                     boot_seconds=10,
                                     missing_seconds=5,
                                     zombie_seconds=10,
                                     init_time=0)

        node = str(uuid.uuid4())

        now = 1
        self.state.new_fake_instance_state(node, InstanceStates.RUNNING, now)
        yield self.ok_heartbeat(node, now)
        yield self.monitor.update(now)
        self.assertNodeState(InstanceHealthState.OK, node)

        now = 5
        procs = [{
            'name': 'proc1',
            'stderr': 'faaaaaail',
            'state': 100,
            'exitcode': -1,
            'stop_timestamp': 25242
        }]
        yield self.err_heartbeat(node, now, procs)
        yield self.monitor.update(now)
        self.assertNodeState(InstanceHealthState.PROCESS_ERROR, node)
        errors = self.state.instances[node].errors
        self.assertEqual(len(errors), 1)
        self.assertEqual(errors[0]['stderr'], 'faaaaaail')
        procs[0].pop('stderr')

        now = 8
        yield self.err_heartbeat(node, now, procs)
        self.assertNodeState(InstanceHealthState.PROCESS_ERROR, node)
        errors = self.state.instances[node].errors
        self.assertEqual(len(errors), 1)
        self.assertEqual(errors[0]['stderr'], 'faaaaaail')

    def assertNodeState(self, state, *node_ids):
        for n in node_ids:
            self.assertEqual(state, self.state.instances[n].health)

    def ok_heartbeat(self, node_id, timestamp):
        msg = {
            'node_id': node_id,
            'timestamp': timestamp,
            'state': InstanceHealthState.OK
        }
        return self.monitor.new_heartbeat(msg, timestamp)

    def err_heartbeat(self, node_id, timestamp, procs=None):

        msg = {
            'node_id': node_id,
            'timestamp': timestamp,
        }
        if procs:
            msg['state'] = InstanceHealthState.PROCESS_ERROR
            msg['failed_processes'] = procs
        else:
            msg['state'] = InstanceHealthState.MONITOR_ERROR
            msg['error'] = 'faiiiill'

        return self.monitor.new_heartbeat(msg, timestamp)
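
For reference, the heartbeat messages built by the ok_heartbeat and
err_heartbeat helpers above have the following shape. This is a minimal
sketch: the field names come from the helpers and from test_process_error,
while the literal values are illustrative only.

# OK heartbeat, as built by ok_heartbeat()
msg = {
    'node_id': 'n1',                    # which instance sent the beat
    'timestamp': 105,                   # sender-side timestamp
    'state': InstanceHealthState.OK,
}

# process-failure heartbeat, as built by err_heartbeat(..., procs=[...])
msg = {
    'node_id': 'n6',
    'timestamp': 105,
    'state': InstanceHealthState.PROCESS_ERROR,
    'failed_processes': [{'name': 'proc1', 'stderr': 'faaaaaail',
                          'state': 100, 'exitcode': -1,
                          'stop_timestamp': 25242}],
}

# either message is then delivered with:
#     monitor.new_heartbeat(msg, timestamp)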
Example #2
import copy

from twisted.internet import defer
from twisted.internet.task import LoopingCall

# ControllerCoreState, ControllerStore, ControllerCoreControl, EngineLoader,
# HealthMonitor, InstanceStates, log, and the *_KEY configuration constants
# come from the surrounding project; their import lines are omitted in this
# snippet.
class ControllerCore(object):
    """Controller functionality that is not specific to the messaging layer.
    """

    def __init__(self, provisioner_client, engineclass, controller_name,
                 conf=None, state=None, store=None):

        if state:
            self.state = state
        else:
            self.state = ControllerCoreState(store or ControllerStore())

        prov_vars = None
        health_kwargs = None
        if conf:
            if PROVISIONER_VARS_KEY in conf:
                prov_vars = conf[PROVISIONER_VARS_KEY]

            if conf.get(MONITOR_HEALTH_KEY):
                health_kwargs = {}
                if HEALTH_BOOT_KEY in conf:
                    health_kwargs["boot_seconds"] = conf[HEALTH_BOOT_KEY]
                if HEALTH_MISSING_KEY in conf:
                    health_kwargs["missing_seconds"] = conf[HEALTH_MISSING_KEY]
                if HEALTH_ZOMBIE_KEY in conf:
                    health_kwargs["zombie_seconds"] = conf[HEALTH_ZOMBIE_KEY]
        self.conf = conf

        if health_kwargs is not None:
            self.health_monitor = HealthMonitor(self.state, **health_kwargs)
        else:
            self.health_monitor = None

        # Only one 'reconfigure' or 'decide' engine call may ever run at a
        # time. The 'decide' call is triggered by a timed looping call, while
        # 'reconfigure' can be triggered asynchronously at any moment.
        self.busy = defer.DeferredSemaphore(1)
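        # (run_decide below wraps engine.decide in self.busy.run; a
        # 'reconfigure' path, not shown in this snippet, would acquire the
        # same semaphore)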

        self.provisioner_client = provisioner_client

        health_not_checked = self.health_monitor is None
        self.control = ControllerCoreControl(
            provisioner_client, self.state, prov_vars, controller_name, health_not_checked=health_not_checked
        )
        self.engine = EngineLoader().load(engineclass)

        self.control_loop = None

    def new_sensor_info(self, content):
        """Handle an incoming sensor message

        @param content Raw sensor content
        @retval Deferred
        """
        return self.state.new_sensor_item(content)

    def new_instance_state(self, content):
        """Handle an incoming instance state message

        @param content Raw instance state content
        @retval Deferred
        """
        return self.state.new_instance_state(content)

    def new_heartbeat(self, content):
        """Handle an incoming heartbeat message

        @param content Raw heartbeat content
        @retval Deferred
        """
        if self.health_monitor:
            return self.health_monitor.new_heartbeat(content)
        else:
            return defer.succeed(None)

    def begin_controlling(self):
        """Call the decision engine at the appropriate times.
        """
        log.debug("Starting engine decision loop - %s second interval", self.control.sleep_seconds)
        self.control_loop = LoopingCall(self.run_decide)
        self.control_loop.start(self.control.sleep_seconds, now=False)

    def run_recovery(self):
        """Recover instance and sensor states. This must run before new info
        starts arriving.
        """
        return self.state.recover()

    @defer.inlineCallbacks
    def run_initialize(self):
        """Performs initialization routines that may require async processing
        """

        # to make absolutely certain we have the latest records for
        # instances, we ask the provisioner to dump its state
        instance_ids = []
        for instance in self.state.instances.itervalues():
            if instance.state < InstanceStates.TERMINATED:
                instance_ids.append(instance.instance_id)

        if instance_ids:
            yield self.provisioner_client.dump_state(
                nodes=instance_ids,
                force_subscribe=self.control.controller_name)

        engine_state = self.state.get_engine_state()

        # engines can be reconfigured after boot. If this is a recovery
        # situation we need to make sure to use the latest config, which
        # may be different from the one used in the initial boot. So, the
        # initial config is used and any reconfigured values are folded in
        # before engine configuration is called. This means that engines
        # must be able to handle the same values in both configure and
        # reconfigure.
        extraconf = yield self.state.get_engine_extraconf()

        if self.conf:
            engine_conf = copy.deepcopy(self.conf)
            engine_conf.update(extraconf)
        else:
            engine_conf = extraconf

        # DE routines can optionally return a Deferred
        yield defer.maybeDeferred(self.engine.initialize, self.control,
                                  engine_state, engine_conf)

    @defer.inlineCallbacks
    def run_decide(self):

        # allow the health monitor to update instance states (MISSING, etc.)
        if self.health_monitor:
            yield self.health_monitor.update()

        engine_state = self.state.get_engine_state()
        try:
            yield self.busy.run(self.engine.decide, self.control, engine_state)
        except Exception as e:
            log.error("Error in engine decide call: %s", str(e), exc_info=True)