class HeartbeatMonitorTests(unittest.TestCase):
    def setUp(self):
        self.state = FakeState(ControllerStore())
        self.monitor = None

    @defer.inlineCallbacks
    def test_recovery(self):
        self.monitor = HealthMonitor(self.state, boot_seconds=10,
                                     missing_seconds=5, zombie_seconds=10,
                                     init_time=100)

        nodes = ["n" + str(i + 1) for i in range(7)]
        n1, n2, n3, n4, n5, n6, n7 = nodes

        # set up some instances that reached their iaas_state before the
        # init time (100)

        # this one has been running for well longer than the missing timeout
        # and we have never received a heartbeat. It shouldn't be marked
        # MISSING until more than 5 seconds after the init_time
        self.state.new_fake_instance_state(n1, InstanceStates.RUNNING, 50,
                                           InstanceHealthState.OK)

        # this one has been running for 10 seconds before the init time but
        # we have never received a heartbeat. It should be marked as MISSING
        # after the boot timeout expires, starting from the init time.
        self.state.new_fake_instance_state(n2, InstanceStates.RUNNING, 90,
                                           InstanceHealthState.UNKNOWN)

        # this one is terminated and nothing should happen
        self.state.new_fake_instance_state(n3, InstanceStates.TERMINATED, 90,
                                           InstanceHealthState.UNKNOWN)

        # this one will get a heartbeat at 110, just before it would be
        # marked MISSING
        self.state.new_fake_instance_state(n4, InstanceStates.RUNNING, 95,
                                           InstanceHealthState.UNKNOWN)

        # this one will get a heartbeat at 105, just before it would be
        # marked MISSING
        self.state.new_fake_instance_state(n5, InstanceStates.RUNNING, 95,
                                           InstanceHealthState.OK)

        # this instance was already marked as errored before the recovery
        self.state.new_fake_instance_state(n6, InstanceStates.RUNNING, 95,
                                           InstanceHealthState.PROCESS_ERROR)

        # this instance was a ZOMBIE, it should be initially marked back as
        # UNKNOWN and then if a heartbeat arrives it should be ZOMBIE again
        self.state.new_fake_instance_state(n7, InstanceStates.TERMINATED, 80,
                                           InstanceHealthState.ZOMBIE)

        yield self.monitor.update(100)
        self.assertNodeState(InstanceHealthState.OK, n1, n5)
        self.assertNodeState(InstanceHealthState.UNKNOWN, n2, n3, n4, n7)
        self.assertNodeState(InstanceHealthState.PROCESS_ERROR, n6)

        yield self.monitor.update(105)
        self.assertNodeState(InstanceHealthState.OK, n1, n5)
        self.assertNodeState(InstanceHealthState.UNKNOWN, n2, n3, n4, n7)
        self.assertNodeState(InstanceHealthState.PROCESS_ERROR, n6)

        self.ok_heartbeat(n5, 105)
        self.ok_heartbeat(n7, 105)  # this one will be relabeled as a zombie
        self.err_heartbeat(n6, 105, procs=['a'])

        yield self.monitor.update(106)
        self.assertNodeState(InstanceHealthState.OK, n5)
        self.assertNodeState(InstanceHealthState.MISSING, n1)
        self.assertNodeState(InstanceHealthState.UNKNOWN, n2, n3, n4)
        self.assertNodeState(InstanceHealthState.PROCESS_ERROR, n6)
        self.assertNodeState(InstanceHealthState.ZOMBIE, n7)

        self.ok_heartbeat(n5, 110)
        yield self.monitor.update(110)
        self.assertNodeState(InstanceHealthState.OK, n5)
        self.assertNodeState(InstanceHealthState.MISSING, n1)
        self.assertNodeState(InstanceHealthState.UNKNOWN, n2, n3, n4)
        self.assertNodeState(InstanceHealthState.PROCESS_ERROR, n6)
        self.assertNodeState(InstanceHealthState.ZOMBIE, n7)

        self.ok_heartbeat(n4, 110)
        self.err_heartbeat(n6, 110, procs=['a'])
        yield self.monitor.update(111)
        self.assertNodeState(InstanceHealthState.OK, n5, n4)
        self.assertNodeState(InstanceHealthState.MISSING, n1, n2)
        self.assertNodeState(InstanceHealthState.UNKNOWN, n3)
        self.assertNodeState(InstanceHealthState.PROCESS_ERROR, n6)
        self.assertNodeState(InstanceHealthState.ZOMBIE, n7)
    @defer.inlineCallbacks
    def test_basic(self):
        self.monitor = HealthMonitor(self.state, boot_seconds=10,
                                     missing_seconds=5, zombie_seconds=10,
                                     init_time=0)

        nodes = [str(uuid.uuid4()) for i in range(3)]
        n1, n2, n3 = nodes

        # not using real timestamps
        now = 0

        for n in nodes:
            self.state.new_fake_instance_state(n, InstanceStates.RUNNING, now)

        # all nodes are running but haven't been heard from
        self.assertNodeState(InstanceHealthState.UNKNOWN, *nodes)

        yield self.monitor.update(now)
        self.assertNodeState(InstanceHealthState.UNKNOWN, *nodes)

        now = 5
        yield self.monitor.update(now)
        self.assertNodeState(InstanceHealthState.UNKNOWN, *nodes)

        # first heartbeat to n1
        yield self.ok_heartbeat(n1, now)
        self.assertNodeState(InstanceHealthState.OK, n1)

        now = 10
        yield self.monitor.update(now)
        self.assertNodeState(InstanceHealthState.OK, n1)
        self.assertNodeState(InstanceHealthState.UNKNOWN, n2, n3)

        yield self.ok_heartbeat(n1, now)  # n1 makes it in under the wire
        yield self.ok_heartbeat(n2, now)
        now = 11
        yield self.monitor.update(now)
        self.assertNodeState(InstanceHealthState.OK, n1, n2)
        self.assertNodeState(InstanceHealthState.MISSING, n3)

        yield self.ok_heartbeat(n3, now)
        self.assertNodeState(InstanceHealthState.OK, *nodes)

        # ok don't hear from n2 for a while, should go missing
        now = 13
        yield self.ok_heartbeat(n1, now)

        now = 16
        yield self.monitor.update(now)
        self.assertNodeState(InstanceHealthState.OK, n1, n3)
        self.assertNodeState(InstanceHealthState.MISSING, n2)

        yield self.ok_heartbeat(n2, now)
        self.assertNodeState(InstanceHealthState.OK, *nodes)

        now = 20

        # roll all nodes to terminated in IaaS
        for n in nodes:
            self.state.new_fake_instance_state(n, InstanceStates.TERMINATED,
                                               now)

        # been longer than missing window for n1 but shouldn't matter
        yield self.monitor.update(now)
        self.assertNodeState(InstanceHealthState.OK, *nodes)

        now = 30
        yield self.ok_heartbeat(n1, now)
        yield self.monitor.update(now)
        # not a zombie yet
        self.assertNodeState(InstanceHealthState.OK, *nodes)

        now = 31
        yield self.monitor.update(now)
        self.assertNodeState(InstanceHealthState.OK, n1)

        yield self.ok_heartbeat(n1, now)
        yield self.monitor.update(now)
        self.assertNodeState(InstanceHealthState.ZOMBIE, n1)

        now = 42
        yield self.monitor.update(now)
        self.assertNodeState(InstanceHealthState.UNKNOWN, n1)

    @defer.inlineCallbacks
    def test_error(self):
        self.monitor = HealthMonitor(self.state, boot_seconds=10,
                                     missing_seconds=5, zombie_seconds=10,
                                     init_time=0)

        node = str(uuid.uuid4())
        now = 1
        self.state.new_fake_instance_state(node, InstanceStates.RUNNING, now)
        yield self.ok_heartbeat(node, now)
        yield self.monitor.update(now)
        self.assertNodeState(InstanceHealthState.OK, node)

        now = 5
        yield self.err_heartbeat(node, now)
        self.assertNodeState(InstanceHealthState.MONITOR_ERROR, node)
        errors = self.state.instances[node].errors
        self.assertEqual(len(errors), 1)
        self.assertEqual(errors[0], 'faiiiill')

        yield self.monitor.update(now)
        self.assertNodeState(InstanceHealthState.MONITOR_ERROR, node)

    @defer.inlineCallbacks
    def test_process_error(self):
        self.monitor = HealthMonitor(self.state, boot_seconds=10,
                                     missing_seconds=5, zombie_seconds=10,
                                     init_time=0)

        node = str(uuid.uuid4())
        now = 1
        self.state.new_fake_instance_state(node, InstanceStates.RUNNING, now)
        yield self.ok_heartbeat(node, now)
        yield self.monitor.update(now)
        self.assertNodeState(InstanceHealthState.OK, node)

        now = 5
        procs = [{'name': 'proc1',
                  'stderr': 'faaaaaail',
                  'state': 100,
                  'exitcode': -1,
                  'stop_timestamp': 25242}]
        yield self.err_heartbeat(node, now, procs)
        yield self.monitor.update(now)
        self.assertNodeState(InstanceHealthState.PROCESS_ERROR, node)
        errors = self.state.instances[node].errors
        self.assertEqual(len(errors), 1)
        self.assertEqual(errors[0]['stderr'], 'faaaaaail')
        procs[0].pop('stderr')

        now = 8
        yield self.err_heartbeat(node, now, procs)
        self.assertNodeState(InstanceHealthState.PROCESS_ERROR, node)
        errors = self.state.instances[node].errors
        self.assertEqual(len(errors), 1)
        self.assertEqual(errors[0]['stderr'], 'faaaaaail')

    def assertNodeState(self, state, *node_ids):
        for n in node_ids:
            self.assertEqual(state, self.state.instances[n].health)

    def ok_heartbeat(self, node_id, timestamp):
        msg = {'node_id': node_id, 'timestamp': timestamp,
               'state': InstanceHealthState.OK}
        return self.monitor.new_heartbeat(msg, timestamp)

    def err_heartbeat(self, node_id, timestamp, procs=None):
        msg = {'node_id': node_id, 'timestamp': timestamp}
        if procs:
            msg['state'] = InstanceHealthState.PROCESS_ERROR
            msg['failed_processes'] = procs
        else:
            msg['state'] = InstanceHealthState.MONITOR_ERROR
            msg['error'] = 'faiiiill'
        return self.monitor.new_heartbeat(msg, timestamp)
class ControllerCore(object):
    """Controller functionality that is not specific to the messaging layer.
    """

    def __init__(self, provisioner_client, engineclass, controller_name,
                 conf=None, state=None, store=None):
        if state:
            self.state = state
        else:
            self.state = ControllerCoreState(store or ControllerStore())

        prov_vars = None
        health_kwargs = None
        if conf:
            if conf.has_key(PROVISIONER_VARS_KEY):
                prov_vars = conf[PROVISIONER_VARS_KEY]

            if conf.get(MONITOR_HEALTH_KEY):
                health_kwargs = {}
                if HEALTH_BOOT_KEY in conf:
                    health_kwargs["boot_seconds"] = conf[HEALTH_BOOT_KEY]
                if HEALTH_MISSING_KEY in conf:
                    health_kwargs["missing_seconds"] = conf[HEALTH_MISSING_KEY]
                if HEALTH_ZOMBIE_KEY in conf:
                    health_kwargs["zombie_seconds"] = conf[HEALTH_ZOMBIE_KEY]

        self.conf = conf

        if health_kwargs is not None:
            self.health_monitor = HealthMonitor(self.state, **health_kwargs)
        else:
            self.health_monitor = None

        # There can only ever be one 'reconfigure' or 'decide' engine call run
        # at ANY time. The 'decide' call is triggered via timed looping call
        # and 'reconfigure' is triggered asynchronously at any moment.
        self.busy = defer.DeferredSemaphore(1)

        self.provisioner_client = provisioner_client

        health_not_checked = self.health_monitor is None
        self.control = ControllerCoreControl(provisioner_client, self.state,
                                             prov_vars, controller_name,
                                             health_not_checked=health_not_checked)
        self.engine = EngineLoader().load(engineclass)
        self.control_loop = None

    def new_sensor_info(self, content):
        """Handle an incoming sensor message

        @param content Raw sensor content
        @retval Deferred
        """
        return self.state.new_sensor_item(content)

    def new_instance_state(self, content):
        """Handle an incoming instance state message

        @param content Raw instance state content
        @retval Deferred
        """
        return self.state.new_instance_state(content)

    def new_heartbeat(self, content):
        """Handle an incoming heartbeat message

        @param content Raw heartbeat content
        @retval Deferred
        """
        if self.health_monitor:
            return self.health_monitor.new_heartbeat(content)
        else:
            return defer.succeed(None)

    def begin_controlling(self):
        """Call the decision engine at the appropriate times.
        """
        log.debug("Starting engine decision loop - %s second interval",
                  self.control.sleep_seconds)
        self.control_loop = LoopingCall(self.run_decide)
        self.control_loop.start(self.control.sleep_seconds, now=False)

    def run_recovery(self):
        """Recover instance and sensor states.

        This must run before new info starts arriving.
        """
        return self.state.recover()

    @defer.inlineCallbacks
    def run_initialize(self):
        """Performs initialization routines that may require async processing
        """

        # to make absolutely certain we have the latest records for instances,
        # we request provisioner to dump state
        instance_ids = []
        for instance in self.state.instances.itervalues():
            if instance.state < InstanceStates.TERMINATED:
                instance_ids.append(instance.instance_id)

        if instance_ids:
            yield self.provisioner_client.dump_state(
                nodes=instance_ids,
                force_subscribe=self.control.controller_name)

        engine_state = self.state.get_engine_state()

        # engines can be reconfigured after boot. If this is a recovery
        # situation we need to make sure to use the latest config, which
        # may be different from the one used in the initial boot. So, the
        # initial config is used and any reconfigured values are folded in
        # before engine configuration is called. This means that engines
        # must be able to handle the same values in both configure and
        # reconfigure.
        extraconf = yield self.state.get_engine_extraconf()
        if self.conf:
            engine_conf = copy.deepcopy(self.conf)
            engine_conf.update(extraconf)
        else:
            engine_conf = extraconf

        # DE routines can optionally return a Deferred
        yield defer.maybeDeferred(self.engine.initialize, self.control,
                                  engine_state, engine_conf)

    @defer.inlineCallbacks
    def run_decide(self):
        # allow health monitor to update any MISSING etc instance states
        if self.health_monitor:
            yield self.health_monitor.update()

        engine_state = self.state.get_engine_state()
        try:
            yield self.busy.run(self.engine.decide, self.control, engine_state)
        except Exception, e:
            log.error("Error in engine decide call: %s", str(e),
                      exc_info=True)
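# ---------------------------------------------------------------------------
# Hypothetical usage sketch (not part of the original module): one way a
# service layer might drive ControllerCore through the lifecycle implied by
# the docstrings above -- recover persisted state before any new messages
# arrive, run the async initialization, then start the decide loop. The
# start_controller name and its arguments are illustrative assumptions only.
# ---------------------------------------------------------------------------
@defer.inlineCallbacks
def start_controller(provisioner_client, engineclass, controller_name,
                     conf=None):
    # hypothetical helper; mirrors the ControllerCore constructor signature
    core = ControllerCore(provisioner_client, engineclass, controller_name,
                          conf=conf)
    yield core.run_recovery()      # must finish before new info starts arriving
    yield core.run_initialize()    # may ask the provisioner to dump state
    core.begin_controlling()       # starts the LoopingCall decide loop
    defer.returnValue(core)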