def test_needs_duplicate_process(self):
    # ensure processes represented in the queue and in a resource are not
    # counted twice. This situation arises in between process and resource
    # record updates.
    self.mm.initialize()

    self.assertFalse(self.epum_client.reconfigures)
    self.assertEqual(len(self.epum_client.domains),
        len(self.engine_conf.keys()))

    for engine_id in self.engine_conf:
        domain_id = domain_id_from_engine(engine_id)
        self.assertEqual(self.epum_client.domain_subs[domain_id],
            [(self.service_name, "node_state")])

    self.mm.register_needs()
    for engine_id in self.engine_conf:
        domain_id = domain_id_from_engine(engine_id)
        engine = self.engine_conf[engine_id]
        if engine.get('spare_slots', 0) == 0:
            self.assert_one_reconfigure(domain_id, 0, [])
    self.epum_client.clear()

    engine1_domain_id = domain_id_from_engine("engine1")

    engine1_procs = self.enqueue_n_processes(10, "engine1")
    one_process_key = engine1_procs[0]
    owner, upid, rround = one_process_key

    self.mm.register_needs()
    self.assert_one_reconfigure(engine1_domain_id, 10, [])
    self.epum_client.clear()

    # now add some resources with assigned processes and remove queued
    # processes. need shouldn't change.
    engine1_resources = self.create_engine_resources("engine1",
        node_count=10, assignments=engine1_procs)
    self.assertEqual(len(engine1_resources), 10)
    self.mm.queued_processes = []

    self.mm.register_needs()
    self.assertFalse(self.epum_client.reconfigures)

    # now pretend one process fails and is requeued. the requeue can happen
    # before the resource update, so we simulate this to ensure the process
    # isn't counted twice.
    self.mm.queued_processes = [(owner, upid, rround + 1)]

    self.mm.register_needs()
    self.assertFalse(self.epum_client.reconfigures)
def test_add_remove_node_with_resource(self):
    self.core.node_state("node1", domain_id_from_engine("engine1"),
        InstanceState.RUNNING)
    resource_id = "eeagent_1"
    self.core.ee_heartbeat(resource_id, make_beat("node1"))

    resource = self.store.get_resource(resource_id)
    self.assertIsNotNone(resource)
    self.assertEqual(resource.state, ExecutionResourceState.OK)

    # now send a terminated state for the node. resource should be removed.
    self.core.node_state("node1", domain_id_from_engine("engine1"),
        InstanceState.TERMINATED)

    self.assertTrue(self.store.get_resource(resource_id) is None)
    self.assertTrue(self.store.get_node("node1") is None)
def test_add_remove_node(self):
    self.core.node_state("node1", domain_id_from_engine("engine1"),
        InstanceState.RUNNING)
    node = self.store.get_node("node1")
    self.assertTrue(node is not None)
    self.assertEqual(node.node_id, "node1")
    self.assertEqual(node.domain_id, domain_id_from_engine("engine1"))

    self.core.node_state("node1", domain_id_from_engine("engine1"),
        InstanceState.TERMINATING)
    node = self.store.get_node("node1")
    self.assertTrue(node is None)

    # this shouldn't cause any problems even though the node is gone
    self.core.node_state("node1", domain_id_from_engine("engine1"),
        InstanceState.TERMINATED)
def test_heartbeat_node_update_race(self):
    # test processing two beats simultaneously, for eeagents on the same node.
    # check that they don't collide when updating the node record.
    node_id = uuid.uuid4().hex
    self.core.node_state(node_id, domain_id_from_engine("engine1"),
        InstanceState.RUNNING)

    beat = make_beat(node_id)

    # this beat gets injected while the other is in the midst of processing
    sneaky_beat = make_beat(node_id)

    # when the PD attempts to update the node record, sneak in an update
    # first so the request conflicts
    original_update_node = self.store.update_node

    def patched_update_node(node):
        # unpatch ourself first so we don't recurse forever
        self.store.update_node = original_update_node

        self.core.ee_heartbeat("eeagent2", sneaky_beat)
        original_update_node(node)

    self.store.update_node = patched_update_node

    self.core.ee_heartbeat("eeagent1", beat)

    node = self.store.get_node(node_id)
    self.assertEqual(set(["eeagent1", "eeagent2"]), set(node.resources))
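# The race test above only passes if ee_heartbeat retries node updates when
# another writer gets in first. A minimal sketch of that pattern, assuming a
# store whose update_node() raises a WriteConflictError on a stale record
# (the helper name and exception are assumptions, not the project's API):
def sketch_add_resource_to_node(store, node_id, resource_id):
    while True:
        node = store.get_node(node_id)
        if node is None:
            return None  # node disappeared; caller decides how to proceed
        if resource_id not in node.resources:
            node.resources.append(resource_id)
        try:
            store.update_node(node)
            return node
        except WriteConflictError:
            # another heartbeat updated the node first; re-read and retry
            continue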
def initialize(self):
    self.resources = {}
    self.queued_processes = []
    self.stale_processes = []
    self.throttled_processes = []

    self.resource_set_changed = True
    self.changed_resources = set()
    self.process_set_changed = True

    self.needs_matchmaking = True

    self.registered_needs = {}

    self._get_pending_processes()

    # create the domains if they don't already exist
    if self.epum_client:
        for engine in list(self.ee_registry):
            if not self.domain_definition_id:
                raise Exception("domain definition must be provided")
            if not self.base_domain_config:
                raise Exception("domain config must be provided")
            domain_id = domain_id_from_engine(engine.engine_id)
            try:
                self.epum_client.describe_domain(domain_id)
            except NotFoundError:
                config = self._get_domain_config(engine)
                self.epum_client.add_domain(domain_id,
                    self.domain_definition_id, config,
                    subscriber_name=self.service_name,
                    subscriber_op='node_state')
def register_needs(self):
    self._get_pending_processes()
    for engine in list(self.ee_registry):
        engine_id = engine.engine_id

        need, unoccupied_nodes = self.calculate_need(engine_id)
        registered_need = self.registered_needs.get(engine_id)
        if need != registered_need:

            retiree_ids = None
            # on scale down, request for specific nodes to be terminated
            if need < registered_need:
                unoccupied_nodes.sort(key=self._node_state_time, reverse=True)
                retiree_ids = unoccupied_nodes[:registered_need - need]

                for resource in self.resources.itervalues():
                    if resource.node_id in retiree_ids:
                        self.core.resource_change_state(resource,
                            ExecutionResourceState.DISABLED)

            log.info("Scaling engine '%s' to %s nodes (was %s)",
                engine_id, need, self.registered_needs.get(engine_id, 0))
            if retiree_ids:
                log.info("Retiring engine '%s' nodes: %s",
                    engine_id, ", ".join(retiree_ids))
            config = get_domain_reconfigure_config(need, retiree_ids)
            domain_id = domain_id_from_engine(engine_id)
            self.epum_client.reconfigure_domain(domain_id, config)
            self.registered_needs[engine_id] = need
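# A minimal sketch (an assumption based on the tests below, not the project's
# implementation) of the need arithmetic behind calculate_need() for engines
# without spare slots: one process per slot, 'replicas' eeagents per node,
# and an optional hard cap on the number of VMs.
import math

def sketch_calculate_need(process_count, slots_per_eeagent, replicas_per_node,
                          maximum_vms=None):
    slots_per_node = slots_per_eeagent * replicas_per_node
    need = int(math.ceil(process_count / float(slots_per_node)))
    if maximum_vms is not None:
        need = min(need, maximum_vms)
    return need

# sketch_calculate_need(10, 1, 1) == 10                      (engine1 below)
# sketch_calculate_need(10, 2, 1) == 5                       (engine2 below)
# sketch_calculate_need(10, 2, 2) == 3                       (engine3 below)
# sketch_calculate_need(1010, 1, 1, maximum_vms=100) == 100  (maximum_vms test)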
def test_needs_maximum_vms(self):
    self.mm.initialize()

    self.assertFalse(self.epum_client.reconfigures)
    self.assertEqual(len(self.epum_client.domains),
        len(self.engine_conf.keys()))

    for engine_id in self.engine_conf:
        domain_id = domain_id_from_engine(engine_id)
        self.assertEqual(self.epum_client.domain_subs[domain_id],
            [(self.service_name, "node_state")])

    self.mm.register_needs()
    for engine_id in self.engine_conf:
        domain_id = domain_id_from_engine(engine_id)
        engine = self.engine_conf[engine_id]
        if engine.get('spare_slots', 0) == 0:
            self.assert_one_reconfigure(domain_id, 0, [])
    self.epum_client.clear()

    engine1_domain_id = domain_id_from_engine("engine1")
    engine2_domain_id = domain_id_from_engine("engine2")

    # engine1 has 1 slot and 1 replica per node, expect a VM per process
    self.enqueue_n_processes(10, "engine1")

    # engine2 has 2 slots and 1 replica per node, expect a VM per 2 processes
    self.enqueue_n_processes(10, "engine2")

    self.mm.register_needs()
    self.assert_one_reconfigure(engine1_domain_id, 10, [])
    self.assert_one_reconfigure(engine2_domain_id, 5, [])
    self.epum_client.clear()

    # engine1 has 1 slot and 1 replica per node, expect a VM per process, normally
    self.enqueue_n_processes(1000, "engine1")

    # engine2 has 2 slots and 1 replica per node, expect a VM per 2 processes, normally
    self.enqueue_n_processes(1000, "engine2")

    # But we set a maximum of 100 VMs, so even though we have thousands of
    # processes queued, each engine is only asked for 100 VMs
    self.mm.register_needs()
    self.assert_one_reconfigure(engine1_domain_id, 100, [])
    self.assert_one_reconfigure(engine2_domain_id, 100, [])
    self.epum_client.clear()
def test_add_remove_node_with_resource_and_processes(self):
    self.core.node_state("node1", domain_id_from_engine("engine1"),
        InstanceState.RUNNING)
    resource_id = "eeagent_1"
    self.core.ee_heartbeat(resource_id, make_beat("node1"))

    # set up a few processes on the resource
    p1 = ProcessRecord.new(None, "proc1", {}, ProcessState.RUNNING,
        assigned=resource_id)
    self.store.add_process(p1)
    p2 = ProcessRecord.new(None, "proc2", {}, ProcessState.PENDING,
        assigned=resource_id)
    self.store.add_process(p2)
    p3 = ProcessRecord.new(None, "proc3", {}, ProcessState.TERMINATING,
        assigned=resource_id)
    self.store.add_process(p3)

    resource = self.store.get_resource(resource_id)
    resource.assigned = [p1.key, p2.key, p3.key]
    self.store.update_resource(resource)

    # now send a terminated state for the node. resource should be removed.
    self.core.node_state("node1", domain_id_from_engine("engine1"),
        InstanceState.TERMINATED)

    self.assertTrue(self.store.get_resource(resource_id) is None)
    self.assertTrue(self.store.get_node("node1") is None)

    queued_processes = set(self.store.get_queued_processes())

    # these two should have been rescheduled
    for procname in ("proc1", "proc2"):
        proc = self.store.get_process(None, procname)
        self.assertEqual(proc.state, ProcessState.DIED_REQUESTED)
        self.assertEqual(proc.round, 1)
        self.assertIn(proc.key, queued_processes)
        self.notifier.assert_process_state(procname, ProcessState.DIED_REQUESTED)

    # this one should be terminated
    proc3 = self.store.get_process(None, "proc3")
    self.assertEqual(proc3.state, ProcessState.TERMINATED)
    self.assertEqual(proc3.round, 0)
    self.assertNotIn(proc3.key, queued_processes)
    self.notifier.assert_process_state("proc3", ProcessState.TERMINATED)
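# Hedged sketch of the per-process cleanup that the node-teardown test above
# verifies (the helper name is an assumption): processes still live on the
# lost resource are requeued for another round, while processes that were
# already being terminated are simply marked TERMINATED.
def sketch_evacuate_process(process):
    if process.state == ProcessState.TERMINATING:
        process.state = ProcessState.TERMINATED
    else:
        process.state = ProcessState.DIED_REQUESTED
        process.round += 1  # the (owner, upid, round) key is then re-enqueued
    return process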
def _create_missing_domains(self):
    # create the domains if they don't already exist
    if self.epum_client:
        for engine in list(self.ee_registry):
            if not self.domain_definition_id:
                raise Exception("domain definition must be provided")
            if not self.base_domain_config:
                raise Exception("domain config must be provided")
            domain_id = domain_id_from_engine(engine.engine_id)
            try:
                self.epum_client.describe_domain(domain_id)
            except NotFoundError:
                config = self._get_domain_config(engine)
                self.epum_client.add_domain(domain_id,
                    self.domain_definition_id, config,
                    subscriber_name=self.service_name,
                    subscriber_op='node_state')
def test_heartbeat_node_removed(self):
    # test processing a heartbeat where the node is removed partway through
    node_id = uuid.uuid4().hex
    self.core.node_state(node_id, domain_id_from_engine("engine1"),
        InstanceState.RUNNING)

    beat = make_beat(node_id)

    original_update_node = self.store.update_node

    def patched_update_node(node):
        # unpatch ourself first so we don't recurse forever
        self.store.update_node = original_update_node

        self.store.remove_node(node.node_id)
        original_update_node(node)

    self.store.update_node = patched_update_node

    # this shouldn't blow up, and no resource should be added
    self.core.ee_heartbeat("eeagent1", beat)

    self.assertEqual(self.store.get_resource("eeagent1"), None)
def test_heartbeat_timestamps(self):
    # test that heartbeat timestamps are recorded on the resource and that
    # out-of-order beats don't move the clock backwards
    node_id = uuid.uuid4().hex
    self.core.node_state(node_id, domain_id_from_engine("engine1"),
        InstanceState.RUNNING)

    d1 = parse_datetime("2013-04-02T19:37:57.617734+00:00")
    d2 = parse_datetime("2013-04-02T19:38:57.617734+00:00")
    d3 = parse_datetime("2013-04-02T19:39:57.617734+00:00")

    self.core.ee_heartbeat("eeagent1", make_beat(node_id, timestamp=d1.isoformat()))
    resource = self.store.get_resource("eeagent1")
    self.assertEqual(resource.last_heartbeat_datetime, d1)

    self.core.ee_heartbeat("eeagent1", make_beat(node_id, timestamp=d3.isoformat()))
    resource = self.store.get_resource("eeagent1")
    self.assertEqual(resource.last_heartbeat_datetime, d3)

    # out-of-order heartbeat. time shouldn't be updated
    self.core.ee_heartbeat("eeagent1", make_beat(node_id, timestamp=d2.isoformat()))
    resource = self.store.get_resource("eeagent1")
    self.assertEqual(resource.last_heartbeat_datetime, d3)
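# The out-of-order case above only passes if stale timestamps never move the
# resource's clock backwards. A sketch of that guard, assuming the resource
# record exposes last_heartbeat_datetime (the function itself is illustrative,
# not the project's implementation):
def sketch_record_heartbeat_time(resource, beat_datetime):
    last = resource.last_heartbeat_datetime
    if last is None or beat_datetime > last:
        resource.last_heartbeat_datetime = beat_datetime
        return True   # newer beat: caller persists the updated resource
    return False      # stale beat: keep the newer timestamp already recorded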
def announce_node(self, node_name, engine, process_dispatcher, state=None):
    """Announce a node to a process dispatcher.

    @param node_name: the name of the node to advertise
    @param engine: the execution engine of the node
    @param process_dispatcher: the pd to announce to
    @param state: the state to advertise to the pd
    """
    if not state:
        state = InstanceState.RUNNING

    pd_client = ProcessDispatcherClient(self.dashi, process_dispatcher)
    log.info("Announcing %s of engine %s is '%s' to %s" % (
        node_name, engine, state, process_dispatcher))
    domain_id = domain_id_from_engine(engine)
    for i in range(1, ADVERTISE_RETRIES):
        try:
            pd_client.node_state(node_name, domain_id, state)
            break
        except timeout:
            wait_time = 2 ** i  # exponentially increasing wait
            log.warning("PD '%s' not available yet. Waiting %ss" % (
                process_dispatcher, wait_time))
            time.sleep(wait_time)
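# A small usage sketch of announce_node (the node and service names here are
# made up): announce a freshly started node as RUNNING, and later announce it
# TERMINATED when it goes away. With the backoff above, retry i waits 2**i
# seconds before trying the process dispatcher again.
def sketch_announce_lifecycle(harness):
    harness.announce_node("node-abc123", "engine1", "process_dispatcher")
    # ... node does its work ...
    harness.announce_node("node-abc123", "engine1", "process_dispatcher",
                          state=InstanceState.TERMINATED)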
def _send_node_state(self, engine_id, node_id=None):
    node_id = node_id or uuid.uuid4().hex
    node_state = dict(node_id=node_id, state=InstanceState.RUNNING,
        domain_id=domain_id_from_engine(engine_id))
    self.dashi.fire(get_pd_dashi_name(), "node_state", args=node_state)
def test_needs(self):
    self.mm.initialize()

    self.assertFalse(self.epum_client.reconfigures)
    self.assertEqual(len(self.epum_client.domains),
        len(self.engine_conf.keys()))

    for engine_id in self.engine_conf:
        domain_id = domain_id_from_engine(engine_id)
        self.assertEqual(self.epum_client.domain_subs[domain_id],
            [(self.service_name, "node_state")])

    self.mm.register_needs()
    for engine_id in self.engine_conf:
        domain_id = domain_id_from_engine(engine_id)
        engine = self.engine_conf[engine_id]
        if engine.get('spare_slots', 0) == 0:
            self.assert_one_reconfigure(domain_id, 0, [])
    self.epum_client.clear()

    engine1_domain_id = domain_id_from_engine("engine1")
    engine2_domain_id = domain_id_from_engine("engine2")
    engine3_domain_id = domain_id_from_engine("engine3")
    engine4_domain_id = domain_id_from_engine("engine4")

    # engine1 has 1 slot and 1 replica per node, expect a VM per process
    engine1_procs = self.enqueue_n_processes(10, "engine1")

    # engine2 has 2 slots and 1 replica per node, expect a VM per 2 processes
    engine2_procs = self.enqueue_n_processes(10, "engine2")

    # engine3 has 2 slots and 2 replicas per node, expect a VM per 4 processes
    engine3_procs = self.enqueue_n_processes(10, "engine3")

    # engine4 has 2 slots and 1 replica per node, and a
    # minimum of 1 free slot, expect a VM per process + 1
    engine4_procs = self.enqueue_n_processes(10, "engine4")

    self.mm.register_needs()
    self.assert_one_reconfigure(engine1_domain_id, 10, [])
    self.assert_one_reconfigure(engine2_domain_id, 5, [])
    self.assert_one_reconfigure(engine3_domain_id, 3, [])
    self.assert_one_reconfigure(engine4_domain_id, 11, [])
    self.epum_client.clear()

    # now add some resources with assigned processes and remove queued
    # processes. need shouldn't change.
    engine1_resources = self.create_engine_resources("engine1",
        node_count=10, assignments=engine1_procs)
    self.assertEqual(len(engine1_resources), 10)

    engine2_resources = self.create_engine_resources("engine2",
        node_count=5, assignments=engine2_procs)
    self.assertEqual(len(engine2_resources), 5)

    engine3_resources = self.create_engine_resources("engine3",
        node_count=3, assignments=engine3_procs)
    self.assertEqual(len(engine3_resources), 6)

    engine4_resources = self.create_engine_resources("engine4",
        node_count=11, assignments=engine4_procs)
    self.assertEqual(len(engine4_resources), 11)

    self.mm.queued_processes = []

    self.mm.register_needs()
    self.assertFalse(self.epum_client.reconfigures)

    # now try scale down

    # empty 2 resources from engine1. 2 nodes should be terminated.
    engine1_retirees = set()
    for resource in engine1_resources[:2]:
        engine1_retirees.add(resource.node_id)
        resource.assigned = []

    # empty 2 resources from engine2. 2 nodes should be terminated
    engine2_retirees = set()
    for resource in engine2_resources[:2]:
        engine2_retirees.add(resource.node_id)
        resource.assigned = []

    # empty 3 resources from engine3. 1 node should be terminated
    for resource in engine3_resources[:3]:
        resource.assigned = []
    engine3_retirees = set([engine3_resources[0].node_id])

    # empty 2 resources from engine4. 2 nodes should be terminated
    engine4_retirees = set()
    for resource in engine4_resources:
        if len(resource.assigned) > 0:
            engine4_retirees.add(resource.node_id)
            resource.assigned = []
        if len(engine4_retirees) >= 2:
            break

    self.mm.register_needs()
    self.assert_one_reconfigure(engine1_domain_id, 8, engine1_retirees)
    self.assert_one_reconfigure(engine2_domain_id, 3, engine2_retirees)
    self.assert_one_reconfigure(engine3_domain_id, 2, engine3_retirees)
    # Note that we cannot check which nodes have retired, since the spare
    # one may be terminated
    self.assert_one_reconfigure(engine4_domain_id, 9)
    self.epum_client.clear()
def test_needs_unscheduled_pending(self):
    # engine1 has 1 slot, expect a VM per process
    engine1_pending_procs = self.create_n_pending_processes(10, "engine1")

    # engine2 has 2 slots, expect a VM per 2 processes
    engine2_pending_procs = self.create_n_pending_processes(10, "engine2")

    # Normally this is done by the doctor, but we do it manually here,
    # since there is no doctor in this test env
    self.store.set_initialized()
    self.store.set_pd_state(ProcessDispatcherState.SYSTEM_BOOTING)

    self.mm.initialize()

    self.assertFalse(self.epum_client.reconfigures)
    self.assertEqual(len(self.epum_client.domains),
        len(self.engine_conf.keys()))

    for engine_id in self.engine_conf:
        domain_id = domain_id_from_engine(engine_id)
        self.assertEqual(self.epum_client.domain_subs[domain_id],
            [(self.service_name, "node_state")])

    engine1_domain_id = domain_id_from_engine("engine1")
    engine2_domain_id = domain_id_from_engine("engine2")

    self.mm.register_needs()

    # we should see VMs even though we have no queued procs
    self.assert_one_reconfigure(engine1_domain_id, 10, [])
    self.assert_one_reconfigure(engine2_domain_id, 5, [])
    self.epum_client.clear()

    # engine1 has 1 slot, expect a VM per process
    engine1_queued_procs = self.enqueue_n_processes(10, "engine1")

    # engine2 has 2 slots, expect a VM per 2 processes
    engine2_queued_procs = self.enqueue_n_processes(10, "engine2")

    self.mm.register_needs()

    # we should see VMs for both the queued and the pending procs
    self.assert_one_reconfigure(engine1_domain_id, 20, [])
    self.assert_one_reconfigure(engine2_domain_id, 10, [])
    self.epum_client.clear()

    # now enqueue the pending procs (the PD is still booting)
    self.enqueue_pending_processes()
    self.mm.register_needs()

    # The matchmaker won't have checked for the updated pending procs
    self.assertEqual(len(self.mm.unscheduled_pending_processes), 20)

    # But there should be no change to requested VMs, since we should
    # deduplicate processes
    self.assertFalse(self.epum_client.reconfigures)

    self.store.set_pd_state(ProcessDispatcherState.OK)
    self.mm.register_needs()

    # The matchmaker should have no pending processes
    self.assertEqual(len(self.mm.unscheduled_pending_processes), 0)

    # There should be no change to requested VMs
    self.assertFalse(self.epum_client.reconfigures)
    self.epum_client.clear()

    # now add some resources with assigned processes and remove queued
    # processes. need shouldn't change.
    engine1_procs = engine1_queued_procs + engine1_pending_procs
    engine2_procs = engine2_queued_procs + engine2_pending_procs

    engine1_resources = self.create_engine_resources("engine1",
        node_count=20, assignments=engine1_procs)
    self.assertEqual(len(engine1_resources), 20)

    engine2_resources = self.create_engine_resources("engine2",
        node_count=10, assignments=engine2_procs)
    self.assertEqual(len(engine2_resources), 10)

    self.mm.queued_processes = []

    self.mm.register_needs()
    self.assertFalse(self.epum_client.reconfigures)

    # empty resources from engine1. all nodes should be terminated.
    engine1_retirees = set()
    for resource in engine1_resources:
        engine1_retirees.add(resource.node_id)
        resource.assigned = []

    # empty resources from engine2. all nodes should be terminated
    engine2_retirees = set()
    for resource in engine2_resources:
        engine2_retirees.add(resource.node_id)
        resource.assigned = []

    self.mm.register_needs()
    self.assert_one_reconfigure(engine1_domain_id, 0, engine1_retirees)
    self.assert_one_reconfigure(engine2_domain_id, 0, engine2_retirees)
    self.epum_client.clear()
def test_uninitialized_system_boot_with_state(self):
    self.store.set_system_boot(True)
    self.core.node_state("node1", domain_id_from_engine("engine1"),
        InstanceState.RUNNING)
    resource_id = "eeagent_1"
    self.core.ee_heartbeat(resource_id, make_beat("node1"))

    p0 = ProcessRecord.new(None, "proc0", {}, ProcessState.RUNNING,
        configuration=nosystemrestart_process_config(),
        assigned=resource_id,
        restart_mode=RestartMode.ALWAYS)
    self.store.add_process(p0)
    p1 = ProcessRecord.new(None, "proc1", {}, ProcessState.RUNNING,
        assigned=resource_id)
    self.store.add_process(p1)
    p2 = ProcessRecord.new(None, "proc2", {}, ProcessState.PENDING,
        assigned=resource_id)
    self.store.add_process(p2)
    p3 = ProcessRecord.new(None, "proc3", {}, ProcessState.TERMINATING,
        assigned=resource_id)
    self.store.add_process(p3)

    # this one shouldn't restart
    p4 = ProcessRecord.new(None, "proc4", {}, ProcessState.RUNNING,
        configuration=nosystemrestart_process_config(),
        assigned=resource_id,
        restart_mode=RestartMode.ABNORMAL)
    self.store.add_process(p4)

    # non-running processes should also potentially be restarted on boot
    p5 = ProcessRecord.new(None, "proc5", {}, ProcessState.WAITING)
    self.store.add_process(p5)
    self.store.enqueue_process(*p5.key)
    p6 = ProcessRecord.new(None, "proc6", {}, ProcessState.REQUESTED)
    self.store.add_process(p6)

    # not this one, due to its no-system-restart configuration
    p7 = ProcessRecord.new(None, "proc7", {}, ProcessState.REQUESTED,
        configuration=nosystemrestart_process_config(),
        restart_mode=RestartMode.ALWAYS)
    self.store.add_process(p7)
    self.store.enqueue_process(*p7.key)

    resource = self.store.get_resource(resource_id)
    resource.assigned = [p0.key, p1.key, p2.key, p3.key, p4.key]
    self.store.update_resource(resource)

    restartable_procs = ["proc1", "proc2", "proc5", "proc6"]
    dead_procs = ["proc0", "proc4", "proc7"]

    self._run_in_thread()

    assert self.store.wait_initialized(timeout=10)

    self.assertEqual(len(self.store.get_queued_processes()), 0)
    self.assertEqual(len(self.store.get_node_ids()), 0)
    self.assertEqual(len(self.store.get_resource_ids()), 0)

    for proc in restartable_procs:
        self.assertEqual(self.store.get_process(None, proc).state,
            ProcessState.UNSCHEDULED_PENDING)
    for proc in dead_procs:
        self.assertEqual(self.store.get_process(None, proc).state,
            ProcessState.TERMINATED)
    self.assertEqual(self.store.get_process(None, "proc3").state,
        ProcessState.TERMINATED)

    self.assertEqual(self.store.get_pd_state(),
        ProcessDispatcherState.SYSTEM_BOOTING)

    # now end system boot
    self.store.set_system_boot(False)

    wait(lambda: self.store.get_pd_state() == ProcessDispatcherState.OK)

    # check that pending processes were correctly rescheduled
    self.assertEqual(len(self.store.get_queued_processes()),
        len(restartable_procs))
    for proc in restartable_procs:
        self.assertEqual(self.store.get_process(None, proc).state,
            ProcessState.REQUESTED)
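# Hedged sketch of the restart decision the boot test above encodes (the
# helper name is an assumption, not the project's API): on system restart a
# process is rescheduled unless its configuration opts out of system restart.
# proc0, proc4 and proc7 all carry that opt-out and stay TERMINATED, while
# proc1, proc2, proc5 and proc6 come back as UNSCHEDULED_PENDING.
def sketch_should_restart_on_system_boot(process):
    if process_config_opts_out_of_system_restart(process.configuration):
        return False
    return True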
def test_resource_monitor(self):
    t0 = datetime(2012, 3, 13, 9, 30, 0, tzinfo=UTC)
    mock_now = Mock()
    mock_now.return_value = t0

    def increment_now(seconds):
        t = mock_now.return_value + timedelta(seconds=seconds)
        mock_now.return_value = t
        log.debug("THE TIME IS NOW: %s", t)
        return t

    monitor = self._setup_resource_monitor()
    monitor._now_func = mock_now

    # before there are any resources, monitor should work but return a None delay
    self.assertIsNone(monitor.monitor_cycle())

    self.core.node_state("node1", domain_id_from_engine("engine1"),
        InstanceState.RUNNING)

    # 3 resources. all report in at t0
    r1, r2, r3 = "eeagent_1", "eeagent_2", "eeagent_3"
    self._send_heartbeat(r1, "node1", t0)
    self._send_heartbeat(r2, "node1", t0)
    self._send_heartbeat(r3, "node1", t0)

    states = {r1: ExecutionResourceState.OK,
              r2: ExecutionResourceState.OK,
              r3: ExecutionResourceState.OK}
    self.assert_monitor_cycle(10, states)

    t1 = increment_now(5)  # :05

    # heartbeat comes in for r1 5 seconds later
    self._send_heartbeat(r1, "node1", t1)
    self.assert_monitor_cycle(5, states)

    increment_now(5)  # :10

    # no heartbeats for r2 and r3. they should be marked WARNING
    states[r2] = ExecutionResourceState.WARNING
    states[r3] = ExecutionResourceState.WARNING
    self.assert_monitor_cycle(5, states)

    increment_now(4)  # :14

    # r2 gets a heartbeat through, but its timestamp keeps it within the
    # warning threshold
    self._send_heartbeat(r2, "node1", t0 + timedelta(seconds=1))
    self.assert_monitor_cycle(1, states)

    increment_now(6)  # :20

    # r1 should go warning, r3 should go missing
    states[r1] = ExecutionResourceState.WARNING
    states[r3] = ExecutionResourceState.MISSING
    self.assert_monitor_cycle(4, states)

    t2 = increment_now(3)  # :23
    self._send_heartbeat(r1, "node1", t2)
    states[r1] = ExecutionResourceState.OK
    self.assert_monitor_cycle(1, states)

    t3 = increment_now(2)  # :25
    self._send_heartbeat(r3, "node1", t3)
    states[r2] = ExecutionResourceState.MISSING
    states[r3] = ExecutionResourceState.OK
    self.assert_monitor_cycle(8, states)

    increment_now(5)  # :30

    # heartbeat r2 enough to go back to WARNING, but still late
    self._send_heartbeat(r2, "node1", t0 + timedelta(seconds=15))
    states[r2] = ExecutionResourceState.WARNING
    self.assert_monitor_cycle(3, states)

    t4 = increment_now(5)  # :35

    # disable r2 and heartbeat r1 and r3 (heartbeats arrive late, but that's ok)
    self._send_heartbeat(r1, "node1", t4)
    self._send_heartbeat(r3, "node1", t4)
    self.core.resource_change_state(self.store.get_resource(r2),
        ExecutionResourceState.DISABLED)
    states[r2] = ExecutionResourceState.DISABLED
    self.assert_monitor_cycle(10, states)