Example #1
    def test_needs_duplicate_process(self):

        # ensure processes represented in the queue and in a resource are not
        # counted twice. This situation can arise between process and resource
        # record updates.
        self.mm.initialize()

        self.assertFalse(self.epum_client.reconfigures)
        self.assertEqual(len(self.epum_client.domains), len(self.engine_conf.keys()))

        for engine_id in self.engine_conf:
            domain_id = domain_id_from_engine(engine_id)
            self.assertEqual(self.epum_client.domain_subs[domain_id],
                [(self.service_name, "node_state")])

        self.mm.register_needs()
        for engine_id in self.engine_conf:
            domain_id = domain_id_from_engine(engine_id)
            engine = self.engine_conf[engine_id]
            if engine.get('spare_slots', 0) == 0:
                self.assert_one_reconfigure(domain_id, 0, [])
        self.epum_client.clear()

        engine1_domain_id = domain_id_from_engine("engine1")

        engine1_procs = self.enqueue_n_processes(10, "engine1")

        one_process_key = engine1_procs[0]
        owner, upid, rround = one_process_key

        self.mm.register_needs()
        self.assert_one_reconfigure(engine1_domain_id, 10, [])
        self.epum_client.clear()

        # now add some resources with assigned processes
        # and remove queued processes. need shouldn't change.
        engine1_resources = self.create_engine_resources("engine1",
            node_count=10, assignments=engine1_procs)
        self.assertEqual(len(engine1_resources), 10)
        self.mm.queued_processes = []

        self.mm.register_needs()
        self.assertFalse(self.epum_client.reconfigures)

        # now pretend one process fails and is requeued.
        # the requeue can happen before the resource update, so we
        # simulate this to ensure that the process isn't counted twice

        self.mm.queued_processes = [(owner, upid, rround + 1)]
        self.mm.register_needs()
        self.assertFalse(self.epum_client.reconfigures)
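The dedup being tested presumably keys processes by (owner, upid) rather than by the full (owner, upid, round) tuple, so a requeued round of a process that is still assigned to a resource is only counted once. A minimal sketch of that idea (helper name and record shapes are assumptions, not the matchmaker's actual code):

def count_needed_processes(queued_processes, resources):
    # count each (owner, upid) once, whether it appears on the queue,
    # on a resource, or both (e.g. a requeued higher round)
    seen = set()
    for owner, upid, _round in queued_processes:
        seen.add((owner, upid))
    for resource in resources:
        for owner, upid, _round in resource.assigned:
            seen.add((owner, upid))
    return len(seen)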
Example #2
    def test_add_remove_node_with_resource(self):
        self.core.node_state("node1", domain_id_from_engine("engine1"), InstanceState.RUNNING)
        resource_id = "eeagent_1"
        self.core.ee_heartbeat(resource_id, make_beat("node1"))

        resource = self.store.get_resource(resource_id)
        self.assertIsNotNone(resource)
        self.assertEqual(resource.state, ExecutionResourceState.OK)

        # now send a terminated state for the node. resource should be removed.
        self.core.node_state("node1", domain_id_from_engine("engine1"), InstanceState.TERMINATED)

        self.assertTrue(self.store.get_resource(resource_id) is None)
        self.assertTrue(self.store.get_node("node1") is None)
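make_beat is a test helper that isn't shown in these examples. A plausible sketch, assuming a heartbeat is a simple dict carrying the node id, a process list, and a timestamp (field names are assumptions):

import datetime

def make_beat(node_id, processes=None, timestamp=None):
    # hypothetical reconstruction of the heartbeat payload these tests send
    if timestamp is None:
        timestamp = datetime.datetime.utcnow().isoformat()
    return {"node_id": node_id,
            "processes": processes or [],
            "timestamp": timestamp}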
Example #3
    def test_add_remove_node(self):
        self.core.node_state("node1", domain_id_from_engine("engine1"), InstanceState.RUNNING)

        node = self.store.get_node("node1")
        self.assertTrue(node is not None)
        self.assertEqual(node.node_id, "node1")
        self.assertEqual(node.domain_id, domain_id_from_engine("engine1"))

        self.core.node_state("node1", domain_id_from_engine("engine1"), InstanceState.TERMINATING)
        node = self.store.get_node("node1")
        self.assertTrue(node is None)

        # this shouldn't cause any problems even though node is gone
        self.core.node_state("node1", domain_id_from_engine("engine1"), InstanceState.TERMINATED)
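domain_id_from_engine is used throughout these examples; a reasonable guess is that it simply derives a deterministic EPUM domain name from the engine id, for example:

def domain_id_from_engine(engine_id):
    # assumed mapping: one EPUM domain per execution engine
    return "pd_domain_%s" % engine_id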
Example #4
    def test_heartbeat_node_update_race(self):

        # test processing two beats simultaneously, for eeagents on the same node.
        # check that they don't collide when updating the node record
        node_id = uuid.uuid4().hex
        self.core.node_state(node_id, domain_id_from_engine("engine1"), InstanceState.RUNNING)

        beat = make_beat(node_id)

        # this beat gets injected while the other is in the midst of processing
        sneaky_beat = make_beat(node_id)

        # when the PD attempts to update the node record, sneak in an update
        # first so the request conflicts
        original_update_node = self.store.update_node

        def patched_update_node(node):
            # unpatch ourself first so we don't recurse forever
            self.store.update_node = original_update_node

            self.core.ee_heartbeat("eeagent2", sneaky_beat)
            original_update_node(node)

        self.store.update_node = patched_update_node

        self.core.ee_heartbeat("eeagent1", beat)

        node = self.store.get_node(node_id)
        self.assertEqual(set(["eeagent1", "eeagent2"]), set(node.resources))
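The race in this test only resolves cleanly if node updates use optimistic concurrency and the caller re-reads and retries after a conflicting write. A sketch of that retry pattern; the conflict exception and the store semantics are assumptions:

class WriteConflictError(Exception):
    pass  # stand-in for the store's conflict error (name assumed)

def add_resource_to_node(store, node_id, resource_id):
    # hypothetical retry loop: re-read and re-apply after a conflicting update
    while True:
        node = store.get_node(node_id)
        if node is None or resource_id in node.resources:
            return
        node.resources.append(resource_id)
        try:
            store.update_node(node)
            return
        except WriteConflictError:
            continue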
Example #5
    def initialize(self):

        self.resources = {}
        self.queued_processes = []
        self.stale_processes = []
        self.throttled_processes = []

        self.resource_set_changed = True
        self.changed_resources = set()
        self.process_set_changed = True

        self.needs_matchmaking = True

        self.registered_needs = {}
        self._get_pending_processes()

        # create the domains if they don't already exist
        if self.epum_client:
            for engine in list(self.ee_registry):

                if not self.domain_definition_id:
                    raise Exception("domain definition must be provided")

                if not self.base_domain_config:
                    raise Exception("domain config must be provided")

                domain_id = domain_id_from_engine(engine.engine_id)
                try:
                    self.epum_client.describe_domain(domain_id)
                except NotFoundError:
                    config = self._get_domain_config(engine)
                    self.epum_client.add_domain(domain_id,
                        self.domain_definition_id, config,
                        subscriber_name=self.service_name,
                        subscriber_op='node_state')
Example #6
    def register_needs(self):

        self._get_pending_processes()

        for engine in list(self.ee_registry):

            engine_id = engine.engine_id

            need, unoccupied_nodes = self.calculate_need(engine_id)
            registered_need = self.registered_needs.get(engine_id)
            if need != registered_need:

                retiree_ids = None
                # on scale down, request that specific nodes be terminated
                if need < registered_need:

                    unoccupied_nodes.sort(key=self._node_state_time, reverse=True)
                    retiree_ids = unoccupied_nodes[:registered_need - need]
                    for resource in self.resources.itervalues():
                        if resource.node_id in retiree_ids:
                            self.core.resource_change_state(resource,
                                ExecutionResourceState.DISABLED)

                log.info("Scaling engine '%s' to %s nodes (was %s)",
                        engine_id, need, self.registered_needs.get(engine_id, 0))
                if retiree_ids:
                    log.info("Retiring engine '%s' nodes: %s", engine_id, ", ".join(retiree_ids))
                config = get_domain_reconfigure_config(need, retiree_ids)
                domain_id = domain_id_from_engine(engine_id)
                self.epum_client.reconfigure_domain(domain_id, config)
                self.registered_needs[engine_id] = need
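get_domain_reconfigure_config isn't shown; presumably it packages the target node count (and, on scale down, the specific retirable nodes) into an EPUM reconfigure message. A sketch with assumed key names:

def get_domain_reconfigure_config(preserve_n, retirable_nodes=None):
    # the "preserve_n" / "retirable_nodes" key names are assumptions
    engine_conf = {"preserve_n": preserve_n}
    if retirable_nodes:
        engine_conf["retirable_nodes"] = list(retirable_nodes)
    return {"engine_conf": engine_conf}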
Example #7
    def test_needs_maximum_vms(self):
        self.mm.initialize()

        self.assertFalse(self.epum_client.reconfigures)
        self.assertEqual(len(self.epum_client.domains), len(self.engine_conf.keys()))

        for engine_id in self.engine_conf:
            domain_id = domain_id_from_engine(engine_id)
            self.assertEqual(self.epum_client.domain_subs[domain_id],
                [(self.service_name, "node_state")])

        self.mm.register_needs()
        for engine_id in self.engine_conf:
            domain_id = domain_id_from_engine(engine_id)
            engine = self.engine_conf[engine_id]
            if engine.get('spare_slots', 0) == 0:
                self.assert_one_reconfigure(domain_id, 0, [])
        self.epum_client.clear()

        engine1_domain_id = domain_id_from_engine("engine1")
        engine2_domain_id = domain_id_from_engine("engine2")

        # engine1 has 1 slot and 1 replica per node, expect a VM per process
        self.enqueue_n_processes(10, "engine1")

        # engine2 has 2 slots and 1 replica per node, expect a VM per 2 processes
        self.enqueue_n_processes(10, "engine2")

        self.mm.register_needs()
        self.assert_one_reconfigure(engine1_domain_id, 10, [])
        self.assert_one_reconfigure(engine2_domain_id, 5, [])
        self.epum_client.clear()

        # engine1 has 1 slot and 1 replica per node, expect a VM per process, normally
        self.enqueue_n_processes(1000, "engine1")

        # engine2 has 2 slots and 1 replica per node, expect a VM per 2 processes, normally
        self.enqueue_n_processes(1000, "engine2")

        # But we set a maximum of 100 VMs per engine, so even though we have
        # thousands of processes queued, we only request 100 VMs for each domain
        self.mm.register_needs()
        self.assert_one_reconfigure(engine1_domain_id, 100, [])
        self.assert_one_reconfigure(engine2_domain_id, 100, [])
        self.epum_client.clear()
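The expected numbers follow from a per-engine need calculation along these lines: queued processes are spread across slots * replicas per node, rounded up, and the result is capped by the engine's maximum_vms setting (engines configured with spare_slots follow a different rule that isn't sketched here). A rough sketch, not the actual calculate_need implementation:

import math

def rough_need(process_count, slots_per_ee, replicas_per_node, maximum_vms=None):
    # engine1: 1 slot x 1 replica  -> one VM per process
    # engine2: 2 slots x 1 replica -> one VM per two processes
    slots_per_node = slots_per_ee * replicas_per_node
    need = int(math.ceil(float(process_count) / slots_per_node))
    if maximum_vms is not None:
        need = min(need, maximum_vms)  # the 100-VM cap exercised above
    return need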
Example #8
    def test_add_remove_node_with_resource_and_processes(self):
        self.core.node_state("node1", domain_id_from_engine("engine1"),
            InstanceState.RUNNING)
        resource_id = "eeagent_1"
        self.core.ee_heartbeat(resource_id, make_beat("node1"))

        # set up a few processes on the resource
        p1 = ProcessRecord.new(None, "proc1", {}, ProcessState.RUNNING,
                assigned=resource_id)
        self.store.add_process(p1)
        p2 = ProcessRecord.new(None, "proc2", {}, ProcessState.PENDING,
            assigned=resource_id)
        self.store.add_process(p2)
        p3 = ProcessRecord.new(None, "proc3", {}, ProcessState.TERMINATING,
            assigned=resource_id)
        self.store.add_process(p3)

        resource = self.store.get_resource(resource_id)
        resource.assigned = [p1.key, p2.key, p3.key]
        self.store.update_resource(resource)

        # now send a terminated state for the node. resource should be removed.
        self.core.node_state("node1", domain_id_from_engine("engine1"),
            InstanceState.TERMINATED)

        self.assertTrue(self.store.get_resource(resource_id) is None)
        self.assertTrue(self.store.get_node("node1") is None)

        queued_processes = set(self.store.get_queued_processes())

        # these two should have been rescheduled
        for procname in ("proc1", "proc2"):
            proc = self.store.get_process(None, procname)
            self.assertEqual(proc.state, ProcessState.DIED_REQUESTED)
            self.assertEqual(proc.round, 1)
            self.assertIn(proc.key, queued_processes)
            self.notifier.assert_process_state(procname, ProcessState.DIED_REQUESTED)

        # this one should be terminated
        proc3 = self.store.get_process(None, "proc3")
        self.assertEqual(proc3.state, ProcessState.TERMINATED)
        self.assertEqual(proc3.round, 0)
        self.assertNotIn(proc3.key, queued_processes)
        self.notifier.assert_process_state("proc3", ProcessState.TERMINATED)
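The assertions above imply roughly an evacuation pass over the terminated node's assigned processes: running or pending processes are marked DIED_REQUESTED, bumped a round, and put back on the queue, while a process that was already terminating is simply marked TERMINATED. A hedged sketch of that logic; the store method names are assumptions:

def evacuate_resource(store, resource):
    # hypothetical sketch of the rescheduling behaviour asserted above
    for owner, upid, _round in resource.assigned:
        process = store.get_process(owner, upid)
        if process.state == ProcessState.TERMINATING:
            # already shutting down: just mark it terminated
            process.state = ProcessState.TERMINATED
            store.update_process(process)
        else:
            # reschedule on a new round
            process.state = ProcessState.DIED_REQUESTED
            process.round += 1
            store.update_process(process)
            store.enqueue_process(*process.key)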
Example #9
    def _create_missing_domains(self):

        # create the domains if they don't already exist
        if self.epum_client:
            for engine in list(self.ee_registry):

                if not self.domain_definition_id:
                    raise Exception("domain definition must be provided")

                if not self.base_domain_config:
                    raise Exception("domain config must be provided")

                domain_id = domain_id_from_engine(engine.engine_id)
                try:
                    self.epum_client.describe_domain(domain_id)
                except NotFoundError:
                    config = self._get_domain_config(engine)
                    self.epum_client.add_domain(domain_id,
                        self.domain_definition_id, config,
                        subscriber_name=self.service_name,
                        subscriber_op='node_state')
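_get_domain_config isn't shown; presumably it layers the engine's own settings on top of base_domain_config before the domain is added. A sketch under that assumption (the engine_conf key and the engine.config attribute are guesses):

    def _get_domain_config(self, engine):
        # start from the shared base config and fold in engine-specific settings
        config = copy.deepcopy(self.base_domain_config)  # stdlib copy module
        engine_conf = config.setdefault("engine_conf", {})
        engine_conf.update(getattr(engine, "config", None) or {})
        return config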
Example #10
    def test_heartbeat_node_removed(self):

        # test processing a heartbeat where node is removed partway through
        node_id = uuid.uuid4().hex
        self.core.node_state(node_id, domain_id_from_engine("engine1"), InstanceState.RUNNING)

        beat = make_beat(node_id)

        original_update_node = self.store.update_node

        def patched_update_node(node):
            # unpatch ourself first so we don't recurse forever
            self.store.update_node = original_update_node
            self.store.remove_node(node.node_id)
            original_update_node(node)

        self.store.update_node = patched_update_node

        # this shouldn't blow up, and no resource should be added
        self.core.ee_heartbeat("eeagent1", beat)
        self.assertEqual(self.store.get_resource("eeagent1"), None)
Example #11
    def test_heartbeat_timestamps(self):

        # test that heartbeat timestamps are recorded, and that out-of-order
        # heartbeats don't move the timestamp backwards
        node_id = uuid.uuid4().hex
        self.core.node_state(node_id, domain_id_from_engine("engine1"), InstanceState.RUNNING)

        d1 = parse_datetime("2013-04-02T19:37:57.617734+00:00")
        d2 = parse_datetime("2013-04-02T19:38:57.617734+00:00")
        d3 = parse_datetime("2013-04-02T19:39:57.617734+00:00")

        self.core.ee_heartbeat("eeagent1", make_beat(node_id, timestamp=d1.isoformat()))

        resource = self.store.get_resource("eeagent1")
        self.assertEqual(resource.last_heartbeat_datetime, d1)

        self.core.ee_heartbeat("eeagent1", make_beat(node_id, timestamp=d3.isoformat()))
        resource = self.store.get_resource("eeagent1")
        self.assertEqual(resource.last_heartbeat_datetime, d3)

        # out of order heartbeat. time shouldn't be updated
        self.core.ee_heartbeat("eeagent1", make_beat(node_id, timestamp=d2.isoformat()))
        resource = self.store.get_resource("eeagent1")
        self.assertEqual(resource.last_heartbeat_datetime, d3)
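The out-of-order case implies the heartbeat handler guards on the timestamp, only advancing last_heartbeat_datetime when the incoming beat is strictly newer. A minimal free-standing sketch of that guard (not the PD's actual code):

def record_heartbeat_time(store, resource, beat_datetime):
    # ignore beats that are older than the newest one already recorded
    last = resource.last_heartbeat_datetime
    if last is not None and beat_datetime <= last:
        return
    resource.last_heartbeat_datetime = beat_datetime
    store.update_resource(resource)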
Example #12
    def announce_node(self, node_name, engine, process_dispatcher, state=None):
        """Announce a node to a process dispatcher.

        @param node_name: the name of the node to advertise
        @param engine: the execution engine of the node
        @param process_dispatcher: the pd to announce to
        @param state: the state to advertise to the pd
        """
        if not state:
            state = InstanceState.RUNNING

        pd_client = ProcessDispatcherClient(self.dashi, process_dispatcher)
        log.info("Announcing %s of engine %s is '%s' to %s" %
                 (node_name, engine, state, process_dispatcher))
        domain_id = domain_id_from_engine(engine)
        for i in range(1, ADVERTISE_RETRIES):
            try:
                pd_client.node_state(node_name, domain_id, state)
                break
            except timeout:
                wait_time = 2 ** i  # exponentially increasing wait
                log.warning("PD '%s' not available yet. Waiting %ss" %
                            (process_dispatcher, wait_time))
                time.sleep(wait_time)
Example #13
    def announce_node(self, node_name, engine, process_dispatcher,
            state=None):
        """Announce a node to a process dispatcher.

        @param node_name: the name of the node to advertise
        @param engine: the execution engine of the node
        @param process_dispatcher: the pd to announce to
        @param state: the state to advertise to the pd
        """
        if not state:
            state = InstanceState.RUNNING

        pd_client = ProcessDispatcherClient(self.dashi, process_dispatcher)
        log.info("Announcing %s of engine %s is '%s' to %s" % (node_name,
            engine, state, process_dispatcher))
        domain_id = domain_id_from_engine(engine)
        for i in range(1, ADVERTISE_RETRIES):
            try:
                pd_client.node_state(node_name, domain_id, state)
                break
            except timeout:
                wait_time = 2 ** i  # exponentially increasing wait
                log.warning("PD '%s' not available yet. Waiting %ss" % (process_dispatcher, wait_time))
                time.sleep(wait_time)
Example #14
    def _send_node_state(self, engine_id, node_id=None):
        node_id = node_id or uuid.uuid4().hex
        node_state = dict(node_id=node_id, state=InstanceState.RUNNING,
            domain_id=domain_id_from_engine(engine_id))
        self.dashi.fire(get_pd_dashi_name(), "node_state", args=node_state)
Example #15
    def test_needs(self):
        self.mm.initialize()

        self.assertFalse(self.epum_client.reconfigures)
        self.assertEqual(len(self.epum_client.domains), len(self.engine_conf.keys()))

        for engine_id in self.engine_conf:
            domain_id = domain_id_from_engine(engine_id)
            self.assertEqual(self.epum_client.domain_subs[domain_id],
                [(self.service_name, "node_state")])

        self.mm.register_needs()
        for engine_id in self.engine_conf:
            domain_id = domain_id_from_engine(engine_id)
            engine = self.engine_conf[engine_id]
            if engine.get('spare_slots', 0) == 0:
                self.assert_one_reconfigure(domain_id, 0, [])
        self.epum_client.clear()

        engine1_domain_id = domain_id_from_engine("engine1")
        engine2_domain_id = domain_id_from_engine("engine2")
        engine3_domain_id = domain_id_from_engine("engine3")
        engine4_domain_id = domain_id_from_engine("engine4")

        # engine1 has 1 slot and 1 replica per node, expect a VM per process
        engine1_procs = self.enqueue_n_processes(10, "engine1")

        # engine2 has 2 slots and 1 replica per node, expect a VM per 2 processes
        engine2_procs = self.enqueue_n_processes(10, "engine2")

        # engine3 has 2 slots and 2 replicas per node, expect a VM per 4 processes
        engine3_procs = self.enqueue_n_processes(10, "engine3")

        # engine4 has 2 slots and 1 replica per node, and a
        # minimum of 1 free slot, expect a VM per process + 1
        engine4_procs = self.enqueue_n_processes(10, "engine4")

        self.mm.register_needs()
        self.assert_one_reconfigure(engine1_domain_id, 10, [])
        self.assert_one_reconfigure(engine2_domain_id, 5, [])
        self.assert_one_reconfigure(engine3_domain_id, 3, [])
        self.assert_one_reconfigure(engine4_domain_id, 11, [])
        self.epum_client.clear()

        # now add some resources with assigned processes
        # and remove queued processes. need shouldn't change.
        engine1_resources = self.create_engine_resources("engine1",
            node_count=10, assignments=engine1_procs)
        self.assertEqual(len(engine1_resources), 10)
        engine2_resources = self.create_engine_resources("engine2",
            node_count=5, assignments=engine2_procs)
        self.assertEqual(len(engine2_resources), 5)
        engine3_resources = self.create_engine_resources("engine3",
            node_count=3, assignments=engine3_procs)
        self.assertEqual(len(engine3_resources), 6)
        engine4_resources = self.create_engine_resources("engine4",
            node_count=11, assignments=engine4_procs)
        self.assertEqual(len(engine4_resources), 11)
        self.mm.queued_processes = []

        self.mm.register_needs()
        self.assertFalse(self.epum_client.reconfigures)

        # now try scale down

        # empty 2 resources from engine1. 2 nodes should be terminated.
        engine1_retirees = set()
        for resource in engine1_resources[:2]:
            engine1_retirees.add(resource.node_id)
            resource.assigned = []

        # empty 2 resources from engine2. 2 nodes should be terminated
        engine2_retirees = set()
        for resource in engine2_resources[:2]:
            engine2_retirees.add(resource.node_id)
            resource.assigned = []

        # empty 3 resources from engine3.  1 node should be terminated
        for resource in engine3_resources[:3]:
            resource.assigned = []
        engine3_retirees = set([engine3_resources[0].node_id])

        # empty 2 resources from engine4.  2 nodes should be terminated
        engine4_retirees = set()
        for resource in engine4_resources:
            if len(resource.assigned) > 0:
                engine4_retirees.add(resource.node_id)
                resource.assigned = []

            if len(engine4_retirees) >= 2:
                break

        self.mm.register_needs()
        self.assert_one_reconfigure(engine1_domain_id, 8,
            engine1_retirees)
        self.assert_one_reconfigure(engine2_domain_id, 3,
            engine2_retirees)
        self.assert_one_reconfigure(engine3_domain_id, 2,
            engine3_retirees)
        # Note that we cannot check which nodes have retired, since the spare
        # one may be terminated
        self.assert_one_reconfigure(engine4_domain_id, 9)
        self.epum_client.clear()
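The engine3 numbers are worth spelling out: it runs two replicas per node, so emptying three of its six resources only fully vacates one node. A node is only retirable once every resource on it is unassigned; a small sketch of that check (not the matchmaker's code):

def retirable_nodes(resources):
    # a node can only be retired when all of its resources are unassigned
    occupied = set()
    seen = set()
    for resource in resources:
        seen.add(resource.node_id)
        if resource.assigned:
            occupied.add(resource.node_id)
    return seen - occupied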
Example #16
    def test_needs_unscheduled_pending(self):

        # engine1 has 1 slot, expect a VM per process
        engine1_pending_procs = self.create_n_pending_processes(10, "engine1")

        # engine2 has 2 slots, expect a VM per 2 processes
        engine2_pending_procs = self.create_n_pending_processes(10, "engine2")

        # Normally this is done by the doctor, but we do it manually here,
        # since there is no doctor in this test env
        self.store.set_initialized()
        self.store.set_pd_state(ProcessDispatcherState.SYSTEM_BOOTING)

        self.mm.initialize()

        self.assertFalse(self.epum_client.reconfigures)
        self.assertEqual(len(self.epum_client.domains), len(self.engine_conf.keys()))

        for engine_id in self.engine_conf:
            domain_id = domain_id_from_engine(engine_id)
            self.assertEqual(self.epum_client.domain_subs[domain_id],
                [(self.service_name, "node_state")])

        engine1_domain_id = domain_id_from_engine("engine1")
        engine2_domain_id = domain_id_from_engine("engine2")

        self.mm.register_needs()

        # we should see VMs even though we have no queued procs
        self.assert_one_reconfigure(engine1_domain_id, 10, [])
        self.assert_one_reconfigure(engine2_domain_id, 5, [])

        self.epum_client.clear()

        # engine1 has 1 slot, expect a VM per process
        engine1_queued_procs = self.enqueue_n_processes(10, "engine1")

        # engine2 has 2 slots, expect a VM per 2 processes
        engine2_queued_procs = self.enqueue_n_processes(10, "engine2")

        self.mm.register_needs()

        # we should see VMs for both the queued and the pending procs
        self.assert_one_reconfigure(engine1_domain_id, 20, [])
        self.assert_one_reconfigure(engine2_domain_id, 10, [])
        self.epum_client.clear()

        # Now enqueue the pending procs (the PD hasn't been marked OK yet)
        self.enqueue_pending_processes()
        self.mm.register_needs()

        # The matchmaker won't have checked for the updated pending procs
        self.assertEqual(len(self.mm.unscheduled_pending_processes), 20)

        # But there should be no change to requested VMs, since we should
        # deduplicate processes
        self.assertFalse(self.epum_client.reconfigures)

        self.store.set_pd_state(ProcessDispatcherState.OK)

        self.mm.register_needs()

        # The matchmaker should have no pending processes
        self.assertEqual(len(self.mm.unscheduled_pending_processes), 0)

        # There should be no change to requested VMs
        self.assertFalse(self.epum_client.reconfigures)

        self.epum_client.clear()

        # now add some resources with assigned processes
        # and remove queued processes. need shouldn't change.
        engine1_procs = engine1_queued_procs + engine1_pending_procs
        engine2_procs = engine2_queued_procs + engine2_pending_procs
        engine1_resources = self.create_engine_resources("engine1",
            node_count=20, assignments=engine1_procs)
        self.assertEqual(len(engine1_resources), 20)
        engine2_resources = self.create_engine_resources("engine2",
            node_count=10, assignments=engine2_procs)
        self.assertEqual(len(engine2_resources), 10)
        self.mm.queued_processes = []

        self.mm.register_needs()
        self.assertFalse(self.epum_client.reconfigures)

        # empty resources from engine1. all nodes should be terminated.
        engine1_retirees = set()
        for resource in engine1_resources:
            engine1_retirees.add(resource.node_id)
            resource.assigned = []

        # empty resources from engine2. all nodes should be terminated
        engine2_retirees = set()
        for resource in engine2_resources:
            engine2_retirees.add(resource.node_id)
            resource.assigned = []

        self.mm.register_needs()

        # need drops to zero and all nodes should be retired
        self.assert_one_reconfigure(engine1_domain_id, 0, engine1_retirees)
        self.assert_one_reconfigure(engine2_domain_id, 0, engine2_retirees)
        self.epum_client.clear()
Example #17
    def test_uninitialized_system_boot_with_state(self):
        self.store.set_system_boot(True)
        self.core.node_state("node1", domain_id_from_engine("engine1"),
            InstanceState.RUNNING)
        resource_id = "eeagent_1"
        self.core.ee_heartbeat(resource_id, make_beat("node1"))

        p0 = ProcessRecord.new(None, "proc0", {}, ProcessState.RUNNING,
                configuration=nosystemrestart_process_config(),
                assigned=resource_id,
                restart_mode=RestartMode.ALWAYS)
        self.store.add_process(p0)
        p1 = ProcessRecord.new(None, "proc1", {}, ProcessState.RUNNING,
                assigned=resource_id)
        self.store.add_process(p1)
        p2 = ProcessRecord.new(None, "proc2", {}, ProcessState.PENDING,
            assigned=resource_id)
        self.store.add_process(p2)
        p3 = ProcessRecord.new(None, "proc3", {}, ProcessState.TERMINATING,
            assigned=resource_id)
        self.store.add_process(p3)

        # this one shouldn't restart
        p4 = ProcessRecord.new(None, "proc4", {}, ProcessState.RUNNING,
                configuration=nosystemrestart_process_config(),
                assigned=resource_id,
                restart_mode=RestartMode.ABNORMAL)
        self.store.add_process(p4)

        # non-running processes should also potentially be restarted on boot
        p5 = ProcessRecord.new(None, "proc5", {}, ProcessState.WAITING)
        self.store.add_process(p5)
        self.store.enqueue_process(*p5.key)
        p6 = ProcessRecord.new(None, "proc6", {}, ProcessState.REQUESTED)
        self.store.add_process(p6)

        # not this one, due to its no-system-restart configuration
        p7 = ProcessRecord.new(None, "proc7", {}, ProcessState.REQUESTED,
            configuration=nosystemrestart_process_config(),
            restart_mode=RestartMode.ALWAYS)
        self.store.add_process(p7)
        self.store.enqueue_process(*p7.key)

        resource = self.store.get_resource(resource_id)
        resource.assigned = [p0.key, p1.key, p2.key, p3.key, p4.key]
        self.store.update_resource(resource)

        restartable_procs = ["proc1", "proc2", "proc5", "proc6"]
        dead_procs = ["proc0", "proc4", "proc7"]

        self._run_in_thread()

        assert self.store.wait_initialized(timeout=10)

        self.assertEqual(len(self.store.get_queued_processes()), 0)
        self.assertEqual(len(self.store.get_node_ids()), 0)
        self.assertEqual(len(self.store.get_resource_ids()), 0)

        for proc in restartable_procs:
            self.assertEqual(self.store.get_process(None, proc).state,
                             ProcessState.UNSCHEDULED_PENDING)
        for proc in dead_procs:
            self.assertEqual(self.store.get_process(None, proc).state,
                             ProcessState.TERMINATED)
        self.assertEqual(self.store.get_process(None, "proc3").state,
                         ProcessState.TERMINATED)

        self.assertEqual(self.store.get_pd_state(),
                         ProcessDispatcherState.SYSTEM_BOOTING)

        # now end system boot
        self.store.set_system_boot(False)

        wait(lambda: self.store.get_pd_state() == ProcessDispatcherState.OK)

        # check that pending processes were correctly rescheduled
        self.assertEqual(len(self.store.get_queued_processes()), len(restartable_procs))
        for proc in restartable_procs:
            self.assertEqual(self.store.get_process(None, proc).state,
                             ProcessState.REQUESTED)
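nosystemrestart_process_config is a test helper; presumably it returns a process configuration that opts the process out of being restarted across a system boot, which is why proc0, proc4 and proc7 end up TERMINATED. A sketch with an assumed key name:

def nosystemrestart_process_config():
    # the omit_from_system_restart key name is an assumption
    return {"process": {"omit_from_system_restart": True}}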
Example #18
    def test_resource_monitor(self):
        t0 = datetime(2012, 3, 13, 9, 30, 0, tzinfo=UTC)
        mock_now = Mock()
        mock_now.return_value = t0

        def increment_now(seconds):
            t = mock_now.return_value + timedelta(seconds=seconds)
            mock_now.return_value = t
            log.debug("THE TIME IS NOW: %s", t)
            return t

        monitor = self._setup_resource_monitor()
        monitor._now_func = mock_now

        # before there are any resources, monitor should work but return a None delay
        self.assertIsNone(monitor.monitor_cycle())

        self.core.node_state("node1", domain_id_from_engine("engine1"),
            InstanceState.RUNNING)

        # 3 resources. all report in at t0
        r1, r2, r3 = "eeagent_1", "eeagent_2", "eeagent_3"
        self._send_heartbeat(r1, "node1", t0)
        self._send_heartbeat(r2, "node1", t0)
        self._send_heartbeat(r3, "node1", t0)

        states = {r1: ExecutionResourceState.OK, r2: ExecutionResourceState.OK,
                  r3: ExecutionResourceState.OK}

        self.assert_monitor_cycle(10, states)

        t1 = increment_now(5)  # :05
        # heartbeat comes in for r1 5 seconds later
        self._send_heartbeat(r1, "node1", t1)

        self.assert_monitor_cycle(5, states)

        increment_now(5)  # :10

        # no heartbeats for r2 and r3. they should be marked WARNING
        states[r2] = ExecutionResourceState.WARNING
        states[r3] = ExecutionResourceState.WARNING
        self.assert_monitor_cycle(5, states)

        increment_now(4)  # :14

        # r2 gets a heartbeat through, but its timestamp still leaves it past the warning threshold
        self._send_heartbeat(r2, "node1", t0 + timedelta(seconds=1))

        self.assert_monitor_cycle(1, states)

        increment_now(6)  # :20

        # r1 should go warning, r3 should go missing
        states[r1] = ExecutionResourceState.WARNING
        states[r3] = ExecutionResourceState.MISSING
        self.assert_monitor_cycle(4, states)

        t2 = increment_now(3)  # :23
        self._send_heartbeat(r1, "node1", t2)
        states[r1] = ExecutionResourceState.OK
        self.assert_monitor_cycle(1, states)

        t3 = increment_now(2)  # :25
        self._send_heartbeat(r3, "node1", t3)
        states[r2] = ExecutionResourceState.MISSING
        states[r3] = ExecutionResourceState.OK
        self.assert_monitor_cycle(8, states)

        increment_now(5)  # :30
        # heartbeat r2 enough to go back to WARNING, but still late
        self._send_heartbeat(r2, "node1", t0 + timedelta(seconds=15))
        states[r2] = ExecutionResourceState.WARNING
        self.assert_monitor_cycle(3, states)

        t4 = increment_now(5)  # :35
        # disable r2 and heartbeat r1 and r3 (heartbeats arrive late, but that's ok)
        self._send_heartbeat(r1, "node1", t4)
        self._send_heartbeat(r3, "node1", t4)
        self.core.resource_change_state(self.store.get_resource(r2),
            ExecutionResourceState.DISABLED)

        states[r2] = ExecutionResourceState.DISABLED
        self.assert_monitor_cycle(10, states)
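_send_heartbeat is this test's convenience wrapper around ee_heartbeat; presumably something like the following, where make_beat is assumed to accept the datetime timestamp directly:

    def _send_heartbeat(self, resource_id, node_id, timestamp):
        # assumed wrapper: one beat from node_id, stamped with the given time
        self.core.ee_heartbeat(resource_id, make_beat(node_id, timestamp=timestamp))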