Ejemplo n.º 1
0
 def setUp(self):
     """Create the fixtures shared by the tests in this case.

     Obtains a store from ``get_store()``, builds an engine registry from the
     class's ``engine_conf``, creates a ``Mock`` resource client and a
     ``MockNotifier``, and wires all four into a ``ProcessDispatcherCore``.
     """
     self.store = self.get_store()
     self.registry = EngineRegistry.from_config(self.engine_conf)
     self.resource_client = Mock()
     self.notifier = MockNotifier()

     # The core takes its collaborators positionally, in exactly this order.
     collaborators = (
         self.store,
         self.registry,
         self.resource_client,
         self.notifier,
     )
     self.core = ProcessDispatcherCore(*collaborators)
Ejemplo n.º 2
0
    def setUp(self):
        """Assemble a ``PDDoctor`` on top of a freshly built core.

        The store comes from ``setup_store()``; the registry is built from the
        class's ``engine_conf``; the resource client and notifier are the
        ``MockResourceClient`` and ``MockNotifier`` stand-ins.
        """
        self.store = self.setup_store()
        self.registry = EngineRegistry.from_config(self.engine_conf)
        self.resource_client = MockResourceClient()
        self.notifier = MockNotifier()

        core_args = (self.store, self.registry,
                     self.resource_client, self.notifier)
        self.core = ProcessDispatcherCore(*core_args)
        self.doctor = PDDoctor(self.core, self.store)

        # Placeholders only; this method leaves both attributes as None.
        self.docthread = self.monitor = None
Ejemplo n.º 3
0
    def __init__(self, amqp_uri=None, topic="process_dispatcher", registry=None,
                 store=None, epum_client=None, notifier=None, definition_id=None,
                 domain_config=None, sysname=None):
        """Load configuration and wire together the dispatcher's components.

        Args:
            amqp_uri: Forwarded to ``bootstrap.dashi_connect``.
            topic: Fallback service topic, used when the ``service_name``
                setting is absent.
            registry: Engine registry to use; when falsy one is built from the
                ``engines``, ``default_engine`` and ``process_engines`` settings.
            store: Store to use; when falsy one is obtained from
                ``get_processdispatcher_store``. ``initialize()`` is called on
                it in either case.
            epum_client: EPUM client to use. When given, ``definition_id`` and
                ``domain_config`` are taken from the arguments only.
            notifier: Notifier to use; when falsy a ``SubscriberNotifier`` on
                the dashi connection is created.
            definition_id: Domain definition id; falls back to the
                ``definition_id`` setting when no ``epum_client`` is given and
                the ``static_resources`` setting is falsy.
            domain_config: Base domain config; same fallback rule as
                ``definition_id``, using the ``domain_config`` setting.
            sysname: Forwarded to ``bootstrap.dashi_connect``.
        """

        configs = ["service", "processdispatcher"]
        config_files = get_config_paths(configs)
        self.CFG = bootstrap.configure(config_files)
        self.topic = self.CFG.processdispatcher.get('service_name', topic)

        self.dashi = bootstrap.dashi_connect(self.topic, self.CFG,
                                             amqp_uri=amqp_uri, sysname=sysname)

        engine_conf = self.CFG.processdispatcher.get('engines', {})
        default_engine = self.CFG.processdispatcher.get('default_engine')
        process_engines = self.CFG.processdispatcher.get('process_engines')
        if default_engine is None and len(engine_conf.keys()) == 1:
            # With exactly one configured engine, that engine is the default.
            # keys() is a non-indexable view on Python 3, so materialize it
            # into a list before taking the first element.
            default_engine = list(engine_conf.keys())[0]
        self.store = store or get_processdispatcher_store(self.CFG)
        self.store.initialize()
        self.registry = registry or EngineRegistry.from_config(engine_conf,
            default=default_engine, process_engines=process_engines)
        self.eeagent_client = EEAgentClient(self.dashi)

        domain_definition_id = None
        base_domain_config = None
        # allow disabling communication with EPUM for epuharness case
        if epum_client:
            # Caller-supplied client: use the caller's domain settings as-is.
            self.epum_client = epum_client
            domain_definition_id = definition_id
            base_domain_config = domain_config
        elif not self.CFG.processdispatcher.get('static_resources'):
            # No client supplied and resources are not static: build a client
            # and let configuration fill in any missing domain settings.
            domain_definition_id = definition_id or self.CFG.processdispatcher.get('definition_id')
            base_domain_config = domain_config or self.CFG.processdispatcher.get('domain_config')
            epum_service_name = self.CFG.processdispatcher.get('epum_service_name',
                    'epu_management_service')
            self.epum_client = EPUManagementClient(self.dashi, epum_service_name)

        else:
            # Static resources: no EPUM client is created.
            self.epum_client = None

        if notifier:
            self.notifier = notifier
        else:
            self.notifier = SubscriberNotifier(self.dashi)

        self.core = ProcessDispatcherCore(self.store,
                                          self.registry,
                                          self.eeagent_client,
                                          self.notifier)

        launch_type = self.CFG.processdispatcher.get('launch_type', 'supd')
        restart_throttling_config = self.CFG.processdispatcher.get('restart_throttling_config', {})
        dispatch_retry_seconds = self.CFG.processdispatcher.get('dispatch_retry_seconds')

        self.matchmaker = PDMatchmaker(self.core, self.store, self.eeagent_client,
            self.registry, self.epum_client, self.notifier, self.topic,
            domain_definition_id, base_domain_config, launch_type,
            restart_throttling_config, dispatch_retry_seconds)

        self.doctor = PDDoctor(self.core, self.store, config=self.CFG)
        # Set by start() once handlers are registered; cleared by stop()
        # (NOTE(review): start/stop are not visible in this excerpt — confirm).
        self.ready_event = threading.Event()
Ejemplo n.º 4
0
class ProcessDispatcherService(object):
    """Process Dispatcher (PD) service interface.

    The constructor builds the store, engine registry, core, matchmaker and
    doctor from configuration. ``start()`` registers the public methods below
    as dashi handlers; each of them delegates to ``ProcessDispatcherCore``.
    """

    def __init__(self, amqp_uri=None, topic="process_dispatcher", registry=None,
                 store=None, epum_client=None, notifier=None, definition_id=None,
                 domain_config=None, sysname=None):
        """Load configuration and wire together the dispatcher's components.

        Args:
            amqp_uri: Forwarded to ``bootstrap.dashi_connect``.
            topic: Fallback service topic, used when the ``service_name``
                setting is absent.
            registry: Engine registry to use; when falsy one is built from the
                ``engines``, ``default_engine`` and ``process_engines`` settings.
            store: Store to use; when falsy one is obtained from
                ``get_processdispatcher_store``. ``initialize()`` is called on
                it in either case.
            epum_client: EPUM client to use. When given, ``definition_id`` and
                ``domain_config`` are taken from the arguments only.
            notifier: Notifier to use; when falsy a ``SubscriberNotifier`` on
                the dashi connection is created.
            definition_id: Domain definition id; falls back to the
                ``definition_id`` setting when no ``epum_client`` is given and
                the ``static_resources`` setting is falsy.
            domain_config: Base domain config; same fallback rule as
                ``definition_id``, using the ``domain_config`` setting.
            sysname: Forwarded to ``bootstrap.dashi_connect``.
        """

        configs = ["service", "processdispatcher"]
        config_files = get_config_paths(configs)
        self.CFG = bootstrap.configure(config_files)
        self.topic = self.CFG.processdispatcher.get('service_name', topic)

        self.dashi = bootstrap.dashi_connect(self.topic, self.CFG,
                                             amqp_uri=amqp_uri, sysname=sysname)

        engine_conf = self.CFG.processdispatcher.get('engines', {})
        default_engine = self.CFG.processdispatcher.get('default_engine')
        process_engines = self.CFG.processdispatcher.get('process_engines')
        default_engine = self._select_default_engine(engine_conf, default_engine)
        self.store = store or get_processdispatcher_store(self.CFG)
        self.store.initialize()
        self.registry = registry or EngineRegistry.from_config(engine_conf,
            default=default_engine, process_engines=process_engines)
        self.eeagent_client = EEAgentClient(self.dashi)

        domain_definition_id = None
        base_domain_config = None
        # allow disabling communication with EPUM for epuharness case
        if epum_client:
            # Caller-supplied client: use the caller's domain settings as-is.
            self.epum_client = epum_client
            domain_definition_id = definition_id
            base_domain_config = domain_config
        elif not self.CFG.processdispatcher.get('static_resources'):
            # No client supplied and resources are not static: build a client
            # and let configuration fill in any missing domain settings.
            domain_definition_id = definition_id or self.CFG.processdispatcher.get('definition_id')
            base_domain_config = domain_config or self.CFG.processdispatcher.get('domain_config')
            epum_service_name = self.CFG.processdispatcher.get('epum_service_name',
                    'epu_management_service')
            self.epum_client = EPUManagementClient(self.dashi, epum_service_name)

        else:
            # Static resources: no EPUM client is created.
            self.epum_client = None

        if notifier:
            self.notifier = notifier
        else:
            self.notifier = SubscriberNotifier(self.dashi)

        self.core = ProcessDispatcherCore(self.store,
                                          self.registry,
                                          self.eeagent_client,
                                          self.notifier)

        launch_type = self.CFG.processdispatcher.get('launch_type', 'supd')
        restart_throttling_config = self.CFG.processdispatcher.get('restart_throttling_config', {})
        dispatch_retry_seconds = self.CFG.processdispatcher.get('dispatch_retry_seconds')

        self.matchmaker = PDMatchmaker(self.core, self.store, self.eeagent_client,
            self.registry, self.epum_client, self.notifier, self.topic,
            domain_definition_id, base_domain_config, launch_type,
            restart_throttling_config, dispatch_retry_seconds)

        self.doctor = PDDoctor(self.core, self.store, config=self.CFG)
        # Set by start() once handlers are registered; cleared by stop().
        self.ready_event = threading.Event()

    @staticmethod
    def _select_default_engine(engine_conf, default_engine):
        """Return the default engine id, inferring it when unambiguous.

        Args:
            engine_conf: Mapping of engine id to engine settings.
            default_engine: Explicitly configured default, or None.

        Returns:
            ``default_engine`` when it is not None; otherwise the sole key of
            ``engine_conf`` when it has exactly one entry; otherwise None.
        """
        if default_engine is None and len(engine_conf.keys()) == 1:
            # keys() is a non-indexable view on Python 3, so materialize it
            # into a list before taking the first element.
            return list(engine_conf.keys())[0]
        return default_engine

    def start(self):
        """Start elections, register the dashi handlers and consume messages.

        Sets ``ready_event`` just before entering ``dashi.consume()``. A
        ``KeyboardInterrupt`` raised from ``consume()`` is logged and
        swallowed; other exceptions propagate.
        """

        # start the doctor before we do anything else
        log.debug("Starting doctor election")
        self.doctor.start_election()

        log.debug("Waiting for Doctor to initialize the Process Dispatcher")
        # wait for the store to be initialized before proceeding. The doctor
        # (maybe not OUR doctor, but whoever gets elected), will check the
        # state of the system and then mark it as initialized.
        self.store.wait_initialized()

        epu.dashiproc.link_dashi_exceptions(self.dashi)

        self.dashi.handle(self.set_system_boot)
        self.dashi.handle(self.create_definition)
        self.dashi.handle(self.describe_definition)
        self.dashi.handle(self.update_definition)
        self.dashi.handle(self.remove_definition)
        self.dashi.handle(self.list_definitions)
        self.dashi.handle(self.create_process)
        self.dashi.handle(self.schedule_process)
        self.dashi.handle(self.describe_process)
        self.dashi.handle(self.describe_processes)
        self.dashi.handle(self.restart_process)
        self.dashi.handle(self.terminate_process)
        self.dashi.handle(self.node_state)
        # heartbeat is the only handler that is also told who sent the message
        self.dashi.handle(self.heartbeat, sender_kwarg='sender')
        self.dashi.handle(self.dump)

        self.matchmaker.start_election()

        self.ready_event.set()

        try:
            self.dashi.consume()
        except KeyboardInterrupt:
            log.warning("Caught terminate signal. Bye!")
        else:
            log.info("Exiting normally. Bye!")

    def stop(self):
        """Clear ``ready_event``, then cancel and disconnect dashi and shut down the store."""
        self.ready_event.clear()
        self.dashi.cancel()
        self.dashi.disconnect()
        self.store.shutdown()

    def _make_process_dict(self, proc):
        """Return the ``upid``, ``state``, ``round`` and ``assigned`` attributes of ``proc`` as a dict."""
        return dict(upid=proc.upid, state=proc.state, round=proc.round,
                    assigned=proc.assigned)

    def set_system_boot(self, system_boot):
        """Forward the system-boot flag to the core."""
        self.core.set_system_boot(system_boot)

    def create_definition(self, definition_id, definition_type, executable,
                          name=None, description=None):
        """Forward a definition-creation request to the core; returns None."""
        self.core.create_definition(definition_id, definition_type, executable,
            name=name, description=description)

    def describe_definition(self, definition_id):
        """Return the core's description of ``definition_id``."""
        return self.core.describe_definition(definition_id)

    def update_definition(self, definition_id, definition_type, executable,
                          name=None, description=None):
        """Forward a definition-update request to the core; returns None."""
        self.core.update_definition(definition_id, definition_type, executable,
            name=name, description=description)

    def remove_definition(self, definition_id):
        """Forward a definition-removal request to the core; returns None."""
        self.core.remove_definition(definition_id)

    def list_definitions(self):
        """Return whatever the core's ``list_definitions()`` returns."""
        return self.core.list_definitions()

    def create_process(self, upid, definition_id, name=None):
        """Create a process through the core and return its summary dict.

        The core's first positional argument is always passed as None
        (NOTE(review): presumably the process owner — confirm).
        """
        result = self.core.create_process(None, upid, definition_id, name=name)
        return self._make_process_dict(result)

    def schedule_process(self, upid, definition_id=None, configuration=None,
                         subscribers=None, constraints=None,
                         queueing_mode=None, restart_mode=None,
                         execution_engine_id=None, node_exclusive=None,
                         name=None):
        """Schedule a process through the core and return its summary dict.

        All keyword arguments are forwarded unchanged; the core's first
        positional argument is always passed as None.
        """

        result = self.core.schedule_process(None, upid=upid,
            definition_id=definition_id, configuration=configuration,
            subscribers=subscribers, constraints=constraints,
            queueing_mode=queueing_mode, restart_mode=restart_mode,
            node_exclusive=node_exclusive,
            execution_engine_id=execution_engine_id, name=name)
        return self._make_process_dict(result)

    def describe_process(self, upid):
        """Return the core's description of the process ``upid``."""
        return self.core.describe_process(None, upid)

    def describe_processes(self):
        """Return whatever the core's ``describe_processes()`` returns."""
        return self.core.describe_processes()

    def restart_process(self, upid):
        """Restart ``upid`` through the core and return its summary dict."""
        result = self.core.restart_process(None, upid)
        return self._make_process_dict(result)

    def terminate_process(self, upid):
        """Terminate ``upid`` through the core and return its summary dict."""
        result = self.core.terminate_process(None, upid)
        return self._make_process_dict(result)

    def node_state(self, node_id, domain_id, state, properties=None):
        """Forward a node state report to the core; returns None."""
        self.core.node_state(node_id, domain_id, state, properties=properties)

    def heartbeat(self, sender, message):
        """Log a heartbeat and pass it to the core's ``ee_heartbeat``."""
        log.debug("got heartbeat from %s: %s", sender, message)
        self.core.ee_heartbeat(sender, message)

    def dump(self):
        """Return whatever the core's ``dump()`` returns."""
        return self.core.dump()
Ejemplo n.º 5
0
class PDDoctorTests(unittest.TestCase, StoreTestMixin):
    """Tests for ``PDDoctor`` start-up handling and ``ExecutionResourceMonitor``.

    Each doctor test seeds the store, runs ``doctor.inaugurate`` in a
    background thread and then checks the resulting store state. The
    resource-monitor test drives ``monitor_cycle()`` directly against a
    mocked clock.
    """

    # Single engine; the heartbeat_* values are presumably seconds (the
    # resource-monitor test advances its mocked clock in seconds) — confirm.
    engine_conf = {'engine1': {'slots': 4, 'heartbeat_period': 5,
                   'heartbeat_warning': 10, 'heartbeat_missing': 20}}

    def setUp(self):
        """Build the store, registry, mock clients, core and doctor under test."""
        self.store = self.setup_store()
        self.registry = EngineRegistry.from_config(self.engine_conf)
        self.resource_client = MockResourceClient()
        self.notifier = MockNotifier()
        self.core = ProcessDispatcherCore(self.store, self.registry,
            self.resource_client, self.notifier)
        self.doctor = PDDoctor(self.core, self.store)

        # set by _run_in_thread(); tearDown() joins it when present
        self.docthread = None

        # set by _setup_resource_monitor()
        self.monitor = None

    def tearDown(self):
        """Cancel and join the doctor thread if one was started, then tear down the store."""
        if self.docthread:
            self.doctor.cancel()
            self.docthread.join()
            self.docthread = None

        self.teardown_store()

    def setup_store(self):
        """Return a new ``ProcessDispatcherStore``."""
        return ProcessDispatcherStore()

    def teardown_store(self):
        """No-op; this store needs no cleanup here."""
        return

    def _run_in_thread(self):
        """Spawn ``doctor.inaugurate`` via ``tevent.spawn`` and pause for 50 ms."""
        self.docthread = tevent.spawn(self.doctor.inaugurate)
        # presumably gives the spawned doctor a moment to start — confirm
        time.sleep(0.05)

    def test_uninitialized_system_boot_with_state(self):
        # Doctor arrives at an uninitialized store during system boot, with a
        # node, a resource and processes in assorted states already present.
        # Expected: queue, nodes and resources are emptied; restartable
        # processes become UNSCHEDULED_PENDING and the others TERMINATED.
        # Once the boot flag is cleared, the restartable ones are queued again
        # in the REQUESTED state.
        self.store.set_system_boot(True)
        self.core.node_state("node1", domain_id_from_engine("engine1"),
            InstanceState.RUNNING)
        resource_id = "eeagent_1"
        self.core.ee_heartbeat(resource_id, make_beat("node1"))

        # expected to end up TERMINATED (listed in dead_procs below)
        p0 = ProcessRecord.new(None, "proc0", {}, ProcessState.RUNNING,
                configuration=nosystemrestart_process_config(),
                assigned=resource_id,
                restart_mode=RestartMode.ALWAYS)
        self.store.add_process(p0)
        p1 = ProcessRecord.new(None, "proc1", {}, ProcessState.RUNNING,
                assigned=resource_id)
        self.store.add_process(p1)
        p2 = ProcessRecord.new(None, "proc2", {}, ProcessState.PENDING,
            assigned=resource_id)
        self.store.add_process(p2)
        # already TERMINATING; asserted separately to end up TERMINATED
        p3 = ProcessRecord.new(None, "proc3", {}, ProcessState.TERMINATING,
            assigned=resource_id)
        self.store.add_process(p3)

        # this one shouldn't restart
        p4 = ProcessRecord.new(None, "proc4", {}, ProcessState.RUNNING,
                configuration=nosystemrestart_process_config(),
                assigned=resource_id,
                restart_mode=RestartMode.ABNORMAL)
        self.store.add_process(p4)

        # non-running processes should also potentially be restarted on boot
        p5 = ProcessRecord.new(None, "proc5", {}, ProcessState.WAITING)
        self.store.add_process(p5)
        self.store.enqueue_process(*p5.key)
        p6 = ProcessRecord.new(None, "proc6", {}, ProcessState.REQUESTED)
        self.store.add_process(p6)

        # not this one, due to RestartMode
        p7 = ProcessRecord.new(None, "proc7", {}, ProcessState.REQUESTED,
            configuration=nosystemrestart_process_config(),
            restart_mode=RestartMode.ALWAYS)
        self.store.add_process(p7)
        self.store.enqueue_process(*p7.key)

        # record the five assigned processes on the resource itself
        resource = self.store.get_resource(resource_id)
        resource.assigned = [p0.key, p1.key, p2.key, p3.key, p4.key]
        self.store.update_resource(resource)

        restartable_procs = ["proc1", "proc2", "proc5", "proc6"]
        dead_procs = ["proc0", "proc4", "proc7"]

        self._run_in_thread()

        assert self.store.wait_initialized(timeout=10)

        self.assertEqual(len(self.store.get_queued_processes()), 0)
        self.assertEqual(len(self.store.get_node_ids()), 0)
        self.assertEqual(len(self.store.get_resource_ids()), 0)

        for proc in restartable_procs:
            self.assertEqual(self.store.get_process(None, proc).state,
                             ProcessState.UNSCHEDULED_PENDING)
        for proc in dead_procs:
            self.assertEqual(self.store.get_process(None, proc).state,
                             ProcessState.TERMINATED)
        self.assertEqual(self.store.get_process(None, "proc3").state,
                         ProcessState.TERMINATED)

        self.assertEqual(self.store.get_pd_state(),
                         ProcessDispatcherState.SYSTEM_BOOTING)

        # now end system boot
        self.store.set_system_boot(False)

        wait(lambda: self.store.get_pd_state() == ProcessDispatcherState.OK)

        # check that pending processes were correctly rescheduled
        self.assertEqual(len(self.store.get_queued_processes()), len(restartable_procs))
        for proc in restartable_procs:
            self.assertEqual(self.store.get_process(None, proc).state,
                             ProcessState.REQUESTED)

    def test_uninitialized_system_boot_without_state(self):
        # Uninitialized store during system boot, with nothing in it: the PD
        # state should be SYSTEM_BOOTING until the boot flag is cleared, then OK.
        self.store.set_system_boot(True)
        self._run_in_thread()

        assert self.store.wait_initialized(timeout=10)
        self.assertEqual(self.store.get_pd_state(),
                         ProcessDispatcherState.SYSTEM_BOOTING)
        self.store.set_system_boot(False)
        wait(lambda: self.store.get_pd_state() == ProcessDispatcherState.OK)

    def test_uninitialized_not_system_boot_with_procs(self):
        # tests the case where doctor arrives to an uninitialized system
        # that is not doing a system boot. HOWEVER, there are procs in the
        # UNSCHEDULED_PENDING state. This would likely only happen if the
        # PD died during system boot and recovered after the system boot flag
        # was turned off. Very small window, but possible.

        p0 = ProcessRecord.new(None, "proc0", {}, ProcessState.UNSCHEDULED_PENDING)
        self.store.add_process(p0)
        p1 = ProcessRecord.new(None, "proc1", {}, ProcessState.UNSCHEDULED_PENDING)
        self.store.add_process(p1)
        p2 = ProcessRecord.new(None, "proc2", {}, ProcessState.UNSCHEDULED_PENDING)
        self.store.add_process(p2)

        restartable_procs = ["proc0", "proc1", "proc2"]
        self._run_in_thread()

        assert self.store.wait_initialized(timeout=10)
        self.assertEqual(self.store.get_pd_state(),
                         ProcessDispatcherState.OK)

        # check that pending processes were correctly rescheduled
        self.assertEqual(len(self.store.get_queued_processes()), len(restartable_procs))
        for proc in restartable_procs:
            self.assertEqual(self.store.get_process(None, proc).state,
                             ProcessState.REQUESTED)

    def test_uninitialized_not_system_boot_without_procs(self):
        # tests the case where doctor arrives to an uninitialized system
        # that is not doing a system boot and has no UNSCHEDULED_PENDING procs.
        # this is likely a recovery from all-PD-workers failing and resuming in
        # a running system, or Zookeeper issue
        self._run_in_thread()

        assert self.store.wait_initialized(timeout=10)
        self.assertEqual(self.store.get_pd_state(),
                         ProcessDispatcherState.OK)

    def test_initialized_system_boot_with_procs(self):
        # tests the case where just the doctor dies in the middle of system boot
        # but after a doctor has already declared the system initialized. In this
        # case we have processes in the UNSCHEDULED_PENDING state that should be
        # rescheduled once system boot ends.

        self.store.set_system_boot(True)
        self.store.set_initialized()
        self.store.set_pd_state(ProcessDispatcherState.SYSTEM_BOOTING)

        p0 = ProcessRecord.new(None, "proc0", {}, ProcessState.UNSCHEDULED_PENDING)
        self.store.add_process(p0)
        p1 = ProcessRecord.new(None, "proc1", {}, ProcessState.UNSCHEDULED_PENDING)
        self.store.add_process(p1)
        p2 = ProcessRecord.new(None, "proc2", {}, ProcessState.UNSCHEDULED_PENDING)
        self.store.add_process(p2)

        restartable_procs = ["proc0", "proc1", "proc2"]
        self._run_in_thread()

        # now end system boot
        self.store.set_system_boot(False)

        wait(lambda: self.store.get_pd_state() == ProcessDispatcherState.OK)

        # check that pending processes were correctly rescheduled
        self.assertEqual(len(self.store.get_queued_processes()), len(restartable_procs))
        for proc in restartable_procs:
            self.assertEqual(self.store.get_process(None, proc).state,
                             ProcessState.REQUESTED)

    def test_initialized_system_boot_without_procs(self):
        # tests the case where just the doctor dies in the middle of system boot
        # but after a doctor has already declared the system initialized. In this
        # case we have no processes to schedule on system boot completion.

        self.store.set_system_boot(True)
        self.store.set_initialized()
        self.store.set_pd_state(ProcessDispatcherState.SYSTEM_BOOTING)

        self._run_in_thread()

        # now end system boot
        self.store.set_system_boot(False)

        wait(lambda: self.store.get_pd_state() == ProcessDispatcherState.OK)

    def test_initialized_not_system_boot(self):
        # recover into an already initialized and booted system. this is likely
        # a recovery from a doctor failure while the rest of the system was still
        # alive.
        self.store.set_initialized()
        self.store.set_pd_state(ProcessDispatcherState.OK)

        self._run_in_thread()

        # we have nothing really to check here, yet. but at least we can make sure
        # the process is cancellable.

    def test_monitor_thread(self):
        # After initialization the doctor should expose a monitor and a live
        # monitor thread; cancelling the doctor should stop that thread.
        self._run_in_thread()

        assert self.store.wait_initialized(timeout=10)
        self.assertEqual(self.store.get_pd_state(),
                         ProcessDispatcherState.OK)

        self.assertIsNotNone(self.doctor.monitor)
        monitor_thread = self.doctor.monitor_thread
        self.assertIsNotNone(monitor_thread)
        self.assertTrue(monitor_thread.is_alive())

        # now cancel doctor. monitor should stop too
        self.doctor.cancel()
        wait(lambda: not monitor_thread.is_alive())

    def _setup_resource_monitor(self):
        """Create an ``ExecutionResourceMonitor``, keep it on ``self.monitor`` and return it."""
        self.monitor = ExecutionResourceMonitor(self.core, self.store)
        return self.monitor

    def _send_heartbeat(self, resource_id, node_id, timestamp):
        """Send the core a heartbeat for ``resource_id`` built by ``make_beat`` with ``timestamp``."""
        self.core.ee_heartbeat(resource_id, make_beat(node_id, timestamp=timestamp))

    def assert_monitor_cycle(self, expected_delay, resource_states=None):
        """Run one monitor cycle and check its return value and resource states.

        ``expected_delay`` is compared with what ``monitor_cycle()`` returns.
        ``resource_states``, when given, maps resource id to the state the
        store is expected to hold for it afterwards.
        """
        self.assertEqual(expected_delay, self.monitor.monitor_cycle())

        if resource_states:
            # NOTE(review): dict.iteritems() exists only on Python 2.
            for resource_id, expected_state in resource_states.iteritems():
                found_state = self.store.get_resource(resource_id).state
                if found_state != expected_state:
                    self.fail("Resource %s state = %s. Expected %s" % (resource_id,
                        found_state, expected_state))

    def test_resource_monitor(self):
        # Walks three resources through OK / WARNING / MISSING / DISABLED by
        # advancing a mocked clock and sending heartbeats with chosen
        # timestamps, checking the monitor's return value after each step.
        t0 = datetime(2012, 3, 13, 9, 30, 0, tzinfo=UTC)
        mock_now = Mock()
        mock_now.return_value = t0

        def increment_now(seconds):
            # advance the mocked clock and return the new time
            t = mock_now.return_value + timedelta(seconds=seconds)
            mock_now.return_value = t
            log.debug("THE TIME IS NOW: %s", t)
            return t

        monitor = self._setup_resource_monitor()
        # replace the monitor's clock with the mock
        monitor._now_func = mock_now

        # before there are any resources, monitor should work but return a None delay
        self.assertIsNone(monitor.monitor_cycle())

        self.core.node_state("node1", domain_id_from_engine("engine1"),
            InstanceState.RUNNING)

        # 3 resources. all report in at t0
        r1, r2, r3 = "eeagent_1", "eeagent_2", "eeagent_3"
        self._send_heartbeat(r1, "node1", t0)
        self._send_heartbeat(r2, "node1", t0)
        self._send_heartbeat(r3, "node1", t0)

        # expected state per resource; updated in place as the scenario advances
        states = {r1: ExecutionResourceState.OK, r2: ExecutionResourceState.OK,
                  r3: ExecutionResourceState.OK}

        self.assert_monitor_cycle(10, states)

        t1 = increment_now(5)  # :05
        # heartbeat comes in for r1 5 seconds later
        self._send_heartbeat(r1, "node1", t1)

        self.assert_monitor_cycle(5, states)

        increment_now(5)  # :10

        # no heartbeats for r2 and r3. they should be marked WARNING
        states[r2] = ExecutionResourceState.WARNING
        states[r3] = ExecutionResourceState.WARNING
        self.assert_monitor_cycle(5, states)

        increment_now(4)  # :14

        # r2 gets a heartbeat through, but its timestamp puts it still in the warning threshold
        self._send_heartbeat(r2, "node1", t0 + timedelta(seconds=1))

        self.assert_monitor_cycle(1, states)

        increment_now(6)  # :20

        # r1 should go warning, r3 should go missing
        states[r1] = ExecutionResourceState.WARNING
        states[r3] = ExecutionResourceState.MISSING
        self.assert_monitor_cycle(4, states)

        # a current heartbeat brings r1 back to OK
        t2 = increment_now(3)  # :23
        self._send_heartbeat(r1, "node1", t2)
        states[r1] = ExecutionResourceState.OK
        self.assert_monitor_cycle(1, states)

        # r3 recovers with a current heartbeat; r2 is now expected to be MISSING
        t3 = increment_now(2)  # :25
        self._send_heartbeat(r3, "node1", t3)
        states[r2] = ExecutionResourceState.MISSING
        states[r3] = ExecutionResourceState.OK
        self.assert_monitor_cycle(8, states)

        increment_now(5)  # :30
        # heartbeat r2 enough to go back to WARNING, but still late
        # NOTE(review): the next two statements send the same heartbeat twice
        # (_send_heartbeat wraps core.ee_heartbeat with make_beat); one of them
        # looks redundant — confirm before removing.
        self.core.ee_heartbeat(r2, make_beat("node1", timestamp=t0 + timedelta(seconds=15)))
        self._send_heartbeat(r2, "node1", t0 + timedelta(seconds=15))
        states[r2] = ExecutionResourceState.WARNING
        self.assert_monitor_cycle(3, states)

        t4 = increment_now(5)  # :35
        # disable r2 and heartbeat r1 and r3 (heartbeats arrive late, but that's ok)
        self._send_heartbeat(r1, "node1", t4)
        self._send_heartbeat(r3, "node1", t4)
        self.core.resource_change_state(self.store.get_resource(r2),
            ExecutionResourceState.DISABLED)

        states[r2] = ExecutionResourceState.DISABLED
        self.assert_monitor_cycle(10, states)
Ejemplo n.º 6
0
class ProcessDispatcherCoreTests(unittest.TestCase):

    engine_conf = {"engine1": {"slots": 4}, "engine2": {"slots": 4}, "engine3": {"slots": 2}, "engine4": {"slots": 2}}

    def setUp(self):
        """Build a fresh ProcessDispatcherCore and its collaborators.

        The store comes from get_store(), the registry from the class-level
        engine_conf, and the resource client and notifier are mocks.
        """
        self.store = self.get_store()
        self.registry = EngineRegistry.from_config(self.engine_conf)
        self.resource_client, self.notifier = Mock(), MockNotifier()

        collaborators = (self.store, self.registry, self.resource_client, self.notifier)
        self.core = ProcessDispatcherCore(*collaborators)

    def get_store(self):
        """Return a new ProcessDispatcherStore; setUp calls this to get the store."""
        store = ProcessDispatcherStore()
        return store

    def test_add_remove_node(self):
        # A RUNNING node gets recorded, TERMINATING removes it again, and a
        # later TERMINATED for the already-removed node must be tolerated.
        node_id = "node1"
        domain_id = domain_id_from_engine("engine1")

        self.core.node_state(node_id, domain_id, InstanceState.RUNNING)

        stored = self.store.get_node(node_id)
        self.assertIsNotNone(stored)
        self.assertEqual(stored.node_id, node_id)
        self.assertEqual(stored.domain_id, domain_id)

        self.core.node_state(node_id, domain_id, InstanceState.TERMINATING)
        self.assertIsNone(self.store.get_node(node_id))

        # this shouldn't cause any problems even though node is gone
        self.core.node_state(node_id, domain_id, InstanceState.TERMINATED)

    def test_add_remove_node_with_resource(self):
        # A heartbeat registers a resource for the node; once the node is
        # reported TERMINATED both the resource and the node must be gone.
        node_id, resource_id = "node1", "eeagent_1"
        domain_id = domain_id_from_engine("engine1")

        self.core.node_state(node_id, domain_id, InstanceState.RUNNING)
        self.core.ee_heartbeat(resource_id, make_beat(node_id))

        registered = self.store.get_resource(resource_id)
        self.assertIsNotNone(registered)
        self.assertEqual(registered.state, ExecutionResourceState.OK)

        # now send a terminated state for the node. resource should be removed.
        self.core.node_state(node_id, domain_id, InstanceState.TERMINATED)

        self.assertIsNone(self.store.get_resource(resource_id))
        self.assertIsNone(self.store.get_node(node_id))

    def test_add_remove_node_with_resource_and_processes(self):
        # Terminating a node whose resource still has processes assigned:
        # the resource and node are removed, the RUNNING and PENDING
        # processes are queued again, and the TERMINATING one is finished off.
        node_id, resource_id = "node1", "eeagent_1"
        domain_id = domain_id_from_engine("engine1")

        self.core.node_state(node_id, domain_id, InstanceState.RUNNING)
        self.core.ee_heartbeat(resource_id, make_beat(node_id))

        # set up a few processes on the resource
        initial_states = (
            ("proc1", ProcessState.RUNNING),
            ("proc2", ProcessState.PENDING),
            ("proc3", ProcessState.TERMINATING),
        )
        records = []
        for upid, state in initial_states:
            record = ProcessRecord.new(None, upid, {}, state, assigned=resource_id)
            self.store.add_process(record)
            records.append(record)

        resource = self.store.get_resource(resource_id)
        resource.assigned = [record.key for record in records]
        self.store.update_resource(resource)

        # now send a terminated state for the node. resource should be removed.
        self.core.node_state(node_id, domain_id, InstanceState.TERMINATED)

        self.assertIsNone(self.store.get_resource(resource_id))
        self.assertIsNone(self.store.get_node(node_id))

        queued = set(self.store.get_queued_processes())

        # these two should have been rescheduled
        for upid in ("proc1", "proc2"):
            rescheduled = self.store.get_process(None, upid)
            self.assertEqual(rescheduled.state, ProcessState.DIED_REQUESTED)
            self.assertEqual(rescheduled.round, 1)
            self.assertIn(rescheduled.key, queued)
            self.notifier.assert_process_state(upid, ProcessState.DIED_REQUESTED)

        # this one should be terminated
        finished = self.store.get_process(None, "proc3")
        self.assertEqual(finished.state, ProcessState.TERMINATED)
        self.assertEqual(finished.round, 0)
        self.assertNotIn(finished.key, queued)
        self.notifier.assert_process_state("proc3", ProcessState.TERMINATED)

    def test_terminate_not_found(self):
        # terminating a process the store has never seen raises NotFoundError
        self.assertRaises(NotFoundError,
            self.core.terminate_process, None, "notarealprocess")

    def test_terminate_terminal_process(self):
        # Processes which are already at rest shouldn't change when a
        # terminate request arrives, and the resource client must not be used.
        processes = [
            ProcessRecord.new(None, "proc1", {}, ProcessState.UNSCHEDULED),
            ProcessRecord.new(None, "proc2", {}, ProcessState.UNSCHEDULED_PENDING),
            ProcessRecord.new(None, "proc3", {}, ProcessState.TERMINATED),
            ProcessRecord.new(None, "proc4", {}, ProcessState.EXITED),
            ProcessRecord.new(None, "proc5", {}, ProcessState.FAILED),
            ProcessRecord.new(None, "proc6", {}, ProcessState.REJECTED),
        ]

        for process in processes:
            self.store.add_process(process)

        for process in processes:
            returned = self.core.terminate_process(None, process.upid)

            # both the returned record and the stored one keep their state
            self.assertEqual(returned.upid, process.upid)
            self.assertEqual(returned.state, process.state)

            stored = self.store.get_process(None, process.upid)
            self.assertEqual(stored.state, process.state)

        # Mock.call_count only counts calls of the mock object itself, so it
        # stays 0 no matter which methods get invoked. method_calls records
        # calls made to the mock's methods, which is what must be empty here.
        self.assertEqual(self.resource_client.method_calls, [])

    def test_terminate_unassigned_process(self):
        # A queued process with no resource assigned is marked TERMINATED
        # directly: it leaves the queue and the resource client is not used.
        upid = "proc1"
        waiting = ProcessRecord.new(None, upid, {}, ProcessState.WAITING)
        self.store.add_process(waiting)
        self.store.enqueue_process(*waiting.key)

        returned = self.core.terminate_process(None, upid)

        self.assertEqual(returned.upid, upid)
        self.assertEqual(returned.state, ProcessState.TERMINATED)

        stored = self.store.get_process(None, upid)
        self.assertEqual(stored.state, ProcessState.TERMINATED)
        self.notifier.assert_process_state(upid, ProcessState.TERMINATED)

        # should be gone from queue too
        self.assertFalse(self.store.get_queued_processes())

        # Mock.call_count only counts calls of the mock object itself, so it
        # stays 0 no matter which methods get invoked. method_calls records
        # calls made to the mock's methods, which is what must be empty here.
        self.assertEqual(self.resource_client.method_calls, [])

    def test_terminate_raciness(self):
        # The stored process must already read TERMINATING by the time the
        # resource client is asked to terminate it.
        upid, resource_id = "proc1", "hats"

        running = ProcessRecord.new(None, upid, {}, ProcessState.RUNNING)
        running.assigned = resource_id
        self.store.add_process(running)

        def check_state_during_call(called_resource_id, called_upid, called_round):
            # runs as the mock's side effect, i.e. while the core is in the
            # middle of the terminate request
            self.assertEqual(called_resource_id, "hats")
            self.assertEqual(called_upid, "proc1")
            stored = self.store.get_process(None, called_upid)
            self.assertEqual(stored.state, ProcessState.TERMINATING)

        self.resource_client.terminate_process.side_effect = check_state_during_call

        self.core.terminate_process(None, upid)

        self.resource_client.terminate_process.assert_called_once_with(resource_id, upid, 0)
        self.notifier.assert_process_state(upid, ProcessState.TERMINATING)

    def test_terminate_assigned(self):
        # An ASSIGNED process is terminated through the resource client and
        # a TERMINATING notification is sent.
        upid, resource_id = "proc1", "hats"

        assigned = ProcessRecord.new(None, upid, {}, ProcessState.ASSIGNED)
        assigned.assigned = resource_id
        self.store.add_process(assigned)

        self.core.terminate_process(None, upid)

        self.resource_client.terminate_process.assert_called_once_with(resource_id, upid, 0)
        self.notifier.assert_process_state(upid, ProcessState.TERMINATING)

    def test_terminate_retry(self):
        # Killing a process that is already TERMINATING sends the request to
        # the resource client again but produces no new notification.
        upid, resource_id = "proc1", "hats"

        terminating = ProcessRecord.new(None, upid, {}, ProcessState.TERMINATING)
        terminating.assigned = resource_id
        self.store.add_process(terminating)

        self.core.terminate_process(None, upid)

        self.resource_client.terminate_process.assert_called_once_with(resource_id, upid, 0)
        self.notifier.assert_no_process_state()

    def test_process_subscribers(self):
        # Subscribers passed at schedule time must be kept on the stored record.
        upid, definition_id = "proc1", "def1"
        expected = [("destination", "operation")]

        self.core.create_definition(definition_id, None, None)
        self.core.create_process(None, upid, definition_id)
        self.core.schedule_process(None, upid, subscribers=expected)

        stored = self.store.get_process(None, upid)

        self.assertEqual(len(stored.subscribers), len(expected))
        # compare element-wise by index so the stored pair does not have to
        # be the same sequence type as the tuple that was passed in
        for actual, wanted in zip(stored.subscribers, expected):
            self.assertEqual(actual[0], wanted[0])
            self.assertEqual(actual[1], wanted[1])

    def test_schedule_notfound(self):
        # scheduling a process that was never created raises NotFoundError
        unknown_upid = "proc1"
        self.assertRaises(NotFoundError,
            self.core.schedule_process, None, unknown_upid)

    def test_schedule_new_process(self):
        # Scheduling with a definition id, without a prior create_process
        # call, yields a process in the REQUESTED state.
        upid, definition_id = "proc1", "def1"
        self.core.create_definition(definition_id, None, None)

        scheduled = self.core.schedule_process(None, upid, definition_id)

        self.assertEqual(scheduled.state, ProcessState.REQUESTED)
        self.assertEqual(scheduled.upid, upid)

    def test_create_idempotency(self):
        # create_process may be repeated with identical arguments, but a
        # repeat with a different definition or name is rejected.
        upid = "proc1"
        definition_id, other_definition_id = "def1", "def2"
        for each_definition in (definition_id, other_definition_id):
            self.core.create_definition(each_definition, None, None)

        # the first call creates the process; calling again is fine
        for _ in range(2):
            created = self.core.create_process(None, upid, definition_id)
            self.assertEqual(created.state, ProcessState.UNSCHEDULED)
            self.assertEqual(created.upid, upid)

        # with a different definition is not fine
        self.assertRaises(BadRequestError,
            self.core.create_process, None, upid, other_definition_id)

        # nor with a different name
        self.assertRaises(BadRequestError,
            self.core.create_process, None, upid, definition_id, name="hats")

    def test_schedule_idempotency(self):
        # schedule_process may be repeated with identical arguments, but a
        # repeat that changes the restart or queueing mode is rejected.
        upid, definition_id = "proc1", "def1"
        self.core.create_definition(definition_id, None, None)

        created = self.core.create_process(None, upid, definition_id)
        self.assertEqual(created.state, ProcessState.UNSCHEDULED)
        self.assertEqual(created.upid, upid)

        # the first call schedules the process; calling again is fine
        for _ in range(2):
            scheduled = self.core.schedule_process(None, upid)
            self.assertEqual(scheduled.state, ProcessState.REQUESTED)
            self.assertEqual(scheduled.upid, upid)

        # with a different parameter is not fine
        conflicting_options = (
            {"restart_mode": RestartMode.ALWAYS},
            {"queueing_mode": QueueingMode.START_ONLY},
        )
        for options in conflicting_options:
            with self.assertRaises(BadRequestError):
                self.core.schedule_process(None, upid, **options)

    def test_schedule_idempotency_procname(self):
        # Special case for schedule idempotency: the process name may differ
        # between create and schedule, and between repeated schedule calls.
        upid, definition_id = "proc1", "def1"
        self.core.create_definition(definition_id, None, None)

        created = self.core.create_process(None, upid, definition_id, name="name1")
        self.assertEqual(created.state, ProcessState.UNSCHEDULED)
        self.assertEqual(created.upid, upid)

        # "name2" differs from the created name, "name3" from the scheduled
        # one; both calls are accepted
        for new_name in ("name2", "name3"):
            scheduled = self.core.schedule_process(None, upid, name=new_name)
            self.assertEqual(scheduled.state, ProcessState.REQUESTED)
            self.assertEqual(scheduled.upid, upid)

    def test_process_should_restart(self):
        # Check the restart decision for every restart mode, with and without
        # the omit-from-system-restart configuration, across the exit states.
        definition = "def1"
        self.core.create_definition(definition, None, None)

        abnormal_states = (ProcessState.TERMINATED, ProcessState.TERMINATING, ProcessState.FAILED)
        all_states = abnormal_states + (ProcessState.EXITED,)

        should_restart = self.core.process_should_restart

        def schedule(**options):
            # every scenario gets its own process under a fresh random upid
            return self.core.schedule_process(None, uuid.uuid4().hex, definition, **options)

        # default behavior is to restart processes that exit abnormally;
        # the same holds with explicit RestartMode.ABNORMAL specified
        for options in ({}, {"restart_mode": RestartMode.ABNORMAL}):
            process = schedule(**options)
            for state in abnormal_states:
                self.assertTrue(should_restart(process, state))
                # system restart mode doesn't matter
                self.assertTrue(should_restart(process, state, is_system_restart=True))
            self.assertFalse(should_restart(process, ProcessState.EXITED))
            self.assertFalse(should_restart(process, ProcessState.EXITED, is_system_restart=True))

        # RestartMode.NEVER
        process = schedule(restart_mode=RestartMode.NEVER)
        for state in all_states:
            self.assertFalse(should_restart(process, state))
            self.assertFalse(should_restart(process, state, is_system_restart=True))

        # RestartMode.ALWAYS
        process = schedule(restart_mode=RestartMode.ALWAYS)
        for state in all_states:
            self.assertTrue(should_restart(process, state))
            self.assertTrue(should_restart(process, state, is_system_restart=True))

        # RestartMode.ALWAYS with process.omit_from_system_restart
        process = schedule(restart_mode=RestartMode.ALWAYS,
            configuration=nosystemrestart_process_config())
        for state in all_states:
            self.assertTrue(should_restart(process, state))
            self.assertFalse(should_restart(process, state, is_system_restart=True))

        # RestartMode.ABNORMAL with process.omit_from_system_restart
        process = schedule(restart_mode=RestartMode.ABNORMAL,
            configuration=nosystemrestart_process_config())
        for state in abnormal_states:
            self.assertTrue(should_restart(process, state))
            self.assertFalse(should_restart(process, state, is_system_restart=True))
        self.assertFalse(should_restart(process, ProcessState.EXITED))
        self.assertFalse(should_restart(process, ProcessState.EXITED, is_system_restart=True))

        # ensure that a process with a busted config doesn't raise an error
        process = schedule(restart_mode=RestartMode.ALWAYS,
            configuration={"process": ["what is a list doing here??"]})
        for state in all_states:
            self.assertTrue(should_restart(process, state))

    def test_heartbeat_node_update_race(self):
        # Process two beats at once for eeagents on the same node and check
        # that neither resource is lost when they both update the node record.
        node_id = uuid.uuid4().hex
        self.core.node_state(node_id, domain_id_from_engine("engine1"), InstanceState.RUNNING)

        first_beat = make_beat(node_id)

        # injected while first_beat is still being processed
        racing_beat = make_beat(node_id)

        real_update_node = self.store.update_node

        def update_node_with_race(node):
            # restore the real method first so the nested heartbeat below
            # does not re-enter this wrapper
            self.store.update_node = real_update_node

            # land the competing heartbeat ahead of the pending node update,
            # so that the pending update conflicts
            self.core.ee_heartbeat("eeagent2", racing_beat)
            real_update_node(node)

        self.store.update_node = update_node_with_race

        self.core.ee_heartbeat("eeagent1", first_beat)

        stored_node = self.store.get_node(node_id)
        self.assertEqual({"eeagent1", "eeagent2"}, set(stored_node.resources))

    def test_heartbeat_node_removed(self):
        # Process a heartbeat while the node is removed partway through: the
        # core must cope and must not leave a resource behind.
        node_id = uuid.uuid4().hex
        self.core.node_state(node_id, domain_id_from_engine("engine1"), InstanceState.RUNNING)

        heartbeat = make_beat(node_id)

        real_update_node = self.store.update_node

        def update_node_after_removal(node):
            # restore the real method first so this wrapper fires only once
            self.store.update_node = real_update_node
            # drop the node just before the pending update goes through
            self.store.remove_node(node.node_id)
            real_update_node(node)

        self.store.update_node = update_node_after_removal

        # this shouldn't blow up, and no resource should be added
        self.core.ee_heartbeat("eeagent1", heartbeat)
        self.assertIsNone(self.store.get_resource("eeagent1"))

    def test_heartbeat_timestamps(self):
        # A resource's last_heartbeat_datetime follows the newest beat
        # timestamp seen; a beat arriving out of order must not move it back.
        node_id = uuid.uuid4().hex
        self.core.node_state(node_id, domain_id_from_engine("engine1"), InstanceState.RUNNING)

        earliest = parse_datetime("2013-04-02T19:37:57.617734+00:00")
        middle = parse_datetime("2013-04-02T19:38:57.617734+00:00")
        latest = parse_datetime("2013-04-02T19:39:57.617734+00:00")

        # (timestamp sent, timestamp expected on the resource afterwards);
        # the last beat is older than the one before it, so the time
        # shouldn't be updated
        beats = (
            (earliest, earliest),
            (latest, latest),
            (middle, latest),
        )
        for sent, expected in beats:
            self.core.ee_heartbeat("eeagent1", make_beat(node_id, timestamp=sent.isoformat()))
            resource = self.store.get_resource("eeagent1")
            self.assertEqual(resource.last_heartbeat_datetime, expected)

    def test_get_process_constraints(self):
        """test_get_process_constraints

        ensure that order of precedence of engine ids is correct. Should be:

        1. process target - when a process is scheduled, an execution_engine_id
        can be specified in the request's ProcessTarget object. If specified,
        this EE is used.
        2. process/engine mappings - the CEI Launch YML file contains a
        process_engines mapping of process packages to EE names. If the process'
        module matches an entry in this configuration, the associated EE is
        chosen. This format is described below.
        3. default execution engine - the CEI Launch YML file also must specify
        a default_execution_engine value. This is used as a last resort.
        """

        self.registry.set_process_engine_mapping("my", "engine4")
        self.registry.default = "engine1"

        process_definition = {"executable": {"module": "my.test", "class": "MyClass"}}
        process_constraints = {"engine": "mostimportantengine"}

        p1 = ProcessRecord.new(None, "proc1", {}, ProcessState.PENDING)
        constraints = self.core.get_process_constraints(p1)
        self.assertEqual(constraints["engine"], self.registry.default)

        p3 = ProcessRecord.new(None, "proc3", process_definition, ProcessState.PENDING, constraints=process_constraints)
        constraints = self.core.get_process_constraints(p3)
        self.assertEqual(constraints["engine"], "mostimportantengine")