Example #1
0
class ProcessDispatcherService(object):
    """Process Dispatcher (PD) service interface.

    Wires together the PD collaborators (store, engine registry, EEAgent
    client, EPUM client, matchmaker, and doctor) and exposes the dashi RPC
    operations that form the PD's public API. Most RPC handlers are thin
    delegations to ProcessDispatcherCore.
    """

    def __init__(self, amqp_uri=None, topic="process_dispatcher", registry=None,
                 store=None, epum_client=None, notifier=None, definition_id=None,
                 domain_config=None, sysname=None):
        """Construct the service and all of its collaborators.

        Values default to the "processdispatcher" section of the service
        configuration; explicit arguments (used by tests and by epuharness)
        take precedence over configured values.
        """

        configs = ["service", "processdispatcher"]
        config_files = get_config_paths(configs)
        self.CFG = bootstrap.configure(config_files)
        self.topic = self.CFG.processdispatcher.get('service_name', topic)

        self.dashi = bootstrap.dashi_connect(self.topic, self.CFG,
                                             amqp_uri=amqp_uri, sysname=sysname)

        engine_conf = self.CFG.processdispatcher.get('engines', {})
        default_engine = self.CFG.processdispatcher.get('default_engine')
        process_engines = self.CFG.processdispatcher.get('process_engines')
        # If no default is configured and exactly one engine exists, use it.
        # Fix: dict.keys()[0] raises TypeError on Python 3 (dict views are
        # not indexable); next(iter(...)) works on both Python 2 and 3.
        if default_engine is None and len(engine_conf) == 1:
            default_engine = next(iter(engine_conf))
        self.store = store or get_processdispatcher_store(self.CFG)
        self.store.initialize()
        self.registry = registry or EngineRegistry.from_config(engine_conf,
            default=default_engine, process_engines=process_engines)
        self.eeagent_client = EEAgentClient(self.dashi)

        domain_definition_id = None
        base_domain_config = None
        # allow disabling communication with EPUM for epuharness case
        if epum_client:
            self.epum_client = epum_client
            domain_definition_id = definition_id
            base_domain_config = domain_config
        elif not self.CFG.processdispatcher.get('static_resources'):
            domain_definition_id = definition_id or self.CFG.processdispatcher.get('definition_id')
            base_domain_config = domain_config or self.CFG.processdispatcher.get('domain_config')
            epum_service_name = self.CFG.processdispatcher.get('epum_service_name',
                    'epu_management_service')
            self.epum_client = EPUManagementClient(self.dashi, epum_service_name)

        else:
            # static_resources configured: no EPUM interaction at all
            self.epum_client = None

        if notifier:
            self.notifier = notifier
        else:
            self.notifier = SubscriberNotifier(self.dashi)

        self.core = ProcessDispatcherCore(self.store,
                                          self.registry,
                                          self.eeagent_client,
                                          self.notifier)

        launch_type = self.CFG.processdispatcher.get('launch_type', 'supd')
        restart_throttling_config = self.CFG.processdispatcher.get('restart_throttling_config', {})
        dispatch_retry_seconds = self.CFG.processdispatcher.get('dispatch_retry_seconds')

        self.matchmaker = PDMatchmaker(self.core, self.store, self.eeagent_client,
            self.registry, self.epum_client, self.notifier, self.topic,
            domain_definition_id, base_domain_config, launch_type,
            restart_throttling_config, dispatch_retry_seconds)

        self.doctor = PDDoctor(self.core, self.store, config=self.CFG)
        # set once RPC handlers are registered and the matchmaker election
        # has started; cleared again by stop()
        self.ready_event = threading.Event()

    def start(self):
        """Run the service: start elections, register handlers, and consume.

        Blocks in dashi.consume() until interrupted or cancelled.
        """

        # start the doctor before we do anything else
        log.debug("Starting doctor election")
        self.doctor.start_election()

        log.debug("Waiting for Doctor to initialize the Process Dispatcher")
        # wait for the store to be initialized before proceeding. The doctor
        # (maybe not OUR doctor, but whoever gets elected), will check the
        # state of the system and then mark it as initialized.
        self.store.wait_initialized()

        epu.dashiproc.link_dashi_exceptions(self.dashi)

        # register every public RPC operation with dashi
        self.dashi.handle(self.set_system_boot)
        self.dashi.handle(self.create_definition)
        self.dashi.handle(self.describe_definition)
        self.dashi.handle(self.update_definition)
        self.dashi.handle(self.remove_definition)
        self.dashi.handle(self.list_definitions)
        self.dashi.handle(self.create_process)
        self.dashi.handle(self.schedule_process)
        self.dashi.handle(self.describe_process)
        self.dashi.handle(self.describe_processes)
        self.dashi.handle(self.restart_process)
        self.dashi.handle(self.terminate_process)
        self.dashi.handle(self.node_state)
        # heartbeat handler needs the sender name to identify the eeagent
        self.dashi.handle(self.heartbeat, sender_kwarg='sender')
        self.dashi.handle(self.dump)

        self.matchmaker.start_election()

        self.ready_event.set()

        try:
            self.dashi.consume()
        except KeyboardInterrupt:
            log.warning("Caught terminate signal. Bye!")
        else:
            log.info("Exiting normally. Bye!")

    def stop(self):
        """Stop consuming, disconnect messaging, and shut down the store."""
        self.ready_event.clear()
        self.dashi.cancel()
        self.dashi.disconnect()
        self.store.shutdown()

    def _make_process_dict(self, proc):
        """Serialize a process record to the dict shape returned over RPC."""
        return dict(upid=proc.upid, state=proc.state, round=proc.round,
                    assigned=proc.assigned)

    def set_system_boot(self, system_boot):
        """Set or clear the system-boot flag on the core."""
        self.core.set_system_boot(system_boot)

    def create_definition(self, definition_id, definition_type, executable,
                          name=None, description=None):
        """Create a new process definition."""
        self.core.create_definition(definition_id, definition_type, executable,
            name=name, description=description)

    def describe_definition(self, definition_id):
        """Return the definition record for definition_id."""
        return self.core.describe_definition(definition_id)

    def update_definition(self, definition_id, definition_type, executable,
                          name=None, description=None):
        """Replace an existing process definition."""
        self.core.update_definition(definition_id, definition_type, executable,
            name=name, description=description)

    def remove_definition(self, definition_id):
        """Delete a process definition."""
        self.core.remove_definition(definition_id)

    def list_definitions(self):
        """Return all known process definitions."""
        return self.core.list_definitions()

    def create_process(self, upid, definition_id, name=None):
        """Create (but do not schedule) a process; returns a process dict."""
        result = self.core.create_process(None, upid, definition_id, name=name)
        return self._make_process_dict(result)

    def schedule_process(self, upid, definition_id=None, configuration=None,
                         subscribers=None, constraints=None,
                         queueing_mode=None, restart_mode=None,
                         execution_engine_id=None, node_exclusive=None,
                         name=None):
        """Schedule a process for execution; returns a process dict."""

        result = self.core.schedule_process(None, upid=upid,
            definition_id=definition_id, configuration=configuration,
            subscribers=subscribers, constraints=constraints,
            queueing_mode=queueing_mode, restart_mode=restart_mode,
            node_exclusive=node_exclusive,
            execution_engine_id=execution_engine_id, name=name)
        return self._make_process_dict(result)

    def describe_process(self, upid):
        """Return the full record for one process."""
        return self.core.describe_process(None, upid)

    def describe_processes(self):
        """Return records for all processes."""
        return self.core.describe_processes()

    def restart_process(self, upid):
        """Request a restart of a process; returns a process dict."""
        result = self.core.restart_process(None, upid)
        return self._make_process_dict(result)

    def terminate_process(self, upid):
        """Request termination of a process; returns a process dict."""
        result = self.core.terminate_process(None, upid)
        return self._make_process_dict(result)

    def node_state(self, node_id, domain_id, state, properties=None):
        """Forward a node state change (from EPUM) to the core."""
        self.core.node_state(node_id, domain_id, state, properties=properties)

    def heartbeat(self, sender, message):
        """Handle a heartbeat message from an eeagent (sender)."""
        log.debug("got heartbeat from %s: %s", sender, message)
        self.core.ee_heartbeat(sender, message)

    def dump(self):
        """Return a debug dump of PD state."""
        return self.core.dump()
Example #2
0
class ProcessDispatcherCoreTests(unittest.TestCase):
    """Unit tests for ProcessDispatcherCore.

    Exercises the core against an in-memory ProcessDispatcherStore, with a
    Mock resource (eeagent) client and a MockNotifier, so no messaging
    infrastructure is needed. Several tests simulate races by
    monkey-patching store methods mid-operation.
    """

    # four engines with differing slot counts; tests address them through
    # domain_id_from_engine()
    engine_conf = {"engine1": {"slots": 4}, "engine2": {"slots": 4}, "engine3": {"slots": 2}, "engine4": {"slots": 2}}

    def setUp(self):
        # fresh store/registry/core per test; get_store() is overridable by
        # subclasses to run the same suite against a different store backend
        self.store = self.get_store()
        self.registry = EngineRegistry.from_config(self.engine_conf)
        self.resource_client = Mock()
        self.notifier = MockNotifier()
        self.core = ProcessDispatcherCore(self.store, self.registry, self.resource_client, self.notifier)

    def get_store(self):
        """Return the store under test; subclasses may override."""
        return ProcessDispatcherStore()

    def test_add_remove_node(self):
        """Node records are created on RUNNING and removed on TERMINATING."""
        self.core.node_state("node1", domain_id_from_engine("engine1"), InstanceState.RUNNING)

        node = self.store.get_node("node1")
        self.assertTrue(node is not None)
        self.assertEqual(node.node_id, "node1")
        self.assertEqual(node.domain_id, domain_id_from_engine("engine1"))

        self.core.node_state("node1", domain_id_from_engine("engine1"), InstanceState.TERMINATING)
        node = self.store.get_node("node1")
        self.assertTrue(node is None)

        # this shouldn't cause any problems even though node is gone
        self.core.node_state("node1", domain_id_from_engine("engine1"), InstanceState.TERMINATED)

    def test_add_remove_node_with_resource(self):
        """A heartbeat registers a resource; node termination removes it."""
        self.core.node_state("node1", domain_id_from_engine("engine1"), InstanceState.RUNNING)
        resource_id = "eeagent_1"
        self.core.ee_heartbeat(resource_id, make_beat("node1"))

        resource = self.store.get_resource(resource_id)
        self.assertIsNotNone(resource)
        self.assertEqual(resource.state, ExecutionResourceState.OK)

        # now send a terminated state for the node. resource should be removed.
        self.core.node_state("node1", domain_id_from_engine("engine1"), InstanceState.TERMINATED)

        self.assertTrue(self.store.get_resource(resource_id) is None)
        self.assertTrue(self.store.get_node("node1") is None)

    def test_add_remove_node_with_resource_and_processes(self):
        """Node death reschedules live processes and finishes terminating ones."""
        self.core.node_state("node1", domain_id_from_engine("engine1"), InstanceState.RUNNING)
        resource_id = "eeagent_1"
        self.core.ee_heartbeat(resource_id, make_beat("node1"))

        # set up a few of processes on the resource
        p1 = ProcessRecord.new(None, "proc1", {}, ProcessState.RUNNING, assigned=resource_id)
        self.store.add_process(p1)
        p2 = ProcessRecord.new(None, "proc2", {}, ProcessState.PENDING, assigned=resource_id)
        self.store.add_process(p2)
        p3 = ProcessRecord.new(None, "proc3", {}, ProcessState.TERMINATING, assigned=resource_id)
        self.store.add_process(p3)

        resource = self.store.get_resource(resource_id)
        resource.assigned = [p1.key, p2.key, p3.key]
        self.store.update_resource(resource)

        # now send a terminated state for the node. resource should be removed.
        self.core.node_state("node1", domain_id_from_engine("engine1"), InstanceState.TERMINATED)

        self.assertTrue(self.store.get_resource(resource_id) is None)
        self.assertTrue(self.store.get_node("node1") is None)

        queued_processes = set(self.store.get_queued_processes())

        # these two should have been rescheduled
        for procname in ("proc1", "proc2"):
            proc = self.store.get_process(None, procname)
            self.assertEqual(proc.state, ProcessState.DIED_REQUESTED)
            self.assertEqual(proc.round, 1)
            self.assertIn(proc.key, queued_processes)
            self.notifier.assert_process_state(procname, ProcessState.DIED_REQUESTED)

        # this one should be terminated
        proc3 = self.store.get_process(None, "proc3")
        self.assertEqual(proc3.state, ProcessState.TERMINATED)
        self.assertEqual(proc3.round, 0)
        self.assertNotIn(proc3.key, queued_processes)
        self.notifier.assert_process_state("proc3", ProcessState.TERMINATED)

    def test_terminate_not_found(self):
        # process which doesn't exist

        with self.assertRaises(NotFoundError):
            self.core.terminate_process(None, "notarealprocess")

    def test_terminate_terminal_process(self):
        # processes which are already in a terminal state shouldn't change
        p1 = ProcessRecord.new(None, "proc1", {}, ProcessState.UNSCHEDULED)
        p2 = ProcessRecord.new(None, "proc2", {}, ProcessState.UNSCHEDULED_PENDING)
        p3 = ProcessRecord.new(None, "proc3", {}, ProcessState.TERMINATED)
        p4 = ProcessRecord.new(None, "proc4", {}, ProcessState.EXITED)
        p5 = ProcessRecord.new(None, "proc5", {}, ProcessState.FAILED)
        p6 = ProcessRecord.new(None, "proc6", {}, ProcessState.REJECTED)

        for p in (p1, p2, p3, p4, p5, p6):
            self.store.add_process(p)

        for p in (p1, p2, p3, p4, p5, p6):
            gotproc = self.core.terminate_process(None, p.upid)

            self.assertEqual(gotproc.upid, p.upid)
            self.assertEqual(gotproc.state, p.state)

            # NOTE(review): "p1" here shadows the outer p1 record; it is a
            # re-read of the current process p, not proc1 specifically
            p1 = self.store.get_process(None, p.upid)
            self.assertEqual(p1.state, p.state)
        # NOTE(review): Mock.call_count counts calls to the mock object
        # itself, not to attributes like terminate_process — this assertion
        # may be vacuous; confirm intent (likely meant
        # self.resource_client.terminate_process.call_count)
        self.assertEqual(self.resource_client.call_count, 0)

    def test_terminate_unassigned_process(self):
        """Terminating a queued-but-unassigned process skips the eeagent."""
        p1 = ProcessRecord.new(None, "proc1", {}, ProcessState.WAITING)
        self.store.add_process(p1)
        self.store.enqueue_process(*p1.key)

        gotproc = self.core.terminate_process(None, "proc1")

        self.assertEqual(gotproc.upid, "proc1")
        self.assertEqual(gotproc.state, ProcessState.TERMINATED)

        p1 = self.store.get_process(None, "proc1")
        self.assertEqual(p1.state, ProcessState.TERMINATED)
        self.notifier.assert_process_state("proc1", ProcessState.TERMINATED)

        # should be gone from queue too
        self.assertFalse(self.store.get_queued_processes())
        # NOTE(review): same Mock.call_count caveat as in
        # test_terminate_terminal_process — confirm this checks what's intended
        self.assertEqual(self.resource_client.call_count, 0)

    def test_terminate_raciness(self):
        # ensure process is TERMINATING before resource client is called

        p1 = ProcessRecord.new(None, "proc1", {}, ProcessState.RUNNING)
        p1.assigned = "hats"
        self.store.add_process(p1)

        # side effect runs at the moment the core calls the resource client,
        # letting us observe the stored process state mid-terminate
        def assert_process_terminating(resource_id, upid, round):
            self.assertEqual(resource_id, "hats")
            self.assertEqual(upid, "proc1")
            process = self.store.get_process(None, upid)
            self.assertEqual(process.state, ProcessState.TERMINATING)

        self.resource_client.terminate_process.side_effect = assert_process_terminating

        self.core.terminate_process(None, "proc1")

        self.resource_client.terminate_process.assert_called_once_with("hats", "proc1", 0)
        self.notifier.assert_process_state("proc1", ProcessState.TERMINATING)

    def test_terminate_assigned(self):
        """Terminating an ASSIGNED process calls the eeagent and notifies."""
        p1 = ProcessRecord.new(None, "proc1", {}, ProcessState.ASSIGNED)
        p1.assigned = "hats"
        self.store.add_process(p1)
        self.core.terminate_process(None, "proc1")

        self.resource_client.terminate_process.assert_called_once_with("hats", "proc1", 0)
        self.notifier.assert_process_state("proc1", ProcessState.TERMINATING)

    def test_terminate_retry(self):
        # try to kill a process that is already terminating
        p1 = ProcessRecord.new(None, "proc1", {}, ProcessState.TERMINATING)
        p1.assigned = "hats"
        self.store.add_process(p1)
        self.core.terminate_process(None, "proc1")

        # eeagent is re-asked to terminate, but no duplicate state notification
        self.resource_client.terminate_process.assert_called_once_with("hats", "proc1", 0)
        self.notifier.assert_no_process_state()

    def test_process_subscribers(self):
        """Subscribers passed at schedule time are stored on the record."""
        proc = "proc1"
        definition = "def1"
        subscribers = [("destination", "operation")]
        self.core.create_definition(definition, None, None)
        self.core.create_process(None, proc, definition)
        self.core.schedule_process(None, proc, subscribers=subscribers)

        record = self.store.get_process(None, proc)

        self.assertEqual(len(record.subscribers), len(subscribers))
        for a, b in zip(record.subscribers, subscribers):
            self.assertEqual(a[0], b[0])
            self.assertEqual(a[1], b[1])

    def test_schedule_notfound(self):

        # scheduling an unknown process
        proc = "proc1"
        with self.assertRaises(NotFoundError):
            self.core.schedule_process(None, proc)

    def test_schedule_new_process(self):
        """schedule_process with a definition creates a REQUESTED process."""
        proc = "proc1"
        definition = "def1"
        self.core.create_definition(definition, None, None)

        process = self.core.schedule_process(None, proc, definition)
        self.assertEqual(process.state, ProcessState.REQUESTED)
        self.assertEqual(process.upid, proc)

    def test_create_idempotency(self):
        """Repeated create_process is OK only with identical arguments."""
        proc = "proc1"
        definition = "def1"
        another_definition = "def2"
        self.core.create_definition(definition, None, None)
        self.core.create_definition(another_definition, None, None)

        process = self.core.create_process(None, proc, definition)
        self.assertEqual(process.state, ProcessState.UNSCHEDULED)
        self.assertEqual(process.upid, proc)

        # calling again is fine
        process = self.core.create_process(None, proc, definition)
        self.assertEqual(process.state, ProcessState.UNSCHEDULED)
        self.assertEqual(process.upid, proc)

        # with a different definition is not fine
        with self.assertRaises(BadRequestError):
            self.core.create_process(None, proc, another_definition)

        # nor with a different name
        with self.assertRaises(BadRequestError):
            self.core.create_process(None, proc, definition, name="hats")

    def test_schedule_idempotency(self):
        """Repeated schedule_process is OK only with identical parameters."""
        proc = "proc1"
        definition = "def1"

        self.core.create_definition(definition, None, None)

        process = self.core.create_process(None, proc, definition)
        self.assertEqual(process.state, ProcessState.UNSCHEDULED)
        self.assertEqual(process.upid, proc)

        process = self.core.schedule_process(None, proc)
        self.assertEqual(process.state, ProcessState.REQUESTED)
        self.assertEqual(process.upid, proc)

        # calling again is fine
        process = self.core.schedule_process(None, proc)
        self.assertEqual(process.state, ProcessState.REQUESTED)
        self.assertEqual(process.upid, proc)

        # with a different parameter is not fine
        with self.assertRaises(BadRequestError):
            self.core.schedule_process(None, proc, restart_mode=RestartMode.ALWAYS)

        with self.assertRaises(BadRequestError):
            self.core.schedule_process(None, proc, queueing_mode=QueueingMode.START_ONLY)

    def test_schedule_idempotency_procname(self):
        """Unlike other parameters, a changed process name is accepted."""
        proc = "proc1"
        definition = "def1"

        self.core.create_definition(definition, None, None)

        # special case: changing process name is ok
        process = self.core.create_process(None, proc, definition, name="name1")
        self.assertEqual(process.state, ProcessState.UNSCHEDULED)
        self.assertEqual(process.upid, proc)

        process = self.core.schedule_process(None, proc, name="name2")
        self.assertEqual(process.state, ProcessState.REQUESTED)
        self.assertEqual(process.upid, proc)

        # special case: different process name is ok
        process = self.core.schedule_process(None, proc, name="name3")
        self.assertEqual(process.state, ProcessState.REQUESTED)
        self.assertEqual(process.upid, proc)

    def test_process_should_restart(self):
        """Verify restart decisions across RestartMode values and end states."""
        definition = "def1"

        self.core.create_definition(definition, None, None)

        abnormal_states = (ProcessState.TERMINATED, ProcessState.TERMINATING, ProcessState.FAILED)
        all_states = (ProcessState.TERMINATED, ProcessState.TERMINATING, ProcessState.FAILED, ProcessState.EXITED)
        # default behavior is to restart processes that exit abnormally
        process = self.core.schedule_process(None, uuid.uuid4().hex, definition)
        for state in abnormal_states:
            self.assertTrue(self.core.process_should_restart(process, state))
            # system restart mode doesn't matter
            self.assertTrue(self.core.process_should_restart(process, state, is_system_restart=True))
        self.assertFalse(self.core.process_should_restart(process, ProcessState.EXITED))
        self.assertFalse(self.core.process_should_restart(process, ProcessState.EXITED, is_system_restart=True))

        # same with explicit RestartMode.ABNORMAL specified
        process = self.core.schedule_process(None, uuid.uuid4().hex, definition, restart_mode=RestartMode.ABNORMAL)
        for state in abnormal_states:
            self.assertTrue(self.core.process_should_restart(process, state))
            self.assertTrue(self.core.process_should_restart(process, state, is_system_restart=True))
        self.assertFalse(self.core.process_should_restart(process, ProcessState.EXITED))
        self.assertFalse(self.core.process_should_restart(process, ProcessState.EXITED, is_system_restart=True))

        # RestartMode.NEVER
        process = self.core.schedule_process(None, uuid.uuid4().hex, definition, restart_mode=RestartMode.NEVER)
        for state in all_states:
            self.assertFalse(self.core.process_should_restart(process, state))
            self.assertFalse(self.core.process_should_restart(process, state, is_system_restart=True))

        # RestartMode.ALWAYS
        process = self.core.schedule_process(None, uuid.uuid4().hex, definition, restart_mode=RestartMode.ALWAYS)
        for state in all_states:
            self.assertTrue(self.core.process_should_restart(process, state))
            self.assertTrue(self.core.process_should_restart(process, state, is_system_restart=True))

        # RestartMode.ALWAYS with process.omit_from_system_restart
        process = self.core.schedule_process(
            None,
            uuid.uuid4().hex,
            definition,
            restart_mode=RestartMode.ALWAYS,
            configuration=nosystemrestart_process_config(),
        )
        for state in all_states:
            self.assertTrue(self.core.process_should_restart(process, state))
            self.assertFalse(self.core.process_should_restart(process, state, is_system_restart=True))

        # RestartMode.ABNORMAL with process.omit_from_system_restart
        process = self.core.schedule_process(
            None,
            uuid.uuid4().hex,
            definition,
            restart_mode=RestartMode.ABNORMAL,
            configuration=nosystemrestart_process_config(),
        )
        for state in abnormal_states:
            self.assertTrue(self.core.process_should_restart(process, state))
            self.assertFalse(self.core.process_should_restart(process, state, is_system_restart=True))
        self.assertFalse(self.core.process_should_restart(process, ProcessState.EXITED))
        self.assertFalse(self.core.process_should_restart(process, ProcessState.EXITED, is_system_restart=True))

        # ensure that a process with a busted config doesn't raise an error
        process = self.core.schedule_process(
            None,
            uuid.uuid4().hex,
            definition,
            restart_mode=RestartMode.ALWAYS,
            configuration={"process": ["what is a list doing here??"]},
        )
        for state in all_states:
            self.assertTrue(self.core.process_should_restart(process, state))

    def test_heartbeat_node_update_race(self):

        # test processing two beats simultaneously, for eeagents in the same node.
        # check that they don't collide updating the node record
        node_id = uuid.uuid4().hex
        self.core.node_state(node_id, domain_id_from_engine("engine1"), InstanceState.RUNNING)

        beat = make_beat(node_id)

        # this beat gets injected while the other is in the midst of processing
        sneaky_beat = make_beat(node_id)

        # when the PD attempts to update the process, sneak in an update
        # first so the request conflicts
        original_update_node = self.store.update_node

        def patched_update_node(node):
            # unpatch ourself first so we don't recurse forever
            self.store.update_node = original_update_node

            self.core.ee_heartbeat("eeagent2", sneaky_beat)
            original_update_node(node)

        self.store.update_node = patched_update_node

        self.core.ee_heartbeat("eeagent1", beat)

        # both agents must survive the conflicting update
        node = self.store.get_node(node_id)
        self.assertEqual(set(["eeagent1", "eeagent2"]), set(node.resources))

    def test_heartbeat_node_removed(self):

        # test processing a heartbeat where node is removed partway through
        node_id = uuid.uuid4().hex
        self.core.node_state(node_id, domain_id_from_engine("engine1"), InstanceState.RUNNING)

        beat = make_beat(node_id)

        original_update_node = self.store.update_node

        def patched_update_node(node):
            # unpatch ourself first so we don't recurse forever
            self.store.update_node = original_update_node
            self.store.remove_node(node.node_id)
            original_update_node(node)

        self.store.update_node = patched_update_node

        # this shouldn't blow up, and no resource should be added
        self.core.ee_heartbeat("eeagent1", beat)
        self.assertEqual(self.store.get_resource("eeagent1"), None)

    def test_heartbeat_timestamps(self):
        """last_heartbeat_datetime only moves forward; stale beats ignored."""

        # test processing a heartbeat where node is removed partway through
        node_id = uuid.uuid4().hex
        self.core.node_state(node_id, domain_id_from_engine("engine1"), InstanceState.RUNNING)

        d1 = parse_datetime("2013-04-02T19:37:57.617734+00:00")
        d2 = parse_datetime("2013-04-02T19:38:57.617734+00:00")
        d3 = parse_datetime("2013-04-02T19:39:57.617734+00:00")

        self.core.ee_heartbeat("eeagent1", make_beat(node_id, timestamp=d1.isoformat()))

        resource = self.store.get_resource("eeagent1")
        self.assertEqual(resource.last_heartbeat_datetime, d1)

        self.core.ee_heartbeat("eeagent1", make_beat(node_id, timestamp=d3.isoformat()))
        resource = self.store.get_resource("eeagent1")
        self.assertEqual(resource.last_heartbeat_datetime, d3)

        # out of order hbeat. time shouln't be updated
        self.core.ee_heartbeat("eeagent1", make_beat(node_id, timestamp=d2.isoformat()))
        resource = self.store.get_resource("eeagent1")
        self.assertEqual(resource.last_heartbeat_datetime, d3)

    def test_get_process_constraints(self):
        """test_get_process_constraints

        ensure that order of precedence of engine ids is correct. Should be:

        1. process target - when a process is scheduled, an execution_engine_id
        can be specified in the request's ProcessTarget object. If specified,
        this EE is used.
        2. process/engine mappings - the CEI Launch YML file contains a
        process_engines mapping of process packages to EE names. If the process'
        module matches an entry in this configuration, the associated EE is
        chosen. This format is described below.
        3. default execution engine - the CEI Launch YML file also must specify
        a default_execution_engine value. This is used as a last resort.
        """

        self.registry.set_process_engine_mapping("my", "engine4")
        self.registry.default = "engine1"

        process_definition = {"executable": {"module": "my.test", "class": "MyClass"}}
        process_constraints = {"engine": "mostimportantengine"}

        # no definition, no constraint: falls through to the registry default
        p1 = ProcessRecord.new(None, "proc1", {}, ProcessState.PENDING)
        constraints = self.core.get_process_constraints(p1)
        self.assertEqual(constraints["engine"], self.registry.default)

        # explicit constraint wins over the process_engines module mapping
        p3 = ProcessRecord.new(None, "proc3", process_definition, ProcessState.PENDING, constraints=process_constraints)
        constraints = self.core.get_process_constraints(p3)
        self.assertEqual(constraints["engine"], "mostimportantengine")