class ProcessDispatcherSimpleAPIClient(object):
    """Dict-based facade over ``ProcessDispatcherServiceClient``.

    Process records handed back by this class are plain dicts with two keys:
    ``upid`` (the process id reported by the PD) and ``state`` (a string
    looked up in ``state_map``).
    """

    # Fallback state string, used whenever the state reported by the PD has
    # no entry in ``state_map`` (which includes a state of None).
    unknown_state = "400-PENDING"

    # Translation from ProcessStateEnum values to the state strings exposed
    # in the returned dicts.
    state_map = {
        ProcessStateEnum.SPAWN: '500-RUNNING',
        ProcessStateEnum.TERMINATE: '700-TERMINATED',
        ProcessStateEnum.ERROR: '850-FAILED'
    }

    def __init__(self, name, **kwargs):
        """Create the underlying service client and an event publisher.

        @param name     passed to the service client as ``to_name``
        @param kwargs   forwarded unchanged to the service client
        """
        self.real_client = ProcessDispatcherServiceClient(to_name=name, **kwargs)
        self.event_pub = EventPublisher()

    def _as_dict(self, proc):
        """Build the ``{'upid', 'state'}`` record for one PD process object."""
        return {
            'upid': proc.process_id,
            'state': self.state_map.get(proc.process_state, self.unknown_state),
        }

    def dispatch_process(self, upid, spec, subscribers, constraints=None, immediate=False):
        """Define, create and schedule a process described by ``spec``.

        Only ``spec`` is consulted (its 'name', 'module' and 'class' entries);
        ``upid``, ``subscribers``, ``constraints`` and ``immediate`` are
        accepted but not used by this implementation.

        @retval dict with 'upid' and 'state' of the scheduled process
        """
        proc_name = spec.get('name')

        # The lifecycle event is published before anything is created.
        self.event_pub.publish_event(
            event_type="ProcessLifecycleEvent",
            origin=proc_name,
            origin_type="DispatchedHAProcess",
            state=ProcessStateEnum.SPAWN)

        definition = ProcessDefinition(name=proc_name)
        definition.executable = {
            'module': spec.get('module'),
            'class': spec.get('class'),
        }

        client = self.real_client
        definition_id = client.create_process_definition(definition)
        new_pid = client.create_process(definition_id)
        scheduled_pid = client.schedule_process(
            definition_id, ProcessSchedule(), configuration={},
            process_id=new_pid)

        return self._as_dict(client.read_process(scheduled_pid))

    def terminate_process(self, pid):
        """Cancel process ``pid`` and return whatever the PD client returns."""
        return self.real_client.cancel_process(pid)

    def describe_processes(self):
        """Return one ``{'upid', 'state'}`` dict per process listed by the PD."""
        return [self._as_dict(proc)
                for proc in self.real_client.list_processes()]
class ProcessDispatcherServiceIntTest(IonIntegrationTestCase):
    """Integration tests driving the Process Dispatcher through its client.

    Process state changes are observed through a ``ProcessStateWaiter``.
    """

    def setUp(self):
        """Start the container, deploy r2cei and register a test definition."""
        self._start_container()
        self.container.start_rel_from_url('res/deploy/r2cei.yml')

        self.rr_cli = ResourceRegistryServiceClient()
        self.pd_cli = ProcessDispatcherServiceClient(node=self.container.node)

        definition = ProcessDefinition(name='test_process')
        definition.executable = {
            'module': 'ion.services.cei.test.test_process_dispatcher',
            'class': 'TestProcess',
        }
        self.process_definition = definition
        self.process_definition_id = self.pd_cli.create_process_definition(definition)

        self.waiter = ProcessStateWaiter()

    def tearDown(self):
        """Stop the state waiter."""
        self.waiter.stop()

    # -- helpers -----------------------------------------------------------

    def _always_queue_schedule(self):
        """Return a fresh schedule whose queueing mode is ALWAYS."""
        schedule = ProcessSchedule()
        schedule.queueing_mode = ProcessQueueingMode.ALWAYS
        return schedule

    def _create_and_schedule(self, schedule, configuration):
        """Create a process, start waiting on it, schedule it; return its pid.

        Also asserts that schedule_process hands back the pid it was given.
        """
        pid = self.pd_cli.create_process(self.process_definition_id)
        self.waiter.start(pid)
        scheduled_pid = self.pd_cli.schedule_process(
            self.process_definition_id, schedule,
            configuration=configuration, process_id=pid)
        self.assertEqual(pid, scheduled_pid)
        return pid

    def _cancel_and_await_terminated(self, pid):
        """Cancel ``pid`` and block until its TERMINATED event arrives."""
        self.pd_cli.cancel_process(pid)
        self.waiter.await_state_event(pid, ProcessStateEnum.TERMINATED)

    def _assert_counts_one_to_five(self, client):
        """Call count() five times and expect 1, 2, 3, 4, 5 in order."""
        for expected in range(1, 6):
            self.assertEqual(expected, client.count(timeout=10))

    # -- tests -------------------------------------------------------------

    def test_create_schedule_cancel(self):
        """Run a process, cancel it, then run and cancel a second one."""
        schedule = self._always_queue_schedule()

        first_pid = self._create_and_schedule(schedule, {})

        # verifies L4-CI-CEI-RQ141 and L4-CI-CEI-RQ142
        self.waiter.await_state_event(first_pid, ProcessStateEnum.RUNNING)

        from_pd = self.pd_cli.read_process(first_pid)
        self.assertEqual(from_pd.process_id, first_pid)
        self.assertEqual(from_pd.process_configuration, {})
        self.assertEqual(from_pd.process_state, ProcessStateEnum.RUNNING)

        # the process must also be readable straight from the RR (mirrored)
        # verifies L4-CI-CEI-RQ63
        # verifies L4-CI-CEI-RQ64
        from_rr = self.rr_cli.read(first_pid)
        self.assertEqual(from_rr.process_id, first_pid)

        # talk to the process to make sure it is really running
        test_client = TestClient()
        self._assert_counts_one_to_five(test_client)

        # verifies L4-CI-CEI-RQ147
        # kill the process and start it again
        self._cancel_and_await_terminated(first_pid)
        self.waiter.stop()

        second_pid = self._create_and_schedule(schedule, {})
        self.assertNotEqual(first_pid, second_pid)
        self.waiter.await_state_event(second_pid, ProcessStateEnum.RUNNING)

        self._assert_counts_one_to_five(test_client)

        # kill the process for good
        self._cancel_and_await_terminated(second_pid)

    def test_schedule_with_config(self):
        """Configuration must reach the process and show up in PD queries."""
        schedule = self._always_queue_schedule()

        # verifies L4-CI-CEI-RQ66
        # feed in a string that the process will return -- verifies that
        # configuration actually makes it to the instantiated process
        test_response = uuid.uuid4().hex
        configuration = {"test_response" : test_response}

        pid = self._create_and_schedule(schedule, configuration)
        self.waiter.await_state_event(pid, ProcessStateEnum.RUNNING)

        test_client = TestClient()

        # verifies L4-CI-CEI-RQ139
        # assure that configuration block (which can contain inputs, outputs,
        # and arbitrary config) 1) makes it to the process and 2) is returned
        # in process queries
        self.assertEqual(test_client.query(), test_response)

        described = self.pd_cli.read_process(pid)
        self.assertEqual(described.process_id, pid)
        self.assertEqual(described.process_configuration, configuration)

        # kill the process for good
        self._cancel_and_await_terminated(pid)

    def test_schedule_bad_config(self):
        """A configuration holding a non-serializable object is rejected."""
        schedule = ProcessSchedule()

        # a non-JSON-serializable IonObject
        unserializable = ProcessTarget()

        with self.assertRaises(BadRequest) as ar:
            self.pd_cli.schedule_process(
                self.process_definition_id, schedule,
                configuration={"bad": unserializable})
        self.assertTrue(ar.exception.message.startswith("bad configuration"))

    def test_create_invalid_definition(self):
        """A definition lacking module and class is rejected."""
        # create process definition missing module and class
        # verifies L4-CI-CEI-RQ137
        url_only = dict(url="http://somewhere.com/something.py")
        bad_definition = ProcessDefinition(name="test_process", executable=url_only)
        with self.assertRaises(BadRequest) as ar:
            self.pd_cli.create_process_definition(bad_definition)
class ProcessDispatcherServiceIntTest(IonIntegrationTestCase):
    """Integration tests driving the Process Dispatcher through its client.

    Process state changes are observed with an ``EventSubscriber`` whose
    callback pushes every received event onto ``self.event_queue``.
    """

    def setUp(self):
        """Start the container, deploy r2cei and register a test definition."""
        self._start_container()
        self.container.start_rel_from_url('res/deploy/r2cei.yml')

        self.pd_cli = ProcessDispatcherServiceClient(node=self.container.node)

        self.process_definition = ProcessDefinition(name='test_process')
        self.process_definition.executable = {
            'module': 'ion.services.cei.test.test_process_dispatcher',
            'class': 'TestProcess',
        }
        self.process_definition_id = self.pd_cli.create_process_definition(
            self.process_definition)

        self.event_queue = queue.Queue()
        self.event_sub = None

    def tearDown(self):
        """Stop the active subscriber (if any), then always stop the container."""
        try:
            if self.event_sub:
                self.event_sub.stop()
        finally:
            # the container must be stopped even if the subscriber fails
            # to shut down cleanly
            self._stop_container()

    def _event_callback(self, event, *args, **kwargs):
        """Subscriber callback: queue the event for await_state_event()."""
        self.event_queue.put(event)

    def subscribe_events(self, origin):
        """Subscribe to ProcessLifecycleEvents whose origin is ``origin``.

        Any subscriber left over from an earlier call is stopped first, so
        that re-subscribing within one test does not leave a running
        subscriber behind that tearDown can no longer reach.
        """
        if self.event_sub:
            self.event_sub.stop()
        self.event_sub = EventSubscriber(
            event_type="ProcessLifecycleEvent",
            callback=self._event_callback,
            origin=origin,
            origin_type="DispatchedProcess")
        self.event_sub.start()

    def await_state_event(self, pid, state):
        """Take the next queued event and assert it is ``state`` for ``pid``.

        Waits up to 5 seconds for an event to arrive.

        @retval the event that was received
        """
        event = self.event_queue.get(timeout=5)
        log.debug("Got event: %s", event)
        self.assertEqual(event.origin, pid)
        self.assertEqual(event.state, state)
        return event

    def test_create_schedule_cancel(self):
        """Run a process, cancel it, then run and cancel a second one."""
        process_schedule = ProcessSchedule()
        process_schedule.queueing_mode = ProcessQueueingMode.ALWAYS

        pid = self.pd_cli.create_process(self.process_definition_id)
        self.subscribe_events(pid)

        pid2 = self.pd_cli.schedule_process(
            self.process_definition_id, process_schedule,
            configuration={}, process_id=pid)
        self.assertEqual(pid, pid2)

        self.await_state_event(pid, ProcessStateEnum.SPAWN)

        proc = self.pd_cli.read_process(pid)
        self.assertEqual(proc.process_id, pid)
        self.assertEqual(proc.process_configuration, {})
        self.assertEqual(proc.process_state, ProcessStateEnum.SPAWN)

        # now try communicating with the process to make sure it is really running
        test_client = TestClient()
        for i in range(5):
            self.assertEqual(i + 1, test_client.count(timeout=10))

        # kill the process and start it again
        self.pd_cli.cancel_process(pid)
        self.await_state_event(pid, ProcessStateEnum.TERMINATE)

        oldpid = pid

        pid = self.pd_cli.create_process(self.process_definition_id)
        self.subscribe_events(pid)

        pid2 = self.pd_cli.schedule_process(
            self.process_definition_id, process_schedule,
            configuration={}, process_id=pid)
        self.assertEqual(pid, pid2)
        self.assertNotEqual(oldpid, pid)

        self.await_state_event(pid, ProcessStateEnum.SPAWN)

        for i in range(5):
            self.assertEqual(i + 1, test_client.count(timeout=10))

        # kill the process for good
        self.pd_cli.cancel_process(pid)
        self.await_state_event(pid, ProcessStateEnum.TERMINATE)

    def test_schedule_bad_config(self):
        """A configuration holding a non-serializable object is rejected."""
        process_schedule = ProcessSchedule()

        # a non-JSON-serializable IonObject
        o = ProcessTarget()

        with self.assertRaises(BadRequest) as ar:
            self.pd_cli.schedule_process(
                self.process_definition_id, process_schedule,
                configuration={"bad": o})
        self.assertTrue(ar.exception.message.startswith("bad configuration"))
class HAProcessControl(object):
    """Controls the processes of one service through a Process Dispatcher.

    A dict describing each managed process is cached in ``self.processes``,
    keyed by process id. The cache is filled from the service's "hasProcess"
    associations in start(), refreshed from ProcessLifecycleEvents, and can
    be re-synchronised on demand with reload_processes().
    """

    def __init__(self, pd_name, resource_registry, service_id, callback=None, logprefix=""):
        """
        @param pd_name            name of the Process Dispatcher to talk to
        @param resource_registry  registry used to read the service and to
                                  find/create its "hasProcess" associations
        @param service_id         id of the service owning the processes
        @param callback           optional zero-argument callable invoked
                                  after an event for a managed process
        @param logprefix          string prepended to log messages
        @throws ValueError if a truthy, non-callable callback is given
        """
        self.pd_name = pd_name
        self.resource_registry = resource_registry
        self.service_id = service_id

        self.callback = callback
        if callback and not callable(callback):
            raise ValueError("callback is not callable")

        self.logprefix = logprefix

        self.client = ProcessDispatcherServiceClient(to_name=pd_name)
        self.event_sub = EventSubscriber(
            event_type="ProcessLifecycleEvent",
            callback=self._event_callback,
            origin_type="DispatchedProcess",
            auto_delete=True)

        self.processes = {}

    def start(self):
        """Recover processes associated with the service, then subscribe.

        Associated process ids unknown to the PD (NotFound) are skipped.
        """
        service = self.resource_registry.read(self.service_id)
        process_assocs = self.resource_registry.find_associations(service, "hasProcess")

        for process_assoc in process_assocs:
            process_id = process_assoc.o
            if process_id:
                try:
                    process = self.client.read_process(process_id)
                except NotFound:
                    log.debug("%sService was associated with process %s, which is unknown to PD. ignoring.",
                        self.logprefix, process_id)
                    continue

                state = process.process_state
                state_str = ProcessStateEnum._str_map.get(state, str(state))

                self.processes[process.process_id] = _process_dict_from_object(process)
                log.info("%srecovered process %s state=%s", self.logprefix, process_id, state_str)

        self.event_sub.start()

    def stop(self):
        """Stop the event subscriber."""
        self.event_sub.stop()

    def get_managed_upids(self):
        """Return a list with the ids of all cached processes.

        A new list is built on every call, so callers never hold a live
        view of the cache (``dict.keys()`` is a view on Python 3).
        """
        return list(self.processes)

    def _event_callback(self, event, *args, **kwargs):
        """Subscriber entry point; never lets a handler error escape.

        KeyboardInterrupt and SystemExit are re-raised; anything else is
        logged with its traceback.
        """
        if not event:
            return

        try:
            self._inner_event_callback(event)
        except (KeyboardInterrupt, SystemExit):
            raise
        except:
            log.exception("%sException in event handler. This is a bug!", self.logprefix)

    def _inner_event_callback(self, event):
        """Refresh the cached record for the process named by ``event``.

        Events for processes that are not in the cache are ignored. The
        process is re-read from the PD (up to three attempts on Timeout);
        if that fails, only the cached 'state' is updated from the event.
        The callback, if any, is then invoked.
        """
        process_id = event.origin
        state = event.state

        if not (process_id and process_id in self.processes):
            # we receive events for all processes but ignore most
            return

        state_str = ProcessStateEnum._str_map.get(state, str(state))

        process = None
        for _ in range(3):
            try:
                process = self.client.read_process(process_id)
                break
            except Timeout:
                # fall through to the next attempt
                log.warn("%sTimeout trying to read process from Process Dispatcher!",
                    self.logprefix, exc_info=True)
            except NotFound:
                break

        if process:
            log.info("%sreceived process %s state=%s", self.logprefix, process_id, state_str)

            # replace the cached data about this process
            self.processes[process_id] = _process_dict_from_object(process)
        else:
            log.warn("%sReceived process %s event but failed to read from Process Dispatcher",
                self.logprefix, process_id)

            #XXX better approach here? we at least have the state from the event,
            # so sticking that on cached process. We could miss other important
            # data like hostname however.
            self.processes[process_id]['state'] = process_state_to_pd_core(state)

        if self.callback:
            try:
                self.callback()
            except (KeyboardInterrupt, SystemExit):
                raise
            except:
                # index 1 is the exception instance; index 0 would only
                # give its class, which makes for an uninformative message
                e = sys.exc_info()[1]
                log.warn("%sError in HAAgent callback: %s", self.logprefix, e, exc_info=True)

    def _associate_process(self, process):
        """Record a "hasProcess" association; failures are logged, not raised."""
        try:
            self.resource_registry.create_association(self.service_id,
                "hasProcess", process.process_id)
        except Exception:
            log.exception("Couldn't associate service %s to process %s",
                self.service_id, process.process_id)

    def schedule_process(self, pd_name, process_definition_id, **kwargs):
        """Schedule a process, reusing a cached terminal-state pid if any.

        @param pd_name                must equal the PD this object controls
        @param process_definition_id  definition to create/schedule from
        @param kwargs                 passed to _get_process_schedule();
                                      'configuration' is also forwarded to
                                      the PD's schedule_process
        @retval the id of the scheduled process
        @throws Exception if pd_name is not this object's PD
        """
        if pd_name != self.pd_name:
            raise Exception("schedule_process request received for unknown PD: %s" % pd_name)

        # figure out if there is an existing PID which can be reused
        found_upid = None
        for process in self.processes.values():
            upid = process.get('upid')
            state = process.get('state')
            if not (upid and state):
                continue

            if state in CoreProcessState.TERMINAL_STATES:
                found_upid = upid

        if found_upid:
            upid = found_upid
            proc = self.client.read_process(upid)

        else:
            # otherwise create a new process and associate
            upid = self.client.create_process(process_definition_id)

            # note: if the HAAgent fails between the create call above and the
            # associate call below, there may be orphaned Process objects. These
            # processes will not however be running, so are largely harmless.
            proc = self.client.read_process(upid)
            self._associate_process(proc)

        process_schedule = _get_process_schedule(**kwargs)
        configuration = kwargs.get('configuration')

        # cheat and roll the process state to REQUESTED before we actually
        # schedule it. this is in-memory only, so should be harmless. This
        # avoids a race between this scheduling process and the event
        # subscriber.
        proc.process_state = ProcessStateEnum.REQUESTED
        self.processes[upid] = _process_dict_from_object(proc)

        self.client.schedule_process(process_definition_id, process_schedule,
            configuration=configuration, process_id=upid)
        return upid

    def terminate_process(self, pid):
        """Cancel process ``pid`` and return whatever the PD client returns."""
        return self.client.cancel_process(pid)

    def get_all_processes(self):
        """Return ``{pd_name: [process dicts]}`` holding deep copies.

        The values are materialised into a list before copying because a
        dict values view cannot be deep-copied on Python 3.
        """
        processes = deepcopy(list(self.processes.values()))
        return {self.pd_name: processes}

    def reload_processes(self):
        """Re-read every cached process from the PD and fix stale states.

        A process whose read fails is left untouched. Iteration runs over a
        snapshot of the cache so entries added while a read is in flight
        cannot disturb the loop.
        """
        for process_id, process_dict in list(self.processes.items()):
            try:
                process = self.client.read_process(process_id)
            except Exception:
                log.warn("%sFailed to read process %s from PD. Will retry later.",
                    self.logprefix, process_id, exc_info=True)
                continue
            new_process_dict = _process_dict_from_object(process)
            if new_process_dict['state'] != process_dict['state']:
                log.warn("%sUpdating process %s record manually. we may have missed an event?",
                    self.logprefix, process_id)
                self.processes[process_id] = new_process_dict
class ProcessDispatcherEEAgentIntTest(ProcessDispatcherServiceIntTest):
    """Run the basic int tests again, with a different environment

    setUp() starts the process dispatcher as a container app configured
    with ``pd_config``, announces one node for "engine1" over dashi and
    spawns one eeagent for that node. The inherited tests, plus the
    engine/node specific tests below, then run against that setup.
    """

    def setUp(self):
        """Start the container, the PD app, one fake node and one eeagent."""
        # set first so tearDown's ``if self.dashi`` check has an attribute
        # to look at
        self.dashi = None
        self._start_container()
        from pyon.public import CFG

        self.container_client = ContainerAgentClient(node=self.container.node,
            name=self.container.name)
        self.container = self.container_client._get_container_instance()

        app = dict(name="process_dispatcher", processapp=("process_dispatcher",
            "ion.services.cei.process_dispatcher_service",
            "ProcessDispatcherService"))
        self.container.start_app(app, config=pd_config)

        self.rr_cli = self.container.resource_registry

        self.pd_cli = ProcessDispatcherServiceClient(node=self.container.node)

        self.process_definition = ProcessDefinition(name='test_process')
        self.process_definition.executable = {'module': 'ion.services.cei.test.test_process_dispatcher',
            'class': 'TestProcess'}
        self.process_definition_id = self.pd_cli.create_process_definition(self.process_definition)

        # bookkeeping for the eeagents started by _start_eeagent()
        self._eea_pids = []
        self._eea_pid_to_resource_id = {}
        self._eea_pid_to_persistence_dir = {}
        # temporary directories created by this test; removed in tearDown
        self._tmpdirs = []

        self.dashi = get_dashi(uuid.uuid4().hex,
            pd_config['processdispatcher']['dashi_uri'],
            pd_config['processdispatcher']['dashi_exchange'],
            sysname=CFG.get_safe("dashi.sysname")
            )

        #send a fake node_state message to PD's dashi binding.
        self.node1_id = uuid.uuid4().hex
        self._send_node_state("engine1", self.node1_id)
        self._initial_eea_pid = self._start_eeagent(self.node1_id)

        self.waiter = ProcessStateWaiter()

    def _send_node_state(self, engine_id, node_id=None):
        """Fire a "node_state" dashi message reporting a RUNNING node.

        @param engine_id  engine used to derive the message's domain_id
        @param node_id    node id to report; a random one is generated
                          when omitted
        """
        node_id = node_id or uuid.uuid4().hex
        node_state = dict(node_id=node_id, state=InstanceState.RUNNING,
            domain_id=domain_id_from_engine(engine_id))
        self.dashi.fire(get_pd_dashi_name(), "node_state", args=node_state)

    def _start_eeagent(self, node_id, resource_id=None, persistence_dir=None):
        """Spawn an ExecutionEngineAgent process and record it.

        @param node_id          node id passed into the agent configuration
        @param resource_id      agent resource id; random when omitted
        @param persistence_dir  directory passed into the agent
                                configuration; a new temp directory is
                                created (and tracked for cleanup) when
                                omitted
        @retval pid of the spawned agent process
        """
        if not persistence_dir:
            persistence_dir = tempfile.mkdtemp()
            self._tmpdirs.append(persistence_dir)
        resource_id = resource_id or uuid.uuid4().hex
        agent_config = _get_eeagent_config(node_id, persistence_dir,
            resource_id=resource_id)
        pid = self.container_client.spawn_process(name="eeagent",
            module="ion.agents.cei.execution_engine_agent",
            cls="ExecutionEngineAgent", config=agent_config)
        log.info('Agent pid=%s.', str(pid))
        self._eea_pids.append(pid)
        self._eea_pid_to_resource_id[pid] = resource_id
        self._eea_pid_to_persistence_dir[pid] = persistence_dir
        return pid

    def _kill_eeagent(self, pid):
        """Terminate a tracked eeagent process and drop its bookkeeping."""
        self.assertTrue(pid in self._eea_pids)
        self.container.terminate_process(pid)
        self._eea_pids.remove(pid)
        del self._eea_pid_to_resource_id[pid]
        del self._eea_pid_to_persistence_dir[pid]

    def tearDown(self):
        """Kill all eeagents, remove temp dirs, stop the waiter and dashi."""
        # iterate over a copy: _kill_eeagent removes entries from the list
        for pid in list(self._eea_pids):
            self._kill_eeagent(pid)
        for d in self._tmpdirs:
            shutil.rmtree(d)

        self.waiter.stop()
        if self.dashi:
            self.dashi.cancel()

    def test_requested_ee(self):
        """Processes targeted at an engine wait, run or get rejected."""
        # request non-default engine

        process_target = ProcessTarget(execution_engine_id="engine2")
        process_schedule = ProcessSchedule()
        process_schedule.queueing_mode = ProcessQueueingMode.ALWAYS
        process_schedule.target = process_target

        pid = self.pd_cli.create_process(self.process_definition_id)
        self.waiter.start()

        self.pd_cli.schedule_process(self.process_definition_id,
            process_schedule, process_id=pid)

        self.waiter.await_state_event(pid, ProcessStateEnum.WAITING)

        # request unknown engine, with NEVER queuing mode. The request
        # should be rejected.
        # verifies L4-CI-CEI-RQ52

        process_target = ProcessTarget(execution_engine_id="not-a-real-ee")
        process_schedule = ProcessSchedule()
        process_schedule.queueing_mode = ProcessQueueingMode.NEVER
        process_schedule.target = process_target

        rejected_pid = self.pd_cli.create_process(self.process_definition_id)

        self.pd_cli.schedule_process(self.process_definition_id,
            process_schedule, process_id=rejected_pid)

        self.waiter.await_state_event(rejected_pid, ProcessStateEnum.REJECTED)

        # now add a node and eeagent for engine2. original process should leave
        # queue and start running
        node2_id = uuid.uuid4().hex
        self._send_node_state("engine2", node2_id)
        self._start_eeagent(node2_id)

        self.waiter.await_state_event(pid, ProcessStateEnum.RUNNING)

        # spawn another process. it should start immediately.

        process_target = ProcessTarget(execution_engine_id="engine2")
        process_schedule = ProcessSchedule()
        process_schedule.queueing_mode = ProcessQueueingMode.NEVER
        process_schedule.target = process_target

        pid2 = self.pd_cli.create_process(self.process_definition_id)

        self.pd_cli.schedule_process(self.process_definition_id,
            process_schedule, process_id=pid2)

        self.waiter.await_state_event(pid2, ProcessStateEnum.RUNNING)

        # one more with node exclusive

        process_target = ProcessTarget(execution_engine_id="engine2",
            node_exclusive="hats")
        process_schedule = ProcessSchedule()
        process_schedule.queueing_mode = ProcessQueueingMode.NEVER
        process_schedule.target = process_target

        pid3 = self.pd_cli.create_process(self.process_definition_id)

        self.pd_cli.schedule_process(self.process_definition_id,
            process_schedule, process_id=pid3)

        self.waiter.await_state_event(pid3, ProcessStateEnum.RUNNING)

        # kill the processes for good
        self.pd_cli.cancel_process(pid)
        self.waiter.await_state_event(pid, ProcessStateEnum.TERMINATED)
        self.pd_cli.cancel_process(pid2)
        self.waiter.await_state_event(pid2, ProcessStateEnum.TERMINATED)
        self.pd_cli.cancel_process(pid3)
        self.waiter.await_state_event(pid3, ProcessStateEnum.TERMINATED)

    def test_node_exclusive(self):
        """Two processes sharing a node_exclusive tag need two nodes."""
        # the node_exclusive constraint is used to ensure multiple processes
        # of the same "kind" each get a VM exclusive of each other. Other
        # processes may run on these VMs, just not processes with the same
        # node_exclusive tag. Since we cannot directly query the contents
        # of each node in this test, we prove the capability by scheduling
        # processes one by one and checking their state.

        # verifies L4-CI-CEI-RQ121
        # verifies L4-CI-CEI-RQ57

        # first off, setUp() created a single node and eeagent.
        # We schedule two processes with the same "abc" node_exclusive
        # tag. Since there is only one node, the first process should run
        # and the second should be queued.

        process_target = ProcessTarget(execution_engine_id="engine1")
        process_target.node_exclusive = "abc"
        process_schedule = ProcessSchedule()
        process_schedule.queueing_mode = ProcessQueueingMode.ALWAYS
        process_schedule.target = process_target

        pid1 = self.pd_cli.create_process(self.process_definition_id)
        self.waiter.start()

        self.pd_cli.schedule_process(self.process_definition_id,
            process_schedule, process_id=pid1)

        self.waiter.await_state_event(pid1, ProcessStateEnum.RUNNING)

        pid2 = self.pd_cli.create_process(self.process_definition_id)
        self.pd_cli.schedule_process(self.process_definition_id,
            process_schedule, process_id=pid2)
        self.waiter.await_state_event(pid2, ProcessStateEnum.WAITING)

        # now demonstrate that the node itself is not full by launching
        # a third process without a node_exclusive tag -- it should start
        # immediately

        process_target.node_exclusive = None
        pid3 = self.pd_cli.create_process(self.process_definition_id)
        self.pd_cli.schedule_process(self.process_definition_id,
            process_schedule, process_id=pid3)
        self.waiter.await_state_event(pid3, ProcessStateEnum.RUNNING)

        # finally, add a second node to the engine. pid2 should be started
        # since there is an exclusive "abc" node free.
        node2_id = uuid.uuid4().hex
        self._send_node_state("engine1", node2_id)
        self._start_eeagent(node2_id)
        self.waiter.await_state_event(pid2, ProcessStateEnum.RUNNING)

        # kill the processes for good
        self.pd_cli.cancel_process(pid1)
        self.waiter.await_state_event(pid1, ProcessStateEnum.TERMINATED)
        self.pd_cli.cancel_process(pid2)
        self.waiter.await_state_event(pid2, ProcessStateEnum.TERMINATED)
        self.pd_cli.cancel_process(pid3)
        self.waiter.await_state_event(pid3, ProcessStateEnum.TERMINATED)

    def test_code_download(self):
        """A definition with a 'url' runs; the same one without it fails."""
        # create a process definition that has no URL; only module and class.
        process_definition_no_url = ProcessDefinition(name='test_process_nodownload')
        process_definition_no_url.executable = {'module': 'ion.my.test.process',
                'class': 'TestProcess'}
        process_definition_id_no_url = self.pd_cli.create_process_definition(process_definition_no_url)

        # create another that has a URL of the python file (this very file)
        # verifies L4-CI-CEI-RQ114
        url = "file://%s" % os.path.join(os.path.dirname(__file__), 'test_process_dispatcher.py')
        process_definition = ProcessDefinition(name='test_process_download')
        process_definition.executable = {'module': 'ion.my.test.process',
                'class': 'TestProcess', 'url': url}
        process_definition_id = self.pd_cli.create_process_definition(process_definition)

        process_target = ProcessTarget()
        process_schedule = ProcessSchedule()
        process_schedule.queueing_mode = ProcessQueueingMode.ALWAYS
        process_schedule.target = process_target

        self.waiter.start()

        # Test a module with no download fails
        pid_no_url = self.pd_cli.create_process(process_definition_id_no_url)

        self.pd_cli.schedule_process(process_definition_id_no_url,
            process_schedule, process_id=pid_no_url)

        self.waiter.await_state_event(pid_no_url, ProcessStateEnum.FAILED)

        # Test a module with a URL runs
        pid = self.pd_cli.create_process(process_definition_id)

        self.pd_cli.schedule_process(process_definition_id,
            process_schedule, process_id=pid)

        self.waiter.await_state_event(pid, ProcessStateEnum.RUNNING)

    def _add_test_process(self, restart_mode=None):
        """Create and schedule a test process with its own listen name.

        @param restart_mode  restart mode set on the schedule; the
                             schedule's own default is kept when None
        @retval (pid, client) where client is a TestClient addressed to
                the process's unique listen name
        """
        process_schedule = ProcessSchedule()
        if restart_mode is not None:
            process_schedule.restart_mode = restart_mode
        pid = self.pd_cli.create_process(self.process_definition_id)

        pid_listen_name = "PDtestproc_%s" % uuid.uuid4().hex
        config = {'process': {'listen_name': pid_listen_name}}

        self.pd_cli.schedule_process(self.process_definition_id, process_schedule,
            process_id=pid, configuration=config)

        client = TestClient(to_name=pid_listen_name)
        return pid, client

    def test_restart(self):
        """After an eeagent restart only ALWAYS/ABNORMAL processes come back."""
        self.waiter.start()

        restartable_pids = []
        nonrestartable_pids = []
        clients = {}
        # start 10 processes with RestartMode.ALWAYS
        for _ in range(10):
            pid, client = self._add_test_process(ProcessRestartMode.ALWAYS)
            restartable_pids.append(pid)
            clients[pid] = client

        # and 10 processes with RestartMode.ABNORMAL
        for _ in range(10):
            pid, client = self._add_test_process(ProcessRestartMode.ABNORMAL)
            restartable_pids.append(pid)
            clients[pid] = client

        # and 10 with RestartMode.NEVER
        for _ in range(10):
            pid, client = self._add_test_process(ProcessRestartMode.NEVER)
            nonrestartable_pids.append(pid)
            clients[pid] = client

        all_pids = restartable_pids + nonrestartable_pids

        self.waiter.await_many_state_events(all_pids, ProcessStateEnum.RUNNING)

        for pid in all_pids:
            client = clients[pid]
            self.assertFalse(client.is_restart())
            self.assertEqual(client.count(), 1)

        # now kill the whole eeagent and restart it. processes should
        # show up as FAILED in the next heartbeat.
        resource_id = self._eea_pid_to_resource_id[self._initial_eea_pid]
        persistence_dir = self._eea_pid_to_persistence_dir[self._initial_eea_pid]
        log.debug("Restarting eeagent %s", self._initial_eea_pid)
        self._kill_eeagent(self._initial_eea_pid)

        # manually kill the processes to simulate a real container failure
        for pid in all_pids:
            self.container.terminate_process(pid)

        # the replacement agent reuses the old resource id and persistence dir
        self._start_eeagent(self.node1_id, resource_id=resource_id,
            persistence_dir=persistence_dir)

        # wait for restartables to restart
        self.waiter.await_many_state_events(restartable_pids, ProcessStateEnum.RUNNING)

        # query the processes again. it should have restart mode config
        for pid in restartable_pids:
            client = clients[pid]
            self.assertTrue(client.is_restart())
            self.assertEqual(client.count(), 1)

        # meanwhile some procs should not have restarted
        for pid in nonrestartable_pids:
            proc = self.pd_cli.read_process(pid)
            self.assertEqual(proc.process_state, ProcessStateEnum.FAILED)

        # guard against extraneous events we were receiving as part of a bug:
        # processes restarting again after they were already restarted
        self.waiter.await_nothing(timeout=5)

    def test_idempotency(self):
        """Repeating schedule_process and cancel_process is harmless."""
        # ensure every operation can be safely retried
        process_schedule = ProcessSchedule()
        process_schedule.queueing_mode = ProcessQueueingMode.ALWAYS

        proc_name = 'myreallygoodname'
        pid = self.pd_cli.create_process(self.process_definition_id)
        self.waiter.start(pid)

        # note: if we import UNSCHEDULED state into ProcessStateEnum,
        # this assertion will need to change.
        proc = self.pd_cli.read_process(pid)
        self.assertEqual(proc.process_id, pid)
        self.assertEqual(proc.process_state, ProcessStateEnum.REQUESTED)

        pid2 = self.pd_cli.schedule_process(self.process_definition_id,
            process_schedule, configuration={}, process_id=pid, name=proc_name)
        self.assertEqual(pid, pid2)

        self.waiter.await_state_event(pid, ProcessStateEnum.RUNNING)

        # repeating schedule is harmless
        pid2 = self.pd_cli.schedule_process(self.process_definition_id,
            process_schedule, configuration={}, process_id=pid, name=proc_name)
        self.assertEqual(pid, pid2)

        proc = self.pd_cli.read_process(pid)
        self.assertEqual(proc.process_id, pid)
        self.assertEqual(proc.process_configuration, {})
        self.assertEqual(proc.process_state, ProcessStateEnum.RUNNING)

        self.pd_cli.cancel_process(pid)
        self.waiter.await_state_event(pid, ProcessStateEnum.TERMINATED)

        # repeating cancel is harmless
        self.pd_cli.cancel_process(pid)
        proc = self.pd_cli.read_process(pid)
        self.assertEqual(proc.process_id, pid)
        self.assertEqual(proc.process_configuration, {})
        self.assertEqual(proc.process_state, ProcessStateEnum.TERMINATED)
class Launcher(object):
    """
    Helper for launching platform and instrument agent processes.
    """

    def __init__(self, timeout_spawn):
        """
        @param timeout_spawn    Default timeout in secs for the RUNNING event.
        """
        self._timeout_spawn = timeout_spawn
        self._pd_client = ProcessDispatcherServiceClient()
        self._agent_launcher = AgentLauncher(self._pd_client)

    def destroy(self):
        """
        Closes the process dispatcher client and drops the references to
        it and to the agent launcher. Calling it again is a no-op.
        """
        if self._pd_client:
            self._pd_client.close()
            self._pd_client = None
            self._agent_launcher = None

    def _launch_agent(self, caller, name_prefix, module, cls,
                      agt_id, agent_config, timeout_spawn):
        """
        Shared implementation of launch_platform and launch_instrument.

        @param caller         Name of the public method, used as log prefix
        @param name_prefix    Prefix of the process definition name; the
                              full name is "<name_prefix>_<agt_id>"
        @param module         Module of the agent executable
        @param cls            Class of the agent executable
        @param agt_id         Some ID mainly used for logging
        @param agent_config   Agent configuration
        @param timeout_spawn  Timeout in secs for the RUNNING event; a
                              falsy value selects the constructor default
        @return process ID
        """
        timeout_spawn = timeout_spawn or self._timeout_spawn
        log.debug("%s: agt_id=%r, timeout_spawn=%s",
                  caller, agt_id, timeout_spawn)

        pdef = ProcessDefinition(name='%s_%s' % (name_prefix, agt_id))
        pdef.executable = {
            'module': module,
            'class': cls
        }
        pdef_id = self._pd_client.create_process_definition(process_definition=pdef)

        pid = self._agent_launcher.launch(agent_config, pdef_id)

        if timeout_spawn:
            log.debug("%s: agt_id=%r: waiting for RUNNING", caller, agt_id)
            self._agent_launcher.await_launch(timeout_spawn)
            log.debug("%s: agt_id=%r: RUNNING", caller, agt_id)

        return pid

    def launch_platform(self, agt_id, agent_config, timeout_spawn=None):
        """
        Launches a platform agent.

        @param agt_id         Some ID mainly used for logging
        @param agent_config   Agent configuration
        @param timeout_spawn  Timeout in secs for the RUNNING event. A
                              falsy value (None or zero) selects the value
                              given in the constructor. No wait is
                              performed only when the resulting timeout is
                              itself None or zero.
        @return process ID
        """
        return self._launch_agent(
            "launch_platform", "PlatformAgent",
            'ion.agents.platform.platform_agent', 'PlatformAgent',
            agt_id, agent_config, timeout_spawn)

    def launch_instrument(self, agt_id, agent_config, timeout_spawn=None):
        """
        Launches an instrument agent.

        @param agt_id         Some ID mainly used for logging
        @param agent_config   Agent configuration
        @param timeout_spawn  Timeout in secs for the RUNNING event. A
                              falsy value (None or zero) selects the value
                              given in the constructor. No wait is
                              performed only when the resulting timeout is
                              itself None or zero.
        @return process ID
        """
        return self._launch_agent(
            "launch_instrument", "InstrumentAgent",
            'ion.agents.instrument.instrument_agent', 'InstrumentAgent',
            agt_id, agent_config, timeout_spawn)

    def cancel_process(self, pid, timeout_cancel=None):
        """
        Helper to terminate a process.

        Nothing is done unless the process is currently RUNNING.

        @param pid             ID of the process to cancel
        @param timeout_cancel  If truthy, number of secs to wait for the
                               process to reach TERMINATED; failing to get
                               there in time is only logged.
        """
        pinfo = self._pd_client.read_process(pid)
        if pinfo.process_state != ProcessStateEnum.RUNNING:
            log.debug("cancel_process: pid=%r is not RUNNING", pid)
            return

        log.debug("cancel_process: canceling pid=%r", pid)
        self._pd_client.cancel_process(pid)

        if timeout_cancel:
            log.debug("waiting %s seconds for process to cancel", timeout_cancel)
            psg = ProcessStateGate(self._pd_client.read_process, pid,
                                   ProcessStateEnum.TERMINATED)
            # ``await`` is a reserved word from Python 3.7 on, so the gate's
            # method is looked up by name; this calls the very same method
            # while keeping the module parseable on every Python version.
            if not getattr(psg, "await")(timeout_cancel):
                log.debug("Process %r failed to get to TERMINATED in %s seconds",
                          pid, timeout_cancel)
class HAProcessControl(object):
    """Tracks the processes of one service through a Process Dispatcher.

    A dict describing each managed process is cached in ``self.processes``,
    keyed by process id. The cache is filled from the service's "hasProcess"
    associations in start() and refreshed from ProcessLifecycleEvents.
    """

    def __init__(self, pd_name, resource_registry, service_id, callback=None, logprefix=""):
        """
        @param pd_name            name of the Process Dispatcher to talk to
        @param resource_registry  registry used to read the service and to
                                  find its "hasProcess" associations
        @param service_id         id of the service owning the processes
        @param callback           optional zero-argument callable invoked
                                  after an event for a managed process
        @param logprefix          string prepended to log messages
        @throws ValueError if a truthy, non-callable callback is given
        """
        self.pd_name = pd_name
        self.resource_registry = resource_registry
        self.service_id = service_id

        self.callback = callback
        if callback and not callable(callback):
            raise ValueError("callback is not callable")

        self.logprefix = logprefix

        self.client = ProcessDispatcherServiceClient(to_name=pd_name)
        self.event_sub = EventSubscriber(
            event_type="ProcessLifecycleEvent",
            callback=self._event_callback,
            origin_type="DispatchedProcess",
            auto_delete=True)

        self.processes = {}

    def start(self):
        """Recover processes associated with the service, then subscribe.

        Associated process ids unknown to the PD (NotFound) are skipped.
        """
        service = self.resource_registry.read(self.service_id)
        process_assocs = self.resource_registry.find_associations(service, "hasProcess")

        for process_assoc in process_assocs:
            process_id = process_assoc.o
            if process_id:
                try:
                    process = self.client.read_process(process_id)
                except NotFound:
                    log.debug("%sService was associated with process %s, which is unknown to PD. ignoring.",
                        self.logprefix, process_id)
                    continue

                state = process.process_state
                state_str = ProcessStateEnum._str_map.get(state, str(state))

                self.processes[process.process_id] = _process_dict_from_object(process)
                log.info("%srecovered process %s state=%s", self.logprefix, process_id, state_str)

        self.event_sub.start()

    def stop(self):
        """Stop the event subscriber."""
        self.event_sub.stop()

    def get_managed_upids(self):
        """Return a list with the ids of all cached processes.

        A new list is built on every call, so callers never hold a live
        view of the cache (``dict.keys()`` is a view on Python 3).
        """
        return list(self.processes)

    def _event_callback(self, event, *args, **kwargs):
        """Subscriber entry point; logs instead of propagating Exceptions."""
        if not event:
            return

        try:
            self._inner_event_callback(event)
        except Exception:
            log.exception("%sException in event handler. This is a bug!", self.logprefix)

    def _inner_event_callback(self, event):
        """Refresh the cached record for the process named by ``event``.

        Events for processes that are not in the cache are ignored. The
        process is re-read from the PD (up to three attempts on Timeout);
        if that fails, only the cached 'state' is updated from the event.
        The callback, if any, is then invoked and its errors are logged.
        """
        process_id = event.origin
        state = event.state

        if not (process_id and process_id in self.processes):
            # we receive events for all processes but ignore most
            return

        state_str = ProcessStateEnum._str_map.get(state, str(state))

        process = None
        for _ in range(3):
            try:
                process = self.client.read_process(process_id)
                break
            except Timeout:
                # fall through to the next attempt
                log.warn("%sTimeout trying to read process from Process Dispatcher!",
                    self.logprefix, exc_info=True)
            except NotFound:
                break

        if process:
            log.info("%sreceived process %s state=%s", self.logprefix, process_id, state_str)

            # replace the cached data about this process
            self.processes[process_id] = _process_dict_from_object(process)
        else:
            log.warn("%sReceived process %s event but failed to read from Process Dispatcher",
                self.logprefix, process_id)

            #XXX better approach here? we at least have the state from the event,
            # so sticking that on cached process. We could miss other important
            # data like hostname however.
            self.processes[process_id]['state'] = process_state_to_pd_core(state)

        if self.callback:
            try:
                self.callback()
            except Exception as e:
                # "as" form: valid on Python 2.6+ and the only form
                # accepted by Python 3
                log.warn("%sError in HAAgent callback: %s", self.logprefix, e, exc_info=True)
class ProcessDispatcherEEAgentIntTest(ProcessDispatcherServiceIntTest):
    """Run the basic int tests again, with a different environment
    """

    def setUp(self):
        # Starts the Process Dispatcher app with pd_config, registers one fake
        # node for "engine1" and spawns one eeagent on it.
        self.dashi = None
        self._start_container()
        from pyon.public import CFG

        self.container_client = ContainerAgentClient(node=self.container.node,
            name=self.container.name)
        self.container = self.container_client._get_container_instance()

        app = dict(name="process_dispatcher", processapp=("process_dispatcher",
                   "ion.services.cei.process_dispatcher_service",
                   "ProcessDispatcherService"))
        self.container.start_app(app, config=pd_config)

        self.rr_cli = self.container.resource_registry

        self.pd_cli = ProcessDispatcherServiceClient(node=self.container.node)

        self.process_definition = ProcessDefinition(name='test_process')
        self.process_definition.executable = {
            'module': 'ion.services.cei.test.test_process_dispatcher',
            'class': 'TestProcess'
        }
        self.process_definition_id = self.pd_cli.create_process_definition(
            self.process_definition)

        # bookkeeping for the eeagents started by _start_eeagent()
        self._eea_pids = []
        self._eea_pid_to_resource_id = {}
        self._eea_pid_to_persistence_dir = {}
        self._tmpdirs = []

        self.dashi = get_dashi(
            uuid.uuid4().hex,
            pd_config['processdispatcher']['dashi_uri'],
            pd_config['processdispatcher']['dashi_exchange'],
            sysname=CFG.get_safe("dashi.sysname"))

        #send a fake node_state message to PD's dashi binding.
        self.node1_id = uuid.uuid4().hex
        self._send_node_state("engine1", self.node1_id)
        self._initial_eea_pid = self._start_eeagent(self.node1_id)

        self.waiter = ProcessStateWaiter()

    def _send_node_state(self, engine_id, node_id=None):
        """
        Fire a RUNNING node_state message at the PD's dashi name.

        @param engine_id Engine whose domain the node is reported under
        @param node_id Node id to report; a random one is generated if omitted
        """
        node_id = node_id or uuid.uuid4().hex
        node_state = dict(node_id=node_id, state=InstanceState.RUNNING,
            domain_id=domain_id_from_engine(engine_id))
        self.dashi.fire(get_pd_dashi_name(), "node_state", args=node_state)

    def _start_eeagent(self, node_id, resource_id=None, persistence_dir=None):
        """
        Spawn an ExecutionEngineAgent process and record it for cleanup.

        @param node_id Node the agent is configured for
        @param resource_id Agent resource id; random if omitted
        @param persistence_dir Directory passed to the agent config; a fresh
               temporary directory is created (and tracked for removal in
               tearDown) if omitted
        @return pid of the spawned agent process
        """
        if not persistence_dir:
            persistence_dir = tempfile.mkdtemp()
            # only directories created here are removed in tearDown
            self._tmpdirs.append(persistence_dir)
        resource_id = resource_id or uuid.uuid4().hex
        agent_config = _get_eeagent_config(node_id, persistence_dir,
            resource_id=resource_id)
        pid = self.container_client.spawn_process(
            name="eeagent",
            module="ion.agents.cei.execution_engine_agent",
            cls="ExecutionEngineAgent",
            config=agent_config)
        log.info('Agent pid=%s.', str(pid))
        self._eea_pids.append(pid)
        self._eea_pid_to_resource_id[pid] = resource_id
        self._eea_pid_to_persistence_dir[pid] = persistence_dir
        return pid

    def _kill_eeagent(self, pid):
        """
        Terminate an eeagent started by _start_eeagent() and forget it.
        """
        self.assertTrue(pid in self._eea_pids)
        self.container.terminate_process(pid)
        self._eea_pids.remove(pid)
        del self._eea_pid_to_resource_id[pid]
        del self._eea_pid_to_persistence_dir[pid]

    def tearDown(self):
        # Each cleanup stage runs even when an earlier one raises, so a failed
        # agent kill does not leak temp dirs, the waiter or the dashi connection.
        try:
            # iterate a copy: _kill_eeagent() removes from self._eea_pids
            for pid in list(self._eea_pids):
                self._kill_eeagent(pid)
        finally:
            try:
                for d in self._tmpdirs:
                    # best effort: a directory that is already gone is not an error
                    shutil.rmtree(d, ignore_errors=True)
                self.waiter.stop()
            finally:
                if self.dashi:
                    self.dashi.cancel()

    def test_requested_ee(self):

        # request non-default engine

        process_target = ProcessTarget(execution_engine_id="engine2")
        process_schedule = ProcessSchedule()
        process_schedule.queueing_mode = ProcessQueueingMode.ALWAYS
        process_schedule.target = process_target

        pid = self.pd_cli.create_process(self.process_definition_id)
        self.waiter.start()

        self.pd_cli.schedule_process(self.process_definition_id,
            process_schedule, process_id=pid)

        self.waiter.await_state_event(pid, ProcessStateEnum.WAITING)

        # request unknown engine, with NEVER queuing mode. The request
        # should be rejected.
        # verifies L4-CI-CEI-RQ52

        process_target = ProcessTarget(execution_engine_id="not-a-real-ee")
        process_schedule = ProcessSchedule()
        process_schedule.queueing_mode = ProcessQueueingMode.NEVER
        process_schedule.target = process_target

        rejected_pid = self.pd_cli.create_process(self.process_definition_id)

        self.pd_cli.schedule_process(self.process_definition_id,
            process_schedule, process_id=rejected_pid)

        self.waiter.await_state_event(rejected_pid, ProcessStateEnum.REJECTED)

        # now add a node and eeagent for engine2. original process should leave
        # queue and start running
        node2_id = uuid.uuid4().hex
        self._send_node_state("engine2", node2_id)
        self._start_eeagent(node2_id)

        self.waiter.await_state_event(pid, ProcessStateEnum.RUNNING)

        # spawn another process. it should start immediately.

        process_target = ProcessTarget(execution_engine_id="engine2")
        process_schedule = ProcessSchedule()
        process_schedule.queueing_mode = ProcessQueueingMode.NEVER
        process_schedule.target = process_target

        pid2 = self.pd_cli.create_process(self.process_definition_id)

        self.pd_cli.schedule_process(self.process_definition_id,
            process_schedule, process_id=pid2)

        self.waiter.await_state_event(pid2, ProcessStateEnum.RUNNING)

        # one more with node exclusive

        process_target = ProcessTarget(execution_engine_id="engine2",
            node_exclusive="hats")
        process_schedule = ProcessSchedule()
        process_schedule.queueing_mode = ProcessQueueingMode.NEVER
        process_schedule.target = process_target

        pid3 = self.pd_cli.create_process(self.process_definition_id)

        self.pd_cli.schedule_process(self.process_definition_id,
            process_schedule, process_id=pid3)

        self.waiter.await_state_event(pid3, ProcessStateEnum.RUNNING)

        # kill the processes for good
        self.pd_cli.cancel_process(pid)
        self.waiter.await_state_event(pid, ProcessStateEnum.TERMINATED)
        self.pd_cli.cancel_process(pid2)
        self.waiter.await_state_event(pid2, ProcessStateEnum.TERMINATED)
        self.pd_cli.cancel_process(pid3)
        self.waiter.await_state_event(pid3, ProcessStateEnum.TERMINATED)

    def test_node_exclusive(self):

        # the node_exclusive constraint is used to ensure multiple processes
        # of the same "kind" each get a VM exclusive of each other. Other
        # processes may run on these VMs, just not processes with the same
        # node_exclusive tag. Since we cannot directly query the contents
        # of each node in this test, we prove the capability by scheduling
        # processes one by one and checking their state.

        # verifies L4-CI-CEI-RQ121
        # verifies L4-CI-CEI-RQ57

        # first off, setUp() created a single node and eeagent.
        # We schedule two processes with the same "abc" node_exclusive
        # tag. Since there is only one node, the first process should run
        # and the second should be queued.

        process_target = ProcessTarget(execution_engine_id="engine1")
        process_target.node_exclusive = "abc"
        process_schedule = ProcessSchedule()
        process_schedule.queueing_mode = ProcessQueueingMode.ALWAYS
        process_schedule.target = process_target

        pid1 = self.pd_cli.create_process(self.process_definition_id)
        self.waiter.start()

        self.pd_cli.schedule_process(self.process_definition_id,
            process_schedule, process_id=pid1)

        self.waiter.await_state_event(pid1, ProcessStateEnum.RUNNING)

        pid2 = self.pd_cli.create_process(self.process_definition_id)
        self.pd_cli.schedule_process(self.process_definition_id,
            process_schedule, process_id=pid2)
        self.waiter.await_state_event(pid2, ProcessStateEnum.WAITING)

        # now demonstrate that the node itself is not full by launching
        # a third process without a node_exclusive tag -- it should start
        # immediately

        process_target.node_exclusive = None
        pid3 = self.pd_cli.create_process(self.process_definition_id)
        self.pd_cli.schedule_process(self.process_definition_id,
            process_schedule, process_id=pid3)
        self.waiter.await_state_event(pid3, ProcessStateEnum.RUNNING)

        # finally, add a second node to the engine. pid2 should be started
        # since there is an exclusive "abc" node free.
        node2_id = uuid.uuid4().hex
        self._send_node_state("engine1", node2_id)
        self._start_eeagent(node2_id)
        self.waiter.await_state_event(pid2, ProcessStateEnum.RUNNING)

        # kill the processes for good
        self.pd_cli.cancel_process(pid1)
        self.waiter.await_state_event(pid1, ProcessStateEnum.TERMINATED)
        self.pd_cli.cancel_process(pid2)
        self.waiter.await_state_event(pid2, ProcessStateEnum.TERMINATED)
        self.pd_cli.cancel_process(pid3)
        self.waiter.await_state_event(pid3, ProcessStateEnum.TERMINATED)

    def test_code_download(self):
        # create a process definition that has no URL; only module and class.
        process_definition_no_url = ProcessDefinition(
            name='test_process_nodownload')
        process_definition_no_url.executable = {
            'module': 'ion.my.test.process',
            'class': 'TestProcess'
        }
        process_definition_id_no_url = self.pd_cli.create_process_definition(
            process_definition_no_url)

        # create another that has a URL of the python file (this very file)
        # verifies L4-CI-CEI-RQ114
        url = "file://%s" % os.path.join(os.path.dirname(__file__),
            'test_process_dispatcher.py')
        process_definition = ProcessDefinition(name='test_process_download')
        process_definition.executable = {
            'module': 'ion.my.test.process',
            'class': 'TestProcess',
            'url': url
        }
        process_definition_id = self.pd_cli.create_process_definition(
            process_definition)

        process_target = ProcessTarget()
        process_schedule = ProcessSchedule()
        process_schedule.queueing_mode = ProcessQueueingMode.ALWAYS
        process_schedule.target = process_target

        self.waiter.start()

        # Test a module with no download fails
        pid_no_url = self.pd_cli.create_process(process_definition_id_no_url)

        self.pd_cli.schedule_process(process_definition_id_no_url,
            process_schedule, process_id=pid_no_url)

        self.waiter.await_state_event(pid_no_url, ProcessStateEnum.FAILED)

        # Test a module with a URL runs
        pid = self.pd_cli.create_process(process_definition_id)

        self.pd_cli.schedule_process(process_definition_id,
            process_schedule, process_id=pid)

        self.waiter.await_state_event(pid, ProcessStateEnum.RUNNING)

    def _add_test_process(self, restart_mode=None):
        """
        Create and schedule one test process listening on a unique name.

        @param restart_mode Optional restart mode set on the schedule
        @return (pid, client) where client is a TestClient addressed to the
                process' private listen name
        """
        process_schedule = ProcessSchedule()
        if restart_mode is not None:
            process_schedule.restart_mode = restart_mode
        pid = self.pd_cli.create_process(self.process_definition_id)

        pid_listen_name = "PDtestproc_%s" % uuid.uuid4().hex
        config = {'process': {'listen_name': pid_listen_name}}

        self.pd_cli.schedule_process(self.process_definition_id,
            process_schedule, process_id=pid, configuration=config)

        client = TestClient(to_name=pid_listen_name)
        return pid, client

    def test_restart(self):
        self.waiter.start()

        restartable_pids = []
        nonrestartable_pids = []
        clients = {}
        # start 10 processes with RestartMode.ALWAYS
        for _ in range(10):
            pid, client = self._add_test_process(ProcessRestartMode.ALWAYS)
            restartable_pids.append(pid)
            clients[pid] = client

        # and 10 processes with RestartMode.ABNORMAL
        for _ in range(10):
            pid, client = self._add_test_process(ProcessRestartMode.ABNORMAL)
            restartable_pids.append(pid)
            clients[pid] = client

        # and 10 with RestartMode.NEVER
        for _ in range(10):
            pid, client = self._add_test_process(ProcessRestartMode.NEVER)
            nonrestartable_pids.append(pid)
            clients[pid] = client

        all_pids = restartable_pids + nonrestartable_pids

        self.waiter.await_many_state_events(all_pids, ProcessStateEnum.RUNNING)

        for pid in all_pids:
            client = clients[pid]
            self.assertFalse(client.is_restart())
            self.assertEqual(client.count(), 1)

        # now kill the whole eeagent and restart it. processes should
        # show up as FAILED in the next heartbeat.
        resource_id = self._eea_pid_to_resource_id[self._initial_eea_pid]
        persistence_dir = self._eea_pid_to_persistence_dir[
            self._initial_eea_pid]
        log.debug("Restarting eeagent %s", self._initial_eea_pid)
        self._kill_eeagent(self._initial_eea_pid)

        # manually kill the processes to simulate a real container failure
        for pid in all_pids:
            self.container.terminate_process(pid)

        self._start_eeagent(self.node1_id, resource_id=resource_id,
            persistence_dir=persistence_dir)

        # wait for restartables to restart
        self.waiter.await_many_state_events(restartable_pids,
            ProcessStateEnum.RUNNING)

        # query the processes again. it should have restart mode config
        for pid in restartable_pids:
            client = clients[pid]
            self.assertTrue(client.is_restart())
            self.assertEqual(client.count(), 1)

        # meanwhile some procs should not have restarted
        for pid in nonrestartable_pids:
            proc = self.pd_cli.read_process(pid)
            self.assertEqual(proc.process_state, ProcessStateEnum.FAILED)

        # guard against extraneous events we were receiving as part of a bug:
        # processes restarting again after they were already restarted
        self.waiter.await_nothing(timeout=5)

    def test_idempotency(self):
        # ensure every operation can be safely retried
        process_schedule = ProcessSchedule()
        process_schedule.queueing_mode = ProcessQueueingMode.ALWAYS

        proc_name = 'myreallygoodname'
        pid = self.pd_cli.create_process(self.process_definition_id)
        self.waiter.start(pid)

        # note: if we import UNSCHEDULED state into ProcessStateEnum,
        # this assertion will need to change.
        proc = self.pd_cli.read_process(pid)
        self.assertEqual(proc.process_id, pid)
        self.assertEqual(proc.process_state, ProcessStateEnum.REQUESTED)

        pid2 = self.pd_cli.schedule_process(self.process_definition_id,
            process_schedule, configuration={}, process_id=pid, name=proc_name)
        self.assertEqual(pid, pid2)

        self.waiter.await_state_event(pid, ProcessStateEnum.RUNNING)

        # repeating schedule is harmless
        pid2 = self.pd_cli.schedule_process(self.process_definition_id,
            process_schedule, configuration={}, process_id=pid, name=proc_name)
        self.assertEqual(pid, pid2)

        proc = self.pd_cli.read_process(pid)
        self.assertEqual(proc.process_id, pid)
        self.assertEqual(proc.process_configuration, {})
        self.assertEqual(proc.process_state, ProcessStateEnum.RUNNING)

        self.pd_cli.cancel_process(pid)
        self.waiter.await_state_event(pid, ProcessStateEnum.TERMINATED)

        # repeating cancel is harmless
        self.pd_cli.cancel_process(pid)
        proc = self.pd_cli.read_process(pid)
        self.assertEqual(proc.process_id, pid)
        self.assertEqual(proc.process_configuration, {})
        self.assertEqual(proc.process_state, ProcessStateEnum.TERMINATED)
class ProcessDispatcherServiceIntTest(IonIntegrationTestCase):
    # Integration tests that exercise the Process Dispatcher service through
    # its client against a container started from res/deploy/r2cei.yml.

    def setUp(self):
        self._start_container()
        self.container.start_rel_from_url('res/deploy/r2cei.yml')

        self.rr_cli = ResourceRegistryServiceClient()
        self.pd_cli = ProcessDispatcherServiceClient(node=self.container.node)

        # the definition every test in this class schedules
        self.process_definition = ProcessDefinition(name='test_process')
        self.process_definition.executable = dict([
            ('module', 'ion.services.cei.test.test_process_dispatcher'),
            ('class', 'TestProcess'),
        ])
        self.process_definition_id = self.pd_cli.create_process_definition(
            self.process_definition)

        self.waiter = ProcessStateWaiter()

    def tearDown(self):
        self.waiter.stop()

    def test_create_schedule_cancel(self):
        schedule = ProcessSchedule()
        schedule.queueing_mode = ProcessQueueingMode.ALWAYS

        wanted_name = 'myreallygoodname'
        pid = self.pd_cli.create_process(self.process_definition_id)
        self.waiter.start(pid)

        scheduled_pid = self.pd_cli.schedule_process(
            self.process_definition_id, schedule,
            configuration={}, process_id=pid, name=wanted_name)
        self.assertEqual(pid, scheduled_pid)

        # verifies L4-CI-CEI-RQ141 and L4-CI-CEI-RQ142
        self.waiter.await_state_event(pid, ProcessStateEnum.RUNNING)

        pd_view = self.pd_cli.read_process(pid)
        self.assertEqual(pd_view.process_id, pid)
        self.assertEqual(pd_view.process_configuration, {})
        self.assertEqual(pd_view.process_state, ProcessStateEnum.RUNNING)

        # the same process must also be readable straight from the resource
        # registry (mirrored record)
        # verifies L4-CI-CEI-RQ63
        # verifies L4-CI-CEI-RQ64
        rr_view = self.rr_cli.read(pid)
        self.assertEqual(rr_view.process_id, pid)

        # talk to the process itself to prove it is really running: the
        # counter answers 1, 2, 3, 4, 5 on successive calls
        test_client = TestClient()
        for expected in range(1, 6):
            self.assertEqual(expected, test_client.count(timeout=10))

        # verifies L4-CI-CEI-RQ147

        # the name passed at schedule time must have reached the container
        reported_name = test_client.get_process_name(pid=scheduled_pid)
        self.assertEqual(wanted_name, reported_name)

        # cancel, then schedule the very same pid again
        self.pd_cli.cancel_process(pid)
        self.waiter.await_state_event(pid, ProcessStateEnum.TERMINATED)

        scheduled_pid = self.pd_cli.schedule_process(
            self.process_definition_id, schedule,
            configuration={}, process_id=pid)
        self.assertEqual(pid, scheduled_pid)
        self.waiter.await_state_event(pid, ProcessStateEnum.RUNNING)

        # the counter is expected to start over at 1 after the reschedule
        for expected in range(1, 6):
            self.assertEqual(expected, test_client.count(timeout=10))

        # final cancel
        self.pd_cli.cancel_process(pid)
        self.waiter.await_state_event(pid, ProcessStateEnum.TERMINATED)

    def test_schedule_with_config(self):
        schedule = ProcessSchedule()
        schedule.queueing_mode = ProcessQueueingMode.ALWAYS

        pid = self.pd_cli.create_process(self.process_definition_id)
        self.waiter.start(pid)

        # verifies L4-CI-CEI-RQ66

        # hand the process a random string it will echo back, proving the
        # configuration really reaches the instantiated process
        test_response = uuid.uuid4().hex
        configuration = {"test_response": test_response}

        scheduled_pid = self.pd_cli.schedule_process(
            self.process_definition_id, schedule,
            configuration=configuration, process_id=pid)
        self.assertEqual(pid, scheduled_pid)

        self.waiter.await_state_event(pid, ProcessStateEnum.RUNNING)

        test_client = TestClient()

        # verifies L4-CI-CEI-RQ139
        # the configuration block (inputs, outputs, arbitrary config) must
        # 1) make it to the process and 2) come back in process queries
        self.assertEqual(test_client.query(), test_response)

        pd_view = self.pd_cli.read_process(pid)
        self.assertEqual(pd_view.process_id, pid)
        self.assertEqual(pd_view.process_configuration, configuration)

        # final cancel
        self.pd_cli.cancel_process(pid)
        self.waiter.await_state_event(pid, ProcessStateEnum.TERMINATED)

    def test_schedule_bad_config(self):
        schedule = ProcessSchedule()

        # an IonObject is not JSON-serializable, so it is invalid as config
        unserializable = ProcessTarget()

        with self.assertRaises(BadRequest) as ar:
            self.pd_cli.schedule_process(self.process_definition_id,
                schedule, configuration={"bad": unserializable})
        error_message = ar.exception.message
        self.assertTrue(error_message.startswith("bad configuration"))

    def test_cancel_notfound(self):
        with self.assertRaises(NotFound):
            self.pd_cli.cancel_process("not-a-real-process-id")

    def test_create_invalid_definition(self):
        # a definition whose executable lacks both module and class
        # verifies L4-CI-CEI-RQ137
        url_only = {"url": "http://somewhere.com/something.py"}
        bad_definition = ProcessDefinition(name="test_process", executable=url_only)
        with self.assertRaises(BadRequest):
            self.pd_cli.create_process_definition(bad_definition)
class HAProcessControl(object):
    """Controls the processes one service runs on a single Process Dispatcher (PD).

    Keeps an in-memory cache (self.processes) mapping process id -> process
    record. The cache is seeded in start() from the service's "hasProcess"
    associations, refreshed from ProcessLifecycleEvent notifications, and can
    be re-synchronized on demand with reload_processes().
    """

    def __init__(self, pd_name, resource_registry, service_id, callback=None, logprefix=""):
        """
        @param pd_name Name the Process Dispatcher client sends requests to
        @param resource_registry Registry client used to read the service and
               to find/create its "hasProcess" associations
        @param service_id Resource id of the service whose processes are tracked
        @param callback Optional zero-argument callable invoked after every
               handled process event
        @param logprefix String prepended to the log messages of this object
        @throws ValueError if callback is truthy but not callable
        """
        self.pd_name = pd_name
        self.resource_registry = resource_registry
        self.service_id = service_id
        self.callback = callback
        if callback and not callable(callback):
            raise ValueError("callback is not callable")
        self.logprefix = logprefix

        self.client = ProcessDispatcherServiceClient(to_name=pd_name)
        self.event_sub = EventSubscriber(event_type="ProcessLifecycleEvent",
            callback=self._event_callback, origin_type="DispatchedProcess",
            auto_delete=True)

        # process id -> record built by _process_dict_from_object()
        self.processes = {}

    def start(self):
        """
        Recover the processes already associated with the service, then start
        listening for process events.
        """
        service = self.resource_registry.read(self.service_id)
        process_assocs = self.resource_registry.find_associations(
            service, "hasProcess")

        for process_assoc in process_assocs:
            process_id = process_assoc.o
            if process_id:
                try:
                    process = self.client.read_process(process_id)
                except NotFound:
                    log.debug(
                        "%sService was associated with process %s, which is unknown to PD. ignoring.",
                        self.logprefix, process_id)
                    continue

                state = process.process_state
                state_str = ProcessStateEnum._str_map.get(state, str(state))

                self.processes[process.process_id] = _process_dict_from_object(
                    process)
                log.info("%srecovered process %s state=%s", self.logprefix,
                    process_id, state_str)

        # subscribe only after the cache is seeded: events for unknown
        # process ids are ignored by _inner_event_callback()
        self.event_sub.start()

    def stop(self):
        """
        Stop listening for process events. The cache is left untouched.
        """
        self.event_sub.stop()

    def get_managed_upids(self):
        """
        @return list of the process ids currently held in the cache
        """
        # list() gives callers a snapshot on Python 3 as well (keys() is
        # already a list on Python 2)
        return list(self.processes.keys())

    def _event_callback(self, event, *args, **kwargs):
        """
        Subscriber entry point. Failures of the handler are logged instead of
        being raised into the event subscriber.
        """
        if not event:
            return

        try:
            self._inner_event_callback(event)
        except Exception:
            # "except Exception" rather than a bare except: exceptions that
            # derive only from BaseException (KeyboardInterrupt, SystemExit,
            # GeneratorExit, ...) keep propagating
            log.exception("%sException in event handler. This is a bug!",
                self.logprefix)

    def _inner_event_callback(self, event):
        """
        Refresh the cached record of the process named by event.origin and
        notify the callback, if any.
        """
        process_id = event.origin
        state = event.state
        state_str = ProcessStateEnum._str_map.get(state, str(state))
        if not (process_id and process_id in self.processes):
            # we receive events for all processes but ignore most
            return

        process = None
        # up to three attempts; only a Timeout is worth retrying
        for _ in range(3):
            try:
                process = self.client.read_process(process_id)
                break
            except Timeout:
                log.warn(
                    "%sTimeout trying to read process from Process Dispatcher!",
                    self.logprefix, exc_info=True)
            except NotFound:
                break

        if process:
            log.info("%sreceived process %s state=%s", self.logprefix,
                process_id, state_str)

            # replace the cached data about this process
            self.processes[process_id] = _process_dict_from_object(process)

        else:
            log.warn(
                "%sReceived process %s event but failed to read from Process Dispatcher",
                self.logprefix, process_id)

            #XXX better approach here? we at least have the state from the event,
            # so sticking that on cached process. We could miss other important
            # data like hostname however.
            self.processes[process_id]['state'] = process_state_to_pd_core(
                state)

        if self.callback:
            try:
                self.callback()
            except Exception as e:
                # log the exception instance (not just its type); a failing
                # callback must not break event handling
                log.warn("%sError in HAAgent callback: %s", self.logprefix, e,
                    exc_info=True)

    def _associate_process(self, process):
        """
        Record a "hasProcess" association from the service to the process.
        Best effort: a failure is logged and otherwise ignored.
        """
        try:
            self.resource_registry.create_association(self.service_id,
                "hasProcess", process.process_id)
        except Exception:
            log.exception("Couldn't associate service %s to process %s",
                self.service_id, process.process_id)

    def schedule_process(self, pd_name, process_definition_id, **kwargs):
        """
        Schedule one process of the given definition, reusing the id of a
        cached process in a terminal state when there is one.

        @param pd_name Must equal the pd_name this object was created with
        @param process_definition_id Definition to create/schedule
        @param kwargs Passed to _get_process_schedule(); the optional
               'configuration' entry is forwarded to the PD
        @return id of the scheduled process
        @throws Exception if pd_name is not the PD managed by this object
        """
        if pd_name != self.pd_name:
            raise Exception(
                "schedule_process request received for unknown PD: %s" % pd_name)

        # figure out if there is an existing PID which can be reused
        found_upid = None
        for process in self.processes.values():
            upid = process.get('upid')
            state = process.get('state')
            if not (upid and state):
                continue

            if state in CoreProcessState.TERMINAL_STATES:
                # no break: the last terminal process seen wins
                found_upid = upid

        if found_upid:
            upid = found_upid
            proc = self.client.read_process(upid)

        else:
            # otherwise create a new process and associate
            upid = self.client.create_process(process_definition_id)

            # note: if the HAAgent fails between the create call above and the
            # associate call below, there may be orphaned Process objects. These
            # processes will not however be running, so are largely harmless.
            proc = self.client.read_process(upid)
            self._associate_process(proc)

        process_schedule = _get_process_schedule(**kwargs)
        configuration = kwargs.get('configuration')

        # cheat and roll the process state to REQUESTED before we actually
        # schedule it. this is in-memory only, so should be harmless. This
        # avoids a race between this scheduling process and the event
        # subscriber.
        proc.process_state = ProcessStateEnum.REQUESTED
        self.processes[upid] = _process_dict_from_object(proc)

        self.client.schedule_process(process_definition_id, process_schedule,
            configuration=configuration, process_id=upid)
        return upid

    def terminate_process(self, pid):
        """
        Ask the PD to cancel the process.

        @return whatever the PD client's cancel_process() returns
        """
        return self.client.cancel_process(pid)

    def get_all_processes(self):
        """
        @return dict {pd_name: [process record, ...]} holding deep copies of
                the cached records, so callers cannot modify the cache
        """
        # materialize the values first: on Python 3 values() is a view, which
        # deepcopy cannot copy
        processes = deepcopy(list(self.processes.values()))
        return {self.pd_name: processes}

    def reload_processes(self):
        """
        Re-read every cached process from the PD and update the records whose
        state no longer matches. Processes that cannot be read are skipped.
        """
        # iterate over a snapshot because entries are replaced inside the loop
        for process_id, process_dict in list(self.processes.items()):
            try:
                process = self.client.read_process(process_id)
            except Exception:
                log.warn(
                    "%sFailed to read process %s from PD. Will retry later.",
                    self.logprefix, process_id, exc_info=True)
                continue
            new_process_dict = _process_dict_from_object(process)
            if new_process_dict['state'] != process_dict['state']:
                log.warn(
                    "%sUpdating process %s record manually. we may have missed an event?",
                    self.logprefix, process_id)
                self.processes[process_id] = new_process_dict
class Launcher(object):
    """
    Helper for launching platform and instrument agent processes.
    """

    def __init__(self, timeout_spawn):
        """
        @param timeout_spawn Default timeout in secs for the RUNNING event.
        """
        self._timeout_spawn = timeout_spawn
        self._pd_client = ProcessDispatcherServiceClient()
        self._agent_launcher = AgentLauncher(self._pd_client)

    def destroy(self):
        """
        Closes the process dispatcher client and drops the references to it
        and to the agent launcher. Does nothing if already destroyed.
        """
        if self._pd_client:
            self._pd_client.close()
            self._pd_client = None
            self._agent_launcher = None

    def launch_platform(self, agt_id, agent_config, timeout_spawn=None):
        """
        Launches a platform agent.

        @param agt_id         Some ID mainly used for logging
        @param agent_config   Agent configuration
        @param timeout_spawn  Timeout in secs for the RUNNING event. If None
                              or zero, the value given in constructor is
                              used; if that is also None or zero, no wait is
                              performed.

        @return process ID
        """
        return self._launch_agent(
            "launch_platform", agt_id, agent_config, timeout_spawn,
            name='PlatformAgent_%s' % agt_id,
            module='ion.agents.platform.platform_agent',
            cls='PlatformAgent')

    def launch_instrument(self, agt_id, agent_config, timeout_spawn=None):
        """
        Launches an instrument agent.

        @param agt_id         Some ID mainly used for logging
        @param agent_config   Agent configuration
        @param timeout_spawn  Timeout in secs for the RUNNING event. If None
                              or zero, the value given in constructor is
                              used; if that is also None or zero, no wait is
                              performed.

        @return process ID
        """
        return self._launch_agent(
            "launch_instrument", agt_id, agent_config, timeout_spawn,
            name='InstrumentAgent_%s' % agt_id,
            module='ion.agents.instrument.instrument_agent',
            cls='InstrumentAgent')

    def _launch_agent(self, caller, agt_id, agent_config, timeout_spawn,
                      name, module, cls):
        """
        Shared implementation of launch_platform and launch_instrument:
        creates the process definition, launches the agent and optionally
        waits for the launch to complete.

        @param caller  Name of the public method, used as log message prefix
        @param name    Name given to the process definition
        @param module  Module of the agent executable
        @param cls     Class of the agent executable

        @return process ID
        """
        # a falsy per-call timeout (None or 0) falls back to the default
        timeout_spawn = timeout_spawn or self._timeout_spawn
        log.debug("%s: agt_id=%r, timeout_spawn=%s",
                  caller, agt_id, timeout_spawn)

        pdef = ProcessDefinition(name=name)
        pdef.executable = {
            'module': module,
            'class': cls
        }
        pdef_id = self._pd_client.create_process_definition(
            process_definition=pdef)

        pid = self._agent_launcher.launch(agent_config, pdef_id)

        if timeout_spawn:
            log.debug("%s: agt_id=%r: waiting for RUNNING", caller, agt_id)
            self._agent_launcher.await_launch(timeout_spawn)
            log.debug("%s: agt_id=%r: RUNNING", caller, agt_id)

        return pid

    def cancel_process(self, pid, timeout_cancel=None):
        """
        Helper to terminate a process

        @param pid             ID of the process to cancel
        @param timeout_cancel  If given (and not zero), secs to wait for the
                               process to reach TERMINATED; a process that
                               does not get there in time is only logged.
        """
        pinfo = self._pd_client.read_process(pid)
        if pinfo.process_state != ProcessStateEnum.RUNNING:
            log.debug("cancel_process: pid=%r is not RUNNING", pid)
            return

        log.debug("cancel_process: canceling pid=%r", pid)
        self._pd_client.cancel_process(pid)

        if timeout_cancel:
            log.debug("waiting %s seconds for preocess to cancel", timeout_cancel)
            psg = ProcessStateGate(self._pd_client.read_process, pid,
                                   ProcessStateEnum.TERMINATED)
            # "await" is a reserved word since Python 3.7, so the gate's
            # method of that name is looked up with getattr instead of
            # being written as an attribute access
            wait_for_state = getattr(psg, "await")
            if not wait_for_state(timeout_cancel):
                log.debug(
                    "Process %r failed to get to TERMINATED in %s seconds",
                    pid, timeout_cancel)