class ExecutionEngineAgentPyonIntTest(IonIntegrationTestCase):
    """Integration tests that drive an ExecutionEngineAgent using the "pyon" launch type.

    setUp() starts a container, deploys ``res/deploy/r2cei.yml`` and spawns one
    eeagent; each test then talks to that agent through ``self.eea_client``.
    """

    # Set by _start_webserver(); stays None for tests that never start one.
    _webserver = None

    @needs_eeagent
    def setUp(self):
        self._start_container()
        self.container.start_rel_from_url("res/deploy/r2cei.yml")

        self.resource_id = "eeagent_123456789"
        self._eea_name = "eeagent"

        self.persistence_directory = tempfile.mkdtemp()

        self.agent_config = {
            "eeagent": {
                "heartbeat": 1,
                "slots": 100,
                "name": "pyon_eeagent",
                "launch_type": {
                    "name": "pyon",
                    "persistence_directory": self.persistence_directory,
                },
            },
            "agent": {"resource_id": self.resource_id},
            "logging": {
                "loggers": {"eeagent": {"level": "DEBUG", "handlers": ["console"]}},
                "root": {"handlers": ["console"]},
            },
        }

        self._start_eeagent()

    def _start_eeagent(self):
        """Spawn an eeagent from ``self.agent_config`` and (re)build the clients for it.

        Sets ``self._eea_pid``, ``self._eea_pyon_client`` and ``self.eea_client``.
        """
        self.container_client = ContainerAgentClient(node=self.container.node, name=self.container.name)
        self.container = self.container_client._get_container_instance()

        # Start eeagent.
        self._eea_pid = self.container_client.spawn_process(
            name=self._eea_name,
            module="ion.agents.cei.execution_engine_agent",
            cls="ExecutionEngineAgent",
            config=self.agent_config,
        )
        log.info("Agent pid=%s.", str(self._eea_pid))

        # Start a resource agent client to talk with the eeagent.
        self._eea_pyon_client = SimpleResourceAgentClient(self.resource_id, process=FakeProcess())
        log.info("Got eea client %s.", str(self._eea_pyon_client))

        self.eea_client = ExecutionEngineAgentClient(self._eea_pyon_client)

    def tearDown(self):
        try:
            self._stop_webserver()
            self.container.terminate_process(self._eea_pid)
        finally:
            # Always remove the temp dir, even when stopping the agent fails.
            shutil.rmtree(self.persistence_directory)

    def _start_webserver(self, directory_to_serve, port=None):
        """Start a webserver for testing code download

        Note: tries really hard to get a port, and if it can't use the
        suggested port, randomly picks another, and returns it

        ``directory_to_serve`` is accepted but not used by this method; the
        handler is a plain SimpleHTTPRequestHandler.
        NOTE(review): that handler normally serves the process working
        directory, which matches the repo-relative URLs the tests request --
        confirm.

        Returns the port the server was actually bound to. Fails the test if
        no port could be bound after 100 attempts.
        """

        # Subclass instead of patching SimpleHTTPRequestHandler itself, so the
        # silenced logging does not leak into other users of that class.
        class QuietHandler(SimpleHTTPServer.SimpleHTTPRequestHandler):
            def log_message(self, format, *args):
                # swallow log messages
                pass

        class Server(HTTPServer):
            # Number of requests handled so far; tests assert on this.
            requests = 0

            def serve_forever(self):
                self._serving = 1
                while self._serving:
                    self.handle_request()
                    self.requests += 1

            def stop(self):
                self._serving = 0

        if port is None:
            port = 8008

        for _ in range(100):
            try:
                self._webserver = Server(("localhost", port), QuietHandler)
            except socket.error:
                print("port %s is in use, picking another" % port)
                port = randint(8000, 10000)
            else:
                break
        else:
            # Previously this fell through and crashed on None.serve_forever.
            self.fail("Unable to bind a port for the test webserver")

        self._web_glet = gevent.spawn(self._webserver.serve_forever)
        return port

    def _stop_webserver(self):
        """Stop the test webserver, if one was started, and release its port."""
        if self._webserver is None:
            return
        self._web_glet.kill()
        # Close the listening socket so the next test can bind the same port.
        self._webserver.server_close()
        self._webserver = None

    def _enable_code_download(self, whitelist=None):
        """Restart the eeagent with code download enabled for ``whitelist``.

        ``whitelist`` defaults to an empty list.
        """
        if whitelist is None:
            whitelist = []

        self.container.terminate_process(self._eea_pid)
        self.agent_config["eeagent"]["code_download"] = {
            "enabled": True,
            "whitelist": whitelist,
        }
        self._start_eeagent()

    def wait_for_state(self, upid, desired_state, timeout=30):
        """Poll the eeagent until process ``upid`` reports ``desired_state``.

        ``desired_state`` is compared for equality with the process' "state"
        entry; callers pass values such as ``[500, "RUNNING"]``.
        ``timeout`` is the maximum number of polling attempts. A successful
        poll that does not match is followed by a one second sleep.

        Fails the test when the attempts are exhausted.
        """
        attempts = 0
        last_state = None
        while timeout > attempts:
            try:
                state = self.eea_client.dump_state().result
            except Timeout:
                log.warn("Timeout calling EEAgent dump_state. retrying.")
                # Count the failed call too; otherwise an unresponsive agent
                # would keep this loop spinning forever.
                attempts += 1
                continue
            proc = get_proc_for_upid(state, upid)
            last_state = proc.get("state")
            if last_state == desired_state:
                return
            gevent.sleep(1)
            attempts += 1

        self.fail("Process %s took too long to get to %s, had %s" % (upid, desired_state, last_state))

    @needs_eeagent
    def test_basics(self):
        # Launch, terminate and clean up a single process.
        u_pid = "test0"
        round = 0
        run_type = "pyon"
        proc_name = "test_x"
        module = "ion.agents.cei.test.test_eeagent"
        cls = "TestProcess"
        parameters = {"name": proc_name, "module": module, "cls": cls}

        self.eea_client.launch_process(u_pid, round, run_type, parameters)
        self.wait_for_state(u_pid, [500, "RUNNING"])

        state = self.eea_client.dump_state().result
        assert len(state["processes"]) == 1

        self.eea_client.terminate_process(u_pid, round)
        self.wait_for_state(u_pid, [700, "TERMINATED"])

        # A terminated process stays listed until it is cleaned up.
        state = self.eea_client.dump_state().result
        assert len(state["processes"]) == 1

        self.eea_client.cleanup_process(u_pid, round)
        state = self.eea_client.dump_state().result
        assert len(state["processes"]) == 0

    @needs_eeagent
    def test_restart(self):
        u_pid = "test0"
        round = 0
        run_type = "pyon"
        proc_name = "test_x"
        module = "ion.agents.cei.test.test_eeagent"
        cls = "TestProcess"
        parameters = {"name": proc_name, "module": module, "cls": cls}

        self.eea_client.launch_process(u_pid, round, run_type, parameters)
        self.wait_for_state(u_pid, [500, "RUNNING"])

        state = self.eea_client.dump_state().result
        assert len(state["processes"]) == 1

        # Start again with incremented round. eeagent should restart the process
        round += 1

        self.eea_client.launch_process(u_pid, round, run_type, parameters)
        self.wait_for_state(u_pid, [500, "RUNNING"])

        state = self.eea_client.dump_state().result
        ee_round = state["processes"][0]["round"]
        assert round == int(ee_round)

        # TODO: this test is disabled, as the restart op is disabled
        # Run restart with incremented round. eeagent should restart the process
        # round += 1
        # self.eea_client.restart_process(u_pid, round)
        # self.wait_for_state(u_pid, [500, 'RUNNING'])
        # state = self.eea_client.dump_state().result
        # ee_round = state['processes'][0]['round']
        # assert round == int(ee_round)

        self.eea_client.terminate_process(u_pid, round)
        self.wait_for_state(u_pid, [700, "TERMINATED"])

        state = self.eea_client.dump_state().result
        assert len(state["processes"]) == 1

        self.eea_client.cleanup_process(u_pid, round)
        state = self.eea_client.dump_state().result
        assert len(state["processes"]) == 0

    @needs_eeagent
    def test_failing_process(self):
        u_pid = "testfail"
        round = 0
        run_type = "pyon"
        proc_name = "test_x"
        module = "ion.agents.cei.test.test_eeagent"
        cls = "TestProcessFail"
        parameters = {"name": proc_name, "module": module, "cls": cls}

        self.eea_client.launch_process(u_pid, round, run_type, parameters)
        self.wait_for_state(u_pid, [850, "FAILED"])

        self.eea_client.terminate_process(u_pid, round)
        state = self.eea_client.dump_state().result
        get_proc_for_upid(state, u_pid)

    @needs_eeagent
    def test_slow_to_start(self):
        # A real list (not a lazy map object) because it is iterated twice.
        upids = [str(uuid.uuid4().hex) for _ in range(10)]
        round = 0
        run_type = "pyon"
        proc_name = "test_x"
        module = "ion.agents.cei.test.test_eeagent"
        cls = "TestProcessSlowStart"
        parameters = {"name": proc_name, "module": module, "cls": cls}

        for upid in upids:
            self.eea_client.launch_process(upid, round, run_type, parameters)

        for upid in upids:
            self.wait_for_state(upid, [500, "RUNNING"], timeout=60)

    @needs_eeagent
    def test_start_cancel(self):
        upid = str(uuid.uuid4().hex)
        round = 0
        run_type = "pyon"
        proc_name = "test_x"
        module = "ion.agents.cei.test.test_eeagent"
        cls = "TestProcessSlowStart"
        parameters = {"name": proc_name, "module": module, "cls": cls}

        self.eea_client.launch_process(upid, round, run_type, parameters)
        self.wait_for_state(upid, [400, "PENDING"])

        self.eea_client.terminate_process(upid, round)
        self.wait_for_state(upid, [700, "TERMINATED"])

    @needs_eeagent
    def test_kill_and_revive(self):
        """test_kill_and_revive
        Ensure that when an eeagent dies, it pulls the processes it owned from
        persistence, and marks them as failed, so the PD can figure out what to
        do with them
        """
        u_pid = "test0"
        round = 0
        run_type = "pyon"
        proc_name = "test_transform"
        module = "ion.agents.cei.test.test_eeagent"
        cls = "TestProcess"
        parameters = {"name": proc_name, "module": module, "cls": cls}

        self.eea_client.launch_process(u_pid, round, run_type, parameters)
        self.wait_for_state(u_pid, [500, "RUNNING"])

        # Kill and restart eeagent. Also, kill proc started by eea to simulate
        # a killed container
        old_eea_pid = str(self._eea_pid)
        self.container.terminate_process(self._eea_pid)
        proc_to_kill = self.container.proc_manager.procs_by_name.get(proc_name)
        self.assertIsNotNone(proc_to_kill)
        self.container.terminate_process(proc_to_kill.id)

        self._start_eeagent()

        self.assertNotEqual(old_eea_pid, self._eea_pid)

        self.wait_for_state(u_pid, [850, "FAILED"])

    @needs_eeagent
    def test_run_out_of_slots(self):
        """test_run_out_of_slots
        """
        # Restart the agent with a single slot.
        old_eea_pid = str(self._eea_pid)
        self.container.terminate_process(self._eea_pid)
        self.agent_config["eeagent"]["slots"] = 1
        self._start_eeagent()
        self.assertNotEqual(old_eea_pid, self._eea_pid)

        u_pid_0, u_pid_1 = "test0", "test1"
        round = 0
        run_type = "pyon"
        proc_name = "test_transform"
        module = "ion.agents.cei.test.test_eeagent"
        cls = "TestProcess"
        parameters = {"name": proc_name, "module": module, "cls": cls}

        self.eea_client.launch_process(u_pid_0, round, run_type, parameters)
        self.wait_for_state(u_pid_0, [500, "RUNNING"])

        self.eea_client.launch_process(u_pid_1, round, run_type, parameters)
        self.wait_for_state(u_pid_1, [900, "REJECTED"])

        # Restart the agent again and check the states it reports afterwards.
        old_eea_pid = str(self._eea_pid)
        self.container.terminate_process(self._eea_pid)
        self.agent_config["eeagent"]["slots"] = 1
        self._start_eeagent()
        self.assertNotEqual(old_eea_pid, self._eea_pid)

        self.wait_for_state(u_pid_0, [850, "FAILED"])
        self.wait_for_state(u_pid_1, [900, "REJECTED"])

    @needs_eeagent
    def test_download_code(self):
        self._enable_code_download(whitelist=["*"])

        u_pid = "test0"
        round = 0
        run_type = "pyon"
        proc_name = "test_transform"
        module = "ion.my.module.to.download"
        module_uri = "file://%s/downloads/module_to_download.py" % get_this_directory()
        bad_module_uri = "file:///tmp/notreal/module_to_download.py"
        cls = "TestDownloadProcess"

        # A URI that does not exist must be reported back as a 404.
        parameters = {"name": proc_name, "module": module, "module_uri": bad_module_uri, "cls": cls}
        response = self.eea_client.launch_process(u_pid, round, run_type, parameters)
        print(response)
        assert response.status == 404
        assert "Unable to download" in response.result

        parameters = {"name": proc_name, "module": module, "module_uri": module_uri, "cls": cls}
        self.eea_client.launch_process(u_pid, round, run_type, parameters)
        self.wait_for_state(u_pid, [500, "RUNNING"])

        self.eea_client.terminate_process(u_pid, round)
        state = self.eea_client.dump_state().result
        get_proc_for_upid(state, u_pid)

    @needs_eeagent
    def test_whitelist(self):
        downloads_directory = os.path.join(get_this_directory(), "downloads")
        http_port = 8910
        # _start_webserver() returns only once the server object exists.
        http_port = self._start_webserver(downloads_directory, port=http_port)

        assert self._webserver.requests == 0

        u_pid = "test0"
        round = 0
        run_type = "pyon"
        proc_name = "test_transform"
        module = "ion.my.module"
        module_uri = "http://localhost:%s/ion/agents/cei/test/downloads/module_to_download.py" % http_port
        cls = "TestDownloadProcess"
        parameters = {"name": proc_name, "module": module, "module_uri": module_uri, "cls": cls}

        # Code download is not enabled on the agent started by setUp()
        response = self.eea_client.launch_process(u_pid, round, run_type, parameters)
        assert response.status == 401
        assert "Code download not enabled" in response.result

        # Test no whitelist
        self._enable_code_download()

        response = self.eea_client.launch_process(u_pid, round, run_type, parameters)
        print(response)
        assert response.status == 401
        assert "not in code_download whitelist" in response.result

        # Test not matching
        self._enable_code_download(whitelist=["blork"])

        response = self.eea_client.launch_process(u_pid, round, run_type, parameters)
        assert response.status == 401
        assert "not in code_download whitelist" in response.result

        # Test exact matching
        self._enable_code_download(whitelist=["localhost"])

        self.eea_client.launch_process(u_pid, round, run_type, parameters)
        self.wait_for_state(u_pid, [500, "RUNNING"])

        self.eea_client.terminate_process(u_pid, round)
        state = self.eea_client.dump_state().result
        get_proc_for_upid(state, u_pid)

        # Test wildcard
        self._enable_code_download(whitelist=["*"])

        self.eea_client.launch_process(u_pid, round, run_type, parameters)
        self.wait_for_state(u_pid, [500, "RUNNING"])

        self.eea_client.terminate_process(u_pid, round)
        state = self.eea_client.dump_state().result
        get_proc_for_upid(state, u_pid)

    @needs_eeagent
    def test_caching(self):
        downloads_directory = os.path.join(get_this_directory(), "downloads")
        http_port = 8910
        # _start_webserver() returns only once the server object exists.
        http_port = self._start_webserver(downloads_directory, port=http_port)

        self._enable_code_download(["*"])
        assert self._webserver.requests == 0

        u_pid = "test0"
        round = 0
        run_type = "pyon"
        proc_name = "test_transform"
        module = "ion.my.module"
        module_uri = "http://localhost:%s/ion/agents/cei/test/downloads/module_to_download.py" % http_port
        cls = "TestDownloadProcess"
        parameters = {"name": proc_name, "module": module, "module_uri": module_uri, "cls": cls}

        # Launch a process, check that webserver is hit
        self.eea_client.launch_process(u_pid, round, run_type, parameters)
        self.wait_for_state(u_pid, [500, "RUNNING"])

        self.eea_client.terminate_process(u_pid, round)
        state = self.eea_client.dump_state().result
        get_proc_for_upid(state, u_pid)

        assert self._webserver.requests == 1

        # Launch another process, check that webserver is still only hit once
        self.eea_client.launch_process(u_pid, round, run_type, parameters)
        self.wait_for_state(u_pid, [500, "RUNNING"])

        self.eea_client.terminate_process(u_pid, round)
        state = self.eea_client.dump_state().result
        get_proc_for_upid(state, u_pid)

        assert self._webserver.requests == 1

        u_pid = "test5"
        round = 0
        run_type = "pyon"
        proc_name = "test_transformx"
        module = "ion.agents.cei.test.test_eeagent"
        module_uri = "http://localhost:%s/ion/agents/cei/test/downloads/module_to_download.py" % http_port
        cls = "TestProcess"
        parameters = {"name": proc_name, "module": module, "module_uri": module_uri, "cls": cls}

        # Test that a module that is already available in tarball won't trigger a download
        self.eea_client.launch_process(u_pid, round, run_type, parameters)
        self.wait_for_state(u_pid, [500, "RUNNING"])

        self.eea_client.terminate_process(u_pid, round)
        state = self.eea_client.dump_state().result
        get_proc_for_upid(state, u_pid)

        assert self._webserver.requests == 1

        u_pid = "test9"
        round = 0
        run_type = "pyon"
        proc_name = "test_transformx"
        module = "ion.agents.cei.test.test_eeagent"
        module_uri = "http://localhost:%s/ion/agents/cei/test/downloads/module_to_download.py" % http_port
        cls = "TestProcessNotReal"
        parameters = {"name": proc_name, "module": module, "module_uri": module_uri, "cls": cls}

        # Test behaviour of a non existent class with no download
        self.eea_client.launch_process(u_pid, round, run_type, parameters)
        self.wait_for_state(u_pid, [850, "FAILED"])

        self.eea_client.terminate_process(u_pid, round)
        state = self.eea_client.dump_state().result
        get_proc_for_upid(state, u_pid)
class HeartbeaterIntTest(IonIntegrationTestCase):
    """Checks that an eeagent is reachable by the time its heartbeats arrive.

    Unlike the other eeagent tests, setUp() only prepares the configuration;
    the agent itself is started from inside the test, after the heartbeat
    subscriber is listening.
    """

    @needs_eeagent
    def setUp(self):
        self._start_container()

        self.resource_id = "eeagent_123456789"
        self._eea_name = "eeagent"
        # Set by _start_eeagent(); None lets tearDown() cope with a test that
        # failed before the agent was spawned.
        self._eea_pid = None

        self.persistence_directory = tempfile.mkdtemp()

        self.agent_config = {
            "eeagent": {
                "heartbeat": "0.01",
                "slots": 100,
                "name": "pyon_eeagent",
                "launch_type": {
                    "name": "pyon",
                    "persistence_directory": self.persistence_directory,
                },
            },
            "agent": {"resource_id": self.resource_id},
            "logging": {
                "loggers": {"eeagent": {"level": "DEBUG", "handlers": ["console"]}},
                "root": {"handlers": ["console"]},
            },
        }

    def _start_eeagent(self):
        """Spawn an eeagent from ``self.agent_config`` and build the clients for it.

        Sets ``self._eea_pid``, ``self._eea_pyon_client`` and ``self.eea_client``.
        """
        self.container_client = ContainerAgentClient(node=self.container.node, name=self.container.name)
        self.container = self.container_client._get_container_instance()

        self._eea_pid = self.container_client.spawn_process(
            name=self._eea_name,
            module="ion.agents.cei.execution_engine_agent",
            cls="ExecutionEngineAgent",
            config=self.agent_config,
        )
        log.info("Agent pid=%s.", str(self._eea_pid))

        # Start a resource agent client to talk with the eeagent.
        self._eea_pyon_client = SimpleResourceAgentClient(self.resource_id, process=FakeProcess())
        log.info("Got eea client %s.", str(self._eea_pyon_client))

        self.eea_client = ExecutionEngineAgentClient(self._eea_pyon_client)

    def tearDown(self):
        try:
            # The agent may never have been started if the test failed early.
            if self._eea_pid is not None:
                self.container.terminate_process(self._eea_pid)
        finally:
            shutil.rmtree(self.persistence_directory)

    @needs_eeagent
    @unittest.skipIf(os.getenv("CEI_LAUNCH_TEST", False), "Skip test while in CEI LAUNCH mode")
    def test_heartbeater(self):
        """test_heartbeater

        Test whether the eeagent waits until the eeagent listener is ready before sending
        a heartbeat to the PD
        """

        # beat_died is a list because of a hack to get around a limitation in python 2.7
        # See: http://stackoverflow.com/questions/8934772/local-var-referenced-before-assignment
        beat_died = [False]

        def heartbeat_callback(heartbeat, headers):
            # Every heartbeat must come from an agent that already answers RPCs.
            eeagent_id = heartbeat["eeagent_id"]
            agent_client = SimpleResourceAgentClient(eeagent_id, name=eeagent_id, process=FakeProcess())
            ee_client = ExecutionEngineAgentClient(agent_client, timeout=2)

            try:
                ee_client.dump_state()
            except:
                # Deliberately broad: any failure to reach the agent counts.
                # NOTE(review): the RPC timeout type is not visible here and
                # may not derive from Exception -- confirm before narrowing.
                log.exception("Heartbeat Failed!")
                beat_died[0] = True

        self.beat_subscriber = HeartbeatSubscriber(
            "heartbeat_queue", callback=heartbeat_callback, node=self.container.node
        )
        self.beat_subscriber.start()
        try:
            self._start_eeagent()
            # Watch the flag for five half-second intervals.
            for _ in range(5):
                if beat_died[0] is True:
                    self.fail("A Hearbeat callback wasn't able to contact the eeagent")
                gevent.sleep(0.5)
        finally:
            self.beat_subscriber.stop()
class ProcessDispatcherEEAgentIntTest(ProcessDispatcherServiceIntTest):
    """Run the basic int tests again, with a different environment

    The environment built here is a Process Dispatcher app plus one eeagent,
    announced to the PD through a fake "dt_state" dashi message.
    """

    def setUp(self):
        # Defined first so tearDown() can always test it.
        self.dashi = None

        self._start_container()
        self.container_client = ContainerAgentClient(node=self.container.node, name=self.container.name)
        self.container = self.container_client._get_container_instance()

        app = dict(processapp=("process_dispatcher",
                               "ion.services.cei.process_dispatcher_service",
                               "ProcessDispatcherService"))
        self.container.start_app(app, config=pd_config)

        self.pd_cli = ProcessDispatcherServiceClient(node=self.container.node)

        self.process_definition = ProcessDefinition(name='test_process')
        self.process_definition.executable = {'module': 'ion.services.cei.test.test_process_dispatcher',
                                              'class': 'TestProcess'}
        self.process_definition_id = self.pd_cli.create_process_definition(self.process_definition)
        self.event_queue = queue.Queue()

        self.event_sub = None

        self.resource_id = "eeagent_123456789"
        self._eea_name = "eeagent"

        self.persistence_directory = tempfile.mkdtemp()

        self.agent_config = {
            'eeagent': {
                'heartbeat': 1,
                'heartbeat_queue': 'hbeatq',
                'slots': 100,
                'name': 'pyon_eeagent',
                'node_id': 'somenodeid',
                'launch_type': {
                    'name': 'pyon',
                    'persistence_directory': self.persistence_directory,
                },
            },
            'agent': {'resource_id': self.resource_id},
        }

        # send a fake dt_state message to PD's dashi binding.
        # The connection is kept on self so tearDown() can cancel it.
        self.dashi = get_dashi(uuid.uuid4().hex,
                               pd_config['processdispatcher']['dashi_uri'],
                               pd_config['processdispatcher']['dashi_exchange'])
        dt_state = dict(node_id="somenodeid", state=InstanceState.RUNNING,
                        deployable_type="eeagent_pyon")
        self.dashi.fire(get_pd_dashi_name(), "dt_state", args=dt_state)

        self._eea_pid = self.container_client.spawn_process(name=self._eea_name,
            module="ion.agents.cei.execution_engine_agent",
            cls="ExecutionEngineAgent", config=self.agent_config)
        log.info('Agent pid=%s.', str(self._eea_pid))

    def _start_eeagent(self):
        """Spawn an eeagent from ``self.agent_config`` and record its pid in ``self._eea_pid``."""
        self.container_client = ContainerAgentClient(node=self.container.node,
                                                     name=self.container.name)
        self.container = self.container_client._get_container_instance()

        # Start eeagent.
        self._eea_pid = self.container_client.spawn_process(name=self._eea_name,
            module="ion.agents.cei.execution_engine_agent",
            cls="ExecutionEngineAgent", config=self.agent_config)
        log.info('Agent pid=%s.', str(self._eea_pid))

    def tearDown(self):
        self.container.terminate_process(self._eea_pid)
        shutil.rmtree(self.persistence_directory)

        if self.event_sub:
            self.event_sub.stop()
        # Cancel the dashi connection opened in setUp().
        if self.dashi:
            self.dashi.cancel()
        self._stop_container()
class ProcessDispatcherEEAgentIntTest(ProcessDispatcherServiceIntTest):
    """Run the basic int tests again, with a different environment

    Here the environment is a Process Dispatcher app plus eeagents that are
    announced to it through fake "node_state" dashi messages.
    """

    def setUp(self):
        self.dashi = None
        self._start_container()

        client = ContainerAgentClient(node=self.container.node, name=self.container.name)
        self.container_client = client
        self.container = client._get_container_instance()

        pd_app = {
            "name": "process_dispatcher",
            "processapp": (
                "process_dispatcher",
                "ion.services.cei.process_dispatcher_service",
                "ProcessDispatcherService",
            ),
        }
        self.container.start_app(pd_app, config=pd_config)

        self.rr_cli = self.container.resource_registry
        self.pd_cli = ProcessDispatcherServiceClient(node=self.container.node)

        definition = ProcessDefinition(name="test_process")
        definition.executable = {
            "module": "ion.services.cei.test.test_process_dispatcher",
            "class": "TestProcess",
        }
        self.process_definition = definition
        self.process_definition_id = self.pd_cli.create_process_definition(definition)

        # Bookkeeping consumed by tearDown().
        self._eea_pids = []
        self._tmpdirs = []

        dashi_name = uuid.uuid4().hex
        pd_settings = pd_config["processdispatcher"]
        self.dashi = get_dashi(dashi_name, pd_settings["dashi_uri"], pd_settings["dashi_exchange"])

        # send a fake node_state message to PD's dashi binding.
        self.node1_id = uuid.uuid4().hex
        self._send_node_state("engine1", self.node1_id)
        self._start_eeagent(self.node1_id)

        self.waiter = ProcessStateWaiter()

    def _send_node_state(self, engine_id, node_id=None):
        """Fire a RUNNING "node_state" message for ``node_id`` at the PD.

        A random node id is generated when none (or a falsy one) is given.
        """
        if not node_id:
            node_id = uuid.uuid4().hex
        message = {
            "node_id": node_id,
            "state": InstanceState.RUNNING,
            "domain_id": domain_id_from_engine(engine_id),
        }
        self.dashi.fire(get_pd_dashi_name(), "node_state", args=message)

    def _start_eeagent(self, node_id):
        """Spawn an eeagent for ``node_id`` with its own temporary persistence dir."""
        workdir = tempfile.mkdtemp()
        self._tmpdirs.append(workdir)

        eea_pid = self.container_client.spawn_process(
            name="eeagent",
            module="ion.agents.cei.execution_engine_agent",
            cls="ExecutionEngineAgent",
            config=_get_eeagent_config(node_id, workdir),
        )
        log.info("Agent pid=%s.", str(eea_pid))
        self._eea_pids.append(eea_pid)

    def tearDown(self):
        for eea_pid in self._eea_pids:
            self.container.terminate_process(eea_pid)
        for tmpdir in self._tmpdirs:
            shutil.rmtree(tmpdir)

        self.waiter.stop()
        if self.dashi:
            self.dashi.cancel()

    def test_requested_ee(self):
        definition_id = self.process_definition_id

        def create_and_schedule(schedule):
            # Create a process from the shared definition and schedule it.
            new_pid = self.pd_cli.create_process(definition_id)
            self.pd_cli.schedule_process(definition_id, schedule, process_id=new_pid)
            return new_pid

        # request non-default engine
        target = ProcessTarget(execution_engine_id="engine2")
        schedule = ProcessSchedule()
        schedule.queueing_mode = ProcessQueueingMode.ALWAYS
        schedule.target = target

        pid = self.pd_cli.create_process(definition_id)
        self.waiter.start()

        self.pd_cli.schedule_process(definition_id, schedule, process_id=pid)
        self.waiter.await_state_event(pid, ProcessStateEnum.WAITING)

        # request unknown engine, with NEVER queuing mode. The request
        # should be rejected.
        # verifies L4-CI-CEI-RQ52
        target = ProcessTarget(execution_engine_id="not-a-real-ee")
        schedule = ProcessSchedule()
        schedule.queueing_mode = ProcessQueueingMode.NEVER
        schedule.target = target

        rejected_pid = create_and_schedule(schedule)
        self.waiter.await_state_event(rejected_pid, ProcessStateEnum.REJECTED)

        # now add a node and eeagent for engine2. original process should leave
        # queue and start running
        node2_id = uuid.uuid4().hex
        self._send_node_state("engine2", node2_id)
        self._start_eeagent(node2_id)

        self.waiter.await_state_event(pid, ProcessStateEnum.RUNNING)

        # spawn another process. it should start immediately.
        target = ProcessTarget(execution_engine_id="engine2")
        schedule = ProcessSchedule()
        schedule.queueing_mode = ProcessQueueingMode.NEVER
        schedule.target = target

        pid2 = create_and_schedule(schedule)
        self.waiter.await_state_event(pid2, ProcessStateEnum.RUNNING)

        # one more with node exclusive
        target = ProcessTarget(execution_engine_id="engine2", node_exclusive="hats")
        schedule = ProcessSchedule()
        schedule.queueing_mode = ProcessQueueingMode.NEVER
        schedule.target = target

        pid3 = create_and_schedule(schedule)
        self.waiter.await_state_event(pid3, ProcessStateEnum.RUNNING)

        # kill the processes for good
        for running_pid in (pid, pid2, pid3):
            self.pd_cli.cancel_process(running_pid)
            self.waiter.await_state_event(running_pid, ProcessStateEnum.TERMINATED)

    def test_node_exclusive(self):
        # the node_exclusive constraint is used to ensure multiple processes
        # of the same "kind" each get a VM exclusive of each other. Other
        # processes may run on these VMs, just not processes with the same
        # node_exclusive tag. Since we cannot directly query the contents
        # of each node in this test, we prove the capability by scheduling
        # processes one by one and checking their state.

        # verifies L4-CI-CEI-RQ121
        # verifies L4-CI-CEI-RQ57

        definition_id = self.process_definition_id

        def create_and_schedule(schedule):
            # Create a process from the shared definition and schedule it.
            new_pid = self.pd_cli.create_process(definition_id)
            self.pd_cli.schedule_process(definition_id, schedule, process_id=new_pid)
            return new_pid

        # first off, setUp() created a single node and eeagent.
        # We schedule two processes with the same "abc" node_exclusive
        # tag. Since there is only one node, the first process should run
        # and the second should be queued.
        target = ProcessTarget(execution_engine_id="engine1")
        target.node_exclusive = "abc"
        schedule = ProcessSchedule()
        schedule.queueing_mode = ProcessQueueingMode.ALWAYS
        schedule.target = target

        pid1 = self.pd_cli.create_process(definition_id)
        self.waiter.start()

        self.pd_cli.schedule_process(definition_id, schedule, process_id=pid1)
        self.waiter.await_state_event(pid1, ProcessStateEnum.RUNNING)

        pid2 = create_and_schedule(schedule)
        self.waiter.await_state_event(pid2, ProcessStateEnum.WAITING)

        # now demonstrate that the node itself is not full by launching
        # a third process without a node_exclusive tag -- it should start
        # immediately
        target.node_exclusive = None
        pid3 = create_and_schedule(schedule)
        self.waiter.await_state_event(pid3, ProcessStateEnum.RUNNING)

        # finally, add a second node to the engine. pid2 should be started
        # since there is an exclusive "abc" node free.
        node2_id = uuid.uuid4().hex
        self._send_node_state("engine1", node2_id)
        self._start_eeagent(node2_id)
        self.waiter.await_state_event(pid2, ProcessStateEnum.RUNNING)

        # kill the processes for good
        for running_pid in (pid1, pid2, pid3):
            self.pd_cli.cancel_process(running_pid)
            self.waiter.await_state_event(running_pid, ProcessStateEnum.TERMINATED)

    def test_code_download(self):
        # create a process definition that has no URL; only module and class.
        definition_no_url = ProcessDefinition(name="test_process_nodownload")
        definition_no_url.executable = {
            "module": "ion.my.test.process",
            "class": "TestProcess",
        }
        definition_id_no_url = self.pd_cli.create_process_definition(definition_no_url)

        # create another that has a URL of the python file (this very file)
        # verifies L4-CI-CEI-RQ114
        here = os.path.dirname(__file__)
        url = "file://%s" % os.path.join(here, "test_process_dispatcher.py")
        definition_with_url = ProcessDefinition(name="test_process_download")
        definition_with_url.executable = {
            "module": "ion.my.test.process",
            "class": "TestProcess",
            "url": url,
        }
        definition_id_with_url = self.pd_cli.create_process_definition(definition_with_url)

        target = ProcessTarget()
        schedule = ProcessSchedule()
        schedule.queueing_mode = ProcessQueueingMode.ALWAYS
        schedule.target = target
        self.waiter.start()

        # Test a module with no download fails
        pid_no_url = self.pd_cli.create_process(definition_id_no_url)
        self.pd_cli.schedule_process(definition_id_no_url, schedule, process_id=pid_no_url)
        self.waiter.await_state_event(pid_no_url, ProcessStateEnum.FAILED)

        # Test a module with a URL runs
        pid = self.pd_cli.create_process(definition_id_with_url)
        self.pd_cli.schedule_process(definition_id_with_url, schedule, process_id=pid)
        self.waiter.await_state_event(pid, ProcessStateEnum.RUNNING)
class ProcessDispatcherEEAgentIntTest(ProcessDispatcherServiceIntTest):
    """Run the basic int tests again, with a different environment

    Here the environment is a Process Dispatcher app plus eeagents that are
    announced to it through fake "node_state" dashi messages. Each spawned
    eeagent's resource id and persistence directory are tracked by pid.
    """

    def setUp(self):
        self.dashi = None
        self._start_container()
        from pyon.public import CFG

        client = ContainerAgentClient(node=self.container.node, name=self.container.name)
        self.container_client = client
        self.container = client._get_container_instance()

        pd_app = {
            "name": "process_dispatcher",
            "processapp": (
                "process_dispatcher",
                "ion.services.cei.process_dispatcher_service",
                "ProcessDispatcherService",
            ),
        }
        self.container.start_app(pd_app, config=pd_config)

        self.rr_cli = self.container.resource_registry
        self.pd_cli = ProcessDispatcherServiceClient(node=self.container.node)

        definition = ProcessDefinition(name="test_process")
        definition.executable = {
            "module": "ion.services.cei.test.test_process_dispatcher",
            "class": "TestProcess",
        }
        self.process_definition = definition
        self.process_definition_id = self.pd_cli.create_process_definition(definition)

        # Bookkeeping for the eeagents started by _start_eeagent().
        self._eea_pids = []
        self._eea_pid_to_resource_id = {}
        self._eea_pid_to_persistence_dir = {}
        self._tmpdirs = []

        dashi_name = uuid.uuid4().hex
        pd_settings = pd_config["processdispatcher"]
        self.dashi = get_dashi(
            dashi_name,
            pd_settings["dashi_uri"],
            pd_settings["dashi_exchange"],
            sysname=CFG.get_safe("dashi.sysname"),
        )

        # send a fake node_state message to PD's dashi binding.
        self.node1_id = uuid.uuid4().hex
        self._send_node_state("engine1", self.node1_id)
        self._initial_eea_pid = self._start_eeagent(self.node1_id)

        self.waiter = ProcessStateWaiter()

    def _send_node_state(self, engine_id, node_id=None):
        """Fire a RUNNING "node_state" message for ``node_id`` at the PD.

        A random node id is generated when none (or a falsy one) is given.
        """
        if not node_id:
            node_id = uuid.uuid4().hex
        message = {
            "node_id": node_id,
            "state": InstanceState.RUNNING,
            "domain_id": domain_id_from_engine(engine_id),
        }
        self.dashi.fire(get_pd_dashi_name(), "node_state", args=message)

    def _start_eeagent(self, node_id, resource_id=None, persistence_dir=None):
        """Spawn an eeagent for ``node_id`` and return its pid.

        A fresh temporary directory is created (and registered for removal in
        tearDown) only when no ``persistence_dir`` is supplied; a random
        ``resource_id`` is generated when none is supplied.
        """
        if not persistence_dir:
            persistence_dir = tempfile.mkdtemp()
            self._tmpdirs.append(persistence_dir)
        if not resource_id:
            resource_id = uuid.uuid4().hex

        eea_config = _get_eeagent_config(node_id, persistence_dir, resource_id=resource_id)
        eea_pid = self.container_client.spawn_process(
            name="eeagent",
            module="ion.agents.cei.execution_engine_agent",
            cls="ExecutionEngineAgent",
            config=eea_config,
        )
        log.info("Agent pid=%s.", str(eea_pid))

        self._eea_pids.append(eea_pid)
        self._eea_pid_to_resource_id[eea_pid] = resource_id
        self._eea_pid_to_persistence_dir[eea_pid] = persistence_dir
        return eea_pid

    def _kill_eeagent(self, pid):
        """Terminate a tracked eeagent and drop all bookkeeping for it."""
        self.assertTrue(pid in self._eea_pids)
        self.container.terminate_process(pid)
        self._eea_pids.remove(pid)
        self._eea_pid_to_resource_id.pop(pid)
        self._eea_pid_to_persistence_dir.pop(pid)

    def tearDown(self):
        # Iterate over a copy: _kill_eeagent() mutates self._eea_pids.
        for eea_pid in self._eea_pids[:]:
            self._kill_eeagent(eea_pid)
        for tmpdir in self._tmpdirs:
            shutil.rmtree(tmpdir)

        self.waiter.stop()
        if self.dashi:
            self.dashi.cancel()

    def test_requested_ee(self):
        definition_id = self.process_definition_id

        def never_queue_schedule(target):
            # Schedule that must be placed immediately or be rejected.
            sched = ProcessSchedule()
            sched.queueing_mode = ProcessQueueingMode.NEVER
            sched.target = target
            return sched

        # request non-default engine
        target = ProcessTarget(execution_engine_id="engine2")
        schedule = ProcessSchedule()
        schedule.queueing_mode = ProcessQueueingMode.ALWAYS
        schedule.target = target

        pid = self.pd_cli.create_process(definition_id)
        self.waiter.start()

        self.pd_cli.schedule_process(definition_id, schedule, process_id=pid)
        self.waiter.await_state_event(pid, ProcessStateEnum.WAITING)

        # request unknown engine, with NEVER queuing mode. The request
        # should be rejected.
        # verifies L4-CI-CEI-RQ52
        schedule = never_queue_schedule(ProcessTarget(execution_engine_id="not-a-real-ee"))

        rejected_pid = self.pd_cli.create_process(definition_id)
        self.pd_cli.schedule_process(definition_id, schedule, process_id=rejected_pid)
        self.waiter.await_state_event(rejected_pid, ProcessStateEnum.REJECTED)

        # now add a node and eeagent for engine2. original process should leave
        # queue and start running
        node2_id = uuid.uuid4().hex
        self._send_node_state("engine2", node2_id)
        self._start_eeagent(node2_id)

        self.waiter.await_state_event(pid, ProcessStateEnum.RUNNING)

        # spawn another process. it should start immediately.
        schedule = never_queue_schedule(ProcessTarget(execution_engine_id="engine2"))

        pid2 = self.pd_cli.create_process(definition_id)
        self.pd_cli.schedule_process(definition_id, schedule, process_id=pid2)
        self.waiter.await_state_event(pid2, ProcessStateEnum.RUNNING)

        # one more with node exclusive
        schedule = never_queue_schedule(
            ProcessTarget(execution_engine_id="engine2", node_exclusive="hats")
        )

        pid3 = self.pd_cli.create_process(definition_id)
        self.pd_cli.schedule_process(definition_id, schedule, process_id=pid3)
        self.waiter.await_state_event(pid3, ProcessStateEnum.RUNNING)

        # kill the processes for good
        for running_pid in (pid, pid2, pid3):
            self.pd_cli.cancel_process(running_pid)
            self.waiter.await_state_event(running_pid, ProcessStateEnum.TERMINATED)

    def test_node_exclusive(self):
        # the node_exclusive constraint is used to ensure multiple processes
        # of the same "kind" each get a VM exclusive of each other. Other
        # processes may run on these VMs, just not processes with the same
        # node_exclusive tag. Since we cannot directly query the contents
        # of each node in this test, we prove the capability by scheduling
        # processes one by one and checking their state.

        # verifies L4-CI-CEI-RQ121
        # verifies L4-CI-CEI-RQ57

        definition_id = self.process_definition_id
        wait_for = self.waiter.await_state_event

        # first off, setUp() created a single node and eeagent.
        # We schedule two processes with the same "abc" node_exclusive
        # tag. Since there is only one node, the first process should run
        # and the second should be queued.
        target = ProcessTarget(execution_engine_id="engine1")
        target.node_exclusive = "abc"
        schedule = ProcessSchedule()
        schedule.queueing_mode = ProcessQueueingMode.ALWAYS
        schedule.target = target

        pid1 = self.pd_cli.create_process(definition_id)
        self.waiter.start()

        self.pd_cli.schedule_process(definition_id, schedule, process_id=pid1)
        wait_for(pid1, ProcessStateEnum.RUNNING)

        pid2 = self.pd_cli.create_process(definition_id)
        self.pd_cli.schedule_process(definition_id, schedule, process_id=pid2)
        wait_for(pid2, ProcessStateEnum.WAITING)

        # now demonstrate that the node itself is not full by launching
        # a third process without a node_exclusive tag -- it should start
        # immediately
        target.node_exclusive = None
        pid3 = self.pd_cli.create_process(definition_id)
        self.pd_cli.schedule_process(definition_id, schedule, process_id=pid3)
        wait_for(pid3, ProcessStateEnum.RUNNING)

        # finally, add a second node to the engine. pid2 should be started
        # since there is an exclusive "abc" node free.
        node2_id = uuid.uuid4().hex
        self._send_node_state("engine1", node2_id)
        self._start_eeagent(node2_id)
        wait_for(pid2, ProcessStateEnum.RUNNING)

        # kill the processes for good
        for running_pid in (pid1, pid2, pid3):
            self.pd_cli.cancel_process(running_pid)
            wait_for(running_pid, ProcessStateEnum.TERMINATED)

    def test_code_download(self):
        # create a process definition that has no URL; only module and class.
        definition_no_url = ProcessDefinition(name="test_process_nodownload")
        definition_no_url.executable = {
            "module": "ion.my.test.process",
            "class": "TestProcess",
        }
        definition_id_no_url = self.pd_cli.create_process_definition(definition_no_url)

        # create another that has a URL of the python file (this very file)
        # verifies L4-CI-CEI-RQ114
        here = os.path.dirname(__file__)
        url = "file://%s" % os.path.join(here, "test_process_dispatcher.py")
        definition_with_url = ProcessDefinition(name="test_process_download")
        definition_with_url.executable = {
            "module": "ion.my.test.process",
            "class": "TestProcess",
            "url": url,
        }
        definition_id_with_url = self.pd_cli.create_process_definition(definition_with_url)

        target = ProcessTarget()
        schedule = ProcessSchedule()
        schedule.queueing_mode = ProcessQueueingMode.ALWAYS
        schedule.target = target
        self.waiter.start()

        # Test a module with no download fails
        pid_no_url = self.pd_cli.create_process(definition_id_no_url)
        self.pd_cli.schedule_process(definition_id_no_url, schedule, process_id=pid_no_url)
        self.waiter.await_state_event(pid_no_url, ProcessStateEnum.FAILED)

        # Test a module with a URL runs
        pid = self.pd_cli.create_process(definition_id_with_url)
        self.pd_cli.schedule_process(definition_id_with_url, schedule, process_id=pid)
        self.waiter.await_state_event(pid, ProcessStateEnum.RUNNING)
def _add_test_process(self, restart_mode=None):
    """Create and schedule one test process with its own listen name.

    restart_mode -- optional restart mode copied onto the schedule

    Returns a (pid, client) pair where the client targets the process's
    freshly generated listen name.
    """
    schedule = ProcessSchedule()
    if restart_mode is not None:
        schedule.restart_mode = restart_mode

    new_pid = self.pd_cli.create_process(self.process_definition_id)

    # a unique listen name lets the test talk to exactly this process
    listen_name = "PDtestproc_%s" % uuid.uuid4().hex
    self.pd_cli.schedule_process(
        self.process_definition_id,
        schedule,
        process_id=new_pid,
        configuration={'process': {'listen_name': listen_name}},
    )

    return new_pid, TestClient(to_name=listen_name)

def test_restart(self):
    # Restartable processes must come back after their eeagent is killed
    # and restarted; RestartMode.NEVER processes must stay FAILED.
    self.waiter.start()

    restartable_pids = []
    nonrestartable_pids = []
    clients = {}

    # 10 processes each with RestartMode.ALWAYS, then ABNORMAL, then NEVER;
    # the first two groups are expected to restart, the last is not
    launch_plan = (
        (ProcessRestartMode.ALWAYS, restartable_pids),
        (ProcessRestartMode.ABNORMAL, restartable_pids),
        (ProcessRestartMode.NEVER, nonrestartable_pids),
    )
    for mode, bucket in launch_plan:
        for _ in range(10):
            new_pid, new_client = self._add_test_process(mode)
            bucket.append(new_pid)
            clients[new_pid] = new_client

    all_pids = restartable_pids + nonrestartable_pids

    self.waiter.await_many_state_events(all_pids, ProcessStateEnum.RUNNING)

    for running_pid in all_pids:
        proc_client = clients[running_pid]
        self.assertFalse(proc_client.is_restart())
        self.assertEqual(proc_client.count(), 1)

    # now kill the whole eeagent and restart it. processes should
    # show up as FAILED in the next heartbeat.
    # (look the settings up first: _kill_eeagent deletes both entries)
    initial_pid = self._initial_eea_pid
    resource_id = self._eea_pid_to_resource_id[initial_pid]
    persistence_dir = self._eea_pid_to_persistence_dir[initial_pid]
    log.debug("Restarting eeagent %s", self._initial_eea_pid)
    self._kill_eeagent(self._initial_eea_pid)

    # manually kill the processes to simulate a real container failure
    for doomed_pid in all_pids:
        self.container.terminate_process(doomed_pid)

    self._start_eeagent(self.node1_id, resource_id=resource_id,
                        persistence_dir=persistence_dir)

    # wait for restartables to restart
    self.waiter.await_many_state_events(restartable_pids, ProcessStateEnum.RUNNING)

    # query the processes again. it should have restart mode config
    for restarted_pid in restartable_pids:
        proc_client = clients[restarted_pid]
        self.assertTrue(proc_client.is_restart())
        self.assertEqual(proc_client.count(), 1)

    # meanwhile some procs should not have restarted
    for dead_pid in nonrestartable_pids:
        record = self.pd_cli.read_process(dead_pid)
        self.assertEqual(record.process_state, ProcessStateEnum.FAILED)

    # guard against extraneous events we were receiving as part of a bug:
    # processes restarting again after they were already restarted
    self.waiter.await_nothing(timeout=5)

def test_idempotency(self):
    # ensure every operation can be safely retried
    schedule = ProcessSchedule()
    schedule.queueing_mode = ProcessQueueingMode.ALWAYS

    proc_name = 'myreallygoodname'
    pid = self.pd_cli.create_process(self.process_definition_id)
    self.waiter.start(pid)

    # note: if we import UNSCHEDULED state into ProcessStateEnum,
    # this assertion will need to change.
    record = self.pd_cli.read_process(pid)
    self.assertEqual(record.process_id, pid)
    self.assertEqual(record.process_state, ProcessStateEnum.REQUESTED)

    first_pid = self.pd_cli.schedule_process(
        self.process_definition_id, schedule,
        configuration={}, process_id=pid, name=proc_name)
    self.assertEqual(pid, first_pid)

    self.waiter.await_state_event(pid, ProcessStateEnum.RUNNING)

    # repeating schedule is harmless
    repeat_pid = self.pd_cli.schedule_process(
        self.process_definition_id, schedule,
        configuration={}, process_id=pid, name=proc_name)
    self.assertEqual(pid, repeat_pid)

    record = self.pd_cli.read_process(pid)
    self.assertEqual(record.process_id, pid)
    self.assertEqual(record.process_configuration, {})
    self.assertEqual(record.process_state, ProcessStateEnum.RUNNING)

    self.pd_cli.cancel_process(pid)
    self.waiter.await_state_event(pid, ProcessStateEnum.TERMINATED)

    # repeating cancel is harmless
    self.pd_cli.cancel_process(pid)
    record = self.pd_cli.read_process(pid)
    self.assertEqual(record.process_id, pid)
    self.assertEqual(record.process_configuration, {})
    self.assertEqual(record.process_state, ProcessStateEnum.TERMINATED)
class ExecutionEngineAgentPyonIntTest(IonIntegrationTestCase):
    """Integration tests that launch pyon processes through an eeagent.

    setUp starts a container, deploys res/deploy/r2cei.yml and spawns one
    ExecutionEngineAgent; the tests talk to it through ``self.eea_client``.
    """

    from ion.agents.cei.execution_engine_agent import ExecutionEngineAgentClient

    # Guarded like the test methods so the fixture is not built when the
    # eeagent requirement checked by needs_eeagent is not met.
    @needs_eeagent
    def setUp(self):
        """Start the container and an eeagent with a temp persistence dir."""
        self._start_container()
        self.container.start_rel_from_url('res/deploy/r2cei.yml')

        self.resource_id = "eeagent_123456789"
        self._eea_name = "eeagent"

        self.persistence_directory = tempfile.mkdtemp()

        self.agent_config = {
            'eeagent': {
                'heartbeat': 0,
                'slots': 100,
                'name': 'pyon_eeagent',
                'launch_type': {
                    'name': 'pyon',
                    'persistence_directory': self.persistence_directory,
                },
            },
            'agent': {'resource_id': self.resource_id},
            'logging': {
                'loggers': {
                    'eeagent': {
                        'level': 'DEBUG',
                        'handlers': ['console']
                    }
                },
                'root': {
                    'handlers': ['console']
                },
            }
        }

        self._start_eeagent()

    def _start_eeagent(self):
        """Spawn the eeagent process and (re)build the clients used to reach it.

        Sets ``self._eea_pid``, ``self._eea_pyon_client`` and
        ``self.eea_client``; may be called again after the agent was killed.
        """
        self.container_client = ContainerAgentClient(
            node=self.container.node, name=self.container.name)
        self.container = self.container_client._get_container_instance()

        # Start eeagent.
        self._eea_pid = self.container_client.spawn_process(
            name=self._eea_name,
            module="ion.agents.cei.execution_engine_agent",
            cls="ExecutionEngineAgent",
            config=self.agent_config)
        log.info('Agent pid=%s.', str(self._eea_pid))

        # Start a resource agent client to talk with the eeagent.
        self._eea_pyon_client = ResourceAgentClient(self.resource_id, process=FakeProcess())
        log.info('Got eea client %s.', str(self._eea_pyon_client))

        # Use the client class imported in the class body: a bare name inside
        # a method does not see class-scope bindings, so going through self
        # works whether or not the module also imports it.
        self.eea_client = self.ExecutionEngineAgentClient(self._eea_pyon_client)

    def tearDown(self):
        """Terminate the eeagent and always remove the persistence directory."""
        try:
            self.container.terminate_process(self._eea_pid)
        finally:
            # remove the temp dir even if terminating the agent raised
            shutil.rmtree(self.persistence_directory)

    @needs_eeagent
    def test_basics(self):
        # launch one process, check it is RUNNING, then terminate it
        u_pid = "test0"
        round = 0
        run_type = "pyon"
        proc_name = 'test_x'
        module = 'ion.agents.cei.test.test_eeagent'
        cls = 'TestProcess'
        parameters = {'name': proc_name, 'module': module, 'cls': cls}

        self.eea_client.launch_process(u_pid, round, run_type, parameters)
        state = self.eea_client.dump_state().result
        proc = get_proc_for_upid(state, u_pid)

        self.assertIsNotNone(proc, "There is no state retrieved from eeagent")
        self.assertEqual(proc.get('state'), [500, 'RUNNING'])

        self.eea_client.terminate_process(u_pid, round)
        # the lookup after termination is kept, but its result is not asserted
        state = self.eea_client.dump_state().result
        get_proc_for_upid(state, u_pid)

    @needs_eeagent
    def test_kill_and_revive(self):
        """test_kill_and_revive
        Ensure that when an eeagent dies, it pulls the processes it owned from
        persistence, and marks them as failed, so the PD can figure out what to
        do with them
        """
        u_pid = "test0"
        round = 0
        run_type = "pyon"
        proc_name = 'test_transform'
        module = 'ion.agents.cei.test.test_eeagent'
        cls = 'TestProcess'
        parameters = {'name': proc_name, 'module': module, 'cls': cls}

        self.eea_client.launch_process(u_pid, round, run_type, parameters)
        state = self.eea_client.dump_state().result
        proc = get_proc_for_upid(state, u_pid)

        self.assertIsNotNone(proc, "There is no state retrieved from eeagent")
        self.assertEqual(proc.get('state'), [500, 'RUNNING'])

        # Kill and restart eeagent. Also, kill proc started by eea to simulate
        # a killed container
        old_eea_pid = str(self._eea_pid)
        self.container.terminate_process(self._eea_pid)
        proc_to_kill = self.container.proc_manager.procs_by_name.get(proc_name)
        self.assertIsNotNone(proc_to_kill)
        self.container.terminate_process(proc_to_kill.id)

        self._start_eeagent()

        self.assertNotEqual(old_eea_pid, self._eea_pid)

        state = self.eea_client.dump_state().result
        proc = get_proc_for_upid(state, u_pid)

        self.assertIsNotNone(proc, "There is no state retrieved from eeagent")
        self.assertEqual(proc.get('state'), [850, 'FAILED'])
class HeartbeaterIntTest(IonIntegrationTestCase):
    """Checks that the eeagent is reachable by the time it sends heartbeats.

    Unlike the other eeagent test cases, setUp only prepares the container
    and the agent configuration; the agent is started by the test itself,
    after the heartbeat subscriber is listening.
    """

    @needs_eeagent
    def setUp(self):
        """Start the container and build the eeagent config (agent not started)."""
        self._start_container()

        self.resource_id = "eeagent_123456789"
        self._eea_name = "eeagent"

        self.persistence_directory = tempfile.mkdtemp()

        self.agent_config = {
            'eeagent': {
                'heartbeat': "0.01",
                'slots': 100,
                'name': 'pyon_eeagent',
                'launch_type': {
                    'name': 'pyon',
                    'persistence_directory': self.persistence_directory,
                }
            },
            'agent': {'resource_id': self.resource_id},
            'logging': {
                'loggers': {
                    'eeagent': {
                        'level': 'DEBUG',
                        'handlers': ['console']
                    }
                },
                'root': {
                    'handlers': ['console']
                },
            }
        }

    def _start_eeagent(self):
        """Spawn the eeagent process and build the clients used to reach it."""
        self.container_client = ContainerAgentClient(
            node=self.container.node, name=self.container.name)
        self.container = self.container_client._get_container_instance()

        self._eea_pid = self.container_client.spawn_process(
            name=self._eea_name,
            module="ion.agents.cei.execution_engine_agent",
            cls="ExecutionEngineAgent",
            config=self.agent_config)
        log.info('Agent pid=%s.', str(self._eea_pid))

        # Start a resource agent client to talk with the eeagent.
        self._eea_pyon_client = SimpleResourceAgentClient(self.resource_id, process=FakeProcess())
        log.info('Got eea client %s.', str(self._eea_pyon_client))

        self.eea_client = ExecutionEngineAgentClient(self._eea_pyon_client)

    def tearDown(self):
        """Terminate the eeagent if it was started; always remove the temp dir."""
        try:
            # _eea_pid only exists once _start_eeagent() got as far as
            # spawning the agent, which happens inside the test, not in setUp
            eea_pid = getattr(self, '_eea_pid', None)
            if eea_pid is not None:
                self.container.terminate_process(eea_pid)
        finally:
            shutil.rmtree(self.persistence_directory)

    @needs_eeagent
    @unittest.skipIf(os.getenv('CEI_LAUNCH_TEST', False), 'Skip test while in CEI LAUNCH mode')
    def test_heartbeater(self):
        """test_heartbeater

        Test whether the eeagent waits until the eeagent listener is ready before sending
        a heartbeat to the PD
        """

        # beat_died is a list because of a hack to get around a limitation in python 2.7
        # See: http://stackoverflow.com/questions/8934772/local-var-referenced-before-assignment
        beat_died = [False]

        def heartbeat_callback(heartbeat, headers):
            # try to reach the agent that sent this heartbeat; record a failure
            eeagent_id = heartbeat['eeagent_id']
            agent_client = SimpleResourceAgentClient(eeagent_id, name=eeagent_id, process=FakeProcess())
            ee_client = ExecutionEngineAgentClient(agent_client, timeout=2)

            try:
                ee_client.dump_state()
            except:
                # NOTE(review): left deliberately broad so that any failure to
                # reach the agent is recorded -- presumably some timeout
                # exceptions here do not derive from Exception; confirm
                # before narrowing.
                log.exception("Heartbeat Failed!")
                beat_died[0] = True

        self.beat_subscriber = HeartbeatSubscriber("heartbeat_queue",
                callback=heartbeat_callback, node=self.container.node)
        self.beat_subscriber.start()
        try:
            self._start_eeagent()
            # poll for 2.5 seconds; sleeping before each check means a failure
            # recorded during the final interval is also caught
            for _ in range(5):
                gevent.sleep(0.5)
                self.assertFalse(
                    beat_died[0],
                    "A Hearbeat callback wasn't able to contact the eeagent")
        finally:
            self.beat_subscriber.stop()
class HeartbeaterIntTest(IonIntegrationTestCase):
    """Checks that the eeagent is reachable by the time it sends heartbeats.

    setUp only prepares the container and the agent configuration; the agent
    is started by the test itself, after the heartbeat subscriber is listening.
    """

    @needs_eeagent
    def setUp(self):
        """Start the container and build the eeagent config (agent not started)."""
        self._start_container()

        self.resource_id = "eeagent_123456789"
        self._eea_name = "eeagent"

        self.persistence_directory = tempfile.mkdtemp()

        self.agent_config = {
            'eeagent': {
                'heartbeat': 300,
                'slots': 100,
                'name': 'pyon_eeagent',
                'launch_type': {
                    'name': 'pyon',
                    'persistence_directory': self.persistence_directory,
                }
            },
            'agent': {'resource_id': self.resource_id},
            'logging': {
                'loggers': {
                    'eeagent': {
                        'level': 'DEBUG',
                        'handlers': ['console']
                    }
                },
                'root': {
                    'handlers': ['console']
                },
            }
        }

    def _start_eeagent(self):
        """Spawn the eeagent process and build the clients used to reach it."""
        self.container_client = ContainerAgentClient(
            node=self.container.node, name=self.container.name)
        self.container = self.container_client._get_container_instance()

        self._eea_pid = self.container_client.spawn_process(
            name=self._eea_name,
            module="ion.agents.cei.execution_engine_agent",
            cls="ExecutionEngineAgent",
            config=self.agent_config)
        log.info('Agent pid=%s.', str(self._eea_pid))

        # Start a resource agent client to talk with the eeagent.
        self._eea_pyon_client = SimpleResourceAgentClient(self.resource_id, process=FakeProcess())
        log.info('Got eea client %s.', str(self._eea_pyon_client))

        self.eea_client = ExecutionEngineAgentClient(self._eea_pyon_client)

    def tearDown(self):
        """Terminate the eeagent if it was started; always remove the temp dir."""
        try:
            # _eea_pid only exists once _start_eeagent() got as far as
            # spawning the agent, which happens inside the test, not in setUp
            eea_pid = getattr(self, '_eea_pid', None)
            if eea_pid is not None:
                self.container.terminate_process(eea_pid)
        finally:
            shutil.rmtree(self.persistence_directory)

    @needs_eeagent
    @unittest.skipIf(os.getenv('CEI_LAUNCH_TEST', False), 'Skip test while in CEI LAUNCH mode')
    def test_heartbeater(self):
        """test_heartbeater

        Test whether the eeagent waits until the eeagent listener is ready before sending
        a heartbeat to the PD
        """

        beat_died = threading.Event()
        beat_succeeded = threading.Event()

        def heartbeat_callback(heartbeat, headers):
            # try to reach the agent that sent this heartbeat and record
            # whether the call worked
            eeagent_id = heartbeat['eeagent_id']
            agent_client = SimpleResourceAgentClient(eeagent_id, name=eeagent_id, process=FakeProcess())
            ee_client = ExecutionEngineAgentClient(agent_client, timeout=10)

            try:
                ee_client.dump_state()
                beat_succeeded.set()
            except:
                # NOTE(review): left deliberately broad so that any failure to
                # reach the agent is recorded -- presumably some timeout
                # exceptions here do not derive from Exception; confirm
                # before narrowing.
                log.exception("Heartbeat Failed!")
                beat_died.set()

        self.beat_subscriber = HeartbeatSubscriber("heartbeat_queue",
                callback=heartbeat_callback, node=self.container.node)
        self.beat_subscriber.start()
        try:
            self._start_eeagent()

            # Query the events with is_set() instead of relying on the return
            # value of wait(), which is unambiguous on every interpreter.
            beat_succeeded.wait(20)
            if not beat_succeeded.is_set():
                beat_died.wait(20)
                self.assertFalse(
                    beat_died.is_set(),
                    "A Hearbeat callback wasn't able to contact the eeagent")
        finally:
            # always detach the subscriber so it does not outlive the test
            self.beat_subscriber.stop()
class ExecutionEngineAgentPyonIntTest(IonIntegrationTestCase):
    """Integration tests that launch pyon processes through an eeagent.

    setUp starts a container, deploys res/deploy/r2cei.yml and spawns one
    ExecutionEngineAgent; the tests talk to it through ``self.eea_client``.
    Some tests also start a local HTTP server to exercise code download.
    """

    # HTTP server started by _start_webserver(); None while no server runs
    _webserver = None

    @needs_eeagent
    def setUp(self):
        """Start the container and an eeagent with a temp persistence dir."""
        self._start_container()
        self.container.start_rel_from_url('res/deploy/r2cei.yml')

        self.resource_id = "eeagent_123456789"
        self._eea_name = "eeagent"

        self.persistence_directory = tempfile.mkdtemp()

        self.agent_config = {
            'eeagent': {
                'heartbeat': 1,
                'slots': 100,
                'name': 'pyon_eeagent',
                'launch_type': {
                    'name': 'pyon',
                    'persistence_directory': self.persistence_directory,
                }
            },
            'agent': {'resource_id': self.resource_id},
            'logging': {
                'loggers': {
                    'eeagent': {
                        'level': 'DEBUG',
                        'handlers': ['console']
                    }
                },
                'root': {
                    'handlers': ['console']
                },
            }
        }

        self._start_eeagent()

    def _start_eeagent(self):
        """Spawn the eeagent process and (re)build the clients used to reach it.

        Sets ``self._eea_pid``, ``self._eea_pyon_client`` and
        ``self.eea_client``; called again whenever a test restarts the agent.
        """
        self.container_client = ContainerAgentClient(
            node=self.container.node, name=self.container.name)
        self.container = self.container_client._get_container_instance()

        # Start eeagent.
        self._eea_pid = self.container_client.spawn_process(
            name=self._eea_name,
            module="ion.agents.cei.execution_engine_agent",
            cls="ExecutionEngineAgent",
            config=self.agent_config)
        log.info('Agent pid=%s.', str(self._eea_pid))

        # Start a resource agent client to talk with the eeagent.
        self._eea_pyon_client = SimpleResourceAgentClient(self.resource_id, process=FakeProcess())
        log.info('Got eea client %s.', str(self._eea_pyon_client))

        self.eea_client = ExecutionEngineAgentClient(self._eea_pyon_client)

    def tearDown(self):
        """Stop the webserver and the eeagent; always remove the temp dir."""
        try:
            self._stop_webserver()
        finally:
            try:
                self.container.terminate_process(self._eea_pid)
            finally:
                # remove the temp dir even if one of the steps above raised
                shutil.rmtree(self.persistence_directory)

    def _start_webserver(self, directory_to_serve, port=None):
        """ Start a webserver for testing code download
        Note: tries really hard to get a port, and if it can't use
        the suggested port, randomly picks another, and returns it

        directory_to_serve is accepted but not used here: the handler serves
        files relative to the current working directory.
        Raises RuntimeError when no port could be bound after 100 attempts.
        """
        # Subclass instead of assigning log_message onto the stdlib handler,
        # so the shared SimpleHTTPRequestHandler class is left untouched.
        class QuietHandler(SimpleHTTPServer.SimpleHTTPRequestHandler):

            def log_message(self, format, *args):
                # swallow log messages
                pass

        class Server(HTTPServer):

            # number of handle_request() calls completed so far
            requests = 0

            def serve_forever(self):
                self._serving = 1
                while self._serving:
                    self.handle_request()
                    self.requests += 1

            def stop(self):
                self._serving = 0

        if port is None:
            port = 8008

        for _ in range(100):
            try:
                self._webserver = Server(("localhost", port), QuietHandler)
            except socket.error:
                print("port %s is in use, picking another" % port)
                port = randint(8000, 10000)
            else:
                break
        else:
            # every attempt failed; fail clearly instead of spawning a
            # greenlet on a server that was never created
            raise RuntimeError("Unable to bind a port for the test webserver")

        self._web_glet = gevent.spawn(self._webserver.serve_forever)
        return port

    def _stop_webserver(self):
        """Stop the serving greenlet and release the listening socket."""
        if self._webserver is not None:
            self._web_glet.kill()
            # killing the greenlet alone leaves the port bound
            self._webserver.server_close()
            self._webserver = None

    def _enable_code_download(self, whitelist=None):
        """Restart the eeagent with code download enabled.

        whitelist -- list stored as the 'whitelist' config entry; defaults
                     to an empty list
        """
        if whitelist is None:
            whitelist = []

        self.container.terminate_process(self._eea_pid)
        self.agent_config['eeagent']['code_download'] = {
            'enabled': True,
            'whitelist': whitelist
        }
        self._start_eeagent()

    def wait_for_state(self, upid, desired_state, timeout=30):
        """Poll the eeagent until process ``upid`` reports ``desired_state``.

        upid          -- process id to look up in the dumped state
        desired_state -- state value to wait for, e.g. [500, 'RUNNING']
        timeout       -- maximum number of polling attempts; attempts are
                         separated by a one second sleep

        Fails the test when the state was not reached within the attempts.
        """
        attempts = 0
        last_state = None
        while timeout > attempts:
            try:
                state = self.eea_client.dump_state().result
            except Timeout:
                log.warn("Timeout calling EEAgent dump_state. retrying.")
                # a timed-out call still consumes an attempt; otherwise an
                # unresponsive agent would keep this loop running forever
                attempts += 1
                continue
            proc = get_proc_for_upid(state, upid)
            # NOTE(review): presumably get_proc_for_upid returns None while
            # the process is not listed yet -- keep polling in that case
            last_state = proc.get('state') if proc is not None else None
            if last_state == desired_state:
                return
            gevent.sleep(1)
            attempts += 1

        self.fail("Process %s took too long to get to %s, had %s" % (upid, desired_state, last_state))

    @needs_eeagent
    def test_basics(self):
        # launch -> RUNNING -> terminate -> TERMINATED -> cleanup -> gone
        u_pid = "test0"
        round = 0
        run_type = "pyon"
        proc_name = 'test_x'
        module = 'ion.agents.cei.test.test_eeagent'
        cls = 'TestProcess'
        parameters = {'name': proc_name, 'module': module, 'cls': cls}

        self.eea_client.launch_process(u_pid, round, run_type, parameters)

        self.wait_for_state(u_pid, [500, 'RUNNING'])

        state = self.eea_client.dump_state().result
        assert len(state['processes']) == 1

        self.eea_client.terminate_process(u_pid, round)
        self.wait_for_state(u_pid, [700, 'TERMINATED'])

        state = self.eea_client.dump_state().result
        assert len(state['processes']) == 1

        self.eea_client.cleanup_process(u_pid, round)
        state = self.eea_client.dump_state().result
        assert len(state['processes']) == 0

    @needs_eeagent
    def test_duplicate(self):
        # launching the same upid/round twice must not create a second process
        u_pid = "test0"
        round = 0
        run_type = "pyon"
        proc_name = 'test_x'
        module = 'ion.agents.cei.test.test_eeagent'
        cls = 'TestProcess'
        parameters = {'name': proc_name, 'module': module, 'cls': cls}

        self.eea_client.launch_process(u_pid, round, run_type, parameters)

        self.wait_for_state(u_pid, [500, 'RUNNING'])

        self.eea_client.launch_process(u_pid, round, run_type, parameters)

        self.wait_for_state(u_pid, [500, 'RUNNING'])

        state = self.eea_client.dump_state().result
        assert len(state['processes']) == 1

        self.eea_client.terminate_process(u_pid, round)
        self.wait_for_state(u_pid, [700, 'TERMINATED'])

        state = self.eea_client.dump_state().result
        assert len(state['processes']) == 1

        self.eea_client.cleanup_process(u_pid, round)
        state = self.eea_client.dump_state().result
        assert len(state['processes']) == 0

    @needs_eeagent
    def test_restart(self):
        # launching an existing upid with a higher round restarts the process
        u_pid = "test0"
        round = 0
        run_type = "pyon"
        proc_name = 'test_x'
        module = 'ion.agents.cei.test.test_eeagent'
        cls = 'TestProcess'
        parameters = {'name': proc_name, 'module': module, 'cls': cls}

        self.eea_client.launch_process(u_pid, round, run_type, parameters)

        self.wait_for_state(u_pid, [500, 'RUNNING'])

        state = self.eea_client.dump_state().result
        assert len(state['processes']) == 1

        # Start again with incremented round. eeagent should restart the process
        round += 1

        self.eea_client.launch_process(u_pid, round, run_type, parameters)
        self.wait_for_state(u_pid, [500, 'RUNNING'])

        state = self.eea_client.dump_state().result
        ee_round = state['processes'][0]['round']
        assert round == int(ee_round)

        # TODO: this test is disabled, as the restart op is disabled
        # Run restart with incremented round. eeagent should restart the process
        #round += 1

        #self.eea_client.restart_process(u_pid, round)
        #self.wait_for_state(u_pid, [500, 'RUNNING'])

        #state = self.eea_client.dump_state().result
        #ee_round = state['processes'][0]['round']
        #assert round == int(ee_round)

        self.eea_client.terminate_process(u_pid, round)
        self.wait_for_state(u_pid, [700, 'TERMINATED'])

        state = self.eea_client.dump_state().result
        assert len(state['processes']) == 1

        self.eea_client.cleanup_process(u_pid, round)
        state = self.eea_client.dump_state().result
        assert len(state['processes']) == 0

    @needs_eeagent
    def test_failing_process(self):
        # a process class that fails on start must be reported as FAILED
        u_pid = "testfail"
        round = 0
        run_type = "pyon"
        proc_name = 'test_x'
        module = 'ion.agents.cei.test.test_eeagent'
        cls = 'TestProcessFail'
        parameters = {'name': proc_name, 'module': module, 'cls': cls}
        self.eea_client.launch_process(u_pid, round, run_type, parameters)

        self.wait_for_state(u_pid, [850, 'FAILED'])

        self.eea_client.terminate_process(u_pid, round)
        state = self.eea_client.dump_state().result
        get_proc_for_upid(state, u_pid)

    @needs_eeagent
    def test_slow_to_start(self):
        # ten slow-starting processes must all reach RUNNING
        # a real list (not a lazy map object) because it is iterated twice
        upids = [str(uuid.uuid4().hex) for _ in range(10)]
        round = 0
        run_type = "pyon"
        proc_name = 'test_x'
        module = 'ion.agents.cei.test.test_eeagent'
        cls = 'TestProcessSlowStart'
        parameters = {'name': proc_name, 'module': module, 'cls': cls}
        for upid in upids:
            self.eea_client.launch_process(upid, round, run_type, parameters)

        for upid in upids:
            self.wait_for_state(upid, [500, 'RUNNING'], timeout=60)

    @needs_eeagent
    def test_start_cancel(self):
        # terminating a process that is still PENDING must end in TERMINATED
        upid = str(uuid.uuid4().hex)
        round = 0
        run_type = "pyon"
        proc_name = 'test_x'
        module = 'ion.agents.cei.test.test_eeagent'
        cls = 'TestProcessSlowStart'
        parameters = {'name': proc_name, 'module': module, 'cls': cls}
        self.eea_client.launch_process(upid, round, run_type, parameters)
        self.wait_for_state(upid, [400, 'PENDING'])
        self.eea_client.terminate_process(upid, round)
        self.wait_for_state(upid, [700, 'TERMINATED'])

    @needs_eeagent
    def test_kill_and_revive(self):
        """test_kill_and_revive
        Ensure that when an eeagent dies, it pulls the processes it owned from
        persistence, and marks them as failed, so the PD can figure out what to
        do with them
        """
        u_pid = "test0"
        round = 0
        run_type = "pyon"
        proc_name = 'test_transform'
        module = 'ion.agents.cei.test.test_eeagent'
        cls = 'TestProcess'
        parameters = {'name': proc_name, 'module': module, 'cls': cls}
        self.eea_client.launch_process(u_pid, round, run_type, parameters)

        self.wait_for_state(u_pid, [500, 'RUNNING'])

        # Kill and restart eeagent. Also, kill proc started by eea to simulate
        # a killed container
        old_eea_pid = str(self._eea_pid)
        self.container.terminate_process(self._eea_pid)
        proc_to_kill = self.container.proc_manager.procs_by_name.get(proc_name)
        self.assertIsNotNone(proc_to_kill)
        self.container.terminate_process(proc_to_kill.id)

        self._start_eeagent()

        self.assertNotEqual(old_eea_pid, self._eea_pid)

        self.wait_for_state(u_pid, [850, 'FAILED'])

    @needs_eeagent
    def test_run_out_of_slots(self):
        """test_run_out_of_slots
        """
        # restart the agent with a single slot
        old_eea_pid = str(self._eea_pid)
        self.container.terminate_process(self._eea_pid)
        self.agent_config['eeagent']['slots'] = 1
        self._start_eeagent()
        self.assertNotEqual(old_eea_pid, self._eea_pid)

        u_pid_0, u_pid_1 = "test0", "test1"
        round = 0
        run_type = "pyon"
        proc_name = 'test_transform'
        module = 'ion.agents.cei.test.test_eeagent'
        cls = 'TestProcess'
        parameters = {'name': proc_name, 'module': module, 'cls': cls}

        self.eea_client.launch_process(u_pid_0, round, run_type, parameters)
        self.wait_for_state(u_pid_0, [500, 'RUNNING'])

        # the only slot is taken, so the second launch is rejected
        self.eea_client.launch_process(u_pid_1, round, run_type, parameters)
        self.wait_for_state(u_pid_1, [900, 'REJECTED'])

        # restart once more; both outcomes must be reported after the restart
        old_eea_pid = str(self._eea_pid)
        self.container.terminate_process(self._eea_pid)
        self.agent_config['eeagent']['slots'] = 1
        self._start_eeagent()
        self.assertNotEqual(old_eea_pid, self._eea_pid)

        self.wait_for_state(u_pid_0, [850, 'FAILED'])
        self.wait_for_state(u_pid_1, [900, 'REJECTED'])

    @needs_eeagent
    def test_download_code(self):
        # a bad module_uri yields a 404 response; a good one lets the process run
        self._enable_code_download(whitelist=['*'])

        u_pid = "test0"
        round = 0
        run_type = "pyon"
        proc_name = 'test_transform'
        module = "ion.my.module.to.download"
        module_uri = 'file://%s/downloads/module_to_download.py' % get_this_directory()
        bad_module_uri = 'file:///tmp/notreal/module_to_download.py'

        cls = 'TestDownloadProcess'

        parameters = {'name': proc_name, 'module': module, 'module_uri': bad_module_uri, 'cls': cls}
        response = self.eea_client.launch_process(u_pid, round, run_type, parameters)

        print(response)
        assert response.status == 404
        assert "Unable to download" in response.result

        parameters = {'name': proc_name, 'module': module, 'module_uri': module_uri, 'cls': cls}
        round += 1
        self.eea_client.launch_process(u_pid, round, run_type, parameters)

        self.wait_for_state(u_pid, [500, 'RUNNING'])

        self.eea_client.terminate_process(u_pid, round)
        state = self.eea_client.dump_state().result
        get_proc_for_upid(state, u_pid)

    @needs_eeagent
    def test_whitelist(self):
        # code download is refused until the host matches the whitelist
        downloads_directory = os.path.join(get_this_directory(), "downloads")
        http_port = 8910
        http_port = self._start_webserver(downloads_directory, port=http_port)

        while self._webserver is None:
            print("Waiting for webserver to come up")
            gevent.sleep(1)

        assert self._webserver.requests == 0

        u_pid = "test0"
        round = 0
        run_type = "pyon"
        proc_name = 'test_transform'
        module = "ion.my.module"
        module_uri = "http://localhost:%s/ion/agents/cei/test/downloads/module_to_download.py" % http_port
        cls = 'TestDownloadProcess'
        parameters = {'name': proc_name, 'module': module, 'module_uri': module_uri, 'cls': cls}
        response = self.eea_client.launch_process(u_pid, round, run_type, parameters)

        assert response.status == 401
        assert "Code download not enabled" in response.result

        # Test no whitelist
        self._enable_code_download()

        round += 1
        response = self.eea_client.launch_process(u_pid, round, run_type, parameters)

        print(response)
        assert response.status == 401
        assert "not in code_download whitelist" in response.result

        # Test not matching
        self._enable_code_download(whitelist=['blork'])

        round += 1
        response = self.eea_client.launch_process(u_pid, round, run_type, parameters)

        assert response.status == 401
        assert "not in code_download whitelist" in response.result

        # Test exact matching
        self._enable_code_download(whitelist=['localhost'])

        round += 1
        response = self.eea_client.launch_process(u_pid, round, run_type, parameters)

        self.wait_for_state(u_pid, [500, 'RUNNING'])

        self.eea_client.terminate_process(u_pid, round)
        state = self.eea_client.dump_state().result
        get_proc_for_upid(state, u_pid)

        # Test wildcard
        self._enable_code_download(whitelist=['*'])

        round += 1
        response = self.eea_client.launch_process(u_pid, round, run_type, parameters)

        self.wait_for_state(u_pid, [500, 'RUNNING'])

        self.eea_client.terminate_process(u_pid, round)
        state = self.eea_client.dump_state().result
        get_proc_for_upid(state, u_pid)

    @needs_eeagent
    def test_caching(self):
        # a downloaded module is fetched from the webserver only once
        downloads_directory = os.path.join(get_this_directory(), "downloads")
        http_port = 8910
        http_port = self._start_webserver(downloads_directory, port=http_port)

        while self._webserver is None:
            print("Waiting for webserver to come up")
            gevent.sleep(1)

        self._enable_code_download(['*'])
        assert self._webserver.requests == 0

        u_pid = "test0"
        round = 0
        run_type = "pyon"
        proc_name = 'test_transform'
        module = "ion.my.module"
        module_uri = "http://localhost:%s/ion/agents/cei/test/downloads/module_to_download.py" % http_port
        cls = 'TestDownloadProcess'
        parameters = {'name': proc_name, 'module': module, 'module_uri': module_uri, 'cls': cls}

        # Launch a process, check that webserver is hit
        self.eea_client.launch_process(u_pid, round, run_type, parameters)
        self.wait_for_state(u_pid, [500, 'RUNNING'])
        self.eea_client.terminate_process(u_pid, round)
        state = self.eea_client.dump_state().result
        get_proc_for_upid(state, u_pid)

        assert self._webserver.requests == 1

        # Launch another process, check that webserver is still only hit once
        self.eea_client.launch_process(u_pid, round, run_type, parameters)
        self.wait_for_state(u_pid, [500, 'RUNNING'])

        self.eea_client.terminate_process(u_pid, round)
        state = self.eea_client.dump_state().result
        get_proc_for_upid(state, u_pid)

        assert self._webserver.requests == 1

        u_pid = "test5"
        round = 0
        run_type = "pyon"
        proc_name = 'test_transformx'
        module = "ion.agents.cei.test.test_eeagent"
        module_uri = "http://localhost:%s/ion/agents/cei/test/downloads/module_to_download.py" % http_port
        cls = 'TestProcess'
        parameters = {'name': proc_name, 'module': module, 'module_uri': module_uri, 'cls': cls}

        # Test that a module that is already available in tarball won't trigger a download
        self.eea_client.launch_process(u_pid, round, run_type, parameters)
        self.wait_for_state(u_pid, [500, 'RUNNING'])

        self.eea_client.terminate_process(u_pid, round)
        state = self.eea_client.dump_state().result
        get_proc_for_upid(state, u_pid)

        assert self._webserver.requests == 1

        u_pid = "test9"
        round = 0
        run_type = "pyon"
        proc_name = 'test_transformx'
        module = "ion.agents.cei.test.test_eeagent"
        module_uri = "http://localhost:%s/ion/agents/cei/test/downloads/module_to_download.py" % http_port
        cls = 'TestProcessNotReal'
        parameters = {'name': proc_name, 'module': module, 'module_uri': module_uri, 'cls': cls}

        # Test behaviour of a non existent class with no download
        self.eea_client.launch_process(u_pid, round, run_type, parameters)
        self.wait_for_state(u_pid, [850, 'FAILED'])

        self.eea_client.terminate_process(u_pid, round)
        state = self.eea_client.dump_state().result
        get_proc_for_upid(state, u_pid)
class ExecutionEngineAgentPyonIntTest(IonIntegrationTestCase):
    """Integration tests that drive an ExecutionEngineAgent with the pyon launch type.

    setUp starts a container, deploys res/deploy/r2cei.yml and spawns one
    eeagent; each test then talks to it through ``self.eea_client``.
    """

    # Set by _start_webserver(); stays None for tests that never serve files.
    _webserver = None

    @needs_eeagent
    def setUp(self):
        """Start the container and spawn an eeagent with a fresh persistence dir."""
        self._start_container()
        self.container.start_rel_from_url('res/deploy/r2cei.yml')

        self.resource_id = "eeagent_123456789"
        self._eea_name = "eeagent"

        self.persistence_directory = tempfile.mkdtemp()

        self.agent_config = {
            'eeagent': {
                'heartbeat': 0,
                'slots': 100,
                'name': 'pyon_eeagent',
                'launch_type': {
                    'name': 'pyon',
                    'persistence_directory': self.persistence_directory,
                }
            },
            'agent': {'resource_id': self.resource_id},
            'logging': {
                'loggers': {
                    'eeagent': {
                        'level': 'DEBUG',
                        'handlers': ['console']
                    }
                },
                'root': {
                    'handlers': ['console']
                },
            }
        }

        self._start_eeagent()

    def _start_eeagent(self):
        """Spawn the eeagent process from ``self.agent_config`` and build clients for it."""
        self.container_client = ContainerAgentClient(node=self.container.node,
            name=self.container.name)
        self.container = self.container_client._get_container_instance()

        # Start eeagent.
        self._eea_pid = self.container_client.spawn_process(
            name=self._eea_name,
            module="ion.agents.cei.execution_engine_agent",
            cls="ExecutionEngineAgent", config=self.agent_config)
        log.info('Agent pid=%s.', str(self._eea_pid))

        # Start a resource agent client to talk with the instrument agent.
        self._eea_pyon_client = SimpleResourceAgentClient(self.resource_id,
            process=FakeProcess())
        log.info('Got eea client %s.', str(self._eea_pyon_client))

        self.eea_client = ExecutionEngineAgentClient(self._eea_pyon_client)

    def tearDown(self):
        """Stop the webserver (if any), terminate the eeagent, remove the temp dir."""
        try:
            self._stop_webserver()
            self.container.terminate_process(self._eea_pid)
        finally:
            # always remove the temp dir, even when an earlier step raised
            shutil.rmtree(self.persistence_directory)

    def _start_webserver(self, directory_to_serve, port=None):
        """ Start a webserver for testing code download
        Note: tries really hard to get a port, and if it can't use
        the suggested port, randomly picks another, and returns it

        The working directory is changed to ``directory_to_serve`` and is
        restored by _stop_webserver(). The test fails explicitly if no
        port can be bound after 100 attempts.
        """
        # Subclass rather than patching SimpleHTTPRequestHandler itself, so
        # the silenced logging does not leak into other users of that class.
        class QuietHandler(SimpleHTTPServer.SimpleHTTPRequestHandler):
            def log_message(self, format, *args):
                # swallow log messages
                pass

        class Server(HTTPServer):

            # number of requests handled so far; tests assert on this
            requests = 0

            def serve_forever(self):
                self._serving = 1
                while self._serving:
                    self.handle_request()
                    self.requests += 1

            def stop(self):
                self._serving = 0

        if port is None:
            port = 8008

        original_cwd = os.getcwd()
        os.chdir(directory_to_serve)

        server = None
        for _ in range(100):
            try:
                server = Server(("localhost", port), QuietHandler)
            except socket.error:
                print("port %s is in use, picking another" % port)
                port = randint(8000, 10000)
            else:
                break

        if server is None:
            # undo the chdir before failing: _stop_webserver() only restores
            # the cwd when a server was actually started
            os.chdir(original_cwd)
            self.fail("Could not bind a port for the test webserver")

        self.old_cwd = original_cwd
        self._webserver = server
        self._web_glet = gevent.spawn(server.serve_forever)
        return port

    def _stop_webserver(self):
        """Shut down the webserver started by _start_webserver(), if any."""
        if self._webserver is None:
            return

        self._webserver.stop()
        self._web_glet.kill()
        # release the listening socket so the port can be reused
        self._webserver.server_close()
        self._webserver = None
        os.chdir(self.old_cwd)

    def _enable_code_download(self, whitelist=None):
        """Restart the eeagent with code download enabled for ``whitelist``."""
        if whitelist is None:
            whitelist = []

        self.container.terminate_process(self._eea_pid)
        self.agent_config['eeagent']['code_download'] = {
            'enabled': True,
            'whitelist': whitelist
        }
        self._start_eeagent()

    def wait_for_state(self, upid, desired_state, timeout=30):
        """Poll the eeagent once per second until ``upid`` is in ``desired_state``.

        ``timeout`` is the number of polls. The test fails if the state is
        not reached; a process that is not yet listed in the dumped state
        is treated as "not there yet" rather than as an error.
        """
        attempts = 0
        last_state = None
        while timeout > attempts:
            state = self.eea_client.dump_state().result
            proc = get_proc_for_upid(state, upid)
            if proc is not None:
                last_state = proc.get('state')
                if last_state == desired_state:
                    return
            gevent.sleep(1)
            attempts += 1

        self.fail("Process %s took too long to get to %s, had %s" % (
            upid, desired_state, last_state))

    def _cycle_process(self, u_pid, round_num, run_type, parameters,
                       expected_state):
        """Launch a process, wait for ``expected_state``, terminate it, then
        look it up in the agent's dumped state. Returns the launch response.
        """
        response = self.eea_client.launch_process(u_pid, round_num, run_type,
            parameters)
        self.wait_for_state(u_pid, expected_state)
        self.eea_client.terminate_process(u_pid, round_num)
        state = self.eea_client.dump_state().result
        get_proc_for_upid(state, u_pid)
        return response

    @needs_eeagent
    def test_basics(self):
        u_pid = "test0"
        round_num = 0
        run_type = "pyon"
        proc_name = 'test_x'
        module = 'ion.agents.cei.test.test_eeagent'
        cls = 'TestProcess'
        parameters = {'name': proc_name, 'module': module, 'cls': cls}

        self.eea_client.launch_process(u_pid, round_num, run_type, parameters)
        self.wait_for_state(u_pid, [500, 'RUNNING'])

        self.eea_client.terminate_process(u_pid, round_num)
        self.wait_for_state(u_pid, [700, 'TERMINATED'])

    @needs_eeagent
    def test_failing_process(self):
        u_pid = "testfail"
        round_num = 0
        run_type = "pyon"
        proc_name = 'test_x'
        module = 'ion.agents.cei.test.test_eeagent'
        cls = 'TestProcessFail'
        parameters = {'name': proc_name, 'module': module, 'cls': cls}

        self._cycle_process(u_pid, round_num, run_type, parameters,
            [850, 'FAILED'])

    @needs_eeagent
    def test_slow_to_start(self):
        # a real list: it is iterated twice below
        upids = [uuid.uuid4().hex for _ in range(10)]
        round_num = 0
        run_type = "pyon"
        proc_name = 'test_x'
        module = 'ion.agents.cei.test.test_eeagent'
        cls = 'TestProcessSlowStart'
        parameters = {'name': proc_name, 'module': module, 'cls': cls}

        for upid in upids:
            self.eea_client.launch_process(upid, round_num, run_type, parameters)

        for upid in upids:
            self.wait_for_state(upid, [500, 'RUNNING'], timeout=60)

    @needs_eeagent
    def test_start_cancel(self):
        upid = str(uuid.uuid4().hex)
        round_num = 0
        run_type = "pyon"
        proc_name = 'test_x'
        module = 'ion.agents.cei.test.test_eeagent'
        cls = 'TestProcessSlowStart'
        parameters = {'name': proc_name, 'module': module, 'cls': cls}

        self.eea_client.launch_process(upid, round_num, run_type, parameters)
        self.wait_for_state(upid, [400, 'PENDING'])

        self.eea_client.terminate_process(upid, round_num)
        self.wait_for_state(upid, [700, 'TERMINATED'])

    @needs_eeagent
    def test_kill_and_revive(self):
        """test_kill_and_revive
        Ensure that when an eeagent dies, it pulls the processes it owned from
        persistence, and marks them as failed, so the PD can figure out what to
        do with them
        """
        u_pid = "test0"
        round_num = 0
        run_type = "pyon"
        proc_name = 'test_transform'
        module = 'ion.agents.cei.test.test_eeagent'
        cls = 'TestProcess'
        parameters = {'name': proc_name, 'module': module, 'cls': cls}

        self.eea_client.launch_process(u_pid, round_num, run_type, parameters)
        self.wait_for_state(u_pid, [500, 'RUNNING'])

        # Kill and restart eeagent. Also, kill proc started by eea to simulate
        # a killed container
        old_eea_pid = str(self._eea_pid)
        self.container.terminate_process(self._eea_pid)
        proc_to_kill = self.container.proc_manager.procs_by_name.get(proc_name)
        self.assertIsNotNone(proc_to_kill)
        self.container.terminate_process(proc_to_kill.id)

        self._start_eeagent()

        self.assertNotEqual(old_eea_pid, self._eea_pid)

        self.wait_for_state(u_pid, [850, 'FAILED'])

    @needs_eeagent
    def test_download_code(self):
        self._enable_code_download(whitelist=['*'])

        u_pid = "test0"
        round_num = 0
        run_type = "pyon"
        proc_name = 'test_transform'
        module = "ion.my.module.to.download"
        module_uri = 'file://%s/downloads/module_to_download.py' % get_this_directory()
        bad_module_uri = 'file:///tmp/notreal/module_to_download.py'

        cls = 'TestDownloadProcess'

        parameters = {'name': proc_name, 'module': module,
                      'module_uri': bad_module_uri, 'cls': cls}
        response = self.eea_client.launch_process(u_pid, round_num, run_type,
            parameters)
        print(response)
        self.assertEqual(response.status, 404)
        self.assertIn("Unable to download", response.result)

        parameters = {'name': proc_name, 'module': module,
                      'module_uri': module_uri, 'cls': cls}
        self._cycle_process(u_pid, round_num, run_type, parameters,
            [500, 'RUNNING'])

    @needs_eeagent
    def test_whitelist(self):
        downloads_directory = os.path.join(get_this_directory(), "downloads")
        http_port = 8910
        http_port = self._start_webserver(downloads_directory, port=http_port)

        while self._webserver is None:
            print("Waiting for webserver to come up")
            gevent.sleep(1)

        self.assertEqual(self._webserver.requests, 0)

        u_pid = "test0"
        round_num = 0
        run_type = "pyon"
        proc_name = 'test_transform'
        module = "ion.my.module"
        module_uri = "http://localhost:%s/module_to_download.py" % http_port
        cls = 'TestDownloadProcess'
        parameters = {'name': proc_name, 'module': module,
                      'module_uri': module_uri, 'cls': cls}

        response = self.eea_client.launch_process(u_pid, round_num, run_type,
            parameters)
        self.assertEqual(response.status, 401)
        self.assertIn("Code download not enabled", response.result)

        # Test no whitelist
        self._enable_code_download()

        response = self.eea_client.launch_process(u_pid, round_num, run_type,
            parameters)
        print(response)
        self.assertEqual(response.status, 401)
        self.assertIn("not in code_download whitelist", response.result)

        # Test not matching
        self._enable_code_download(whitelist=['blork'])

        response = self.eea_client.launch_process(u_pid, round_num, run_type,
            parameters)
        self.assertEqual(response.status, 401)
        self.assertIn("not in code_download whitelist", response.result)

        # Test exact matching
        self._enable_code_download(whitelist=['localhost'])
        self._cycle_process(u_pid, round_num, run_type, parameters,
            [500, 'RUNNING'])

        # Test wildcard
        self._enable_code_download(whitelist=['*'])
        self._cycle_process(u_pid, round_num, run_type, parameters,
            [500, 'RUNNING'])

    @needs_eeagent
    def test_caching(self):
        downloads_directory = os.path.join(get_this_directory(), "downloads")
        http_port = 8910
        http_port = self._start_webserver(downloads_directory, port=http_port)

        while self._webserver is None:
            print("Waiting for webserver to come up")
            gevent.sleep(1)

        self._enable_code_download(['*'])
        self.assertEqual(self._webserver.requests, 0)

        u_pid = "test0"
        round_num = 0
        run_type = "pyon"
        proc_name = 'test_transform'
        module = "ion.my.module"
        module_uri = "http://localhost:%s/module_to_download.py" % http_port
        cls = 'TestDownloadProcess'
        parameters = {'name': proc_name, 'module': module,
                      'module_uri': module_uri, 'cls': cls}

        # Launch a process, check that webserver is hit
        self._cycle_process(u_pid, round_num, run_type, parameters,
            [500, 'RUNNING'])
        self.assertEqual(self._webserver.requests, 1)

        # Launch another process, check that webserver is still only hit once
        self._cycle_process(u_pid, round_num, run_type, parameters,
            [500, 'RUNNING'])
        self.assertEqual(self._webserver.requests, 1)

        u_pid = "test5"
        proc_name = 'test_transformx'
        module = "ion.agents.cei.test.test_eeagent"
        cls = 'TestProcess'
        parameters = {'name': proc_name, 'module': module,
                      'module_uri': module_uri, 'cls': cls}

        # Test that a module that is already available in tarball won't
        # trigger a download
        self._cycle_process(u_pid, round_num, run_type, parameters,
            [500, 'RUNNING'])
        self.assertEqual(self._webserver.requests, 1)

        u_pid = "test9"
        cls = 'TestProcessNotReal'
        parameters = {'name': proc_name, 'module': module,
                      'module_uri': module_uri, 'cls': cls}

        # Test behaviour of a non existant class with no download
        self._cycle_process(u_pid, round_num, run_type, parameters,
            [850, 'FAILED'])
class ProcessDispatcherEEAgentIntTest(ProcessDispatcherServiceIntTest):
    """Run the basic int tests again, with a different environment
    """

    def setUp(self):
        """Start a container with the process dispatcher app, one node and one eeagent."""
        self.dashi = None
        self._start_container()
        from pyon.public import CFG

        self.container_client = ContainerAgentClient(node=self.container.node,
            name=self.container.name)
        self.container = self.container_client._get_container_instance()

        app = dict(name="process_dispatcher", processapp=("process_dispatcher",
                   "ion.services.cei.process_dispatcher_service",
                   "ProcessDispatcherService"))
        self.container.start_app(app, config=pd_config)

        self.rr_cli = self.container.resource_registry

        self.pd_cli = ProcessDispatcherServiceClient(node=self.container.node)

        self.process_definition = ProcessDefinition(name='test_process')
        self.process_definition.executable = {
            'module': 'ion.services.cei.test.test_process_dispatcher',
            'class': 'TestProcess'
        }
        self.process_definition_id = self.pd_cli.create_process_definition(
            self.process_definition)

        # bookkeeping for every eeagent spawned through _start_eeagent()
        self._eea_pids = []
        self._eea_pid_to_resource_id = {}
        self._eea_pid_to_persistence_dir = {}
        self._tmpdirs = []

        self.dashi = get_dashi(
            uuid.uuid4().hex,
            pd_config['processdispatcher']['dashi_uri'],
            pd_config['processdispatcher']['dashi_exchange'],
            sysname=CFG.get_safe("dashi.sysname"))

        # send a fake node_state message to PD's dashi binding.
        self.node1_id = uuid.uuid4().hex
        self._send_node_state("engine1", self.node1_id)
        self._initial_eea_pid = self._start_eeagent(self.node1_id)

        self.waiter = ProcessStateWaiter()

    def _send_node_state(self, engine_id, node_id=None):
        """Fire a RUNNING node_state message for ``node_id`` (random if omitted)."""
        node_id = node_id or uuid.uuid4().hex
        node_state = dict(node_id=node_id, state=InstanceState.RUNNING,
            domain_id=domain_id_from_engine(engine_id))
        self.dashi.fire(get_pd_dashi_name(), "node_state", args=node_state)

    def _start_eeagent(self, node_id, resource_id=None, persistence_dir=None):
        """Spawn an eeagent for ``node_id`` and return its pid.

        A temp persistence directory (removed in tearDown) and a random
        resource id are created when not supplied.
        """
        if not persistence_dir:
            persistence_dir = tempfile.mkdtemp()
            self._tmpdirs.append(persistence_dir)
        resource_id = resource_id or uuid.uuid4().hex
        agent_config = _get_eeagent_config(node_id, persistence_dir,
            resource_id=resource_id)
        pid = self.container_client.spawn_process(
            name="eeagent",
            module="ion.agents.cei.execution_engine_agent",
            cls="ExecutionEngineAgent", config=agent_config)
        log.info('Agent pid=%s.', str(pid))
        self._eea_pids.append(pid)
        self._eea_pid_to_resource_id[pid] = resource_id
        self._eea_pid_to_persistence_dir[pid] = persistence_dir
        return pid

    def _kill_eeagent(self, pid):
        """Terminate an eeagent started by _start_eeagent() and forget it."""
        self.assertIn(pid, self._eea_pids)
        self.container.terminate_process(pid)
        self._eea_pids.remove(pid)
        del self._eea_pid_to_resource_id[pid]
        del self._eea_pid_to_persistence_dir[pid]

    def tearDown(self):
        """Kill all eeagents, remove temp dirs, stop the waiter, cancel dashi.

        The waiter and dashi are released even when an earlier cleanup
        step raises, so one failure does not leak them into later tests.
        """
        try:
            # iterate over a copy: _kill_eeagent mutates self._eea_pids
            for pid in list(self._eea_pids):
                self._kill_eeagent(pid)
            for d in self._tmpdirs:
                shutil.rmtree(d)
        finally:
            try:
                self.waiter.stop()
            finally:
                if self.dashi:
                    self.dashi.cancel()

    def test_requested_ee(self):

        # request non-default engine

        process_target = ProcessTarget(execution_engine_id="engine2")
        process_schedule = ProcessSchedule()
        process_schedule.queueing_mode = ProcessQueueingMode.ALWAYS
        process_schedule.target = process_target

        pid = self.pd_cli.create_process(self.process_definition_id)
        self.waiter.start()

        self.pd_cli.schedule_process(self.process_definition_id,
            process_schedule, process_id=pid)

        self.waiter.await_state_event(pid, ProcessStateEnum.WAITING)

        # request unknown engine, with NEVER queuing mode. The request
        # should be rejected.
        # verifies L4-CI-CEI-RQ52

        process_target = ProcessTarget(execution_engine_id="not-a-real-ee")
        process_schedule = ProcessSchedule()
        process_schedule.queueing_mode = ProcessQueueingMode.NEVER
        process_schedule.target = process_target

        rejected_pid = self.pd_cli.create_process(self.process_definition_id)

        self.pd_cli.schedule_process(self.process_definition_id,
            process_schedule, process_id=rejected_pid)

        self.waiter.await_state_event(rejected_pid, ProcessStateEnum.REJECTED)

        # now add a node and eeagent for engine2. original process should leave
        # queue and start running
        node2_id = uuid.uuid4().hex
        self._send_node_state("engine2", node2_id)
        self._start_eeagent(node2_id)

        self.waiter.await_state_event(pid, ProcessStateEnum.RUNNING)

        # spawn another process. it should start immediately.

        process_target = ProcessTarget(execution_engine_id="engine2")
        process_schedule = ProcessSchedule()
        process_schedule.queueing_mode = ProcessQueueingMode.NEVER
        process_schedule.target = process_target

        pid2 = self.pd_cli.create_process(self.process_definition_id)

        self.pd_cli.schedule_process(self.process_definition_id,
            process_schedule, process_id=pid2)

        self.waiter.await_state_event(pid2, ProcessStateEnum.RUNNING)

        # one more with node exclusive

        process_target = ProcessTarget(execution_engine_id="engine2",
            node_exclusive="hats")
        process_schedule = ProcessSchedule()
        process_schedule.queueing_mode = ProcessQueueingMode.NEVER
        process_schedule.target = process_target

        pid3 = self.pd_cli.create_process(self.process_definition_id)

        self.pd_cli.schedule_process(self.process_definition_id,
            process_schedule, process_id=pid3)

        self.waiter.await_state_event(pid3, ProcessStateEnum.RUNNING)

        # kill the processes for good
        self.pd_cli.cancel_process(pid)
        self.waiter.await_state_event(pid, ProcessStateEnum.TERMINATED)
        self.pd_cli.cancel_process(pid2)
        self.waiter.await_state_event(pid2, ProcessStateEnum.TERMINATED)
        self.pd_cli.cancel_process(pid3)
        self.waiter.await_state_event(pid3, ProcessStateEnum.TERMINATED)

    def test_node_exclusive(self):

        # the node_exclusive constraint is used to ensure multiple processes
        # of the same "kind" each get a VM exclusive of each other. Other
        # processes may run on these VMs, just not processes with the same
        # node_exclusive tag. Since we cannot directly query the contents
        # of each node in this test, we prove the capability by scheduling
        # processes one by one and checking their state.

        # verifies L4-CI-CEI-RQ121
        # verifies L4-CI-CEI-RQ57

        # first off, setUp() created a single node and eeagent.
        # We schedule two processes with the same "abc" node_exclusive
        # tag. Since there is only one node, the first process should run
        # and the second should be queued.

        process_target = ProcessTarget(execution_engine_id="engine1")
        process_target.node_exclusive = "abc"
        process_schedule = ProcessSchedule()
        process_schedule.queueing_mode = ProcessQueueingMode.ALWAYS
        process_schedule.target = process_target

        pid1 = self.pd_cli.create_process(self.process_definition_id)
        self.waiter.start()

        self.pd_cli.schedule_process(self.process_definition_id,
            process_schedule, process_id=pid1)

        self.waiter.await_state_event(pid1, ProcessStateEnum.RUNNING)

        pid2 = self.pd_cli.create_process(self.process_definition_id)
        self.pd_cli.schedule_process(self.process_definition_id,
            process_schedule, process_id=pid2)
        self.waiter.await_state_event(pid2, ProcessStateEnum.WAITING)

        # now demonstrate that the node itself is not full by launching
        # a third process without a node_exclusive tag -- it should start
        # immediately

        process_target.node_exclusive = None
        pid3 = self.pd_cli.create_process(self.process_definition_id)
        self.pd_cli.schedule_process(self.process_definition_id,
            process_schedule, process_id=pid3)
        self.waiter.await_state_event(pid3, ProcessStateEnum.RUNNING)

        # finally, add a second node to the engine. pid2 should be started
        # since there is an exclusive "abc" node free.
        node2_id = uuid.uuid4().hex
        self._send_node_state("engine1", node2_id)
        self._start_eeagent(node2_id)
        self.waiter.await_state_event(pid2, ProcessStateEnum.RUNNING)

        # kill the processes for good
        self.pd_cli.cancel_process(pid1)
        self.waiter.await_state_event(pid1, ProcessStateEnum.TERMINATED)
        self.pd_cli.cancel_process(pid2)
        self.waiter.await_state_event(pid2, ProcessStateEnum.TERMINATED)
        self.pd_cli.cancel_process(pid3)
        self.waiter.await_state_event(pid3, ProcessStateEnum.TERMINATED)

    def test_code_download(self):
        # create a process definition that has no URL; only module and class.
        process_definition_no_url = ProcessDefinition(
            name='test_process_nodownload')
        process_definition_no_url.executable = {
            'module': 'ion.my.test.process',
            'class': 'TestProcess'
        }
        process_definition_id_no_url = self.pd_cli.create_process_definition(
            process_definition_no_url)

        # create another that has a URL of the python file (this very file)
        # verifies L4-CI-CEI-RQ114
        url = "file://%s" % os.path.join(os.path.dirname(__file__),
            'test_process_dispatcher.py')
        process_definition = ProcessDefinition(name='test_process_download')
        process_definition.executable = {
            'module': 'ion.my.test.process',
            'class': 'TestProcess',
            'url': url
        }
        process_definition_id = self.pd_cli.create_process_definition(
            process_definition)

        process_target = ProcessTarget()
        process_schedule = ProcessSchedule()
        process_schedule.queueing_mode = ProcessQueueingMode.ALWAYS
        process_schedule.target = process_target

        self.waiter.start()

        # Test a module with no download fails
        pid_no_url = self.pd_cli.create_process(process_definition_id_no_url)

        self.pd_cli.schedule_process(process_definition_id_no_url,
            process_schedule, process_id=pid_no_url)

        self.waiter.await_state_event(pid_no_url, ProcessStateEnum.FAILED)

        # Test a module with a URL runs
        pid = self.pd_cli.create_process(process_definition_id)

        self.pd_cli.schedule_process(process_definition_id,
            process_schedule, process_id=pid)

        self.waiter.await_state_event(pid, ProcessStateEnum.RUNNING)

    def _add_test_process(self, restart_mode=None):
        """Schedule one test process and return ``(pid, client)``.

        The process is configured with a unique listen name, and the
        returned TestClient is addressed to that name.
        """
        process_schedule = ProcessSchedule()
        if restart_mode is not None:
            process_schedule.restart_mode = restart_mode
        pid = self.pd_cli.create_process(self.process_definition_id)

        pid_listen_name = "PDtestproc_%s" % uuid.uuid4().hex
        config = {'process': {'listen_name': pid_listen_name}}

        self.pd_cli.schedule_process(self.process_definition_id,
            process_schedule, process_id=pid, configuration=config)

        client = TestClient(to_name=pid_listen_name)
        return pid, client

    def test_restart(self):
        self.waiter.start()

        restartable_pids = []
        nonrestartable_pids = []
        clients = {}

        # start 10 processes with RestartMode.ALWAYS, then 10 with
        # RestartMode.ABNORMAL (both expected to restart), and finally
        # 10 with RestartMode.NEVER
        batches = (
            (ProcessRestartMode.ALWAYS, restartable_pids),
            (ProcessRestartMode.ABNORMAL, restartable_pids),
            (ProcessRestartMode.NEVER, nonrestartable_pids),
        )
        for restart_mode, pid_list in batches:
            for _ in range(10):
                pid, client = self._add_test_process(restart_mode)
                pid_list.append(pid)
                clients[pid] = client

        all_pids = restartable_pids + nonrestartable_pids

        self.waiter.await_many_state_events(all_pids, ProcessStateEnum.RUNNING)

        for pid in all_pids:
            client = clients[pid]
            self.assertFalse(client.is_restart())
            self.assertEqual(client.count(), 1)

        # now kill the whole eeagent and restart it. processes should
        # show up as FAILED in the next heartbeat.
        resource_id = self._eea_pid_to_resource_id[self._initial_eea_pid]
        persistence_dir = self._eea_pid_to_persistence_dir[
            self._initial_eea_pid]
        log.debug("Restarting eeagent %s", self._initial_eea_pid)
        self._kill_eeagent(self._initial_eea_pid)

        # manually kill the processes to simulate a real container failure
        for pid in all_pids:
            self.container.terminate_process(pid)

        self._start_eeagent(self.node1_id, resource_id=resource_id,
            persistence_dir=persistence_dir)

        # wait for restartables to restart
        self.waiter.await_many_state_events(restartable_pids,
            ProcessStateEnum.RUNNING)

        # query the processes again. it should have restart mode config
        for pid in restartable_pids:
            client = clients[pid]
            self.assertTrue(client.is_restart())
            self.assertEqual(client.count(), 1)

        # meanwhile some procs should not have restarted
        for pid in nonrestartable_pids:
            proc = self.pd_cli.read_process(pid)
            self.assertEqual(proc.process_state, ProcessStateEnum.FAILED)

        # guard against extraneous events we were receiving as part of a bug:
        # processes restarting again after they were already restarted
        self.waiter.await_nothing(timeout=5)

    def test_idempotency(self):
        # ensure every operation can be safely retried
        process_schedule = ProcessSchedule()
        process_schedule.queueing_mode = ProcessQueueingMode.ALWAYS

        proc_name = 'myreallygoodname'
        pid = self.pd_cli.create_process(self.process_definition_id)
        self.waiter.start(pid)

        # note: if we import UNSCHEDULED state into ProcessStateEnum,
        # this assertion will need to change.
        proc = self.pd_cli.read_process(pid)
        self.assertEqual(proc.process_id, pid)
        self.assertEqual(proc.process_state, ProcessStateEnum.REQUESTED)

        pid2 = self.pd_cli.schedule_process(self.process_definition_id,
            process_schedule, configuration={}, process_id=pid, name=proc_name)
        self.assertEqual(pid, pid2)

        self.waiter.await_state_event(pid, ProcessStateEnum.RUNNING)

        # repeating schedule is harmless
        pid2 = self.pd_cli.schedule_process(self.process_definition_id,
            process_schedule, configuration={}, process_id=pid, name=proc_name)
        self.assertEqual(pid, pid2)

        proc = self.pd_cli.read_process(pid)
        self.assertEqual(proc.process_id, pid)
        self.assertEqual(proc.process_configuration, {})
        self.assertEqual(proc.process_state, ProcessStateEnum.RUNNING)

        self.pd_cli.cancel_process(pid)
        self.waiter.await_state_event(pid, ProcessStateEnum.TERMINATED)

        # repeating cancel is harmless
        self.pd_cli.cancel_process(pid)
        proc = self.pd_cli.read_process(pid)
        self.assertEqual(proc.process_id, pid)
        self.assertEqual(proc.process_configuration, {})
        self.assertEqual(proc.process_state, ProcessStateEnum.TERMINATED)
class HeartbeaterIntTest(IonIntegrationTestCase):
    """Check that a heartbeat receiver can reach the eeagent that sent it."""

    @needs_eeagent
    def setUp(self):
        """Start the container and prepare the eeagent config (agent not yet spawned)."""
        self._start_container()

        self.resource_id = "eeagent_123456789"
        self._eea_name = "eeagent"
        self.persistence_directory = tempfile.mkdtemp()

        launch_type = {
            'name': 'pyon',
            'persistence_directory': self.persistence_directory,
        }
        eeagent_section = {
            'heartbeat': 300,
            'slots': 100,
            'name': 'pyon_eeagent',
            'launch_type': launch_type,
        }
        logging_section = {
            'loggers': {
                'eeagent': {'level': 'DEBUG', 'handlers': ['console']},
            },
            'root': {'handlers': ['console']},
        }
        self.agent_config = {
            'eeagent': eeagent_section,
            'agent': {'resource_id': self.resource_id},
            'logging': logging_section,
        }

    def _start_eeagent(self):
        """Spawn the eeagent from ``self.agent_config`` and create clients for it."""
        node = self.container.node
        container_name = self.container.name
        self.container_client = ContainerAgentClient(node=node,
            name=container_name)
        self.container = self.container_client._get_container_instance()

        spawn_args = dict(
            name=self._eea_name,
            module="ion.agents.cei.execution_engine_agent",
            cls="ExecutionEngineAgent",
            config=self.agent_config,
        )
        self._eea_pid = self.container_client.spawn_process(**spawn_args)
        log.info('Agent pid=%s.', str(self._eea_pid))

        # Start a resource agent client to talk with the instrument agent.
        pyon_client = SimpleResourceAgentClient(self.resource_id,
            process=FakeProcess())
        self._eea_pyon_client = pyon_client
        log.info('Got eea client %s.', str(self._eea_pyon_client))

        self.eea_client = ExecutionEngineAgentClient(self._eea_pyon_client)

    def tearDown(self):
        """Terminate the eeagent and delete its persistence directory."""
        self.container.terminate_process(self._eea_pid)
        shutil.rmtree(self.persistence_directory)

    @needs_eeagent
    @unittest.skipIf(os.getenv('CEI_LAUNCH_TEST', False), 'Skip test while in CEI LAUNCH mode')
    def test_heartbeater(self):
        """test_heartbeater

        Test whether the eeagent waits until the eeagent listener is ready before sending
        a heartbeat to the PD
        """
        # one event per outcome; each is set from the heartbeat callback
        failure_seen = threading.Event()
        success_seen = threading.Event()

        def heartbeat_callback(heartbeat, headers):
            # call back into the agent named in the heartbeat
            sender_id = heartbeat['eeagent_id']
            resource_client = SimpleResourceAgentClient(sender_id,
                name=sender_id, process=FakeProcess())
            client = ExecutionEngineAgentClient(resource_client, timeout=10)

            try:
                client.dump_state()
                success_seen.set()
            except:
                # deliberate catch-all: any failure is reported via the event
                log.exception("Heartbeat Failed!")
                failure_seen.set()

        self.beat_subscriber = HeartbeatSubscriber("heartbeat_queue",
            callback=heartbeat_callback, node=self.container.node)
        self.beat_subscriber.start()

        self._start_eeagent()

        success = success_seen.wait(20)
        if success is not False:
            return

        # no success within the first window: give a failure a chance to show
        died = failure_seen.wait(20)
        assert died is False, "A Hearbeat callback wasn't able to contact the eeagent"
class ExecutionEngineAgentPyonIntTest(IonIntegrationTestCase):
    """Integration tests for an ExecutionEngineAgent using the pyon launch type."""

    # Imported at class scope, so it is a class attribute: methods must
    # reach it through ``self``; a bare name inside a method would not
    # resolve to this import.
    from ion.agents.cei.execution_engine_agent import ExecutionEngineAgentClient

    @needs_eeagent
    def setUp(self):
        """Start the container, deploy r2cei.yml and spawn an eeagent."""
        self._start_container()
        self.container.start_rel_from_url('res/deploy/r2cei.yml')

        self.resource_id = "eeagent_123456789"
        self._eea_name = "eeagent"

        self.persistence_directory = tempfile.mkdtemp()

        self.agent_config = {
            'eeagent': {
                'heartbeat': 0,
                'slots': 100,
                'name': 'pyon_eeagent',
                'launch_type': {
                    'name': 'pyon',
                    'persistence_directory': self.persistence_directory,
                },
            },
            'agent': {
                'resource_id': self.resource_id
            },
            'logging': {
                'loggers': {
                    'eeagent': {
                        'level': 'DEBUG',
                        'handlers': ['console']
                    }
                },
                'root': {
                    'handlers': ['console']
                },
            }
        }

        self._start_eeagent()

    def _start_eeagent(self):
        """Spawn the eeagent from ``self.agent_config`` and build clients for it."""
        self.container_client = ContainerAgentClient(node=self.container.node,
            name=self.container.name)
        self.container = self.container_client._get_container_instance()

        # Start eeagent.
        self._eea_pid = self.container_client.spawn_process(
            name=self._eea_name,
            module="ion.agents.cei.execution_engine_agent",
            cls="ExecutionEngineAgent", config=self.agent_config)
        log.info('Agent pid=%s.', str(self._eea_pid))

        # Start a resource agent client to talk with the instrument agent.
        self._eea_pyon_client = ResourceAgentClient(self.resource_id,
            process=FakeProcess())
        log.info('Got eea client %s.', str(self._eea_pyon_client))

        # use the class-scoped import (see the comment on the import above)
        self.eea_client = self.ExecutionEngineAgentClient(self._eea_pyon_client)

    def tearDown(self):
        """Terminate the eeagent and delete its persistence directory."""
        self.container.terminate_process(self._eea_pid)
        shutil.rmtree(self.persistence_directory)

    def _assert_proc_state(self, u_pid, expected_state):
        """Dump the agent state and assert that ``u_pid`` is in ``expected_state``."""
        state = self.eea_client.dump_state().result
        proc = get_proc_for_upid(state, u_pid)
        self.assertIsNotNone(proc, "There is no state retrieved from eeagent")
        self.assertEqual(proc.get('state'), expected_state)

    @needs_eeagent
    def test_basics(self):
        u_pid = "test0"
        round = 0
        run_type = "pyon"
        proc_name = 'test_x'
        module = 'ion.agents.cei.test.test_eeagent'
        cls = 'TestProcess'
        parameters = {'name': proc_name, 'module': module, 'cls': cls}

        self.eea_client.launch_process(u_pid, round, run_type, parameters)
        self._assert_proc_state(u_pid, [500, 'RUNNING'])

        self.eea_client.terminate_process(u_pid, round)
        # the lookup result is not checked; the calls are kept so the
        # post-terminate dump/lookup path is still exercised
        state = self.eea_client.dump_state().result
        get_proc_for_upid(state, u_pid)

    @needs_eeagent
    def test_kill_and_revive(self):
        """test_kill_and_revive
        Ensure that when an eeagent dies, it pulls the processes it owned from
        persistence, and marks them as failed, so the PD can figure out what to
        do with them
        """
        u_pid = "test0"
        round = 0
        run_type = "pyon"
        proc_name = 'test_transform'
        module = 'ion.agents.cei.test.test_eeagent'
        cls = 'TestProcess'
        parameters = {'name': proc_name, 'module': module, 'cls': cls}

        self.eea_client.launch_process(u_pid, round, run_type, parameters)
        self._assert_proc_state(u_pid, [500, 'RUNNING'])

        # Kill and restart eeagent. Also, kill proc started by eea to simulate
        # a killed container
        old_eea_pid = str(self._eea_pid)
        self.container.terminate_process(self._eea_pid)
        proc_to_kill = self.container.proc_manager.procs_by_name.get(proc_name)
        self.assertIsNotNone(proc_to_kill)
        self.container.terminate_process(proc_to_kill.id)

        self._start_eeagent()

        self.assertNotEqual(old_eea_pid, self._eea_pid)

        self._assert_proc_state(u_pid, [850, 'FAILED'])