async def test_controller_spot_termination_handler(ray_fix):
    """Check that a spot-termination notice marks the node and reschedules.

    ``_reschedule_jobs`` and the cluster's ``mark_node_for_termination`` are
    replaced with recorders, then ``_spot_termination_handler`` is awaited
    with a task that resolves to ``"some ip"``.
    """
    controller = Controller(100, 5)
    controller._job = RayAdaptDLJob(None, 0, 0)
    controller.rescheduled = False

    async def mocked_reschedule():
        controller.rescheduled = True

    controller._cluster = Cluster(None, 0)
    controller._reschedule_jobs = mocked_reschedule
    # Keep a reference: the event loop only holds weak references to tasks,
    # so an unreferenced task may be garbage-collected before it is done.
    listener = asyncio.create_task(controller._reschedule_listener())
    try:
        controller._cluster.marked = None

        def mocked_mark_node_for_termination(ip):
            controller._cluster.marked = ip

        controller._cluster.mark_node_for_termination = \
            mocked_mark_node_for_termination

        async def task():
            return "some ip"

        async def wrapper():
            awaitable_task = asyncio.create_task(task())
            await controller._spot_termination_handler(awaitable_task)

        await wrapper()
        # Give the background listener time to call the mocked reschedule.
        await asyncio.sleep(4)
        assert controller.rescheduled
        assert controller._cluster.marked == "some ip"
    finally:
        # Do not leave the listener pending once the test is over.
        listener.cancel()
async def test_controller_handle_report():
    """Check that ``_handle_report`` stores the hints and reschedules.

    A minimal request stand-in whose ``json()`` coroutine returns the
    JSON-encoded hints is passed to the handler; ``_reschedule_jobs`` is
    replaced with a recorder.
    """
    controller = Controller(100, 1)
    job = RayAdaptDLJob(None, 0, 1)
    controller._job = job
    controller.rescheduled = False

    async def mocked_reschedule():
        controller.rescheduled = True

    controller._reschedule_jobs = mocked_reschedule
    # Keep a reference: the event loop only holds weak references to tasks,
    # so an unreferenced task may be garbage-collected before it is done.
    listener = asyncio.create_task(controller._reschedule_listener())

    class MockedRequest:
        def __init__(self, body):
            self._body = body

        async def json(self):
            return self._body

    try:
        hints = {"some": "hints"}
        hints_json = MockedRequest(json.dumps(hints))
        await controller._handle_report(hints_json)
        # Give the background listener time to call the mocked reschedule.
        await asyncio.sleep(5)
        # Separate asserts so a failure points at the exact broken condition.
        assert controller.rescheduled
        assert json.loads(job._last_metrics) == hints
        assert job._last_metrics is not hints
    finally:
        # Do not leave the listener pending once the test is over.
        listener.cancel()
async def test_controller_create_job(ray_fix):
    """Check that ``create_job`` records its arguments on job and cluster.

    ``_reschedule_jobs`` is replaced with a stand-in that flags the call,
    sets the job's ``completed`` event and marks it ``SUCCEEDED``.
    """
    controller = Controller(100, 5)
    controller._ready.set()
    controller.rescheduled = False

    async def mocked_reschedule():
        controller.rescheduled = True
        controller._job.completed.set()
        controller._job._status = Status.SUCCEEDED

    controller._reschedule_jobs = mocked_reschedule
    resources = {"CPU": 1, "GPU": 2}
    # Keep a reference: the event loop only holds weak references to tasks,
    # so an unreferenced task may be garbage-collected before it is done.
    listener = asyncio.create_task(controller._reschedule_listener())
    try:
        await controller.create_job(
            worker_resources=resources,
            worker_port_offset=0,
            checkpoint_timeout=1)
        assert controller._job._worker_resources == resources
        assert controller._job._worker_port_offset == 0
        assert controller._job._checkpoint_timeout == 1
        assert controller._cluster._worker_resources == resources
        assert controller.rescheduled
        assert controller._job._status == Status.SUCCEEDED
    finally:
        # Do not leave the listener pending once the test is over.
        listener.cancel()
def run_adaptdl_on_ray_cluster(path, argv, ray_uri, working_dir,
                               worker_resources, cluster_size,
                               worker_port_offset, checkpoint_timeout,
                               rescale_timeout):
    """Start an ``AdaptDLController`` actor and run one job to completion.

    Args:
        path: Script location relative to ``working_dir``; forwarded to
            ``create_job``.
        argv: Forwarded to ``create_job``.
        ray_uri: Address passed to ``ray.init``.
        working_dir: Local directory used as the Ray runtime
            ``working_dir``; must exist.
        worker_resources: Forwarded to ``create_job``.
        cluster_size: First argument of the ``Controller`` actor.
        worker_port_offset: Forwarded to ``create_job``.
        checkpoint_timeout: Forwarded to ``create_job``.
        rescale_timeout: Second argument of the ``Controller`` actor.

    Returns:
        ``0`` when the job status is ``SUCCEEDED``. ``None`` when Ray is
        already initialized, in which case nothing is started.

    Raises:
        RuntimeError: If ``working_dir`` or the script inside it does not
            exist, or if the job ends in any status other than
            ``SUCCEEDED``.
    """
    LOG.info("Starting AdaptDLJob")
    if ray.is_initialized():
        return

    if not os.path.exists(working_dir):
        raise RuntimeError(f"Cannot find local directory {working_dir}")
    script_path = os.path.join(working_dir, path)
    if not os.path.exists(script_path):
        raise RuntimeError(f"Cannot find local file {script_path}")

    ray.init(ray_uri, runtime_env={"working_dir": working_dir})

    controller = Controller.options(name="AdaptDLController").remote(
        cluster_size, rescale_timeout)
    controller.run_controller.remote()

    # Errors from the remote call propagate unchanged to the caller; a
    # catch-and-re-raise wrapper would add nothing.
    status_obj = controller.create_job.remote(
        worker_resources, worker_port_offset, checkpoint_timeout,
        path=path, argv=argv)
    status = ray.get(status_obj)
    if status.value != Status.SUCCEEDED.value:
        raise RuntimeError("Job failed")
    LOG.info("Job succeeded")
    return 0
async def test_controller_register_status():
    """Check how ``register_status`` updates the job's status and event.

    ``RUNNING`` must leave ``completed`` unset; ``SUCCEEDED`` must set it.
    """
    controller = Controller(100, 5)
    job = RayAdaptDLJob(None, 0, 0)
    controller._job = job

    await controller.register_status(Status.RUNNING.value)
    assert job._status == Status.RUNNING
    assert not job.completed.is_set()

    await controller.register_status(Status.SUCCEEDED.value)
    assert job._status == Status.SUCCEEDED
    assert job.completed.is_set()
async def test_controller_register_checkpoint(ray_fix):
    """Check that ``register_checkpoint`` stores the checkpoint on the job.

    The value must be acknowledged, kept on the job as ``_checkpoint`` and
    be retrievable through ``ray.get`` from ``_checkpoint_ref``.
    """
    payload = "foo"
    controller = Controller(100, 5)
    job = RayAdaptDLJob(None, 0, 0)
    controller._job = job

    acknowledged = await controller.register_checkpoint(payload)

    assert acknowledged
    assert job._checkpoint_received
    assert job._checkpoint == payload
    assert ray.get(job._checkpoint_ref) == payload
async def test_controller_reschedule_jobs(ray_fix):
    """Run three staggered ``_reschedule_jobs`` calls against mocked parts.

    The job's ``update_workers``, the controller's
    ``_handle_worker_failure`` and the cluster's ``expand_cluster`` are
    swapped for recorders so the test can count the updates and inspect
    the allocation that was handed around.
    """
    controller = Controller(100, 5)
    job = RayAdaptDLJob({"CPU": 1}, 0, 0)
    controller._job = job
    job.forced_checkpoint = False
    job.updated = 0
    controller.handled_workers = []

    async def fake_handle_worker_failure(tasks):
        controller.handled_workers.extend(tasks)

    async def fake_update_workers(allocation):
        # Takes 3 s, longer than the 1 s spacing between the calls below.
        await asyncio.sleep(3)
        job.updated += 1
        if job._workers != allocation:
            job._workers = allocation
            return allocation
        return None

    controller._handle_worker_failure = fake_handle_worker_failure
    job.update_workers = fake_update_workers

    cluster = Cluster(None, 0)
    controller._cluster = cluster
    cluster.expanded = None

    async def fake_expand_cluster(workers, allocation):
        cluster.expanded = allocation
        return allocation

    cluster.expand_cluster = fake_expand_cluster

    async def delayed_reschedule(delay):
        await asyncio.sleep(delay)
        await controller._reschedule_jobs()

    staggered = asyncio.gather(
        delayed_reschedule(0),
        delayed_reschedule(1),
        delayed_reschedule(2))
    await asyncio.wait_for(staggered, 15)
    await asyncio.sleep(4)

    assert job.updated == 3
    # Default allocation
    assert controller.handled_workers == ['adaptdl_virtual_node_0']
    assert controller._cluster.expanded == ['adaptdl_virtual_node_0']
async def test_controller_run(ray_fix):
    """Check that ``run_controller`` runs the app and cleans up the runner.

    ``_run_app`` is replaced with a stand-in that sets ``_completed``,
    installs a fake runner and flags that it was called.
    """
    controller = Controller(100, 5)
    controller.app_ran = False

    class FakeRunner:
        """Runner stand-in that only remembers whether cleanup ran."""

        def __init__(self):
            self.cleaned_up = False

        async def cleanup(self):
            self.cleaned_up = True

    async def fake_run_app():
        controller._completed.set()
        controller._runner = FakeRunner()
        controller.app_ran = True

    controller._run_app = fake_run_app

    await controller.run_controller()

    assert controller._runner.cleaned_up
    assert controller.app_ran
    await controller._runner.cleanup()
async def test_controller_run_app(aiohttp_client):
    """Check that ``_run_app`` routes hint and discover requests.

    Both handlers are replaced with recorders that return a fixed text
    response; a PUT to ``/hints/...`` and a GET to ``/discover/...`` must
    reach them.
    """
    controller = Controller(4, 100)
    controller.called_report = False
    controller.called_discover = False

    async def fake_report(request):
        controller.called_report = True
        return web.Response(text="Success Hints")

    async def fake_discover(request):
        controller.called_discover = True
        return web.Response(text="Success Discover")

    async def send_put():
        await asyncio.sleep(5)
        client = await aiohttp_client(controller._runner.app())
        return await client.put("/hints/namespace/name")

    async def send_get():
        await asyncio.sleep(5)
        client = await aiohttp_client(controller._runner.app())
        return await client.get("/discover/namespace/name/group")

    controller._handle_discover = fake_discover
    controller._handle_report = fake_report
    await controller._run_app()

    put_response = await send_put()
    get_response = await send_get()

    put_text = await put_response.text()
    assert put_text == "Success Hints"
    get_text = await get_response.text()
    assert get_text == "Success Discover"

    assert controller.called_report
    assert controller.called_discover
    await controller._runner.cleanup()
async def test_controller_register_worker(ray_fix):
    """Check that ``register_worker`` records worker addresses by rank.

    ``_spot_termination_handler`` is replaced with a stand-in that stores
    the result of the task it receives.
    """
    controller = Controller(100, 5)
    job = RayAdaptDLJob(None, 0, 0)
    controller._job = job
    controller._spot_listener_tasks = {"some-ip": 1}
    controller.task_result = None

    async def fake_spot_termination_handler(task):
        controller.task_result = ray.get(task)

    controller._spot_termination_handler = fake_spot_termination_handler

    local_ip = ray._private.services.get_node_ip_address()
    await controller.register_worker(0, "some-ip")
    await controller.register_worker(
        1, ray._private.services.get_node_ip_address())
    await asyncio.sleep(1)

    assert job._workers[0] == "some-ip"
    assert job._workers[1] == local_ip
    # NOTE(review): the expected value presumably comes from the ``ray_fix``
    # fixture - confirm.
    assert controller.task_result == "a different ip"
async def test_controller_handle_discover():
    """Check that ``_handle_discover`` returns worker addresses as JSON.

    The response text must decode to the addresses listed in rank order.
    """
    addresses = ["127.0.0.1", "127.0.0.2", "0.0.0.0"]
    controller = Controller(4, 100)
    controller._job = MockedJob(workers=dict(enumerate(addresses)))

    response = await controller._handle_discover(None)

    assert json.loads(response.text) == addresses