예제 #1
0
async def test_controller_spot_termination_handler(ray_fix):
    """Check that a spot-termination notice marks the node and reschedules.

    ``_reschedule_jobs`` and ``Cluster.mark_node_for_termination`` are
    replaced by stubs that record their invocation. The handler is then fed
    a task resolving to ``"some ip"`` and the test verifies that both stubs
    were reached with that value.
    """
    controller = Controller(100, 5)
    job = RayAdaptDLJob(None, 0, 0)
    controller._job = job
    controller.rescheduled = False

    async def mocked_reschedule():
        controller.rescheduled = True

    controller._cluster = Cluster(None, 0)
    controller._reschedule_jobs = mocked_reschedule
    # Hold a strong reference: the event loop only keeps weak references to
    # tasks, so an unreferenced task may be collected while still running.
    listener = asyncio.create_task(controller._reschedule_listener())

    controller._cluster.marked = None

    def mocked_mark_node_for_termination(ip):
        controller._cluster.marked = ip

    controller._cluster.mark_node_for_termination = \
        mocked_mark_node_for_termination

    async def termination_notice():
        return "some ip"

    notice_task = asyncio.create_task(termination_notice())
    await controller._spot_termination_handler(notice_task)

    # Presumably gives the background listener time to react to the
    # reschedule request before the flags are inspected.
    await asyncio.sleep(4)
    # Request cancellation before asserting so the listener is not left
    # pending even when an assertion below fails.
    listener.cancel()

    assert controller.rescheduled
    assert controller._cluster.marked == "some ip"
예제 #2
0
async def test_controller_handle_report():
    """Check that ``_handle_report`` stores the hints and reschedules.

    A stub request returning a JSON-encoded hints document is passed to the
    handler. Afterwards the job's ``_last_metrics`` must decode to the same
    hints, must not be the very same object as the local ``hints`` dict, and
    the stubbed ``_reschedule_jobs`` must have run.
    """
    controller = Controller(100, 1)
    job = RayAdaptDLJob(None, 0, 1)
    controller._job = job
    controller.rescheduled = False

    async def mocked_reschedule():
        controller.rescheduled = True

    controller._reschedule_jobs = mocked_reschedule
    # Hold a strong reference: the event loop only keeps weak references to
    # tasks, so an unreferenced task may be collected while still running.
    listener = asyncio.create_task(controller._reschedule_listener())

    class MockedRequest:
        """Minimal request stub exposing an async ``json()`` accessor."""

        def __init__(self, body):
            self._body = body

        async def json(self):
            return self._body

    hints = {"some": "hints"}
    hints_json = MockedRequest(json.dumps(hints))
    await controller._handle_report(hints_json)

    # Presumably gives the background listener time to react to the
    # reschedule request before the flags are inspected.
    await asyncio.sleep(5)
    # Request cancellation before asserting so the listener is not left
    # pending even when an assertion below fails.
    listener.cancel()

    # Separate asserts so a failure pinpoints the violated expectation.
    assert controller.rescheduled
    assert json.loads(job._last_metrics) == hints
    assert job._last_metrics is not hints
예제 #3
0
async def test_controller_create_job(ray_fix):
    """Check that ``create_job`` wires job and cluster and waits for success.

    ``_reschedule_jobs`` is replaced by a stub that records the call, sets
    the job's ``completed`` event and forces its status to
    ``Status.SUCCEEDED``. The test then verifies the parameters stored on
    the created job and cluster.
    """
    controller = Controller(100, 5)
    controller._ready.set()
    controller.rescheduled = False

    async def mocked_reschedule():
        controller.rescheduled = True
        controller._job.completed.set()
        controller._job._status = Status.SUCCEEDED

    controller._reschedule_jobs = mocked_reschedule

    resources = {"CPU": 1, "GPU": 2}
    # Hold a strong reference: the event loop only keeps weak references to
    # tasks, so an unreferenced task may be collected while still running.
    listener = asyncio.create_task(controller._reschedule_listener())
    await controller.create_job(
        worker_resources=resources,
        worker_port_offset=0,
        checkpoint_timeout=1)
    # Request cancellation before asserting so the listener is not left
    # pending even when an assertion below fails.
    listener.cancel()

    assert controller._job._worker_resources == resources
    assert controller._job._worker_port_offset == 0
    assert controller._job._checkpoint_timeout == 1
    assert controller._cluster._worker_resources == resources
    assert controller.rescheduled
    assert controller._job._status == Status.SUCCEEDED
예제 #4
0
def run_adaptdl_on_ray_cluster(path, argv, ray_uri, working_dir,
                               worker_resources, cluster_size,
                               worker_port_offset, checkpoint_timeout,
                               rescale_timeout):
    """Connect to Ray, start the AdaptDL controller and run a single job.

    Args:
        path: Script path relative to ``working_dir``; it must exist locally
            and is forwarded to ``Controller.create_job``.
        argv: Forwarded to ``Controller.create_job``.
        ray_uri: Address passed to ``ray.init``.
        working_dir: Local directory that must exist; used as the
            ``working_dir`` entry of the Ray runtime environment.
        worker_resources: Forwarded to ``Controller.create_job``.
        cluster_size: Forwarded to the ``Controller`` constructor.
        worker_port_offset: Forwarded to ``Controller.create_job``.
        checkpoint_timeout: Forwarded to ``Controller.create_job``.
        rescale_timeout: Forwarded to the ``Controller`` constructor.

    Returns:
        ``0`` when the job finishes with ``Status.SUCCEEDED``; ``None`` when
        Ray is already initialized, in which case nothing is started.

    Raises:
        RuntimeError: If ``working_dir`` or the script inside it does not
            exist, or if the job ends with any status other than
            ``Status.SUCCEEDED``.
    """
    LOG.info("Starting AdaptDLJob")
    if ray.is_initialized():
        return None
    if not os.path.exists(working_dir):
        raise RuntimeError(f"Cannot find local directory {working_dir}")
    script_path = os.path.join(working_dir, path)
    if not os.path.exists(script_path):
        raise RuntimeError(f"Cannot find local file {script_path}")
    ray.init(ray_uri, runtime_env={"working_dir": working_dir})

    controller = Controller.options(name="AdaptDLController").remote(
        cluster_size, rescale_timeout)

    controller.run_controller.remote()
    status_obj = controller.create_job.remote(worker_resources,
                                              worker_port_offset,
                                              checkpoint_timeout,
                                              path=path,
                                              argv=argv)
    # Errors from the remote call propagate unchanged; the former
    # ``except Exception as e: raise e`` wrapper added nothing.
    status = ray.get(status_obj)
    # Compare by value, as the original did.
    if status.value != Status.SUCCEEDED.value:
        raise RuntimeError("Job failed")
    LOG.info("Job succeeded")
    return 0
예제 #5
0
async def test_controller_register_status():
    """Check that registering a status updates the job and its event.

    ``RUNNING`` must leave ``completed`` unset, while ``SUCCEEDED`` must set
    it.
    """
    controller = Controller(100, 5)
    job = RayAdaptDLJob(None, 0, 0)
    controller._job = job

    await controller.register_status(Status.RUNNING.value)
    assert job._status == Status.RUNNING
    assert not job.completed.is_set()

    await controller.register_status(Status.SUCCEEDED.value)
    assert job._status == Status.SUCCEEDED
    assert job.completed.is_set()
예제 #6
0
async def test_controller_register_checkpoint(ray_fix):
    """Check that a registered checkpoint is recorded on the job.

    After registration the job must flag the checkpoint as received, keep
    the payload in ``_checkpoint`` and expose it through
    ``ray.get(job._checkpoint_ref)``.
    """
    controller = Controller(100, 5)
    job = RayAdaptDLJob(None, 0, 0)
    controller._job = job

    payload = "foo"
    acknowledged = await controller.register_checkpoint(payload)

    assert acknowledged
    assert job._checkpoint_received
    assert job._checkpoint == "foo"
    assert ray.get(job._checkpoint_ref) == "foo"
예제 #7
0
async def test_controller_reschedule_jobs(ray_fix):
    """Exercise ``_reschedule_jobs`` with three overlapping invocations.

    The job's ``update_workers``, the controller's
    ``_handle_worker_failure`` and the cluster's ``expand_cluster`` are all
    replaced by recording stubs. Three calls to ``_reschedule_jobs`` are
    started 0, 1 and 2 seconds apart and must finish within 15 seconds.
    The test then expects the ``update_workers`` stub to have run three
    times, and the worker-failure stub and the cluster expansion to have
    recorded exactly the single default allocation.
    """
    controller = Controller(100, 5)
    job = RayAdaptDLJob({"CPU": 1}, 0, 0)
    controller._job = job
    job.forced_checkpoint = False
    # Counters/recorders attached ad hoc so the stubs below can report back.
    job.updated = 0
    controller.handled_workers = []

    async def mocked_handle_worker_failure(tasks):
        # Accumulate whatever the controller hands over across all calls.
        controller.handled_workers += tasks

    async def mocked_update_workers(allocation):
        # The 3 s delay is longer than the 1 s stagger between calls, so the
        # stubbed updates overlap in time if the controller does not
        # serialize them (NOTE(review): confirm against _reschedule_jobs).
        await asyncio.sleep(3)
        job.updated += 1
        # Only report an allocation when it actually changed.
        if job._workers != allocation:
            job._workers = allocation
            return allocation
        return None

    controller._handle_worker_failure = mocked_handle_worker_failure
    job.update_workers = mocked_update_workers

    controller._cluster = Cluster(None, 0)
    controller._cluster.expanded = None

    async def mocked_expand_cluster(workers, allocation):
        # Record the last requested allocation and echo it back.
        controller._cluster.expanded = allocation
        return allocation

    controller._cluster.expand_cluster = mocked_expand_cluster

    async def wrapped_call(duration):
        # Delay the reschedule request by ``duration`` seconds.
        await asyncio.sleep(duration)
        await controller._reschedule_jobs()

    # Run the three staggered requests concurrently; fail if they take
    # longer than 15 seconds in total.
    await asyncio.wait_for(
        asyncio.gather(
            wrapped_call(0), wrapped_call(1), wrapped_call(2)),
        15)

    # Presumably lets any remaining background work settle before checking.
    await asyncio.sleep(4)
    assert job.updated == 3

    # Default allocation
    assert controller.handled_workers == ['adaptdl_virtual_node_0']
    assert controller._cluster.expanded == ['adaptdl_virtual_node_0']
예제 #8
0
async def test_controller_run(ray_fix):
    """Check that ``run_controller`` runs the app and cleans up its runner.

    ``_run_app`` is replaced by a stub that sets the controller's
    ``_completed`` event, installs a fake runner and records that it ran.
    """
    controller = Controller(100, 5)
    controller.app_ran = False

    class MockedRunner:
        """Runner stub that only remembers whether ``cleanup`` was awaited."""

        def __init__(self):
            self.cleaned_up = False

        async def cleanup(self):
            self.cleaned_up = True

    async def mocked_run_app():
        controller._completed.set()
        controller._runner = MockedRunner()
        controller.app_ran = True

    controller._run_app = mocked_run_app

    await controller.run_controller()

    runner = controller._runner
    assert runner.cleaned_up
    assert controller.app_ran
    await controller._runner.cleanup()
예제 #9
0
async def test_controller_run_app(aiohttp_client):
    """Check that the app started by ``_run_app`` routes to both handlers.

    The report and discover handlers are replaced by stubs returning fixed
    responses. A PUT to ``/hints/namespace/name`` and a GET to
    ``/discover/namespace/name/group`` must reach them.
    """
    controller = Controller(4, 100)
    controller.called_report = False
    controller.called_discover = False

    async def mocked_report(request):
        controller.called_report = True
        return web.Response(text="Success Hints")

    async def mocked_discover(request):
        controller.called_discover = True
        return web.Response(text="Success Discover")

    async def delayed_request(method, url):
        # Wait, build a fresh client for the controller's app, then issue
        # the request with the given HTTP method.
        await asyncio.sleep(5)
        client = await aiohttp_client(controller._runner.app())
        send = getattr(client, method)
        return await send(url)

    controller._handle_discover = mocked_discover
    controller._handle_report = mocked_report
    await controller._run_app()

    put_response = await delayed_request("put", "/hints/namespace/name")
    get_response = await delayed_request(
        "get", "/discover/namespace/name/group")

    put_text = await put_response.text()
    assert put_text == "Success Hints"
    get_text = await get_response.text()
    assert get_text == "Success Discover"

    assert controller.called_report
    assert controller.called_discover
    await controller._runner.cleanup()
예제 #10
0
async def test_controller_register_worker(ray_fix):
    """Check that registered workers are recorded by rank on the job.

    Rank 0 registers with ``"some-ip"``, which is already present in
    ``_spot_listener_tasks``, and rank 1 registers with the local node IP.
    The stubbed spot-termination handler stores the result of the task it
    is given, which the test expects to be ``"a different ip"``.
    """
    controller = Controller(100, 5)
    job = RayAdaptDLJob(None, 0, 0)
    controller._job = job
    controller._spot_listener_tasks = {"some-ip": 1}
    controller.task_result = None

    async def mocked_spot_termination_handler(task):
        controller.task_result = ray.get(task)

    controller._spot_termination_handler = mocked_spot_termination_handler

    local_ip = ray._private.services.get_node_ip_address()

    await controller.register_worker(0, "some-ip")
    await controller.register_worker(
        1, ray._private.services.get_node_ip_address())

    # Presumably lets the handler scheduled during registration run.
    await asyncio.sleep(1)

    registered = job._workers
    assert registered[0] == "some-ip"
    assert registered[1] == local_ip
    assert controller.task_result == "a different ip"
예제 #11
0
async def test_controller_handle_discover():
    """Check that ``_handle_discover`` returns worker IPs as a JSON list."""
    controller = Controller(4, 100)
    worker_ips = {0: "127.0.0.1", 1: "127.0.0.2", 2: "0.0.0.0"}
    controller._job = MockedJob(workers=worker_ips)

    response = await controller._handle_discover(None)

    expected = ["127.0.0.1", "127.0.0.2", "0.0.0.0"]
    assert json.loads(response.text) == expected