Example #1
0
async def test_cancel_subtask(actor_pool):
    """Check that freeing a slot stops the subtask that is running in it.

    Two scenarios are exercised through ``manager``:

    1. A 2 sec sleep is started and the slot is freed with ``timeout=5``;
       ``free_slot`` must return in less than 5 sec.
    2. A 100 sec sleep is started and the slot is freed with ``timeout=1``;
       the slot must be reported busy while the release is pending, and the
       whole release must take less than 5 sec.

    In both scenarios the slot must be reported free afterwards.
    """
    pool, session_id, meta_api, storage_api, manager = actor_pool

    def sleep(timeout: int):
        time.sleep(timeout)
        return timeout

    # The event loop only keeps weak references to tasks, so the
    # fire-and-forget tasks are stored here to prevent them from being
    # garbage-collected while they are still running.
    background_tasks = []

    a = mr.spawn(sleep, 2)

    subtask = _gen_subtask(a, session_id)
    subtask_runner: SubtaskRunnerActor = await manager.get_free_slot()
    background_tasks.append(
        asyncio.create_task(subtask_runner.run_subtask(subtask)))
    # give the subtask some time to actually start
    await asyncio.sleep(0.2)
    with Timer() as timer:
        # normal cancel by cancel asyncio Task
        await manager.free_slot(subtask_runner, timeout=5)
    # do not need to wait 5 sec
    assert timer.duration < 5
    assert await manager.is_slot_free(subtask_runner) is True

    b = mr.spawn(sleep, 100)

    subtask2 = _gen_subtask(b, session_id)
    subtask_runner: SubtaskRunnerActor = await manager.get_free_slot()
    background_tasks.append(
        asyncio.create_task(subtask_runner.run_subtask(subtask2)))
    await asyncio.sleep(0.2)
    with Timer() as timer:
        # release with a short timeout; the release runs in the background
        # so that the slot state can be inspected while it is pending
        aio_task = asyncio.create_task(manager.free_slot(subtask_runner, timeout=1))
        assert await manager.is_slot_free(subtask_runner) is False
        await aio_task
    # need 1 sec to reach timeout, then killing actor and wait for auto recovering
    # the time would not be over 5 sec
    assert timer.duration < 5
    assert await manager.is_slot_free(subtask_runner) is True
Example #2
0
async def test_cancel_subtask(actor_pool):
    """Check cancellation of subtasks on a single subtask runner actor.

    The runner is looked up with ``SubtaskRunnerActor.gen_uid('numa-0', 0)``
    on the pool's external address. The test then:

    1. starts a 100 sec sleep and requires ``cancel_subtask`` to exceed a
       1 sec ``wait_for`` timeout while the runner reports itself as busy;
    2. kills the runner actor and polls until it answers as free again;
    3. starts a 2 sec sleep and requires ``cancel_subtask`` to finish
       within the 6 sec ``wait_for`` timeout, leaving the runner free.
    """
    pool, session_id, meta_api, storage_api, manager = actor_pool
    subtask_runner: SubtaskRunnerRef = await mo.actor_ref(
        SubtaskRunnerActor.gen_uid('numa-0', 0), address=pool.external_address)

    def sleep(timeout: int):
        time.sleep(timeout)
        return timeout

    # The event loop only keeps weak references to tasks, so the
    # fire-and-forget tasks are stored here to prevent them from being
    # garbage-collected while they are still running.
    background_tasks = []

    b = mr.spawn(sleep, 100)

    subtask = _gen_subtask(b, session_id)
    background_tasks.append(
        asyncio.create_task(subtask_runner.run_subtask(subtask)))
    # give the subtask some time to actually start
    await asyncio.sleep(0.2)
    with Timer() as timer:
        # the cancel is not expected to complete within the 1 sec timeout
        aio_task = asyncio.create_task(asyncio.wait_for(
            subtask_runner.cancel_subtask(), timeout=1))
        assert await subtask_runner.is_runner_free() is False
        with pytest.raises(asyncio.TimeoutError):
            await aio_task
    # need 1 sec to reach timeout,
    # the time would not be over 5 sec
    assert timer.duration < 5

    async def wait_slot_restore():
        # Poll until the runner answers. Connection-level errors are retried
        # every 0.5 sec; an answer other than True fails the test at once.
        while True:
            try:
                assert await subtask_runner.is_runner_free() is True
            except (mo.ServerClosed, ConnectionRefusedError, mo.ActorNotExist):
                await asyncio.sleep(0.5)
            else:
                break

    # kill the actor and wait until it is reachable and free again
    await mo.kill_actor(subtask_runner)
    await wait_slot_restore()

    a = mr.spawn(sleep, 2)

    subtask2 = _gen_subtask(a, session_id)
    background_tasks.append(
        asyncio.create_task(subtask_runner.run_subtask(subtask2)))
    await asyncio.sleep(0.2)
    with Timer() as timer:
        # normal cancel by cancel asyncio Task
        await asyncio.wait_for(subtask_runner.cancel_subtask(), timeout=6)
    # do not need to wait 10 sec
    assert timer.duration < 10
    assert await subtask_runner.is_runner_free() is True
Example #3
0
async def test_cancel_task(actor_pool):
    """Cancel a task of ten long-sleeping remote calls and check cleanup.

    After submission the task is cancelled; its result must carry the
    ``terminated`` status in under 20 sec. Once the local tileables are
    dropped, the reference counts reported by ``lifecycle_api`` for their
    keys must all be zero.
    """
    pool, session_id, meta_api, lifecycle_api, storage_api, manager = actor_pool

    def func():
        time.sleep(200)

    # Comprehensions are used on purpose: a plain ``for`` loop would leave
    # its loop variable bound to the last tileable after ``del`` below.
    remotes = [mr.spawn(func) for _ in range(10)]

    tileable_graph = TileableGraph([remote.data for remote in remotes])
    next(TileableGraphBuilder(tileable_graph).build())

    task_id = await manager.submit_tileable_graph(
        tileable_graph, fuse_enabled=False)
    assert isinstance(task_id, str)

    # let the task start running before cancelling it
    await asyncio.sleep(.5)

    with Timer() as cancel_timer:
        await manager.cancel_task(task_id)
        task_result = await manager.get_task_result(task_id)
        assert task_result.status == TaskStatus.terminated

    assert cancel_timer.duration < 20

    tileable_keys = [remote.key for remote in remotes]
    del remotes
    gc.collect()
    await asyncio.sleep(0.5)

    # test ref counts
    expected_counts = [0] * len(tileable_keys)
    actual_counts = await lifecycle_api.get_tileable_ref_counts(tileable_keys)
    assert actual_counts == expected_counts
Example #4
0
async def test_supervisor_peer_locator(actor_pool, temp_address_file):
    """Exercise ``SupervisorPeerLocatorActor`` backed by an address file.

    The test writes supervisor addresses into ``temp_address_file`` and
    checks that:

    * the node info collector reports exactly the addresses in the file;
    * ``watch_supervisors_by_keys`` returns an address from the current
      file content, also after the file has been shrunk;
    * ``wait_all_supervisors_ready`` takes more than 0.4 sec while a
      background coroutine rewrites the file over roughly 0.5 sec.
    """
    addresses = [
        '1.2.3.4:1234', '1.2.3.4:1235', '1.2.3.4:1236', '1.2.3.4:1237'
    ]
    with open(temp_address_file, 'w') as file_obj:
        file_obj.write('\n'.join(addresses))

    locator_ref = await mo.create_actor(
        SupervisorPeerLocatorActor,
        'test',
        temp_address_file,
        uid=SupervisorPeerLocatorActor.default_uid(),
        address=actor_pool.external_address)

    # test starting nodes filled
    info_ref = await mo.actor_ref(uid=NodeInfoCollectorActor.default_uid(),
                                  address=actor_pool.external_address)
    assert set(await info_ref.get_nodes_info()) == set(addresses)

    # test watch nodes changes
    version, result = await locator_ref.watch_supervisors_by_keys(
        ['mock_name'])
    assert result[0] in addresses

    with open(temp_address_file, 'w') as file_obj:
        file_obj.write('\n'.join(addresses[2:]))

    version, result = await locator_ref.watch_supervisors_by_keys(
        ['mock_name'], version=version)
    assert result[0] in addresses[2:]

    # test wait all supervisors ready
    # each line is "<address>,<0|1>"; presumably the second field is a
    # readiness flag -- TODO confirm against the locator implementation
    with open(temp_address_file, 'w') as file_obj:
        file_obj.write('\n'.join(f'{a},{idx % 2}'
                                 for idx, a in enumerate(addresses)))

    async def delay_read_fun():
        # after 0.2 sec flip the second field of every line ...
        await asyncio.sleep(0.2)
        with open(temp_address_file, 'w') as file_obj:
            file_obj.write('\n'.join(f'{a},{(idx + 1) % 2}'
                                     for idx, a in enumerate(addresses)))
        # ... and 0.3 sec later write the plain address list
        await asyncio.sleep(0.3)
        with open(temp_address_file, 'w') as file_obj:
            file_obj.write('\n'.join(addresses))

    # Keep a reference to the task: the event loop only holds weak
    # references, so an unreferenced task may be garbage-collected.
    writer_task = asyncio.create_task(delay_read_fun())

    with Timer() as timer:
        await locator_ref.wait_all_supervisors_ready()
    assert timer.duration > 0.4

    # Make sure the background writer has finished, and surface any error
    # it raised, before the actor is torn down.
    await writer_task

    await mo.destroy_actor(locator_ref)
Example #5
0
async def test_changing_locator(actor_pool):
    """Exercise the watch APIs of a ``SupervisorLocatorActor``.

    The actor is created with the ``'fixed'`` backend and a comma-joined
    address list. Two consecutive ``watch_supervisors_by_keys`` calls must
    each yield a first element contained in that list, every item returned
    by ``watch_supervisors`` must be in the list as well, and
    ``wait_all_supervisors_ready`` must take more than 0.1 sec.
    """
    addresses = [
        '1.2.3.4:1234',
        '1.2.3.4:1235',
        '1.2.3.4:1236',
        '1.2.3.4:1237',
    ]
    lookup_address = ','.join(addresses)
    locator_ref = await mo.create_actor(
        SupervisorLocatorActor, 'fixed', lookup_address,
        address=actor_pool.external_address)

    # the same query is issued twice in a row
    for _ in range(2):
        watched = await locator_ref.watch_supervisors_by_keys(['mock_name'])
        assert watched[0] in addresses

    all_watched = await locator_ref.watch_supervisors()
    for addr in all_watched:
        assert addr in addresses

    with Timer() as ready_timer:
        await locator_ref.wait_all_supervisors_ready()
    assert ready_timer.duration > 0.1

    await mo.destroy_actor(locator_ref)
Example #6
0
async def test_fixed_locator(actor_pool):
    """Exercise ``get_supervisor`` on a ``'fixed'`` supervisor locator.

    A single lookup must return one of the configured addresses, a lookup
    asking for 2 results must return two items that are all configured
    addresses, and ``wait_all_supervisors_ready`` must return in less than
    0.1 sec.
    """
    addresses = [
        '1.2.3.4:1234',
        '1.2.3.4:1235',
        '1.2.3.4:1236',
        '1.2.3.4:1237',
    ]
    lookup_address = ','.join(addresses)
    locator_ref = await mo.create_actor(
        SupervisorLocatorActor, 'fixed', lookup_address,
        address=actor_pool.external_address)

    single_addr = await locator_ref.get_supervisor('mock_name')
    assert single_addr in addresses

    dbl_addrs = await locator_ref.get_supervisor('mock_name', 2)
    assert len(dbl_addrs) == 2
    for addr in dbl_addrs:
        assert addr in addresses

    with Timer() as ready_timer:
        await locator_ref.wait_all_supervisors_ready()
    assert ready_timer.duration < 0.1

    await mo.destroy_actor(locator_ref)
Example #7
0
def test_merge_index_value():
    """Check ``merge_index_value`` on large range indexes, within 1 sec.

    Two inputs of 20 parsed ``RangeIndex`` values are merged:

    * 20 copies of the same range ``[0, 1e7)``: the merged value converts
      to an empty int64 pandas index with min 0 and max ``1e7 - 1``;
    * 20 back-to-back ranges of length 1e7: the merged value converts to
      ``RangeIndex(1e7 * 20)`` with min 0 and max ``1e7 * 20 - 1``.
    """
    n_parts = 20
    part_len = 1e7

    with Timer() as timer:
        # every part covers the same range
        same_ranges = {}
        for part in range(n_parts):
            same_ranges[part] = parse_index(pd.RangeIndex(part_len))
        merged = merge_index_value(same_ranges)
        pd.testing.assert_index_equal(
            merged.to_pandas(), pd.Index([], dtype=np.int64))
        assert merged.min_val == 0
        assert merged.max_val == part_len - 1

        # range indexes that are continuous
        consecutive_ranges = {}
        for part in range(n_parts):
            start = part * part_len
            stop = (part + 1) * part_len
            consecutive_ranges[part] = parse_index(pd.RangeIndex(start, stop))
        merged = merge_index_value(consecutive_ranges)
        pd.testing.assert_index_equal(
            merged.to_pandas(), pd.RangeIndex(part_len * n_parts))
        assert merged.min_val == 0
        assert merged.max_val == part_len * n_parts - 1

    assert timer.duration < 1
Example #8
0
async def test_cancel_task(actor_pool):
    """Cancel a task of ten 20 sec remote calls shortly after submission.

    The task result fetched after ``cancel_task`` must carry the
    ``terminated`` status, and cancelling plus fetching the result must
    take less than 15 sec.
    """
    pool, session_id, meta_api, storage_api, manager = actor_pool

    def func():
        time.sleep(20)

    remotes = [mr.spawn(func) for _ in range(10)]

    tileable_graph = TileableGraph([remote.data for remote in remotes])
    next(TileableGraphBuilder(tileable_graph).build())

    task_id = await manager.submit_tileable_graph(
        tileable_graph, fuse_enabled=False)
    assert isinstance(task_id, str)

    # let the task start running before cancelling it
    await asyncio.sleep(.5)

    with Timer() as cancel_timer:
        await manager.cancel_task(task_id)
        task_result = await manager.get_task_result(task_id)
        assert task_result.status == TaskStatus.terminated

    assert cancel_timer.duration < 15
Example #9
0
async def test_task_cancel(start_test_service):
    """Cancel a running task through the task API and inspect its state.

    Ten remote calls sleeping 100 sec are submitted as one task and
    cancelled after 0.5 sec. The task result must be ``terminated`` in
    under 20 sec, a last idle time must be reported afterwards, and every
    entry returned by ``get_task_results(progress=True)`` must be
    ``terminated``.
    """
    _sv_pool_address, task_api, _storage_api = start_test_service

    # test job cancel
    def f1():
        time.sleep(100)

    remotes = [mr.spawn(f1) for _ in range(10)]

    tileable_graph = TileableGraph([remote.data for remote in remotes])
    next(TileableGraphBuilder(tileable_graph).build())

    task_id = await task_api.submit_tileable_graph(
        tileable_graph, fuse_enabled=False)
    await asyncio.sleep(.5)

    with Timer() as cancel_timer:
        await task_api.cancel_task(task_id)
        cancelled = await task_api.get_task_result(task_id)
        assert cancelled.status == TaskStatus.terminated
    assert cancel_timer.duration < 20

    await asyncio.sleep(.1)
    last_idle_time = await task_api.get_last_idle_time()
    assert last_idle_time is not None

    all_results = await task_api.get_task_results(progress=True)
    for task_result in all_results:
        assert task_result.status == TaskStatus.terminated
Example #10
0
async def test_subtask_service(actor_pools):
    """Run and cancel subtasks through ``SubtaskAPI`` on a worker pool.

    Services are started on a supervisor pool and a worker pool, and the
    task manager actor of the test session is replaced by
    ``FakeTaskManager``. The test then:

    * runs ``ones((10, 10)) + 1`` in slot 0 of band ``'numa-0'`` and checks
      the stored result and the recorded chunk meta;
    * starts a 1 sec sleep in the same slot and checks that cancelling it
      completes in under 2 sec.

    The mock storage is cleaned up even when an assertion fails.
    """
    sv_pool, worker_pool = actor_pools

    config = {
        "services": [
            "cluster", "session", "meta", "lifecycle", "scheduling", "subtask",
            "task"
        ],
        "cluster": {
            "backend": "fixed",
            "lookup_address": sv_pool.external_address,
            "resource": {
                "numa-0": 2
            }
        },
        "meta": {
            "store": "dict"
        },
        "scheduling": {},
        "subtask": {},
    }
    await start_services(NodeRole.SUPERVISOR,
                         config,
                         address=sv_pool.external_address)
    await start_services(NodeRole.WORKER,
                         config,
                         address=worker_pool.external_address)

    session_id = 'test_session'
    session_api = await SessionAPI.create(sv_pool.external_address)
    await session_api.create_session(session_id)
    # swap the actor registered under the task manager uid of this session
    # for a FakeTaskManager
    ref = await mo.actor_ref(FakeTaskManager.gen_uid(session_id),
                             address=sv_pool.external_address)
    await mo.destroy_actor(ref)
    await mo.create_actor(FakeTaskManager,
                          session_id,
                          uid=FakeTaskManager.gen_uid(session_id),
                          address=sv_pool.external_address)

    subtask_api = await SubtaskAPI.create(worker_pool.external_address)
    # create mock meta and storage APIs
    meta_api = await MetaAPI.create(session_id, sv_pool.external_address)
    storage_api = await MockStorageAPI.create(session_id,
                                              worker_pool.external_address)

    try:
        a = mt.ones((10, 10), chunk_size=10)
        b = a + 1

        subtask = _gen_subtask(b, session_id)
        await subtask_api.run_subtask_in_slot('numa-0', 0, subtask)

        # check storage
        expected = np.ones((10, 10)) + 1
        result_key = subtask.chunk_graph.results[0].key
        result = await storage_api.get(result_key)
        np.testing.assert_array_equal(expected, result)

        # check meta
        chunk_meta = await meta_api.get_chunk_meta(result_key)
        assert chunk_meta is not None
        assert chunk_meta['bands'][0] == (worker_pool.external_address, 'numa-0')

        def sleep(timeout: int):
            time.sleep(timeout)
            return timeout

        b = mr.spawn(sleep, 1)

        subtask2 = _gen_subtask(b, session_id)
        # Keep a reference to the task: the event loop only holds weak
        # references, so an unreferenced task may be garbage-collected.
        run_task = asyncio.create_task(
            subtask_api.run_subtask_in_slot('numa-0', 0, subtask2))
        # give the subtask some time to actually start
        await asyncio.sleep(0.2)
        with Timer() as timer:
            # normal cancel by cancel asyncio Task
            await asyncio.wait_for(subtask_api.cancel_subtask_in_slot('numa-0', 0),
                                   timeout=2)
        # the cancel itself is bounded by the 2 sec timeout above
        assert timer.duration < 2
    finally:
        # release the mock storage no matter whether the checks passed
        await MockStorageAPI.cleanup(worker_pool.external_address)
Example #11
0
async def test_execute_with_cancel(actor_pool, cancel_phase):
    """Cancel a running subtask and check that the slot is usable again.

    ``cancel_phase`` selects which actor (storage manager, quota or band
    slot manager) gets a delay of 100 configured through
    ``set_delay_fetch_time``; any other value delays nothing. A subtask
    wrapping a 100 sec remote function is started and cancelled with
    ``kill_timeout=1``; the pending call must end with ``CancelledError``
    in under 6 sec. Finally a 0.5 sec subtask must run to completion
    within 30 sec.
    """
    pool, session_id, meta_api, storage_api, execution_ref = actor_pool
    band_name = 'numa-0'
    worker_address = pool.external_address

    # config for different phases: map each phase to the uid of the actor
    # to slow down; the uid is only computed for the selected phase
    uid_factories = {
        'prepare': lambda: StorageManagerActor.default_uid(),
        'quota': lambda: QuotaActor.gen_uid('numa-0'),
        'slot': lambda: BandSlotManagerActor.gen_uid('numa-0'),
    }
    ref_to_delay = None
    uid_factory = uid_factories.get(cancel_phase)
    if uid_factory is not None:
        ref_to_delay = await mo.actor_ref(uid_factory(),
                                          address=worker_address)
    if ref_to_delay:
        await ref_to_delay.set_delay_fetch_time(100)

    def delay_fun(delay, _inp1):
        time.sleep(delay)
        return delay

    fetch_op = TensorFetch(key='input1',
                           source_key='input1',
                           dtype=np.dtype(int))
    input_chunk = fetch_op.new_chunk([])
    remote_op = RemoteFunction(function=delay_fun,
                               function_args=[100, input_chunk],
                               function_kwargs={},
                               n_output=1)
    remote_chunk = remote_op.new_chunk([input_chunk])

    # register and store the input data the subtask reads
    input_data = np.random.rand(10, 10)
    await meta_api.set_chunk_meta(input_chunk,
                                  memory_size=input_data.nbytes,
                                  store_size=input_data.nbytes,
                                  bands=[(worker_address, 'numa-0')])
    await storage_api.put(input_chunk.key, input_data)

    long_graph = ChunkGraph([remote_chunk])
    long_graph.add_node(input_chunk)
    long_graph.add_node(remote_chunk)
    long_graph.add_edge(input_chunk, remote_chunk)

    long_subtask = Subtask(f'test_task_{uuid.uuid4()}',
                           session_id=session_id,
                           chunk_graph=long_graph)
    aiotask = asyncio.create_task(
        execution_ref.run_subtask(long_subtask, band_name, worker_address))
    await asyncio.sleep(1)

    with Timer() as cancel_timer:
        await execution_ref.cancel_subtask(long_subtask.subtask_id,
                                           kill_timeout=1)
        with pytest.raises(asyncio.CancelledError):
            await asyncio.wait_for(aiotask, timeout=30)
    assert cancel_timer.duration < 6

    # check for different phases
    if ref_to_delay is not None:
        assert await ref_to_delay.get_is_cancelled()
        await ref_to_delay.set_delay_fetch_time(0)

    # test if slot is restored
    remote_tileable = mr.spawn(delay_fun, args=(0.5, None))
    tileable_graph = TileableGraph([remote_tileable.data])
    next(TileableGraphBuilder(tileable_graph).build())

    short_graph = next(
        ChunkGraphBuilder(tileable_graph, fuse_enabled=False).build())

    short_subtask = Subtask(f'test_task2_{uuid.uuid4()}',
                            session_id=session_id,
                            chunk_graph=short_graph)
    rerun = execution_ref.run_subtask(short_subtask, band_name, worker_address)
    await asyncio.wait_for(rerun, timeout=30)
Example #12
0
async def test_task_service(actor_pools, use_web_api):
    """Submit, wait for and cancel tasks through the task service API.

    Services are started on a supervisor pool and a worker pool; with
    ``use_web_api`` the ``web`` service is started as well and the task API
    is a ``WebTaskAPI`` bound to the web address, otherwise a ``TaskAPI``.
    The test then:

    * runs a small graph of three remote functions and checks that the
      stored result is 45 (the sum of 0..9);
    * submits ten remote calls sleeping 100 sec, cancels the task and
      checks that it is ``terminated`` in under 20 sec.

    The mock storage is cleaned up even when an assertion fails.
    """
    sv_pool, worker_pool = actor_pools

    config = {
        # NOTE(review): "lifecycle" appears twice in this list -- confirm
        # whether the duplicate is intentional.
        "services": [
            "cluster", "session", "lifecycle", "meta", "lifecycle",
            "scheduling", "task", "subtask"
        ],
        "cluster": {
            "backend": "fixed",
            "lookup_address": sv_pool.external_address,
            "resource": {
                "numa-0": 2
            }
        },
        "meta": {
            "store": "dict"
        },
        "scheduling": {},
        "task": {},
    }
    if use_web_api:
        config['services'].append('web')

    await start_services(NodeRole.SUPERVISOR,
                         config,
                         address=sv_pool.external_address)
    await start_services(NodeRole.WORKER,
                         config,
                         address=worker_pool.external_address)

    session_id = 'test_session'
    session_api = await SessionAPI.create(sv_pool.external_address)
    await session_api.create_session(session_id)

    if not use_web_api:
        task_api = await TaskAPI.create(session_id, sv_pool.external_address)
    else:
        web_actor = await mo.actor_ref(WebActor.default_uid(),
                                       address=sv_pool.external_address)
        web_address = await web_actor.get_web_address()
        task_api = WebTaskAPI(session_id, web_address)

    # create mock meta and storage APIs
    _ = await MetaAPI.create(session_id, sv_pool.external_address)
    storage_api = await MockStorageAPI.create(session_id,
                                              worker_pool.external_address)

    try:
        def f1():
            return np.arange(5)

        def f2():
            return np.arange(5, 10)

        def f3(f1r, f2r):
            return np.concatenate([f1r, f2r]).sum()

        r1 = mr.spawn(f1)
        r2 = mr.spawn(f2)
        r3 = mr.spawn(f3, args=(r1, r2))

        graph = TileableGraph([r3.data])
        next(TileableGraphBuilder(graph).build())

        task_id = await task_api.submit_tileable_graph(graph, fuse_enabled=False)
        # no idle time is expected right after a submission
        assert await task_api.get_last_idle_time() is None
        assert isinstance(task_id, str)

        await task_api.wait_task(task_id)
        task_result = await task_api.get_task_result(task_id)

        assert task_result.status == TaskStatus.terminated
        assert await task_api.get_last_idle_time() is not None
        # re-raise the task's own error with its original traceback
        if task_result.error is not None:
            raise task_result.error.with_traceback(task_result.traceback)

        result_tileable = (await task_api.get_fetch_tileables(task_id))[0]
        data_key = result_tileable.chunks[0].key
        assert await storage_api.get(data_key) == 45

        # test job cancel
        def f4():
            time.sleep(100)

        rs = [mr.spawn(f4) for _ in range(10)]

        graph = TileableGraph([r.data for r in rs])
        next(TileableGraphBuilder(graph).build())

        task_id = await task_api.submit_tileable_graph(graph, fuse_enabled=False)
        # let the task start running before cancelling it
        await asyncio.sleep(.5)
        with Timer() as timer:
            await task_api.cancel_task(task_id)
            result = await task_api.get_task_result(task_id)
            assert result.status == TaskStatus.terminated
        assert timer.duration < 20
        await asyncio.sleep(.1)
        assert await task_api.get_last_idle_time() is not None
    finally:
        # release the mock storage no matter whether the checks passed
        await MockStorageAPI.cleanup(worker_pool.external_address)