async def test_subtask_manager(actor_pool):
    """Check that cancelled subtasks and queue errors are reported as results.

    Two subtasks are added; the first is submitted to two bands and both are
    then cancelled.  Afterwards the queue is told to raise ``ValueError`` and
    a third subtask is expected to end up in the errored state.
    """
    (pool, session_id, execution_ref, manager_ref,
     queue_ref, task_manager_ref) = actor_pool

    first = Subtask('subtask1', session_id)
    second = Subtask('subtask2', session_id)
    await manager_ref.add_subtasks([first, second], [(1, ), (2, )])

    # only the first subtask is submitted, once per band
    for band_name in ('gpu-0', 'gpu-1'):
        await manager_ref.submit_subtask_to_band(
            first.subtask_id, (pool.external_address, band_name))

    await manager_ref.cancel_subtasks([first.subtask_id, second.subtask_id])
    waiters = asyncio.gather(
        execution_ref.wait_subtask(first.subtask_id, 'gpu-0'),
        execution_ref.wait_subtask(first.subtask_id, 'gpu-1'),
    )
    await asyncio.wait_for(waiters, timeout=10)

    # both subtasks must be reported as cancelled
    for cancelled in (first, second):
        cancelled_result = await task_manager_ref.get_result(cancelled.subtask_id)
        assert cancelled_result.status == SubtaskStatus.cancelled

    # an error configured on the queue must surface in the subtask result
    failing = Subtask('subtask3', session_id)
    await queue_ref.set_error(ValueError())
    await manager_ref.add_subtasks.tell([failing], [(3, )])
    # pause briefly before reading the result back
    await asyncio.sleep(0.1)

    failing_result = await task_manager_ref.get_result(failing.subtask_id)
    assert failing_result.status == SubtaskStatus.errored
    assert isinstance(failing_result.error, ValueError)
async def test_subtask_queueing(actor_pool):
    """Check submission order of queued subtasks under a capacity of two.

    The ``# queue:`` comments track the expected queue content after each
    step, highest priority first.
    """
    _pool, _session_id, queueing_ref, slots_ref, manager_ref = actor_pool
    await slots_ref.set_capacity(2)

    subtasks = []
    priorities = []
    for idx in range(5):
        subtasks.append(Subtask(str(idx)))
        priorities.append((idx, ))

    await queueing_ref.add_subtasks(subtasks, priorities)
    # queue: [4 3 2 1 0]
    await queueing_ref.submit_subtasks()
    # queue: [2 1 0]
    submitted_ids, _submitted_bands = await manager_ref.dump_data()
    assert submitted_ids == ['4', '3']

    await queueing_ref.remove_queued_subtasks(['1'])
    # queue: [2 0]
    raise_zero = queueing_ref.update_subtask_priority.delay('0', (3, ))
    raise_four = queueing_ref.update_subtask_priority.delay('4', (5, ))
    await queueing_ref.update_subtask_priority.batch(raise_zero, raise_four)
    # queue: [0(3) 2]
    await queueing_ref.submit_subtasks()
    # queue: []
    submitted_ids, _submitted_bands = await manager_ref.dump_data()
    assert submitted_ids == ['4', '3', '0', '2']
async def test_assign_gpu_tasks(actor_pool):
    """Check that a subtask whose result op has ``gpu=True`` gets a gpu band."""
    pool, session_id, assigner_ref, cluster_api, meta_api = actor_pool

    fetch_chunks = [
        TensorFetch(key=key, source_key=key, dtype=np.dtype(int)).new_chunk([])
        for key in ('a', 'b')
    ]
    result_chunk = TensorTreeAdd(args=list(fetch_chunks), gpu=True) \
        .new_chunk(list(fetch_chunks))

    # nodes first (inputs, then the result), edges afterwards
    chunk_graph = ChunkGraph([result_chunk])
    for node in fetch_chunks + [result_chunk]:
        chunk_graph.add_node(node)
    for fetch_chunk in fetch_chunks:
        chunk_graph.add_edge(fetch_chunk, result_chunk)

    # both inputs are registered on a 'numa-0' band of 'address0'
    for fetch_chunk in fetch_chunks:
        await meta_api.set_chunk_meta(fetch_chunk, memory_size=200, store_size=200,
                                      bands=[('address0', 'numa-0')])

    subtask = Subtask('test_task', session_id, chunk_graph=chunk_graph)
    assigned = await assigner_ref.assign_subtasks([subtask])
    [band] = assigned
    assert band[1].startswith('gpu')
def _gen_subtask(t, session_id):
    """Build a ``Subtask`` for ``t`` inside session ``session_id``.

    A tileable graph is created from ``t.data`` and the tileable builder is
    advanced once.  The first chunk graph yielded by the chunk builder (with
    fusing disabled) becomes the subtask's chunk graph.
    """
    tileable_graph = TileableGraph([t.data])
    next(TileableGraphBuilder(tileable_graph).build())

    chunk_builder = ChunkGraphBuilder(tileable_graph, fuse_enabled=False)
    chunk_graph = next(chunk_builder.build())

    return Subtask(new_task_id(), session_id, new_task_id(), chunk_graph)
async def _queue_subtasks(num_subtasks, expect_bands, queueing_ref):
    """Add ``num_subtasks`` subtasks with ascending priorities to the queue.

    Subtask ids are ``expect_bands[0] + '-' + str(i)``; each subtask gets
    ``[expect_bands]`` as its ``expect_bands`` and ``(i, )`` as its priority.
    Nothing happens when ``num_subtasks`` is falsy.
    """
    if not num_subtasks:
        return

    subtasks = []
    priorities = []
    for idx in range(num_subtasks):
        queued = Subtask(expect_bands[0] + '-' + str(idx))
        queued.expect_bands = [expect_bands]
        subtasks.append(queued)
        priorities.append((idx, ))

    await queueing_ref.add_subtasks(subtasks, priorities)
async def test_execute_tensor(actor_pool):
    """Run a two-input tensor addition as a subtask and verify its outcome.

    Checks, in order: the stored result equals ``data1 + data2``, the quota
    recorded for the subtask equals ``data1.nbytes``, and the result chunk's
    meta carries the expected object id and shape.
    """
    pool, session_id, meta_api, storage_api, execution_ref = actor_pool

    data1 = np.random.rand(10, 10)
    data2 = np.random.rand(10, 10)

    # NOTE(review): input1 is declared with source_key='input2'; this looks
    # like a copy-paste slip but is kept as-is -- confirm before changing.
    input1 = TensorFetch(key='input1', source_key='input2',
                         dtype=np.dtype(int)).new_chunk([])
    input2 = TensorFetch(key='input2', source_key='input2',
                         dtype=np.dtype(int)).new_chunk([])
    result_chunk = TensorTreeAdd(args=[input1, input2]) \
        .new_chunk([input1, input2], shape=data1.shape, dtype=data1.dtype)

    await meta_api.set_chunk_meta(input1, memory_size=data1.nbytes,
                                  store_size=data1.nbytes,
                                  bands=[(pool.external_address, 'numa-0')])
    # sizes for input2 are taken from data2, the array stored under its key
    await meta_api.set_chunk_meta(input2, memory_size=data2.nbytes,
                                  store_size=data2.nbytes,
                                  bands=[(pool.external_address, 'numa-0')])
    # todo use different storage level when storage ready
    await storage_api.put(input1.key, data1)
    await storage_api.put(input2.key, data2)

    chunk_graph = ChunkGraph([result_chunk])
    chunk_graph.add_node(input1)
    chunk_graph.add_node(input2)
    chunk_graph.add_node(result_chunk)
    chunk_graph.add_edge(input1, result_chunk)
    chunk_graph.add_edge(input2, result_chunk)

    subtask = Subtask('test_task', session_id=session_id, chunk_graph=chunk_graph)
    await execution_ref.run_subtask(subtask, 'numa-0', pool.external_address)

    # check if results are correct
    result = await storage_api.get(result_chunk.key)
    np.testing.assert_array_equal(data1 + data2, result)

    # check if quota computations are correct
    quota_ref = await mo.actor_ref(QuotaActor.gen_uid('numa-0'),
                                   address=pool.external_address)
    [quota] = await quota_ref.get_batch_quota_reqs()
    assert quota[(subtask.subtask_id, subtask.subtask_id)] == data1.nbytes

    # check if metas are correct
    result_meta = await meta_api.get_chunk_meta(result_chunk.key)
    assert result_meta['object_id'] == result_chunk.key
    assert result_meta['shape'] == result.shape
async def test_cancel_without_kill(actor_pool):
    """Cancel a short-running subtask, then read back the marker it sets.

    ``delay_fun`` sleeps and then sets ``mars._slot_marker``.  The subtask
    running it is cancelled after 0.5s with ``kill_timeout=1``.  A second
    subtask runs ``check_fun``, whose stored return value must be truthy.
    """
    pool, session_id, meta_api, storage_api, execution_ref = actor_pool

    def delay_fun(delay):
        import mars
        time.sleep(delay)
        mars._slot_marker = 1
        return delay

    def check_fun():
        import mars
        return getattr(mars, '_slot_marker', False)

    def _remote_subtask(func, func_args):
        # wrap a single remote function call into a one-node subtask
        chunk = RemoteFunction(function=func, function_args=func_args,
                               function_kwargs={}).new_chunk([])
        graph = ChunkGraph([chunk])
        graph.add_node(chunk)
        task = Subtask(f'test_task_{uuid.uuid4()}',
                       session_id=session_id, chunk_graph=graph)
        return chunk, task

    _delay_chunk, delay_subtask = _remote_subtask(delay_fun, [2])
    running = asyncio.create_task(execution_ref.run_subtask(
        delay_subtask, 'numa-0', pool.external_address))
    await asyncio.sleep(0.5)

    await execution_ref.cancel_subtask(delay_subtask.subtask_id, kill_timeout=1)
    with pytest.raises(asyncio.CancelledError):
        await asyncio.wait_for(running, timeout=30)

    check_chunk, check_subtask = _remote_subtask(check_fun, [])
    await execution_ref.run_subtask(
        check_subtask, 'numa-0', pool.external_address)

    # check if results are correct
    assert await storage_api.get(check_chunk.key)
async def test_assigner(actor_pool):
    """Check that the assigner picks one of the bands holding a 400-byte input.

    The three inputs are registered with sizes 200/400/400 on address0,
    address1 and address2; the assigned band must be address1 or address2.
    """
    pool, session_id, assigner_ref, meta_api = actor_pool

    # (chunk key, recorded size, address holding the chunk)
    layout = [('a', 200, 'address0'), ('b', 400, 'address1'), ('c', 400, 'address2')]
    fetch_chunks = [
        TensorFetch(key=key, source_key=key, dtype=np.dtype(int)).new_chunk([])
        for key, _size, _address in layout
    ]
    result_chunk = TensorTreeAdd(args=list(fetch_chunks)) \
        .new_chunk(list(fetch_chunks))

    # nodes first (inputs, then the result), edges afterwards
    chunk_graph = ChunkGraph([result_chunk])
    for node in fetch_chunks + [result_chunk]:
        chunk_graph.add_node(node)
    for fetch_chunk in fetch_chunks:
        chunk_graph.add_edge(fetch_chunk, result_chunk)

    for fetch_chunk, (_key, size, address) in zip(fetch_chunks, layout):
        await meta_api.set_chunk_meta(fetch_chunk, memory_size=size, store_size=size,
                                      bands=[(address, 'numa-0')])

    subtask = Subtask('test_task', session_id, chunk_graph=chunk_graph)
    assigned = await assigner_ref.assign_subtasks([subtask])
    [band] = assigned
    assert band in (('address1', 'numa-0'), ('address2', 'numa-0'))
async def test_execute_with_cancel(actor_pool, cancel_phase):
    """Cancel a running subtask, optionally while one phase is delayed.

    For ``cancel_phase`` in ``'prepare'``, ``'quota'`` or ``'slot'`` the
    matching actor is asked to delay for 100 seconds via
    ``set_delay_fetch_time``; for any other value no actor is delayed.  The
    subtask is cancelled after one second, the cancellation must take less
    than six seconds, and a follow-up subtask must still be runnable.
    """
    pool, session_id, meta_api, storage_api, execution_ref = actor_pool

    # config for different phases
    ref_to_delay = None
    if cancel_phase == 'prepare':
        ref_to_delay = await mo.actor_ref(StorageManagerActor.default_uid(),
                                          address=pool.external_address)
    elif cancel_phase == 'quota':
        ref_to_delay = await mo.actor_ref(QuotaActor.gen_uid('numa-0'),
                                          address=pool.external_address)
    elif cancel_phase == 'slot':
        ref_to_delay = await mo.actor_ref(
            BandSlotManagerActor.gen_uid('numa-0'), address=pool.external_address)
    # explicit None test, consistent with the check after cancellation
    if ref_to_delay is not None:
        await ref_to_delay.set_delay_fetch_time(100)

    def delay_fun(delay, _inp1):
        time.sleep(delay)
        return delay

    input1 = TensorFetch(key='input1', source_key='input1',
                         dtype=np.dtype(int)).new_chunk([])
    remote_result = RemoteFunction(function=delay_fun, function_args=[100, input1],
                                   function_kwargs={}, n_output=1) \
        .new_chunk([input1])

    data1 = np.random.rand(10, 10)
    await meta_api.set_chunk_meta(input1, memory_size=data1.nbytes,
                                  store_size=data1.nbytes,
                                  bands=[(pool.external_address, 'numa-0')])
    await storage_api.put(input1.key, data1)

    chunk_graph = ChunkGraph([remote_result])
    chunk_graph.add_node(input1)
    chunk_graph.add_node(remote_result)
    chunk_graph.add_edge(input1, remote_result)

    subtask = Subtask(f'test_task_{uuid.uuid4()}',
                      session_id=session_id, chunk_graph=chunk_graph)
    aiotask = asyncio.create_task(
        execution_ref.run_subtask(subtask, 'numa-0', pool.external_address))
    await asyncio.sleep(1)

    # NOTE(review): nesting reconstructed from flattened source -- the timer
    # is assumed to cover both the cancel call and the wait for the task.
    with Timer() as timer:
        await execution_ref.cancel_subtask(subtask.subtask_id, kill_timeout=1)
        with pytest.raises(asyncio.CancelledError):
            await asyncio.wait_for(aiotask, timeout=30)
    assert timer.duration < 6

    # check for different phases
    if ref_to_delay is not None:
        assert await ref_to_delay.get_is_cancelled()
        await ref_to_delay.set_delay_fetch_time(0)

    # test if slot is restored
    remote_tileable = mr.spawn(delay_fun, args=(0.5, None))
    graph = TileableGraph([remote_tileable.data])
    next(TileableGraphBuilder(graph).build())

    chunk_graph = next(ChunkGraphBuilder(graph, fuse_enabled=False).build())

    subtask = Subtask(f'test_task2_{uuid.uuid4()}',
                      session_id=session_id, chunk_graph=chunk_graph)
    await asyncio.wait_for(
        execution_ref.run_subtask(subtask, 'numa-0', pool.external_address),
        timeout=30)
async def test_assign_cpu_tasks(actor_pool):
    """Check band assignment when nodes are stopping and bands are requested.

    address1 and address3 are marked as stopping, so address1 must never be
    assigned, even when it is the only expected band.  Once the result op is
    flagged ``gpu``, assignment must raise ``NoMatchingSlots`` with a message
    mentioning 'gpu'.
    """
    pool, session_id, assigner_ref, cluster_api, meta_api = actor_pool

    band0 = ('address0', 'numa-0')
    band1 = ('address1', 'numa-0')
    band2 = ('address2', 'numa-0')

    # (chunk key, recorded size, band holding the chunk)
    layout = [('a', 200, band0), ('b', 400, band1), ('c', 400, band2)]
    fetch_chunks = [
        TensorFetch(key=key, source_key=key, dtype=np.dtype(int)).new_chunk([])
        for key, _size, _band in layout
    ]
    result_chunk = TensorTreeAdd(args=list(fetch_chunks)) \
        .new_chunk(list(fetch_chunks))

    # nodes first (inputs, then the result), edges afterwards
    chunk_graph = ChunkGraph([result_chunk])
    for node in fetch_chunks + [result_chunk]:
        chunk_graph.add_node(node)
    for fetch_chunk in fetch_chunks:
        chunk_graph.add_edge(fetch_chunk, result_chunk)

    for fetch_chunk, (_key, size, band) in zip(fetch_chunks, layout):
        await meta_api.set_chunk_meta(fetch_chunk, memory_size=size, store_size=size,
                                      bands=[band])

    for stopping_node in ('address1', 'address3'):
        await cluster_api.set_node_status(node=stopping_node, role=NodeRole.WORKER,
                                          status=NodeStatus.STOPPING)

    subtask = Subtask('test_task', session_id, chunk_graph=chunk_graph)
    [assigned] = await assigner_ref.assign_subtasks([subtask])
    assert assigned in (band0, band2)

    # (requested expect_bands, bands acceptable as the assignment)
    cases = [
        ([band0], (band0, )),
        ([band0, band1], (band0, )),
        ([band1], (band0, band2)),
    ]
    for requested, acceptable in cases:
        subtask.expect_bands = requested
        [assigned] = await assigner_ref.assign_subtasks([subtask])
        assert assigned in acceptable

    result_chunk.op.gpu = True
    subtask = Subtask('test_task', session_id, chunk_graph=chunk_graph)
    with pytest.raises(NoMatchingSlots) as err:
        await assigner_ref.assign_subtasks([subtask])
    assert 'gpu' in str(err.value)