def testChunkBroadcast(self, *_):
    # Verify that chunk meta written on one scheduler is broadcast to the
    # other scheduler, and that deletions and worker removals propagate too.
    proc_count = 2
    endpoints = ['127.0.0.1:%d' % get_next_port() for _ in range(proc_count)]
    keys = []

    def _mock_get_scheduler(key):
        # Route meta operations to a scheduler endpoint by chunk key; the
        # closure reads `keys`, which is rebound below once keys exist.
        return endpoints[keys.index(key[1]) % len(endpoints)]

    ChunkMetaActor.get_scheduler.side_effect = _mock_get_scheduler

    session_id = str(uuid.uuid4())
    with create_actor_pool(n_process=1, backend='gevent', address=endpoints[0]) as pool1:
        pool1.create_actor(ClusterInfoActor, endpoints,
                           uid=ClusterInfoActor.default_name())
        pool1.create_actor(ChunkMetaActor, uid=ChunkMetaActor.default_name())
        with create_actor_pool(n_process=1, backend='gevent', address=endpoints[1]) as pool2:
            pool2.create_actor(ClusterInfoActor, endpoints,
                               uid=ClusterInfoActor.default_name())
            pool2.create_actor(ChunkMetaActor, uid=ChunkMetaActor.default_name())

            client = new_client()
            ref1 = client.actor_ref(ChunkMetaActor.default_name(), address=endpoints[0])
            # NOTE(review): ref2 also targets endpoints[0]; requests are still
            # dispatched per key by the mocked get_scheduler above — confirm
            # whether endpoints[1] was intended here.
            ref2 = client.actor_ref(ChunkMetaActor.default_name(), address=endpoints[0])
            local_ref1 = client.actor_ref(LocalChunkMetaActor.default_name(),
                                          address=endpoints[0])
            local_ref2 = client.actor_ref(LocalChunkMetaActor.default_name(),
                                          address=endpoints[1])

            key1 = str(uuid.uuid4())
            key2 = str(uuid.uuid4())
            keys = [key1, key2]

            # key1 lives on scheduler 0 and is broadcast to scheduler 1;
            # key2 the other way around
            ref1.set_chunk_broadcasts(session_id, key1, [endpoints[1]])
            ref1.set_chunk_size(session_id, key1, 512)
            ref1.set_chunk_shape(session_id, key1, (10,) * 2)
            ref1.add_worker(session_id, key1, 'abc')
            ref2.set_chunk_broadcasts(session_id, key2, [endpoints[0]])
            ref2.set_chunk_size(session_id, key2, 512)
            ref1.set_chunk_shape(session_id, key2, (10,) * 2)
            ref2.add_worker(session_id, key2, 'def')
            pool2.sleep(0.1)

            # meta should be visible locally and on the broadcast target
            self.assertEqual(local_ref1.get_chunk_meta(session_id, key1).chunk_size, 512)
            self.assertEqual(local_ref1.get_chunk_meta(session_id, key1).chunk_shape, (10,) * 2)
            self.assertEqual(local_ref1.get_chunk_broadcasts(session_id, key1), [endpoints[1]])
            self.assertEqual(local_ref2.get_chunk_meta(session_id, key1).chunk_size, 512)
            self.assertEqual(local_ref2.get_chunk_meta(session_id, key1).chunk_shape, (10,) * 2)
            self.assertEqual(local_ref2.get_chunk_broadcasts(session_id, key2), [endpoints[0]])

            # deleting meta must also remove the broadcast copies
            ref1.delete_meta(session_id, key1)
            pool2.sleep(0.1)

            self.assertIsNone(local_ref1.get_chunk_meta(session_id, key1))
            self.assertIsNone(local_ref2.get_chunk_meta(session_id, key1))
            self.assertIsNone(local_ref1.get_chunk_broadcasts(session_id, key1))

            # removing worker 'def' drops key2 (its only holder) everywhere
            local_ref1.remove_workers_in_session(session_id, ['def'])
            local_ref2.remove_workers_in_session(session_id, ['def'])
            pool2.sleep(0.1)

            self.assertIsNone(local_ref1.get_chunk_meta(session_id, key2))
            self.assertIsNone(local_ref2.get_chunk_meta(session_id, key2))
            self.assertIsNone(local_ref2.get_chunk_broadcasts(session_id, key2))
def create_standard_actors(cls, pool, address, quota_size=None, with_daemon=True,
                           with_status=True, with_resource=False):
    """Populate *pool* with the standard set of worker actors used in tests.

    Resource, daemon and status actors are optional and controlled by the
    corresponding keyword flags; everything else is always created.
    """
    quota_size = quota_size if quota_size else 1024 * 1024

    pool.create_actor(PlasmaKeyMapActor, uid=PlasmaKeyMapActor.default_name())
    pool.create_actor(ClusterInfoActor, schedulers=[address],
                      uid=ClusterInfoActor.default_name())

    if with_resource:
        pool.create_actor(ResourceActor, uid=ResourceActor.default_name())
    if with_daemon:
        pool.create_actor(WorkerDaemonActor, uid=WorkerDaemonActor.default_name())
    if with_status:
        pool.create_actor(StatusActor, address, uid=StatusActor.default_name())

    pool.create_actor(ChunkHolderActor, cls.plasma_storage_size,
                      uid=ChunkHolderActor.default_name())
    pool.create_actor(ChunkMetaActor, uid=ChunkMetaActor.default_name())
    pool.create_actor(TaskQueueActor, uid=TaskQueueActor.default_name())
    pool.create_actor(DispatchActor, uid=DispatchActor.default_name())
    # QuotaActor is deliberately registered under MemQuotaActor's uid so
    # lookups of the memory quota find this instance — TODO confirm intent
    pool.create_actor(QuotaActor, quota_size, uid=MemQuotaActor.default_name())
    pool.create_actor(ExecutionActor, uid=ExecutionActor.default_name())
def execute_case():
    # Build the scheduler-side actors, mock worker resources and execution
    # refs, then drive the graph and wait for a terminal state.
    pool.create_actor(ClusterInfoActor, [pool.cluster_info.address],
                      uid=ClusterInfoActor.default_name())
    resource_ref = pool.create_actor(
        ResourceActor, uid=ResourceActor.default_name())
    pool.create_actor(ChunkMetaActor, uid=ChunkMetaActor.default_name())
    pool.create_actor(AssignerActor, uid=AssignerActor.gen_name(session_id))
    graph_ref = pool.create_actor(GraphActor, session_id, graph_key, serialize_graph(graph),
                                  uid=GraphActor.gen_name(session_id, graph_key))
    execution_ref = pool.create_actor(FakeExecutionActor, sleep=1)

    # handle mock objects
    OperandActor._get_raw_execution_ref.side_effect = lambda: execution_ref

    mock_resource = dict(hardware=dict(cpu=4, cpu_total=4, memory=512))

    def write_mock_meta():
        resource_ref.set_worker_meta('localhost:12345', mock_resource)
        resource_ref.set_worker_meta('localhost:23456', mock_resource)

    v = gevent.spawn(write_mock_meta)
    v.join()

    graph_ref.prepare_graph()
    fetched_graph = graph_ref.get_chunk_graph()

    graph_ref.scan_node()
    graph_ref.place_initial_chunks()

    # collect keys of terminal chunks (no successors)
    final_keys = set()
    for c in fetched_graph:
        if fetched_graph.count_successors(c) == 0:
            final_keys.add(c.op.key)

    graph_ref.create_operand_actors()
    graph_meta_ref = pool.actor_ref(GraphMetaActor.gen_name(session_id, graph_key))
    start_time = time.time()
    cancel_called = False
    while True:
        gevent.sleep(0.1)
        # request cancellation once, roughly 0.8s into execution
        if not cancel_called and time.time() > start_time + 0.8:
            cancel_called = True
            graph_ref.stop_graph(_tell=True)
        if time.time() - start_time > 30:
            raise SystemError('Wait for execution finish timeout')
        if graph_meta_ref.get_state() in (GraphState.SUCCEEDED, GraphState.FAILED, GraphState.CANCELLED):
            break
def testHolder(self):
    """Run CacheTestActor's cache scenario against a ChunkHolderActor."""
    pool_address = '127.0.0.1:%d' % get_next_port()
    with create_actor_pool(n_process=1, backend='gevent', address=pool_address) as pool:
        pool.create_actor(ClusterInfoActor, schedulers=[pool_address],
                          uid=ClusterInfoActor.default_name())
        pool.create_actor(KVStoreActor, uid=KVStoreActor.default_name())
        pool.create_actor(DispatchActor, uid=DispatchActor.default_name())
        pool.create_actor(QuotaActor, 1024 * 1024 * 10, uid=MemQuotaActor.default_name())
        holder_ref = pool.create_actor(ChunkHolderActor, self.plasma_storage_size,
                                       uid=ChunkHolderActor.default_name())
        pool.create_actor(SpillActor)

        try:
            cache_tester = pool.create_actor(CacheTestActor)
            cache_tester.run_test_cache()
            # poll until the test actor flags completion
            while not cache_tester.get_exc_info()[0]:
                gevent.sleep(0.1)
            err_info = cache_tester.get_exc_info()[1]
            if err_info:
                six.reraise(*err_info)
        finally:
            pool.destroy_actor(holder_ref)
def _prepare_test_graph(self, session_id, graph_key, mock_workers):
    """Build a small tensor graph and yield ``(pool, graph_ref)`` for tests.

    The graph adds two random vectors and splits the sum in two; operand
    actors are created but not started (``_start=False``).
    """
    pool_addr = '127.0.0.1:%d' % get_next_port()

    left = mt.random.random((100,))
    right = mt.random.random((100,))
    total = left + right
    half1, half2 = mt.split(total, 2)

    graph = DAG()
    half1.build_graph(graph=graph, compose=False)
    half2.build_graph(graph=graph, compose=False)

    with create_actor_pool(n_process=1, backend='gevent', address=pool_addr) as pool:
        pool.create_actor(ClusterInfoActor, [pool.cluster_info.address],
                          uid=ClusterInfoActor.default_name())
        resource_ref = pool.create_actor(ResourceActor, uid=ResourceActor.default_name())
        pool.create_actor(ChunkMetaActor, uid=ChunkMetaActor.default_name())
        pool.create_actor(AssignerActor, uid=AssignerActor.default_name())
        graph_ref = pool.create_actor(GraphActor, session_id, graph_key,
                                      serialize_graph(graph),
                                      uid=GraphActor.gen_name(session_id, graph_key))

        for worker in mock_workers:
            resource_ref.set_worker_meta(worker, dict(hardware=dict(cpu_total=4)))

        graph_ref.prepare_graph()
        graph_ref.analyze_graph()
        graph_ref.create_operand_actors(_start=False)

        yield pool, graph_ref
def testEnsureTimeout(self, *_):
    """A short prepare_data_timeout should surface as PromiseTimeout."""
    from mars.errors import PromiseTimeout

    pool_address = '127.0.0.1:%d' % get_next_port()
    with create_actor_pool(n_process=1, backend='gevent', address=pool_address) as pool:
        pool.create_actor(PlasmaKeyMapActor, uid=PlasmaKeyMapActor.default_name())
        pool.create_actor(ClusterInfoActor, schedulers=[pool_address],
                          uid=ClusterInfoActor.default_name())
        pool.create_actor(KVStoreActor, uid=KVStoreActor.default_name())
        pool.create_actor(DispatchActor, uid=DispatchActor.default_name())
        pool.create_actor(QuotaActor, 1024 * 1024 * 10, uid=MemQuotaActor.default_name())
        pool.create_actor(SpillActor, uid=SpillActor.default_name())
        holder_ref = pool.create_actor(ChunkHolderActor, self.plasma_storage_size,
                                       uid=ChunkHolderActor.default_name())

        try:
            # shrink the timeout so the test actor trips it quickly
            options.worker.prepare_data_timeout = 2
            tester = pool.create_actor(CacheTestActor)
            tester.run_test_ensure_timeout()
            while not tester.get_exc_info()[0]:
                pool.sleep(0.1)
            err_info = tester.get_exc_info()[1]
            self.assertIsNotNone(err_info)
            self.assertIsInstance(err_info[1], PromiseTimeout)
        finally:
            # restore the default so later tests are unaffected
            options.worker.prepare_data_timeout = 600
            pool.destroy_actor(holder_ref)
def testExecute(self):
    """Drive ExecuteTestActor against a freshly assembled worker pool."""
    pool_address = '127.0.0.1:%d' % get_next_port()
    with create_actor_pool(n_process=1, backend='gevent', address=pool_address) as pool:
        pool.create_actor(ClusterInfoActor, schedulers=[pool_address],
                          uid=ClusterInfoActor.default_name())
        holder_ref = pool.create_actor(ChunkHolderActor, self._plasma_helper._size,
                                       uid='ChunkHolderActor')
        pool.create_actor(KVStoreActor, uid=KVStoreActor.default_name())
        pool.create_actor(DispatchActor, uid='DispatchActor')
        pool.create_actor(QuotaActor, 1024 * 1024, uid='MemQuotaActor')
        pool.create_actor(CpuCalcActor)
        pool.create_actor(ExecutionActor, uid='ExecutionActor')

        try:
            exec_tester = pool.create_actor(ExecuteTestActor)
            exec_tester.run_test()
            # busy-wait until the tester records a result
            while not exec_tester.get_exc_info()[0]:
                gevent.sleep(0.1)
            err_info = exec_tester.get_exc_info()[1]
            if err_info:
                six.reraise(*err_info)
        finally:
            pool.destroy_actor(holder_ref)
def testStatus(self):
    """Worker meta should be readable while the StatusActor is running."""
    pool_address = '127.0.0.1:%d' % get_next_port()
    with create_actor_pool(n_process=1, backend='gevent', address=pool_address) as pool:
        pool.create_actor(ClusterInfoActor, schedulers=[pool_address],
                          uid=ClusterInfoActor.default_name())
        resource_ref = pool.create_actor(ResourceActor, uid=ResourceActor.default_name())
        pool.create_actor(ChunkHolderActor, self.plasma_storage_size,
                          uid=ChunkHolderActor.default_name())
        status_ref = pool.create_actor(StatusActor, '127.0.0.1:1234',
                                       uid=StatusActor.default_name())

        def _read_after_delay():
            # give the status actor time to push worker meta upstream
            gevent.sleep(2)
            return resource_ref.get_workers_meta()

        reader = gevent.spawn(_read_after_delay)
        reader.join()
        self.assertIsNotNone(reader.value)

        pool.destroy_actor(status_ref)
def testAssignerActor(self):
    """Assignment should pick the endpoint holding most of the inputs."""
    mock_scheduler_addr = '127.0.0.1:%d' % get_next_port()
    with create_actor_pool(n_process=1, backend='gevent', address=mock_scheduler_addr) as pool:
        pool.create_actor(ClusterInfoActor, [pool.cluster_info.address],
                          uid=ClusterInfoActor.default_name())
        resource_ref = pool.create_actor(ResourceActor, uid=ResourceActor.default_name())
        chunk_meta_ref = pool.create_actor(
            ChunkMetaActor, uid=ChunkMetaActor.default_name())

        endpoint1 = 'localhost:12345'
        endpoint2 = 'localhost:23456'
        mock_res = dict(hardware=dict(cpu=4, memory=4096))

        def _fill_worker_meta():
            resource_ref.set_worker_meta(endpoint1, mock_res)
            resource_ref.set_worker_meta(endpoint2, mock_res)

        meta_glet = gevent.spawn(_fill_worker_meta)
        meta_glet.join()

        assigner_ref = pool.create_actor(AssignerActor, uid=AssignerActor.default_name())

        session_id = str(uuid.uuid4())
        chunk_key1 = str(uuid.uuid4())
        chunk_key2 = str(uuid.uuid4())
        chunk_key3 = str(uuid.uuid4())

        op_info = {
            'op_name': 'test_op',
            'io_meta': dict(input_chunks=[chunk_key1, chunk_key2, chunk_key3]),
            'output_size': 512,
            'retries': 0,
            'optimize': {
                'depth': 0,
                'demand_depths': (),
                'successor_size': 1,
                'descendant_size': 0,
            },
        }

        # two input chunks live on endpoint1, one on endpoint2
        for key, owner in ((chunk_key1, endpoint1),
                           (chunk_key2, endpoint1),
                           (chunk_key3, endpoint2)):
            chunk_meta_ref.set_chunk_meta(session_id, key, size=512, workers=(owner,))

        assigned = assigner_ref.get_worker_assignments(session_id, op_info)
        self.assertEqual(assigned[0], endpoint1)
def testLocalCluster(self):
    """End-to-end smoke test: cluster boot, a session run, session cleanup."""
    endpoint = gen_endpoint('0.0.0.0')
    with LocalDistributedCluster(endpoint, scheduler_n_process=2,
                                 worker_n_process=3) as cluster:
        pool = cluster.pool

        # the standard scheduler-side actors must all be present
        for actor_name in (ClusterInfoActor.default_name(),
                           SessionManagerActor.default_name(),
                           DispatchActor.default_name()):
            self.assertTrue(pool.has_actor(pool.actor_ref(actor_name)))

        with new_session(endpoint) as session:
            api = session._api

            tensor = mt.ones((3, 3), chunk_size=2)
            fetched = session.run(tensor)
            np.testing.assert_array_equal(fetched, np.ones((3, 3)))

        # leaving the session context must unregister it
        self.assertNotIn(session._session_id, api.session_manager.get_sessions())
def testErrorOnPrepare(self, *_):
    # Exercise GraphActor's failure and cancellation paths around
    # create_operand_actors and graph analysis.
    session_id = str(uuid.uuid4())
    addr = '127.0.0.1:%d' % get_next_port()
    with create_actor_pool(n_process=1, backend='gevent', address=addr) as pool:
        pool.create_actor(ClusterInfoActor, [pool.cluster_info.address],
                          uid=ClusterInfoActor.default_name())
        resource_ref = pool.create_actor(ResourceActor, uid=ResourceActor.default_name())
        pool.create_actor(ChunkMetaActor, uid=ChunkMetaActor.default_name())
        pool.create_actor(AssignerActor, uid=AssignerActor.default_name())

        resource_ref.set_worker_meta('localhost:12345', dict(hardware=dict(cpu_total=4)))
        resource_ref.set_worker_meta('localhost:23456', dict(hardware=dict(cpu_total=4)))

        # error occurred in create_operand_actors
        graph_key = str(uuid.uuid4())
        expr = mt.random.random((8, 2), chunk_size=2) + 1
        graph = expr.build_graph(compose=False)
        serialized_graph = serialize_graph(graph)

        graph_ref = pool.create_actor(GraphActor, session_id, graph_key, serialized_graph,
                                      uid=GraphActor.gen_name(session_id, graph_key))

        def _mock_raises(*_, **__):
            raise RuntimeError

        with patch_method(GraphActor.create_operand_actors, new=_mock_raises):
            with self.assertRaises(RuntimeError):
                graph_ref.execute_graph()
        self.assertEqual(graph_ref.get_state(), GraphState.FAILED)
        graph_ref.destroy()

        # interrupted during create_operand_actors
        graph_key = str(uuid.uuid4())
        graph_ref = pool.create_actor(GraphActor, session_id, graph_key, serialized_graph,
                                      uid=GraphActor.gen_name(session_id, graph_key))

        def _mock_cancels(*_, **__):
            # flip the graph to CANCELLING mid-flight instead of creating actors
            graph_meta_ref = pool.actor_ref(GraphMetaActor.gen_name(session_id, graph_key))
            graph_meta_ref.set_state(GraphState.CANCELLING)

        with patch_method(GraphActor.create_operand_actors, new=_mock_cancels):
            graph_ref.execute_graph()
        self.assertEqual(graph_ref.get_state(), GraphState.CANCELLED)

        # interrupted during previous steps
        graph_key = str(uuid.uuid4())
        graph_ref = pool.create_actor(GraphActor, session_id, graph_key, serialized_graph,
                                      uid=GraphActor.gen_name(session_id, graph_key))

        def _mock_cancels(*_, **__):
            # cancel during operand assignment; return an empty assignment dict
            graph_meta_ref = pool.actor_ref(GraphMetaActor.gen_name(session_id, graph_key))
            graph_meta_ref.set_state(GraphState.CANCELLING)
            return dict()

        with patch_method(GraphAnalyzer.calc_operand_assignments, new=_mock_cancels):
            graph_ref.execute_graph()
        self.assertEqual(graph_ref.get_state(), GraphState.CANCELLED)
def testExecute(self):
    # Execute a tiled add graph through ExecutionActor's promise API and
    # validate the result chunk via the shared chunk store.
    pool_address = '127.0.0.1:%d' % get_next_port()
    with create_actor_pool(n_process=1, backend='gevent', address=pool_address) as pool:
        pool.create_actor(ClusterInfoActor, schedulers=[pool_address],
                          uid=ClusterInfoActor.default_name())
        cache_ref = pool.create_actor(
            ChunkHolderActor, self.plasma_storage_size,
            uid=ChunkHolderActor.default_name())
        pool.create_actor(ChunkMetaActor, uid=ChunkMetaActor.default_name())
        pool.create_actor(TaskQueueActor, uid=TaskQueueActor.default_name())
        pool.create_actor(DispatchActor, uid=DispatchActor.default_name())
        pool.create_actor(QuotaActor, 1024 * 1024, uid=MemQuotaActor.default_name())
        pool.create_actor(CpuCalcActor)
        pool.create_actor(ExecutionActor, uid=ExecutionActor.default_name())

        try:
            with self.run_actor_test(pool) as test_actor:
                import mars.tensor as mt
                from mars.tensor.expressions.datasource import TensorOnes, TensorFetchChunk
                arr = mt.ones((10, 8), chunk_size=10)
                arr_add = mt.ones((10, 8), chunk_size=10)
                arr2 = arr + arr_add
                graph = arr2.build_graph(compose=False, tiled=True)

                # replace the data-source ops with fetch ops so the executor
                # pulls pre-registered chunks instead of recomputing them
                for chunk in graph:
                    if isinstance(chunk.op, TensorOnes):
                        chunk._op = TensorFetchChunk(
                            dtype=chunk.dtype,
                            _outputs=[weakref.ref(o) for o in chunk.op.outputs],
                            _key=chunk.op.key)

                session_id = str(uuid.uuid4())

                chunk_holder_ref = test_actor.promise_ref(ChunkHolderActor.default_name())

                # seed the two input chunks into the chunk store; drop the
                # returned buffer refs immediately after registration
                refs = test_actor._chunk_store.put(session_id, arr.chunks[0].key,
                                                   np.ones((10, 8), dtype=np.int16))
                chunk_holder_ref.register_chunk(session_id, arr.chunks[0].key)
                del refs
                refs = test_actor._chunk_store.put(session_id, arr_add.chunks[0].key,
                                                   np.ones((10, 8), dtype=np.int16))
                chunk_holder_ref.register_chunk(session_id, arr_add.chunks[0].key)
                del refs

                executor_ref = test_actor.promise_ref(ExecutionActor.default_name())

                def _validate(_):
                    # 1 + 1 elementwise => a matrix of 2s
                    data = test_actor._chunk_store.get(session_id, arr2.chunks[0].key)
                    assert_array_equal(data, 2 * np.ones((10, 8)))

                executor_ref.enqueue_graph(session_id, str(id(graph)), serialize_graph(graph),
                                           dict(chunks=[arr2.chunks[0].key]), None,
                                           _promise=True) \
                    .then(lambda *_: executor_ref.start_execution(session_id, str(id(graph)), _promise=True)) \
                    .then(_validate) \
                    .then(lambda *_: test_actor.set_result(None)) \
                    .catch(lambda *exc: test_actor.set_result(exc, False))

                self.get_result()
        finally:
            pool.destroy_actor(cache_ref)
def testAssignerActor(self):
    # KV-store variant: chunk placement is written into the KV store and the
    # assigner is expected to reply with endpoint1, which holds two of the
    # three input chunks.
    with create_actor_pool(backend='gevent') as pool:
        pool.create_actor(ClusterInfoActor, [pool.cluster_info.address],
                          uid=ClusterInfoActor.default_name())
        resource_ref = pool.create_actor(ResourceActor, uid=ResourceActor.default_name())
        kv_store_ref = pool.create_actor(KVStoreActor, uid=KVStoreActor.default_name())

        endpoint1 = 'localhost:12345'
        endpoint2 = 'localhost:23456'
        res = dict(hardware=dict(cpu=4, memory=4096))

        def write_mock_meta():
            resource_ref.set_worker_meta(endpoint1, res)
            resource_ref.set_worker_meta(endpoint2, res)

        g = gevent.spawn(write_mock_meta)
        g.join()

        assigner_ref = pool.create_actor(AssignerActor, uid='AssignerActor')

        session_id = str(uuid.uuid4())
        op_key = str(uuid.uuid4())
        chunk_key1 = str(uuid.uuid4())
        chunk_key2 = str(uuid.uuid4())
        chunk_key3 = str(uuid.uuid4())

        op_info = {
            'op_name': 'test_op',
            'io_meta': dict(input_chunks=[chunk_key1, chunk_key2, chunk_key3]),
            'output_size': 512,
            'retries': 0,
            'optimize': {
                'depth': 0, 'demand_depths': (), 'successor_size': 1,
                'descendant_size': 0
            }
        }
        # chunk1 and chunk2 reside on endpoint1, chunk3 on endpoint2
        kv_store_ref.write('/sessions/%s/chunks/%s/data_size' % (session_id, chunk_key1), 512)
        kv_store_ref.write('/sessions/%s/chunks/%s/workers/%s' % (session_id, chunk_key1, endpoint1), '')
        kv_store_ref.write('/sessions/%s/chunks/%s/data_size' % (session_id, chunk_key2), 512)
        kv_store_ref.write('/sessions/%s/chunks/%s/workers/%s' % (session_id, chunk_key2, endpoint1), '')
        kv_store_ref.write('/sessions/%s/chunks/%s/data_size' % (session_id, chunk_key3), 512)
        kv_store_ref.write('/sessions/%s/chunks/%s/workers/%s' % (session_id, chunk_key3, endpoint2), '')

        reply_ref = pool.create_actor(PromiseReplyTestActor)
        reply_callback = ((reply_ref.uid, reply_ref.address), 'reply')
        assigner_ref.apply_for_resource(session_id, op_key, op_info, callback=reply_callback)

        while not reply_ref.get_reply():
            gevent.sleep(0.1)
        _, ret_value = reply_ref.get_reply()
        self.assertEqual(ret_value, endpoint1)
def run_transfer_worker(pool_address, session_id, chunk_keys, spill_dir, msg_queue):
    """Run a transfer-test worker pool in a separate process.

    Registers *chunk_keys*, reports the plasma socket back through
    *msg_queue*, then waits until HolderActor signals completion.
    """
    from mars.config import options

    options.worker.spill_directory = spill_dir
    plasma_size = 1024 * 1024 * 10

    # don't use multiple with-statement as we need the options be forked
    with plasma.start_plasma_store(plasma_size) as store_args:
        options.worker.plasma_socket = plasma_socket = store_args[0]

        with create_actor_pool(n_process=2, backend='gevent',
                               distributor=WorkerDistributor(2),
                               address=pool_address) as pool:
            try:
                pool.create_actor(ClusterInfoActor, schedulers=[pool_address],
                                  uid=ClusterInfoActor.default_name())
                pool.create_actor(ChunkMetaActor, uid=ChunkMetaActor.default_name())
                pool.create_actor(DispatchActor, uid=DispatchActor.default_name())
                pool.create_actor(QuotaActor, 1024 * 1024 * 20,
                                  uid=MemQuotaActor.default_name())
                holder_ref = pool.create_actor(HolderActor, uid='HolderActor')
                chunk_holder_ref = pool.create_actor(
                    ChunkHolderActor, plasma_size, uid=ChunkHolderActor.default_name())
                pool.create_actor(SpillActor)

                # two senders and two receivers with random uids
                pool.create_actor(SenderActor, uid='%s' % str(uuid.uuid4()))
                pool.create_actor(SenderActor, uid='%s' % str(uuid.uuid4()))
                pool.create_actor(ReceiverActor, uid='%s' % str(uuid.uuid4()))
                pool.create_actor(ReceiverActor, uid='%s' % str(uuid.uuid4()))

                register_actor = pool.create_actor(WorkerRegistrationTestActor)
                register_actor.register(session_id, chunk_keys)

                check_time = time.time()
                while not register_actor.get_finished():
                    gevent.sleep(0.5)
                    if time.time() - check_time > 60:
                        raise SystemError('Wait result timeout')
                register_actor.destroy()

                # tell the parent process which plasma socket to use
                msg_queue.put(plasma_socket)

                check_time = time.time()
                while not holder_ref.obtain():
                    gevent.sleep(1)
                    if time.time() - check_time > 60:
                        raise SystemError('Wait result timeout')
            finally:
                pool.destroy_actor(chunk_holder_ref)
def _run_operand_case(session_id, graph_key, tensor, execution_creator):
    """Drive a graph built from *tensor* through GraphActor, with worker
    execution refs mocked by *execution_creator*, until a terminal state."""
    graph = tensor.build_graph(compose=False)

    with create_actor_pool(n_process=1, backend='gevent') as pool:
        pool.create_actor(ClusterInfoActor, [pool.cluster_info.address],
                          uid=ClusterInfoActor.default_name())
        resource_ref = pool.create_actor(ResourceActor, uid=ResourceActor.default_name())
        pool.create_actor(ChunkMetaActor, uid=ChunkMetaActor.default_name())
        pool.create_actor(AssignerActor, uid=AssignerActor.default_name())
        graph_ref = pool.create_actor(GraphActor, session_id, graph_key,
                                      serialize_graph(graph),
                                      uid=GraphActor.gen_name(
                                          session_id, graph_key))

        addr_dict = dict()

        def _build_mock_ref(uid=None, address=None):
            # one fake execution ref per worker address, cached across calls
            if address in addr_dict:
                return addr_dict[address]
            else:
                r = addr_dict[address] = execution_creator(pool)
                return r

        # handle mock objects
        OperandActor._get_raw_execution_ref.side_effect = _build_mock_ref

        mock_resource = dict(hardware=dict(cpu=4, cpu_total=4, memory=512))

        resource_ref.set_worker_meta('localhost:12345', mock_resource)
        resource_ref.set_worker_meta('localhost:23456', mock_resource)

        graph_ref.prepare_graph()
        fetched_graph = graph_ref.get_chunk_graph()

        graph_ref.analyze_graph()

        # collect keys of terminal chunks (no successors)
        final_keys = set()
        for c in fetched_graph:
            if fetched_graph.count_successors(c) == 0:
                final_keys.add(c.op.key)

        graph_ref.create_operand_actors()

        graph_meta_ref = pool.actor_ref(
            GraphMetaActor.gen_name(session_id, graph_key))
        start_time = time.time()
        while True:
            pool.sleep(0.1)
            if time.time() - start_time > 30:
                raise SystemError('Wait for execution finish timeout')
            if graph_meta_ref.get_state() in (GraphState.SUCCEEDED, GraphState.FAILED, GraphState.CANCELLED):
                break
def execute_case():
    # KV-store variant: build scheduler actors, mock resources and execution
    # refs, then run the graph and poll the kv store for a terminal state.
    pool.create_actor(ClusterInfoActor, [pool.cluster_info.address],
                      uid=ClusterInfoActor.default_name())
    resource_ref = pool.create_actor(
        ResourceActor, uid=ResourceActor.default_name())
    kv_store_ref = pool.create_actor(
        KVStoreActor, uid=KVStoreActor.default_name())
    pool.create_actor(AssignerActor, uid=AssignerActor.gen_name(session_id))
    graph_ref = pool.create_actor(GraphActor, session_id, graph_key, serialize_graph(graph),
                                  uid=GraphActor.gen_name(
                                      session_id, graph_key))
    execution_ref = execution_creator(pool)

    # handle mock objects
    OperandActor._get_raw_execution_ref.side_effect = lambda: execution_ref

    mock_resource = dict(hardware=dict(cpu=4, cpu_total=4, memory=512))

    def write_mock_meta():
        resource_ref.set_worker_meta('localhost:12345', mock_resource)
        resource_ref.set_worker_meta('localhost:23456', mock_resource)

    v = gevent.spawn(write_mock_meta)
    v.join()

    graph_ref.prepare_graph()
    # the prepared chunk graph is read back from the kv store
    graph_data = kv_store_ref.read(
        '/sessions/%s/graphs/%s/chunk_graph' % (session_id, graph_key)).value
    fetched_graph = deserialize_graph(graph_data)

    graph_ref.scan_node()
    graph_ref.place_initial_chunks()

    # collect keys of terminal chunks (no successors)
    final_keys = set()
    for c in fetched_graph:
        if fetched_graph.count_successors(c) == 0:
            final_keys.add(c.op.key)

    graph_ref.create_operand_actors()

    start_time = time.time()
    while True:
        gevent.sleep(0.1)
        if time.time() - start_time > 30:
            raise SystemError('Wait for execution finish timeout')
        if kv_store_ref.read('/sessions/%s/graph/%s/state' % (session_id, graph_key)).value.lower() \
                in ('succeeded', 'failed', 'cancelled'):
            break
def testExecuteWorker(self):
    """Launch a real worker subprocess and run a test graph against it.

    The worker is started via ``python -m mars.worker``; we wait until it
    registers with the mock scheduler, then run the test through
    WorkerProcessTestActor.  The subprocess is always cleaned up in the
    ``finally`` block.

    Fix: ``proc`` is initialized to None before the ``try`` so the
    ``finally`` block no longer raises NameError if pool/actor setup fails
    before the subprocess is spawned.
    """
    mock_scheduler_addr = '127.0.0.1:%d' % get_next_port()
    proc = None
    try:
        with create_actor_pool(n_process=1, backend='gevent',
                               address=mock_scheduler_addr) as pool:
            pool.create_actor(ClusterInfoActor, schedulers=[mock_scheduler_addr],
                              uid=ClusterInfoActor.default_name())
            pool.create_actor(ChunkMetaActor, uid=ChunkMetaActor.default_name())
            resource_ref = pool.create_actor(ResourceActor, uid=ResourceActor.default_name())

            proc = subprocess.Popen([sys.executable, '-m', 'mars.worker',
                                     '-a', '127.0.0.1',
                                     '--schedulers', mock_scheduler_addr,
                                     '--cpu-procs', '1',
                                     '--cache-mem', '10m',
                                     '--ignore-avail-mem'])
            worker_ips = []

            def waiter():
                # block until the worker registers its meta, or dies/times out
                check_time = time.time()
                while True:
                    if not resource_ref.get_workers_meta():
                        gevent.sleep(0.5)
                        if proc.poll() is not None:
                            raise SystemError('Worker dead. exit code %s' % proc.poll())
                        if time.time() - check_time > 20:
                            raise SystemError('Check meta_timestamp timeout')
                        continue
                    else:
                        break
                val = resource_ref.get_workers_meta()
                worker_ips.extend(val.keys())

            gl = gevent.spawn(waiter)
            gl.join()

            test_ref = pool.create_actor(WorkerProcessTestActor)
            test_ref.run_test(worker_ips[0], _tell=True)

            check_time = time.time()
            while not test_ref.get_reply():
                gevent.sleep(0.1)
                if time.time() - check_time > 20:
                    raise SystemError('Check reply timeout')
    finally:
        if proc is not None:
            # graceful SIGINT first, hard kill after a 5 second grace period
            if proc.poll() is None:
                proc.send_signal(signal.SIGINT)

            check_time = time.time()
            while True:
                time.sleep(1)
                if proc.poll() is not None or time.time() - check_time >= 5:
                    break

            if proc.poll() is None:
                proc.kill()

        if os.path.exists(options.worker.plasma_socket):
            os.unlink(options.worker.plasma_socket)
def setUp(self):
    """Spin up a single-process scheduler pool plus an API client for it."""
    endpoint = '127.0.0.1:%d' % get_next_port()
    self.endpoint = endpoint
    self.pool = create_actor_pool(n_process=1, backend='gevent', address=endpoint)
    self.pool.create_actor(ClusterInfoActor, [endpoint],
                           uid=ClusterInfoActor.default_name())
    self.pool.create_actor(SessionManagerActor,
                           uid=SessionManagerActor.default_name())
    self.pool.create_actor(ResourceActor, uid=ResourceActor.default_name())

    self.api = MarsAPI(endpoint)
def run_transfer_worker(pool_address, session_id, plasma_socket, chunk_keys, spill_dir, msg_queue):
    """Run a transfer-test worker pool backed by a dedicated plasma store.

    Signals readiness by putting 1 into *msg_queue*, then waits for
    HolderActor before tearing everything down.
    """
    from mars.config import options
    from mars.utils import PlasmaProcessHelper

    options.worker.plasma_socket = plasma_socket
    options.worker.spill_directory = spill_dir

    plasma_helper = PlasmaProcessHelper(size=1024 * 1024 * 10,
                                        socket=options.worker.plasma_socket)
    try:
        plasma_helper.run()

        with create_actor_pool(n_process=2, backend='gevent',
                               distributor=WorkerDistributor(2),
                               address=pool_address) as pool:
            try:
                pool.create_actor(ClusterInfoActor, schedulers=[pool_address],
                                  uid=ClusterInfoActor.default_name())
                pool.create_actor(KVStoreActor, uid=KVStoreActor.default_name())
                pool.create_actor(DispatchActor, uid=DispatchActor.default_name())
                pool.create_actor(QuotaActor, 1024 * 1024 * 20,
                                  uid=MemQuotaActor.default_name())
                holder_ref = pool.create_actor(HolderActor, uid='HolderActor')
                chunk_holder_ref = pool.create_actor(ChunkHolderActor, plasma_helper._size,
                                                     uid=ChunkHolderActor.default_name())
                pool.create_actor(SpillActor)

                # two senders and two receivers with random uids
                pool.create_actor(SenderActor, uid='%s' % str(uuid.uuid4()))
                pool.create_actor(SenderActor, uid='%s' % str(uuid.uuid4()))
                pool.create_actor(ReceiverActor, uid='%s' % str(uuid.uuid4()))
                pool.create_actor(ReceiverActor, uid='%s' % str(uuid.uuid4()))

                register_actor = pool.create_actor(WorkerRegistrationTestActor)
                register_actor.register(session_id, chunk_keys)

                check_time = time.time()
                while not register_actor.get_finished():
                    gevent.sleep(0.5)
                    if time.time() - check_time > 60:
                        raise SystemError('Wait result timeout')
                register_actor.destroy()

                # notify the parent process that chunks are registered
                msg_queue.put(1)

                check_time = time.time()
                while not holder_ref.obtain():
                    gevent.sleep(1)
                    if time.time() - check_time > 60:
                        raise SystemError('Wait result timeout')
            finally:
                pool.destroy_actor(chunk_holder_ref)
    finally:
        plasma_helper.stop()
def testWorkerProcessRestart(self):
    """Kill a worker's calc process and verify the daemon restarts it.

    Fix: ``proc`` is initialized to None before the ``try`` so the
    ``finally`` cleanup no longer raises NameError when pool/actor setup
    fails before the worker subprocess is spawned.
    """
    mock_scheduler_addr = '127.0.0.1:%d' % get_next_port()
    proc = None
    try:
        with create_actor_pool(n_process=1, backend='gevent',
                               address=mock_scheduler_addr) as pool:
            pool.create_actor(ClusterInfoActor, schedulers=[mock_scheduler_addr],
                              uid=ClusterInfoActor.default_name())
            pool.create_actor(ChunkMetaActor, uid=ChunkMetaActor.default_name())
            resource_ref = pool.create_actor(
                ResourceActor, uid=ResourceActor.default_name())

            proc = subprocess.Popen([
                sys.executable, '-m', 'mars.worker',
                '-a', '127.0.0.1',
                '--schedulers', mock_scheduler_addr,
                '--cpu-procs', '1',
                '--cache-mem', '10m',
                '--spill-dir', self._spill_dir,
                '--ignore-avail-mem'
            ])
            worker_endpoint = self._wait_worker_ready(proc, resource_ref)

            daemon_ref = pool.actor_ref(WorkerDaemonActor.default_name(),
                                        address=worker_endpoint)
            dispatch_ref = pool.actor_ref(DispatchActor.default_name(),
                                          address=worker_endpoint)

            # kill one cpu calc slot and wait for the daemon to revive it
            cpu_slots = dispatch_ref.get_slots('cpu')
            calc_ref = pool.actor_ref(cpu_slots[0], address=worker_endpoint)
            daemon_ref.kill_actor_process(calc_ref)

            check_start = time.time()
            while not daemon_ref.is_actor_process_alive(calc_ref):
                gevent.sleep(0.1)
                if time.time() - check_start > 10:
                    raise TimeoutError('Check process restart timeout')
    finally:
        if proc is not None:
            # graceful SIGINT first, hard kill after a 5 second grace period
            if proc.poll() is None:
                proc.send_signal(signal.SIGINT)

            check_time = time.time()
            while True:
                time.sleep(0.1)
                if proc.poll() is not None or time.time() - check_time >= 5:
                    break

            if proc.poll() is None:
                proc.kill()

        if os.path.exists(options.worker.plasma_socket):
            os.unlink(options.worker.plasma_socket)
def start_transfer_test_pool(**kwargs):
    """Generator context: build a worker pool for transfer tests.

    Yields the pool after the standard actor set is created, then destroys
    the chunk holder once the caller finishes.
    """
    address = kwargs.pop('address')
    plasma_size = kwargs.pop('plasma_size')
    with create_actor_pool(n_process=1, backend='gevent', address=address, **kwargs) as pool:
        pool.create_actor(PlasmaKeyMapActor, uid=PlasmaKeyMapActor.default_name())
        pool.create_actor(ClusterInfoActor, schedulers=[address],
                          uid=ClusterInfoActor.default_name())
        pool.create_actor(ChunkMetaActor, uid=ChunkMetaActor.default_name())
        pool.create_actor(DispatchActor, uid=DispatchActor.default_name())
        pool.create_actor(QuotaActor, 1024 * 1024 * 20,
                          uid=MemQuotaActor.default_name())
        holder_ref = pool.create_actor(ChunkHolderActor, plasma_size,
                                       uid=ChunkHolderActor.default_name())
        pool.create_actor(SpillActor)
        pool.create_actor(StatusActor, address, uid=StatusActor.default_name())

        yield pool

        holder_ref.destroy()
def testFailoverMessage(self):
    """Dead-worker detection should notify graphs and blacklist the worker.

    A worker with an expired heartbeat is detected as dead; the graph actor
    must receive the removal and lost-chunk notification, and re-registering
    the worker is ignored until the blacklist period elapses.

    Fix: ``options.scheduler.worker_blacklist_time`` is now restored in a
    ``finally`` block instead of leaking the 0.5s override into later tests.
    """
    mock_session_id = str(uuid.uuid4())
    mock_graph_key = str(uuid.uuid4())
    mock_chunk_key = str(uuid.uuid4())
    addr = '127.0.0.1:%d' % get_next_port()
    mock_worker_addr = '127.0.0.1:54132'

    old_blacklist_time = options.scheduler.worker_blacklist_time
    options.scheduler.worker_blacklist_time = 0.5
    try:
        with create_actor_pool(n_process=1, backend='gevent', address=addr) as pool:
            pool.create_actor(ClusterInfoActor, [pool.cluster_info.address],
                              uid=ClusterInfoActor.default_name())
            pool.create_actor(AssignerActor, uid=AssignerActor.default_name())
            session_manager_ref = pool.create_actor(
                SessionManagerActor, uid=SessionManagerActor.default_name())
            resource_ref = pool.create_actor(ResourceActor, uid=ResourceActor.default_name())
            chunk_meta_ref = pool.create_actor(
                ChunkMetaActor, uid=ChunkMetaActor.default_name())

            session_ref = pool.actor_ref(session_manager_ref.create_session(mock_session_id))
            chunk_meta_ref.set_chunk_meta(mock_session_id, mock_chunk_key, size=80,
                                          shape=(10,), workers=(mock_worker_addr,))

            with mock.patch(GraphActor.__module__ + '.' + GraphActor.__name__,
                            new=MockGraphActor):
                session_ref.submit_tensor_graph(None, mock_graph_key)
                graph_ref = pool.actor_ref(GraphActor.gen_name(mock_session_id,
                                                               mock_graph_key))

                # expire the worker's heartbeat and trigger dead-worker detection
                expire_time = time.time() - options.scheduler.status_timeout - 1
                resource_ref.set_worker_meta(mock_worker_addr, dict(update_time=expire_time))

                resource_ref.detect_dead_workers(_tell=True)
                pool.sleep(0.2)

                _, removes, lost_chunks = graph_ref.get_worker_change_args()
                self.assertListEqual(removes, [mock_worker_addr])
                self.assertListEqual(lost_chunks, [mock_chunk_key])

                self.assertNotIn(mock_worker_addr, resource_ref.get_workers_meta())
                # while blacklisted, re-registration is ignored
                resource_ref.set_worker_meta(mock_worker_addr, dict(update_time=time.time()))
                self.assertNotIn(mock_worker_addr, resource_ref.get_workers_meta())

                # after the blacklist period, the worker may rejoin
                pool.sleep(0.4)
                resource_ref.set_worker_meta(mock_worker_addr, dict(update_time=time.time()))
                self.assertIn(mock_worker_addr, resource_ref.get_workers_meta())
    finally:
        options.scheduler.worker_blacklist_time = old_blacklist_time
def testExecuteWorker(self):
    """Launch a worker subprocess (with spill dir) and run the test graph.

    Fix: ``proc`` is initialized to None before the ``try`` so the
    ``finally`` cleanup no longer raises NameError when pool/actor setup
    fails before the worker subprocess is spawned.
    """
    mock_scheduler_addr = '127.0.0.1:%d' % get_next_port()
    proc = None
    try:
        with create_actor_pool(n_process=1, backend='gevent',
                               address=mock_scheduler_addr) as pool:
            pool.create_actor(ClusterInfoActor, schedulers=[mock_scheduler_addr],
                              uid=ClusterInfoActor.default_name())
            pool.create_actor(ChunkMetaActor, uid=ChunkMetaActor.default_name())
            resource_ref = pool.create_actor(
                ResourceActor, uid=ResourceActor.default_name())

            proc = subprocess.Popen([
                sys.executable, '-m', 'mars.worker',
                '-a', '127.0.0.1',
                '--schedulers', mock_scheduler_addr,
                '--cpu-procs', '1',
                '--cache-mem', '10m',
                '--spill-dir', self._spill_dir,
                '--ignore-avail-mem'
            ])
            worker_endpoint = self._wait_worker_ready(proc, resource_ref)

            test_ref = pool.create_actor(WorkerProcessTestActor)
            test_ref.run_test(worker_endpoint, _tell=True)

            check_time = time.time()
            while not test_ref.get_reply():
                gevent.sleep(0.1)
                if time.time() - check_time > 20:
                    raise TimeoutError('Check reply timeout')
    finally:
        if proc is not None:
            # graceful SIGINT first, hard kill after a 5 second grace period
            if proc.poll() is None:
                proc.send_signal(signal.SIGINT)

            check_time = time.time()
            while True:
                time.sleep(0.1)
                if proc.poll() is not None or time.time() - check_time >= 5:
                    break

            if proc.poll() is None:
                proc.kill()

        if os.path.exists(options.worker.plasma_socket):
            os.unlink(options.worker.plasma_socket)
def testStatus(self):
    """Smoke-test StatusActor next to a KV store: bring the actors up, then
    read worker metadata from the KV store after a delay and print it."""
    endpoint = '127.0.0.1:%d' % get_next_port()
    with create_actor_pool(n_process=1, backend='gevent', address=endpoint) as pool:
        # Actor creation order mirrors the dependencies between the actors.
        pool.create_actor(ClusterInfoActor, schedulers=[endpoint],
                          uid=ClusterInfoActor.default_name())
        pool.create_actor(KVStoreActor, uid='KVStoreActor')
        pool.create_actor(ChunkHolderActor, self._plasma_helper._size,
                          uid='ChunkHolderActor')
        pool.create_actor(StatusActor, '127.0.0.1:1234', uid='StatusActor')

        def _read_meta_after_delay():
            # Give StatusActor time to publish its metadata before reading.
            gevent.sleep(2)
            return self._kv_store.read('/workers/meta', recursive=True)

        greenlet = gevent.spawn(_read_meta_after_delay)
        greenlet.join()
        meta_value = greenlet.value
        print(meta_value)
def testStatus(self):
    """Verify StatusActor pushes worker metadata into ResourceActor.

    Runs against a temporary spill directory and restores the original
    option (and removes the directory) whatever the outcome.
    """
    endpoint = '127.0.0.1:%d' % get_next_port()
    saved_spill_dir = options.worker.spill_directory
    temp_dir = options.worker.spill_directory = tempfile.mkdtemp(
        prefix='temp-mars-spill-')
    try:
        with create_actor_pool(n_process=1, backend='gevent', address=endpoint) as pool:
            pool.create_actor(ClusterInfoActor, schedulers=[endpoint],
                              uid=ClusterInfoActor.default_name())
            resource_ref = pool.create_actor(
                ResourceActor, uid=ResourceActor.default_name())
            pool.create_actor(ChunkHolderActor, self.plasma_storage_size,
                              uid=ChunkHolderActor.default_name())
            status_ref = pool.create_actor(StatusActor, endpoint,
                                           uid=StatusActor.default_name())

            status_ref.update_slots({'cpu': 4})
            status_ref.update_stats({'min_est_finish_time': 10})

            def _fetch_workers_meta():
                # Wait long enough for StatusActor to report to ResourceActor.
                gevent.sleep(1.5)
                return resource_ref.get_workers_meta()

            greenlet = gevent.spawn(_fetch_workers_meta)
            greenlet.join()
            self.assertIsNotNone(greenlet.value)

            pool.destroy_actor(status_ref)
    finally:
        options.worker.spill_directory = saved_spill_dir
        shutil.rmtree(temp_dir)
def start_processes(self, n_schedulers=2, n_workers=2, etcd=False, modules=None,
                    log_scheduler=True, log_worker=True):
    """Launch scheduler and worker subprocesses and block until the cluster
    reports the requested number of schedulers and workers (20s cap).

    Fixes over the original:
    * ``gevent.hub.Hub.NOT_ERROR`` is now restored in a ``finally`` block, so
      a startup failure no longer leaks the patched global.
    * The readiness loop catches ``Exception`` instead of a bare ``except:``,
      so ``KeyboardInterrupt``/``SystemExit`` are no longer swallowed and
      retried for up to 20 seconds.
    * Grammar fixed in the two readiness error messages ("does not meet").
    """
    old_not_errors = gevent.hub.Hub.NOT_ERROR
    # Silence gevent hub error reporting during noisy cluster startup.
    gevent.hub.Hub.NOT_ERROR = (Exception,)
    try:
        scheduler_ports = [str(get_next_port()) for _ in range(n_schedulers)]
        self.scheduler_endpoints = ['127.0.0.1:' + p for p in scheduler_ports]

        # Common command-line arguments shared by schedulers and workers.
        append_args = []
        if modules:
            append_args.extend(['--load-modules', ','.join(modules)])
        if etcd:
            etcd_port = get_next_port()
            self.etcd_helper = EtcdProcessHelper(port_range_start=etcd_port)
            self.etcd_helper.run()
            options.kv_store = 'etcd://127.0.0.1:%s' % etcd_port
            append_args.extend(['--kv-store', options.kv_store])
        else:
            append_args.extend(['--schedulers', ','.join(self.scheduler_endpoints)])
        if 'DUMP_GRAPH_DATA' in os.environ:
            append_args += ['-Dscheduler.dump_graph_data=true']

        self.proc_schedulers = [
            subprocess.Popen([sys.executable, '-m', 'mars.scheduler',
                              '-H', '127.0.0.1',
                              '--level', 'debug' if log_scheduler else 'warning',
                              '-p', p,
                              '--format', '%(asctime)-15s %(message)s',
                              '-Dscheduler.retry_delay=5'] + append_args)
            for p in scheduler_ports
        ]
        self.proc_workers = [
            subprocess.Popen([sys.executable, '-m', 'mars.worker',
                              '-a', '127.0.0.1',
                              '--cpu-procs', '1',
                              '--level', 'debug' if log_worker else 'warning',
                              '--cache-mem', '16m',
                              '--ignore-avail-mem',
                              '-Dworker.prepare_data_timeout=30'] + append_args)
            for _ in range(n_workers)
        ]

        actor_client = new_client()
        self.cluster_info = actor_client.actor_ref(
            ClusterInfoActor.default_name(), address=self.scheduler_endpoints[0])

        # Poll until schedulers and workers have all registered, re-raising
        # the last failure after a 20-second deadline.
        check_time = time.time()
        while True:
            try:
                started_schedulers = self.cluster_info.get_schedulers()
                if len(started_schedulers) < n_schedulers:
                    raise RuntimeError('Schedulers does not meet requirement: %d < %d.'
                                       % (len(started_schedulers), n_schedulers))

                actor_address = self.cluster_info.get_scheduler(
                    SessionManagerActor.default_name())
                self.session_manager_ref = actor_client.actor_ref(
                    SessionManagerActor.default_name(), address=actor_address)

                actor_address = self.cluster_info.get_scheduler(
                    ResourceActor.default_name())
                resource_ref = actor_client.actor_ref(
                    ResourceActor.default_name(), address=actor_address)

                if resource_ref.get_worker_count() < n_workers:
                    raise RuntimeError('Workers does not meet requirement: %d < %d.'
                                       % (resource_ref.get_worker_count(), n_workers))
                break
            except Exception:
                if time.time() - check_time > 20:
                    raise
                time.sleep(0.1)
    finally:
        gevent.hub.Hub.NOT_ERROR = old_not_errors
def testSimpleTransfer(self):
    """Transfer chunks from a remote worker process into a local pool and
    check that the transferred data (in plasma or spilled to disk) matches.

    Fix: ``msg_queue.get(30)`` passed 30 as the *block* positional argument
    of ``multiprocessing.Queue.get``, not as a timeout, so a hung transfer
    worker would block this test forever. It now uses ``get(timeout=30)``.
    """
    import tempfile
    session_id = str(uuid.uuid4())

    local_pool_addr = 'localhost:%d' % get_next_port()
    remote_pool_addr = 'localhost:%d' % get_next_port()
    remote_chunk_keys = [str(uuid.uuid4()) for _ in range(9)]
    msg_queue = multiprocessing.Queue()

    remote_spill_dir = os.path.join(
        tempfile.gettempdir(),
        'mars_spill_%d_%d' % (os.getpid(), id(run_transfer_worker)))

    proc = multiprocessing.Process(
        target=run_transfer_worker,
        args=(remote_pool_addr, session_id, remote_chunk_keys,
              remote_spill_dir, msg_queue))
    proc.start()
    try:
        # The worker posts its plasma socket path once it is ready;
        # bound the wait so a dead worker cannot hang the test.
        remote_plasma_socket = msg_queue.get(timeout=30)
    except BaseException:  # intentionally broad: always reap the child
        if proc.is_alive():
            proc.terminate()
        raise

    with create_actor_pool(n_process=1, distributor=WorkerDistributor(1),
                           backend='gevent', address=local_pool_addr) as pool:
        pool.create_actor(ClusterInfoActor, schedulers=[local_pool_addr],
                          uid=ClusterInfoActor.default_name())
        pool.create_actor(ChunkMetaActor, uid=ChunkMetaActor.default_name())
        pool.create_actor(DispatchActor, uid=DispatchActor.default_name())
        pool.create_actor(QuotaActor, 1024 * 1024 * 20, uid=MemQuotaActor.default_name())
        cache_ref = pool.create_actor(ChunkHolderActor, self.plasma_storage_size,
                                      uid=ChunkHolderActor.default_name())
        pool.create_actor(SpillActor)

        # Two sender / four receiver actors spread over two worker slots.
        sender_refs = [
            pool.create_actor(SenderActor, uid='w:1:%s' % str(uuid.uuid4())),
            pool.create_actor(SenderActor, uid='w:2:%s' % str(uuid.uuid4())),
        ]
        receiver_refs = [
            pool.create_actor(ReceiverActor, uid='w:1:%s' % str(uuid.uuid4())),
            pool.create_actor(ReceiverActor, uid='w:1:%s' % str(uuid.uuid4())),
            pool.create_actor(ReceiverActor, uid='w:2:%s' % str(uuid.uuid4())),
            pool.create_actor(ReceiverActor, uid='w:2:%s' % str(uuid.uuid4())),
        ]

        try:
            for data_id in (-1, 1):
                chunk_key = remote_chunk_keys[data_id]
                with self.run_actor_test(pool) as test_actor:
                    from mars.worker.spill import build_spill_file_name
                    from mars.serialize import dataserializer
                    from numpy.testing import assert_array_equal

                    remote_dispatch_ref = test_actor.promise_ref(
                        DispatchActor.default_name(), address=remote_pool_addr)
                    remote_plasma_client = plasma.connect(remote_plasma_socket, '', 0)
                    remote_store = PlasmaChunkStore(remote_plasma_client)

                    # NOTE: these closures capture the loop's chunk_key; safe
                    # because each iteration blocks on get_result() below.
                    def _call_send_data(sender_uid):
                        sender_ref = test_actor.promise_ref(
                            sender_uid, address=remote_pool_addr)
                        return sender_ref.send_data(session_id, chunk_key,
                                                    local_pool_addr, _promise=True)

                    def _test_data_exist(*_):
                        # Data may be in the chunk store or spilled to disk
                        # on either side; compare whichever copies exist.
                        try:
                            local_data = test_actor._chunk_store.get(session_id, chunk_key)
                        except KeyError:
                            with open(build_spill_file_name(chunk_key), 'rb') as spill_file:
                                local_data = dataserializer.load(spill_file)

                        try:
                            remote_data = remote_store.get(session_id, chunk_key)
                        except KeyError:
                            with open(build_spill_file_name(chunk_key, remote_spill_dir),
                                      'rb') as spill_file:
                                remote_data = dataserializer.load(spill_file)

                        assert_array_equal(local_data, remote_data)
                        del local_data, remote_data

                    remote_dispatch_ref.get_free_slot('sender', _promise=True) \
                        .then(_call_send_data) \
                        .then(_test_data_exist) \
                        .then(
                            lambda *_: test_actor.set_result(chunk_key),
                            lambda *exc: test_actor.set_result(exc, False),
                        )

                self.assertEqual(self.get_result(60), chunk_key)

            remote_holder_ref = pool.actor_ref('HolderActor', address=remote_pool_addr)
            remote_holder_ref.trigger()
        finally:
            for ref in sender_refs:
                pool.destroy_actor(ref)
            for ref in receiver_refs:
                pool.destroy_actor(ref)
            pool.destroy_actor(cache_ref)

            os.unlink(remote_plasma_socket)
            # Ask the remote worker to stop, escalating to terminate()
            # if it is still alive after ~5 seconds.
            os.kill(proc.pid, signal.SIGINT)

            t = time.time()
            while proc.is_alive() and time.time() < t + 5:
                time.sleep(1)
            if proc.is_alive():
                proc.terminate()
def testGraphActor(self):
    """Walk GraphActor through its preparation pipeline (prepare, scan,
    placement, operand-actor creation) and check the metadata it produces
    at every stage against a locally tiled reference graph.
    """
    session_id = str(uuid.uuid4())
    graph_key = str(uuid.uuid4())

    # Build a small add-expression graph; the tiled version serves as the
    # reference for the chunk graph the actor should produce.
    arr = mt.random.randint(10, size=(10, 8), chunk_size=4)
    arr_add = mt.random.randint(10, size=(10, 8), chunk_size=4)
    arr2 = arr + arr_add
    graph = arr2.build_graph(compose=False)
    serialized_graph = serialize_graph(graph)
    chunked_graph = arr2.build_graph(compose=False, tiled=True)

    addr = '127.0.0.1:%d' % get_next_port()
    with create_actor_pool(n_process=1, backend='gevent', address=addr) as pool:
        pool.create_actor(ClusterInfoActor, [pool.cluster_info.address],
                          uid=ClusterInfoActor.default_name())
        resource_ref = pool.create_actor(ResourceActor, uid=ResourceActor.default_name())
        pool.create_actor(ChunkMetaActor, uid=ChunkMetaActor.default_name())
        pool.create_actor(AssignerActor, uid=AssignerActor.gen_name(session_id))
        graph_ref = pool.create_actor(GraphActor, session_id, graph_key, serialized_graph,
                                      uid=GraphActor.gen_name(session_id, graph_key))

        # Stage 1: tiling — the fetched chunk graph must match the reference.
        graph_ref.prepare_graph(compose=False)
        fetched_graph = graph_ref.get_chunk_graph()
        self.assertIsNotNone(fetched_graph)
        self.assertEqual(len(chunked_graph), len(fetched_graph))

        # Stage 2: scan — every operand must carry optimize statistics.
        graph_ref.scan_node()
        op_infos = graph_ref.get_operand_info()
        for n in fetched_graph:
            depth = op_infos[n.op.key]['optimize']['depth']
            self.assertIsNotNone(depth)
            successor_size = op_infos[n.op.key]['optimize']['successor_size']
            self.assertIsNotNone(successor_size)
            descendant_size = op_infos[n.op.key]['optimize']['descendant_size']
            self.assertIsNotNone(descendant_size)

        # Register two fake workers so initial placement has targets.
        resource_ref.set_worker_meta('localhost:12345', dict(hardware=dict(cpu_total=4)))
        resource_ref.set_worker_meta('localhost:23456', dict(hardware=dict(cpu_total=4)))

        # Stage 3: placement — every source chunk gets a target worker.
        graph_ref.place_initial_chunks()
        op_infos = graph_ref.get_operand_info()
        for n in fetched_graph:
            if fetched_graph.count_predecessors(n) != 0:
                continue
            target_worker = op_infos[n.op.key]['target_worker']
            self.assertIsNotNone(target_worker)

        # Stage 4: operand actors — io_meta must mirror the graph topology.
        graph_ref.create_operand_actors(_clean_io_meta=False)
        op_infos = graph_ref.get_operand_info()
        for n in fetched_graph:
            self.assertEqual(op_infos[n.op.key]['op_name'], type(n.op).__name__)

            io_meta = op_infos[n.op.key]['io_meta']
            # Reconstruct the expected io_meta directly from the chunk graph.
            orig_io_meta = dict(
                predecessors=list(set(pn.op.key for pn in fetched_graph.iter_predecessors(n))),
                successors=list(set(sn.op.key for sn in fetched_graph.iter_successors(n))),
                input_chunks=list(set(pn.key for pn in fetched_graph.iter_predecessors(n))),
                chunks=list(c.key for c in n.op.outputs),
            )
            self.assertSetEqual(set(io_meta['predecessors']), set(orig_io_meta['predecessors']))
            self.assertSetEqual(set(io_meta['successors']), set(orig_io_meta['successors']))
            self.assertSetEqual(set(io_meta['input_chunks']), set(orig_io_meta['input_chunks']))
            self.assertSetEqual(set(io_meta['chunks']), set(orig_io_meta['chunks']))

            self.assertEqual(op_infos[n.op.key]['output_size'],
                             sum(ch.nbytes for ch in n.op.outputs))
def testSimpleTransfer(self):
    """Run chunk transfers through TransferTestActor against a remote worker
    process, polling the actor for per-chunk results.

    Fix: ``msg_queue.get(30)`` passed 30 as the *block* positional argument
    of ``multiprocessing.Queue.get``, not as a timeout, so a hung transfer
    worker would block this test forever. It now uses ``get(timeout=30)``.
    """
    import tempfile
    session_id = str(uuid.uuid4())

    local_pool_addr = 'localhost:%d' % get_next_port()
    remote_pool_addr = 'localhost:%d' % get_next_port()
    remote_chunk_keys = [str(uuid.uuid4()) for _ in range(9)]
    msg_queue = multiprocessing.Queue()

    remote_plasma_socket = '/tmp/plasma_%d_%d.sock' % (os.getpid(), id(run_transfer_worker))
    remote_spill_dir = os.path.join(
        tempfile.gettempdir(),
        'mars_spill_%d_%d' % (os.getpid(), id(run_transfer_worker)))

    proc = multiprocessing.Process(
        target=run_transfer_worker,
        args=(remote_pool_addr, session_id, remote_plasma_socket,
              remote_chunk_keys, remote_spill_dir, msg_queue)
    )
    proc.start()
    try:
        # Wait (bounded) for the worker's readiness message.
        msg_queue.get(timeout=30)
    except BaseException:  # intentionally broad: always reap the child
        if proc.is_alive():
            proc.terminate()
        raise

    with create_actor_pool(n_process=1, distributor=WorkerDistributor(3),
                           backend='gevent', address=local_pool_addr) as pool:
        pool.create_actor(ClusterInfoActor, schedulers=[local_pool_addr],
                          uid=ClusterInfoActor.default_name())
        pool.create_actor(KVStoreActor, uid=KVStoreActor.default_name())
        pool.create_actor(DispatchActor, uid=DispatchActor.default_name())
        pool.create_actor(QuotaActor, 1024 * 1024 * 20, uid=MemQuotaActor.default_name())
        cache_ref = pool.create_actor(ChunkHolderActor, self.plasma_storage_size,
                                      uid=ChunkHolderActor.default_name())
        pool.create_actor(SpillActor)

        # Two sender / four receiver actors spread over two worker slots.
        sender_refs = [
            pool.create_actor(SenderActor, uid='w:1:%s' % str(uuid.uuid4())),
            pool.create_actor(SenderActor, uid='w:2:%s' % str(uuid.uuid4())),
        ]
        receiver_refs = [
            pool.create_actor(ReceiverActor, uid='w:1:%s' % str(uuid.uuid4())),
            pool.create_actor(ReceiverActor, uid='w:1:%s' % str(uuid.uuid4())),
            pool.create_actor(ReceiverActor, uid='w:2:%s' % str(uuid.uuid4())),
            pool.create_actor(ReceiverActor, uid='w:2:%s' % str(uuid.uuid4())),
        ]
        test_ref = pool.create_actor(TransferTestActor, local_pool_addr, remote_pool_addr,
                                     remote_plasma_socket, remote_spill_dir)

        try:
            for data_id in (-1, 1):
                chunk_key = remote_chunk_keys[data_id]
                test_ref.do_transfer_test(session_id, chunk_key)

                # Poll until the actor reports this chunk, failing fast when
                # the worker dies and hard-failing after 60 seconds.
                check_time = time.time()
                while test_ref.get_results()[0] != chunk_key:
                    gevent.sleep(0.5)
                    if not proc.is_alive():
                        raise SystemError('Transfer worker dead. exit code %s'
                                          % proc.exitcode)
                    if time.time() - check_time > 60:
                        raise SystemError('Wait result timeout')
                exc = test_ref.get_results()[1]
                if exc:
                    six.reraise(*exc)

            remote_holder_ref = pool.actor_ref('HolderActor', address=remote_pool_addr)
            remote_holder_ref.trigger()
        finally:
            for ref in sender_refs:
                pool.destroy_actor(ref)
            for ref in receiver_refs:
                pool.destroy_actor(ref)
            pool.destroy_actor(cache_ref)
            pool.destroy_actor(test_ref)

            os.unlink(remote_plasma_socket)
            if proc.is_alive():
                proc.terminate()
def testOperandActorWithCancel(self, *_):
    """Submit a graph whose execution is mocked with a small delay, cancel it
    mid-flight via ``stop_graph``, and assert the final state is CANCELLED.
    (``*_`` absorbs mock objects injected by patch decorators.)
    """
    import logging
    logging.basicConfig(level=logging.DEBUG)

    arr = mt.random.randint(10, size=(10, 8), chunk_size=4)
    arr_add = mt.random.randint(10, size=(10, 8), chunk_size=4)
    arr2 = arr + arr_add

    session_id = str(uuid.uuid4())
    graph_key = str(uuid.uuid4())

    graph = arr2.build_graph(compose=False)

    with create_actor_pool(n_process=1, backend='gevent') as pool:
        pool.create_actor(ClusterInfoActor, [pool.cluster_info.address],
                          uid=ClusterInfoActor.default_name())
        resource_ref = pool.create_actor(ResourceActor, uid=ResourceActor.default_name())
        pool.create_actor(ChunkMetaActor, uid=ChunkMetaActor.default_name())
        pool.create_actor(AssignerActor, uid=AssignerActor.default_name())
        graph_ref = pool.create_actor(GraphActor, session_id, graph_key,
                                      serialize_graph(graph),
                                      uid=GraphActor.gen_name(session_id, graph_key))

        # One FakeExecutionActor per worker address, created lazily; the
        # 0.2s exec delay leaves a window for cancellation below.
        addr_dict = dict()

        def _build_mock_ref(uid=None, address=None):
            if address in addr_dict:
                return addr_dict[address]
            else:
                r = addr_dict[address] = pool.create_actor(
                    FakeExecutionActor, exec_delay=0.2)
                return r

        # handle mock objects
        OperandActor._get_raw_execution_ref.side_effect = _build_mock_ref

        mock_resource = dict(hardware=dict(cpu=4, cpu_total=4, memory=512))

        # Register 20 fake workers so operands have somewhere to be assigned.
        for idx in range(20):
            resource_ref.set_worker_meta('localhost:%d' % (idx + 12345), mock_resource)

        graph_ref.prepare_graph(compose=False)
        fetched_graph = graph_ref.get_chunk_graph()

        graph_ref.analyze_graph()

        # Collect terminal operand keys (chunks with no successors).
        final_keys = set()
        for c in fetched_graph:
            if fetched_graph.count_successors(c) == 0:
                final_keys.add(c.op.key)

        graph_ref.create_operand_actors()

        graph_meta_ref = pool.actor_ref(GraphMetaActor.gen_name(session_id, graph_key))
        start_time = time.time()
        cancel_called = False
        # Poll the graph state; fire stop_graph once, ~0.3s in, while the
        # delayed mock executions are still running.
        while True:
            pool.sleep(0.05)
            if not cancel_called and time.time() > start_time + 0.3:
                cancel_called = True
                graph_ref.stop_graph(_tell=True)
            if time.time() - start_time > 30:
                raise SystemError('Wait for execution finish timeout')
            if graph_meta_ref.get_state() in (GraphState.SUCCEEDED, GraphState.FAILED,
                                              GraphState.CANCELLED):
                break

        # Cancellation must win over normal completion.
        self.assertEqual(graph_meta_ref.get_state(), GraphState.CANCELLED)