def testSendTargets(self):
    pool_address = '127.0.0.1:%d' % get_next_port()
    session_id = str(uuid.uuid4())
    mock_data = np.array([1, 2, 3, 4])
    with create_actor_pool(n_process=1, backend='gevent',
                           address=pool_address, distributor=WorkerDistributor(2)) as pool:
        self.create_standard_actors(pool, pool_address, with_daemon=False, with_status=False)
        pool.create_actor(CpuCalcActor)

        import mars.tensor as mt
        arr = mt.ones((4,), chunk_size=4)
        arr_add = mt.array(mock_data)
        result_tensor = arr + arr_add
        graph = result_tensor.build_graph(compose=False, tiled=True)
        result_key = result_tensor.chunks[0].key

        pool.create_actor(MockSenderActor, mock_data + np.ones((4,)), 'out', uid='w:mock_sender')
        with self.run_actor_test(pool) as test_actor:
            def _validate(_):
                data = test_actor._chunk_store.get(session_id, result_tensor.chunks[0].key)
                assert_array_equal(data, mock_data + np.ones((4,)))

            graph_key = str(uuid.uuid4())
            execution_ref = test_actor.promise_ref(ExecutionActor.default_name())
            execution_ref.enqueue_graph(session_id, graph_key, serialize_graph(graph),
                                        dict(chunks=[result_tensor.chunks[0].key]), None,
                                        send_addresses={result_key: (pool_address,)}, _promise=True) \
                .then(lambda *_: execution_ref.start_execution(session_id, graph_key, _promise=True)) \
                .then(_validate) \
                .then(lambda *_: test_actor.set_result(None)) \
                .catch(lambda *exc: test_actor.set_result(exc, False))

        self.get_result()
def testCalcProcessFailure(self):
    pool_address = '127.0.0.1:%d' % get_next_port()
    session_id = str(uuid.uuid4())
    mock_data = np.array([1, 2, 3, 4])
    with create_actor_pool(n_process=2, backend='gevent',
                           address=pool_address, distributor=WorkerDistributor(2)) as pool:
        self.create_standard_actors(pool, pool_address, with_status=False)

        daemon_ref = pool.actor_ref(WorkerDaemonActor.default_name())
        dispatch_ref = pool.actor_ref(DispatchActor.default_name())
        calc_ref = daemon_ref.create_actor(MockCpuCalcActor, session_id, mock_data, 10,
                                           uid='w:1:cpu-calc-a')
        daemon_ref.create_actor(ProcessHelperActor, uid='w:1:proc-helper-a')

        test_actor = pool.create_actor(ExecutionTestActor, uid='w:test_actor')
        test_actor.run_simple_calc(session_id, _tell=True)

        pool.sleep(2)
        proc_id = pool.distributor.distribute(calc_ref.uid)
        daemon_ref.kill_actor_process(calc_ref)
        assert not daemon_ref.is_actor_process_alive(calc_ref)
        pool.restart_process(proc_id)
        daemon_ref.handle_process_down([proc_id])

        with self.assertRaises(WorkerProcessStopped):
            self.wait_for_result(pool, test_actor)
        self.assertEqual(len(dispatch_ref.get_slots('cpu')), 1)
def testStopGraphCalc(self):
    pool_address = '127.0.0.1:%d' % get_next_port()
    session_id = str(uuid.uuid4())
    mock_data = np.array([1, 2, 3, 4])
    with create_actor_pool(n_process=2, backend='gevent',
                           address=pool_address, distributor=WorkerDistributor(2)) as pool:
        self.create_standard_actors(pool, pool_address, with_status=False)

        daemon_ref = pool.actor_ref(WorkerDaemonActor.default_name())
        execution_ref = pool.actor_ref(ExecutionActor.default_name())
        calc_ref = daemon_ref.create_actor(MockCpuCalcActor, session_id, mock_data, 10,
                                           uid='w:1:cpu-calc-a')
        daemon_ref.create_actor(ProcessHelperActor, uid='w:1:proc-helper-a')

        test_actor = pool.create_actor(ExecutionTestActor, uid='w:test_actor')
        test_actor.run_simple_calc(session_id, _tell=True)

        pool.sleep(2)
        proc_id = pool.distributor.distribute(calc_ref.uid)
        execution_ref.stop_execution(session_id, test_actor.get_graph_key(), _tell=True)
        while daemon_ref.is_actor_process_alive(calc_ref):
            pool.sleep(0.1)
        pool.restart_process(proc_id)
        daemon_ref.handle_process_down([proc_id])

        with self.assertRaises(ExecutionInterrupted):
            self.wait_for_result(pool, test_actor)
def testDaemon(self):
    mock_scheduler_addr = '127.0.0.1:%d' % get_next_port()
    with create_actor_pool(n_process=2, backend='gevent', distributor=WorkerDistributor(2),
                           address=mock_scheduler_addr) as pool:
        daemon_ref = pool.create_actor(WorkerDaemonActor, uid=WorkerDaemonActor.default_name())
        pool.create_actor(DispatchActor, uid=DispatchActor.default_name())
        sleeper_ref = daemon_ref.create_actor(DaemonSleeperActor, uid='w:1:DaemonSleeperActor')
        daemon_ref.create_actor(ProcessHelperActor, uid='w:1:ProcHelper')
        test_actor = pool.create_actor(DaemonTestActor)
        daemon_ref.register_callback(test_actor, 'handle_process_down')

        test_actor.run_test_sleep(sleeper_ref, 10, _tell=True)
        self.assertTrue(daemon_ref.is_actor_process_alive(sleeper_ref))

        pool.sleep(0.5)

        daemon_ref.kill_actor_process(sleeper_ref)
        # repeated kill shall not produce errors
        daemon_ref.kill_actor_process(sleeper_ref)
        self.assertFalse(daemon_ref.is_actor_process_alive(sleeper_ref))

        pool.restart_process(1)
        daemon_ref.handle_process_down([1])
        pool.sleep(1)
        self.assertTrue(pool.has_actor(sleeper_ref))
        with self.assertRaises(WorkerProcessStopped):
            test_actor.get_result()

        test_actor.run_test_sleep(sleeper_ref, 1)
        pool.sleep(1.5)
        test_actor.get_result()
def run_transfer_worker(pool_address, session_id, chunk_keys, spill_dir, msg_queue):
    from mars.config import options

    options.worker.spill_directory = spill_dir
    plasma_size = 1024 * 1024 * 10

    # don't use multiple with-statements, as we need the options to be forked
    with plasma.start_plasma_store(plasma_size) as store_args:
        options.worker.plasma_socket = plasma_socket = store_args[0]

        with create_actor_pool(n_process=2, backend='gevent',
                               distributor=WorkerDistributor(2), address=pool_address) as pool:
            try:
                pool.create_actor(ClusterInfoActor, schedulers=[pool_address],
                                  uid=ClusterInfoActor.default_name())
                pool.create_actor(ChunkMetaActor, uid=ChunkMetaActor.default_name())
                pool.create_actor(DispatchActor, uid=DispatchActor.default_name())
                pool.create_actor(QuotaActor, 1024 * 1024 * 20, uid=MemQuotaActor.default_name())
                holder_ref = pool.create_actor(HolderActor, uid='HolderActor')
                chunk_holder_ref = pool.create_actor(ChunkHolderActor, plasma_size,
                                                     uid=ChunkHolderActor.default_name())
                pool.create_actor(SpillActor)

                pool.create_actor(SenderActor, uid=str(uuid.uuid4()))
                pool.create_actor(SenderActor, uid=str(uuid.uuid4()))
                pool.create_actor(ReceiverActor, uid=str(uuid.uuid4()))
                pool.create_actor(ReceiverActor, uid=str(uuid.uuid4()))

                register_actor = pool.create_actor(WorkerRegistrationTestActor)
                register_actor.register(session_id, chunk_keys)

                check_time = time.time()
                while not register_actor.get_finished():
                    gevent.sleep(0.5)
                    if time.time() - check_time > 60:
                        raise SystemError('Wait result timeout')
                register_actor.destroy()

                # report the plasma socket path back to the parent test process
                msg_queue.put(plasma_socket)

                check_time = time.time()
                while not holder_ref.obtain():
                    gevent.sleep(1)
                    if time.time() - check_time > 60:
                        raise SystemError('Wait result timeout')
            finally:
                pool.destroy_actor(chunk_holder_ref)
def testFetchRemoteData(self):
    pool_address = '127.0.0.1:%d' % get_next_port()
    session_id = str(uuid.uuid4())
    mock_data = np.array([1, 2, 3, 4])
    with create_actor_pool(n_process=1, backend='gevent',
                           address=pool_address, distributor=WorkerDistributor(2)) as pool:
        self.create_standard_actors(pool, pool_address, with_daemon=False, with_status=False)
        pool.create_actor(CpuCalcActor)
        pool.create_actor(MockSenderActor, mock_data, 'in', uid='w:mock_sender')
        chunk_meta_ref = pool.actor_ref(ChunkMetaActor.default_name())

        import mars.tensor as mt
        from mars.tensor.expressions.datasource import TensorFetch
        arr = mt.ones((4,), chunk_size=4)
        arr_add = mt.array(mock_data)
        result_tensor = arr + arr_add
        graph = result_tensor.build_graph(compose=False, tiled=True)

        # replace the data chunk's op with a TensorFetch and register remote meta,
        # so the executor has to pull the chunk from another worker
        modified_chunk = arr_add.chunks[0]
        arr_add.chunks[0]._op = TensorFetch(
            dtype=modified_chunk.dtype,
            _outputs=[weakref.ref(o) for o in modified_chunk.op.outputs],
            _key=modified_chunk.op.key)
        chunk_meta_ref.set_chunk_meta(session_id, modified_chunk.key, size=mock_data.nbytes,
                                      shape=mock_data.shape,
                                      workers=('0.0.0.0:1234', pool_address))

        with self.run_actor_test(pool) as test_actor:
            def _validate(_):
                data = test_actor._chunk_store.get(session_id, result_tensor.chunks[0].key)
                assert_array_equal(data, mock_data + np.ones((4,)))

            graph_key = str(uuid.uuid4())
            execution_ref = test_actor.promise_ref(ExecutionActor.default_name())
            execution_ref.enqueue_graph(session_id, graph_key, serialize_graph(graph),
                                        dict(chunks=[result_tensor.chunks[0].key]), None,
                                        _promise=True) \
                .then(lambda *_: execution_ref.start_execution(session_id, graph_key, _promise=True)) \
                .then(_validate) \
                .then(lambda *_: test_actor.set_result(None)) \
                .catch(lambda *exc: test_actor.set_result(exc, False))

        self.get_result()
def run_transfer_worker(pool_address, session_id, plasma_socket, chunk_keys, spill_dir, msg_queue):
    from mars.config import options
    from mars.utils import PlasmaProcessHelper

    options.worker.plasma_socket = plasma_socket
    options.worker.spill_directory = spill_dir

    plasma_helper = PlasmaProcessHelper(size=1024 * 1024 * 10,
                                        socket=options.worker.plasma_socket)
    try:
        plasma_helper.run()

        with create_actor_pool(n_process=2, backend='gevent',
                               distributor=WorkerDistributor(2), address=pool_address) as pool:
            try:
                pool.create_actor(ClusterInfoActor, schedulers=[pool_address],
                                  uid=ClusterInfoActor.default_name())
                pool.create_actor(KVStoreActor, uid=KVStoreActor.default_name())
                pool.create_actor(DispatchActor, uid=DispatchActor.default_name())
                pool.create_actor(QuotaActor, 1024 * 1024 * 20, uid=MemQuotaActor.default_name())
                holder_ref = pool.create_actor(HolderActor, uid='HolderActor')
                chunk_holder_ref = pool.create_actor(ChunkHolderActor, plasma_helper._size,
                                                     uid=ChunkHolderActor.default_name())
                pool.create_actor(SpillActor)

                pool.create_actor(SenderActor, uid=str(uuid.uuid4()))
                pool.create_actor(SenderActor, uid=str(uuid.uuid4()))
                pool.create_actor(ReceiverActor, uid=str(uuid.uuid4()))
                pool.create_actor(ReceiverActor, uid=str(uuid.uuid4()))

                register_actor = pool.create_actor(WorkerRegistrationTestActor)
                register_actor.register(session_id, chunk_keys)

                check_time = time.time()
                while not register_actor.get_finished():
                    gevent.sleep(0.5)
                    if time.time() - check_time > 60:
                        raise SystemError('Wait result timeout')
                register_actor.destroy()

                msg_queue.put(1)

                check_time = time.time()
                while not holder_ref.obtain():
                    gevent.sleep(1)
                    if time.time() - check_time > 60:
                        raise SystemError('Wait result timeout')
            finally:
                pool.destroy_actor(chunk_holder_ref)
    finally:
        plasma_helper.stop()
def testSimpleTransfer(self):
    import tempfile
    session_id = str(uuid.uuid4())

    local_pool_addr = 'localhost:%d' % get_next_port()
    remote_pool_addr = 'localhost:%d' % get_next_port()
    remote_chunk_keys = [str(uuid.uuid4()) for _ in range(9)]
    msg_queue = multiprocessing.Queue()

    remote_spill_dir = os.path.join(tempfile.gettempdir(),
                                    'mars_spill_%d_%d' % (os.getpid(), id(run_transfer_worker)))

    proc = multiprocessing.Process(
        target=run_transfer_worker,
        args=(remote_pool_addr, session_id, remote_chunk_keys, remote_spill_dir, msg_queue))
    proc.start()
    try:
        # wait up to 30s for the worker process to report its plasma socket path
        remote_plasma_socket = msg_queue.get(timeout=30)
    except:
        if proc.is_alive():
            proc.terminate()
        raise

    with create_actor_pool(n_process=1, distributor=WorkerDistributor(1),
                           backend='gevent', address=local_pool_addr) as pool:
        pool.create_actor(ClusterInfoActor, schedulers=[local_pool_addr],
                          uid=ClusterInfoActor.default_name())
        pool.create_actor(ChunkMetaActor, uid=ChunkMetaActor.default_name())
        pool.create_actor(DispatchActor, uid=DispatchActor.default_name())
        pool.create_actor(QuotaActor, 1024 * 1024 * 20, uid=MemQuotaActor.default_name())
        cache_ref = pool.create_actor(ChunkHolderActor, self.plasma_storage_size,
                                      uid=ChunkHolderActor.default_name())
        pool.create_actor(SpillActor)

        sender_refs = [
            pool.create_actor(SenderActor, uid='w:1:%s' % str(uuid.uuid4())),
            pool.create_actor(SenderActor, uid='w:2:%s' % str(uuid.uuid4())),
        ]
        receiver_refs = [
            pool.create_actor(ReceiverActor, uid='w:1:%s' % str(uuid.uuid4())),
            pool.create_actor(ReceiverActor, uid='w:1:%s' % str(uuid.uuid4())),
            pool.create_actor(ReceiverActor, uid='w:2:%s' % str(uuid.uuid4())),
            pool.create_actor(ReceiverActor, uid='w:2:%s' % str(uuid.uuid4())),
        ]

        try:
            for data_id in (-1, 1):
                chunk_key = remote_chunk_keys[data_id]

                with self.run_actor_test(pool) as test_actor:
                    from mars.worker.spill import build_spill_file_name
                    from mars.serialize import dataserializer
                    from numpy.testing import assert_array_equal

                    remote_dispatch_ref = test_actor.promise_ref(
                        DispatchActor.default_name(), address=remote_pool_addr)
                    remote_plasma_client = plasma.connect(remote_plasma_socket, '', 0)
                    remote_store = PlasmaChunkStore(remote_plasma_client)

                    def _call_send_data(sender_uid):
                        sender_ref = test_actor.promise_ref(sender_uid, address=remote_pool_addr)
                        return sender_ref.send_data(session_id, chunk_key, local_pool_addr,
                                                    _promise=True)

                    def _test_data_exist(*_):
                        # the chunk may live in the plasma store or be spilled to disk
                        # on either side; compare whichever copy is available
                        try:
                            local_data = test_actor._chunk_store.get(session_id, chunk_key)
                        except KeyError:
                            with open(build_spill_file_name(chunk_key), 'rb') as spill_file:
                                local_data = dataserializer.load(spill_file)

                        try:
                            remote_data = remote_store.get(session_id, chunk_key)
                        except KeyError:
                            with open(build_spill_file_name(chunk_key, remote_spill_dir),
                                      'rb') as spill_file:
                                remote_data = dataserializer.load(spill_file)

                        assert_array_equal(local_data, remote_data)
                        del local_data, remote_data

                    remote_dispatch_ref.get_free_slot('sender', _promise=True) \
                        .then(_call_send_data) \
                        .then(_test_data_exist) \
                        .then(
                            lambda *_: test_actor.set_result(chunk_key),
                            lambda *exc: test_actor.set_result(exc, False),
                        )

                self.assertEqual(self.get_result(60), chunk_key)

            remote_holder_ref = pool.actor_ref('HolderActor', address=remote_pool_addr)
            remote_holder_ref.trigger()
        finally:
            for ref in sender_refs:
                pool.destroy_actor(ref)
            for ref in receiver_refs:
                pool.destroy_actor(ref)
            pool.destroy_actor(cache_ref)

            os.unlink(remote_plasma_socket)
            os.kill(proc.pid, signal.SIGINT)

            t = time.time()
            while proc.is_alive() and time.time() < t + 5:
                time.sleep(1)
            if proc.is_alive():
                proc.terminate()
def testSimpleTransfer(self):
    import tempfile
    session_id = str(uuid.uuid4())

    local_pool_addr = 'localhost:%d' % get_next_port()
    remote_pool_addr = 'localhost:%d' % get_next_port()
    remote_chunk_keys = [str(uuid.uuid4()) for _ in range(9)]
    msg_queue = multiprocessing.Queue()

    remote_plasma_socket = '/tmp/plasma_%d_%d.sock' % (os.getpid(), id(run_transfer_worker))
    remote_spill_dir = os.path.join(tempfile.gettempdir(),
                                    'mars_spill_%d_%d' % (os.getpid(), id(run_transfer_worker)))

    proc = multiprocessing.Process(
        target=run_transfer_worker,
        args=(remote_pool_addr, session_id, remote_plasma_socket, remote_chunk_keys,
              remote_spill_dir, msg_queue))
    proc.start()
    try:
        # wait up to 30s for the transfer worker to signal readiness
        msg_queue.get(timeout=30)
    except:
        if proc.is_alive():
            proc.terminate()
        raise

    with create_actor_pool(n_process=1, distributor=WorkerDistributor(3),
                           backend='gevent', address=local_pool_addr) as pool:
        pool.create_actor(ClusterInfoActor, schedulers=[local_pool_addr],
                          uid=ClusterInfoActor.default_name())
        pool.create_actor(KVStoreActor, uid=KVStoreActor.default_name())
        pool.create_actor(DispatchActor, uid=DispatchActor.default_name())
        pool.create_actor(QuotaActor, 1024 * 1024 * 20, uid=MemQuotaActor.default_name())
        cache_ref = pool.create_actor(ChunkHolderActor, self.plasma_storage_size,
                                      uid=ChunkHolderActor.default_name())
        pool.create_actor(SpillActor)

        sender_refs = [
            pool.create_actor(SenderActor, uid='w:1:%s' % str(uuid.uuid4())),
            pool.create_actor(SenderActor, uid='w:2:%s' % str(uuid.uuid4())),
        ]
        receiver_refs = [
            pool.create_actor(ReceiverActor, uid='w:1:%s' % str(uuid.uuid4())),
            pool.create_actor(ReceiverActor, uid='w:1:%s' % str(uuid.uuid4())),
            pool.create_actor(ReceiverActor, uid='w:2:%s' % str(uuid.uuid4())),
            pool.create_actor(ReceiverActor, uid='w:2:%s' % str(uuid.uuid4())),
        ]
        test_ref = pool.create_actor(TransferTestActor, local_pool_addr, remote_pool_addr,
                                     remote_plasma_socket, remote_spill_dir)

        try:
            for data_id in (-1, 1):
                chunk_key = remote_chunk_keys[data_id]
                test_ref.do_transfer_test(session_id, chunk_key)

                check_time = time.time()
                while test_ref.get_results()[0] != chunk_key:
                    gevent.sleep(0.5)
                    if not proc.is_alive():
                        raise SystemError('Transfer worker dead. exit code %s' % proc.exitcode)
                    if time.time() - check_time > 60:
                        raise SystemError('Wait result timeout')
                exc = test_ref.get_results()[1]
                if exc:
                    six.reraise(*exc)

            remote_holder_ref = pool.actor_ref('HolderActor', address=remote_pool_addr)
            remote_holder_ref.trigger()
        finally:
            for ref in sender_refs:
                pool.destroy_actor(ref)
            for ref in receiver_refs:
                pool.destroy_actor(ref)
            pool.destroy_actor(cache_ref)
            pool.destroy_actor(test_ref)

            os.unlink(remote_plasma_socket)
            if proc.is_alive():
                proc.terminate()