def testHolder(self):
    """Drive ``CacheTestActor.run_test_cache`` inside a one-process gevent pool.

    The pool is populated with the key-map, cluster-info, KV-store, dispatch,
    memory-quota, chunk-holder and spill actors before the test actor is
    created.  The test then polls ``get_exc_info()`` until its first element
    becomes truthy and re-raises the exception triple found in the second
    element, if there is one.  The chunk holder is destroyed on the way out.
    """
    address = '127.0.0.1:%d' % get_next_port()
    with create_actor_pool(n_process=1, backend='gevent',
                           address=address) as pool:
        pool.create_actor(PlasmaKeyMapActor,
                          uid=PlasmaKeyMapActor.default_name())
        pool.create_actor(WorkerClusterInfoActor, schedulers=[address],
                          uid=WorkerClusterInfoActor.default_name())
        pool.create_actor(KVStoreActor, uid=KVStoreActor.default_name())
        pool.create_actor(DispatchActor, uid=DispatchActor.default_name())
        pool.create_actor(QuotaActor, 1024 * 1024 * 10,
                          uid=MemQuotaActor.default_name())
        holder_ref = pool.create_actor(
            ChunkHolderActor, self.plasma_storage_size,
            uid=ChunkHolderActor.default_name())
        pool.create_actor(SpillActor)

        try:
            tester = pool.create_actor(CacheTestActor)
            tester.run_test_cache()

            # Poll until the test actor flags completion.
            while True:
                if tester.get_exc_info()[0]:
                    break
                pool.sleep(0.1)

            failure = tester.get_exc_info()[1]
            if failure:
                six.reraise(*failure)
        finally:
            pool.destroy_actor(holder_ref)
def testEnsureTimeout(self, *_):
    """Check that ``CacheTestActor.run_test_ensure_timeout`` ends in ``PromiseTimeout``.

    ``options.worker.prepare_data_timeout`` is lowered to 2 for the duration
    of the test.  The value that was in effect before the test is restored
    afterwards, rather than a hard-coded constant, so the test does not
    overwrite a setting chosen by the surrounding test run.
    """
    from mars.errors import PromiseTimeout

    pool_address = '127.0.0.1:%d' % get_next_port()
    with create_actor_pool(n_process=1, backend='gevent',
                           address=pool_address) as pool:
        pool.create_actor(PlasmaKeyMapActor,
                          uid=PlasmaKeyMapActor.default_name())
        pool.create_actor(WorkerClusterInfoActor, schedulers=[pool_address],
                          uid=WorkerClusterInfoActor.default_name())
        pool.create_actor(KVStoreActor, uid=KVStoreActor.default_name())
        pool.create_actor(DispatchActor, uid=DispatchActor.default_name())
        pool.create_actor(QuotaActor, 1024 * 1024 * 10,
                          uid=MemQuotaActor.default_name())
        pool.create_actor(SpillActor, uid=SpillActor.default_name())
        cache_ref = pool.create_actor(
            ChunkHolderActor, self.plasma_storage_size,
            uid=ChunkHolderActor.default_name())

        # Remember the current timeout so the finally-clause can put it back.
        old_timeout = options.worker.prepare_data_timeout
        try:
            options.worker.prepare_data_timeout = 2
            test_ref = pool.create_actor(CacheTestActor)
            test_ref.run_test_ensure_timeout()

            # Poll until the test actor flags completion.
            while not test_ref.get_exc_info()[0]:
                pool.sleep(0.1)

            exc_info = test_ref.get_exc_info()[1]
            self.assertIsNotNone(exc_info)
            self.assertIsInstance(exc_info[1], PromiseTimeout)
        finally:
            options.worker.prepare_data_timeout = old_timeout
            pool.destroy_actor(cache_ref)
def testKVStoreActor(self):
    """Exercise write / batch write / read / delete on an etcd-backed ``KVStoreActor``.

    ``options.kv_store`` is pointed at a freshly started etcd process for the
    duration of the test and restored afterwards, so the etcd URL does not
    leak into tests that run later in the same process.
    """
    etcd_port = get_next_port()
    proc_helper = EtcdProcessHelper(port_range_start=etcd_port)

    # Save the global setting before overriding it.
    old_kv_store = options.kv_store
    options.kv_store = 'etcd://127.0.0.1:%s' % etcd_port
    try:
        with proc_helper.run(), \
                create_actor_pool(n_process=1, backend='gevent') as pool:
            store_ref = pool.create_actor(KVStoreActor,
                                          uid=KVStoreActor.default_name())

            store_ref.write('/node/v1', 'value1')
            store_ref.write('/node/v2', 'value2')
            store_ref.write_batch([
                ('/node/v2', 'value2'),
                ('/node/v3', 'value3'),
            ])

            self.assertEqual(store_ref.read('/node/v1').value, 'value1')
            self.assertListEqual([
                v.value
                for v in store_ref.read_batch(['/node/v2', '/node/v3'])
            ], ['value2', 'value3'])

            store_ref.delete('/node', dir=True, recursive=True)
            # A second delete of the same directory must fail unless
            # silent=True is given.
            with self.assertRaises(KeyError):
                store_ref.delete('/node', dir=True, recursive=True)
            store_ref.delete('/node', dir=True, recursive=True, silent=True)
    finally:
        options.kv_store = old_kv_store
def post_create(self):
    """Connect to the remote plasma store once the actor has been created.

    After the parent ``post_create`` runs, a plasma client is opened on
    ``self._remote_plasma_socket`` and wrapped, together with a reference to
    the KV store actor, in a ``PlasmaChunkStore`` kept on
    ``self._remote_store``.
    """
    super(TransferTestActor, self).post_create()

    client = plasma.connect(self._remote_plasma_socket, '', 0)
    self._remote_plasma_client = client

    kv_store_ref = self.ctx.actor_ref(KVStoreActor.default_name())
    self._remote_store = PlasmaChunkStore(client, kv_store_ref)
def testExecute(self):
    """Drive ``ExecuteTestActor.run_test`` inside a one-process gevent pool.

    The pool is populated with the cluster-info, chunk-holder, KV-store,
    dispatch, memory-quota, CPU-calc and execution actors.  The test polls
    ``get_exc_info()`` until its first element becomes truthy and re-raises
    the exception triple found in the second element, if there is one.  The
    chunk holder is destroyed on the way out.
    """
    address = '127.0.0.1:%d' % get_next_port()
    with create_actor_pool(n_process=1, backend='gevent',
                           address=address) as pool:
        pool.create_actor(ClusterInfoActor, schedulers=[address],
                          uid=ClusterInfoActor.default_name())
        holder_ref = pool.create_actor(
            ChunkHolderActor, self._plasma_helper._size,
            uid='ChunkHolderActor')
        pool.create_actor(KVStoreActor, uid=KVStoreActor.default_name())
        pool.create_actor(DispatchActor, uid='DispatchActor')
        pool.create_actor(QuotaActor, 1024 * 1024, uid='MemQuotaActor')
        pool.create_actor(CpuCalcActor)
        pool.create_actor(ExecutionActor, uid='ExecutionActor')

        try:
            tester = pool.create_actor(ExecuteTestActor)
            tester.run_test()

            # Poll until the test actor flags completion.
            while True:
                if tester.get_exc_info()[0]:
                    break
                gevent.sleep(0.1)

            failure = tester.get_exc_info()[1]
            if failure:
                six.reraise(*failure)
        finally:
            pool.destroy_actor(holder_ref)
def run_transfer_worker(pool_address, session_id, chunk_keys, spill_dir, msg_queue):
    """Entry point of the remote worker process used by the transfer tests.

    Starts a plasma store and a two-process actor pool at ``pool_address``,
    registers ``chunk_keys`` under ``session_id`` through a
    ``WorkerRegistrationTestActor``, then puts the plasma socket path on
    ``msg_queue`` to tell the parent the worker is ready.  After that it
    waits until ``HolderActor.obtain()`` returns a truthy value.

    :param pool_address: address the actor pool listens on
    :param session_id: session the chunks are registered under
    :param chunk_keys: keys handed to the registration actor
    :param spill_dir: stored into ``options.worker.spill_directory``
    :param msg_queue: queue that receives the plasma socket path when ready
    :raises SystemError: if either wait loop exceeds 60 seconds
    """
    from mars.config import options

    options.worker.spill_directory = spill_dir
    plasma_size = 1024 * 1024 * 10

    # don't use multiple with-statement as we need the options be forked
    with plasma.start_plasma_store(plasma_size) as store_args:
        options.worker.plasma_socket = plasma_socket = store_args[0]

        with create_actor_pool(n_process=2, backend='gevent',
                               distributor=WorkerDistributor(2),
                               address=pool_address) as pool:
            # Pre-bind so the finally-clause cannot hit an unbound name (and
            # hide the real error) when an earlier create_actor call fails.
            chunk_holder_ref = None
            try:
                pool.create_actor(ClusterInfoActor, schedulers=[pool_address],
                                  uid=ClusterInfoActor.default_name())
                pool.create_actor(KVStoreActor, uid=KVStoreActor.default_name())
                pool.create_actor(DispatchActor, uid=DispatchActor.default_name())
                pool.create_actor(QuotaActor, 1024 * 1024 * 20,
                                  uid=MemQuotaActor.default_name())
                holder_ref = pool.create_actor(HolderActor, uid='HolderActor')
                chunk_holder_ref = pool.create_actor(
                    ChunkHolderActor, plasma_size,
                    uid=ChunkHolderActor.default_name())
                pool.create_actor(SpillActor)

                pool.create_actor(SenderActor, uid='%s' % str(uuid.uuid4()))
                pool.create_actor(SenderActor, uid='%s' % str(uuid.uuid4()))

                pool.create_actor(ReceiverActor, uid='%s' % str(uuid.uuid4()))
                pool.create_actor(ReceiverActor, uid='%s' % str(uuid.uuid4()))

                register_actor = pool.create_actor(WorkerRegistrationTestActor)
                register_actor.register(session_id, chunk_keys)

                check_time = time.time()
                while not register_actor.get_finished():
                    gevent.sleep(0.5)
                    if time.time() - check_time > 60:
                        raise SystemError('Wait result timeout')
                register_actor.destroy()

                # Signal readiness to the parent by sending the socket path.
                msg_queue.put(plasma_socket)

                check_time = time.time()
                while not holder_ref.obtain():
                    gevent.sleep(1)
                    if time.time() - check_time > 60:
                        raise SystemError('Wait result timeout')
            finally:
                if chunk_holder_ref is not None:
                    pool.destroy_actor(chunk_holder_ref)
def testKVStoreActor(self):
    """Exercise write, batch write, read and batch read on ``KVStoreActor``.

    An etcd process is started through ``EtcdProcessHelper`` (port range
    starting at 54131) and kept alive while a one-process gevent actor pool
    hosts the store actor.
    """
    etcd_helper = EtcdProcessHelper(port_range_start=54131)
    with etcd_helper.run():
        with create_actor_pool(n_process=1, backend='gevent') as pool:
            kv_ref = pool.create_actor(KVStoreActor,
                                       uid=KVStoreActor.default_name())

            kv_ref.write('/node/v1', 'value1')
            kv_ref.write('/node/v2', 'value2')
            batch = [
                ('/node/v2', 'value2'),
                ('/node/v3', 'value3'),
            ]
            kv_ref.write_batch(batch)

            self.assertEqual(kv_ref.read('/node/v1').value, 'value1')

            fetched = [
                item.value
                for item in kv_ref.read_batch(['/node/v2', '/node/v3'])
            ]
            self.assertListEqual(fetched, ['value2', 'value3'])
def testSimpleTransfer(self):
    """Transfer chunks from a remote worker process into the local pool.

    A remote worker is spawned with ``run_transfer_worker``; once it reports
    readiness through ``msg_queue`` a local pool with sender / receiver actors
    is created and ``TransferTestActor.do_transfer_test`` is run for two of
    the remote chunk keys.  Each run is polled until the first element of
    ``get_results()`` equals the chunk key; the second element, if set, is
    re-raised as an exception triple.

    :raises SystemError: if the remote worker dies or a transfer does not
        finish within 60 seconds
    """
    import tempfile

    session_id = str(uuid.uuid4())

    local_pool_addr = 'localhost:%d' % get_next_port()
    remote_pool_addr = 'localhost:%d' % get_next_port()
    remote_chunk_keys = [str(uuid.uuid4()) for _ in range(9)]
    msg_queue = multiprocessing.Queue()

    remote_plasma_socket = '/tmp/plasma_%d_%d.sock' % (os.getpid(), id(run_transfer_worker))
    remote_spill_dir = os.path.join(tempfile.gettempdir(),
                                    'mars_spill_%d_%d' % (os.getpid(), id(run_transfer_worker)))

    proc = multiprocessing.Process(
        target=run_transfer_worker,
        args=(remote_pool_addr, session_id, remote_plasma_socket,
              remote_chunk_keys, remote_spill_dir, msg_queue)
    )
    proc.start()
    try:
        # The first positional argument of Queue.get() is ``block``; the
        # timeout has to be passed by keyword or the call waits forever.
        msg_queue.get(timeout=30)
    except BaseException:
        # Do not leave the worker process behind if it never became ready.
        if proc.is_alive():
            proc.terminate()
        raise

    with create_actor_pool(n_process=1, distributor=WorkerDistributor(3),
                           backend='gevent', address=local_pool_addr) as pool:
        pool.create_actor(ClusterInfoActor, schedulers=[local_pool_addr],
                          uid=ClusterInfoActor.default_name())
        pool.create_actor(KVStoreActor, uid=KVStoreActor.default_name())
        pool.create_actor(DispatchActor, uid=DispatchActor.default_name())
        pool.create_actor(QuotaActor, 1024 * 1024 * 20,
                          uid=MemQuotaActor.default_name())
        cache_ref = pool.create_actor(ChunkHolderActor, self.plasma_storage_size,
                                      uid=ChunkHolderActor.default_name())
        pool.create_actor(SpillActor)

        sender_refs = [
            pool.create_actor(SenderActor, uid='w:1:%s' % str(uuid.uuid4())),
            pool.create_actor(SenderActor, uid='w:2:%s' % str(uuid.uuid4())),
        ]

        receiver_refs = [
            pool.create_actor(ReceiverActor, uid='w:1:%s' % str(uuid.uuid4())),
            pool.create_actor(ReceiverActor, uid='w:1:%s' % str(uuid.uuid4())),
            pool.create_actor(ReceiverActor, uid='w:2:%s' % str(uuid.uuid4())),
            pool.create_actor(ReceiverActor, uid='w:2:%s' % str(uuid.uuid4())),
        ]

        test_ref = pool.create_actor(TransferTestActor, local_pool_addr, remote_pool_addr,
                                     remote_plasma_socket, remote_spill_dir)

        try:
            for data_id in (-1, 1):
                chunk_key = remote_chunk_keys[data_id]
                test_ref.do_transfer_test(session_id, chunk_key)

                check_time = time.time()
                while test_ref.get_results()[0] != chunk_key:
                    gevent.sleep(0.5)
                    if not proc.is_alive():
                        raise SystemError('Transfer worker dead. exit code %s' % proc.exitcode)
                    if time.time() - check_time > 60:
                        raise SystemError('Wait result timeout')
                exc = test_ref.get_results()[1]
                if exc:
                    six.reraise(*exc)

            # Let the remote worker's final wait loop finish.
            remote_holder_ref = pool.actor_ref('HolderActor', address=remote_pool_addr)
            remote_holder_ref.trigger()
        finally:
            for ref in sender_refs:
                pool.destroy_actor(ref)
            for ref in receiver_refs:
                pool.destroy_actor(ref)
            pool.destroy_actor(cache_ref)
            pool.destroy_actor(test_ref)

            os.unlink(remote_plasma_socket)
            if proc.is_alive():
                proc.terminate()
def run_transfer_worker(pool_address, session_id, plasma_socket, chunk_keys,
                        spill_dir, msg_queue):
    """Entry point of the remote worker process used by the transfer tests.

    Starts a plasma store on ``plasma_socket`` through ``PlasmaProcessHelper``
    and a two-process actor pool at ``pool_address``, registers ``chunk_keys``
    under ``session_id`` through a ``WorkerRegistrationTestActor``, then puts
    ``1`` on ``msg_queue`` to tell the parent the worker is ready.  After that
    it waits until ``HolderActor.obtain()`` returns a truthy value.

    :param pool_address: address the actor pool listens on
    :param session_id: session the chunks are registered under
    :param plasma_socket: stored into ``options.worker.plasma_socket``
    :param chunk_keys: keys handed to the registration actor
    :param spill_dir: stored into ``options.worker.spill_directory``
    :param msg_queue: queue that receives the readiness marker
    :raises SystemError: if either wait loop exceeds 60 seconds
    """
    from mars.config import options
    from mars.utils import PlasmaProcessHelper

    options.worker.plasma_socket = plasma_socket
    options.worker.spill_directory = spill_dir

    plasma_helper = PlasmaProcessHelper(size=1024 * 1024 * 10,
                                        socket=options.worker.plasma_socket)
    try:
        plasma_helper.run()

        with create_actor_pool(n_process=2, backend='gevent',
                               distributor=BaseDistributor(2),
                               address=pool_address) as pool:
            # Pre-bind so the finally-clause cannot hit an unbound name (and
            # hide the real error) when an earlier create_actor call fails.
            chunk_holder_ref = None
            try:
                pool.create_actor(ClusterInfoActor, schedulers=[pool_address],
                                  uid=ClusterInfoActor.default_name())
                pool.create_actor(KVStoreActor, uid=KVStoreActor.default_name())
                pool.create_actor(DispatchActor, uid='DispatchActor')
                pool.create_actor(QuotaActor, 1024 * 1024 * 20, uid='MemQuotaActor')
                holder_ref = pool.create_actor(HolderActor, uid='HolderActor')
                chunk_holder_ref = pool.create_actor(
                    ChunkHolderActor, plasma_helper._size, uid='ChunkHolderActor')
                pool.create_actor(SpillActor)

                pool.create_actor(SenderActor, uid='w:%s' % str(uuid.uuid4()))
                pool.create_actor(SenderActor, uid='w:%s' % str(uuid.uuid4()))

                pool.create_actor(ReceiverActor, uid='w:%s' % str(uuid.uuid4()))
                pool.create_actor(ReceiverActor, uid='w:%s' % str(uuid.uuid4()))

                register_actor = pool.create_actor(WorkerRegistrationTestActor)
                register_actor.register(session_id, chunk_keys)

                check_time = time.time()
                while not register_actor.get_finished():
                    gevent.sleep(0.5)
                    if time.time() - check_time > 60:
                        raise SystemError('Wait result timeout')
                register_actor.destroy()

                # Signal readiness to the parent process.
                msg_queue.put(1)

                check_time = time.time()
                while not holder_ref.obtain():
                    gevent.sleep(1)
                    if time.time() - check_time > 60:
                        raise SystemError('Wait result timeout')
            finally:
                if chunk_holder_ref is not None:
                    pool.destroy_actor(chunk_holder_ref)
    finally:
        plasma_helper.stop()
def testSimpleTransfer(self):
    """Transfer chunks from a remote worker process and compare both copies.

    A remote worker is spawned with ``run_transfer_worker``; the plasma socket
    path it puts on ``msg_queue`` is used to open a client on the remote
    store.  For two of the remote chunk keys a free remote sender slot is
    requested, ``send_data`` is invoked towards the local pool, and the local
    and remote copies (read from the chunk store, or from the spill file on
    ``KeyError``) are compared with ``assert_array_equal``.
    """
    import tempfile

    session_id = str(uuid.uuid4())

    local_pool_addr = 'localhost:%d' % get_next_port()
    remote_pool_addr = 'localhost:%d' % get_next_port()
    remote_chunk_keys = [str(uuid.uuid4()) for _ in range(9)]
    msg_queue = multiprocessing.Queue()

    remote_spill_dir = os.path.join(tempfile.gettempdir(),
                                    'mars_spill_%d_%d' % (os.getpid(), id(run_transfer_worker)))

    proc = multiprocessing.Process(
        target=run_transfer_worker,
        args=(remote_pool_addr, session_id, remote_chunk_keys, remote_spill_dir, msg_queue)
    )
    proc.start()
    try:
        # The first positional argument of Queue.get() is ``block``; the
        # timeout has to be passed by keyword or the call waits forever.
        remote_plasma_socket = msg_queue.get(timeout=30)
    except BaseException:
        # Do not leave the worker process behind if it never became ready.
        if proc.is_alive():
            proc.terminate()
        raise

    with create_actor_pool(n_process=1, distributor=WorkerDistributor(1),
                           backend='gevent', address=local_pool_addr) as pool:
        pool.create_actor(ClusterInfoActor, schedulers=[local_pool_addr],
                          uid=ClusterInfoActor.default_name())
        kv_store_ref = pool.create_actor(KVStoreActor, uid=KVStoreActor.default_name())
        pool.create_actor(DispatchActor, uid=DispatchActor.default_name())
        pool.create_actor(QuotaActor, 1024 * 1024 * 20, uid=MemQuotaActor.default_name())
        cache_ref = pool.create_actor(ChunkHolderActor, self.plasma_storage_size,
                                      uid=ChunkHolderActor.default_name())
        pool.create_actor(SpillActor)

        sender_refs = [
            pool.create_actor(SenderActor, uid='w:1:%s' % str(uuid.uuid4())),
            pool.create_actor(SenderActor, uid='w:2:%s' % str(uuid.uuid4())),
        ]

        receiver_refs = [
            pool.create_actor(ReceiverActor, uid='w:1:%s' % str(uuid.uuid4())),
            pool.create_actor(ReceiverActor, uid='w:1:%s' % str(uuid.uuid4())),
            pool.create_actor(ReceiverActor, uid='w:2:%s' % str(uuid.uuid4())),
            pool.create_actor(ReceiverActor, uid='w:2:%s' % str(uuid.uuid4())),
        ]

        try:
            for data_id in (-1, 1):
                chunk_key = remote_chunk_keys[data_id]

                with self.run_actor_test(pool) as test_actor:
                    from mars.worker.spill import build_spill_file_name
                    from mars.serialize import dataserializer
                    from numpy.testing import assert_array_equal

                    remote_dispatch_ref = test_actor.promise_ref(
                        DispatchActor.default_name(), address=remote_pool_addr)
                    remote_plasma_client = plasma.connect(remote_plasma_socket, '', 0)
                    remote_store = PlasmaChunkStore(remote_plasma_client, kv_store_ref)

                    def _call_send_data(sender_uid):
                        # Ask the chosen remote sender to push the chunk here.
                        sender_ref = test_actor.promise_ref(sender_uid, address=remote_pool_addr)
                        return sender_ref.send_data(session_id, chunk_key, local_pool_addr,
                                                    _promise=True)

                    def _test_data_exist(*_):
                        # Either side may hold the chunk in its store or in a
                        # spill file; fall back to the file on KeyError.
                        try:
                            local_data = test_actor._chunk_store.get(session_id, chunk_key)
                        except KeyError:
                            with open(build_spill_file_name(chunk_key), 'rb') as spill_file:
                                local_data = dataserializer.load(spill_file)

                        try:
                            remote_data = remote_store.get(session_id, chunk_key)
                        except KeyError:
                            with open(build_spill_file_name(chunk_key, remote_spill_dir),
                                      'rb') as spill_file:
                                remote_data = dataserializer.load(spill_file)
                        assert_array_equal(local_data, remote_data)

                        del local_data, remote_data

                    remote_dispatch_ref.get_free_slot('sender', _promise=True) \
                        .then(_call_send_data) \
                        .then(_test_data_exist) \
                        .then(
                            lambda *_: test_actor.set_result(chunk_key),
                            lambda *exc: test_actor.set_result(exc, False),
                        )
                self.assertEqual(self.get_result(60), chunk_key)

            # Let the remote worker's final wait loop finish.
            remote_holder_ref = pool.actor_ref('HolderActor', address=remote_pool_addr)
            remote_holder_ref.trigger()
        finally:
            for ref in sender_refs:
                pool.destroy_actor(ref)
            for ref in receiver_refs:
                pool.destroy_actor(ref)
            pool.destroy_actor(cache_ref)

            os.unlink(remote_plasma_socket)
            os.kill(proc.pid, signal.SIGINT)

            # Give the worker up to 5 seconds to exit before terminating it.
            t = time.time()
            while proc.is_alive() and time.time() < t + 5:
                time.sleep(1)
            if proc.is_alive():
                proc.terminate()
def testExecuteWorker(self):
    """Start a real ``mars.worker`` subprocess and execute a tiled graph on it.

    A mock scheduler pool hosts the cluster-info, KV-store and resource
    actors.  The test waits until the worker publishes
    ``/workers/meta_timestamp`` in the KV store, reads the worker address from
    ``/workers/meta``, submits a tiled ``ones.dot(ones)`` graph to the worker's
    ``ExecutionActor`` and waits for the reply callback.

    :raises SystemError: if the worker dies, or the meta timestamp or the
        reply does not show up within 20 seconds
    """
    import mars.tensor as mt

    mock_scheduler_addr = '127.0.0.1:%d' % get_next_port()
    # Pre-bind so the finally-clause cannot hit an unbound name (and hide the
    # real error) when something fails before the worker is launched.
    proc = None
    try:
        session_id = str(uuid.uuid4())
        with create_actor_pool(n_process=1, backend='gevent',
                               address=mock_scheduler_addr) as pool:
            pool.create_actor(ClusterInfoActor, schedulers=[mock_scheduler_addr],
                              uid=ClusterInfoActor.default_name())
            kv_ref = pool.create_actor(KVStoreActor, uid=KVStoreActor.default_name())
            pool.create_actor(ResourceActor, uid=ResourceActor.default_name())

            proc = subprocess.Popen([
                sys.executable, '-m', 'mars.worker',
                '-a', '127.0.0.1',
                '--schedulers', mock_scheduler_addr,
                '--cpu-procs', '1',
                '--cache-mem', '10m',
                '--ignore-avail-mem'
            ])
            worker_ips = []

            def waiter():
                # Wait for the worker to publish its meta timestamp, then
                # collect the worker addresses from the meta directory.
                check_time = time.time()
                while True:
                    if kv_ref.read('/workers/meta_timestamp', silent=True) is None:
                        gevent.sleep(0.5)
                        if proc.poll() is not None:
                            raise SystemError('Worker dead. exit code %s' % proc.poll())
                        if time.time() - check_time > 20:
                            raise SystemError('Check meta_timestamp timeout')
                        continue
                    else:
                        break
                val = kv_ref.read('/workers/meta')
                worker_ips.extend(
                    [c.key.rsplit('/', 1)[-1] for c in val.children])

            gl = gevent.spawn(waiter)
            gl.join()

            a = mt.ones((100, 50), chunks=30)
            b = mt.ones((50, 200), chunks=30)
            result = a.dot(b)

            graph = result.build_graph(tiled=True)

            reply_ref = pool.create_actor(PromiseReplyTestActor)
            reply_callback = ((reply_ref.uid, reply_ref.address), 'reply')

            executor_ref = pool.actor_ref(ExecutionActor.default_name(),
                                          address=worker_ips[0])
            io_meta = dict(chunks=[c.key for c in result.chunks])
            executor_ref.execute_graph(session_id, str(id(graph)), serialize_graph(graph),
                                       io_meta, None, callback=reply_callback)

            check_time = time.time()
            while not reply_ref.get_reply():
                gevent.sleep(0.1)
                if time.time() - check_time > 20:
                    raise SystemError('Check reply timeout')
    finally:
        if proc is not None and proc.poll() is None:
            # Ask the worker to stop, wait up to 5 seconds, then kill it.
            proc.send_signal(signal.SIGINT)
            check_time = time.time()
            while True:
                time.sleep(1)
                if proc.poll() is not None or time.time() - check_time >= 5:
                    break
            if proc.poll() is None:
                proc.kill()
        if os.path.exists(options.worker.plasma_socket):
            os.unlink(options.worker.plasma_socket)