def testDataSerialize(self):
    try:
        import numpy as np
        from numpy.testing import assert_array_equal
    except ImportError:
        np = None
    try:
        import scipy.sparse as sps
    except ImportError:
        sps = None

    if np:
        array = np.random.rand(1000, 100)
        assert_array_equal(array, dataserializer.loads(dataserializer.dumps(array)))
        assert_array_equal(array, dataserializer.loads(dataserializer.dumps(
            array, compress=dataserializer.COMPRESS_FLAG_LZ4)))

        array = np.random.rand(1000, 100)
        assert_array_equal(array, dataserializer.load(BytesIO(dataserializer.dumps(array))))
        assert_array_equal(array, dataserializer.load(BytesIO(dataserializer.dumps(
            array, compress=dataserializer.COMPRESS_FLAG_LZ4))))

        array = np.random.rand(1000, 100).T  # test non c-contiguous
        assert_array_equal(array, dataserializer.loads(dataserializer.dumps(array)))
        assert_array_equal(array, dataserializer.loads(dataserializer.dumps(
            array, compress=dataserializer.COMPRESS_FLAG_LZ4)))

        array = np.float64(0.2345)
        assert_array_equal(array, dataserializer.loads(dataserializer.dumps(array)))
        assert_array_equal(array, dataserializer.loads(dataserializer.dumps(
            array, compress=dataserializer.COMPRESS_FLAG_LZ4)))

        fn = os.path.join(tempfile.gettempdir(), 'test_dump_file_%d.bin' % id(self))
        try:
            array = np.random.rand(1000, 100).T  # test non c-contiguous
            with open(fn, 'wb') as dump_file:
                dataserializer.dump(array, dump_file)
            with open(fn, 'rb') as dump_file:
                assert_array_equal(array, dataserializer.load(dump_file))

            with open(fn, 'wb') as dump_file:
                dataserializer.dump(array, dump_file,
                                    compress=dataserializer.COMPRESS_FLAG_LZ4)
            with open(fn, 'rb') as dump_file:
                assert_array_equal(array, dataserializer.load(dump_file))
        finally:
            if os.path.exists(fn):
                os.unlink(fn)

    if sps:
        mat = sparse.SparseMatrix(sps.random(100, 100, 0.1, format='csr'))
        des_mat = dataserializer.loads(dataserializer.dumps(mat))
        self.assertTrue((mat.spmatrix != des_mat.spmatrix).nnz == 0)

        des_mat = dataserializer.loads(dataserializer.dumps(
            mat, compress=dataserializer.COMPRESS_FLAG_LZ4))
        self.assertTrue((mat.spmatrix != des_mat.spmatrix).nnz == 0)
def testFileBufferIO(self):
    if not np:
        return
    from numpy.testing import assert_array_equal

    compressions = [dataserializer.CompressType.NONE] + \
        list(dataserializer.get_supported_compressions())

    for c1 in compressions:
        for c2 in compressions:
            data = np.random.random((1000, 100))

            # test complete read
            compressed_read_file = BytesIO(dataserializer.dumps(data, compress=c1))
            reader = FileBufferIO(compressed_read_file, 'r', compress_out=c2)
            compressed = reader.read()
            self.assertEqual(c2, dataserializer.read_file_header(compressed).compress)
            assert_array_equal(data, dataserializer.loads(compressed))

            # test partial read
            compressed_read_file = BytesIO(dataserializer.dumps(data, compress=c1))
            reader = FileBufferIO(compressed_read_file, 'r', compress_out=c2)
            block = reader.read(128)
            data_left = reader.read()
            assert_array_equal(data, dataserializer.loads(block + data_left))

            # test read by chunks
            bio = BytesIO()
            compressed_read_file = BytesIO(dataserializer.dumps(data, compress=c1))
            reader = FileBufferIO(compressed_read_file, 'r', compress_out=c2)
            while True:
                block = reader.read(128)
                if not block:
                    break
                bio.write(block)

            compressed = bio.getvalue()
            self.assertEqual(c2, dataserializer.read_file_header(compressed).compress)
            assert_array_equal(data, dataserializer.loads(compressed))

            # test write by chunks
            compressed_read_file.seek(0)
            compressed_write_file = BytesIO()
            writer = FileBufferIO(compressed_write_file, 'w', compress_in=c2,
                                  managed=False)
            while True:
                block = compressed_read_file.read(128)
                if not block:
                    break
                writer.write(block)
            writer.close()

            compressed = compressed_write_file.getvalue()
            self.assertEqual(c2, dataserializer.read_file_header(compressed).compress)
            assert_array_equal(data, dataserializer.loads(compressed))
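# A minimal standalone sketch of the re-compression path exercised in
# testFileBufferIO above: take a blob written with one codec and stream it back
# out under another. It reuses only the calls shown above (dumps, loads,
# read_file_header, FileBufferIO) and assumes the same imports as these tests,
# plus availability of both the GZIP and LZ4 codecs (see
# get_supported_compressions() above); it is an illustration, not project code.
def recompress(blob, target_compress):
    # wrap the compressed blob and let FileBufferIO emit `target_compress`
    reader = FileBufferIO(BytesIO(blob), 'r', compress_out=target_compress)
    return reader.read()


def _recompress_example():
    data = np.arange(100.0)
    blob = dataserializer.dumps(data, compress=dataserializer.CompressType.GZIP)
    lz4_blob = recompress(blob, dataserializer.CompressType.LZ4)
    assert dataserializer.read_file_header(lz4_blob).compress == \
        dataserializer.CompressType.LZ4
    np.testing.assert_array_equal(data, dataserializer.loads(lz4_blob))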
def testDiskReadAndWritePacked(self, *_):
    test_addr = f'127.0.0.1:{get_next_port()}'
    with self.create_pool(n_process=1, address=test_addr) as pool, \
            self.run_actor_test(pool) as test_actor:
        pool.create_actor(WorkerClusterInfoActor, [test_addr],
                          uid=WorkerClusterInfoActor.default_uid())
        pool.create_actor(StatusActor, test_addr, uid=StatusActor.default_uid())
        pool.create_actor(EventsActor, uid=EventsActor.default_uid())
        pool.create_actor(WorkerDaemonActor, uid=WorkerDaemonActor.default_uid())
        storage_manager_ref = pool.create_actor(
            StorageManagerActor, uid=StorageManagerActor.default_uid())

        session_id = str(uuid.uuid4())
        data1 = np.random.random((10, 10))
        ser_data1 = dataserializer.serialize(data1)

        storage_client = test_actor.storage_client
        handler = storage_client.get_storage_handler((0, DataStorageDevice.DISK))

        for handler._compress in self._get_compress_types():
            data_key1 = str(uuid.uuid4())

            storage_client.delete(session_id, [data_key1])
            self.rm_spill_dirs()

            block_data1 = dataserializer.dumps(data1, compress=handler._compress)

            def _write_data(ser, writer):
                with writer:
                    writer.write(ser)
                return writer.filename

            handler.create_bytes_writer(session_id, data_key1, ser_data1.total_bytes,
                                        packed=True, _promise=True) \
                .then(functools.partial(_write_data, block_data1)) \
                .then(test_actor.set_result,
                      lambda *exc: test_actor.set_result(exc, accept=False))
            file_name = self.get_result(5)
            self.assertEqual(sorted(storage_manager_ref.get_data_locations(
                session_id, [data_key1])[0]), [(0, DataStorageDevice.DISK)])
            self.assertTrue(os.path.exists(file_name))

            def _read_data(reader):
                with reader:
                    return dataserializer.loads(reader.read())

            handler.create_bytes_reader(session_id, data_key1, packed=True,
                                        _promise=True) \
                .then(_read_data) \
                .then(functools.partial(test_actor.set_result),
                      lambda *exc: test_actor.set_result(exc, accept=False))
            assert_allclose(self.get_result(5), data1)
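# `self._get_compress_types()` is referenced above (and in
# testDiskReadAndWriteMerger below) but not included in this excerpt. A
# plausible sketch is given here, assuming it simply enumerates the codecs
# available at runtime, mirroring the `get_supported_compressions()` call used
# in testFileBufferIO; the actual helper may differ.
def _get_compress_types(self):
    # no compression plus every codec compiled into dataserializer
    return [dataserializer.CompressType.NONE] + \
        list(dataserializer.get_supported_compressions())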
def testToPandas(self):
    rs = np.random.RandomState(0)
    df = pd.DataFrame({'a': rs.rand(100),
                       'b': ['s' + str(i) for i in rs.randint(100, size=100)]})

    batch_size = 15
    n_batch = len(df) // 15 + 1
    batches = [pa.RecordBatch.from_pandas(df[i * batch_size: (i + 1) * batch_size])
               for i in range(n_batch)]
    table = pa.Table.from_batches(batches)

    df2 = arrow_table_to_pandas_dataframe(table)
    self.assertEqual(df2.dtypes.iloc[1], ArrowStringDtype())
    self.assertLess(df2.memory_usage(deep=True).sum(),
                    df.memory_usage(deep=True).sum())

    # test serialize
    df3 = dataserializer.loads(dataserializer.dumps(df2))
    self.assertEqual(df3.dtypes.iloc[1], ArrowStringDtype())
    pd.testing.assert_frame_equal(df3, df2)

    # test df method
    df4 = df2.groupby('b').sum()
    expected = df.groupby('b').sum()
    pd.testing.assert_frame_equal(df4, expected)

    s = ('s' + df2['b']).astype('string')
    expected = ('s' + df['b']).astype('string')
    pd.testing.assert_series_equal(s, expected)

    s2 = df2['b'].str[:2]
    expected = df['b'].astype('string').str[:2]
    pd.testing.assert_series_equal(s2, expected)
async def test_cuda_backend():
    params, teardown_params = await CudaStorage.setup()
    storage = CudaStorage(**params)
    assert storage.level == StorageLevel.GPU

    data1 = cupy.asarray(np.random.rand(10, 10))
    put_info1 = await storage.put(data1)
    get_data1 = await storage.get(put_info1.object_id)
    cupy.testing.assert_array_equal(data1, get_data1)

    info1 = await storage.object_info(put_info1.object_id)
    assert info1.size == put_info1.size

    await storage.delete(put_info1.object_id)

    data2 = cudf.DataFrame(pd.DataFrame(
        {
            'col1': np.arange(10),
            'col2': [f'str{i}' for i in range(10)],
            'col3': np.random.rand(10)
        },
    ))
    put_info2 = await storage.put(data2)
    get_data2 = await storage.get(put_info2.object_id)
    cudf.testing.assert_frame_equal(data2, get_data2)

    info2 = await storage.object_info(put_info2.object_id)
    assert info2.size == put_info2.size

    await CudaStorage.teardown(**teardown_params)

    # test writer and reader
    t = np.random.random(10)
    b = dataserializer.dumps(t)
    async with await storage.open_writer(size=len(b)) as writer:
        split = len(b) // 2
        await writer.write(b[:split])
        await writer.write(b[split:])

    async with await storage.open_reader(writer.object_id) as reader:
        content = await reader.read()
        b = content.to_host_array().tobytes()
        t2 = dataserializer.loads(b)

    np.testing.assert_array_equal(t, t2)

    # write cupy array
    t = cupy.random.random((10,))
    headers, buffers = serialize(t)
    async with await storage.open_writer(size=len(b)) as writer:
        for buffer in buffers:
            await writer.write(buffer.data)

    async with await storage.open_reader(writer.object_id) as reader:
        b2 = await reader.read()
        t2 = deserialize(headers, [b2])

    cupy.testing.assert_array_equal(t, t2)

    await CudaStorage.teardown(**teardown_params)
def mocked_requests_get(*arg, **_):
    url = arg[0]
    if '/worker' in url:
        return MockResponse(200, json_text=1)
    if url.split('/')[-2] == 'graph':
        return MockResponse(200, json_text={"state": 'succeeded'})
    elif url.split('/')[-2] == 'data':
        data = dumps(np.ones((100, 100)) * 100)
        return MockResponse(200, data=data)
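# `MockResponse` is used above but not defined in this excerpt. A minimal
# sketch of what such a helper might look like follows; the attribute names
# (`status_code`, `text`, `content`) are assumptions modelled on the
# requests.Response interface, not necessarily the project's actual definition.
import json


class MockResponse:
    def __init__(self, status_code, json_text=None, data=None):
        self.status_code = status_code
        self._json_text = json_text
        self._content = data

    @property
    def text(self):
        # JSON payloads, mimicking requests.Response.text
        return json.dumps(self._json_text)

    @property
    def content(self):
        # raw payload, e.g. the dumps(...) output used for the 'data' route
        return self._content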
async def test_base_operations(storage_context):
    storage = storage_context

    data1 = np.random.rand(10, 10)
    put_info1 = await storage.put(data1)
    get_data1 = await storage.get(put_info1.object_id)
    np.testing.assert_array_equal(data1, get_data1)

    info1 = await storage.object_info(put_info1.object_id)
    # FIXME: remove os check when size issue fixed
    assert info1.size == put_info1.size or not sys.platform.startswith('linux')

    data2 = pd.DataFrame(
        {
            'col1': np.arange(10),
            'col2': [f'str{i}' for i in range(10)],
            'col3': np.random.rand(10)
        },
    )
    put_info2 = await storage.put(data2)
    get_data2 = await storage.get(put_info2.object_id)
    pd.testing.assert_frame_equal(data2, get_data2)

    info2 = await storage.object_info(put_info2.object_id)
    # FIXME: remove os check when size issue fixed
    assert info2.size == put_info2.size or not sys.platform.startswith('linux')

    # FIXME: remove when list functionality is ready for vineyard.
    if not isinstance(storage, (VineyardStorage, SharedMemoryStorage, RayStorage)):
        num = len(await storage.list())
        assert num == 2
        await storage.delete(info2.object_id)

    # test SparseMatrix
    s1 = sps.csr_matrix([[1, 0, 1], [0, 0, 1]])
    s = SparseNDArray(s1)
    put_info3 = await storage.put(s)
    get_data3 = await storage.get(put_info3.object_id)
    assert isinstance(get_data3, SparseMatrix)
    np.testing.assert_array_equal(get_data3.toarray(), s1.A)
    np.testing.assert_array_equal(get_data3.todense(), s1.A)

    # test writer and reader
    t = np.random.random(10)
    b = dataserializer.dumps(t)
    async with await storage.open_writer(size=len(b)) as writer:
        split = len(b) // 2
        await writer.write(b[:split])
        await writer.write(b[split:])

    async with await storage.open_reader(writer.object_id) as reader:
        content = await reader.read()
        t2 = dataserializer.loads(content)

    np.testing.assert_array_equal(t, t2)
async def test_storage_mock_api(storage_configs):
    start_method = 'fork' if sys.platform != 'win32' else None
    pool = await mo.create_actor_pool('127.0.0.1', 1,
                                      subprocess_start_method=start_method)
    async with pool:
        session_id = 'mock_session_id'
        storage_api = await MockStorageAPI.create(
            address=pool.external_address,
            session_id=session_id,
            storage_configs=storage_configs)

        # test put and get
        value1 = np.random.rand(10, 10)
        await storage_api.put('data1', value1)
        get_value1 = await storage_api.get('data1')
        np.testing.assert_array_equal(value1, get_value1)

        value2 = pd.DataFrame({
            'col1': [str(i) for i in range(10)],
            'col2': np.random.randint(0, 100, (10,))
        })
        await storage_api.put('data2', value2)
        await storage_api.prefetch('data2')
        get_value2 = await storage_api.get('data2')
        pd.testing.assert_frame_equal(value2, get_value2)

        sliced_value = await storage_api.get(
            'data2', conditions=[slice(3, 5), slice(None, None)])
        pd.testing.assert_frame_equal(value2.iloc[3:5, :], sliced_value)

        infos = await storage_api.get_infos('data2')
        assert infos[0].store_size > 0

        await storage_api.delete('data2')

        await storage_api.prefetch('data1')

        write_data = dataserializer.dumps(value2)
        # test open_reader and open_writer
        writer = await storage_api.open_writer('write_key', len(write_data),
                                               StorageLevel.MEMORY)
        async with writer:
            await writer.write(write_data)

        reader = await storage_api.open_reader('write_key')
        async with reader:
            read_bytes = await reader.read()
        read_value = dataserializer.loads(read_bytes)

        pd.testing.assert_frame_equal(value2, read_value)
def testReceiver(self):
    pool_addr = 'localhost:%d' % get_next_port()
    options.worker.spill_directory = os.path.join(
        tempfile.gettempdir(), 'mars_spill_%d_%d' % (os.getpid(), id(run_transfer_worker)))
    session_id = str(uuid.uuid4())

    mock_data = np.array([1, 2, 3, 4])
    serialized_mock_data = dataserializer.dumps(mock_data)
    serialized_crc32 = zlib.crc32(serialized_mock_data)

    chunk_key1 = str(uuid.uuid4())
    chunk_key2 = str(uuid.uuid4())
    chunk_key3 = str(uuid.uuid4())
    chunk_key4 = str(uuid.uuid4())
    chunk_key5 = str(uuid.uuid4())
    chunk_key6 = str(uuid.uuid4())

    with start_transfer_test_pool(address=pool_addr,
                                  plasma_size=self.plasma_storage_size) as pool:
        chunk_holder_ref = pool.actor_ref(ChunkHolderActor.default_name())
        mapper_ref = pool.actor_ref(PlasmaKeyMapActor.default_name())
        receiver_ref = pool.create_actor(ReceiverActor, uid=str(uuid.uuid4()))

        store = PlasmaChunkStore(self._plasma_client, mapper_ref)

        # check_status on receiving and received
        self.assertEqual(receiver_ref.check_status(session_id, chunk_key1),
                         ReceiveStatus.NOT_STARTED)

        write_spill_file(chunk_key1, mock_data)
        self.assertEqual(receiver_ref.check_status(session_id, chunk_key1),
                         ReceiveStatus.RECEIVED)
        os.unlink(build_spill_file_name(chunk_key1))

        ref = store.put(session_id, chunk_key1, mock_data)
        data_size = store.get_actual_size(session_id, chunk_key1)
        chunk_holder_ref.register_chunk(session_id, chunk_key1)
        del ref
        self.assertEqual(receiver_ref.check_status(session_id, chunk_key1),
                         ReceiveStatus.RECEIVED)

        with self.run_actor_test(pool) as test_actor:
            receiver_ref_p = test_actor.promise_ref(receiver_ref)

            # cancel on an un-run / missing result will result in nothing
            receiver_ref_p.cancel_receive(session_id, chunk_key2)

            # start creating writer
            receiver_ref_p.create_data_writer(session_id, chunk_key1, data_size,
                                              test_actor, _promise=True) \
                .then(lambda *s: test_actor.set_result(s, destroy=False))
            self.assertTupleEqual(self.get_result(5),
                                  (receiver_ref.address, ReceiveStatus.RECEIVED))

            receiver_ref_p.create_data_writer(session_id, chunk_key2, data_size,
                                              test_actor, _promise=True) \
                .then(lambda *s: test_actor.set_result(s, destroy=False))
            self.assertTupleEqual(self.get_result(5), (receiver_ref.address, None))

            receiver_ref_p.create_data_writer(session_id, chunk_key2, data_size,
                                              test_actor, _promise=True) \
                .then(lambda *s: test_actor.set_result(s, destroy=False))
            self.assertTupleEqual(self.get_result(5),
                                  (receiver_ref.address, ReceiveStatus.RECEIVING))

            receiver_ref_p.cancel_receive(session_id, chunk_key2)
            self.assertEqual(receiver_ref.check_status(session_id, chunk_key2),
                             ReceiveStatus.NOT_STARTED)

            # test checksum error on receive_data_part
            receiver_ref_p.create_data_writer(session_id, chunk_key2, data_size,
                                              test_actor, _promise=True) \
                .then(lambda *s: test_actor.set_result(s, destroy=False))

            receiver_ref_p.register_finish_callback(session_id, chunk_key2, _promise=True) \
                .then(lambda *s: test_actor.set_result(s, destroy=False)) \
                .catch(lambda *exc: test_actor.set_result(exc, accept=False, destroy=False))

            receiver_ref_p.receive_data_part(session_id, chunk_key2,
                                             serialized_mock_data, 0)

            with self.assertRaises(ChecksumMismatch):
                self.get_result(5)

            # test checksum error on finish_receive
            receiver_ref_p.create_data_writer(session_id, chunk_key2, data_size,
                                              test_actor, _promise=True) \
                .then(lambda *s: test_actor.set_result(s, destroy=False))
            self.assertTupleEqual(self.get_result(5), (receiver_ref.address, None))

            receiver_ref_p.receive_data_part(session_id, chunk_key2,
                                             serialized_mock_data, serialized_crc32)
            receiver_ref_p.finish_receive(session_id, chunk_key2, 0)

            receiver_ref_p.register_finish_callback(session_id, chunk_key2, _promise=True) \
                .then(lambda *s: test_actor.set_result(s, destroy=False)) \
                .catch(lambda *exc: test_actor.set_result(exc, accept=False, destroy=False))

            with self.assertRaises(ChecksumMismatch):
                self.get_result(5)

            receiver_ref_p.cancel_receive(session_id, chunk_key2)

            # test intermediate cancellation
            receiver_ref_p.create_data_writer(session_id, chunk_key2, data_size,
                                              test_actor, _promise=True) \
                .then(lambda *s: test_actor.set_result(s, destroy=False))
            self.assertTupleEqual(self.get_result(5), (receiver_ref.address, None))

            receiver_ref_p.register_finish_callback(session_id, chunk_key2, _promise=True) \
                .then(lambda *s: test_actor.set_result(s, destroy=False)) \
                .catch(lambda *exc: test_actor.set_result(exc, accept=False, destroy=False))
            receiver_ref_p.receive_data_part(session_id, chunk_key2,
                                             serialized_mock_data[:64],
                                             zlib.crc32(serialized_mock_data[:64]))
            receiver_ref_p.cancel_receive(session_id, chunk_key2)
            receiver_ref_p.receive_data_part(session_id, chunk_key2,
                                             serialized_mock_data[64:], serialized_crc32)
            with self.assertRaises(ExecutionInterrupted):
                self.get_result(5)

            # test transfer in memory
            receiver_ref_p.register_finish_callback(session_id, chunk_key3, _promise=True) \
                .then(lambda *s: test_actor.set_result(s, destroy=False)) \
                .catch(lambda *exc: test_actor.set_result(exc, accept=False, destroy=False))

            receiver_ref_p.create_data_writer(session_id, chunk_key3, data_size,
                                              test_actor, _promise=True) \
                .then(lambda *s: test_actor.set_result(s, destroy=False))
            self.assertTupleEqual(self.get_result(5), (receiver_ref.address, None))

            receiver_ref_p.receive_data_part(session_id, chunk_key3,
                                             serialized_mock_data[:64],
                                             zlib.crc32(serialized_mock_data[:64]))
            receiver_ref_p.receive_data_part(session_id, chunk_key3,
                                             serialized_mock_data[64:], serialized_crc32)
            receiver_ref_p.finish_receive(session_id, chunk_key3, serialized_crc32)

            self.assertTupleEqual((), self.get_result(5))

            receiver_ref_p.create_data_writer(session_id, chunk_key3, data_size,
                                              test_actor, _promise=True) \
                .then(lambda *s: test_actor.set_result(s, destroy=False))
            self.assertTupleEqual(self.get_result(5),
                                  (receiver_ref.address, ReceiveStatus.RECEIVED))

            # test transfer in spill file
            def mocked_store_create(*_):
                raise StoreFull

            with patch_method(PlasmaChunkStore.create, new=mocked_store_create):
                # test receive aborted
                receiver_ref_p.create_data_writer(
                    session_id, chunk_key4, data_size, test_actor,
                    ensure_cached=False, _promise=True) \
                    .then(lambda *s: test_actor.set_result(s, destroy=False))
                self.assertTupleEqual(self.get_result(5), (receiver_ref.address, None))

                receiver_ref_p.register_finish_callback(session_id, chunk_key4, _promise=True) \
                    .then(lambda *s: test_actor.set_result(s, destroy=False)) \
                    .catch(lambda *exc: test_actor.set_result(exc, accept=False, destroy=False))

                receiver_ref_p.receive_data_part(session_id, chunk_key4,
                                                 serialized_mock_data[:64],
                                                 zlib.crc32(serialized_mock_data[:64]))
                receiver_ref_p.cancel_receive(session_id, chunk_key4)
                with self.assertRaises(ExecutionInterrupted):
                    self.get_result(5)

                # test receive into spill
                receiver_ref_p.create_data_writer(
                    session_id, chunk_key4, data_size, test_actor,
                    ensure_cached=False, _promise=True) \
                    .then(lambda *s: test_actor.set_result(s, destroy=False))
                self.assertTupleEqual(self.get_result(5), (receiver_ref.address, None))

                receiver_ref_p.register_finish_callback(session_id, chunk_key4, _promise=True) \
                    .then(lambda *s: test_actor.set_result(s, destroy=False)) \
                    .catch(lambda *exc: test_actor.set_result(exc, accept=False, destroy=False))

                receiver_ref_p.receive_data_part(session_id, chunk_key4,
                                                 serialized_mock_data, serialized_crc32)
                receiver_ref_p.finish_receive(session_id, chunk_key4, serialized_crc32)

                self.assertTupleEqual((), self.get_result(5))

            # test intermediate error
            def mocked_store_create(*_):
                raise SpillNotConfigured

            with patch_method(PlasmaChunkStore.create, new=mocked_store_create):
                receiver_ref_p.create_data_writer(
                    session_id, chunk_key5, data_size, test_actor,
                    ensure_cached=False, _promise=True) \
                    .then(lambda *s: test_actor.set_result(s, destroy=False),
                          lambda *s: test_actor.set_result(s, accept=False, destroy=False))
                with self.assertRaises(SpillNotConfigured):
                    self.get_result(5)

            # test receive timeout
            receiver_ref_p.register_finish_callback(session_id, chunk_key6, _promise=True) \
                .then(lambda *s: test_actor.set_result(s, destroy=False)) \
                .catch(lambda *exc: test_actor.set_result(exc, accept=False, destroy=False))

            receiver_ref_p.create_data_writer(session_id, chunk_key6, data_size,
                                              test_actor, timeout=2, _promise=True) \
                .then(lambda *s: test_actor.set_result(s, destroy=False))
            self.assertTupleEqual(self.get_result(5), (receiver_ref.address, None))

            receiver_ref_p.receive_data_part(session_id, chunk_key6,
                                             serialized_mock_data[:64],
                                             zlib.crc32(serialized_mock_data[:64]))

            with self.assertRaises(TimeoutError):
                self.get_result(5)
def testDataSerialize(self):
    array = np.random.rand(1000, 100)
    assert_array_equal(array, dataserializer.loads(dataserializer.dumps(array)))
    assert_array_equal(array, dataserializer.loads(dataserializer.dumps(
        array, compress=dataserializer.CompressType.LZ4)))
    assert_array_equal(array, dataserializer.loads(dataserializer.dumps(
        array, compress=dataserializer.CompressType.GZIP)))

    array = np.random.rand(1000, 100)
    assert_array_equal(array, dataserializer.load(BytesIO(dataserializer.dumps(array))))
    assert_array_equal(array, dataserializer.load(BytesIO(dataserializer.dumps(
        array, compress=dataserializer.CompressType.LZ4))))
    assert_array_equal(array, dataserializer.load(BytesIO(dataserializer.dumps(
        array, compress=dataserializer.CompressType.GZIP))))

    array = np.random.rand(1000, 100).T  # test non c-contiguous
    assert_array_equal(array, dataserializer.loads(dataserializer.dumps(array)))
    assert_array_equal(array, dataserializer.loads(dataserializer.dumps(
        array, compress=dataserializer.CompressType.LZ4)))
    assert_array_equal(array, dataserializer.loads(dataserializer.dumps(
        array, compress=dataserializer.CompressType.GZIP)))

    array = np.float64(0.2345)
    assert_array_equal(array, dataserializer.loads(dataserializer.dumps(array)))
    assert_array_equal(array, dataserializer.loads(dataserializer.dumps(
        array, compress=dataserializer.CompressType.LZ4)))
    assert_array_equal(array, dataserializer.loads(dataserializer.dumps(
        array, compress=dataserializer.CompressType.GZIP)))

    # test structured arrays.
    rec_dtype = np.dtype([('a', 'int64'), ('b', 'double'), ('c', '<U8')])
    array = np.ones((100,), dtype=rec_dtype)
    array_loaded = dataserializer.loads(dataserializer.dumps(array))
    self.assertEqual(array.dtype, array_loaded.dtype)
    assert_array_equal(array, array_loaded)

    fn = os.path.join(tempfile.gettempdir(), 'test_dump_file_%d.bin' % id(self))
    try:
        array = np.random.rand(1000, 100).T  # test non c-contiguous
        with open(fn, 'wb') as dump_file:
            dataserializer.dump(array, dump_file)
        with open(fn, 'rb') as dump_file:
            assert_array_equal(array, dataserializer.load(dump_file))

        with open(fn, 'wb') as dump_file:
            dataserializer.dump(array, dump_file,
                                compress=dataserializer.CompressType.LZ4)
        with open(fn, 'rb') as dump_file:
            assert_array_equal(array, dataserializer.load(dump_file))

        with open(fn, 'wb') as dump_file:
            dataserializer.dump(array, dump_file,
                                compress=dataserializer.CompressType.GZIP)
        with open(fn, 'rb') as dump_file:
            assert_array_equal(array, dataserializer.load(dump_file))
    finally:
        if os.path.exists(fn):
            os.unlink(fn)

    # test sparse
    if sps:
        mat = sparse.SparseMatrix(sps.random(100, 100, 0.1, format='csr'))
        des_mat = dataserializer.loads(dataserializer.dumps(mat))
        self.assertTrue((mat.spmatrix != des_mat.spmatrix).nnz == 0)

        des_mat = dataserializer.loads(dataserializer.dumps(
            mat, compress=dataserializer.CompressType.LZ4))
        self.assertTrue((mat.spmatrix != des_mat.spmatrix).nnz == 0)

        des_mat = dataserializer.loads(dataserializer.dumps(
            mat, compress=dataserializer.CompressType.GZIP))
        self.assertTrue((mat.spmatrix != des_mat.spmatrix).nnz == 0)

        vector = sparse.SparseVector(sps.csr_matrix(np.random.rand(2)), shape=(2,))
        des_vector = dataserializer.loads(dataserializer.dumps(vector))
        self.assertTrue((vector.spmatrix != des_vector.spmatrix).nnz == 0)

        des_vector = dataserializer.loads(dataserializer.dumps(
            vector, compress=dataserializer.CompressType.LZ4))
        self.assertTrue((vector.spmatrix != des_vector.spmatrix).nnz == 0)

        des_vector = dataserializer.loads(dataserializer.dumps(
            vector, compress=dataserializer.CompressType.GZIP))
        self.assertTrue((vector.spmatrix != des_vector.spmatrix).nnz == 0)

    # test groupby
    df1 = pd.DataFrame({'a': [3, 4, 5, 3, 5, 4, 1, 2, 3],
                        'b': [1, 3, 4, 5, 6, 5, 4, 4, 4],
                        'c': list('aabaaddce')})
    grouped = wrapped_groupby(df1, 'b')
    restored = dataserializer.loads(dataserializer.dumps(grouped))
    assert_groupby_equal(grouped, restored.groupby_obj)

    grouped = wrapped_groupby(df1, 'b').c
    restored = dataserializer.loads(dataserializer.dumps(grouped))
    assert_groupby_equal(grouped, restored.groupby_obj)

    grouped = wrapped_groupby(df1, 'b')
    getattr(grouped, 'indices')
    restored = dataserializer.loads(dataserializer.dumps(grouped))
    assert_groupby_equal(grouped, restored.groupby_obj)

    grouped = wrapped_groupby(df1.b, lambda x: x % 2)
    restored = dataserializer.loads(dataserializer.dumps(grouped))
    assert_groupby_equal(grouped, restored.groupby_obj)

    grouped = wrapped_groupby(df1.b, lambda x: x % 2)
    getattr(grouped, 'indices')
    restored = dataserializer.loads(dataserializer.dumps(grouped))
    assert_groupby_equal(grouped, restored.groupby_obj)

    # test categorical
    s = np.random.RandomState(0).random(10)
    cat = pd.cut(s, [0.3, 0.5, 0.8])
    self.assertIsInstance(cat, pd.Categorical)
    des_cat = dataserializer.loads(dataserializer.dumps(cat))
    self.assertEqual(len(cat), len(des_cat))
    for c, dc in zip(cat, des_cat):
        np.testing.assert_equal(c, dc)

    # test IntervalIndex
    s = pd.interval_range(10, 100, 3)
    dest_s = dataserializer.loads((dataserializer.dumps(s)))
    pd.testing.assert_index_equal(s, dest_s)
def testDiskReadAndWriteMerger(self):
    import logging
    logging.basicConfig(level=logging.DEBUG)
    test_addr = f'127.0.0.1:{get_next_port()}'
    options.worker.filemerger.max_file_size = 2400
    options.worker.filemerger.concurrency = 16

    with self.create_pool(n_process=1, address=test_addr) as pool, \
            self.run_actor_test(pool) as test_actor:
        pool.create_actor(WorkerClusterInfoActor, [test_addr],
                          uid=WorkerClusterInfoActor.default_uid())
        pool.create_actor(StatusActor, test_addr, uid=StatusActor.default_uid())
        pool.create_actor(EventsActor, uid=EventsActor.default_uid())
        disk_file_merger_ref = pool.create_actor(
            DiskFileMergerActor, uid=DiskFileMergerActor.default_uid())
        pool.create_actor(WorkerDaemonActor, uid=WorkerDaemonActor.default_uid())
        storage_manager_ref = pool.create_actor(
            StorageManagerActor, uid=StorageManagerActor.default_uid())

        session_id = str(uuid.uuid4())
        data_count = 30
        data = [np.random.rand(random.randint(10, 30), random.randint(10, 30))
                for _ in range(data_count)]
        ser_data = [dataserializer.serialize(d) for d in data]

        storage_client = test_actor.storage_client
        handler = storage_client.get_storage_handler((0, DataStorageDevice.DISK))

        for handler._compress in self._get_compress_types():
            data_keys = [str(uuid.uuid4()) for _ in range(data_count)]

            promises = []
            for idx in range(data_count):
                block_data = dataserializer.dumps(data[idx], compress=handler._compress)

                def _write_data(ser, writer):
                    with writer:
                        writer.write(ser)
                    return writer.filename

                promises.append(
                    handler.create_bytes_writer(session_id, data_keys[idx],
                                                ser_data[idx].total_bytes, packed=True,
                                                with_merger_lock=True, _promise=True)
                    .then(functools.partial(_write_data, block_data)))
            promise.all_(promises).then(
                lambda *_: test_actor.set_result(0),
                lambda *exc: test_actor.set_result(exc, accept=False))
            self.get_result(50)

            for key in data_keys:
                self.assertEqual(sorted(storage_manager_ref.get_data_locations(
                    session_id, [key])[0]), [(0, DataStorageDevice.DISK)])

            dump_result = disk_file_merger_ref.dump_info()
            written_files = list(dump_result[2])
            for fn in written_files:
                self.assertTrue(os.path.exists(fn))

            data_store = [None] * len(data)
            promises = []
            for idx in range(data_count):
                def _read_data(reader, idx):
                    with reader:
                        data_store[idx] = dataserializer.loads(reader.read())

                promises.append(
                    handler.create_bytes_reader(session_id, data_keys[idx],
                                                with_merger_lock=True, packed=True,
                                                _promise=True)
                    .then(functools.partial(_read_data, idx=idx)))
            promise.all_(promises).then(
                lambda *_: test_actor.set_result(0),
                lambda *exc: test_actor.set_result(exc, accept=False))
            self.get_result(50)
            for true_data, read_data in zip(data, data_store):
                assert_allclose(true_data, read_data)

            data_store = [None] * len(data)
            promises = []
            for idx in range(data_count):
                def _read_data(reader, idx):
                    with reader:
                        data_store[idx] = dataserializer.deserialize(reader.read())

                promises.append(
                    handler.create_bytes_reader(session_id, data_keys[idx],
                                                _promise=True)
                    .then(functools.partial(_read_data, idx=idx)))
            promise.all_(promises).then(
                lambda *_: test_actor.set_result(0),
                lambda *exc: test_actor.set_result(exc, accept=False))
            self.get_result(50)
            for true_data, read_data in zip(data, data_store):
                assert_allclose(true_data, read_data)

            storage_client.delete(session_id, data_keys)
            pool.sleep(0.1)
            for fn in written_files:
                self.assertFalse(os.path.exists(fn))
def testDataSerialize(self):
    try:
        import numpy as np
        from numpy.testing import assert_array_equal
    except ImportError:
        np = None
    try:
        import scipy.sparse as sps
    except ImportError:
        sps = None

    if np:
        array = np.random.rand(1000, 100)
        assert_array_equal(array, dataserializer.loads(dataserializer.dumps(array)))
        assert_array_equal(array, dataserializer.loads(dataserializer.dumps(
            array, compress=dataserializer.CompressType.LZ4)))
        if not six.PY2:
            assert_array_equal(array, dataserializer.loads(dataserializer.dumps(
                array, compress=dataserializer.CompressType.GZIP)))

        array = np.random.rand(1000, 100)
        assert_array_equal(array, dataserializer.load(BytesIO(dataserializer.dumps(array))))
        assert_array_equal(array, dataserializer.load(BytesIO(dataserializer.dumps(
            array, compress=dataserializer.CompressType.LZ4))))
        if not six.PY2:
            assert_array_equal(array, dataserializer.load(BytesIO(dataserializer.dumps(
                array, compress=dataserializer.CompressType.GZIP))))

        array = np.random.rand(1000, 100).T  # test non c-contiguous
        assert_array_equal(array, dataserializer.loads(dataserializer.dumps(array)))
        assert_array_equal(array, dataserializer.loads(dataserializer.dumps(
            array, compress=dataserializer.CompressType.LZ4)))
        if not six.PY2:
            assert_array_equal(array, dataserializer.loads(dataserializer.dumps(
                array, compress=dataserializer.CompressType.GZIP)))

        array = np.float64(0.2345)
        assert_array_equal(array, dataserializer.loads(dataserializer.dumps(array)))
        assert_array_equal(array, dataserializer.loads(dataserializer.dumps(
            array, compress=dataserializer.CompressType.LZ4)))
        if not six.PY2:
            assert_array_equal(array, dataserializer.loads(dataserializer.dumps(
                array, compress=dataserializer.CompressType.GZIP)))

        # test structured arrays.
        rec_dtype = np.dtype([('a', 'int64'), ('b', 'double'), ('c', '<U8')])
        array = np.ones((100,), dtype=rec_dtype)
        array_loaded = dataserializer.loads(dataserializer.dumps(array))
        self.assertEqual(array.dtype, array_loaded.dtype)
        assert_array_equal(array, array_loaded)

        fn = os.path.join(tempfile.gettempdir(), 'test_dump_file_%d.bin' % id(self))
        try:
            array = np.random.rand(1000, 100).T  # test non c-contiguous
            with open(fn, 'wb') as dump_file:
                dataserializer.dump(array, dump_file)
            with open(fn, 'rb') as dump_file:
                assert_array_equal(array, dataserializer.load(dump_file))

            with open(fn, 'wb') as dump_file:
                dataserializer.dump(array, dump_file,
                                    compress=dataserializer.CompressType.LZ4)
            with open(fn, 'rb') as dump_file:
                assert_array_equal(array, dataserializer.load(dump_file))

            if not six.PY2:
                with open(fn, 'wb') as dump_file:
                    dataserializer.dump(array, dump_file,
                                        compress=dataserializer.CompressType.GZIP)
                with open(fn, 'rb') as dump_file:
                    assert_array_equal(array, dataserializer.load(dump_file))
        finally:
            if os.path.exists(fn):
                os.unlink(fn)

    if sps:
        mat = sparse.SparseMatrix(sps.random(100, 100, 0.1, format='csr'))
        des_mat = dataserializer.loads(dataserializer.dumps(mat))
        self.assertTrue((mat.spmatrix != des_mat.spmatrix).nnz == 0)

        des_mat = dataserializer.loads(dataserializer.dumps(
            mat, compress=dataserializer.CompressType.LZ4))
        self.assertTrue((mat.spmatrix != des_mat.spmatrix).nnz == 0)

        if not six.PY2:
            des_mat = dataserializer.loads(dataserializer.dumps(
                mat, compress=dataserializer.CompressType.GZIP))
            self.assertTrue((mat.spmatrix != des_mat.spmatrix).nnz == 0)

        vector = sparse.SparseVector(sps.csr_matrix(np.random.rand(2)), shape=(2,))
        des_vector = dataserializer.loads(dataserializer.dumps(vector))
        self.assertTrue((vector.spmatrix != des_vector.spmatrix).nnz == 0)

        des_vector = dataserializer.loads(dataserializer.dumps(
            vector, compress=dataserializer.CompressType.LZ4))
        self.assertTrue((vector.spmatrix != des_vector.spmatrix).nnz == 0)

        if not six.PY2:
            des_vector = dataserializer.loads(dataserializer.dumps(
                vector, compress=dataserializer.CompressType.GZIP))
            self.assertTrue((vector.spmatrix != des_vector.spmatrix).nnz == 0)
def testDataSerialize(self):
    for type_, compress in itertools.product(
            (None,) + tuple(dataserializer.SerialType.__members__.values()),
            (None,) + tuple(dataserializer.CompressType.__members__.values())):
        array = np.random.rand(1000, 100)
        assert_array_equal(array, dataserializer.loads(
            dataserializer.dumps(array, serial_type=type_, compress=compress)))

        array = np.random.rand(1000, 100)
        assert_array_equal(array, dataserializer.load(
            BytesIO(dataserializer.dumps(array, serial_type=type_, compress=compress))))

        array = np.random.rand(1000, 100).T  # test non c-contiguous
        assert_array_equal(array, dataserializer.loads(
            dataserializer.dumps(array, serial_type=type_, compress=compress)))

        array = np.float64(0.2345)
        assert_array_equal(array, dataserializer.loads(
            dataserializer.dumps(array, serial_type=type_, compress=compress)))

    # test non-serializable object
    if pyarrow:
        non_serial = type('non_serial', (object,), dict(nbytes=10))
        with self.assertRaises(SerializationFailed):
            dataserializer.dumps(non_serial())

    # test structured arrays.
    rec_dtype = np.dtype([('a', 'int64'), ('b', 'double'), ('c', '<U8')])
    array = np.ones((100,), dtype=rec_dtype)
    array_loaded = dataserializer.loads(dataserializer.dumps(array))
    self.assertEqual(array.dtype, array_loaded.dtype)
    assert_array_equal(array, array_loaded)

    fn = os.path.join(tempfile.gettempdir(), f'test_dump_file_{id(self)}.bin')
    try:
        array = np.random.rand(1000, 100).T  # test non c-contiguous
        with open(fn, 'wb') as dump_file:
            dataserializer.dump(array, dump_file)
        with open(fn, 'rb') as dump_file:
            assert_array_equal(array, dataserializer.load(dump_file))

        with open(fn, 'wb') as dump_file:
            dataserializer.dump(array, dump_file,
                                compress=dataserializer.CompressType.LZ4)
        with open(fn, 'rb') as dump_file:
            assert_array_equal(array, dataserializer.load(dump_file))

        with open(fn, 'wb') as dump_file:
            dataserializer.dump(array, dump_file,
                                compress=dataserializer.CompressType.GZIP)
        with open(fn, 'rb') as dump_file:
            assert_array_equal(array, dataserializer.load(dump_file))
    finally:
        if os.path.exists(fn):
            os.unlink(fn)

    # test sparse
    if sps:
        mat = sparse.SparseMatrix(sps.random(100, 100, 0.1, format='csr'))
        des_mat = dataserializer.loads(dataserializer.dumps(mat))
        self.assertTrue((mat.spmatrix != des_mat.spmatrix).nnz == 0)

        des_mat = dataserializer.loads(dataserializer.dumps(
            mat, compress=dataserializer.CompressType.LZ4))
        self.assertTrue((mat.spmatrix != des_mat.spmatrix).nnz == 0)

        des_mat = dataserializer.loads(dataserializer.dumps(
            mat, compress=dataserializer.CompressType.GZIP))
        self.assertTrue((mat.spmatrix != des_mat.spmatrix).nnz == 0)

        vector = sparse.SparseVector(sps.csr_matrix(np.random.rand(2)), shape=(2,))
        des_vector = dataserializer.loads(dataserializer.dumps(vector))
        self.assertTrue((vector.spmatrix != des_vector.spmatrix).nnz == 0)

        des_vector = dataserializer.loads(dataserializer.dumps(
            vector, compress=dataserializer.CompressType.LZ4))
        self.assertTrue((vector.spmatrix != des_vector.spmatrix).nnz == 0)

        des_vector = dataserializer.loads(dataserializer.dumps(
            vector, compress=dataserializer.CompressType.GZIP))
        self.assertTrue((vector.spmatrix != des_vector.spmatrix).nnz == 0)

    # test groupby
    df1 = pd.DataFrame({'a': [3, 4, 5, 3, 5, 4, 1, 2, 3],
                        'b': [1, 3, 4, 5, 6, 5, 4, 4, 4],
                        'c': list('aabaaddce')})
    grouped = wrapped_groupby(df1, 'b')
    restored = dataserializer.loads(dataserializer.dumps(grouped))
    assert_groupby_equal(grouped, restored.groupby_obj)

    grouped = wrapped_groupby(df1, 'b').c
    restored = dataserializer.loads(dataserializer.dumps(grouped))
    assert_groupby_equal(grouped, restored.groupby_obj)

    grouped = wrapped_groupby(df1, 'b')
    getattr(grouped, 'indices')
    restored = dataserializer.loads(dataserializer.dumps(grouped))
    assert_groupby_equal(grouped, restored.groupby_obj)

    grouped = wrapped_groupby(df1.b, lambda x: x % 2)
    restored = dataserializer.loads(dataserializer.dumps(grouped))
    assert_groupby_equal(grouped, restored.groupby_obj)

    grouped = wrapped_groupby(df1.b, lambda x: x % 2)
    getattr(grouped, 'indices')
    restored = dataserializer.loads(dataserializer.dumps(grouped))
    assert_groupby_equal(grouped, restored.groupby_obj)

    # test categorical
    s = np.random.RandomState(0).random(10)
    cat = pd.cut(s, [0.3, 0.5, 0.8])
    self.assertIsInstance(cat, pd.Categorical)
    des_cat = dataserializer.loads(dataserializer.dumps(cat))
    self.assertEqual(len(cat), len(des_cat))
    for c, dc in zip(cat, des_cat):
        np.testing.assert_equal(c, dc)

    # test IntervalIndex
    s = pd.interval_range(10, 100, 3)
    dest_s = dataserializer.loads((dataserializer.dumps(s)))
    pd.testing.assert_index_equal(s, dest_s)

    # test complex
    s = complex(10 + 5j)
    dest_s = dataserializer.loads((dataserializer.dumps(s)))
    self.assertIs(type(s), type(dest_s))
    self.assertEqual(s, dest_s)

    s = np.complex64(10 + 5j)
    dest_s = dataserializer.loads((dataserializer.dumps(s)))
    self.assertIs(type(s), type(dest_s))
    self.assertEqual(s, dest_s)

    # test pickle
    d = ClassToPickle(dict(a=1, b='uvw'))
    dest_d = dataserializer.loads((dataserializer.dumps(d)))
    self.assertIs(type(d), type(dest_d))
    self.assertEqual(d.a, dest_d.a)

    # test ndarray with negative strides
    arr = np.zeros((5, 6, 3))
    arr2 = arr[:, :, ::-1]
    dest_arr2 = dataserializer.loads(dataserializer.dumps(arr2))
    np.testing.assert_array_equal(arr2, dest_arr2)

    # test ArrowArray
    df = pd.DataFrame({'a': ['s1', 's2', 's3'],
                       'b': [['s1', 's2'], ['s3'], ['s4', 's5']]})
    df['a'] = df['a'].astype(ArrowStringDtype())
    df['b'] = df['b'].astype(ArrowListDtype(str))
    dest_df = dataserializer.loads(dataserializer.dumps(df))
    self.assertIs(type(df), type(dest_df))
    pd.testing.assert_frame_equal(df, dest_df)

    # test DataFrame with SparseDtype
    s = pd.Series([1, 2, np.nan, np.nan, 3]).astype(
        pd.SparseDtype(np.dtype(np.float64), np.nan))
    dest_s = dataserializer.loads((dataserializer.dumps(s)))
    pd.testing.assert_series_equal(s, dest_s)

    df = pd.DataFrame({'s': s})
    dest_df = dataserializer.loads((dataserializer.dumps(df)))
    pd.testing.assert_frame_equal(df, dest_df)
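# `ClassToPickle` is exercised in the "# test pickle" block above but its
# definition is not part of this excerpt. Any plain, pickle-able class whose
# constructor argument is exposed as attribute `a` would satisfy the
# assertions; the stand-in below is an assumed sketch, not the project's
# actual helper.
class ClassToPickle:
    def __init__(self, a):
        # the test only checks type identity and `.a` equality after a
        # dumps/loads round trip
        self.a = a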
def testSharedReadAndWritePacked(self, *_):
    test_addr = '127.0.0.1:%d' % get_next_port()
    io_size = dataserializer.HEADER_LENGTH * 2
    with self.create_pool(n_process=1, address=test_addr) as pool, \
            self.run_actor_test(pool) as test_actor:
        pool.create_actor(WorkerDaemonActor, uid=WorkerDaemonActor.default_uid())
        storage_manager_ref = pool.create_actor(
            StorageManagerActor, uid=StorageManagerActor.default_uid())

        pool.create_actor(PlasmaKeyMapActor, uid=PlasmaKeyMapActor.default_uid())
        pool.create_actor(SharedHolderActor, uid=SharedHolderActor.default_uid())

        data1 = np.random.random((100, 100))
        ser_data1 = dataserializer.serialize(data1)
        block_data1 = dataserializer.dumps(data1, dataserializer.CompressType.NONE)

        session_id = str(uuid.uuid4())
        data_key1 = str(uuid.uuid4())

        storage_client = test_actor.storage_client
        handler = storage_client.get_storage_handler((0, DataStorageDevice.SHARED_MEMORY))

        def _write_data(ser, writer):
            with writer:
                writer.write(ser)

        handler.create_bytes_writer(session_id, data_key1, ser_data1.total_bytes,
                                    packed=True, _promise=True) \
            .then(functools.partial(_write_data, block_data1)) \
            .then(lambda *_: test_actor.set_result(None),
                  lambda *exc: test_actor.set_result(exc, accept=False))
        self.get_result(5)
        self.assertEqual(sorted(storage_manager_ref.get_data_locations(
            session_id, [data_key1])[0]), [(0, DataStorageDevice.SHARED_MEMORY)])
        handler.delete(session_id, [data_key1])

        def _write_data(ser, writer):
            with writer:
                with self.assertRaises(IOError):
                    writer.write(ser[:1])
                for start in range(0, len(ser), io_size):
                    writer.write(ser[start:start + io_size])

        handler.create_bytes_writer(session_id, data_key1, ser_data1.total_bytes,
                                    packed=True, _promise=True) \
            .then(functools.partial(_write_data, block_data1)) \
            .then(lambda *_: test_actor.set_result(None),
                  lambda *exc: test_actor.set_result(exc, accept=False))
        self.get_result(5)
        self.assertEqual(sorted(storage_manager_ref.get_data_locations(
            session_id, [data_key1])[0]), [(0, DataStorageDevice.SHARED_MEMORY)])

        def _read_data_all(reader):
            with reader:
                return dataserializer.loads(reader.read())

        handler.create_bytes_reader(session_id, data_key1, packed=True, _promise=True) \
            .then(_read_data_all) \
            .then(functools.partial(test_actor.set_result),
                  lambda *exc: test_actor.set_result(exc, accept=False))
        assert_allclose(self.get_result(5), data1)

        def _read_data_batch(reader):
            bio = BytesIO()
            with reader:
                while True:
                    buf = reader.read(io_size)
                    if buf:
                        bio.write(buf)
                    else:
                        break
            return dataserializer.loads(bio.getvalue())

        handler.create_bytes_reader(session_id, data_key1, packed=True, _promise=True) \
            .then(_read_data_batch) \
            .then(functools.partial(test_actor.set_result),
                  lambda *exc: test_actor.set_result(exc, accept=False))
        assert_allclose(self.get_result(5), data1)
        handler.delete(session_id, [data_key1])
def testReceiverWorker(self):
    pool_addr = f'localhost:{get_next_port()}'
    options.worker.spill_directory = tempfile.mkdtemp(prefix='mars_test_receiver_')
    session_id = str(uuid.uuid4())

    mock_data = np.array([1, 2, 3, 4])
    serialized_arrow_data = dataserializer.serialize(mock_data)
    data_size = serialized_arrow_data.total_bytes
    dumped_mock_data = dataserializer.dumps(mock_data)

    chunk_key1 = str(uuid.uuid4())
    chunk_key2 = str(uuid.uuid4())
    chunk_key3 = str(uuid.uuid4())
    chunk_key4 = str(uuid.uuid4())
    chunk_key5 = str(uuid.uuid4())
    chunk_key6 = str(uuid.uuid4())
    chunk_key7 = str(uuid.uuid4())
    chunk_key8 = str(uuid.uuid4())
    chunk_key9 = str(uuid.uuid4())

    with start_transfer_test_pool(address=pool_addr,
                                  plasma_size=self.plasma_storage_size) as pool, \
            self.run_actor_test(pool) as test_actor:
        storage_client = test_actor.storage_client

        receiver_ref = test_actor.promise_ref(
            pool.create_actor(ReceiverWorkerActor, uid=str(uuid.uuid4())))
        receiver_manager_ref = test_actor.promise_ref(ReceiverManagerActor.default_uid())

        # SCENARIO 1: create two writers and write with chunks
        self.waitp(
            receiver_ref.create_data_writers(session_id, [chunk_key1, chunk_key2],
                                             [data_size] * 2, test_actor, _promise=True))
        receiver_ref.receive_data_part(
            session_id, [chunk_key1, chunk_key2], [True, False], dumped_mock_data,
            dumped_mock_data[:len(dumped_mock_data) // 2])
        self.assertEqual(receiver_ref.check_status(session_id, chunk_key1),
                         ReceiveStatus.RECEIVED)
        self.assertEqual(receiver_ref.check_status(session_id, chunk_key2),
                         ReceiveStatus.RECEIVING)
        receiver_ref.receive_data_part(
            session_id, [chunk_key2], [True],
            dumped_mock_data[len(dumped_mock_data) // 2:])
        self.assertEqual(receiver_ref.check_status(session_id, chunk_key2),
                         ReceiveStatus.RECEIVED)
        assert_array_equal(storage_client.get_object(
            session_id, chunk_key1, [DataStorageDevice.SHARED_MEMORY], _promise=False),
            mock_data)
        assert_array_equal(storage_client.get_object(
            session_id, chunk_key2, [DataStorageDevice.SHARED_MEMORY], _promise=False),
            mock_data)

        # SCENARIO 2: one of the writers failed to create,
        # will test both existing and non-existing keys
        old_create_writer = StorageClient.create_writer

        def _create_writer_with_fail(self, session_id, chunk_key, *args, **kwargs):
            if chunk_key == fail_key:
                if kwargs.get('_promise', True):
                    return promise.finished(*build_exc_info(ValueError),
                                            **dict(_accept=False))
                else:
                    raise ValueError
            return old_create_writer(self, session_id, chunk_key, *args, **kwargs)

        with patch_method(StorageClient.create_writer, new=_create_writer_with_fail), \
                self.assertRaises(ValueError):
            fail_key = chunk_key4
            self.waitp(
                receiver_ref.create_data_writers(
                    session_id, [chunk_key3, chunk_key4, chunk_key5], [data_size] * 3,
                    test_actor, ensure_cached=False, _promise=True))
        self.assertEqual(receiver_ref.check_status(session_id, chunk_key3),
                         ReceiveStatus.NOT_STARTED)
        self.assertEqual(receiver_ref.check_status(session_id, chunk_key4),
                         ReceiveStatus.NOT_STARTED)
        self.assertEqual(receiver_ref.check_status(session_id, chunk_key5),
                         ReceiveStatus.NOT_STARTED)

        with patch_method(StorageClient.create_writer, new=_create_writer_with_fail):
            fail_key = chunk_key2
            self.waitp(
                receiver_ref.create_data_writers(session_id, [chunk_key2, chunk_key3],
                                                 [data_size] * 2, test_actor,
                                                 ensure_cached=False, _promise=True))

        # SCENARIO 3: transfer timeout
        receiver_manager_ref.register_pending_keys(session_id, [chunk_key6])
        self.waitp(
            receiver_ref.create_data_writers(session_id, [chunk_key6], [data_size],
                                             test_actor, timeout=1, _promise=True))
        with self.assertRaises(TimeoutError):
            self.waitp(
                receiver_manager_ref.add_keys_callback(session_id, [chunk_key6],
                                                       _promise=True))

        # SCENARIO 4: cancelled transfer (both before and during transfer)
        receiver_manager_ref.register_pending_keys(session_id, [chunk_key7])
        self.waitp(
            receiver_ref.create_data_writers(session_id, [chunk_key7], [data_size],
                                             test_actor, timeout=1, _promise=True))
        receiver_ref.cancel_receive(session_id, [chunk_key2, chunk_key7])
        with self.assertRaises(KeyError):
            receiver_ref.receive_data_part(session_id, [chunk_key7], [False],
                                           dumped_mock_data[:len(dumped_mock_data) // 2])
        with self.assertRaises(KeyError):
            self.waitp(
                receiver_manager_ref.add_keys_callback(session_id, [chunk_key7],
                                                       _promise=True))

        # SCENARIO 5: sender halt and receiver is notified
        # (reusing previous unsuccessful key)
        receiver_manager_ref.register_pending_keys(session_id, [chunk_key7])
        mock_ref = pool.actor_ref(test_actor.uid, address='MOCK_ADDR')
        self.waitp(
            receiver_ref.create_data_writers(session_id, [chunk_key7], [data_size],
                                             mock_ref, timeout=1, _promise=True))
        receiver_ref.notify_dead_senders(['MOCK_ADDR'])
        with self.assertRaises(WorkerDead):
            self.waitp(
                receiver_manager_ref.add_keys_callback(session_id, [chunk_key7],
                                                       _promise=True))

        # SCENARIO 6: successful transfer without promise
        receiver_ref.create_data_writers(session_id, [chunk_key8], [data_size],
                                         mock_ref, use_promise=False)
        receiver_ref.receive_data_part(session_id, [chunk_key8], [True],
                                       dumped_mock_data)
        self.assertEqual(receiver_ref.check_status(session_id, chunk_key8),
                         ReceiveStatus.RECEIVED)
        assert_array_equal(storage_client.get_object(
            session_id, chunk_key8, [DataStorageDevice.SHARED_MEMORY], _promise=False),
            mock_data)

        # SCENARIO 7: failed transfer without promise
        with patch_method(StorageClient.create_writer, new=_create_writer_with_fail), \
                self.assertRaises(ValueError):
            fail_key = chunk_key9
            receiver_ref.create_data_writers(session_id, [chunk_key9], [data_size],
                                             mock_ref, use_promise=False)
def testReceiverManager(self):
    pool_addr = f'localhost:{get_next_port()}'
    session_id = str(uuid.uuid4())

    mock_data = np.array([1, 2, 3, 4])
    serialized_data = dataserializer.dumps(mock_data)
    data_size = len(serialized_data)

    chunk_key1 = str(uuid.uuid4())
    chunk_key2 = str(uuid.uuid4())
    chunk_key3 = str(uuid.uuid4())
    chunk_key4 = str(uuid.uuid4())
    chunk_key5 = str(uuid.uuid4())
    chunk_key6 = str(uuid.uuid4())
    chunk_key7 = str(uuid.uuid4())

    with start_transfer_test_pool(address=pool_addr,
                                  plasma_size=self.plasma_storage_size) as pool, \
            self.run_actor_test(pool) as test_actor:
        mock_receiver_ref = pool.create_actor(MockReceiverWorkerActor,
                                              uid=str(uuid.uuid4()))
        storage_client = test_actor.storage_client
        receiver_manager_ref = test_actor.promise_ref(ReceiverManagerActor.default_uid())

        # SCENARIO 1: test transferring existing keys
        self.waitp(
            storage_client.create_writer(session_id, chunk_key1, data_size,
                                         [DataStorageDevice.DISK])
            .then(lambda writer: promise.finished()
                  .then(lambda *_: writer.write(serialized_data))
                  .then(lambda *_: writer.close())))
        result = self.waitp(
            receiver_manager_ref.create_data_writers(session_id, [chunk_key1],
                                                     [data_size], test_actor,
                                                     _promise=True))
        self.assertEqual(result[0].uid, mock_receiver_ref.uid)
        self.assertEqual(result[1][0], ReceiveStatus.RECEIVED)

        # test adding callback for transferred key (should return immediately)
        result = self.waitp(
            receiver_manager_ref.add_keys_callback(session_id, [chunk_key1],
                                                   _promise=True))
        self.assertTupleEqual(result, ())

        receiver_manager_ref.register_pending_keys(session_id, [chunk_key1, chunk_key2])
        self.assertEqual(
            receiver_manager_ref.filter_receiving_keys(
                session_id, [chunk_key1, chunk_key2, 'non_exist']),
            [chunk_key2])

        # SCENARIO 2: test transferring new keys and wait on listeners
        result = self.waitp(
            receiver_manager_ref.create_data_writers(
                session_id, [chunk_key2, chunk_key3], [data_size] * 2, test_actor,
                _promise=True))
        self.assertEqual(result[0].uid, mock_receiver_ref.uid)
        self.assertIsNone(result[1][0])

        # transfer with transferring keys will report RECEIVING
        result = self.waitp(
            receiver_manager_ref.create_data_writers(session_id, [chunk_key2],
                                                     [data_size], test_actor,
                                                     _promise=True))
        self.assertEqual(result[1][0], ReceiveStatus.RECEIVING)

        # add listener and finish transfer
        receiver_manager_ref.add_keys_callback(session_id, [chunk_key1, chunk_key2],
                                               _promise=True) \
            .then(lambda *s: test_actor.set_result(s))
        mock_receiver_ref.receive_data_part(session_id, [chunk_key2], [True],
                                            serialized_data)
        mock_receiver_ref.receive_data_part(session_id, [chunk_key3], [True],
                                            serialized_data)
        self.get_result(5)

        # SCENARIO 3: test listening on multiple transfers
        receiver_manager_ref.create_data_writers(
            session_id, [chunk_key4, chunk_key5], [data_size] * 2, test_actor,
            _promise=True) \
            .then(lambda *s: test_actor.set_result(s))
        self.get_result(5)

        # add listener
        receiver_manager_ref.add_keys_callback(session_id, [chunk_key4, chunk_key5],
                                               _promise=True) \
            .then(lambda *s: test_actor.set_result(s))
        mock_receiver_ref.receive_data_part(session_id, [chunk_key4], [True],
                                            serialized_data)
        # when some chunks are not transferred, promise will not return
        with self.assertRaises(TimeoutError):
            self.get_result(0.5)
        mock_receiver_ref.receive_data_part(session_id, [chunk_key5], [True],
                                            serialized_data)
        self.get_result(5)

        # SCENARIO 4: test listening on transfer with errors
        self.waitp(
            receiver_manager_ref.create_data_writers(session_id, [chunk_key6],
                                                     [data_size], test_actor,
                                                     _promise=True))
        receiver_manager_ref.add_keys_callback(session_id, [chunk_key6],
                                               _promise=True) \
            .then(lambda *s: test_actor.set_result(s)) \
            .catch(lambda *exc: test_actor.set_result(exc, accept=False))
        mock_receiver_ref.cancel_receive(session_id, [chunk_key6])
        with self.assertRaises(ExecutionInterrupted):
            self.get_result(5)

        # SCENARIO 5: test creating writers without promise
        ref, statuses = receiver_manager_ref.create_data_writers(
            session_id, [chunk_key7], [data_size], test_actor, use_promise=False)
        self.assertIsNone(statuses[0])
        self.assertEqual(ref.uid, mock_receiver_ref.uid)

        # SCENARIO 6: test transferring lost keys
        storage_client.delete(session_id, [chunk_key1])
        result = self.waitp(
            receiver_manager_ref.create_data_writers(session_id, [chunk_key1],
                                                     [data_size], test_actor,
                                                     _promise=True))
        self.assertEqual(result[0].uid, mock_receiver_ref.uid)
        self.assertIsNone(result[1][0])

        # add listener and finish transfer
        receiver_manager_ref.add_keys_callback(session_id, [chunk_key1],
                                               _promise=True) \
            .then(lambda *s: test_actor.set_result(s))
        mock_receiver_ref.receive_data_part(session_id, [chunk_key1], [True],
                                            serialized_data)
        self.get_result(5)