Example 1
    def testDataSerialize(self):
        try:
            import numpy as np
            from numpy.testing import assert_array_equal
        except ImportError:
            np = None

        try:
            import scipy.sparse as sps
        except ImportError:
            sps = None

        if np:
            array = np.random.rand(1000, 100)
            assert_array_equal(array, dataserializer.loads(dataserializer.dumps(array)))
            assert_array_equal(array, dataserializer.loads(dataserializer.dumps(
                array, compress=dataserializer.COMPRESS_FLAG_LZ4)))

            array = np.random.rand(1000, 100)
            assert_array_equal(array, dataserializer.load(BytesIO(dataserializer.dumps(array))))
            assert_array_equal(array, dataserializer.load(BytesIO(dataserializer.dumps(
                array, compress=dataserializer.COMPRESS_FLAG_LZ4))))

            array = np.random.rand(1000, 100).T  # test non c-contiguous
            assert_array_equal(array, dataserializer.loads(dataserializer.dumps(array)))
            assert_array_equal(array, dataserializer.loads(dataserializer.dumps(
                array, compress=dataserializer.COMPRESS_FLAG_LZ4)))

            array = np.float64(0.2345)
            assert_array_equal(array, dataserializer.loads(dataserializer.dumps(array)))
            assert_array_equal(array, dataserializer.loads(dataserializer.dumps(
                array, compress=dataserializer.COMPRESS_FLAG_LZ4)))

            fn = os.path.join(tempfile.gettempdir(), 'test_dump_file_%d.bin' % id(self))
            try:
                array = np.random.rand(1000, 100).T  # test non c-contiguous
                with open(fn, 'wb') as dump_file:
                    dataserializer.dump(array, dump_file)
                with open(fn, 'rb') as dump_file:
                    assert_array_equal(array, dataserializer.load(dump_file))
                with open(fn, 'wb') as dump_file:
                    dataserializer.dump(array, dump_file,
                                        compress=dataserializer.COMPRESS_FLAG_LZ4)
                with open(fn, 'rb') as dump_file:
                    assert_array_equal(array, dataserializer.load(dump_file))
            finally:
                if os.path.exists(fn):
                    os.unlink(fn)

        if sps:
            mat = sparse.SparseMatrix(sps.random(100, 100, 0.1, format='csr'))
            des_mat = dataserializer.loads(dataserializer.dumps(mat))
            self.assertTrue((mat.spmatrix != des_mat.spmatrix).nnz == 0)

            des_mat = dataserializer.loads(dataserializer.dumps(
                mat, compress=dataserializer.COMPRESS_FLAG_LZ4))
            self.assertTrue((mat.spmatrix != des_mat.spmatrix).nnz == 0)
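
A minimal round-trip sketch distilled from the test above. The import path mars.serialize.dataserializer is an assumption (these examples omit their imports); the calls themselves mirror the ones shown.

import numpy as np
from numpy.testing import assert_array_equal
from mars.serialize import dataserializer  # assumed import path

def roundtrip(obj, **kwargs):
    # dumps() packs the object into bytes; loads() reverses it
    return dataserializer.loads(dataserializer.dumps(obj, **kwargs))

array = np.random.rand(10, 10)
assert_array_equal(array, roundtrip(array))
assert_array_equal(array, roundtrip(array, compress=dataserializer.COMPRESS_FLAG_LZ4))
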
Example 2
    def testFileBufferIO(self):
        if not np:
            return
        from numpy.testing import assert_array_equal

        compressions = [dataserializer.CompressType.NONE] + \
            list(dataserializer.get_supported_compressions())

        for c1 in compressions:
            for c2 in compressions:
                data = np.random.random((1000, 100))

                # test complete read
                compressed_read_file = BytesIO(dataserializer.dumps(data, compress=c1))
                reader = FileBufferIO(compressed_read_file, 'r', compress_out=c2)
                compressed = reader.read()
                self.assertEqual(c2, dataserializer.read_file_header(compressed).compress)
                assert_array_equal(data, dataserializer.loads(compressed))

                # test partial read
                compressed_read_file = BytesIO(dataserializer.dumps(data, compress=c1))
                reader = FileBufferIO(compressed_read_file, 'r', compress_out=c2)
                block = reader.read(128)
                data_left = reader.read()
                assert_array_equal(data, dataserializer.loads(block + data_left))

                # test read by chunks
                bio = BytesIO()
                compressed_read_file = BytesIO(dataserializer.dumps(data, compress=c1))
                reader = FileBufferIO(compressed_read_file, 'r', compress_out=c2)
                while True:
                    block = reader.read(128)
                    if not block:
                        break
                    bio.write(block)

                compressed = bio.getvalue()
                self.assertEqual(c2, dataserializer.read_file_header(compressed).compress)
                assert_array_equal(data, dataserializer.loads(compressed))

                # test write by chunks
                compressed_read_file.seek(0)
                compressed_write_file = BytesIO()
                writer = FileBufferIO(compressed_write_file, 'w', compress_in=c2,
                                      managed=False)
                while True:
                    block = compressed_read_file.read(128)
                    if not block:
                        break
                    writer.write(block)
                writer.close()

                compressed = compressed_write_file.getvalue()
                self.assertEqual(c2, dataserializer.read_file_header(compressed).compress)
                assert_array_equal(data, dataserializer.loads(compressed))
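
The read-by-chunks loop above is a generic drain pattern, shown here in a self-contained sketch with a plain BytesIO (no compression or FileBufferIO involved):

from io import BytesIO

def read_in_chunks(reader, chunk_size=128):
    # pull fixed-size blocks until read() returns b'', as the test does
    bio = BytesIO()
    while True:
        block = reader.read(chunk_size)
        if not block:
            break
        bio.write(block)
    return bio.getvalue()

payload = b'x' * 1000
assert read_in_chunks(BytesIO(payload)) == payload
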
Example 3
    def testDiskReadAndWritePacked(self, *_):
        test_addr = f'127.0.0.1:{get_next_port()}'
        with self.create_pool(n_process=1, address=test_addr) as pool, \
                self.run_actor_test(pool) as test_actor:
            pool.create_actor(WorkerClusterInfoActor, [test_addr],
                              uid=WorkerClusterInfoActor.default_uid())
            pool.create_actor(StatusActor,
                              test_addr,
                              uid=StatusActor.default_uid())
            pool.create_actor(EventsActor, uid=EventsActor.default_uid())

            pool.create_actor(WorkerDaemonActor,
                              uid=WorkerDaemonActor.default_uid())
            storage_manager_ref = pool.create_actor(
                StorageManagerActor, uid=StorageManagerActor.default_uid())

            session_id = str(uuid.uuid4())
            data1 = np.random.random((10, 10))
            ser_data1 = dataserializer.serialize(data1)

            storage_client = test_actor.storage_client
            handler = storage_client.get_storage_handler(
                (0, DataStorageDevice.DISK))

            # the loop target assigns each compression type directly onto the handler
            for handler._compress in self._get_compress_types():
                data_key1 = str(uuid.uuid4())

                storage_client.delete(session_id, [data_key1])
                self.rm_spill_dirs()

                block_data1 = dataserializer.dumps(data1,
                                                   compress=handler._compress)

                def _write_data(ser, writer):
                    with writer:
                        writer.write(ser)
                    return writer.filename

                handler.create_bytes_writer(session_id, data_key1, ser_data1.total_bytes,
                                            packed=True, _promise=True) \
                    .then(functools.partial(_write_data, block_data1)) \
                    .then(test_actor.set_result,
                          lambda *exc: test_actor.set_result(exc, accept=False))
                file_name = self.get_result(5)
                self.assertEqual(
                    sorted(
                        storage_manager_ref.get_data_locations(
                            session_id, [data_key1])[0]),
                    [(0, DataStorageDevice.DISK)])
                self.assertTrue(os.path.exists(file_name))

                def _read_data(reader):
                    with reader:
                        return dataserializer.loads(reader.read())

                handler.create_bytes_reader(session_id, data_key1, packed=True, _promise=True) \
                    .then(_read_data) \
                    .then(functools.partial(test_actor.set_result),
                          lambda *exc: test_actor.set_result(exc, accept=False))
                assert_allclose(self.get_result(5), data1)
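
The promise chain above uses functools.partial to pre-bind the serialized bytes, so each callback receives only the writer produced by the previous step. A standalone illustration, with BytesIO standing in for the bytes writer:

import functools
from io import BytesIO

def _write_data(ser, writer):
    # mirrors the helper above: enter the writer, write the payload
    with writer:
        writer.write(ser)
        return writer.getvalue()

callback = functools.partial(_write_data, b'payload')
assert callback(BytesIO()) == b'payload'
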
Example 4
    def testToPandas(self):
        rs = np.random.RandomState(0)
        df = pd.DataFrame({'a': rs.rand(100),
                           'b': ['s' + str(i) for i in rs.randint(100, size=100)]})

        batch_size = 15
        n_batch = len(df) // batch_size + 1
        batches = [pa.RecordBatch.from_pandas(df[i * batch_size: (i + 1) * batch_size])
                   for i in range(n_batch)]
        table = pa.Table.from_batches(batches)

        df2 = arrow_table_to_pandas_dataframe(table)
        self.assertEqual(df2.dtypes.iloc[1], ArrowStringDtype())
        self.assertLess(df2.memory_usage(deep=True).sum(),
                        df.memory_usage(deep=True).sum())

        # test serialize
        df3 = dataserializer.loads(dataserializer.dumps(df2))
        self.assertEqual(df3.dtypes.iloc[1], ArrowStringDtype())
        pd.testing.assert_frame_equal(df3, df2)

        # test df method
        df4 = df2.groupby('b').sum()
        expected = df.groupby('b').sum()
        pd.testing.assert_frame_equal(df4, expected)

        s = ('s' + df2['b']).astype('string')
        expected = ('s' + df['b']).astype('string')
        pd.testing.assert_series_equal(s, expected)

        s2 = df2['b'].str[:2]
        expected = df['b'].astype('string').str[:2]
        pd.testing.assert_series_equal(s2, expected)
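
A minimal, self-contained sketch of the batch-and-table construction used above, assuming only pandas and pyarrow:

import pandas as pd
import pyarrow as pa

df = pd.DataFrame({'a': list(range(7))})
batch_size = 3
batches = [pa.RecordBatch.from_pandas(df[i:i + batch_size])
           for i in range(0, len(df), batch_size)]
table = pa.Table.from_batches(batches)  # batches share one schema
assert table.num_rows == len(df)
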
Example 5
async def test_cuda_backend():
    params, teardown_params = await CudaStorage.setup()
    storage = CudaStorage(**params)
    assert storage.level == StorageLevel.GPU

    data1 = cupy.asarray(np.random.rand(10, 10))
    put_info1 = await storage.put(data1)
    get_data1 = await storage.get(put_info1.object_id)
    cupy.testing.assert_array_equal(data1, get_data1)

    info1 = await storage.object_info(put_info1.object_id)
    assert info1.size == put_info1.size

    await storage.delete(put_info1.object_id)

    data2 = cudf.DataFrame(
        pd.DataFrame(
            {
                'col1': np.arange(10),
                'col2': [f'str{i}' for i in range(10)],
                'col3': np.random.rand(10)
            }, ))
    put_info2 = await storage.put(data2)
    get_data2 = await storage.get(put_info2.object_id)
    cudf.testing.assert_frame_equal(data2, get_data2)

    info2 = await storage.object_info(put_info2.object_id)
    assert info2.size == put_info2.size

    # test writer and reader
    t = np.random.random(10)
    b = dataserializer.dumps(t)
    async with await storage.open_writer(size=len(b)) as writer:
        split = len(b) // 2
        await writer.write(b[:split])
        await writer.write(b[split:])

    async with await storage.open_reader(writer.object_id) as reader:
        content = await reader.read()
        b = content.to_host_array().tobytes()
        t2 = dataserializer.loads(b)
    np.testing.assert_array_equal(t, t2)

    # write cupy array
    t = cupy.random.random((10, ))
    headers, buffers = serialize(t)
    async with await storage.open_writer(size=len(b)) as writer:
        for buffer in buffers:
            await writer.write(buffer.data)

    async with await storage.open_reader(writer.object_id) as reader:
        b2 = await reader.read()
        t2 = deserialize(headers, [b2])

    cupy.testing.assert_array_equal(t, t2)

    await CudaStorage.teardown(**teardown_params)
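
serialize()/deserialize() split an object into metadata headers plus a list of raw buffers, which is what lets the cupy branch above write device buffers one at a time. A CPU-only sketch of the same round trip; the import path is an assumption based on the names used in the example:

import numpy as np
from mars.serialization import serialize, deserialize  # assumed import path

t = np.random.random(10)
headers, buffers = serialize(t)       # metadata + raw buffers
t2 = deserialize(headers, buffers)    # reassemble from the same parts
np.testing.assert_array_equal(t, t2)
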
Example 6
def mocked_requests_get(*arg, **_):
    url = arg[0]
    if '/worker' in url:
        return MockResponse(200, json_text=1)
    elif url.split('/')[-2] == 'graph':
        return MockResponse(200, json_text={"state": 'succeeded'})
    elif url.split('/')[-2] == 'data':
        data = dumps(np.ones((100, 100)) * 100)
        return MockResponse(200, data=data)
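
A hypothetical usage sketch showing how such a function is typically wired in with unittest.mock; MockResponse is assumed to be a simple stub defined elsewhere in the test module:

import requests
from unittest import mock

with mock.patch('requests.get', new=mocked_requests_get):
    # requests.get now routes to the mock; the URL shape selects the branch
    resp = requests.get('http://example.test/api/graph/some-id')
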
Example 7
async def test_base_operations(storage_context):
    storage = storage_context

    data1 = np.random.rand(10, 10)
    put_info1 = await storage.put(data1)
    get_data1 = await storage.get(put_info1.object_id)
    np.testing.assert_array_equal(data1, get_data1)

    info1 = await storage.object_info(put_info1.object_id)
    # FIXME: remove os check when size issue fixed
    assert info1.size == put_info1.size or not sys.platform.startswith('linux')

    data2 = pd.DataFrame(
        {
            'col1': np.arange(10),
            'col2': [f'str{i}' for i in range(10)],
            'col3': np.random.rand(10)
        }, )
    put_info2 = await storage.put(data2)
    get_data2 = await storage.get(put_info2.object_id)
    pd.testing.assert_frame_equal(data2, get_data2)

    info2 = await storage.object_info(put_info2.object_id)
    # FIXME: remove os check when size issue fixed
    assert info2.size == put_info2.size or not sys.platform.startswith('linux')

    # FIXME: remove when list functionality is ready for vineyard.
    if not isinstance(storage,
                      (VineyardStorage, SharedMemoryStorage, RayStorage)):
        num = len(await storage.list())
        assert num == 2
        await storage.delete(info2.object_id)

    # test SparseMatrix
    s1 = sps.csr_matrix([[1, 0, 1], [0, 0, 1]])
    s = SparseNDArray(s1)
    put_info3 = await storage.put(s)
    get_data3 = await storage.get(put_info3.object_id)
    assert isinstance(get_data3, SparseMatrix)
    np.testing.assert_array_equal(get_data3.toarray(), s1.A)
    np.testing.assert_array_equal(get_data3.todense(), s1.A)

    # test writer and reader
    t = np.random.random(10)
    b = dataserializer.dumps(t)
    async with await storage.open_writer(size=len(b)) as writer:
        split = len(b) // 2
        await writer.write(b[:split])
        await writer.write(b[split:])

    async with await storage.open_reader(writer.object_id) as reader:
        content = await reader.read()
        t2 = dataserializer.loads(content)

    np.testing.assert_array_equal(t, t2)
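
The writer/reader round trip above, extracted as a helper. Here storage is any backend implementing the open_writer/open_reader API these tests exercise; the dataserializer import path is an assumption:

import numpy as np
from mars.serialize import dataserializer  # assumed import path

async def roundtrip_through_storage(storage, array):
    b = dataserializer.dumps(array)
    async with await storage.open_writer(size=len(b)) as writer:
        split = len(b) // 2
        await writer.write(b[:split])   # writes may be split arbitrarily
        await writer.write(b[split:])
    async with await storage.open_reader(writer.object_id) as reader:
        content = await reader.read()
    return dataserializer.loads(content)
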
Example 8
async def test_storage_mock_api(storage_configs):
    start_method = 'fork' if sys.platform != 'win32' else None
    pool = await mo.create_actor_pool('127.0.0.1',
                                      1,
                                      subprocess_start_method=start_method)
    async with pool:
        session_id = 'mock_session_id'
        storage_api = await MockStorageAPI.create(
            address=pool.external_address,
            session_id=session_id,
            storage_configs=storage_configs)

        # test put and get
        value1 = np.random.rand(10, 10)
        await storage_api.put('data1', value1)
        get_value1 = await storage_api.get('data1')
        np.testing.assert_array_equal(value1, get_value1)

        value2 = pd.DataFrame({
            'col1': [str(i) for i in range(10)],
            'col2': np.random.randint(0, 100, (10, ))
        })
        await storage_api.put('data2', value2)
        await storage_api.prefetch('data2')
        get_value2 = await storage_api.get('data2')
        pd.testing.assert_frame_equal(value2, get_value2)

        sliced_value = await storage_api.get(
            'data2', conditions=[slice(3, 5), slice(None, None)])
        pd.testing.assert_frame_equal(value2.iloc[3:5, :], sliced_value)

        infos = await storage_api.get_infos('data2')
        assert infos[0].store_size > 0

        await storage_api.delete('data2')

        await storage_api.prefetch('data1')

        write_data = dataserializer.dumps(value2)
        # test open_reader and open_writer
        writer = await storage_api.open_writer('write_key', len(write_data),
                                               StorageLevel.MEMORY)
        async with writer:
            await writer.write(write_data)

        reader = await storage_api.open_reader('write_key')
        async with reader:
            read_bytes = await reader.read()
            read_value = dataserializer.loads(read_bytes)

        pd.testing.assert_frame_equal(value2, read_value)
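
The conditions argument above performs positional slicing on the stored object; for a DataFrame, conditions=[slice(3, 5), slice(None, None)] is equivalent to plain iloc indexing, as this pandas-only check shows:

import numpy as np
import pandas as pd

value2 = pd.DataFrame({'col1': [str(i) for i in range(10)],
                       'col2': np.random.randint(0, 100, (10,))})
conditions = [slice(3, 5), slice(None, None)]
assert value2.iloc[tuple(conditions)].equals(value2.iloc[3:5, :])
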
Example 9
    def testReceiver(self):
        pool_addr = 'localhost:%d' % get_next_port()
        options.worker.spill_directory = os.path.join(
            tempfile.gettempdir(), 'mars_spill_%d_%d' % (os.getpid(), id(run_transfer_worker)))
        session_id = str(uuid.uuid4())

        mock_data = np.array([1, 2, 3, 4])
        serialized_mock_data = dataserializer.dumps(mock_data)
        serialized_crc32 = zlib.crc32(serialized_mock_data)

        chunk_key1 = str(uuid.uuid4())
        chunk_key2 = str(uuid.uuid4())
        chunk_key3 = str(uuid.uuid4())
        chunk_key4 = str(uuid.uuid4())
        chunk_key5 = str(uuid.uuid4())
        chunk_key6 = str(uuid.uuid4())

        with start_transfer_test_pool(address=pool_addr, plasma_size=self.plasma_storage_size) as pool:
            chunk_holder_ref = pool.actor_ref(ChunkHolderActor.default_name())
            mapper_ref = pool.actor_ref(PlasmaKeyMapActor.default_name())
            receiver_ref = pool.create_actor(ReceiverActor, uid=str(uuid.uuid4()))

            store = PlasmaChunkStore(self._plasma_client, mapper_ref)

            # check_status on receiving and received
            self.assertEqual(receiver_ref.check_status(session_id, chunk_key1),
                             ReceiveStatus.NOT_STARTED)

            write_spill_file(chunk_key1, mock_data)
            self.assertEqual(receiver_ref.check_status(session_id, chunk_key1),
                             ReceiveStatus.RECEIVED)
            os.unlink(build_spill_file_name(chunk_key1))

            ref = store.put(session_id, chunk_key1, mock_data)
            data_size = store.get_actual_size(session_id, chunk_key1)
            chunk_holder_ref.register_chunk(session_id, chunk_key1)
            del ref
            self.assertEqual(receiver_ref.check_status(session_id, chunk_key1),
                             ReceiveStatus.RECEIVED)

            with self.run_actor_test(pool) as test_actor:
                receiver_ref_p = test_actor.promise_ref(receiver_ref)

                # cancelling a transfer that has not started (or is missing) does nothing
                receiver_ref_p.cancel_receive(session_id, chunk_key2)

                # start creating writer
                receiver_ref_p.create_data_writer(session_id, chunk_key1, data_size, test_actor, _promise=True) \
                    .then(lambda *s: test_actor.set_result(s, destroy=False))
                self.assertTupleEqual(self.get_result(5), (receiver_ref.address, ReceiveStatus.RECEIVED))

                receiver_ref_p.create_data_writer(session_id, chunk_key2, data_size, test_actor, _promise=True) \
                    .then(lambda *s: test_actor.set_result(s, destroy=False))
                self.assertTupleEqual(self.get_result(5), (receiver_ref.address, None))

                receiver_ref_p.create_data_writer(session_id, chunk_key2, data_size, test_actor, _promise=True) \
                    .then(lambda *s: test_actor.set_result(s, destroy=False))
                self.assertTupleEqual(self.get_result(5), (receiver_ref.address, ReceiveStatus.RECEIVING))

                receiver_ref_p.cancel_receive(session_id, chunk_key2)
                self.assertEqual(receiver_ref.check_status(session_id, chunk_key2),
                                 ReceiveStatus.NOT_STARTED)

                # test checksum error on receive_data_part
                receiver_ref_p.create_data_writer(session_id, chunk_key2, data_size, test_actor, _promise=True) \
                    .then(lambda *s: test_actor.set_result(s, destroy=False))

                receiver_ref_p.register_finish_callback(session_id, chunk_key2, _promise=True) \
                    .then(lambda *s: test_actor.set_result(s, destroy=False)) \
                    .catch(lambda *exc: test_actor.set_result(exc, accept=False, destroy=False))

                receiver_ref_p.receive_data_part(session_id, chunk_key2, serialized_mock_data, 0)

                with self.assertRaises(ChecksumMismatch):
                    self.get_result(5)

                # test checksum error on finish_receive
                receiver_ref_p.create_data_writer(session_id, chunk_key2, data_size, test_actor, _promise=True) \
                    .then(lambda *s: test_actor.set_result(s, destroy=False))
                self.assertTupleEqual(self.get_result(5), (receiver_ref.address, None))

                receiver_ref_p.receive_data_part(session_id, chunk_key2, serialized_mock_data, serialized_crc32)
                receiver_ref_p.finish_receive(session_id, chunk_key2, 0)

                receiver_ref_p.register_finish_callback(session_id, chunk_key2, _promise=True) \
                    .then(lambda *s: test_actor.set_result(s, destroy=False)) \
                    .catch(lambda *exc: test_actor.set_result(exc, accept=False, destroy=False))

                with self.assertRaises(ChecksumMismatch):
                    self.get_result(5)

                receiver_ref_p.cancel_receive(session_id, chunk_key2)

                # test intermediate cancellation
                receiver_ref_p.create_data_writer(session_id, chunk_key2, data_size, test_actor, _promise=True) \
                    .then(lambda *s: test_actor.set_result(s, destroy=False))
                self.assertTupleEqual(self.get_result(5), (receiver_ref.address, None))

                receiver_ref_p.register_finish_callback(session_id, chunk_key2, _promise=True) \
                    .then(lambda *s: test_actor.set_result(s, destroy=False)) \
                    .catch(lambda *exc: test_actor.set_result(exc, accept=False, destroy=False))

                receiver_ref_p.receive_data_part(session_id, chunk_key2, serialized_mock_data[:64],
                                                 zlib.crc32(serialized_mock_data[:64]))
                receiver_ref_p.cancel_receive(session_id, chunk_key2)
                receiver_ref_p.receive_data_part(session_id, chunk_key2, serialized_mock_data[64:],
                                                 serialized_crc32)
                with self.assertRaises(ExecutionInterrupted):
                    self.get_result(5)

                # test transfer in memory
                receiver_ref_p.register_finish_callback(session_id, chunk_key3, _promise=True) \
                    .then(lambda *s: test_actor.set_result(s, destroy=False)) \
                    .catch(lambda *exc: test_actor.set_result(exc, accept=False, destroy=False))

                receiver_ref_p.create_data_writer(session_id, chunk_key3, data_size, test_actor, _promise=True) \
                    .then(lambda *s: test_actor.set_result(s, destroy=False))
                self.assertTupleEqual(self.get_result(5), (receiver_ref.address, None))

                receiver_ref_p.receive_data_part(session_id, chunk_key3, serialized_mock_data[:64],
                                                 zlib.crc32(serialized_mock_data[:64]))
                receiver_ref_p.receive_data_part(session_id, chunk_key3, serialized_mock_data[64:], serialized_crc32)
                receiver_ref_p.finish_receive(session_id, chunk_key3, serialized_crc32)

                self.assertTupleEqual((), self.get_result(5))

                receiver_ref_p.create_data_writer(session_id, chunk_key3, data_size, test_actor, _promise=True) \
                    .then(lambda *s: test_actor.set_result(s, destroy=False))
                self.assertTupleEqual(self.get_result(5), (receiver_ref.address, ReceiveStatus.RECEIVED))

                # test transfer in spill file
                def mocked_store_create(*_):
                    raise StoreFull

                with patch_method(PlasmaChunkStore.create, new=mocked_store_create):
                    # test receive aborted
                    receiver_ref_p.create_data_writer(
                        session_id, chunk_key4, data_size, test_actor, ensure_cached=False, _promise=True) \
                        .then(lambda *s: test_actor.set_result(s, destroy=False))
                    self.assertTupleEqual(self.get_result(5), (receiver_ref.address, None))

                    receiver_ref_p.register_finish_callback(session_id, chunk_key4, _promise=True) \
                        .then(lambda *s: test_actor.set_result(s, destroy=False)) \
                        .catch(lambda *exc: test_actor.set_result(exc, accept=False, destroy=False))

                    receiver_ref_p.receive_data_part(session_id, chunk_key4, serialized_mock_data[:64],
                                                     zlib.crc32(serialized_mock_data[:64]))
                    receiver_ref_p.cancel_receive(session_id, chunk_key4)
                    with self.assertRaises(ExecutionInterrupted):
                        self.get_result(5)

                    # test receive into spill
                    receiver_ref_p.create_data_writer(
                        session_id, chunk_key4, data_size, test_actor, ensure_cached=False, _promise=True) \
                        .then(lambda *s: test_actor.set_result(s, destroy=False))
                    self.assertTupleEqual(self.get_result(5), (receiver_ref.address, None))

                    receiver_ref_p.register_finish_callback(session_id, chunk_key4, _promise=True) \
                        .then(lambda *s: test_actor.set_result(s, destroy=False)) \
                        .catch(lambda *exc: test_actor.set_result(exc, accept=False, destroy=False))

                    receiver_ref_p.receive_data_part(session_id, chunk_key4, serialized_mock_data, serialized_crc32)
                    receiver_ref_p.finish_receive(session_id, chunk_key4, serialized_crc32)

                    self.assertTupleEqual((), self.get_result(5))

                # test intermediate error
                def mocked_store_create(*_):
                    raise SpillNotConfigured

                with patch_method(PlasmaChunkStore.create, new=mocked_store_create):
                    receiver_ref_p.create_data_writer(
                        session_id, chunk_key5, data_size, test_actor, ensure_cached=False, _promise=True) \
                        .then(lambda *s: test_actor.set_result(s, destroy=False),
                              lambda *s: test_actor.set_result(s, accept=False, destroy=False))

                    with self.assertRaises(SpillNotConfigured):
                        self.get_result(5)

                # test receive timeout
                receiver_ref_p.register_finish_callback(session_id, chunk_key6, _promise=True) \
                    .then(lambda *s: test_actor.set_result(s, destroy=False)) \
                    .catch(lambda *exc: test_actor.set_result(exc, accept=False, destroy=False))

                receiver_ref_p.create_data_writer(session_id, chunk_key6, data_size, test_actor,
                                                  timeout=2, _promise=True) \
                    .then(lambda *s: test_actor.set_result(s, destroy=False))
                self.assertTupleEqual(self.get_result(5), (receiver_ref.address, None))
                receiver_ref_p.receive_data_part(session_id, chunk_key6, serialized_mock_data[:64],
                                                 zlib.crc32(serialized_mock_data[:64]))

                with self.assertRaises(TimeoutError):
                    self.get_result(5)
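
Each data part above travels with a zlib.crc32 checksum of the serialized bytes, and the deliberately wrong values (the literal 0 passed to receive_data_part and finish_receive) are what trigger ChecksumMismatch. The convention in isolation:

import zlib

payload = b'serialized-bytes'
crc = zlib.crc32(payload)
assert zlib.crc32(payload) == crc  # matching checksum: the part is accepted
# any other value, such as the literal 0 used in the test, fails the check
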
Example 10
    def testDataSerialize(self):
        array = np.random.rand(1000, 100)
        assert_array_equal(array,
                           dataserializer.loads(dataserializer.dumps(array)))
        assert_array_equal(
            array,
            dataserializer.loads(
                dataserializer.dumps(
                    array, compress=dataserializer.CompressType.LZ4)))
        assert_array_equal(
            array,
            dataserializer.loads(
                dataserializer.dumps(
                    array, compress=dataserializer.CompressType.GZIP)))

        array = np.random.rand(1000, 100)
        assert_array_equal(
            array, dataserializer.load(BytesIO(dataserializer.dumps(array))))
        assert_array_equal(
            array,
            dataserializer.load(
                BytesIO(
                    dataserializer.dumps(
                        array, compress=dataserializer.CompressType.LZ4))))
        assert_array_equal(
            array,
            dataserializer.load(
                BytesIO(
                    dataserializer.dumps(
                        array, compress=dataserializer.CompressType.GZIP))))

        array = np.random.rand(1000, 100).T  # test non c-contiguous
        assert_array_equal(array,
                           dataserializer.loads(dataserializer.dumps(array)))
        assert_array_equal(
            array,
            dataserializer.loads(
                dataserializer.dumps(
                    array, compress=dataserializer.CompressType.LZ4)))
        assert_array_equal(
            array,
            dataserializer.loads(
                dataserializer.dumps(
                    array, compress=dataserializer.CompressType.GZIP)))

        array = np.float64(0.2345)
        assert_array_equal(array,
                           dataserializer.loads(dataserializer.dumps(array)))
        assert_array_equal(
            array,
            dataserializer.loads(
                dataserializer.dumps(
                    array, compress=dataserializer.CompressType.LZ4)))
        assert_array_equal(
            array,
            dataserializer.loads(
                dataserializer.dumps(
                    array, compress=dataserializer.CompressType.GZIP)))

        # test structured arrays.
        rec_dtype = np.dtype([('a', 'int64'), ('b', 'double'), ('c', '<U8')])
        array = np.ones((100, ), dtype=rec_dtype)
        array_loaded = dataserializer.loads(dataserializer.dumps(array))
        self.assertEqual(array.dtype, array_loaded.dtype)
        assert_array_equal(array, array_loaded)

        fn = os.path.join(tempfile.gettempdir(),
                          'test_dump_file_%d.bin' % id(self))
        try:
            array = np.random.rand(1000, 100).T  # test non c-contiguous
            with open(fn, 'wb') as dump_file:
                dataserializer.dump(array, dump_file)
            with open(fn, 'rb') as dump_file:
                assert_array_equal(array, dataserializer.load(dump_file))

            with open(fn, 'wb') as dump_file:
                dataserializer.dump(array,
                                    dump_file,
                                    compress=dataserializer.CompressType.LZ4)
            with open(fn, 'rb') as dump_file:
                assert_array_equal(array, dataserializer.load(dump_file))

            with open(fn, 'wb') as dump_file:
                dataserializer.dump(array,
                                    dump_file,
                                    compress=dataserializer.CompressType.GZIP)
            with open(fn, 'rb') as dump_file:
                assert_array_equal(array, dataserializer.load(dump_file))
        finally:
            if os.path.exists(fn):
                os.unlink(fn)

        # test sparse
        if sps:
            mat = sparse.SparseMatrix(sps.random(100, 100, 0.1, format='csr'))
            des_mat = dataserializer.loads(dataserializer.dumps(mat))
            self.assertTrue((mat.spmatrix != des_mat.spmatrix).nnz == 0)

            des_mat = dataserializer.loads(
                dataserializer.dumps(mat,
                                     compress=dataserializer.CompressType.LZ4))
            self.assertTrue((mat.spmatrix != des_mat.spmatrix).nnz == 0)

            des_mat = dataserializer.loads(
                dataserializer.dumps(
                    mat, compress=dataserializer.CompressType.GZIP))
            self.assertTrue((mat.spmatrix != des_mat.spmatrix).nnz == 0)

            vector = sparse.SparseVector(sps.csr_matrix(np.random.rand(2)),
                                         shape=(2, ))
            des_vector = dataserializer.loads(dataserializer.dumps(vector))
            self.assertTrue((vector.spmatrix != des_vector.spmatrix).nnz == 0)

            des_vector = dataserializer.loads(
                dataserializer.dumps(vector,
                                     compress=dataserializer.CompressType.LZ4))
            self.assertTrue((vector.spmatrix != des_vector.spmatrix).nnz == 0)

            des_vector = dataserializer.loads(
                dataserializer.dumps(
                    vector, compress=dataserializer.CompressType.GZIP))
            self.assertTrue((vector.spmatrix != des_vector.spmatrix).nnz == 0)

        # test groupby
        df1 = pd.DataFrame({
            'a': [3, 4, 5, 3, 5, 4, 1, 2, 3],
            'b': [1, 3, 4, 5, 6, 5, 4, 4, 4],
            'c': list('aabaaddce')
        })
        grouped = wrapped_groupby(df1, 'b')
        restored = dataserializer.loads(dataserializer.dumps(grouped))
        assert_groupby_equal(grouped, restored.groupby_obj)

        grouped = wrapped_groupby(df1, 'b').c
        restored = dataserializer.loads(dataserializer.dumps(grouped))
        assert_groupby_equal(grouped, restored.groupby_obj)

        grouped = wrapped_groupby(df1, 'b')
        getattr(grouped, 'indices')  # touch the property so cached state is serialized too
        restored = dataserializer.loads(dataserializer.dumps(grouped))
        assert_groupby_equal(grouped, restored.groupby_obj)

        grouped = wrapped_groupby(df1.b, lambda x: x % 2)
        restored = dataserializer.loads(dataserializer.dumps(grouped))
        assert_groupby_equal(grouped, restored.groupby_obj)

        grouped = wrapped_groupby(df1.b, lambda x: x % 2)
        getattr(grouped, 'indices')
        restored = dataserializer.loads(dataserializer.dumps(grouped))
        assert_groupby_equal(grouped, restored.groupby_obj)

        # test categorical
        s = np.random.RandomState(0).random(10)
        cat = pd.cut(s, [0.3, 0.5, 0.8])
        self.assertIsInstance(cat, pd.Categorical)
        des_cat = dataserializer.loads(dataserializer.dumps(cat))
        self.assertEqual(len(cat), len(des_cat))
        for c, dc in zip(cat, des_cat):
            np.testing.assert_equal(c, dc)

        # test IntervalIndex
        s = pd.interval_range(10, 100, 3)
        dest_s = dataserializer.loads(dataserializer.dumps(s))
        pd.testing.assert_index_equal(s, dest_s)
Example 11
    def testDiskReadAndWriteMerger(self):
        import logging
        logging.basicConfig(level=logging.DEBUG)

        test_addr = f'127.0.0.1:{get_next_port()}'
        options.worker.filemerger.max_file_size = 2400
        options.worker.filemerger.concurrency = 16

        with self.create_pool(n_process=1, address=test_addr) as pool, \
                self.run_actor_test(pool) as test_actor:
            pool.create_actor(WorkerClusterInfoActor, [test_addr],
                              uid=WorkerClusterInfoActor.default_uid())
            pool.create_actor(StatusActor,
                              test_addr,
                              uid=StatusActor.default_uid())
            pool.create_actor(EventsActor, uid=EventsActor.default_uid())

            disk_file_merger_ref = pool.create_actor(
                DiskFileMergerActor, uid=DiskFileMergerActor.default_uid())

            pool.create_actor(WorkerDaemonActor,
                              uid=WorkerDaemonActor.default_uid())
            storage_manager_ref = pool.create_actor(
                StorageManagerActor, uid=StorageManagerActor.default_uid())

            session_id = str(uuid.uuid4())
            data_count = 30
            data = [
                np.random.rand(random.randint(10, 30), random.randint(10, 30))
                for _ in range(data_count)
            ]
            ser_data = [dataserializer.serialize(d) for d in data]

            storage_client = test_actor.storage_client
            handler = storage_client.get_storage_handler(
                (0, DataStorageDevice.DISK))

            # the loop target assigns each compression type directly onto the handler
            for handler._compress in self._get_compress_types():
                data_keys = [str(uuid.uuid4()) for _ in range(data_count)]

                promises = []
                for idx in range(data_count):
                    block_data = dataserializer.dumps(
                        data[idx], compress=handler._compress)

                    def _write_data(ser, writer):
                        with writer:
                            writer.write(ser)
                        return writer.filename

                    promises.append(
                        handler.create_bytes_writer(session_id,
                                                    data_keys[idx],
                                                    ser_data[idx].total_bytes,
                                                    packed=True,
                                                    with_merger_lock=True,
                                                    _promise=True).then(
                                                        functools.partial(
                                                            _write_data,
                                                            block_data)))
                promise.all_(promises).then(
                    lambda *_: test_actor.set_result(0),
                    lambda *exc: test_actor.set_result(exc, accept=False))
                self.get_result(50)

                for key in data_keys:
                    self.assertEqual(
                        sorted(
                            storage_manager_ref.get_data_locations(
                                session_id, [key])[0]),
                        [(0, DataStorageDevice.DISK)])

                dump_result = disk_file_merger_ref.dump_info()
                written_files = list(dump_result[2])
                for fn in written_files:
                    self.assertTrue(os.path.exists(fn))

                data_store = [None] * len(data)
                promises = []
                for idx in range(data_count):

                    def _read_data(reader, idx):
                        with reader:
                            data_store[idx] = dataserializer.loads(
                                reader.read())

                    promises.append(
                        handler.create_bytes_reader(session_id,
                                                    data_keys[idx],
                                                    with_merger_lock=True,
                                                    packed=True,
                                                    _promise=True).then(
                                                        functools.partial(
                                                            _read_data,
                                                            idx=idx)))
                promise.all_(promises).then(
                    lambda *_: test_actor.set_result(0),
                    lambda *exc: test_actor.set_result(exc, accept=False))
                self.get_result(50)
                for true_data, read_data in zip(data, data_store):
                    assert_allclose(true_data, read_data)

                data_store = [None] * len(data)
                promises = []
                for idx in range(data_count):

                    def _read_data(reader, idx):
                        with reader:
                            data_store[idx] = dataserializer.deserialize(
                                reader.read())

                    promises.append(
                        handler.create_bytes_reader(session_id,
                                                    data_keys[idx],
                                                    _promise=True).then(
                                                        functools.partial(
                                                            _read_data,
                                                            idx=idx)))
                promise.all_(promises).then(
                    lambda *_: test_actor.set_result(0),
                    lambda *exc: test_actor.set_result(exc, accept=False))
                self.get_result(50)
                for true_data, read_data in zip(data, data_store):
                    assert_allclose(true_data, read_data)

                storage_client.delete(session_id, data_keys)
                pool.sleep(0.1)
                for fn in written_files:
                    self.assertFalse(os.path.exists(fn))
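
Note the functools.partial(_read_data, idx=idx) calls above: binding idx when each callback is created sidesteps Python's late-binding closure pitfall, where every callback would otherwise see the final loop value:

import functools

funcs_late = [lambda: i for i in range(3)]
assert [f() for f in funcs_late] == [2, 2, 2]    # all closures see final i

funcs_bound = [functools.partial(lambda i: i, i) for i in range(3)]
assert [f() for f in funcs_bound] == [0, 1, 2]   # value captured per step
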
Example 12
    def testDataSerialize(self):
        try:
            import numpy as np
            from numpy.testing import assert_array_equal
        except ImportError:
            np = None

        try:
            import scipy.sparse as sps
        except ImportError:
            sps = None

        if np:
            array = np.random.rand(1000, 100)
            assert_array_equal(
                array, dataserializer.loads(dataserializer.dumps(array)))
            assert_array_equal(
                array,
                dataserializer.loads(
                    dataserializer.dumps(
                        array, compress=dataserializer.CompressType.LZ4)))
            if not six.PY2:
                assert_array_equal(
                    array,
                    dataserializer.loads(
                        dataserializer.dumps(
                            array, compress=dataserializer.CompressType.GZIP)))

            array = np.random.rand(1000, 100)
            assert_array_equal(
                array,
                dataserializer.load(BytesIO(dataserializer.dumps(array))))
            assert_array_equal(
                array,
                dataserializer.load(
                    BytesIO(
                        dataserializer.dumps(
                            array, compress=dataserializer.CompressType.LZ4))))
            if not six.PY2:
                assert_array_equal(
                    array,
                    dataserializer.load(
                        BytesIO(
                            dataserializer.dumps(
                                array,
                                compress=dataserializer.CompressType.GZIP))))

            array = np.random.rand(1000, 100).T  # test non c-contiguous
            assert_array_equal(
                array, dataserializer.loads(dataserializer.dumps(array)))
            assert_array_equal(
                array,
                dataserializer.loads(
                    dataserializer.dumps(
                        array, compress=dataserializer.CompressType.LZ4)))
            if not six.PY2:
                assert_array_equal(
                    array,
                    dataserializer.loads(
                        dataserializer.dumps(
                            array, compress=dataserializer.CompressType.GZIP)))

            array = np.float64(0.2345)
            assert_array_equal(
                array, dataserializer.loads(dataserializer.dumps(array)))
            assert_array_equal(
                array,
                dataserializer.loads(
                    dataserializer.dumps(
                        array, compress=dataserializer.CompressType.LZ4)))
            if not six.PY2:
                assert_array_equal(
                    array,
                    dataserializer.loads(
                        dataserializer.dumps(
                            array, compress=dataserializer.CompressType.GZIP)))

            # test structured arrays.
            rec_dtype = np.dtype([('a', 'int64'), ('b', 'double'),
                                  ('c', '<U8')])
            array = np.ones((100, ), dtype=rec_dtype)
            array_loaded = dataserializer.loads(dataserializer.dumps(array))
            self.assertEqual(array.dtype, array_loaded.dtype)
            assert_array_equal(array, array_loaded)

            fn = os.path.join(tempfile.gettempdir(),
                              'test_dump_file_%d.bin' % id(self))
            try:
                array = np.random.rand(1000, 100).T  # test non c-contiguous
                with open(fn, 'wb') as dump_file:
                    dataserializer.dump(array, dump_file)
                with open(fn, 'rb') as dump_file:
                    assert_array_equal(array, dataserializer.load(dump_file))

                with open(fn, 'wb') as dump_file:
                    dataserializer.dump(
                        array,
                        dump_file,
                        compress=dataserializer.CompressType.LZ4)
                with open(fn, 'rb') as dump_file:
                    assert_array_equal(array, dataserializer.load(dump_file))

                if not six.PY2:
                    with open(fn, 'wb') as dump_file:
                        dataserializer.dump(
                            array,
                            dump_file,
                            compress=dataserializer.CompressType.GZIP)
                    with open(fn, 'rb') as dump_file:
                        assert_array_equal(array,
                                           dataserializer.load(dump_file))
            finally:
                if os.path.exists(fn):
                    os.unlink(fn)

        if sps:
            mat = sparse.SparseMatrix(sps.random(100, 100, 0.1, format='csr'))
            des_mat = dataserializer.loads(dataserializer.dumps(mat))
            self.assertTrue((mat.spmatrix != des_mat.spmatrix).nnz == 0)

            des_mat = dataserializer.loads(
                dataserializer.dumps(mat,
                                     compress=dataserializer.CompressType.LZ4))
            self.assertTrue((mat.spmatrix != des_mat.spmatrix).nnz == 0)

            if not six.PY2:
                des_mat = dataserializer.loads(
                    dataserializer.dumps(
                        mat, compress=dataserializer.CompressType.GZIP))
                self.assertTrue((mat.spmatrix != des_mat.spmatrix).nnz == 0)

            vector = sparse.SparseVector(sps.csr_matrix(np.random.rand(2)),
                                         shape=(2, ))
            des_vector = dataserializer.loads(dataserializer.dumps(vector))
            self.assertTrue((vector.spmatrix != des_vector.spmatrix).nnz == 0)

            des_vector = dataserializer.loads(
                dataserializer.dumps(vector,
                                     compress=dataserializer.CompressType.LZ4))
            self.assertTrue((vector.spmatrix != des_vector.spmatrix).nnz == 0)

            if not six.PY2:
                des_vector = dataserializer.loads(
                    dataserializer.dumps(
                        vector, compress=dataserializer.CompressType.GZIP))
                self.assertTrue(
                    (vector.spmatrix != des_vector.spmatrix).nnz == 0)
Example 13
    def testDataSerialize(self):
        for type_, compress in itertools.product(
                (None,) + tuple(dataserializer.SerialType.__members__.values()),
                (None,) + tuple(dataserializer.CompressType.__members__.values())):
            array = np.random.rand(1000, 100)
            assert_array_equal(array, dataserializer.loads(
                dataserializer.dumps(array, serial_type=type_, compress=compress)))

            array = np.random.rand(1000, 100)
            assert_array_equal(array, dataserializer.load(
                BytesIO(dataserializer.dumps(array, serial_type=type_, compress=compress))))

            array = np.random.rand(1000, 100).T  # test non c-contiguous
            assert_array_equal(array, dataserializer.loads(
                dataserializer.dumps(array, serial_type=type_, compress=compress)))

            array = np.float64(0.2345)
            assert_array_equal(array, dataserializer.loads(
                dataserializer.dumps(array, serial_type=type_, compress=compress)))

        # test non-serializable object
        if pyarrow:
            non_serial = type('non_serial', (object,), dict(nbytes=10))
            with self.assertRaises(SerializationFailed):
                dataserializer.dumps(non_serial())

        # test structured arrays.
        rec_dtype = np.dtype([('a', 'int64'), ('b', 'double'), ('c', '<U8')])
        array = np.ones((100,), dtype=rec_dtype)
        array_loaded = dataserializer.loads(dataserializer.dumps(array))
        self.assertEqual(array.dtype, array_loaded.dtype)
        assert_array_equal(array, array_loaded)

        fn = os.path.join(tempfile.gettempdir(), f'test_dump_file_{id(self)}.bin')
        try:
            array = np.random.rand(1000, 100).T  # test non c-contiguous
            with open(fn, 'wb') as dump_file:
                dataserializer.dump(array, dump_file)
            with open(fn, 'rb') as dump_file:
                assert_array_equal(array, dataserializer.load(dump_file))

            with open(fn, 'wb') as dump_file:
                dataserializer.dump(array, dump_file,
                                    compress=dataserializer.CompressType.LZ4)
            with open(fn, 'rb') as dump_file:
                assert_array_equal(array, dataserializer.load(dump_file))

            with open(fn, 'wb') as dump_file:
                dataserializer.dump(array, dump_file,
                                    compress=dataserializer.CompressType.GZIP)
            with open(fn, 'rb') as dump_file:
                assert_array_equal(array, dataserializer.load(dump_file))
        finally:
            if os.path.exists(fn):
                os.unlink(fn)

        # test sparse
        if sps:
            mat = sparse.SparseMatrix(sps.random(100, 100, 0.1, format='csr'))
            des_mat = dataserializer.loads(dataserializer.dumps(mat))
            self.assertTrue((mat.spmatrix != des_mat.spmatrix).nnz == 0)

            des_mat = dataserializer.loads(dataserializer.dumps(
                mat, compress=dataserializer.CompressType.LZ4))
            self.assertTrue((mat.spmatrix != des_mat.spmatrix).nnz == 0)

            des_mat = dataserializer.loads(dataserializer.dumps(
                mat, compress=dataserializer.CompressType.GZIP))
            self.assertTrue((mat.spmatrix != des_mat.spmatrix).nnz == 0)

            vector = sparse.SparseVector(sps.csr_matrix(np.random.rand(2)), shape=(2,))
            des_vector = dataserializer.loads(dataserializer.dumps(vector))
            self.assertTrue((vector.spmatrix != des_vector.spmatrix).nnz == 0)

            des_vector = dataserializer.loads(dataserializer.dumps(
                vector, compress=dataserializer.CompressType.LZ4))
            self.assertTrue((vector.spmatrix != des_vector.spmatrix).nnz == 0)

            des_vector = dataserializer.loads(dataserializer.dumps(
                vector, compress=dataserializer.CompressType.GZIP))
            self.assertTrue((vector.spmatrix != des_vector.spmatrix).nnz == 0)

        # test groupby
        df1 = pd.DataFrame({'a': [3, 4, 5, 3, 5, 4, 1, 2, 3],
                            'b': [1, 3, 4, 5, 6, 5, 4, 4, 4],
                            'c': list('aabaaddce')})
        grouped = wrapped_groupby(df1, 'b')
        restored = dataserializer.loads(dataserializer.dumps(grouped))
        assert_groupby_equal(grouped, restored.groupby_obj)

        grouped = wrapped_groupby(df1, 'b').c
        restored = dataserializer.loads(dataserializer.dumps(grouped))
        assert_groupby_equal(grouped, restored.groupby_obj)

        grouped = wrapped_groupby(df1, 'b')
        getattr(grouped, 'indices')  # touch the property so cached state is serialized too
        restored = dataserializer.loads(dataserializer.dumps(grouped))
        assert_groupby_equal(grouped, restored.groupby_obj)

        grouped = wrapped_groupby(df1.b, lambda x: x % 2)
        restored = dataserializer.loads(dataserializer.dumps(grouped))
        assert_groupby_equal(grouped, restored.groupby_obj)

        grouped = wrapped_groupby(df1.b, lambda x: x % 2)
        getattr(grouped, 'indices')
        restored = dataserializer.loads(dataserializer.dumps(grouped))
        assert_groupby_equal(grouped, restored.groupby_obj)

        # test categorical
        s = np.random.RandomState(0).random(10)
        cat = pd.cut(s, [0.3, 0.5, 0.8])
        self.assertIsInstance(cat, pd.Categorical)
        des_cat = dataserializer.loads(dataserializer.dumps(cat))
        self.assertEqual(len(cat), len(des_cat))
        for c, dc in zip(cat, des_cat):
            np.testing.assert_equal(c, dc)

        # test IntervalIndex
        s = pd.interval_range(10, 100, 3)
        dest_s = dataserializer.loads(dataserializer.dumps(s))
        pd.testing.assert_index_equal(s, dest_s)

        # test complex
        s = complex(10 + 5j)
        dest_s = dataserializer.loads(dataserializer.dumps(s))
        self.assertIs(type(s), type(dest_s))
        self.assertEqual(s, dest_s)

        s = np.complex64(10 + 5j)
        dest_s = dataserializer.loads(dataserializer.dumps(s))
        self.assertIs(type(s), type(dest_s))
        self.assertEqual(s, dest_s)

        # test pickle
        d = ClassToPickle(dict(a=1, b='uvw'))
        dest_d = dataserializer.loads(dataserializer.dumps(d))
        self.assertIs(type(d), type(dest_d))
        self.assertEqual(d.a, dest_d.a)

        # test ndarray with negative strides
        arr = np.zeros((5, 6, 3))
        arr2 = arr[:, :, ::-1]
        dest_arr2 = dataserializer.loads(dataserializer.dumps(arr2))
        np.testing.assert_array_equal(arr2, dest_arr2)

        # test ArrowArray
        df = pd.DataFrame({'a': ['s1', 's2', 's3'],
                           'b': [['s1', 's2'], ['s3'], ['s4', 's5']]})
        df['a'] = df['a'].astype(ArrowStringDtype())
        df['b'] = df['b'].astype(ArrowListDtype(str))
        dest_df = dataserializer.loads(dataserializer.dumps(df))
        self.assertIs(type(df), type(dest_df))
        pd.testing.assert_frame_equal(df, dest_df)

        # test DataFrame with SparseDtype
        s = pd.Series([1, 2, np.nan, np.nan, 3]).astype(
            pd.SparseDtype(np.dtype(np.float64), np.nan))
        dest_s = dataserializer.loads(dataserializer.dumps(s))
        pd.testing.assert_series_equal(s, dest_s)
        df = pd.DataFrame({'s': s})
        dest_df = dataserializer.loads(dataserializer.dumps(df))
        pd.testing.assert_frame_equal(df, dest_df)
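
Non C-contiguous inputs (transposes, negative strides) recur throughout these tests because serializers commonly assume contiguous buffers. A quick look at the flags involved and the contiguous copy a serializer may make internally:

import numpy as np

arr = np.zeros((5, 6, 3))
view = arr[:, :, ::-1]                   # negative stride on the last axis
assert not view.flags['C_CONTIGUOUS']
contiguous = np.ascontiguousarray(view)  # contiguous copy, same contents
assert contiguous.flags['C_CONTIGUOUS']
np.testing.assert_array_equal(view, contiguous)
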
Example 14
    def testSharedReadAndWritePacked(self, *_):
        test_addr = '127.0.0.1:%d' % get_next_port()
        io_size = dataserializer.HEADER_LENGTH * 2
        with self.create_pool(n_process=1, address=test_addr) as pool, \
                self.run_actor_test(pool) as test_actor:
            pool.create_actor(WorkerDaemonActor, uid=WorkerDaemonActor.default_uid())
            storage_manager_ref = pool.create_actor(
                StorageManagerActor, uid=StorageManagerActor.default_uid())

            pool.create_actor(PlasmaKeyMapActor, uid=PlasmaKeyMapActor.default_uid())
            pool.create_actor(SharedHolderActor, uid=SharedHolderActor.default_uid())

            data1 = np.random.random((100, 100))
            ser_data1 = dataserializer.serialize(data1)
            block_data1 = dataserializer.dumps(data1, dataserializer.CompressType.NONE)

            session_id = str(uuid.uuid4())
            data_key1 = str(uuid.uuid4())

            storage_client = test_actor.storage_client
            handler = storage_client.get_storage_handler((0, DataStorageDevice.SHARED_MEMORY))

            def _write_data(ser, writer):
                with writer:
                    writer.write(ser)

            handler.create_bytes_writer(session_id, data_key1, ser_data1.total_bytes,
                                        packed=True, _promise=True) \
                .then(functools.partial(_write_data, block_data1)) \
                .then(lambda *_: test_actor.set_result(None),
                      lambda *exc: test_actor.set_result(exc, accept=False))
            self.get_result(5)
            self.assertEqual(sorted(storage_manager_ref.get_data_locations(session_id, [data_key1])[0]),
                             [(0, DataStorageDevice.SHARED_MEMORY)])
            handler.delete(session_id, [data_key1])

            def _write_data(ser, writer):
                with writer:
                    with self.assertRaises(IOError):
                        writer.write(ser[:1])

                    for start in range(0, len(ser), io_size):
                        writer.write(ser[start:start + io_size])

            handler.create_bytes_writer(session_id, data_key1, ser_data1.total_bytes,
                                        packed=True, _promise=True) \
                .then(functools.partial(_write_data, block_data1)) \
                .then(lambda *_: test_actor.set_result(None),
                      lambda *exc: test_actor.set_result(exc, accept=False))
            self.get_result(5)
            self.assertEqual(sorted(storage_manager_ref.get_data_locations(session_id, [data_key1])[0]),
                             [(0, DataStorageDevice.SHARED_MEMORY)])

            def _read_data_all(reader):
                with reader:
                    return dataserializer.loads(reader.read())

            handler.create_bytes_reader(session_id, data_key1, packed=True, _promise=True) \
                .then(_read_data_all) \
                .then(functools.partial(test_actor.set_result),
                      lambda *exc: test_actor.set_result(exc, accept=False))
            assert_allclose(self.get_result(5), data1)

            def _read_data_batch(reader):
                bio = BytesIO()
                with reader:
                    while True:
                        buf = reader.read(io_size)
                        if buf:
                            bio.write(buf)
                        else:
                            break
                return dataserializer.loads(bio.getvalue())

            handler.create_bytes_reader(session_id, data_key1, packed=True, _promise=True) \
                .then(_read_data_batch) \
                .then(functools.partial(test_actor.set_result),
                      lambda *exc: test_actor.set_result(exc, accept=False))
            assert_allclose(self.get_result(5), data1)
            handler.delete(session_id, [data_key1])
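
A detail worth noting from the scenario above: with packed=True, the writer rejects a first write smaller than the serialization header (the write(ser[:1]) call is expected to raise IOError), after which the payload may be streamed in fixed-size pieces. A minimal sketch of that chunked-write loop, assuming only that `writer` exposes a write() method as the handler's bytes writer does:

    # Chunked-write helper mirroring the loop in the test above; chunk_size
    # follows the test's io_size = dataserializer.HEADER_LENGTH * 2, so the
    # whole header fits into the first chunk.
    def write_in_chunks(writer, payload: bytes, chunk_size: int) -> None:
        for start in range(0, len(payload), chunk_size):
            writer.write(payload[start:start + chunk_size])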
Example n. 15
    def testReceiverWorker(self):
        pool_addr = f'localhost:{get_next_port()}'
        options.worker.spill_directory = tempfile.mkdtemp(
            prefix='mars_test_receiver_')
        session_id = str(uuid.uuid4())

        mock_data = np.array([1, 2, 3, 4])
        serialized_arrow_data = dataserializer.serialize(mock_data)
        data_size = serialized_arrow_data.total_bytes
        dumped_mock_data = dataserializer.dumps(mock_data)

        chunk_key1 = str(uuid.uuid4())
        chunk_key2 = str(uuid.uuid4())
        chunk_key3 = str(uuid.uuid4())
        chunk_key4 = str(uuid.uuid4())
        chunk_key5 = str(uuid.uuid4())
        chunk_key6 = str(uuid.uuid4())
        chunk_key7 = str(uuid.uuid4())
        chunk_key8 = str(uuid.uuid4())
        chunk_key9 = str(uuid.uuid4())

        with start_transfer_test_pool(address=pool_addr, plasma_size=self.plasma_storage_size) as pool, \
                self.run_actor_test(pool) as test_actor:
            storage_client = test_actor.storage_client
            receiver_ref = test_actor.promise_ref(
                pool.create_actor(ReceiverWorkerActor, uid=str(uuid.uuid4())))
            receiver_manager_ref = test_actor.promise_ref(
                ReceiverManagerActor.default_uid())

            # SCENARIO 1: create two writers; send one key whole
            # and the other in two parts
            self.waitp(
                receiver_ref.create_data_writers(session_id,
                                                 [chunk_key1, chunk_key2],
                                                 [data_size] * 2,
                                                 test_actor,
                                                 _promise=True))
            receiver_ref.receive_data_part(
                session_id, [chunk_key1, chunk_key2], [True, False],
                dumped_mock_data,
                dumped_mock_data[:len(dumped_mock_data) // 2])
            self.assertEqual(receiver_ref.check_status(session_id, chunk_key1),
                             ReceiveStatus.RECEIVED)
            self.assertEqual(receiver_ref.check_status(session_id, chunk_key2),
                             ReceiveStatus.RECEIVING)
            receiver_ref.receive_data_part(
                session_id, [chunk_key2], [True],
                dumped_mock_data[len(dumped_mock_data) // 2:])
            self.assertEqual(receiver_ref.check_status(session_id, chunk_key2),
                             ReceiveStatus.RECEIVED)
            assert_array_equal(
                storage_client.get_object(session_id,
                                          chunk_key1,
                                          [DataStorageDevice.SHARED_MEMORY],
                                          _promise=False), mock_data)
            assert_array_equal(
                storage_client.get_object(session_id,
                                          chunk_key2,
                                          [DataStorageDevice.SHARED_MEMORY],
                                          _promise=False), mock_data)

            # SCENARIO 2: one of the writers fails to be created;
            # exercise both existing and non-existing keys
            old_create_writer = StorageClient.create_writer

            def _create_writer_with_fail(self, session_id, chunk_key, *args,
                                         **kwargs):
                if chunk_key == fail_key:
                    if kwargs.get('_promise', True):
                        return promise.finished(*build_exc_info(ValueError),
                                                _accept=False)
                    else:
                        raise ValueError
                return old_create_writer(self, session_id, chunk_key, *args,
                                         **kwargs)

            with patch_method(StorageClient.create_writer, new=_create_writer_with_fail), \
                    self.assertRaises(ValueError):
                fail_key = chunk_key4
                self.waitp(
                    receiver_ref.create_data_writers(
                        session_id, [chunk_key3, chunk_key4, chunk_key5],
                        [data_size] * 3,
                        test_actor,
                        ensure_cached=False,
                        _promise=True))
            self.assertEqual(receiver_ref.check_status(session_id, chunk_key3),
                             ReceiveStatus.NOT_STARTED)
            self.assertEqual(receiver_ref.check_status(session_id, chunk_key4),
                             ReceiveStatus.NOT_STARTED)
            self.assertEqual(receiver_ref.check_status(session_id, chunk_key5),
                             ReceiveStatus.NOT_STARTED)

            with patch_method(StorageClient.create_writer,
                              new=_create_writer_with_fail):
                fail_key = chunk_key2
                self.waitp(
                    receiver_ref.create_data_writers(session_id,
                                                     [chunk_key2, chunk_key3],
                                                     [data_size] * 2,
                                                     test_actor,
                                                     ensure_cached=False,
                                                     _promise=True))

            # SCENARIO 3: transfer timeout
            receiver_manager_ref.register_pending_keys(session_id,
                                                       [chunk_key6])
            self.waitp(
                receiver_ref.create_data_writers(session_id, [chunk_key6],
                                                 [data_size],
                                                 test_actor,
                                                 timeout=1,
                                                 _promise=True))
            with self.assertRaises(TimeoutError):
                self.waitp(
                    receiver_manager_ref.add_keys_callback(session_id,
                                                           [chunk_key6],
                                                           _promise=True))

            # SCENARIO 4: cancelled transfer (both before and during transfer)
            receiver_manager_ref.register_pending_keys(session_id,
                                                       [chunk_key7])
            self.waitp(
                receiver_ref.create_data_writers(session_id, [chunk_key7],
                                                 [data_size],
                                                 test_actor,
                                                 timeout=1,
                                                 _promise=True))
            receiver_ref.cancel_receive(session_id, [chunk_key2, chunk_key7])
            with self.assertRaises(KeyError):
                receiver_ref.receive_data_part(
                    session_id, [chunk_key7], [False],
                    dumped_mock_data[:len(dumped_mock_data) // 2])
            with self.assertRaises(KeyError):
                self.waitp(
                    receiver_manager_ref.add_keys_callback(session_id,
                                                           [chunk_key7],
                                                           _promise=True))

            # SCENARIO 5: sender halts and the receiver is notified
            # (reusing the previously unsuccessful key)
            receiver_manager_ref.register_pending_keys(session_id,
                                                       [chunk_key7])
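            # actor ref bound to a fake sender address, so the sender
            # can later be declared dead via notify_dead_senders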
            mock_ref = pool.actor_ref(test_actor.uid, address='MOCK_ADDR')
            self.waitp(
                receiver_ref.create_data_writers(session_id, [chunk_key7],
                                                 [data_size],
                                                 mock_ref,
                                                 timeout=1,
                                                 _promise=True))
            receiver_ref.notify_dead_senders(['MOCK_ADDR'])
            with self.assertRaises(WorkerDead):
                self.waitp(
                    receiver_manager_ref.add_keys_callback(session_id,
                                                           [chunk_key7],
                                                           _promise=True))

            # SCENARIO 6: successful transfer without promise
            receiver_ref.create_data_writers(session_id, [chunk_key8],
                                             [data_size],
                                             mock_ref,
                                             use_promise=False)
            receiver_ref.receive_data_part(session_id, [chunk_key8], [True],
                                           dumped_mock_data)
            self.assertEqual(receiver_ref.check_status(session_id, chunk_key8),
                             ReceiveStatus.RECEIVED)
            assert_array_equal(
                storage_client.get_object(session_id,
                                          chunk_key8,
                                          [DataStorageDevice.SHARED_MEMORY],
                                          _promise=False), mock_data)

            # SCENARIO 7: failed transfer without promise
            with patch_method(StorageClient.create_writer, new=_create_writer_with_fail), \
                    self.assertRaises(ValueError):
                fail_key = chunk_key9
                receiver_ref.create_data_writers(session_id, [chunk_key9],
                                                 [data_size],
                                                 mock_ref,
                                                 use_promise=False)
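
SCENARIO 1 above pins down the receiver's calling convention: receive_data_part takes, per key, a boolean marking whether the accompanying payload is the final part, and check_status moves from RECEIVING to RECEIVED once the last part arrives. A distilled two-part transfer for a single key (names reused from this test; a sketch, not the full protocol):

    # Send one key in two halves; the boolean flags the final part.
    half = len(dumped_mock_data) // 2
    receiver_ref.receive_data_part(
        session_id, [chunk_key2], [False], dumped_mock_data[:half])
    assert receiver_ref.check_status(
        session_id, chunk_key2) == ReceiveStatus.RECEIVING
    receiver_ref.receive_data_part(
        session_id, [chunk_key2], [True], dumped_mock_data[half:])
    assert receiver_ref.check_status(
        session_id, chunk_key2) == ReceiveStatus.RECEIVED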
Example n. 16
    def testReceiverManager(self):
        pool_addr = f'localhost:{get_next_port()}'
        session_id = str(uuid.uuid4())

        mock_data = np.array([1, 2, 3, 4])
        serialized_data = dataserializer.dumps(mock_data)
        data_size = len(serialized_data)

        chunk_key1 = str(uuid.uuid4())
        chunk_key2 = str(uuid.uuid4())
        chunk_key3 = str(uuid.uuid4())
        chunk_key4 = str(uuid.uuid4())
        chunk_key5 = str(uuid.uuid4())
        chunk_key6 = str(uuid.uuid4())
        chunk_key7 = str(uuid.uuid4())

        with start_transfer_test_pool(address=pool_addr, plasma_size=self.plasma_storage_size) as pool, \
                self.run_actor_test(pool) as test_actor:
            mock_receiver_ref = pool.create_actor(MockReceiverWorkerActor,
                                                  uid=str(uuid.uuid4()))
            storage_client = test_actor.storage_client
            receiver_manager_ref = test_actor.promise_ref(
                ReceiverManagerActor.default_uid())

            # SCENARIO 1: test transferring existing keys
            self.waitp(
                storage_client.create_writer(
                    session_id, chunk_key1, data_size,
                    [DataStorageDevice.DISK])
                .then(lambda writer: promise.finished()
                      .then(lambda *_: writer.write(serialized_data))
                      .then(lambda *_: writer.close())))
            result = self.waitp(
                receiver_manager_ref.create_data_writers(session_id,
                                                         [chunk_key1],
                                                         [data_size],
                                                         test_actor,
                                                         _promise=True))
            self.assertEqual(result[0].uid, mock_receiver_ref.uid)
            self.assertEqual(result[1][0], ReceiveStatus.RECEIVED)

            # adding a callback for an already-received key resolves immediately
            result = self.waitp(
                receiver_manager_ref.add_keys_callback(session_id,
                                                       [chunk_key1],
                                                       _promise=True))
            self.assertTupleEqual(result, ())

            receiver_manager_ref.register_pending_keys(
                session_id, [chunk_key1, chunk_key2])
            self.assertEqual(
                receiver_manager_ref.filter_receiving_keys(
                    session_id, [chunk_key1, chunk_key2, 'non_exist']),
                [chunk_key2])

            # SCENARIO 2: test transferring new keys and waiting on listeners
            result = self.waitp(
                receiver_manager_ref.create_data_writers(
                    session_id, [chunk_key2, chunk_key3], [data_size] * 2,
                    test_actor,
                    _promise=True))
            self.assertEqual(result[0].uid, mock_receiver_ref.uid)
            self.assertIsNone(result[1][0])

            # creating writers for keys already in transfer reports RECEIVING
            result = self.waitp(
                receiver_manager_ref.create_data_writers(session_id,
                                                         [chunk_key2],
                                                         [data_size],
                                                         test_actor,
                                                         _promise=True))
            self.assertEqual(result[1][0], ReceiveStatus.RECEIVING)

            # add listener and finish transfer
            receiver_manager_ref.add_keys_callback(session_id, [chunk_key1, chunk_key2], _promise=True) \
                .then(lambda *s: test_actor.set_result(s))
            mock_receiver_ref.receive_data_part(session_id, [chunk_key2],
                                                [True], serialized_data)
            mock_receiver_ref.receive_data_part(session_id, [chunk_key3],
                                                [True], serialized_data)
            self.get_result(5)

            # SCENARIO 3: test listening on multiple transfers
            receiver_manager_ref.create_data_writers(
                session_id, [chunk_key4, chunk_key5], [data_size] * 2, test_actor, _promise=True) \
                .then(lambda *s: test_actor.set_result(s))
            self.get_result(5)
            # add listener
            receiver_manager_ref.add_keys_callback(session_id, [chunk_key4, chunk_key5], _promise=True) \
                .then(lambda *s: test_actor.set_result(s))
            mock_receiver_ref.receive_data_part(session_id, [chunk_key4],
                                                [True], serialized_data)
            # while some chunks remain pending, the promise does not resolve
            with self.assertRaises(TimeoutError):
                self.get_result(0.5)
            mock_receiver_ref.receive_data_part(session_id, [chunk_key5],
                                                [True], serialized_data)
            self.get_result(5)

            # SCENARIO 4: test listening on a transfer that fails with an error
            self.waitp(
                receiver_manager_ref.create_data_writers(session_id,
                                                         [chunk_key6],
                                                         [data_size],
                                                         test_actor,
                                                         _promise=True))
            receiver_manager_ref.add_keys_callback(session_id, [chunk_key6], _promise=True) \
                .then(lambda *s: test_actor.set_result(s)) \
                .catch(lambda *exc: test_actor.set_result(exc, accept=False))
            mock_receiver_ref.cancel_receive(session_id, [chunk_key6])
            with self.assertRaises(ExecutionInterrupted):
                self.get_result(5)

            # SCENARIO 5: test creating writers without promise
            ref, statuses = receiver_manager_ref.create_data_writers(
                session_id, [chunk_key7], [data_size],
                test_actor,
                use_promise=False)
            self.assertIsNone(statuses[0])
            self.assertEqual(ref.uid, mock_receiver_ref.uid)

            # SCENARIO 6: test transferring lost keys
            storage_client.delete(session_id, [chunk_key1])

            result = self.waitp(
                receiver_manager_ref.create_data_writers(session_id,
                                                         [chunk_key1],
                                                         [data_size],
                                                         test_actor,
                                                         _promise=True))
            self.assertEqual(result[0].uid, mock_receiver_ref.uid)
            self.assertIsNone(result[1][0])

            # add listener and finish transfer
            receiver_manager_ref.add_keys_callback(session_id, [chunk_key1], _promise=True) \
                .then(lambda *s: test_actor.set_result(s))
            mock_receiver_ref.receive_data_part(session_id, [chunk_key1],
                                                [True], serialized_data)
            self.get_result(5)
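
Taken together, the scenarios above define the manager's promise contract: add_keys_callback resolves only once every listed key has received its final part, resolves immediately for keys already held, and rejects with ExecutionInterrupted, TimeoutError or WorkerDead when a transfer is cancelled, times out, or loses its sender. A condensed happy-path sketch reusing this test's names:

    # Register a listener over two keys; it fires only after both finish.
    receiver_manager_ref.create_data_writers(
        session_id, [chunk_key4, chunk_key5], [data_size] * 2,
        test_actor, _promise=True)
    receiver_manager_ref.add_keys_callback(
        session_id, [chunk_key4, chunk_key5], _promise=True) \
        .then(lambda *s: test_actor.set_result(s))
    mock_receiver_ref.receive_data_part(
        session_id, [chunk_key4], [True], serialized_data)
    mock_receiver_ref.receive_data_part(
        session_id, [chunk_key5], [True], serialized_data)
    # only now does the promise resolve and set_result fire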