def testDataSerialize(self):
    try:
        import numpy as np
        from numpy.testing import assert_array_equal
    except ImportError:
        np = None

    try:
        import scipy.sparse as sps
    except ImportError:
        sps = None

    if np:
        array = np.random.rand(1000, 100)
        assert_array_equal(array, dataserializer.loads(dataserializer.dumps(array)))
        assert_array_equal(array, dataserializer.loads(
            dataserializer.dumps(array, compress=dataserializer.CompressType.LZ4)))
        if not six.PY2:
            assert_array_equal(array, dataserializer.loads(
                dataserializer.dumps(array, compress=dataserializer.CompressType.GZIP)))

        array = np.random.rand(1000, 100)
        assert_array_equal(array, dataserializer.load(
            BytesIO(dataserializer.dumps(array))))
        assert_array_equal(array, dataserializer.load(BytesIO(
            dataserializer.dumps(array, compress=dataserializer.CompressType.LZ4))))
        if not six.PY2:
            assert_array_equal(array, dataserializer.load(BytesIO(
                dataserializer.dumps(array, compress=dataserializer.CompressType.GZIP))))

        array = np.random.rand(1000, 100).T  # test non c-contiguous
        assert_array_equal(array, dataserializer.loads(dataserializer.dumps(array)))
        assert_array_equal(array, dataserializer.loads(
            dataserializer.dumps(array, compress=dataserializer.CompressType.LZ4)))
        if not six.PY2:
            assert_array_equal(array, dataserializer.loads(
                dataserializer.dumps(array, compress=dataserializer.CompressType.GZIP)))

        array = np.float64(0.2345)
        assert_array_equal(array, dataserializer.loads(dataserializer.dumps(array)))
        assert_array_equal(array, dataserializer.loads(
            dataserializer.dumps(array, compress=dataserializer.CompressType.LZ4)))
        if not six.PY2:
            assert_array_equal(array, dataserializer.loads(
                dataserializer.dumps(array, compress=dataserializer.CompressType.GZIP)))

        # test structured arrays.
        rec_dtype = np.dtype([('a', 'int64'), ('b', 'double'), ('c', '<U8')])
        array = np.ones((100,), dtype=rec_dtype)
        array_loaded = dataserializer.loads(dataserializer.dumps(array))
        self.assertEqual(array.dtype, array_loaded.dtype)
        assert_array_equal(array, array_loaded)

        fn = os.path.join(tempfile.gettempdir(),
                          'test_dump_file_%d.bin' % id(self))
        try:
            array = np.random.rand(1000, 100).T  # test non c-contiguous
            with open(fn, 'wb') as dump_file:
                dataserializer.dump(array, dump_file)
            with open(fn, 'rb') as dump_file:
                assert_array_equal(array, dataserializer.load(dump_file))

            with open(fn, 'wb') as dump_file:
                dataserializer.dump(array, dump_file,
                                    compress=dataserializer.CompressType.LZ4)
            with open(fn, 'rb') as dump_file:
                assert_array_equal(array, dataserializer.load(dump_file))

            if not six.PY2:
                with open(fn, 'wb') as dump_file:
                    dataserializer.dump(array, dump_file,
                                        compress=dataserializer.CompressType.GZIP)
                with open(fn, 'rb') as dump_file:
                    assert_array_equal(array, dataserializer.load(dump_file))
        finally:
            if os.path.exists(fn):
                os.unlink(fn)

    if sps:
        mat = sparse.SparseMatrix(sps.random(100, 100, 0.1, format='csr'))
        des_mat = dataserializer.loads(dataserializer.dumps(mat))
        self.assertTrue((mat.spmatrix != des_mat.spmatrix).nnz == 0)
        des_mat = dataserializer.loads(dataserializer.dumps(
            mat, compress=dataserializer.CompressType.LZ4))
        self.assertTrue((mat.spmatrix != des_mat.spmatrix).nnz == 0)
        if not six.PY2:
            des_mat = dataserializer.loads(dataserializer.dumps(
                mat, compress=dataserializer.CompressType.GZIP))
            self.assertTrue((mat.spmatrix != des_mat.spmatrix).nnz == 0)

        vector = sparse.SparseVector(sps.csr_matrix(np.random.rand(2)), shape=(2,))
        des_vector = dataserializer.loads(dataserializer.dumps(vector))
        self.assertTrue((vector.spmatrix != des_vector.spmatrix).nnz == 0)
        des_vector = dataserializer.loads(dataserializer.dumps(
            vector, compress=dataserializer.CompressType.LZ4))
        self.assertTrue((vector.spmatrix != des_vector.spmatrix).nnz == 0)
        if not six.PY2:
            des_vector = dataserializer.loads(dataserializer.dumps(
                vector, compress=dataserializer.CompressType.GZIP))
            self.assertTrue((vector.spmatrix != des_vector.spmatrix).nnz == 0)
def testMainWithoutEtcd(self):
    self.start_processes()

    session_id = uuid.uuid1()
    actor_client = new_client()

    session_ref = actor_client.actor_ref(
        self.session_manager_ref.create_session(session_id))

    a = mt.ones((100, 100), chunk_size=30) * 2 * 1 + 1
    b = mt.ones((100, 100), chunk_size=30) * 2 * 1 + 1
    c = (a * b * 2 + 1).sum()
    graph = c.build_graph()
    targets = [c.key]
    graph_key = uuid.uuid1()
    session_ref.submit_tensor_graph(json.dumps(graph.to_json()),
                                    graph_key, target_tensors=targets)

    state = self.wait_for_termination(actor_client, session_ref, graph_key)
    self.assertEqual(state, GraphState.SUCCEEDED)

    result = session_ref.fetch_result(graph_key, c.key)
    expected = (np.ones(a.shape) * 2 * 1 + 1) ** 2 * 2 + 1
    assert_allclose(loads(result), expected.sum())

    graph_key = uuid.uuid1()
    session_ref.submit_tensor_graph(json.dumps(graph.to_json()),
                                    graph_key, target_tensors=targets)

    # todo this behavior may change when eager mode is introduced
    state = self.wait_for_termination(actor_client, session_ref, graph_key)
    self.assertEqual(state, GraphState.FAILED)

    a = mt.ones((100, 50), chunk_size=35) * 2 + 1
    b = mt.ones((50, 200), chunk_size=35) * 2 + 1
    c = a.dot(b)
    graph = c.build_graph()
    targets = [c.key]
    graph_key = uuid.uuid1()
    session_ref.submit_tensor_graph(json.dumps(graph.to_json()),
                                    graph_key, target_tensors=targets)
    state = self.wait_for_termination(actor_client, session_ref, graph_key)
    self.assertEqual(state, GraphState.SUCCEEDED)
    result = session_ref.fetch_result(graph_key, c.key)
    assert_allclose(loads(result), np.ones((100, 200)) * 450)

    base_arr = np.random.random((100, 100))
    a = mt.array(base_arr)
    sumv = reduce(operator.add, [a[:10, :10] for _ in range(10)])
    graph = sumv.build_graph()
    targets = [sumv.key]
    graph_key = uuid.uuid1()
    session_ref.submit_tensor_graph(json.dumps(graph.to_json()),
                                    graph_key, target_tensors=targets)
    state = self.wait_for_termination(actor_client, session_ref, graph_key)
    self.assertEqual(state, GraphState.SUCCEEDED)

    expected = reduce(operator.add, [base_arr[:10, :10] for _ in range(10)])
    result = session_ref.fetch_result(graph_key, sumv.key)
    assert_allclose(loads(result), expected)
def _read_data_all(reader):
    with reader:
        return dataserializer.loads(reader.read())
def testMainDataFrameWithoutEtcd(self):
    import pandas as pd
    from mars.dataframe.expressions.datasource.dataframe import from_pandas
    from mars.dataframe.expressions.arithmetic import add

    self.start_processes(etcd=False)

    session_id = uuid.uuid1()
    actor_client = new_client()
    session_ref = actor_client.actor_ref(
        self.session_manager_ref.create_session(session_id))

    data1 = pd.DataFrame(np.random.rand(10, 10))
    df1 = from_pandas(data1, chunk_size=5)
    data2 = pd.DataFrame(np.random.rand(10, 10))
    df2 = from_pandas(data2, chunk_size=6)

    df3 = add(df1, df2)

    graph = df3.build_graph()
    targets = [df3.key]
    graph_key = uuid.uuid1()
    session_ref.submit_tileable_graph(json.dumps(graph.to_json()),
                                      graph_key, target_tileables=targets)

    state = self.wait_for_termination(actor_client, session_ref, graph_key)
    self.assertEqual(state, GraphState.SUCCEEDED)

    expected = data1 + data2
    result = session_ref.fetch_result(graph_key, df3.key)
    pd.testing.assert_frame_equal(expected, loads(result))

    data1 = pd.DataFrame(np.random.rand(10, 10), index=np.arange(10),
                         columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7])
    df1 = from_pandas(data1, chunk_size=(10, 5))
    data2 = pd.DataFrame(np.random.rand(10, 10), index=np.arange(11, 1, -1),
                         columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2])
    df2 = from_pandas(data2, chunk_size=(10, 6))

    df3 = add(df1, df2)

    graph = df3.build_graph()
    targets = [df3.key]
    graph_key = uuid.uuid1()
    session_ref.submit_tileable_graph(json.dumps(graph.to_json()),
                                      graph_key, target_tileables=targets)

    state = self.wait_for_termination(actor_client, session_ref, graph_key)
    self.assertEqual(state, GraphState.SUCCEEDED)

    expected = data1 + data2
    result = session_ref.fetch_result(graph_key, df3.key)
    pd.testing.assert_frame_equal(expected, loads(result))

    data1 = pd.DataFrame(np.random.rand(10, 10),
                         index=[0, 10, 2, 3, 4, 5, 6, 7, 8, 9],
                         columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7])
    df1 = from_pandas(data1, chunk_size=5)
    data2 = pd.DataFrame(np.random.rand(10, 10),
                         index=[11, 1, 2, 5, 7, 6, 8, 9, 10, 3],
                         columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2])
    df2 = from_pandas(data2, chunk_size=6)

    df3 = add(df1, df2)

    graph = df3.build_graph()
    targets = [df3.key]
    graph_key = uuid.uuid1()
    session_ref.submit_tileable_graph(json.dumps(graph.to_json()),
                                      graph_key, target_tileables=targets)

    state = self.wait_for_termination(actor_client, session_ref, graph_key)
    self.assertEqual(state, GraphState.SUCCEEDED)

    expected = data1 + data2
    result = session_ref.fetch_result(graph_key, df3.key)
    pd.testing.assert_frame_equal(expected, loads(result))
def testWorkerFailOver(self):
    def kill_process_tree(proc):
        import psutil
        proc = psutil.Process(proc.pid)
        plasma_sock_dir = None
        for p in proc.children(recursive=True):
            try:
                if 'plasma' in p.name():
                    socks = [conn.laddr for conn in p.connections('unix')
                             if 'plasma' in conn.laddr]
                    if socks:
                        plasma_sock_dir = os.path.dirname(socks[0])
                p.kill()
            except psutil.NoSuchProcess:
                continue
        proc.kill()
        if plasma_sock_dir:
            shutil.rmtree(plasma_sock_dir, ignore_errors=True)

    delay_file = self.add_state_file('DELAY_STATE_FILE')
    open(delay_file, 'w').close()

    terminate_file = self.add_state_file('TERMINATE_STATE_FILE')

    self.start_processes(modules=['mars.scheduler.tests.op_delayer'],
                         log_worker=True)

    session_id = uuid.uuid1()
    actor_client = new_client()
    session_ref = actor_client.actor_ref(
        self.session_manager_ref.create_session(session_id))

    np_a = np.random.random((100, 100))
    np_b = np.random.random((100, 100))

    a = mt.array(np_a, chunk_size=30) * 2 + 1
    b = mt.array(np_b, chunk_size=30) * 2 + 1
    c = a.dot(b) * 2 + 1
    graph = c.build_graph()
    targets = [c.key]
    graph_key = uuid.uuid1()
    session_ref.submit_tileable_graph(json.dumps(graph.to_json()),
                                      graph_key, target_tileables=targets)

    while not os.path.exists(terminate_file):
        actor_client.sleep(0.05)

    kill_process_tree(self.proc_workers[0])
    logger.warning('Worker %s KILLED!\n\n', self.proc_workers[0].pid)
    self.proc_workers = self.proc_workers[1:]
    os.unlink(delay_file)

    state = self.wait_for_termination(actor_client, session_ref, graph_key)
    self.assertEqual(state, GraphState.SUCCEEDED)

    result = session_ref.fetch_result(graph_key, c.key)
    expected = (np_a * 2 + 1).dot(np_b * 2 + 1) * 2 + 1
    assert_allclose(loads(result), expected)
def testMainTensorWithoutEtcd(self):
    self.start_processes()

    session_id = uuid.uuid1()
    actor_client = new_client()

    session_ref = actor_client.actor_ref(
        self.session_manager_ref.create_session(session_id))

    a = mt.ones((100, 100), chunk_size=30) * 2 * 1 + 1
    b = mt.ones((100, 100), chunk_size=30) * 2 * 1 + 1
    c = (a * b * 2 + 1).sum()
    graph = c.build_graph()
    targets = [c.key]
    graph_key = uuid.uuid1()
    session_ref.submit_tileable_graph(json.dumps(graph.to_json()),
                                      graph_key, target_tileables=targets)

    state = self.wait_for_termination(actor_client, session_ref, graph_key)
    self.assertEqual(state, GraphState.SUCCEEDED)

    result = session_ref.fetch_result(graph_key, c.key)
    expected = (np.ones(a.shape) * 2 * 1 + 1) ** 2 * 2 + 1
    assert_allclose(loads(result), expected.sum())

    a = mt.ones((100, 50), chunk_size=35) * 2 + 1
    b = mt.ones((50, 200), chunk_size=35) * 2 + 1
    c = a.dot(b)
    graph = c.build_graph()
    targets = [c.key]
    graph_key = uuid.uuid1()
    session_ref.submit_tileable_graph(json.dumps(graph.to_json()),
                                      graph_key, target_tileables=targets)
    state = self.wait_for_termination(actor_client, session_ref, graph_key)
    self.assertEqual(state, GraphState.SUCCEEDED)
    result = session_ref.fetch_result(graph_key, c.key)
    assert_allclose(loads(result), np.ones((100, 200)) * 450)

    base_arr = np.random.random((100, 100))
    a = mt.array(base_arr)
    sumv = reduce(operator.add, [a[:10, :10] for _ in range(10)])
    graph = sumv.build_graph()
    targets = [sumv.key]
    graph_key = uuid.uuid1()
    session_ref.submit_tileable_graph(json.dumps(graph.to_json()),
                                      graph_key, target_tileables=targets)
    state = self.wait_for_termination(actor_client, session_ref, graph_key)
    self.assertEqual(state, GraphState.SUCCEEDED)

    expected = reduce(operator.add, [base_arr[:10, :10] for _ in range(10)])
    result = session_ref.fetch_result(graph_key, sumv.key)
    assert_allclose(loads(result), expected)

    a = mt.ones((31, 27), chunk_size=10)
    b = a.reshape(27, 31)
    b.op.extra_params['_reshape_with_shuffle'] = True
    graph = b.build_graph()
    targets = [b.key]
    graph_key = uuid.uuid1()
    session_ref.submit_tileable_graph(json.dumps(graph.to_json()),
                                      graph_key, target_tileables=targets)
    state = self.wait_for_termination(actor_client, session_ref, graph_key)
    self.assertEqual(state, GraphState.SUCCEEDED)
    result = session_ref.fetch_result(graph_key, b.key)
    assert_allclose(loads(result), np.ones((27, 31)))
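# The submit/wait/fetch sequence above repeats across these integration tests.
# A minimal helper sketch using only the calls already exercised here; this is
# not part of the original suite, and `test_case` is any test instance that
# provides `wait_for_termination` as the tests above do:
def _run_tileable_sketch(test_case, actor_client, session_ref, tileable):
    # Build the graph, submit it under a fresh key, wait, then fetch the result.
    graph = tileable.build_graph()
    graph_key = uuid.uuid1()
    session_ref.submit_tileable_graph(json.dumps(graph.to_json()),
                                      graph_key, target_tileables=[tileable.key])
    state = test_case.wait_for_termination(actor_client, session_ref, graph_key)
    assert state == GraphState.SUCCEEDED
    return loads(session_ref.fetch_result(graph_key, tileable.key))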
def testDataSerialize(self):
    for type_, compress in itertools.product(
            (None,) + tuple(dataserializer.SerialType.__members__.values()),
            (None,) + tuple(dataserializer.CompressType.__members__.values())):
        array = np.random.rand(1000, 100)
        assert_array_equal(array, dataserializer.loads(
            dataserializer.dumps(array, serial_type=type_, compress=compress)))

        array = np.random.rand(1000, 100)
        assert_array_equal(array, dataserializer.load(BytesIO(
            dataserializer.dumps(array, serial_type=type_, compress=compress))))

        array = np.random.rand(1000, 100).T  # test non c-contiguous
        assert_array_equal(array, dataserializer.loads(
            dataserializer.dumps(array, serial_type=type_, compress=compress)))

        array = np.float64(0.2345)
        assert_array_equal(array, dataserializer.loads(
            dataserializer.dumps(array, serial_type=type_, compress=compress)))

    # test non-serializable object
    if pyarrow:
        non_serial = type('non_serial', (object,), dict(nbytes=10))
        with self.assertRaises(SerializationFailed):
            dataserializer.dumps(non_serial())

    # test structured arrays.
    rec_dtype = np.dtype([('a', 'int64'), ('b', 'double'), ('c', '<U8')])
    array = np.ones((100,), dtype=rec_dtype)
    array_loaded = dataserializer.loads(dataserializer.dumps(array))
    self.assertEqual(array.dtype, array_loaded.dtype)
    assert_array_equal(array, array_loaded)

    fn = os.path.join(tempfile.gettempdir(), f'test_dump_file_{id(self)}.bin')
    try:
        array = np.random.rand(1000, 100).T  # test non c-contiguous
        with open(fn, 'wb') as dump_file:
            dataserializer.dump(array, dump_file)
        with open(fn, 'rb') as dump_file:
            assert_array_equal(array, dataserializer.load(dump_file))

        with open(fn, 'wb') as dump_file:
            dataserializer.dump(array, dump_file,
                                compress=dataserializer.CompressType.LZ4)
        with open(fn, 'rb') as dump_file:
            assert_array_equal(array, dataserializer.load(dump_file))

        with open(fn, 'wb') as dump_file:
            dataserializer.dump(array, dump_file,
                                compress=dataserializer.CompressType.GZIP)
        with open(fn, 'rb') as dump_file:
            assert_array_equal(array, dataserializer.load(dump_file))
    finally:
        if os.path.exists(fn):
            os.unlink(fn)

    # test sparse
    if sps:
        mat = sparse.SparseMatrix(sps.random(100, 100, 0.1, format='csr'))
        des_mat = dataserializer.loads(dataserializer.dumps(mat))
        self.assertTrue((mat.spmatrix != des_mat.spmatrix).nnz == 0)
        des_mat = dataserializer.loads(dataserializer.dumps(
            mat, compress=dataserializer.CompressType.LZ4))
        self.assertTrue((mat.spmatrix != des_mat.spmatrix).nnz == 0)
        des_mat = dataserializer.loads(dataserializer.dumps(
            mat, compress=dataserializer.CompressType.GZIP))
        self.assertTrue((mat.spmatrix != des_mat.spmatrix).nnz == 0)

        vector = sparse.SparseVector(sps.csr_matrix(np.random.rand(2)), shape=(2,))
        des_vector = dataserializer.loads(dataserializer.dumps(vector))
        self.assertTrue((vector.spmatrix != des_vector.spmatrix).nnz == 0)
        des_vector = dataserializer.loads(dataserializer.dumps(
            vector, compress=dataserializer.CompressType.LZ4))
        self.assertTrue((vector.spmatrix != des_vector.spmatrix).nnz == 0)
        des_vector = dataserializer.loads(dataserializer.dumps(
            vector, compress=dataserializer.CompressType.GZIP))
        self.assertTrue((vector.spmatrix != des_vector.spmatrix).nnz == 0)

    # test groupby
    df1 = pd.DataFrame({'a': [3, 4, 5, 3, 5, 4, 1, 2, 3],
                        'b': [1, 3, 4, 5, 6, 5, 4, 4, 4],
                        'c': list('aabaaddce')})
    grouped = wrapped_groupby(df1, 'b')
    restored = dataserializer.loads(dataserializer.dumps(grouped))
    assert_groupby_equal(grouped, restored.groupby_obj)

    grouped = wrapped_groupby(df1, 'b').c
    restored = dataserializer.loads(dataserializer.dumps(grouped))
    assert_groupby_equal(grouped, restored.groupby_obj)

    grouped = wrapped_groupby(df1, 'b')
    # access `indices` so serialization also covers a groupby with cached state
    getattr(grouped, 'indices')
    restored = dataserializer.loads(dataserializer.dumps(grouped))
    assert_groupby_equal(grouped, restored.groupby_obj)

    grouped = wrapped_groupby(df1.b, lambda x: x % 2)
    restored = dataserializer.loads(dataserializer.dumps(grouped))
    assert_groupby_equal(grouped, restored.groupby_obj)

    grouped = wrapped_groupby(df1.b, lambda x: x % 2)
    getattr(grouped, 'indices')
    restored = dataserializer.loads(dataserializer.dumps(grouped))
    assert_groupby_equal(grouped, restored.groupby_obj)

    # test categorical
    s = np.random.RandomState(0).random(10)
    cat = pd.cut(s, [0.3, 0.5, 0.8])
    self.assertIsInstance(cat, pd.Categorical)
    des_cat = dataserializer.loads(dataserializer.dumps(cat))
    self.assertEqual(len(cat), len(des_cat))
    for c, dc in zip(cat, des_cat):
        np.testing.assert_equal(c, dc)

    # test IntervalIndex
    s = pd.interval_range(10, 100, 3)
    dest_s = dataserializer.loads(dataserializer.dumps(s))
    pd.testing.assert_index_equal(s, dest_s)

    # test complex
    s = complex(10 + 5j)
    dest_s = dataserializer.loads(dataserializer.dumps(s))
    self.assertIs(type(s), type(dest_s))
    self.assertEqual(s, dest_s)

    s = np.complex64(10 + 5j)
    dest_s = dataserializer.loads(dataserializer.dumps(s))
    self.assertIs(type(s), type(dest_s))
    self.assertEqual(s, dest_s)

    # test ndarray with negative strides
    arr = np.zeros((5, 6, 3))
    arr2 = arr[:, :, ::-1]
    dest_arr2 = dataserializer.loads(dataserializer.dumps(arr2))
    np.testing.assert_array_equal(arr2, dest_arr2)

    # test ArrowArray
    df = pd.DataFrame({'a': ['s1', 's2', 's3'],
                       'b': [['s1', 's2'], ['s3'], ['s4', 's5']]})
    df['a'] = df['a'].astype(ArrowStringDtype())
    df['b'] = df['b'].astype(ArrowListDtype(str))
    dest_df = dataserializer.loads(dataserializer.dumps(df))
    self.assertIs(type(df), type(dest_df))
    pd.testing.assert_frame_equal(df, dest_df)

    # test DataFrame with SparseDtype
    s = pd.Series([1, 2, np.nan, np.nan, 3]).astype(
        pd.SparseDtype(np.dtype(np.float64), np.nan))
    dest_s = dataserializer.loads(dataserializer.dumps(s))
    pd.testing.assert_series_equal(s, dest_s)
    df = pd.DataFrame({'s': s})
    dest_df = dataserializer.loads(dataserializer.dumps(df))
    pd.testing.assert_frame_equal(df, dest_df)
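# Distilled from the test above: the serializer contract is that
# loads(dumps(x)) reproduces x for every supported type and every
# (serial_type, compress) pair. A minimal sketch of that round-trip as a
# reusable helper (not part of the original suite); callers compare `value`
# with the return value using the type-appropriate assertion:
def _roundtrip_sketch(value, serial_type=None, compress=None):
    buf = dataserializer.dumps(value, serial_type=serial_type, compress=compress)
    return dataserializer.loads(buf)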
def get_result_data(self, session_id, chunk_key):
    buf = self._data_writers[(session_id, chunk_key)].getvalue()
    return dataserializer.loads(buf)
def testFileBufferIO(self):
    if not np:
        return
    from numpy.testing import assert_array_equal

    compressions = [dataserializer.CompressType.NONE] + \
        list(dataserializer.get_supported_compressions())

    for c1 in compressions:
        for c2 in compressions:
            data = np.random.random((1000, 100))

            # test complete read
            compressed_read_file = BytesIO(
                dataserializer.dumps(data, compress=c1))
            reader = FileBufferIO(compressed_read_file, 'r', compress_out=c2)
            compressed = reader.read()
            self.assertEqual(
                c2, dataserializer.read_file_header(compressed).compress)
            assert_array_equal(data, dataserializer.loads(compressed))

            # test partial read
            compressed_read_file = BytesIO(
                dataserializer.dumps(data, compress=c1))
            reader = FileBufferIO(compressed_read_file, 'r', compress_out=c2)
            block = reader.read(128)
            data_left = reader.read()
            assert_array_equal(data, dataserializer.loads(block + data_left))

            # test read by chunks
            bio = BytesIO()
            compressed_read_file = BytesIO(
                dataserializer.dumps(data, compress=c1))
            reader = FileBufferIO(compressed_read_file, 'r', compress_out=c2)
            while True:
                block = reader.read(128)
                if not block:
                    break
                bio.write(block)

            compressed = bio.getvalue()
            self.assertEqual(
                c2, dataserializer.read_file_header(compressed).compress)
            assert_array_equal(data, dataserializer.loads(compressed))

            # test write by chunks
            compressed_read_file.seek(0)
            compressed_write_file = BytesIO()
            writer = FileBufferIO(compressed_write_file, 'w',
                                  compress_in=c2, managed=False)
            while True:
                block = compressed_read_file.read(128)
                if not block:
                    break
                writer.write(block)
            writer.close()

            compressed = compressed_write_file.getvalue()
            self.assertEqual(
                c2, dataserializer.read_file_header(compressed).compress)
            assert_array_equal(data, dataserializer.loads(compressed))
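# The conversion pattern testFileBufferIO exercises, in isolation: wrap a
# serialized payload written with one codec and re-emit it under another,
# verifying the target codec via the file header. A sketch using only names
# already imported by this module (not part of the original suite):
def _recompress_sketch(payload_bytes, target_compress):
    reader = FileBufferIO(BytesIO(payload_bytes), 'r',
                          compress_out=target_compress)
    converted = reader.read()
    # the output header should carry the requested compression type
    assert dataserializer.read_file_header(converted).compress == target_compress
    return converted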
def testIterativeTilingWithoutEtcd(self):
    self.start_processes(etcd=False)

    session_id = uuid.uuid1()
    actor_client = new_client()
    rs = np.random.RandomState(0)

    session_ref = actor_client.actor_ref(
        self.session_manager_ref.create_session(session_id))

    raw = rs.rand(100)
    a = mt.tensor(raw, chunk_size=10)
    a.sort()
    c = a[:5]

    graph = c.build_graph()
    targets = [c.key]
    graph_key = uuid.uuid1()
    session_ref.submit_tileable_graph(json.dumps(graph.to_json()),
                                      graph_key, target_tileables=targets)

    state = self.wait_for_termination(actor_client, session_ref, graph_key)
    self.assertEqual(state, GraphState.SUCCEEDED)

    result = session_ref.fetch_result(graph_key, c.key)
    expected = np.sort(raw)[:5]
    assert_allclose(loads(result), expected)

    with self.assertRaises(KeyError):
        session_ref.fetch_result(graph_key, a.key, check=False)

    raw1 = rs.rand(20)
    raw2 = rs.rand(20)
    a = mt.tensor(raw1, chunk_size=10)
    a.sort()
    b = mt.tensor(raw2, chunk_size=15) + 1
    c = mt.concatenate([a[:10], b])
    c.sort()
    d = c[:5]

    graph = d.build_graph()
    targets = [d.key]
    graph_key = uuid.uuid1()
    session_ref.submit_tileable_graph(json.dumps(graph.to_json()),
                                      graph_key, target_tileables=targets)

    state = self.wait_for_termination(actor_client, session_ref, graph_key)
    self.assertEqual(state, GraphState.SUCCEEDED)

    result = session_ref.fetch_result(graph_key, d.key)
    expected = np.sort(np.concatenate([np.sort(raw1)[:10], raw2 + 1]))[:5]
    assert_allclose(loads(result), expected)

    raw = rs.randint(100, size=(100,))
    a = mt.tensor(raw, chunk_size=53)
    a.sort()
    b = mt.histogram(a, bins='scott')

    graph = build_tileable_graph(b, set())
    targets = [b[0].key, b[1].key]
    graph_key = uuid.uuid1()
    session_ref.submit_tileable_graph(json.dumps(graph.to_json()),
                                      graph_key, target_tileables=targets)

    state = self.wait_for_termination(actor_client, session_ref, graph_key)
    self.assertEqual(state, GraphState.SUCCEEDED)

    res = (session_ref.fetch_result(graph_key, b[0].key),
           session_ref.fetch_result(graph_key, b[1].key))
    expected = np.histogram(np.sort(raw), bins='scott')
    assert_allclose(loads(res[0]), expected[0])
    assert_allclose(loads(res[1]), expected[1])