def test_groupby_prune_read_parquet(gen_data1):
    """Optimizing a groupby over ``read_parquet`` restricts the reader's columns.

    Two groupby variants are optimized against the same reader; in each case
    the optimized reader's ``op.columns`` must list only the columns used.
    """
    pdf, tempdir = gen_data1
    file_path = os.path.join(tempdir, 'test.parquet')
    pdf.to_parquet(file_path)

    source = md.read_parquet(file_path)

    # case 1: group by 'c', aggregate 'a'
    summed = source.groupby('c').agg({'a': 'sum'})
    graph = TileableGraph([summed.data])
    next(TileableGraphBuilder(graph).build())
    records = optimize(graph)

    opt_source = records.get_optimization_result(source.data)
    assert opt_source is not None
    opt_summed = records.get_optimization_result(summed.data)
    assert opt_summed is not None
    assert opt_source.op.columns == ['a', 'c']
    # original tileable should not be modified
    assert summed.inputs[0] is source.data

    # case 2: group by 'c' and count 'c' itself
    counted = source.groupby('c', as_index=False).c.agg({'cnt': 'count'})
    graph = TileableGraph([counted.data])
    next(TileableGraphBuilder(graph).build())
    records = optimize(graph)

    opt_source = records.get_optimization_result(source.data)
    assert opt_source is not None
    opt_counted = records.get_optimization_result(counted.data)
    assert opt_counted is not None
    assert opt_source.op.columns == ['c']
def test_sort_head(prepare_data):
    """``head(10)`` after a sort is recorded as an optimization of the head node.

    Covers both ``sort_values`` and ``sort_index``: the sorted tileable has no
    optimization record, the head node has one with ``nrows == 10``, and the
    optimized graph holds two nodes.
    """
    _, pdf = prepare_data

    def _check_head_after_sort(sorted_frame):
        # shared assertions for a sorted frame followed by head(10)
        head = sorted_frame.head(10)
        graph = TileableGraph([head.data])
        next(TileableGraphBuilder(graph).build())
        records = optimize(graph)

        assert records.get_optimization_result(sorted_frame.data) is None
        opt_head = records.get_optimization_result(head.data)
        assert opt_head.op.nrows == 10
        assert len(graph) == 2
        assert opt_head in graph.results

    # sort by column values
    _check_head_after_sort(md.DataFrame(pdf, chunk_size=20).sort_values(by='b'))

    # sort by index
    pdf2 = pdf.copy()
    pdf2.set_index('b', inplace=True)
    _check_head_after_sort(md.DataFrame(pdf2, chunk_size=20).sort_index())
def test_read_csv_head(prepare_data, setup):
    """Optimize and execute ``head`` on top of ``read_csv``.

    Three scenarios: a single head (also executed and compared with pandas),
    two heads sharing one reader, and a head that has a successor.
    """
    tempdir, pdf = prepare_data
    file_path = os.path.join(tempdir, 'test.csv')
    pdf.to_csv(file_path, index=False)
    # chunk_bytes is set to half of the file size
    size = os.stat(file_path).st_size / 2

    reader = md.read_csv(file_path, chunk_bytes=size)
    head5 = reader.head(5)
    graph = TileableGraph([head5.data])
    next(TileableGraphBuilder(graph).build())
    records = optimize(graph)

    assert records.get_optimization_result(reader.data) is None
    opt_head5 = records.get_optimization_result(head5.data)
    assert opt_head5.op.nrows == 5
    assert len(graph) == 1
    assert opt_head5 in graph.results

    result = head5.execute(
        extra_config={'operand_executors': _iloc_operand_executors}).fetch()
    expected = pdf.head(5)
    pd.testing.assert_frame_equal(result, expected)

    # test multiple head
    head10 = reader.head(10)
    graph = TileableGraph([head5.data, head10.data])
    next(TileableGraphBuilder(graph).build())
    records = optimize(graph)

    opt_reader = records.get_optimization_result(reader.data)
    assert opt_reader is not None
    assert opt_reader.op.nrows == 10
    opt_head5 = records.get_optimization_result(head5.data)
    assert opt_head5 is not None
    assert graph.predecessors(opt_head5)[0] is opt_reader
    assert opt_head5.inputs[0] is opt_reader
    opt_head10 = records.get_optimization_result(head10.data)
    assert opt_head10 is not None
    assert graph.predecessors(opt_head10)[0] is opt_reader
    assert opt_head10.inputs[0] is opt_reader

    # test head with successor
    reader = md.read_csv(file_path, chunk_bytes=size)
    head5 = reader.head(5)
    shifted = head5 + 1
    graph = TileableGraph([shifted.data])
    next(TileableGraphBuilder(graph).build())
    records = optimize(graph)

    assert records.get_optimization_result(reader.data) is None
    opt_head5 = records.get_optimization_result(head5.data)
    assert opt_head5.op.nrows == 5
    assert len(graph) == 2
def test_getitem_prune_read_parquet(gen_data1):
    """Two getitem selections on one ``read_parquet`` share a pruned reader.

    Selecting column ``c`` and columns ``['a']`` from the same reader must
    produce an optimized reader whose ``op.columns`` is ``['a', 'c']`` and
    which is the input/predecessor of both optimized selections.
    """
    pdf, tempdir = gen_data1
    file_path = os.path.join(tempdir, 'test.parquet')
    pdf.to_parquet(file_path)

    df1 = md.read_parquet(file_path)
    df2 = df1.c
    df3 = df1[['a']]
    graph = TileableGraph([df2.data, df3.data])
    next(TileableGraphBuilder(graph).build())
    records = optimize(graph)

    opt_df1 = records.get_optimization_result(df1.data)
    assert opt_df1 is not None
    opt_df2 = records.get_optimization_result(df2.data)
    assert opt_df2 is not None
    opt_df3 = records.get_optimization_result(df3.data)
    assert opt_df3 is not None

    # the optimized reader feeds both optimized selections
    assert opt_df1 in graph.predecessors(opt_df2)
    assert opt_df1 in opt_df2.inputs
    assert opt_df1 in graph.predecessors(opt_df3)
    assert opt_df1 in opt_df3.inputs
    assert opt_df1.op.columns == ['a', 'c']

    # original tileable should not be modified
    assert df2.inputs[0] is df1.data
    assert df3.inputs[0] is df1.data
async def test_task_progress(start_test_service):
    """Reported task progress follows the steps released through the controller.

    The remote function waits on the ``progress_controller`` remote object
    before each of its two steps; the test releases it step by step and checks
    the reported progress (0.0, then 0.5, then 1.0).
    """
    sv_pool_address, task_api, storage_api = start_test_service

    session_api = await SessionAPI.create(address=sv_pool_address)
    ref = await session_api.create_remote_object(
        task_api._session_id, 'progress_controller', _ProgressController)

    def f1(count: int):
        progress_controller = get_context().get_remote_object(
            'progress_controller')
        for idx in range(count):
            # block until the test releases the next step
            progress_controller.wait()
            get_context().set_progress((1 + idx) * 1.0 / count)

    r = mr.spawn(f1, args=(2, ))

    graph = TileableGraph([r.data])
    next(TileableGraphBuilder(graph).build())

    await task_api.submit_tileable_graph(graph, fuse_enabled=False)

    # nothing released yet
    await asyncio.sleep(0.2)
    results = await task_api.get_task_results(progress=True)
    assert results[0].progress == 0.0

    # release one step at a time and check the progress after each
    for expected_progress in (0.5, 1.0):
        await ref.set()
        await asyncio.sleep(1)
        results = await task_api.get_task_results(progress=True)
        assert results[0].progress == expected_progress
async def test_shuffle(actor_pool):
    """Run a fancy-index (``a[b]``) tensor task and compare with NumPy.

    Submits the tileable graph without fusion, waits for it to terminate
    without error at progress 1.0, then merges the stored result and compares
    it with ``raw[raw2]``.
    """
    pool, session_id, meta_api, storage_api, manager = actor_pool

    # seeded generator so that a failing input can be reproduced
    rs = np.random.RandomState(0)
    raw = rs.rand(10, 10)
    raw2 = rs.randint(10, size=(10, ))

    a = mt.tensor(raw, chunk_size=5)
    b = mt.tensor(raw2, chunk_size=5)
    c = a[b]

    graph = TileableGraph([c.data])
    next(TileableGraphBuilder(graph).build())

    task_id = await manager.submit_tileable_graph(graph, fuse_enabled=False)
    assert isinstance(task_id, str)

    await manager.wait_task(task_id)
    task_result: TaskResult = await manager.get_task_result(task_id)

    assert task_result.status == TaskStatus.terminated
    assert task_result.error is None
    assert await manager.get_task_progress(task_id) == 1.0

    expect = raw[raw2]
    result_tileables = (await manager.get_task_result_tileables(task_id))[0]
    result = await _merge_data(result_tileables, storage_api)
    np.testing.assert_array_equal(result, expect)
async def test_iterative_tiling(actor_pool):
    """Run a task built from two boolean-mask selections that are added together.

    The stored result must equal the same expression evaluated with NumPy.
    """
    pool, session_id, meta_api, storage_api, manager = actor_pool

    rng = np.random.RandomState(0)
    raw_a = rng.rand(10, 10)
    raw_b = rng.rand(10, 10)
    expect = raw_a[raw_a[:, 0] < 3] + raw_b[raw_b[:, 0] < 3]

    a = mt.tensor(raw_a, chunk_size=5)
    b = mt.tensor(raw_b, chunk_size=5)
    d = a[a[:, 0] < 3] + b[b[:, 0] < 3]

    graph = TileableGraph([d.data])
    next(TileableGraphBuilder(graph).build())

    task_id = await manager.submit_tileable_graph(graph, fuse_enabled=False)
    assert isinstance(task_id, str)
    await manager.wait_task(task_id)

    task_result: TaskResult = await manager.get_task_result(task_id)
    assert task_result.status == TaskStatus.terminated
    assert task_result.error is None
    assert await manager.get_task_progress(task_id) == 1.0

    result_tileables = (await manager.get_task_result_tileables(task_id))[0]
    merged = await _merge_data(result_tileables, storage_api)
    np.testing.assert_array_equal(merged, expect)
async def test_run_tasks_with_same_name(actor_pool):
    """Submit two different graphs under the same ``task_name`` one after another.

    Each submission must terminate, reach progress 1.0 and produce the
    expected array; a task error is re-raised with its original traceback.
    """
    pool, session_id, meta_api, lifecycle_api, storage_api, manager = actor_pool

    raw = np.random.RandomState(0).rand(10, 10)
    a = mt.tensor(raw, chunk_size=5)
    b = a + 1
    c = a * 2

    cases = zip([b, c], [raw + 1, raw * 2])
    for tileable, expected in cases:
        graph = TileableGraph([tileable.data])
        next(TileableGraphBuilder(graph).build())

        task_id = await manager.submit_tileable_graph(
            graph, task_name='my_task', fuse_enabled=False)
        assert isinstance(task_id, str)
        await manager.wait_task(task_id)

        task_result: TaskResult = await manager.get_task_result(task_id)
        assert task_result.status == TaskStatus.terminated
        if task_result.error is not None:
            raise task_result.error.with_traceback(task_result.traceback)
        assert await manager.get_task_progress(task_id) == 1.0

        result_tileable = (await manager.get_task_result_tileables(task_id))[0]
        merged = await _merge_data(result_tileable, storage_api)
        np.testing.assert_array_equal(merged, expected)
async def test_cancel_task(actor_pool):
    """Cancel a task made of long-sleeping remote functions.

    The cancel must leave the task terminated and finish in under 20 seconds;
    once the local references are dropped, tileable ref counts must be zero.
    """
    pool, session_id, meta_api, lifecycle_api, storage_api, manager = actor_pool

    def func():
        time.sleep(200)

    rs = [mr.spawn(func) for _ in range(10)]

    graph = TileableGraph([r.data for r in rs])
    next(TileableGraphBuilder(graph).build())

    task_id = await manager.submit_tileable_graph(graph, fuse_enabled=False)
    assert isinstance(task_id, str)

    # give the task a moment to start before cancelling
    await asyncio.sleep(.5)

    with Timer() as timer:
        await manager.cancel_task(task_id)
        result = await manager.get_task_result(task_id)
        assert result.status == TaskStatus.terminated

    assert timer.duration < 20

    # remember the keys, then drop the local references and collect garbage
    keys = [r.key for r in rs]
    del rs
    gc.collect()
    await asyncio.sleep(0.5)

    # test ref counts
    ref_counts = await lifecycle_api.get_tileable_ref_counts(keys)
    assert ref_counts == [0] * len(keys)
async def test_run_task(actor_pool):
    """Run ``a + 1`` as a task, then verify the data and the ref counts.

    A task error is re-raised with its original traceback.
    """
    pool, session_id, meta_api, lifecycle_api, storage_api, manager = actor_pool

    raw = np.random.RandomState(0).rand(10, 10)
    a = mt.tensor(raw, chunk_size=5)
    b = a + 1

    graph = TileableGraph([b.data])
    next(TileableGraphBuilder(graph).build())

    task_id = await manager.submit_tileable_graph(graph, fuse_enabled=False)
    assert isinstance(task_id, str)
    await manager.wait_task(task_id)

    task_result: TaskResult = await manager.get_task_result(task_id)
    assert task_result.status == TaskStatus.terminated
    if task_result.error is not None:
        raise task_result.error.with_traceback(task_result.traceback)
    assert await manager.get_task_progress(task_id) == 1.0

    result_tileable = (await manager.get_task_result_tileables(task_id))[0]
    merged = await _merge_data(result_tileable, storage_api)
    np.testing.assert_array_equal(merged, raw + 1)

    # test ref counts
    tileable_ref_counts = await lifecycle_api.get_tileable_ref_counts([b.key])
    assert tileable_ref_counts[0] == 1
    chunk_keys = [chunk.key for chunk in result_tileable.chunks]
    chunk_ref_counts = await lifecycle_api.get_chunk_ref_counts(chunk_keys)
    assert chunk_ref_counts == [1] * len(result_tileable.chunks)
async def test_task_execution(start_test_service):
    """Execute a three-function remote graph through the task API.

    ``f3`` sums the concatenation of ``arange(5)`` and ``arange(5, 10)``, so
    the stored value must be 45. The last idle time is ``None`` right after
    submission and set once the task has been waited for.
    """
    _sv_pool_address, task_api, storage_api = start_test_service

    def f1():
        return np.arange(5)

    def f2():
        return np.arange(5, 10)

    def f3(f1r, f2r):
        return np.concatenate([f1r, f2r]).sum()

    r1 = mr.spawn(f1)
    r2 = mr.spawn(f2)
    r3 = mr.spawn(f3, args=(r1, r2))

    graph = TileableGraph([r3.data])
    next(TileableGraphBuilder(graph).build())

    task_id = await task_api.submit_tileable_graph(graph, fuse_enabled=False)
    assert await task_api.get_last_idle_time() is None
    assert isinstance(task_id, str)

    await task_api.wait_task(task_id)
    task_result = await task_api.get_task_result(task_id)

    assert task_result.status == TaskStatus.terminated
    assert await task_api.get_last_idle_time() is not None
    if task_result.error is not None:
        raise task_result.error.with_traceback(task_result.traceback)

    fetch_tileables = await task_api.get_fetch_tileables(task_id)
    result_tileable = fetch_tileables[0]
    data_key = result_tileable.chunks[0].key
    assert await storage_api.get(data_key) == 45
def test_read_csv_head(prepare_data):
    """Optimize ``head`` on top of ``read_csv`` at the tileable level.

    Three scenarios: a single head, two heads sharing one reader, and a head
    that has a successor.
    """
    tempdir, pdf = prepare_data
    file_path = os.path.join(tempdir, 'test.csv')
    pdf.to_csv(file_path, index=False)
    # chunk_bytes is set to half of the file size
    size = os.stat(file_path).st_size / 2

    reader = md.read_csv(file_path, chunk_bytes=size)
    head5 = reader.head(5)
    graph = TileableGraph([head5.data])
    next(TileableGraphBuilder(graph).build())
    records = optimize(graph)

    assert records.get_optimization_result(reader.data) is None
    opt_head5 = records.get_optimization_result(head5.data)
    assert opt_head5.op.nrows == 5
    assert len(graph) == 1
    assert opt_head5 in graph.results

    # test multiple head
    head10 = reader.head(10)
    graph = TileableGraph([head5.data, head10.data])
    next(TileableGraphBuilder(graph).build())
    records = optimize(graph)

    opt_reader = records.get_optimization_result(reader.data)
    assert opt_reader is not None
    assert opt_reader.op.nrows == 10
    opt_head5 = records.get_optimization_result(head5.data)
    assert opt_head5 is not None
    assert graph.predecessors(opt_head5)[0] is opt_reader
    assert opt_head5.inputs[0] is opt_reader
    opt_head10 = records.get_optimization_result(head10.data)
    assert opt_head10 is not None
    assert graph.predecessors(opt_head10)[0] is opt_reader
    assert opt_head10.inputs[0] is opt_reader

    # test head with successor
    reader = md.read_csv(file_path, chunk_bytes=size)
    head5 = reader.head(5)
    shifted = head5 + 1
    graph = TileableGraph([shifted.data])
    next(TileableGraphBuilder(graph).build())
    records = optimize(graph)

    assert records.get_optimization_result(reader.data) is None
    opt_head5 = records.get_optimization_result(head5.data)
    assert opt_head5.op.nrows == 5
    assert len(graph) == 2
def test_cannot_prune(gen_data1):
    """Graphs for which no optimization result is recorded.

    Three scenarios on ``read_csv``: a groupby next to an arithmetic
    successor, a groupby next to ``head``, and a getitem that selects every
    column. In each, no node has an optimization record.
    """
    pdf, tempdir = gen_data1
    file_path = os.path.join(tempdir, 'test.csv')
    pdf.to_csv(file_path)

    reader = md.read_csv(file_path)
    grouped = reader.groupby('c').agg({'a': 'sum'})
    # does not support prune
    shifted = reader + 1
    graph = TileableGraph([grouped.data, shifted.data])
    next(TileableGraphBuilder(graph).build())
    records = optimize(graph)
    assert records.get_optimization_result(reader.data) is None
    assert records.get_optimization_result(grouped.data) is None
    assert records.get_optimization_result(shifted.data) is None

    reader = md.read_csv(file_path)
    grouped = reader.groupby('c').agg({'a': 'sum'})
    # does not support prune, another rule
    head = reader.head(3)
    graph = TileableGraph([grouped.data, head.data])
    next(TileableGraphBuilder(graph).build())
    records = optimize(graph)
    assert records.get_optimization_result(reader.data) is None
    assert records.get_optimization_result(grouped.data) is None
    assert records.get_optimization_result(head.data) is None

    reader = md.read_csv(file_path)
    everything = reader[reader.dtypes.index.tolist()]
    graph = TileableGraph([everything.data])
    next(TileableGraphBuilder(graph).build())
    # all columns selected
    records = optimize(graph)
    assert records.get_optimization_result(reader.data) is None
    assert records.get_optimization_result(everything.data) is None
async def test_get_tileable_graph(start_test_service):
    """Compare the JSON tileable graph of a task with the locally built graph.

    An unknown task id must raise ``TaskNotExist``. For the submitted task,
    the reported tileable ids and dependencies are checked against the nodes
    and successor edges of the local ``TileableGraph``.
    """
    _sv_pool_address, task_api, storage_api = start_test_service

    def f1():
        return np.arange(5)

    def f2():
        return np.arange(5, 10)

    def f3(f1r, f2r):
        return np.concatenate([f1r, f2r]).sum()

    r1 = mr.spawn(f1)
    r2 = mr.spawn(f2)
    r3 = mr.spawn(f3, args=(r1, r2))

    graph = TileableGraph([r3.data])
    next(TileableGraphBuilder(graph).build())

    task_id = await task_api.submit_tileable_graph(graph, fuse_enabled=False)

    with pytest.raises(TaskNotExist):
        await task_api.get_tileable_graph_as_json('non_exist')

    tileable_detail = await task_api.get_tileable_graph_as_json(task_id)
    reported_tileables = tileable_detail.get('tileables')
    reported_dependencies = tileable_detail.get('dependencies')

    num_tileable = len(reported_tileables)
    num_dependencies = len(reported_dependencies)
    assert num_tileable > 0
    assert num_dependencies <= (num_tileable / 2) * (num_tileable / 2)
    assert (num_tileable == 1 and num_dependencies == 0) or \
        (num_tileable > 1 and num_dependencies > 0)

    # expected node keys and edges, taken from the local graph
    graph_nodes = [node.key for node in graph.iter_nodes()]
    graph_dependencies = [
        {
            'fromTileableId': node.key,
            'toTileableId': node_successor.key,
            'linkType': 0,
        }
        for node in graph.iter_nodes()
        for node_successor in graph.iter_successors(node)
    ]

    # every reported tileable must match one local node, and none may be left
    for tileable in reported_tileables:
        graph_nodes.remove(tileable.get('tileableId'))
    assert len(graph_nodes) == 0

    for i, dependency in enumerate(reported_dependencies):
        assert graph_dependencies[i] == dependency
async def test_optimization(actor_pool):
    """Run a groupby and a getitem on one ``read_csv`` with default submission.

    The graph is submitted without ``fuse_enabled=False``. Both results are
    compared with pandas, then the ref counts of the second result are
    checked. A task error is re-raised with its original traceback.
    """
    pool, session_id, meta_api, lifecycle_api, storage_api, manager = actor_pool

    with tempfile.TemporaryDirectory() as tempdir:
        file_path = os.path.join(tempdir, 'test.csv')

        pdf = pd.DataFrame({
            'a': [3, 4, 5, 3, 5, 4, 1, 2, 3],
            'b': [1, 3, 4, 5, 6, 5, 4, 4, 4],
            'c': list('aabaaddce'),
            'd': list('abaaaddce'),
        })
        pdf.to_csv(file_path, index=False)

        df = md.read_csv(file_path)
        df2 = df.groupby('c').agg({'a': 'sum'})
        df3 = df[['b', 'a']]

        graph = TileableGraph([df2.data, df3.data])
        next(TileableGraphBuilder(graph).build())

        task_id = await manager.submit_tileable_graph(graph)
        assert isinstance(task_id, str)

        await manager.wait_task(task_id)
        task_result: TaskResult = await manager.get_task_result(task_id)

        assert task_result.status == TaskStatus.terminated
        if task_result.error is not None:
            raise task_result.error.with_traceback(task_result.traceback)
        assert await manager.get_task_progress(task_id) == 1.0

        result_tileables = (await manager.get_task_result_tileables(task_id))

        # first result: the groupby aggregation
        expect = pdf.groupby('c').agg({'a': 'sum'})
        result1 = result_tileables[0]
        result = await _merge_data(result1, storage_api)
        np.testing.assert_array_equal(result, expect)

        # second result: the column selection
        expect = pdf[['b', 'a']]
        result2 = result_tileables[1]
        result = await _merge_data(result2, storage_api)
        np.testing.assert_array_equal(result, expect)

        # test ref counts
        assert (await lifecycle_api.get_tileable_ref_counts([df3.key]))[0] == 1
        assert (await lifecycle_api.get_chunk_ref_counts([
            c.key for c in result_tileables[1].chunks
        ])) == [1] * len(result_tileables[1].chunks)
def test_k_means_init_large_n_clusters():
    """No chunk of the ``k-means||`` init graph may exceed the byte limit.

    The limit is twice ``options.chunk_store_limit``; chunks whose ``nbytes``
    is NaN are skipped.
    """
    chunk_bytes_limit = options.chunk_store_limit * 2
    n_cluster = 2000
    x = mt.random.rand(1000_000, 64, chunk_size=250_000)

    centers = _init_centroids(x, n_cluster, init='k-means||')
    tileable_graph = TileableGraph([centers])
    t_graph = next(TileableGraphBuilder(tileable_graph).build())
    chunk_graph = next(ChunkGraphBuilder(t_graph).build())

    for chunk in chunk_graph:
        nbytes = chunk.nbytes
        if np.isnan(nbytes):
            # size not known, nothing to compare
            continue
        assert nbytes <= chunk_bytes_limit
def test_sort_head(prepare_data, setup):
    """Optimize and execute ``head(10)`` after ``sort_values`` and ``sort_index``.

    For each variant the optimization record is checked, then the head is
    executed and compared with the pandas result.
    """
    _, pdf = prepare_data

    # --- sort by values ---
    by_value = md.DataFrame(pdf, chunk_size=20).sort_values(by='b')
    head = by_value.head(10)
    graph = TileableGraph([head.data])
    next(TileableGraphBuilder(graph).build())
    records = optimize(graph)

    assert records.get_optimization_result(by_value.data) is None
    opt_head = records.get_optimization_result(head.data)
    assert opt_head.op.nrows == 10
    assert len(graph) == 2
    assert opt_head in graph.results

    result = head.execute(
        extra_config={'operand_executors': _iloc_operand_executors}).fetch()
    expected = pdf.sort_values(by='b').head(10)
    pd.testing.assert_frame_equal(result, expected)

    # --- sort by index ---
    pdf2 = pdf.copy()
    pdf2.set_index('b', inplace=True)
    by_index = md.DataFrame(pdf2, chunk_size=20).sort_index()
    head = by_index.head(10)
    graph = TileableGraph([head.data])
    next(TileableGraphBuilder(graph).build())
    records = optimize(graph)

    assert records.get_optimization_result(by_index.data) is None
    opt_head = records.get_optimization_result(head.data)
    assert opt_head.op.nrows == 10
    assert len(graph) == 2
    assert opt_head in graph.results

    result = head.execute(
        extra_config={'operand_executors': _iloc_operand_executors}).fetch()
    expected = pdf2.sort_index().head(10)
    pd.testing.assert_frame_equal(result, expected)
def test_value_counts_head(prepare_data, chunk_size):
    """``head(3)`` after ``value_counts`` is recorded on the head node only.

    The optimized graph is expected to contain three nodes.
    """
    _, pdf = prepare_data

    frame = md.DataFrame(pdf, chunk_size=chunk_size)
    counts = frame['a'].value_counts()
    top = counts.head(3)

    graph = TileableGraph([top.data])
    next(TileableGraphBuilder(graph).build())
    records = optimize(graph)

    assert records.get_optimization_result(counts.data) is None
    opt_top = records.get_optimization_result(top.data)
    assert opt_top.op.nrows == 3
    assert len(graph) == 3
    assert opt_top in graph.results
def test_no_head(prepare_data):
    """Cases where no head optimization is recorded.

    Covers a slice that does not start at row 0 (``iloc[1:10]``) and a head
    whose reader also feeds another operation.
    """
    tempdir, pdf = prepare_data
    file_path = os.path.join(tempdir, 'test.csv')
    pdf.to_csv(file_path, index=False)
    # chunk_bytes is set to half of the file size
    size = os.stat(file_path).st_size / 2

    reader = md.read_csv(file_path, chunk_bytes=size)

    sliced = reader.iloc[1:10]
    graph = TileableGraph([sliced.data])
    next(TileableGraphBuilder(graph).build())
    records = optimize(graph)
    assert records.get_optimization_result(reader.data) is None
    assert records.get_optimization_result(sliced.data) is None

    head = reader.head(3)
    shifted = reader + 1
    graph = TileableGraph([head.data, shifted.data])
    next(TileableGraphBuilder(graph).build())
    records = optimize(graph)
    assert records.get_optimization_result(reader.data) is None
    assert records.get_optimization_result(head.data) is None
    assert records.get_optimization_result(shifted.data) is None
def test_cupy():
    """The cupy runtime optimizer produces at least one ``TensorCpFuseChunk``.

    Builds an element-wise GPU tensor expression, tiles it without fusion and
    runs ``CupyRuntimeOptimizer`` on the chunk graph.
    """
    t1 = mt.ones((100, 50), chunk_size=50, gpu=True)
    t2 = mt.ones(50, chunk_size=50, gpu=True)
    t = (t1 - t2) / mt.sqrt(t2 * (1 - t2) * len(t2))

    graph = TileableGraph([t.data])
    next(TileableGraphBuilder(graph).build())

    context = {}
    chunk_graph_builder = ChunkGraphBuilder(
        graph, fuse_enabled=False, tile_context=context)
    chunk_graph = next(chunk_graph_builder.build())

    CupyRuntimeOptimizer(chunk_graph).optimize()

    op_class_names = (n.op.__class__.__name__ for n in chunk_graph)
    assert any(name == 'TensorCpFuseChunk' for name in op_class_names)
async def test_task_error(start_test_service):
    """A task whose remote functions raise reports that exception type.

    Ten remote functions raising ``SystemError`` are submitted; after waiting
    (10 second timeout) the first task result's error must be exactly a
    ``SystemError``.
    """
    _sv_pool_address, task_api, storage_api = start_test_service

    # remote function that always fails
    def f1():
        raise SystemError

    rs = [mr.spawn(f1) for _ in range(10)]

    graph = TileableGraph([r.data for r in rs])
    next(TileableGraphBuilder(graph).build())

    task_id = await task_api.submit_tileable_graph(graph, fuse_enabled=False)
    await task_api.wait_task(task_id, timeout=10)

    results = await task_api.get_task_results(progress=True)
    first_error = results[0].error
    assert type(first_error) is SystemError
def test_read_parquet_head(prepare_data):
    """``head(5)`` over a directory of parquet files collapses to one node.

    The frame is written as three parquet files of 40-row slices, then read
    back as a directory.
    """
    tempdir, pdf = prepare_data
    dirname = os.path.join(tempdir, 'test_parquet')
    os.makedirs(dirname)

    for i in range(3):
        file_path = os.path.join(dirname, f'test{i}.parquet')
        part = pdf[i * 40: (i + 1) * 40]
        part.to_parquet(file_path, index=False)

    reader = md.read_parquet(dirname)
    head = reader.head(5)

    graph = TileableGraph([head.data])
    next(TileableGraphBuilder(graph).build())
    records = optimize(graph)

    assert records.get_optimization_result(reader.data) is None
    opt_head = records.get_optimization_result(head.data)
    assert opt_head.op.nrows == 5
    assert len(graph) == 1
    assert opt_head in graph.results
async def test_error_task(actor_pool):
    """A division by zero under ``errstate(divide='raise')`` fails the task.

    The task must still reach the terminated status and carry a
    ``FloatingPointError``.
    """
    pool, session_id, meta_api, storage_api, manager = actor_pool

    # build the failing expression while divide errors are set to raise
    with mt.errstate(divide='raise'):
        a = mt.ones((10, 10), chunk_size=10)
        c = a / 0

    graph = TileableGraph([c.data])
    next(TileableGraphBuilder(graph).build())

    task_id = await manager.submit_tileable_graph(graph, fuse_enabled=False)
    assert isinstance(task_id, str)
    await manager.wait_task(task_id)

    task_result: TaskResult = await manager.get_task_result(task_id)
    assert task_result.status == TaskStatus.terminated
    assert task_result.error is not None
    assert isinstance(task_result.error, FloatingPointError)
def test_value_counts_head(prepare_data, setup, chunk_size):
    """Optimize and execute ``head(3)`` after ``value_counts(method='tree')``.

    The executed result is compared with pandas'
    ``value_counts().head(3)``.
    """
    _, pdf = prepare_data

    frame = md.DataFrame(pdf, chunk_size=chunk_size)
    counts = frame['a'].value_counts(method='tree')
    top = counts.head(3)

    graph = TileableGraph([top.data])
    next(TileableGraphBuilder(graph).build())
    records = optimize(graph)

    assert records.get_optimization_result(counts.data) is None
    opt_top = records.get_optimization_result(top.data)
    assert opt_top.op.nrows == 3
    assert len(graph) == 3
    assert opt_top in graph.results

    result = top.execute(
        extra_config={'operand_executors': _iloc_operand_executors}).fetch()
    expected = pdf['a'].value_counts().head(3)
    pd.testing.assert_series_equal(result, expected)
def test_groupby_prune_read_sql(gen_data2):
    """Optimizing a groupby over ``read_sql_table`` restricts the reader's columns.

    Grouping by and counting column ``a`` must leave the optimized reader
    with ``op.columns == ['a']``.
    """
    pdf, tempdir = gen_data2
    uri = 'sqlite:///' + os.path.join(tempdir, 'test.db')
    table_name = 'test'
    pdf.to_sql(table_name, uri, index=False)

    # test read df with columns
    reader = md.read_sql_table(table_name, uri, chunk_size=4)
    counted = reader.groupby('a', as_index=False).a.agg({'cnt': 'count'})

    graph = TileableGraph([counted.data])
    next(TileableGraphBuilder(graph).build())
    records = optimize(graph)

    opt_reader = records.get_optimization_result(reader.data)
    assert opt_reader is not None
    opt_counted = records.get_optimization_result(counted.data)
    assert opt_counted is not None
    assert opt_reader.op.columns == ['a']

    # original tileable should not be modified
    assert counted.inputs[0] is reader.data
async def test_shuffle(actor_pool):
    """Run a fancy-index task, then check the data, ref counts and cleanup.

    After the result tileable is decref'ed, no chunk ref counts may remain
    and the MEMORY storage level must list nothing.
    """
    pool, session_id, meta_api, lifecycle_api, storage_api, manager = actor_pool

    rng = np.random.RandomState(0)
    raw = rng.rand(10, 10)
    raw2 = rng.randint(10, size=(10, ))
    expect = raw[raw2]

    a = mt.tensor(raw, chunk_size=5)
    b = mt.tensor(raw2, chunk_size=5)
    indexed = a[b]

    graph = TileableGraph([indexed.data])
    next(TileableGraphBuilder(graph).build())

    task_id = await manager.submit_tileable_graph(graph, fuse_enabled=False)
    assert isinstance(task_id, str)
    await manager.wait_task(task_id)

    task_result: TaskResult = await manager.get_task_result(task_id)
    assert task_result.status == TaskStatus.terminated
    if task_result.error is not None:
        raise task_result.error.with_traceback(task_result.traceback)
    assert await manager.get_task_progress(task_id) == 1.0

    result_tileable = (await manager.get_task_result_tileables(task_id))[0]
    merged = await _merge_data(result_tileable, storage_api)
    np.testing.assert_array_equal(merged, expect)

    # test ref counts
    tileable_ref_counts = await lifecycle_api.get_tileable_ref_counts(
        [indexed.key])
    assert tileable_ref_counts[0] == 1
    chunk_keys = [chunk.key for chunk in result_tileable.chunks]
    chunk_ref_counts = await lifecycle_api.get_chunk_ref_counts(chunk_keys)
    assert chunk_ref_counts == [1] * len(result_tileable.chunks)

    await lifecycle_api.decref_tileables([indexed.key])
    ref_counts = await lifecycle_api.get_all_chunk_ref_counts()
    assert len(ref_counts) == 0

    # test if exists in storage
    from mars.storage import StorageLevel
    stored = await storage_api.list(level=StorageLevel.MEMORY)
    assert len(stored) == 0
def test_read_csv_head(gen_data1):
    """Optimize ``head(5)`` over ``read_csv`` at the chunk-graph level.

    The reader chunk has no optimization record; the head chunk has one with
    ``nrows == 5`` and is the only node left in the chunk graph.
    """
    pdf, tempdir = gen_data1
    file_path = os.path.join(tempdir, 'test.csv')
    pdf.to_csv(file_path)

    reader = md.read_csv(file_path)
    head = reader.head(5)

    graph = TileableGraph([head.data])
    next(TileableGraphBuilder(graph).build())

    # the tile context maps each tileable to its tiled counterpart
    context = {}
    chunk_graph_builder = ChunkGraphBuilder(
        graph, fuse_enabled=False, tile_context=context)
    chunk_graph = next(chunk_graph_builder.build())

    reader_chunk = context[reader.data].chunks[0].data
    head_chunk = context[head.data].chunks[0].data

    records = optimize(chunk_graph)

    assert records.get_optimization_result(reader_chunk) is None
    opt_head_chunk = records.get_optimization_result(head_chunk)
    assert opt_head_chunk.op.nrows == 5
    assert len(chunk_graph) == 1
    assert opt_head_chunk in chunk_graph.results
async def test_cancel_task(actor_pool):
    """Cancel a task made of sleeping remote functions.

    The cancel must leave the task terminated and finish in under 15
    seconds, which is less than the 20 seconds each function sleeps.
    """
    pool, session_id, meta_api, storage_api, manager = actor_pool

    def func():
        time.sleep(20)

    remotes = [mr.spawn(func) for _ in range(10)]

    graph = TileableGraph([r.data for r in remotes])
    next(TileableGraphBuilder(graph).build())

    task_id = await manager.submit_tileable_graph(graph, fuse_enabled=False)
    assert isinstance(task_id, str)

    # give the task a moment to start before cancelling
    await asyncio.sleep(.5)

    with Timer() as timer:
        await manager.cancel_task(task_id)
        result = await manager.get_task_result(task_id)
        assert result.status == TaskStatus.terminated

    assert timer.duration < 15
def test_read_parquet_head(prepare_data, setup):
    """Optimize and execute ``head(5)`` over a directory of parquet files.

    The frame is written as three parquet files of 40-row slices; the
    executed head is compared with ``pdf.head(5)``.
    """
    tempdir, pdf = prepare_data
    dirname = os.path.join(tempdir, 'test_parquet')
    os.makedirs(dirname)

    for i in range(3):
        file_path = os.path.join(dirname, f'test{i}.parquet')
        part = pdf[i * 40: (i + 1) * 40]
        part.to_parquet(file_path, index=False)

    reader = md.read_parquet(dirname)
    head = reader.head(5)

    graph = TileableGraph([head.data])
    next(TileableGraphBuilder(graph).build())
    records = optimize(graph)

    assert records.get_optimization_result(reader.data) is None
    opt_head = records.get_optimization_result(head.data)
    assert opt_head.op.nrows == 5
    assert len(graph) == 1
    assert opt_head in graph.results

    result = head.execute(
        extra_config={'operand_executors': _iloc_operand_executors}).fetch()
    expected = pdf.head(5)
    pd.testing.assert_frame_equal(result, expected)
def test_groupby_and_getitem(gen_data1):
    """A groupby and a getitem on one ``read_csv`` share a pruned reader.

    The optimized reader's ``op.usecols`` must be ``['a', 'b', 'c']`` and it
    must be a predecessor of both optimized consumers.
    """
    pdf, tempdir = gen_data1
    file_path = os.path.join(tempdir, 'test.csv')
    pdf.to_csv(file_path)

    reader = md.read_csv(file_path)
    grouped = reader.groupby('c').agg({'a': 'sum'})
    selected = reader[['b', 'a']]

    graph = TileableGraph([grouped.data, selected.data])
    next(TileableGraphBuilder(graph).build())
    records = optimize(graph)

    opt_reader = records.get_optimization_result(reader.data)
    assert opt_reader is not None

    opt_grouped = records.get_optimization_result(grouped.data)
    assert opt_grouped is not None
    assert opt_reader in graph.predecessors(opt_grouped)

    opt_selected = records.get_optimization_result(selected.data)
    assert opt_selected is not None
    assert opt_reader in graph.predecessors(opt_selected)

    assert opt_reader.op.usecols == ['a', 'b', 'c']

    # original tileable should not be modified
    assert grouped.inputs[0] is reader.data
    assert selected.inputs[0] is reader.data