def test_groupby_prune_read_parquet(gen_data1):
    """Check the optimization result of ``read_parquet`` followed by groupby.

    For each groupby expression, both the data source and the groupby
    tileable must have an optimization result, and the optimized source
    op's ``columns`` must equal the expected column list.
    """
    frame, workdir = gen_data1
    parquet_path = os.path.join(workdir, 'test.parquet')
    frame.to_parquet(parquet_path)

    source = md.read_parquet(parquet_path)

    # groupby on 'c', aggregating 'a'
    summed = source.groupby('c').agg({'a': 'sum'})
    graph = TileableGraph([summed.data])
    next(TileableGraphBuilder(graph).build())
    records = optimize(graph)

    opt_source = records.get_optimization_result(source.data)
    assert opt_source is not None
    opt_summed = records.get_optimization_result(summed.data)
    assert opt_summed is not None
    assert opt_source.op.columns == ['a', 'c']
    # original tileable should not be modified
    assert summed.inputs[0] is source.data

    # groupby on 'c', counting the grouping column itself
    counted = source.groupby('c', as_index=False).c.agg({'cnt': 'count'})
    graph = TileableGraph([counted.data])
    next(TileableGraphBuilder(graph).build())
    records = optimize(graph)

    opt_source = records.get_optimization_result(source.data)
    assert opt_source is not None
    opt_counted = records.get_optimization_result(counted.data)
    assert opt_counted is not None
    assert opt_source.op.columns == ['c']
def test_sort_head(prepare_data):
    """Check the optimization result of ``head`` applied to a sorted frame.

    Covers ``sort_values`` and ``sort_index``. In both cases the sorted
    tileable has no optimization result, while the ``head`` tileable is
    replaced by one whose op carries ``nrows == 10``.

    NOTE(review): a second function with this exact name appears later in
    the visible source; if both live in the same module, the later
    definition replaces this one and this test never runs -- confirm.
    """
    _, pdf = prepare_data

    def _check_head_of(sorted_df):
        # Build and optimize the graph whose only result is head(10).
        top10 = sorted_df.head(10)
        graph = TileableGraph([top10.data])
        next(TileableGraphBuilder(graph).build())
        records = optimize(graph)

        assert records.get_optimization_result(sorted_df.data) is None
        opt_top10 = records.get_optimization_result(top10.data)
        assert opt_top10.op.nrows == 10
        assert len(graph) == 2
        assert opt_top10 in graph.results

    # sort by values of column 'b'
    by_value = md.DataFrame(pdf, chunk_size=20).sort_values(by='b')
    _check_head_of(by_value)

    # sort by index, with 'b' moved into the index
    indexed_pdf = pdf.copy()
    indexed_pdf.set_index('b', inplace=True)
    by_index = md.DataFrame(indexed_pdf, chunk_size=20).sort_index()
    _check_head_of(by_index)
def test_read_csv_head(prepare_data, setup):
    """Check how ``head`` on top of ``read_csv`` is optimized, and execute it.

    Scenarios, in order: a single ``head`` (also executed and compared with
    pandas), two ``head`` calls sharing one source, and a ``head`` that has
    a successor.

    NOTE(review): a second function with this exact name appears later in
    the visible source; if both live in the same module, the later
    definition replaces this one and this test never runs -- confirm.
    """
    tempdir, pdf = prepare_data
    csv_path = os.path.join(tempdir, 'test.csv')
    pdf.to_csv(csv_path, index=False)
    # chunk_bytes is set to half of the file size
    half_size = os.path.getsize(csv_path) / 2

    reader = md.read_csv(csv_path, chunk_bytes=half_size)
    head5 = reader.head(5)
    graph = TileableGraph([head5.data])
    next(TileableGraphBuilder(graph).build())
    records = optimize(graph)

    assert records.get_optimization_result(reader.data) is None
    opt_head5 = records.get_optimization_result(head5.data)
    assert opt_head5.op.nrows == 5
    assert len(graph) == 1
    assert opt_head5 in graph.results

    executor_config = {
        'operand_executors': _iloc_operand_executors
    }
    fetched = head5.execute(extra_config=executor_config).fetch()
    pd.testing.assert_frame_equal(fetched, pdf.head(5))

    # test multiple head
    head10 = reader.head(10)
    graph = TileableGraph([head5.data, head10.data])
    next(TileableGraphBuilder(graph).build())
    records = optimize(graph)

    opt_reader = records.get_optimization_result(reader.data)
    assert opt_reader is not None
    assert opt_reader.op.nrows == 10
    for head in (head5, head10):
        opt_head = records.get_optimization_result(head.data)
        assert opt_head is not None
        assert graph.predecessors(opt_head)[0] is opt_reader
        assert opt_head.inputs[0] is opt_reader

    # test head with successor
    reader = md.read_csv(csv_path, chunk_bytes=half_size)
    head5 = reader.head(5)
    plus_one = head5 + 1
    graph = TileableGraph([plus_one.data])
    next(TileableGraphBuilder(graph).build())
    records = optimize(graph)

    assert records.get_optimization_result(reader.data) is None
    opt_head5 = records.get_optimization_result(head5.data)
    assert opt_head5.op.nrows == 5
    assert len(graph) == 2
def test_getitem_prune_read_parquet(gen_data1):
    """Check the optimization result of ``read_parquet`` with two getitems.

    One column is selected via attribute access (``df1.c``) and another via
    a list (``df1[['a']]``). All three tileables must have an optimization
    result, both optimized getitems must consume the optimized source, and
    the optimized source op's ``columns`` must be ``['a', 'c']``.
    """
    pdf, tempdir = gen_data1
    file_path = os.path.join(tempdir, 'test.parquet')
    pdf.to_parquet(file_path)

    df1 = md.read_parquet(file_path)
    df2 = df1.c
    df3 = df1[['a']]
    graph = TileableGraph([df2.data, df3.data])
    next(TileableGraphBuilder(graph).build())
    records = optimize(graph)

    opt_df1 = records.get_optimization_result(df1.data)
    assert opt_df1 is not None
    opt_df2 = records.get_optimization_result(df2.data)
    assert opt_df2 is not None
    opt_df3 = records.get_optimization_result(df3.data)
    assert opt_df3 is not None

    # The optimized source must be wired in both as a graph predecessor
    # and as an input of each optimized getitem. (The original repeated
    # the two df3 assertions verbatim; each is now stated once.)
    assert opt_df1 in graph.predecessors(opt_df2)
    assert opt_df1 in opt_df2.inputs
    assert opt_df1 in graph.predecessors(opt_df3)
    assert opt_df1 in opt_df3.inputs
    assert opt_df1.op.columns == ['a', 'c']

    # original tileable should not be modified
    assert df2.inputs[0] is df1.data
    assert df3.inputs[0] is df1.data
def test_read_csv_head(prepare_data):
    """Check how ``head`` on top of ``read_csv`` is optimized.

    Scenarios, in order: a single ``head``, two ``head`` calls sharing one
    source, and a ``head`` that has a successor. Nothing is executed; only
    the optimized graph is inspected.
    """
    tempdir, pdf = prepare_data
    csv_file = os.path.join(tempdir, 'test.csv')
    pdf.to_csv(csv_file, index=False)
    # chunk_bytes is set to half of the file size
    chunk_bytes = os.path.getsize(csv_file) / 2

    source = md.read_csv(csv_file, chunk_bytes=chunk_bytes)
    first5 = source.head(5)
    graph = TileableGraph([first5.data])
    next(TileableGraphBuilder(graph).build())
    records = optimize(graph)

    assert records.get_optimization_result(source.data) is None
    opt_first5 = records.get_optimization_result(first5.data)
    assert opt_first5.op.nrows == 5
    assert len(graph) == 1
    assert opt_first5 in graph.results

    # test multiple head
    first10 = source.head(10)
    graph = TileableGraph([first5.data, first10.data])
    next(TileableGraphBuilder(graph).build())
    records = optimize(graph)

    opt_source = records.get_optimization_result(source.data)
    assert opt_source is not None
    assert opt_source.op.nrows == 10

    opt_first5 = records.get_optimization_result(first5.data)
    assert opt_first5 is not None
    assert graph.predecessors(opt_first5)[0] is opt_source
    assert opt_first5.inputs[0] is opt_source

    opt_first10 = records.get_optimization_result(first10.data)
    assert opt_first10 is not None
    assert graph.predecessors(opt_first10)[0] is opt_source
    assert opt_first10.inputs[0] is opt_source

    # test head with successor
    source = md.read_csv(csv_file, chunk_bytes=chunk_bytes)
    first5 = source.head(5)
    successor = first5 + 1
    graph = TileableGraph([successor.data])
    next(TileableGraphBuilder(graph).build())
    records = optimize(graph)

    assert records.get_optimization_result(source.data) is None
    opt_first5 = records.get_optimization_result(first5.data)
    assert opt_first5.op.nrows == 5
    assert len(graph) == 2
def test_cannot_prune(gen_data1):
    """Check graphs for which the optimizer records no optimization result.

    Three graphs are built on ``read_csv``: a groupby next to ``df + 1``,
    a groupby next to ``head(3)``, and a getitem that selects every column.
    In each case no tileable may have an optimization result.
    """
    pdf, tempdir = gen_data1
    csv_path = os.path.join(tempdir, 'test.csv')
    pdf.to_csv(csv_path)

    source = md.read_csv(csv_path)
    grouped = source.groupby('c').agg({'a': 'sum'})
    # does not support prune
    added = source + 1
    graph = TileableGraph([grouped.data, added.data])
    next(TileableGraphBuilder(graph).build())
    records = optimize(graph)
    for tileable in (source, grouped, added):
        assert records.get_optimization_result(tileable.data) is None

    source = md.read_csv(csv_path)
    grouped = source.groupby('c').agg({'a': 'sum'})
    # does not support prune, another rule
    top3 = source.head(3)
    graph = TileableGraph([grouped.data, top3.data])
    next(TileableGraphBuilder(graph).build())
    records = optimize(graph)
    for tileable in (source, grouped, top3):
        assert records.get_optimization_result(tileable.data) is None

    source = md.read_csv(csv_path)
    every_column = source[source.dtypes.index.tolist()]
    graph = TileableGraph([every_column.data])
    next(TileableGraphBuilder(graph).build())
    # all columns selected
    records = optimize(graph)
    for tileable in (source, every_column):
        assert records.get_optimization_result(tileable.data) is None
def test_sort_head(prepare_data, setup):
    """Check ``head`` on a sorted frame: optimized graph and executed result.

    Covers ``sort_values`` and ``sort_index``. Each case asserts the shape
    of the optimized graph, then executes the ``head`` tileable with the
    ``_iloc_operand_executors`` override and compares against pandas.
    """
    _, pdf = prepare_data
    run_config = {
        'operand_executors': _iloc_operand_executors
    }

    # sort by values of column 'b'
    by_value = md.DataFrame(pdf, chunk_size=20).sort_values(by='b')
    top10 = by_value.head(10)
    graph = TileableGraph([top10.data])
    next(TileableGraphBuilder(graph).build())
    records = optimize(graph)

    assert records.get_optimization_result(by_value.data) is None
    opt_top10 = records.get_optimization_result(top10.data)
    assert opt_top10.op.nrows == 10
    assert len(graph) == 2
    assert opt_top10 in graph.results

    fetched = top10.execute(extra_config=run_config).fetch()
    pd.testing.assert_frame_equal(
        fetched, pdf.sort_values(by='b').head(10))

    # sort by index, with 'b' moved into the index of a separate frame
    indexed_pdf = pdf.set_index('b')
    by_index = md.DataFrame(indexed_pdf, chunk_size=20).sort_index()
    top10 = by_index.head(10)
    graph = TileableGraph([top10.data])
    next(TileableGraphBuilder(graph).build())
    records = optimize(graph)

    assert records.get_optimization_result(by_index.data) is None
    opt_top10 = records.get_optimization_result(top10.data)
    assert opt_top10.op.nrows == 10
    assert len(graph) == 2
    assert opt_top10 in graph.results

    fetched = top10.execute(extra_config=run_config).fetch()
    pd.testing.assert_frame_equal(
        fetched, indexed_pdf.sort_index().head(10))
def test_value_counts_head(prepare_data, chunk_size):
    """Check the optimization result of ``value_counts`` followed by ``head``.

    The ``value_counts`` tileable has no optimization result, while the
    ``head`` tileable is replaced by one whose op carries ``nrows == 3``.

    NOTE(review): a second function with this exact name appears later in
    the visible source; if both live in the same module, the later
    definition replaces this one and this test never runs -- confirm.
    """
    _, pdf = prepare_data

    frame = md.DataFrame(pdf, chunk_size=chunk_size)
    counts = frame['a'].value_counts()
    top3 = counts.head(3)

    graph = TileableGraph([top3.data])
    next(TileableGraphBuilder(graph).build())
    records = optimize(graph)

    assert records.get_optimization_result(counts.data) is None
    opt_top3 = records.get_optimization_result(top3.data)
    assert opt_top3.op.nrows == 3
    assert len(graph) == 3
    assert opt_top3 in graph.results
def test_no_head(prepare_data):
    """Check graphs on ``read_csv`` that get no head-related optimization.

    Two cases: an ``iloc[1:10]`` slice, and a ``head(3)`` whose source is
    also consumed by ``df + 1``. No tileable may have an optimization
    result in either case.
    """
    tempdir, pdf = prepare_data
    csv_path = os.path.join(tempdir, 'test.csv')
    pdf.to_csv(csv_path, index=False)
    # chunk_bytes is set to half of the file size
    half_size = os.path.getsize(csv_path) / 2

    source = md.read_csv(csv_path, chunk_bytes=half_size)

    sliced = source.iloc[1:10]
    graph = TileableGraph([sliced.data])
    next(TileableGraphBuilder(graph).build())
    records = optimize(graph)
    for tileable in (source, sliced):
        assert records.get_optimization_result(tileable.data) is None

    top3 = source.head(3)
    added = source + 1
    graph = TileableGraph([top3.data, added.data])
    next(TileableGraphBuilder(graph).build())
    records = optimize(graph)
    for tileable in (source, top3, added):
        assert records.get_optimization_result(tileable.data) is None
def test_read_parquet_head(prepare_data):
    """Check the optimization result of ``head`` on a parquet directory.

    The fixture frame is written as three parquet files of 40-row slices.
    After optimization the source has no optimization result of its own,
    and the ``head`` tileable is replaced by one whose op carries
    ``nrows == 5``.

    NOTE(review): a second function with this exact name appears later in
    the visible source; if both live in the same module, the later
    definition replaces this one and this test never runs -- confirm.
    """
    tempdir, pdf = prepare_data
    dirname = os.path.join(tempdir, 'test_parquet')
    os.makedirs(dirname)

    rows_per_file = 40
    for part in range(3):
        begin = part * rows_per_file
        part_path = os.path.join(dirname, f'test{part}.parquet')
        pdf[begin:begin + rows_per_file].to_parquet(part_path, index=False)

    source = md.read_parquet(dirname)
    top5 = source.head(5)
    graph = TileableGraph([top5.data])
    next(TileableGraphBuilder(graph).build())
    records = optimize(graph)

    assert records.get_optimization_result(source.data) is None
    opt_top5 = records.get_optimization_result(top5.data)
    assert opt_top5.op.nrows == 5
    assert len(graph) == 1
    assert opt_top5 in graph.results
def test_groupby_prune_read_sql(gen_data2):
    """Check the optimization result of ``read_sql_table`` with a groupby.

    The fixture frame is stored in a SQLite table, read back, and grouped
    on column 'a'. Both tileables must have an optimization result and the
    optimized source op's ``columns`` must be ``['a']``.
    """
    pdf, tempdir = gen_data2
    db_path = os.path.join(tempdir, 'test.db')
    uri = f'sqlite:///{db_path}'
    table_name = 'test'
    pdf.to_sql(table_name, uri, index=False)

    # test read df with columns
    source = md.read_sql_table('test', uri, chunk_size=4)
    counted = source.groupby('a', as_index=False).a.agg({'cnt': 'count'})
    graph = TileableGraph([counted.data])
    next(TileableGraphBuilder(graph).build())
    records = optimize(graph)

    opt_source = records.get_optimization_result(source.data)
    assert opt_source is not None
    opt_counted = records.get_optimization_result(counted.data)
    assert opt_counted is not None
    assert opt_source.op.columns == ['a']
    # original tileable should not be modified
    assert counted.inputs[0] is source.data
def test_value_counts_head(prepare_data, setup, chunk_size):
    """Check ``value_counts(method='tree')`` + ``head``: graph and result.

    Asserts the shape of the optimized graph, then executes the ``head``
    tileable with the ``_iloc_operand_executors`` override and compares the
    fetched series against the pandas equivalent.
    """
    _, pdf = prepare_data

    frame = md.DataFrame(pdf, chunk_size=chunk_size)
    counts = frame['a'].value_counts(method='tree')
    top3 = counts.head(3)

    graph = TileableGraph([top3.data])
    next(TileableGraphBuilder(graph).build())
    records = optimize(graph)

    assert records.get_optimization_result(counts.data) is None
    opt_top3 = records.get_optimization_result(top3.data)
    assert opt_top3.op.nrows == 3
    assert len(graph) == 3
    assert opt_top3 in graph.results

    run_config = {'operand_executors': _iloc_operand_executors}
    fetched = top3.execute(extra_config=run_config).fetch()
    pd.testing.assert_series_equal(fetched, pdf['a'].value_counts().head(3))
def test_groupby_and_getitem(gen_data1):
    """Check ``read_csv`` consumed by both a groupby and a getitem.

    All three tileables must have an optimization result, the optimized
    source must precede both optimized consumers in the graph, and the
    optimized source op's ``usecols`` must be ``['a', 'b', 'c']``.
    """
    pdf, tempdir = gen_data1
    csv_path = os.path.join(tempdir, 'test.csv')
    pdf.to_csv(csv_path)

    source = md.read_csv(csv_path)
    grouped = source.groupby('c').agg({'a': 'sum'})
    selected = source[['b', 'a']]
    consumers = (grouped, selected)

    graph = TileableGraph([grouped.data, selected.data])
    next(TileableGraphBuilder(graph).build())
    records = optimize(graph)

    opt_source = records.get_optimization_result(source.data)
    assert opt_source is not None
    for consumer in consumers:
        opt_consumer = records.get_optimization_result(consumer.data)
        assert opt_consumer is not None
        assert opt_source in graph.predecessors(opt_consumer)
    assert opt_source.op.usecols == ['a', 'b', 'c']

    # original tileable should not be modified
    for consumer in consumers:
        assert consumer.inputs[0] is source.data
def test_read_parquet_head(prepare_data, setup):
    """Check ``head`` on a parquet directory: optimized graph and result.

    The fixture frame is written as three parquet files of 40-row slices.
    After asserting the shape of the optimized graph, the ``head`` tileable
    is executed with the ``_iloc_operand_executors`` override and compared
    with ``pdf.head(5)``.
    """
    tempdir, pdf = prepare_data
    dirname = os.path.join(tempdir, 'test_parquet')
    os.makedirs(dirname)
    for idx, start in enumerate(range(0, 120, 40)):
        target = os.path.join(dirname, f'test{idx}.parquet')
        pdf[start:start + 40].to_parquet(target, index=False)

    source = md.read_parquet(dirname)
    top5 = source.head(5)
    graph = TileableGraph([top5.data])
    next(TileableGraphBuilder(graph).build())
    records = optimize(graph)

    assert records.get_optimization_result(source.data) is None
    opt_top5 = records.get_optimization_result(top5.data)
    assert opt_top5.op.nrows == 5
    assert len(graph) == 1
    assert opt_top5 in graph.results

    run_config = {'operand_executors': _iloc_operand_executors}
    fetched = top5.execute(extra_config=run_config).fetch()
    pd.testing.assert_frame_equal(fetched, pdf.head(5))
def test_groupby_read_csv(gen_data1):
    """Check the optimization results of groupby expressions over ``read_csv``.

    Scenarios, in order: a groupby with a successor, a source created with
    an explicit ``usecols``, two groupbys sharing one source, and the same
    two groupbys with the source itself among the graph results (where no
    optimization result is expected for any tileable).
    """
    pdf, tempdir = gen_data1
    csv_path = os.path.join(tempdir, 'test.csv')
    pdf.to_csv(csv_path)

    # groupby followed by another operation
    source = md.read_csv(csv_path)
    grouped = source.groupby('c').agg({'a': 'sum'})
    successor = grouped + 1
    graph = TileableGraph([successor.data])
    next(TileableGraphBuilder(graph).build())
    records = optimize(graph)

    opt_source = records.get_optimization_result(source.data)
    assert opt_source is not None
    opt_grouped = records.get_optimization_result(grouped.data)
    assert opt_grouped is not None
    opt_successor = records.get_optimization_result(successor.data)
    assert opt_successor is None
    assert opt_source in graph.predecessors(opt_grouped)
    assert opt_source in opt_grouped.inputs
    assert opt_source.op.usecols == ['a', 'c']
    assert opt_grouped in graph.predecessors(successor.data)
    assert opt_grouped in successor.inputs

    # source already created with usecols
    narrowed = md.read_csv(csv_path, usecols=['a', 'b', 'c'])
    narrowed_grouped = narrowed.groupby('c').agg({'b': 'sum'})
    graph = TileableGraph([narrowed_grouped.data])
    next(TileableGraphBuilder(graph).build())
    records = optimize(graph)

    opt_narrowed = records.get_optimization_result(narrowed.data)
    assert opt_narrowed is not None
    opt_narrowed_grouped = records.get_optimization_result(
        narrowed_grouped.data)
    assert opt_narrowed_grouped is not None
    assert opt_narrowed.op.usecols == ['b', 'c']

    # two groupbys sharing one source
    shared = md.read_csv(csv_path)
    by_c = shared.groupby('c').agg({'b': 'sum'})
    by_b = shared.groupby('b').agg({'a': 'sum'})
    graph = TileableGraph([by_c.data, by_b.data])
    next(TileableGraphBuilder(graph).build())
    records = optimize(graph)

    opt_shared = records.get_optimization_result(shared.data)
    assert opt_shared is not None
    opt_by_c = records.get_optimization_result(by_c.data)
    assert opt_by_c is not None
    opt_by_b = records.get_optimization_result(by_b.data)
    assert opt_by_b is not None
    assert opt_shared.op.usecols == ['a', 'b', 'c']
    # original tileable should not be modified
    assert by_c.inputs[0] is shared.data
    assert by_b.inputs[0] is shared.data

    # test data source in result tileables
    graph = TileableGraph([shared.data, by_c.data, by_b.data])
    next(TileableGraphBuilder(graph).build())
    records = optimize(graph)
    for tileable in (shared, by_c, by_b):
        assert records.get_optimization_result(tileable.data) is None