Esempio n. 1
0
def test_groupby_prune_read_parquet(gen_data1):
    """Check the columns kept by ``read_parquet`` when a groupby follows it.

    Two pipelines are optimized from the same source: a sum of ``a`` grouped
    by ``c`` (the optimized source op must list columns ``['a', 'c']``) and a
    count of ``c`` grouped by ``c`` (the optimized source op must list
    ``['c']``).
    """
    pdf, tempdir = gen_data1
    parquet_path = os.path.join(tempdir, 'test.parquet')
    pdf.to_parquet(parquet_path)

    def run_optimizer(tileable):
        # Build the tileable graph that ends at ``tileable`` and optimize it.
        graph = TileableGraph([tileable.data])
        next(TileableGraphBuilder(graph).build())
        return optimize(graph)

    source = md.read_parquet(parquet_path)

    summed = source.groupby('c').agg({'a': 'sum'})
    records = run_optimizer(summed)
    opt_source = records.get_optimization_result(source.data)
    assert opt_source is not None
    opt_summed = records.get_optimization_result(summed.data)
    assert opt_summed is not None
    assert opt_source.op.columns == ['a', 'c']
    # the user-facing tileable must still point at the unoptimized source
    assert summed.inputs[0] is source.data

    counted = source.groupby('c', as_index=False).c.agg({'cnt': 'count'})
    records = run_optimizer(counted)
    opt_source = records.get_optimization_result(source.data)
    assert opt_source is not None
    opt_counted = records.get_optimization_result(counted.data)
    assert opt_counted is not None
    assert opt_source.op.columns == ['c']
Esempio n. 2
0
def test_sort_head(prepare_data):
    """Optimize ``head(10)`` applied after ``sort_values`` and ``sort_index``.

    In both cases the sorted tileable must have no optimization record, the
    optimized head op must carry ``nrows == 10``, the graph must hold two
    nodes, and the optimized head must be among the graph results.
    """
    _, pdf = prepare_data

    def check_head_after_sort(sorted_df):
        head_df = sorted_df.head(10)
        graph = TileableGraph([head_df.data])
        next(TileableGraphBuilder(graph).build())
        records = optimize(graph)
        assert records.get_optimization_result(sorted_df.data) is None
        opt_head = records.get_optimization_result(head_df.data)
        assert opt_head.op.nrows == 10
        assert len(graph) == 2
        assert opt_head in graph.results

    # sort by a column's values
    by_values = md.DataFrame(pdf, chunk_size=20)
    by_values = by_values.sort_values(by='b')
    check_head_after_sort(by_values)

    # sort by index, using column 'b' as the index
    indexed_pdf = pdf.copy()
    indexed_pdf.set_index('b', inplace=True)
    by_index = md.DataFrame(indexed_pdf, chunk_size=20)
    by_index = by_index.sort_index()
    check_head_after_sort(by_index)
Esempio n. 3
0
def test_read_csv_head(prepare_data, setup):
    """Optimize and execute ``head`` on top of ``read_csv``.

    Covers three situations: a single ``head(5)`` (graph shrinks to one node
    and the executed result equals ``pdf.head(5)``), two heads sharing one
    reader (the reader is optimized with ``nrows == 10`` and feeds both
    heads), and a head that has a successor (graph holds two nodes).
    """
    tempdir, pdf = prepare_data
    csv_path = os.path.join(tempdir, 'test.csv')
    pdf.to_csv(csv_path, index=False)
    # chunk_bytes is set to half of the written file's size
    half_size = os.stat(csv_path).st_size / 2

    def build_and_optimize(*tileables):
        # Returns the built graph together with the optimization records.
        graph = TileableGraph([t.data for t in tileables])
        next(TileableGraphBuilder(graph).build())
        return graph, optimize(graph)

    reader = md.read_csv(csv_path, chunk_bytes=half_size)
    head5 = reader.head(5)
    graph, records = build_and_optimize(head5)
    assert records.get_optimization_result(reader.data) is None
    opt_head5 = records.get_optimization_result(head5.data)
    assert opt_head5.op.nrows == 5
    assert len(graph) == 1
    assert opt_head5 in graph.results

    executed = head5.execute(
        extra_config={'operand_executors': _iloc_operand_executors})
    pd.testing.assert_frame_equal(executed.fetch(), pdf.head(5))

    # test multiple head
    head10 = reader.head(10)
    graph, records = build_and_optimize(head5, head10)
    opt_reader = records.get_optimization_result(reader.data)
    assert opt_reader is not None
    assert opt_reader.op.nrows == 10
    for head in (head5, head10):
        opt_head = records.get_optimization_result(head.data)
        assert opt_head is not None
        assert graph.predecessors(opt_head)[0] is opt_reader
        assert opt_head.inputs[0] is opt_reader

    # test head with successor
    reader = md.read_csv(csv_path, chunk_bytes=half_size)
    head5 = reader.head(5)
    plus_one = head5 + 1
    graph, records = build_and_optimize(plus_one)
    assert records.get_optimization_result(reader.data) is None
    opt_head5 = records.get_optimization_result(head5.data)
    assert opt_head5.op.nrows == 5
    assert len(graph) == 2
Esempio n. 4
0
def test_getitem_prune_read_parquet(gen_data1):
    """Check column selection on ``read_parquet`` driven by two getitem consumers.

    One consumer selects column ``c`` and the other selects ``['a']``; after
    optimization the source op must list columns ``['a', 'c']`` and the
    optimized source must feed both optimized consumers, while the original
    tileables keep pointing at the original source.
    """
    pdf, tempdir = gen_data1
    file_path = os.path.join(tempdir, 'test.parquet')
    pdf.to_parquet(file_path)

    df1 = md.read_parquet(file_path)
    df2 = df1.c
    df3 = df1[['a']]
    graph = TileableGraph([df2.data, df3.data])
    next(TileableGraphBuilder(graph).build())
    records = optimize(graph)

    opt_df1 = records.get_optimization_result(df1.data)
    assert opt_df1 is not None
    assert opt_df1.op.columns == ['a', 'c']

    # Each consumer is checked exactly once; the original repeated the
    # predecessor/input assertions for df3 a second time.
    for consumer in (df2, df3):
        opt_consumer = records.get_optimization_result(consumer.data)
        assert opt_consumer is not None
        assert opt_df1 in graph.predecessors(opt_consumer)
        assert opt_df1 in opt_consumer.inputs
        # original tileable should not be modified
        assert consumer.inputs[0] is df1.data
Esempio n. 5
0
def test_read_csv_head(prepare_data):
    """Optimize ``head`` on top of ``read_csv`` without executing anything.

    Covers a single ``head(5)`` (graph shrinks to one node), two heads that
    share one reader (the reader is optimized with ``nrows == 10`` and is the
    first predecessor/input of both heads), and a head with a successor
    (graph holds two nodes).
    """
    tempdir, pdf = prepare_data
    csv_file = os.path.join(tempdir, 'test.csv')
    pdf.to_csv(csv_file, index=False)
    # chunk_bytes is set to half of the written file's size
    chunk_bytes = os.stat(csv_file).st_size / 2

    def optimized_graph(result_tileables):
        graph = TileableGraph([t.data for t in result_tileables])
        next(TileableGraphBuilder(graph).build())
        return graph, optimize(graph)

    csv_df = md.read_csv(csv_file, chunk_bytes=chunk_bytes)
    first_five = csv_df.head(5)
    graph, records = optimized_graph([first_five])
    assert records.get_optimization_result(csv_df.data) is None
    opt_first_five = records.get_optimization_result(first_five.data)
    assert opt_first_five.op.nrows == 5
    assert len(graph) == 1
    assert opt_first_five in graph.results

    # test multiple head
    first_ten = csv_df.head(10)
    graph, records = optimized_graph([first_five, first_ten])
    opt_csv_df = records.get_optimization_result(csv_df.data)
    assert opt_csv_df is not None
    assert opt_csv_df.op.nrows == 10
    for head_df in (first_five, first_ten):
        opt_head = records.get_optimization_result(head_df.data)
        assert opt_head is not None
        assert graph.predecessors(opt_head)[0] is opt_csv_df
        assert opt_head.inputs[0] is opt_csv_df

    # test head with successor
    csv_df = md.read_csv(csv_file, chunk_bytes=chunk_bytes)
    first_five = csv_df.head(5)
    successor = first_five + 1
    graph, records = optimized_graph([successor])
    assert records.get_optimization_result(csv_df.data) is None
    opt_first_five = records.get_optimization_result(first_five.data)
    assert opt_first_five.op.nrows == 5
    assert len(graph) == 2
Esempio n. 6
0
def test_cannot_prune(gen_data1):
    """Check graphs in which no tileable receives an optimization record.

    Three graphs are optimized: groupby alongside ``df + 1``, groupby
    alongside ``head(3)``, and a getitem selecting every column. In each one
    every involved tileable must map to ``None`` in the records.
    """
    pdf, tempdir = gen_data1
    csv_path = os.path.join(tempdir, 'test.csv')
    pdf.to_csv(csv_path)

    def assert_nothing_optimized(results, involved):
        # ``results`` are the graph's result tileables; ``involved`` lists
        # every tileable whose optimization record must be None, in order.
        graph = TileableGraph([t.data for t in results])
        next(TileableGraphBuilder(graph).build())
        records = optimize(graph)
        for tileable in involved:
            assert records.get_optimization_result(tileable.data) is None

    source = md.read_csv(csv_path)
    grouped = source.groupby('c').agg({'a': 'sum'})
    # does not support prune
    added = source + 1
    assert_nothing_optimized([grouped, added], [source, grouped, added])

    source = md.read_csv(csv_path)
    grouped = source.groupby('c').agg({'a': 'sum'})
    # does not support prune, another rule
    top3 = source.head(3)
    assert_nothing_optimized([grouped, top3], [source, grouped, top3])

    source = md.read_csv(csv_path)
    # all columns selected
    everything = source[source.dtypes.index.tolist()]
    assert_nothing_optimized([everything], [source, everything])
Esempio n. 7
0
def test_sort_head(prepare_data, setup):
    """Optimize and execute ``head(10)`` after ``sort_values`` and ``sort_index``.

    For each sort flavour the test checks the optimization records and graph
    size, then executes the head with ``_iloc_operand_executors`` and
    compares the fetched frame against the equivalent pandas computation.
    """
    _, pdf = prepare_data

    def check_graph(sorted_df, head_df):
        graph = TileableGraph([head_df.data])
        next(TileableGraphBuilder(graph).build())
        records = optimize(graph)
        assert records.get_optimization_result(sorted_df.data) is None
        opt_head = records.get_optimization_result(head_df.data)
        assert opt_head.op.nrows == 10
        assert len(graph) == 2
        assert opt_head in graph.results

    def run(head_df):
        return head_df.execute(extra_config={
            'operand_executors': _iloc_operand_executors
        }).fetch()

    # sort by the values of column 'b'
    value_sorted = md.DataFrame(pdf, chunk_size=20)
    value_sorted = value_sorted.sort_values(by='b')
    value_head = value_sorted.head(10)
    check_graph(value_sorted, value_head)
    pd.testing.assert_frame_equal(
        run(value_head), pdf.sort_values(by='b').head(10))

    # sort by index, using column 'b' as the index
    pdf2 = pdf.copy()
    pdf2.set_index('b', inplace=True)
    index_sorted = md.DataFrame(pdf2, chunk_size=20)
    index_sorted = index_sorted.sort_index()
    index_head = index_sorted.head(10)
    check_graph(index_sorted, index_head)
    pd.testing.assert_frame_equal(
        run(index_head), pdf2.sort_index().head(10))
Esempio n. 8
0
def test_value_counts_head(prepare_data, chunk_size):
    """Optimize ``head(3)`` applied to ``value_counts`` of column ``a``.

    The ``value_counts`` tileable must have no optimization record, the
    optimized head op must carry ``nrows == 3``, the graph must hold three
    nodes, and the optimized head must be among the graph results.
    """
    _, pdf = prepare_data
    frame = md.DataFrame(pdf, chunk_size=chunk_size)

    counts = frame['a'].value_counts()
    top = counts.head(3)

    graph = TileableGraph([top.data])
    builder = TileableGraphBuilder(graph)
    next(builder.build())
    records = optimize(graph)

    assert records.get_optimization_result(counts.data) is None
    opt_top = records.get_optimization_result(top.data)
    assert opt_top.op.nrows == 3
    assert len(graph) == 3
    assert opt_top in graph.results
Esempio n. 9
0
def test_no_head(prepare_data):
    """Check two graphs in which no tileable gets an optimization record.

    The first graph slices ``iloc[1:10]`` from a CSV reader; the second has
    ``head(3)`` and ``df + 1`` both consuming the same reader.
    """
    tempdir, pdf = prepare_data
    csv_path = os.path.join(tempdir, 'test.csv')
    pdf.to_csv(csv_path, index=False)

    # chunk_bytes is set to half of the written file's size
    half_size = os.stat(csv_path).st_size / 2
    reader = md.read_csv(csv_path, chunk_bytes=half_size)

    def optimize_results(*results):
        graph = TileableGraph([t.data for t in results])
        next(TileableGraphBuilder(graph).build())
        return optimize(graph)

    sliced = reader.iloc[1:10]
    records = optimize_results(sliced)
    for tileable in (reader, sliced):
        assert records.get_optimization_result(tileable.data) is None

    top3 = reader.head(3)
    added = reader + 1
    records = optimize_results(top3, added)
    for tileable in (reader, top3, added):
        assert records.get_optimization_result(tileable.data) is None
Esempio n. 10
0
def test_read_parquet_head(prepare_data):
    """Optimize ``head(5)`` on a directory of parquet files.

    ``pdf`` is written as three files of 40-row slices. After optimization
    the reader has no record of its own, the optimized head op carries
    ``nrows == 5``, and the graph consists of that single result node.
    """
    tempdir, pdf = prepare_data
    parquet_dir = os.path.join(tempdir, 'test_parquet')
    os.makedirs(parquet_dir)
    for i in range(3):
        start = i * 40
        part_path = os.path.join(parquet_dir, f'test{i}.parquet')
        pdf[start: start + 40].to_parquet(part_path, index=False)

    reader = md.read_parquet(parquet_dir)
    top5 = reader.head(5)

    graph = TileableGraph([top5.data])
    builder = TileableGraphBuilder(graph)
    next(builder.build())
    records = optimize(graph)

    assert records.get_optimization_result(reader.data) is None
    opt_top5 = records.get_optimization_result(top5.data)
    assert opt_top5.op.nrows == 5
    assert len(graph) == 1
    assert opt_top5 in graph.results
Esempio n. 11
0
def test_groupby_prune_read_sql(gen_data2):
    """Check the columns kept by ``read_sql_table`` when a groupby follows it.

    ``pdf`` is stored in a SQLite table, read back with ``chunk_size=4`` and
    counted per value of ``a``; the optimized reader op must list columns
    ``['a']`` while the original groupby still points at the original reader.
    """
    pdf, tempdir = gen_data2
    db_path = os.path.join(tempdir, 'test.db')
    uri = 'sqlite:///' + db_path
    table_name = 'test'
    pdf.to_sql(table_name, uri, index=False)

    # test read df with columns
    table_df = md.read_sql_table(table_name, uri, chunk_size=4)
    counted = table_df.groupby('a', as_index=False).a.agg({'cnt': 'count'})

    graph = TileableGraph([counted.data])
    builder = TileableGraphBuilder(graph)
    next(builder.build())
    records = optimize(graph)

    opt_table_df = records.get_optimization_result(table_df.data)
    assert opt_table_df is not None
    opt_counted = records.get_optimization_result(counted.data)
    assert opt_counted is not None
    assert opt_table_df.op.columns == ['a']
    # the user-facing tileable must still point at the unoptimized reader
    assert counted.inputs[0] is table_df.data
Esempio n. 12
0
def test_value_counts_head(prepare_data, setup, chunk_size):
    """Optimize and execute ``head(3)`` on ``value_counts(method='tree')``.

    Checks the optimization records and graph size, then executes the head
    with ``_iloc_operand_executors`` and compares the fetched series with
    ``pdf['a'].value_counts().head(3)``.
    """
    _, pdf = prepare_data
    frame = md.DataFrame(pdf, chunk_size=chunk_size)

    counts = frame['a'].value_counts(method='tree')
    top = counts.head(3)

    graph = TileableGraph([top.data])
    builder = TileableGraphBuilder(graph)
    next(builder.build())
    records = optimize(graph)

    assert records.get_optimization_result(counts.data) is None
    opt_top = records.get_optimization_result(top.data)
    assert opt_top.op.nrows == 3
    assert len(graph) == 3
    assert opt_top in graph.results

    executed = top.execute(
        extra_config={'operand_executors': _iloc_operand_executors})
    actual = executed.fetch()
    pd.testing.assert_series_equal(actual, pdf['a'].value_counts().head(3))
Esempio n. 13
0
def test_groupby_and_getitem(gen_data1):
    """Check ``read_csv`` column selection with a groupby and a getitem consumer.

    The groupby uses columns ``c`` and ``a`` and the getitem selects
    ``['b', 'a']``; the optimized reader op must have
    ``usecols == ['a', 'b', 'c']`` and precede both optimized consumers.
    """
    pdf, tempdir = gen_data1
    csv_path = os.path.join(tempdir, 'test.csv')
    pdf.to_csv(csv_path)

    reader = md.read_csv(csv_path)
    grouped = reader.groupby('c').agg({'a': 'sum'})
    selected = reader[['b', 'a']]

    graph = TileableGraph([grouped.data, selected.data])
    builder = TileableGraphBuilder(graph)
    next(builder.build())
    records = optimize(graph)

    opt_reader = records.get_optimization_result(reader.data)
    assert opt_reader is not None
    for consumer in (grouped, selected):
        opt_consumer = records.get_optimization_result(consumer.data)
        assert opt_consumer is not None
        assert opt_reader in graph.predecessors(opt_consumer)
    assert opt_reader.op.usecols == ['a', 'b', 'c']

    # the user-facing tileables must still point at the unoptimized reader
    for consumer in (grouped, selected):
        assert consumer.inputs[0] is reader.data
Esempio n. 14
0
def test_read_parquet_head(prepare_data, setup):
    """Optimize and execute ``head(5)`` on a directory of parquet files.

    ``pdf`` is written as three files of 40-row slices. The test checks the
    optimization records and graph size, then executes the head with
    ``_iloc_operand_executors`` and compares the result with ``pdf.head(5)``.
    """
    tempdir, pdf = prepare_data
    parquet_dir = os.path.join(tempdir, 'test_parquet')
    os.makedirs(parquet_dir)
    for i in range(3):
        start = i * 40
        part_path = os.path.join(parquet_dir, f'test{i}.parquet')
        pdf[start: start + 40].to_parquet(part_path, index=False)

    reader = md.read_parquet(parquet_dir)
    top5 = reader.head(5)

    graph = TileableGraph([top5.data])
    builder = TileableGraphBuilder(graph)
    next(builder.build())
    records = optimize(graph)

    assert records.get_optimization_result(reader.data) is None
    opt_top5 = records.get_optimization_result(top5.data)
    assert opt_top5.op.nrows == 5
    assert len(graph) == 1
    assert opt_top5 in graph.results

    executed = top5.execute(
        extra_config={'operand_executors': _iloc_operand_executors})
    actual = executed.fetch()
    pd.testing.assert_frame_equal(actual, pdf.head(5))
Esempio n. 15
0
def test_groupby_read_csv(gen_data1):
    """Check ``read_csv`` column selection driven by downstream groupby ops.

    Scenarios, in order: a groupby followed by ``+ 1``; a reader that already
    has ``usecols`` set; two groupbys sharing one reader; and the same two
    groupbys when the reader itself is also a graph result (nothing gets an
    optimization record in that last case).
    """
    pdf, tempdir = gen_data1
    csv_path = os.path.join(tempdir, 'test.csv')
    pdf.to_csv(csv_path)

    def build_and_optimize(*results):
        # Returns the built graph together with the optimization records.
        graph = TileableGraph([t.data for t in results])
        next(TileableGraphBuilder(graph).build())
        return graph, optimize(graph)

    # groupby with a successor
    reader = md.read_csv(csv_path)
    grouped = reader.groupby('c').agg({'a': 'sum'})
    shifted = grouped + 1
    graph, records = build_and_optimize(shifted)
    opt_reader = records.get_optimization_result(reader.data)
    assert opt_reader is not None
    opt_grouped = records.get_optimization_result(grouped.data)
    assert opt_grouped is not None
    assert records.get_optimization_result(shifted.data) is None
    assert opt_reader in graph.predecessors(opt_grouped)
    assert opt_reader in opt_grouped.inputs
    assert opt_reader.op.usecols == ['a', 'c']
    assert opt_grouped in graph.predecessors(shifted.data)
    assert opt_grouped in shifted.inputs

    # reader created with an explicit usecols list
    narrowed_reader = md.read_csv(csv_path, usecols=['a', 'b', 'c'])
    narrowed_grouped = narrowed_reader.groupby('c').agg({'b': 'sum'})
    _, records = build_and_optimize(narrowed_grouped)
    opt_narrowed_reader = records.get_optimization_result(narrowed_reader.data)
    assert opt_narrowed_reader is not None
    assert records.get_optimization_result(narrowed_grouped.data) is not None
    assert opt_narrowed_reader.op.usecols == ['b', 'c']

    # two groupbys sharing one reader
    shared_reader = md.read_csv(csv_path)
    by_c = shared_reader.groupby('c').agg({'b': 'sum'})
    by_b = shared_reader.groupby('b').agg({'a': 'sum'})
    _, records = build_and_optimize(by_c, by_b)
    opt_shared_reader = records.get_optimization_result(shared_reader.data)
    assert opt_shared_reader is not None
    for consumer in (by_c, by_b):
        assert records.get_optimization_result(consumer.data) is not None
    assert opt_shared_reader.op.usecols == ['a', 'b', 'c']
    # the user-facing tileables must still point at the unoptimized reader
    for consumer in (by_c, by_b):
        assert consumer.inputs[0] is shared_reader.data

    # test data source in result tileables
    _, records = build_and_optimize(shared_reader, by_c, by_b)
    for tileable in (shared_reader, by_c, by_b):
        assert records.get_optimization_result(tileable.data) is None