Beispiel #1
0
def test_read_csv_head(prepare_data, setup):
    """Check how the optimizer treats ``head`` applied on top of ``read_csv``.

    Three scenarios are covered:

    * a single ``head(5)`` as the only result: the optimized graph has one
      node whose op carries ``nrows == 5``, and executing it matches pandas;
    * two ``head`` calls on the same source: the source gets an optimized
      replacement with ``nrows == 10`` that feeds both optimized heads;
    * a ``head(5)`` followed by ``+ 1``: the optimized graph has two nodes.
    """
    def _build_and_optimize(*tileables):
        # Build the tileable graph for the given results, then run the
        # optimizer on it; returns the graph and the optimization records.
        tileable_graph = TileableGraph([t.data for t in tileables])
        next(TileableGraphBuilder(tileable_graph).build())
        return tileable_graph, optimize(tileable_graph)

    tempdir, pdf = prepare_data
    csv_path = os.path.join(tempdir, 'test.csv')
    pdf.to_csv(csv_path, index=False)

    # use half of the file size as the chunk size for reading
    half_size = os.path.getsize(csv_path) / 2
    source = md.read_csv(csv_path, chunk_bytes=half_size)
    head5 = source.head(5)
    graph, records = _build_and_optimize(head5)
    assert records.get_optimization_result(source.data) is None
    opt_head5 = records.get_optimization_result(head5.data)
    assert opt_head5.op.nrows == 5
    assert len(graph) == 1
    assert opt_head5 in graph.results

    # the optimized tileable must still produce the same rows as pandas
    fetched = head5.execute(
        extra_config={'operand_executors': _iloc_operand_executors}
    ).fetch()
    pd.testing.assert_frame_equal(fetched, pdf.head(5))

    # test multiple head
    head10 = source.head(10)
    graph, records = _build_and_optimize(head5, head10)
    opt_source = records.get_optimization_result(source.data)
    assert opt_source is not None
    assert opt_source.op.nrows == 10
    opt_head5 = records.get_optimization_result(head5.data)
    assert opt_head5 is not None
    assert graph.predecessors(opt_head5)[0] is opt_source
    assert opt_head5.inputs[0] is opt_source
    opt_head10 = records.get_optimization_result(head10.data)
    assert opt_head10 is not None
    assert graph.predecessors(opt_head10)[0] is opt_source
    assert opt_head10.inputs[0] is opt_source

    # test head with successor
    source = md.read_csv(csv_path, chunk_bytes=half_size)
    head5 = source.head(5)
    plus_one = head5 + 1
    graph, records = _build_and_optimize(plus_one)
    assert records.get_optimization_result(source.data) is None
    opt_head5 = records.get_optimization_result(head5.data)
    assert opt_head5.op.nrows == 5
    assert len(graph) == 2
Beispiel #2
0
def test_read_csv_head(prepare_data):
    """Verify optimization records for ``head`` on a ``read_csv`` source.

    Checks, in order: a lone ``head(5)`` result (single-node graph, op with
    ``nrows == 5``), two heads sharing one source (source optimized with
    ``nrows == 10`` and wired as first input of both), and a ``head(5)``
    that has a successor (two-node graph).
    """
    def _optimized_graph(result_tileables):
        # Construct the graph from the result tileables and optimize it.
        built = TileableGraph([t.data for t in result_tileables])
        next(TileableGraphBuilder(built).build())
        return built, optimize(built)

    tempdir, pdf = prepare_data
    csv_file = os.path.join(tempdir, 'test.csv')
    pdf.to_csv(csv_file, index=False)

    chunk_size = os.path.getsize(csv_file) / 2
    reader = md.read_csv(csv_file, chunk_bytes=chunk_size)
    first5 = reader.head(5)
    graph, records = _optimized_graph([first5])
    # the data source itself has no optimization record in this case
    assert records.get_optimization_result(reader.data) is None
    opt_first5 = records.get_optimization_result(first5.data)
    assert opt_first5.op.nrows == 5
    assert len(graph) == 1
    assert opt_first5 in graph.results

    # test multiple head
    first10 = reader.head(10)
    graph, records = _optimized_graph([first5, first10])
    opt_reader = records.get_optimization_result(reader.data)
    assert opt_reader is not None
    assert opt_reader.op.nrows == 10
    opt_first5 = records.get_optimization_result(first5.data)
    assert opt_first5 is not None
    assert graph.predecessors(opt_first5)[0] is opt_reader
    assert opt_first5.inputs[0] is opt_reader
    opt_first10 = records.get_optimization_result(first10.data)
    assert opt_first10 is not None
    assert graph.predecessors(opt_first10)[0] is opt_reader
    assert opt_first10.inputs[0] is opt_reader

    # test head with successor
    reader = md.read_csv(csv_file, chunk_bytes=chunk_size)
    first5 = reader.head(5)
    incremented = first5 + 1
    graph, records = _optimized_graph([incremented])
    assert records.get_optimization_result(reader.data) is None
    opt_first5 = records.get_optimization_result(first5.data)
    assert opt_first5.op.nrows == 5
    assert len(graph) == 2
Beispiel #3
0
def test_groupby_and_getitem(gen_data1):
    """Check optimization when a groupby and a column selection share a source.

    The ``read_csv`` source feeds both a ``groupby('c').agg({'a': 'sum'})``
    and a ``[['b', 'a']]`` selection. The test asserts that all three
    tileables get optimization results, that the optimized source precedes
    both optimized consumers, that its ``usecols`` is ``['a', 'b', 'c']``,
    and that the original tileables still point at the original source.
    """
    pdf, tempdir = gen_data1
    csv_path = os.path.join(tempdir, 'test.csv')
    pdf.to_csv(csv_path)

    source = md.read_csv(csv_path)
    grouped = source.groupby('c').agg({'a': 'sum'})
    selected = source[['b', 'a']]
    consumers = (grouped, selected)

    graph = TileableGraph([c.data for c in consumers])
    builder = TileableGraphBuilder(graph)
    next(builder.build())
    records = optimize(graph)

    opt_source = records.get_optimization_result(source.data)
    assert opt_source is not None
    # each consumer must be rewritten on top of the optimized source
    for consumer in consumers:
        opt_consumer = records.get_optimization_result(consumer.data)
        assert opt_consumer is not None
        assert opt_source in graph.predecessors(opt_consumer)
    assert opt_source.op.usecols == ['a', 'b', 'c']
    # original tileable should not be modified
    for consumer in consumers:
        assert consumer.inputs[0] is source.data
Beispiel #4
0
def test_groupby_read_csv(gen_data1):
    """Check the optimization records for groupby-agg on ``read_csv`` sources.

    Scenarios, in order:

    * groupby-agg followed by ``+ 1``: source and groupby are optimized, the
      successor is not, and the optimized source has ``usecols == ['a', 'c']``;
    * a source created with explicit ``usecols``: the optimized source has
      ``usecols == ['b', 'c']``;
    * two groupbys on one source: optimized ``usecols`` is ``['a', 'b', 'c']``
      and the original tileables keep their original input;
    * the source itself is among the results: nothing gets an optimization
      result.
    """
    def _optimize_results(*tileables):
        # Build the graph for the given result tileables and optimize it.
        tileable_graph = TileableGraph([t.data for t in tileables])
        next(TileableGraphBuilder(tileable_graph).build())
        return tileable_graph, optimize(tileable_graph)

    pdf, tempdir = gen_data1
    csv_path = os.path.join(tempdir, 'test.csv')
    pdf.to_csv(csv_path)

    source = md.read_csv(csv_path)
    summed = source.groupby('c').agg({'a': 'sum'})
    shifted = summed + 1
    graph, records = _optimize_results(shifted)
    opt_source = records.get_optimization_result(source.data)
    assert opt_source is not None
    opt_summed = records.get_optimization_result(summed.data)
    assert opt_summed is not None
    opt_shifted = records.get_optimization_result(shifted.data)
    assert opt_shifted is None
    assert opt_source in graph.predecessors(opt_summed)
    assert opt_source in opt_summed.inputs
    assert opt_source.op.usecols == ['a', 'c']
    # the non-optimized successor is expected to consume the optimized groupby
    assert opt_summed in graph.predecessors(shifted.data)
    assert opt_summed in shifted.inputs

    narrowed_source = md.read_csv(csv_path, usecols=['a', 'b', 'c'])
    narrowed_sum = narrowed_source.groupby('c').agg({'b': 'sum'})
    graph, records = _optimize_results(narrowed_sum)
    opt_narrowed_source = records.get_optimization_result(narrowed_source.data)
    assert opt_narrowed_source is not None
    opt_narrowed_sum = records.get_optimization_result(narrowed_sum.data)
    assert opt_narrowed_sum is not None
    assert opt_narrowed_source.op.usecols == ['b', 'c']

    shared_source = md.read_csv(csv_path)
    by_c = shared_source.groupby('c').agg({'b': 'sum'})
    by_b = shared_source.groupby('b').agg({'a': 'sum'})
    graph, records = _optimize_results(by_c, by_b)
    opt_shared_source = records.get_optimization_result(shared_source.data)
    assert opt_shared_source is not None
    opt_by_c = records.get_optimization_result(by_c.data)
    assert opt_by_c is not None
    opt_by_b = records.get_optimization_result(by_b.data)
    assert opt_by_b is not None
    assert opt_shared_source.op.usecols == ['a', 'b', 'c']
    # original tileable should not be modified
    assert by_c.inputs[0] is shared_source.data
    assert by_b.inputs[0] is shared_source.data

    # test data source in result tileables
    graph, records = _optimize_results(shared_source, by_c, by_b)
    for tileable in (shared_source, by_c, by_b):
        assert records.get_optimization_result(tileable.data) is None