Esempio n. 1
0
def test_rewrite_simple_shuffle_layer():
    """Verify that the rewrite swaps out the simple shuffle layer.

    Builds a small dask dataframe, re-indexes it with a task-based shuffle,
    and asserts that ``rewrite_simple_shuffle_layer`` leaves no layer whose
    exact type is ``SimpleShuffleLayer`` while producing at least one
    ``MultipleReturnSimpleShuffleLayer``.
    """
    npartitions = 10
    frame = pd.DataFrame(
        np.random.randint(0, 100, size=(100, 2)),
        columns=["age", "grade"],
    )
    ddf = dd.from_pandas(frame, npartitions=npartitions)
    # max_branch is pinned to the partition count so the task-based shuffle
    # completes in a single stage; the optimization under test requires that.
    indexed = ddf.set_index(["age"], shuffle="tasks", max_branch=npartitions)

    graph = indexed.__dask_graph__()
    output_keys = indexed.__dask_keys__()

    # Exact type identity (rather than isinstance) is compared, so subclasses
    # of SimpleShuffleLayer do not count as matches.
    types_before = [type(layer) for layer in graph.layers.values()]
    assert any(t is SimpleShuffleLayer for t in types_before)

    rewritten = rewrite_simple_shuffle_layer(graph, output_keys)
    types_after = [type(layer) for layer in rewritten.layers.values()]
    assert all(t is not SimpleShuffleLayer for t in types_after)
    assert any(t is MultipleReturnSimpleShuffleLayer for t in types_after)
Esempio n. 2
0
 def side_effect(dsk, keys):
     """Forward ``dsk`` and ``keys`` to ``rewrite_simple_shuffle_layer``.

     Returns whatever ``rewrite_simple_shuffle_layer`` returns, unchanged.
     """
     rewritten = rewrite_simple_shuffle_layer(dsk, keys)
     return rewritten