Ejemplo n.º 1
0
def test_blockwise_non_blockwise_output():
    """Exercise ``optimize_blockwise`` on a graph with a non-blockwise consumer.

    ``y`` feeds both a reduction (``w``) and a further elementwise chain
    (``z``).  The test checks that ``dask.optimize`` leaves the indices of
    ``z``'s top layer untouched, and that ``optimize_blockwise`` returns a
    ``HighLevelGraph`` both for ``z`` alone and for the merged ``w``/``z``
    graph.
    """

    def count_blockwise(graph):
        # Number of layers in ``graph`` that are Blockwise instances.
        return len(
            [lyr for lyr in graph.dicts.values() if isinstance(lyr, Blockwise)]
        )

    x = da.ones(10, chunks=(5,))
    y = x + 1 + 2 + 3
    w = y.sum()
    z = y * 2 * 3 * 4

    # Optimizing must not mutate the original collection's top layer.
    indices_before = tuple(z.dask.dicts[z.name].indices)
    (_optimized,) = dask.optimize(z)
    indices_after = tuple(z.dask.dicts[z.name].indices)
    assert indices_before == indices_after, "z_top mutated"

    # Optimizing ``z`` on its own leaves exactly one Blockwise layer.
    z_keys = list(dask.core.flatten(z.__dask_keys__()))
    dsk = optimize_blockwise(z.dask, keys=z_keys)
    assert isinstance(dsk, HighLevelGraph)
    assert count_blockwise(dsk) == 1

    # Optimizing the merged graph while requesting both outputs.
    merged = HighLevelGraph.merge(w.dask, z.dask)
    both_keys = list(dask.core.flatten([w.__dask_keys__(), z.__dask_keys__()]))
    dsk = optimize_blockwise(merged, keys=both_keys)
    assert isinstance(dsk, HighLevelGraph)
    # NOTE(review): this inspects ``z.dask`` (the input), not the optimized
    # ``dsk`` -- confirm whether that is intentional.
    assert count_blockwise(z.dask) >= 1
Ejemplo n.º 2
0
def optimize(dsk, keys, **kwargs):
    """Optimize a task graph so that it computes only ``keys``.

    Parameters
    ----------
    dsk : HighLevelGraph or mapping
        The graph to optimize.  Anything that is not a ``HighLevelGraph``
        is wrapped into one and skips the high-level passes.
    keys : key, list or set
        Output key(s) to keep; nested containers are flattened.
    **kwargs
        Accepted but not used by this function.

    Returns
    -------
    The culled ``HighLevelGraph`` when the ``optimization.fuse.active``
    config value is falsy, otherwise the result of low-level ``fuse``
    followed by ``cull``.
    """
    wanted = list(core.flatten(keys if isinstance(keys, (list, set)) else [keys]))

    if isinstance(dsk, HighLevelGraph):
        # High-level passes are only applied to HLG input, in this order.
        for hlg_pass in (optimize_dataframe_getitem, optimize_blockwise, fuse_roots):
            dsk = hlg_pass(dsk, keys=wanted)
    else:
        dsk = HighLevelGraph.from_collections(id(dsk), dsk, dependencies=())
    dsk = dsk.cull(set(wanted))

    # Low-level fusion only runs when the config value is truthy; an unset
    # (None) value therefore means "skip".
    if not config.get("optimization.fuse.active"):
        return dsk

    dependencies = dsk.get_all_dependencies()
    low_level = ensure_dict(dsk)

    subgraphs_setting = config.get("optimization.fuse.subgraphs")
    fused, _ = fuse(
        low_level,
        wanted,
        dependencies=dependencies,
        # An unset config value defaults to fusing subgraphs.
        fuse_subgraphs=True if subgraphs_setting is None else subgraphs_setting,
    )
    culled, _ = cull(fused, wanted)
    return culled
Ejemplo n.º 3
0
def test_blockwise_non_blockwise_output():
    """Check ``optimize_blockwise`` when an intermediate feeds a reduction.

    Verifies that ``dask.optimize`` does not change the indices of ``z``'s
    top layer and that ``optimize_blockwise`` yields a ``HighLevelGraph``
    for ``z`` alone as well as for the merged ``w``/``z`` graph.
    """

    def blockwise_layers(graph):
        # All Blockwise layers contained in ``graph``.
        return [lyr for lyr in graph.dicts.values() if isinstance(lyr, Blockwise)]

    x = da.ones(10, chunks=(5,))
    y = x + 1 + 2 + 3
    w = y.sum()
    z = y * 2 * 3 * 4

    # The original collection's top layer must survive optimization intact.
    before = tuple(z.dask.dicts[z.name].indices)
    (_unused,) = dask.optimize(z)
    after = tuple(z.dask.dicts[z.name].indices)
    assert before == after, "z_top mutated"

    keys_z = list(dask.core.flatten(z.__dask_keys__()))
    dsk = optimize_blockwise(z.dask, keys=keys_z)
    assert isinstance(dsk, HighLevelGraph)
    assert len(blockwise_layers(dsk)) == 1

    keys_wz = list(dask.core.flatten([w.__dask_keys__(), z.__dask_keys__()]))
    dsk = optimize_blockwise(HighLevelGraph.merge(w.dask, z.dask), keys=keys_wz)
    assert isinstance(dsk, HighLevelGraph)
    # NOTE(review): the count below is taken from ``z.dask`` rather than the
    # optimized ``dsk`` -- confirm this is the intended check.
    assert len(blockwise_layers(z.dask)) >= 1
Ejemplo n.º 4
0
def optimize(
    dsk,
    keys,
    fuse_keys=None,
    fast_functions=None,
    inline_functions_fast_functions=(getter_inline,),
    rename_fused_keys=True,
    **kwargs,
):
    """Optimize a dask graph for array computation.

    The steps, in order:

    1.  Wrap non-``HighLevelGraph`` input, then run ``optimize_blockwise``,
        ``fuse_roots`` and cull everything not needed for ``keys``.
    2.  Unless ``optimization.fuse.active`` is explicitly ``False``, fuse
        low-level tasks (keeping held keys, ``keys`` and ``fuse_keys``).
    3.  Inline the configured fast functions, if any.
    4.  Run ``optimize_slices`` on the result.

    Parameters
    ----------
    dsk : HighLevelGraph or mapping
        Graph to optimize.
    keys : key, list or set
        Output key(s); nested containers are flattened.
    fuse_keys : list, optional
        Extra keys passed to ``fuse`` alongside ``keys``.
    fast_functions : optional
        When not ``None``, overrides ``inline_functions_fast_functions``.
    inline_functions_fast_functions : optional
        Functions handed to ``inline_functions``; a falsy value skips
        inlining.
    rename_fused_keys : bool
        Forwarded to ``fuse`` as ``rename_keys``.
    **kwargs
        Accepted but not used by this function.
    """
    keys = list(flatten(keys if isinstance(keys, (list, set)) else [keys]))

    graph = (
        dsk
        if isinstance(dsk, HighLevelGraph)
        else HighLevelGraph.from_collections(id(dsk), dsk, dependencies=())
    )
    graph = optimize_blockwise(graph, keys=keys)
    graph = fuse_roots(graph, keys=keys)
    graph = graph.cull(set(keys))

    # Low-level fusion runs by default; only an explicit False disables it.
    if config.get("optimization.fuse.active") is False:
        return graph

    dependencies = graph.get_all_dependencies()
    tasks = ensure_dict(graph)

    # Low level task optimizations
    to_inline = (
        inline_functions_fast_functions if fast_functions is None else fast_functions
    )

    hold = hold_keys(tasks, dependencies)
    protected = hold + keys + (fuse_keys or [])

    tasks, dependencies = fuse(
        tasks,
        protected,
        dependencies,
        rename_keys=rename_fused_keys,
    )
    if to_inline:
        tasks = inline_functions(
            tasks,
            keys,
            dependencies=dependencies,
            fast_functions=to_inline,
        )

    return optimize_slices(tasks)
Ejemplo n.º 5
0
def test_dont_merge_before_reductions():
    """A blockwise chain ending in reductions must keep two Blockwise layers.

    Builds ``inc`` followed by two ``sum`` blockwise calls, optimizes the
    graph of the last one, and expects exactly two ``Blockwise`` layers to
    remain.  Finally computes ``z`` to make sure it still evaluates.
    """
    x = da.ones(10, chunks=(5,))
    y = da.blockwise(inc, "i", x, "i", dtype=x.dtype)
    z = da.blockwise(sum, "", y, "i", dtype=y.dtype)
    w = da.blockwise(sum, "", z, "", dtype=y.dtype)

    optimized = optimize_blockwise(w.dask)
    remaining = [
        layer for layer in optimized.dicts.values() if isinstance(layer, Blockwise)
    ]
    assert len(remaining) == 2

    # Smoke check: the intermediate reduction is still computable.
    z.compute()
Ejemplo n.º 6
0
def test_dont_merge_before_reductions():
    """Optimizing ``inc -> sum -> sum`` must leave two Blockwise layers.

    After ``optimize_blockwise`` runs on the final collection's graph the
    test expects exactly two ``Blockwise`` layers, then computes ``z`` as a
    smoke check.
    """
    x = da.ones(10, chunks=(5,))
    y = da.blockwise(inc, 'i', x, 'i', dtype=x.dtype)
    z = da.blockwise(sum, '', y, 'i', dtype=y.dtype)
    w = da.blockwise(sum, '', z, '', dtype=y.dtype)

    graph = optimize_blockwise(w.dask)
    kept = [lyr for lyr in graph.dicts.values() if isinstance(lyr, Blockwise)]
    assert len(kept) == 2

    # The intermediate reduction must still evaluate without error.
    z.compute()
Ejemplo n.º 7
0
def test_daily_stock():
    """Check ``dd.demo.daily_stock`` and blockwise fusion of a column selection.

    Skipped unless ``pandas_datareader`` >= 0.8.0 is importable.
    """
    pytest.importorskip("pandas_datareader", minversion="0.8.0")
    stock = dd.demo.daily_stock(
        "GOOG", start="2010-01-01", stop="2010-01-30", freq="1h"
    )
    assert isinstance(stock, dd.DataFrame)
    assert 10 < stock.npartitions < 31
    assert_eq(stock, stock)

    # Check `optimize_blockwise`: after selecting two columns the whole
    # graph is expected to collapse into a single Blockwise layer.
    selected = stock[["open", "close"]]
    output_keys = [(selected._name, part) for part in range(selected.npartitions)]
    optimized = optimize_blockwise(selected.__dask_graph__(), output_keys)
    layers = optimized.layers
    first_name = list(layers)[0]
    assert len(layers) == 1
    assert isinstance(layers[first_name], Blockwise)
Ejemplo n.º 8
0
def optimize(
    dsk: Mapping,
    keys: Hashable | list[Hashable] | set[Hashable],
    **kwargs: Any,
) -> Mapping:
    """Apply high-level graph optimizations and cull to ``keys``.

    Parameters
    ----------
    dsk
        Graph to optimize; wrapped in a ``HighLevelGraph`` when it is not
        one already.
    keys
        Output key(s) to keep; nested containers are flattened.
    **kwargs
        Accepted but not used by this function.

    Returns
    -------
    The graph after ``optimize_blockwise``, ``fuse_roots`` and ``cull``.
    """
    wanted = list(flatten(keys if isinstance(keys, (list, set)) else [keys]))

    graph = (
        dsk
        if isinstance(dsk, HighLevelGraph)
        else HighLevelGraph.from_collections(id(dsk), dsk, dependencies=())
    )

    graph = optimize_blockwise(graph, keys=wanted)
    graph = fuse_roots(graph, keys=wanted)  # type: ignore
    return graph.cull(set(wanted))  # type: ignore
Ejemplo n.º 9
0
def test_make_timeseries_blockwise():
    """Check column projection and blockwise fusion on ``make_timeseries``."""
    ts = dd.demo.make_timeseries()
    ts = ts[["x", "y"]]
    output_keys = [(ts._name, part) for part in range(ts.npartitions)]

    # `optimize_dataframe_getitem` should push the column selection into
    # the "make-timeseries" layer by updating its `columns` attribute.
    projected = optimize_dataframe_getitem(ts.__dask_graph__(), output_keys)
    source_name = [
        name for name in projected.layers if name.startswith("make-timeseries-")
    ][0]
    assert set(projected.layers[source_name].columns) == {"x", "y"}

    # `optimize_blockwise` should fuse both `Blockwise` layers together
    # into a single `Blockwise` layer.
    fused = optimize_blockwise(ts.__dask_graph__(), output_keys)
    layers = fused.layers
    only_name = list(layers)[0]
    assert len(layers) == 1
    assert isinstance(layers[only_name], Blockwise)
Ejemplo n.º 10
0
def test_from_delayed_to_dask_array():
    """``from_delayed`` followed by ``to_dask_array`` must stay optimizable.

    See: https://github.com/dask-contrib/dask-sql/issues/497
    """
    from dask.blockwise import optimize_blockwise

    parts = [delayed(pd.DataFrame)(np.ones((3, 2))) for _ in range(3)]
    frame = dd.from_delayed(parts)
    arr = frame.to_dask_array()

    # If we optimize this graph without calling `fuse_roots`, the
    # underlying `BlockwiseDep` `mapping` keys will be 1-D (e.g. `(4,)`),
    # while the collection keys will be 2-D (e.g. `(4, 0)`).
    row_keys = [row[0] for row in arr.__dask_keys__()]
    optimized = optimize_blockwise(arr.dask, keys=row_keys)
    # Only checking that culling does not raise; the result is not used.
    optimized.cull(row_keys)

    computed = arr.compute()
    assert computed.shape == (9, 2)