def test_blockwise_non_blockwise_output():
    """Check ``optimize_blockwise`` when a reduction shares the graph.

    Builds two arithmetic chains (``y`` and ``z``) and a reduction ``w`` on
    top of ``y``. Verifies that ``dask.optimize`` leaves the indices of
    ``z``'s top layer untouched, that the ``z`` graph alone optimizes down
    to exactly one ``Blockwise`` layer, and that the merged ``w``/``z``
    graph still optimizes to a ``HighLevelGraph``.
    """

    def blockwise_count(graph):
        # Number of layers in ``graph`` that are ``Blockwise`` instances.
        return sum(isinstance(layer, Blockwise) for layer in graph.dicts.values())

    base = da.ones(10, chunks=(5,))
    y = ((base + 1) + 2) + 3
    w = y.sum()
    z = ((y * 2) * 3) * 4

    # Optimizing the collection must not mutate the original top layer.
    indices_before = tuple(z.dask.dicts[z.name].indices)
    (_optimized,) = dask.optimize(z)
    indices_after = tuple(z.dask.dicts[z.name].indices)
    assert indices_before == indices_after, "z_top mutated"

    z_keys = list(dask.core.flatten(z.__dask_keys__()))
    dsk = optimize_blockwise(z.dask, keys=z_keys)
    assert isinstance(dsk, HighLevelGraph)
    assert blockwise_count(dsk) == 1

    merged = HighLevelGraph.merge(w.dask, z.dask)
    merged_keys = list(dask.core.flatten([w.__dask_keys__(), z.__dask_keys__()]))
    dsk = optimize_blockwise(merged, keys=merged_keys)
    assert isinstance(dsk, HighLevelGraph)
    # NOTE(review): this counts layers of the input graph ``z.dask``, not of
    # the optimized ``dsk`` -- confirm whether that is intentional.
    assert blockwise_count(z.dask) >= 1
def optimize(dsk, keys, **kwargs):
    """Optimize ``dsk`` for computing ``keys``.

    ``HighLevelGraph`` input is passed through
    ``optimize_dataframe_getitem``, ``optimize_blockwise`` and
    ``fuse_roots``; any other input is wrapped with
    ``HighLevelGraph.from_collections`` instead. The graph is then culled
    to the requested keys. Low-level fusion (``fuse`` followed by ``cull``)
    only runs when the ``optimization.fuse.active`` config value is truthy.

    Parameters
    ----------
    dsk : HighLevelGraph or other graph object
        Graph to optimize.
    keys : key, list or set
        Output key(s); a single key is wrapped in a list and nested
        containers are flattened.
    **kwargs
        Accepted but not used by this function.

    Returns
    -------
    The culled ``HighLevelGraph`` when fusion is not active, otherwise the
    graph returned by the final ``cull`` call.
    """
    requested = list(core.flatten(keys if isinstance(keys, (list, set)) else [keys]))

    if isinstance(dsk, HighLevelGraph):
        # Blockwise optimizations are only applied to HLG input.
        graph = optimize_dataframe_getitem(dsk, keys=requested)
        graph = optimize_blockwise(graph, keys=requested)
        graph = fuse_roots(graph, keys=requested)
    else:
        graph = HighLevelGraph.from_collections(id(dsk), dsk, dependencies=())
    graph = graph.cull(set(requested))

    # The config value is None unless set, so fusion is opt-in here:
    # anything falsy skips the low-level pass.
    if not config.get("optimization.fuse.active"):
        return graph

    dependencies = graph.get_all_dependencies()
    low_level = ensure_dict(graph)

    fuse_subgraphs = config.get("optimization.fuse.subgraphs")
    fuse_subgraphs = True if fuse_subgraphs is None else fuse_subgraphs

    low_level, _ = fuse(
        low_level,
        requested,
        dependencies=dependencies,
        fuse_subgraphs=fuse_subgraphs,
    )
    low_level, _ = cull(low_level, requested)
    return low_level
def test_blockwise_non_blockwise_output():
    """Exercise ``optimize_blockwise`` on a graph that also holds a reduction.

    ``z`` is an arithmetic chain built on ``y``; ``w`` is ``y.sum()``. The
    test checks that optimizing ``z`` does not change the indices of its
    top layer, that ``z``'s graph optimizes to one ``Blockwise`` layer, and
    that the merged graph of ``w`` and ``z`` optimizes to a
    ``HighLevelGraph``.
    """
    ones = da.ones(10, chunks=(5,))
    y = ones + 1
    y = y + 2
    y = y + 3
    w = y.sum()
    z = y * 2
    z = z * 3
    z = z * 4

    # Snapshot the top layer's indices around the optimize call.
    top_before = tuple(z.dask.dicts[z.name].indices)
    (_,) = dask.optimize(z)
    top_after = tuple(z.dask.dicts[z.name].indices)
    assert top_before == top_after, "z_top mutated"

    dsk = optimize_blockwise(
        z.dask,
        keys=list(dask.core.flatten(z.__dask_keys__())),
    )
    assert isinstance(dsk, HighLevelGraph)
    fused_layers = [lyr for lyr in dsk.dicts.values() if isinstance(lyr, Blockwise)]
    assert len(fused_layers) == 1

    combined = HighLevelGraph.merge(w.dask, z.dask)
    output_keys = [w.__dask_keys__(), z.__dask_keys__()]
    dsk = optimize_blockwise(combined, keys=list(dask.core.flatten(output_keys)))
    assert isinstance(dsk, HighLevelGraph)
    # NOTE(review): the layers inspected here come from ``z.dask`` (the
    # unoptimized input), not from ``dsk`` -- verify this is what was meant.
    input_layers = [lyr for lyr in z.dask.dicts.values() if isinstance(lyr, Blockwise)]
    assert len(input_layers) >= 1
def optimize(
    dsk,
    keys,
    fuse_keys=None,
    fast_functions=None,
    inline_functions_fast_functions=(getter_inline,),
    rename_fused_keys=True,
    **kwargs,
):
    """Optimize a dask graph for array computation.

    High-level pass: ``optimize_blockwise``, ``fuse_roots`` and a cull down
    to ``keys``. Unless the ``optimization.fuse.active`` config value is
    exactly ``False``, a low-level pass follows: ``fuse``, optional
    ``inline_functions`` and finally ``optimize_slices``.

    Parameters
    ----------
    dsk : HighLevelGraph or other graph object
        Graph to optimize; non-HLG input is wrapped with
        ``HighLevelGraph.from_collections``.
    keys : key, list or set
        Output key(s); a single key is wrapped in a list and nested
        containers are flattened.
    fuse_keys : list, optional
        Extra keys handed to ``fuse`` together with the held keys and
        ``keys``.
    fast_functions : optional
        When not ``None``, replaces ``inline_functions_fast_functions``.
    inline_functions_fast_functions : optional
        Functions forwarded to ``inline_functions``; a falsy value skips
        inlining.
    rename_fused_keys : bool
        Forwarded to ``fuse`` as ``rename_keys``.
    **kwargs
        Accepted but not used by this function.

    Returns
    -------
    The culled ``HighLevelGraph`` when fusion is disabled, otherwise the
    result of ``optimize_slices`` on the low-level graph.
    """
    out_keys = list(flatten(keys if isinstance(keys, (list, set)) else [keys]))

    if isinstance(dsk, HighLevelGraph):
        graph = dsk
    else:
        graph = HighLevelGraph.from_collections(id(dsk), dsk, dependencies=())

    graph = optimize_blockwise(graph, keys=out_keys)
    graph = fuse_roots(graph, keys=out_keys)
    graph = graph.cull(set(out_keys))

    # Fusion is on by default here: only an explicit False turns it off.
    fusion_setting = config.get("optimization.fuse.active")
    if fusion_setting is False:
        return graph

    dependencies = graph.get_all_dependencies()
    tasks = ensure_dict(graph)

    # Low-level task optimizations; ``fast_functions`` takes precedence.
    to_inline = (
        inline_functions_fast_functions if fast_functions is None else fast_functions
    )

    held = hold_keys(tasks, dependencies)
    protected = held + out_keys + (fuse_keys or [])
    tasks, dependencies = fuse(
        tasks,
        protected,
        dependencies,
        rename_keys=rename_fused_keys,
    )

    if to_inline:
        tasks = inline_functions(
            tasks,
            out_keys,
            dependencies=dependencies,
            fast_functions=to_inline,
        )

    return optimize_slices(tasks)
def test_dont_merge_before_reductions():
    """Expect two ``Blockwise`` layers after optimizing a reduction chain.

    The chain is an index-preserving ``inc`` step followed by two ``sum``
    blockwise steps with an empty output index. After
    ``optimize_blockwise`` exactly two ``Blockwise`` layers must remain,
    and computing the intermediate ``z`` must not raise.
    """
    source = da.ones(10, chunks=(5,))
    y = da.blockwise(inc, "i", source, "i", dtype=source.dtype)
    z = da.blockwise(sum, "", y, "i", dtype=y.dtype)
    w = da.blockwise(sum, "", z, "", dtype=y.dtype)

    optimized = optimize_blockwise(w.dask)
    blockwise_layers = [
        layer for layer in optimized.dicts.values() if isinstance(layer, Blockwise)
    ]
    assert len(blockwise_layers) == 2

    z.compute()
def test_dont_merge_before_reductions():
    """Optimizing ``w``'s graph must leave exactly two ``Blockwise`` layers.

    ``y`` applies ``inc`` along index ``i``; ``z`` and ``w`` are ``sum``
    blockwise calls producing an empty output index. The test counts the
    ``Blockwise`` layers left by ``optimize_blockwise`` and then computes
    ``z``.
    """
    x = da.ones(10, chunks=(5,))
    dtype = x.dtype
    y = da.blockwise(inc, "i", x, "i", dtype=dtype)
    z = da.blockwise(sum, "", y, "i", dtype=y.dtype)
    w = da.blockwise(sum, "", z, "", dtype=y.dtype)

    dsk = optimize_blockwise(w.dask)

    remaining = 0
    for layer in dsk.dicts.values():
        if isinstance(layer, Blockwise):
            remaining += 1
    assert remaining == 2

    z.compute()
def test_daily_stock():
    """Smoke-test ``dd.demo.daily_stock`` and its blockwise optimization.

    Skipped unless ``pandas_datareader`` >= 0.8.0 is importable. Checks the
    returned collection type and partition count, then verifies that a
    column selection optimizes down to a single ``Blockwise`` layer.
    """
    pytest.importorskip("pandas_datareader", minversion="0.8.0")

    stock = dd.demo.daily_stock(
        "GOOG",
        start="2010-01-01",
        stop="2010-01-30",
        freq="1h",
    )
    assert isinstance(stock, dd.DataFrame)
    assert 10 < stock.npartitions < 31
    assert_eq(stock, stock)

    # Check `optimize_blockwise`: after selecting two columns the optimized
    # graph should hold exactly one layer, and it should be Blockwise.
    selected = stock[["open", "close"]]
    out_keys = [(selected._name, part) for part in range(selected.npartitions)]
    optimized = optimize_blockwise(selected.__dask_graph__(), out_keys)

    layers = optimized.layers
    first_name = list(layers)[0]
    assert len(layers) == 1
    assert isinstance(layers[first_name], Blockwise)
def optimize(
    dsk: Mapping,
    keys: Hashable | list[Hashable] | set[Hashable],
    **kwargs: Any,
) -> Mapping:
    """Apply high-level optimizations to ``dsk`` and cull it to ``keys``.

    Parameters
    ----------
    dsk
        Graph to optimize; anything that is not already a
        ``HighLevelGraph`` is wrapped with
        ``HighLevelGraph.from_collections``.
    keys
        Output key(s); a single key is wrapped in a list and nested
        containers are flattened.
    **kwargs
        Accepted but not used by this function.

    Returns
    -------
    The graph after ``optimize_blockwise``, ``fuse_roots`` and ``cull``.
    """
    wanted = list(flatten(keys if isinstance(keys, (list, set)) else [keys]))

    if isinstance(dsk, HighLevelGraph):
        graph = dsk
    else:
        graph = HighLevelGraph.from_collections(id(dsk), dsk, dependencies=())

    graph = optimize_blockwise(graph, keys=wanted)
    graph = fuse_roots(graph, keys=wanted)
    return graph.cull(set(wanted))
def test_make_timeseries_blockwise():
    """Check getitem and blockwise optimizations on ``make_timeseries``.

    Selecting columns ``x`` and ``y`` must (a) narrow the ``columns``
    attribute of the "make-timeseries" layer under
    ``optimize_dataframe_getitem`` and (b) leave a single ``Blockwise``
    layer under ``optimize_blockwise``.
    """
    ddf = dd.demo.make_timeseries()[["x", "y"]]
    out_keys = [(ddf._name, part) for part in range(ddf.npartitions)]

    # `optimize_dataframe_getitem` should rewrite the `columns` attribute
    # of the "make-timeseries" layer to the selected columns.
    projected = optimize_dataframe_getitem(ddf.__dask_graph__(), out_keys)
    timeseries_layers = [
        layer_name
        for layer_name in projected.layers
        if layer_name.startswith("make-timeseries-")
    ]
    source_key = timeseries_layers[0]
    assert set(projected.layers[source_key].columns) == {"x", "y"}

    # `optimize_blockwise` should fuse both `Blockwise` layers together
    # into a single `Blockwise` layer.
    fused = optimize_blockwise(ddf.__dask_graph__(), out_keys)
    layers = fused.layers
    only_name = list(layers)[0]
    assert len(layers) == 1
    assert isinstance(layers[only_name], Blockwise)
def test_from_delayed_to_dask_array():
    """Check that ``from_delayed`` followed by ``to_dask_array`` still optimizes.

    Regression test for
    https://github.com/dask-contrib/dask-sql/issues/497: the graph must
    survive ``optimize_blockwise`` plus ``cull``, and the array must
    compute to shape ``(9, 2)``.
    """
    from dask.blockwise import optimize_blockwise

    delayed_frames = []
    for _ in range(3):
        delayed_frames.append(delayed(pd.DataFrame)(np.ones((3, 2))))
    ddf = dd.from_delayed(delayed_frames)
    arr = ddf.to_dask_array()

    # If this graph is optimized without calling `fuse_roots`, the
    # underlying `BlockwiseDep` `mapping` keys will be 1-D (e.g. `(4,)`),
    # while the collection keys will be 2-D (e.g. `(4, 0)`).
    first_elements = [key[0] for key in arr.__dask_keys__()]
    optimized = optimize_blockwise(arr.dask, keys=first_elements)
    # The culled graph is discarded; only the call itself is exercised.
    optimized.cull(first_elements)

    computed = arr.compute()
    assert computed.shape == (9, 2)