def test_annotations_survive_optimization():
    with dask.annotate(foo="bar"):
        graph = HighLevelGraph.from_collections(
            "b",
            {"a": 1, "b": (inc, "a"), "c": (inc, "b")},
            [],
        )
        d = Delayed("b", graph)

        assert type(d.dask) is HighLevelGraph
        assert len(d.dask.layers) == 1
        assert len(d.dask.layers["b"]) == 3
        assert d.dask.layers["b"].annotations == {"foo": "bar"}

    # Ensure optimizing a Delayed object returns a HighLevelGraph
    # and doesn't lose annotations
    (d_opt,) = dask.optimize(d)
    assert type(d_opt.dask) is HighLevelGraph
    assert len(d_opt.dask.layers) == 1
    assert len(d_opt.dask.layers["b"]) == 2  # c is culled
    assert d_opt.dask.layers["b"].annotations == {"foo": "bar"}

def test_single_annotation(annotation):
    with dask.annotate(**annotation):
        A = da.ones((10, 10), chunks=(5, 5))

    alayer = A.__dask_graph__().layers[A.name]
    assert alayer.annotations == annotation
    assert dask.config.get("annotations", None) is None

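# Sketch of how the `annotation` argument above might be supplied: pytest
# typically injects it via a parametrize decorator. The specific annotation
# values below are illustrative assumptions, not the original parametrization.
import pytest
import dask
import dask.array as da


@pytest.mark.parametrize(
    "annotation",
    [{"workers": ["alice"]}, {"retries": 2}],
)
def test_single_annotation_sketch(annotation):
    with dask.annotate(**annotation):
        A = da.ones((10, 10), chunks=(5, 5))
    # The layer built inside the context carries the annotation dict verbatim.
    assert A.__dask_graph__().layers[A.name].annotations == annotation
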
async def test_compute(c, s, a, b):
    with dask.annotate(resources={"A": 1}):
        x = delayed(inc)(1)
    with dask.annotate(resources={"B": 1}):
        y = delayed(inc)(x)

    yy = c.compute(y, optimize_graph=False)
    await wait(yy)

    assert b.data

    xs = [delayed(inc)(i) for i in range(10, 20)]
    xxs = c.compute(xs, resources={"B": 1})
    await wait(xxs)

    assert len(b.data) > 10

def test_to_dataframe_optimize_graph():
    pytest.importorskip("dask.dataframe")

    from dask.dataframe.utils import assert_eq as assert_eq_df

    x = db.from_sequence(
        [{"name": "test1", "v1": 1}, {"name": "test2", "v1": 2}], npartitions=2
    )

    # linear `map` tasks will be fused by graph optimization
    with dask.annotate(foo=True):
        y = x.map(lambda a: dict(**a, v2=a["v1"] + 1))
        y = y.map(lambda a: dict(**a, v3=a["v2"] + 1))
        y = y.map(lambda a: dict(**a, v4=a["v3"] + 1))

    # verifying the maps are not fused yet
    assert len(y.dask) == y.npartitions * 4

    # with optimizations
    d = y.to_dataframe()

    # All the `map` tasks have been fused
    assert len(d.dask) < len(y.dask)

    # no optimizations
    d2 = y.to_dataframe(optimize_graph=False)

    # Graph hasn't been fused. It contains all the original tasks,
    # plus one extra layer converting to DataFrame
    assert len(d2.dask) == len(y.dask) + d.npartitions

    # Annotations are still there
    assert hlg_layer_topological(d2.dask, 1).annotations == {"foo": True}

    assert_eq_df(d, d2)

async def test_dataframe_annotations(c, s, a, b):
    retries = 5
    plugin = ExampleAnnotationPlugin(retries=retries)
    s.add_plugin(plugin)

    assert plugin in s.plugins

    df = dd.from_pandas(
        pd.DataFrame(
            {"a": np.arange(10, dtype=int), "b": np.arange(10, 0, -1, dtype=float)}
        ),
        npartitions=5,
    )
    df = df.shuffle("a", shuffle="tasks", max_branch=2)
    acol = df["a"]
    bcol = df["b"]

    with dask.annotate(retries=retries):
        df = acol + bcol

    with dask.config.set(optimization__fuse__active=False):
        rdf = await c.compute(df)

    assert rdf.dtypes == np.float64
    assert (rdf == 10.0).all()

    # There is an annotation match per partition (i.e. task)
    assert plugin.retry_matches == df.npartitions

def test_multiple_annotations():
    with dask.annotate(block_id=annot_map_fn):
        with dask.annotate(resource="GPU"):
            A = da.ones((10, 10), chunks=(5, 5))

        B = A + 1

    C = B + 1

    assert dask.config.get("annotations", None) is None

    alayer = A.__dask_graph__().layers[A.name]
    blayer = B.__dask_graph__().layers[B.name]
    clayer = C.__dask_graph__().layers[C.name]
    assert alayer.annotations == {"resource": "GPU", "block_id": annot_map_fn}
    assert blayer.annotations == {"block_id": annot_map_fn}
    assert clayer.annotations is None

def test_multiple_annotations():
    da = pytest.importorskip("dask.array")

    with dask.annotate(block_id=annot_map_fn):
        with dask.annotate(resources={"GPU": 1}):
            A = da.ones((10, 10), chunks=(5, 5))

        B = A + 1

    C = B + 1

    assert dask.config.get("annotations", None) is None

    alayer = A.__dask_graph__().layers[A.name]
    blayer = B.__dask_graph__().layers[B.name]
    clayer = C.__dask_graph__().layers[C.name]
    assert alayer.annotations == {"resources": {"GPU": 1}, "block_id": annot_map_fn}
    assert blayer.annotations == {"block_id": annot_map_fn}
    assert clayer.annotations is None

def test_worker_actor_handle_is_weakref_from_compute_sync(client):
    workers = list(client.run(lambda: None))

    with dask.annotate(workers=workers[0]):
        counter = dask.delayed(Counter)()
    with dask.annotate(workers=workers[1]):
        intermediate = dask.delayed(lambda c: None)(counter)
    with dask.annotate(workers=workers[0]):
        final = dask.delayed(lambda x, c: x)(intermediate, counter)

    final.compute(actors=counter, optimize_graph=False)

    def worker_tasks_running(dask_worker):
        return len(dask_worker.data) + len(dask_worker.actors)

    start = time()
    while any(client.run(worker_tasks_running).values()):
        sleep(0.01)
        assert time() < start + 30

async def test_persist_multiple_collections(c, s, a, b):
    with dask.annotate(resources={"A": 1}):
        x = delayed(inc)(1)
        y = delayed(inc)(x)

    xx, yy = c.persist([x, y], optimize_graph=False)

    await wait([xx, yy])

    assert x.key in a.data
    assert y.key in a.data
    assert not b.data

async def test_persist_collections(c, s, a, b):
    da = pytest.importorskip("dask.array")
    x = da.arange(10, chunks=(5,))

    with dask.annotate(resources={"A": 1}):
        y = x.map_blocks(lambda x: x + 1)
        z = y.map_blocks(lambda x: 2 * x)
        w = z.sum()

    ww, yy = c.persist([w, y], optimize_graph=False)
    await wait([ww, yy])

    assert all(stringify(key) in a.data for key in y.__dask_keys__())

def test_fuse_roots_annotations():
    x = da.ones(10, chunks=(2,))
    y = da.zeros(10, chunks=(2,))

    with dask.annotate(foo="bar"):
        y = y ** 2

    z = (x + 1) + (2 * y)
    hlg = dask.blockwise.optimize_blockwise(z.dask)
    assert len(hlg.layers) == 3
    assert {"foo": "bar"} in [l.annotations for l in hlg.layers.values()]
    za = da.Array(hlg, z.name, z.chunks, z.dtype)
    assert_eq(za, z)

def test_collections_get(client, optimize_graph, s, a, b):
    da = pytest.importorskip("dask.array")

    async def f(dask_worker):
        await dask_worker.set_resources(**{"A": 1})

    client.run(f, workers=[a["address"]])

    with dask.annotate(resources={"A": 1}):
        x = da.random.random(100, chunks=(10,)) + 1

    x.compute(optimize_graph=optimize_graph)

    def g(dask_worker):
        return len(dask_worker.log)

    logs = client.run(g)
    assert logs[a["address"]]
    assert not logs[b["address"]]

def persist_across_workers(client, objects, workers=None):
    """
    Calls persist on 'objects', ensuring they are spread across the
    workers listed in 'workers'.

    Parameters
    ----------
    client : dask.distributed.Client
    objects : list
        Dask distributed objects to be persisted
    workers : list or None
        List of workers across which to persist objects.
        If None, then all workers attached to 'client' will be used.
    """
    if workers is None:
        workers = client.has_what().keys()  # Default to all workers

    if check_min_dask_version("2020.12.0"):
        with dask.annotate(workers=set(workers)):
            return client.persist(objects)
    else:
        return client.persist(objects, workers={o: workers for o in objects})

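# Hypothetical usage sketch for persist_across_workers. The cluster setup and
# collection names below are illustrative assumptions, not part of the
# original source.
import dask.array as da
from dask.distributed import Client

client = Client()  # start or connect to a cluster
arrays = [da.random.random((1000, 1000), chunks=(250, 250)) for _ in range(3)]

# Persist all three arrays; with workers=None they are spread across every
# worker known to the client.
persisted = persist_across_workers(client, arrays)
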
import ray
from ray.util.dask import enable_dask_on_ray

import dask
import dask.array as da

# Start Ray.
# Tip: If connecting to an existing cluster, use ray.init(address="auto").
ray.init()

# Use our Dask config helper to set the scheduler to ray_dask_get globally,
# without having to specify it on each compute call.
enable_dask_on_ray()

# All Ray tasks that underlie the Dask operations performed in an annotation
# context will require the indicated resources: 2 CPUs and 0.01 of the custom
# resource.
with dask.annotate(
    ray_remote_args=dict(num_cpus=2, resources={"custom_resource": 0.01})
):
    d_arr = da.ones(100)

# Operations on the same collection can have different annotations.
with dask.annotate(ray_remote_args=dict(resources={"other_custom_resource": 0.01})):
    d_arr = 2 * d_arr

# This happens outside of the annotation context, so no resource constraints
# will be attached to the underlying Ray tasks for the sum() operation.
sum_ = d_arr.sum()

# Compute the result, passing in a default resource request that will be
# applied to all operations that aren't already annotated with a resource
# request. In this case, only the sum() operation will get this default
# resource request.
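# Illustrative completion (not part of the original snippet): pass a default
# ray_remote_args to compute(); it applies only to operations that were not
# annotated above, i.e. the sum() step in this example. The num_cpus value is
# an assumption.
result = sum_.compute(ray_remote_args=dict(num_cpus=1))
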
def wrapped_fn(*args, **kwargs):
    if dd is None or not hasattr(dask, "annotate"):
        return fn(*args, **kwargs)

    with dask.annotate(ray_remote_args=dict(scheduling_strategy="SPREAD")):
        return fn(*args, **kwargs)