def test_str_graph():
    """Check how ``str_graph`` rewrites keys and key references.

    Covers graphs with plain string keys, tuple keys, tasks that refer to
    tuple keys, equality of ``dask.get`` results before and after the
    conversion, and keys that appear inside a ``SubgraphCallable`` or are
    supplied through ``extra_values``.
    """
    # String keys: the converted graph compares equal to the input.
    plain = {"x": 1}
    assert str_graph(plain) == plain

    # A tuple key is replaced by its ``str`` form.
    single = {("x", 1): (inc, 1)}
    assert str_graph(single) == {str(("x", 1)): (inc, 1)}

    # A tuple key used as a task argument is replaced as well.
    chained = {("x", 1): (inc, 1), ("x", 2): (inc, ("x", 1))}
    expected = {
        str(("x", 1)): (inc, 1),
        str(("x", 2)): (inc, str(("x", 1))),
    }
    assert str_graph(chained) == expected

    graphs = [
        {"x": 1},
        {("x", 1): (inc, 1), ("x", 2): (inc, ("x", 1))},
        {("x", 1): (sum, [1, 2, 3]), ("x", 2): (sum, [("x", 1), ("x", 1)])},
    ]
    for graph in graphs:
        converted = str_graph(graph)
        original_keys = list(graph)
        string_keys = [str(key) for key in original_keys]
        assert all(isinstance(key, str) for key in converted)
        assert dask.get(graph, original_keys) == dask.get(converted, string_keys)

    # Keys inside a SubgraphCallable and keys listed in ``extra_values``.
    nested = {
        ("y", 1): (
            SubgraphCallable({"x": ("y", 1)}, "x", (("y", 1),)),
            (("z", 1),),
        )
    }
    nested = str_graph(nested, extra_values=(("z", 1),))
    assert nested["('y', 1)"][0].dsk["x"] == "('y', 1)"
    assert nested["('y', 1)"][1][0] == "('z', 1)"
def test_identical_nodes(
    optimizer: Tuple[
        str,
        Callable[[Dict[Hashable, Any], Union[Hashable, Iterable[Hashable]]], Dict[Hashable, Any]],
    ]
) -> None:
    """Optimize and compute a graph that contains two identical tasks.

    ``foo1`` and ``foo2`` are the same call ``foo(1)``.  The graph is
    optimized and evaluated twice in a row and must produce ``(4,)`` for
    ``top1`` both times.
    """
    cache_dir, graphchain_optimize = optimizer

    def foo(x: int) -> int:
        return x + 1

    def bar(*args: int) -> int:
        return sum(args)

    dsk = {"foo1": (foo, 1), "foo2": (foo, 1), "top1": (bar, "foo1", "foo2")}

    # Two identical passes: the first run followed by the second run.
    for _run in range(2):
        newdsk = graphchain_optimize(dsk, ["top1"])  # type: ignore[arg-type]
        result = dask.get(newdsk, ["top1"])
        assert result == (4,)
def test_str_graph():
    """Check how ``str_graph`` rewrites keys and key references.

    This variant starts from a graph with a ``bytes`` key and accepts both
    ``str`` and ``bytes`` keys in the converted graphs.
    """
    # A bytes key: the converted graph compares equal to the input.
    bytes_keyed = {b'x': 1}
    assert str_graph(bytes_keyed) == bytes_keyed

    # A tuple key is replaced by its ``str`` form.
    single = {('x', 1): (inc, 1)}
    assert str_graph(single) == {str(('x', 1)): (inc, 1)}

    # A tuple key used as a task argument is replaced as well.
    chained = {('x', 1): (inc, 1), ('x', 2): (inc, ('x', 1))}
    expected = {
        str(('x', 1)): (inc, 1),
        str(('x', 2)): (inc, str(('x', 1))),
    }
    assert str_graph(chained) == expected

    graphs = [
        {'x': 1},
        {('x', 1): (inc, 1), ('x', 2): (inc, ('x', 1))},
        {('x', 1): (sum, [1, 2, 3]), ('x', 2): (sum, [('x', 1), ('x', 1)])},
    ]
    for graph in graphs:
        converted = str_graph(graph)
        original_keys = list(graph)
        string_keys = [str(key) for key in original_keys]
        assert all(isinstance(key, (str, bytes)) for key in converted)
        assert dask.get(graph, original_keys) == dask.get(converted, string_keys)
def test_cache_deletion(
        dask_graph: Dict[Hashable, Any],
        optimizer: Tuple[
            str,
            Callable[[Dict[Hashable, Any]], Dict[Hashable, Any]]]) \
        -> None:
    """Test cache deletion.

    Tests the ability to obtain results in the event that cache files are
    deleted (in the event of a cache-miss, the exec-store wrapper should be
    re-run by the load-wrapper).

    The cache directory is emptied first, then the graph is optimized and
    computed twice; the result of the second computation is checked.
    """
    dsk = dask_graph
    cache_dir, graphchain_optimize = optimizer

    # Cleanup first.  The filesystem handle is only needed for this step,
    # so it is closed right away instead of being left open.
    storage = fs.osfs.OSFS(cache_dir)
    try:
        storage.removetree("/")
    finally:
        storage.close()

    # Run optimizer (first time)
    newdsk = graphchain_optimize(dsk, keys=["top1"])  # type: ignore
    result = dask.get(newdsk, ["top1"])

    # NOTE(review): nothing is deleted between the two runs, so the second
    # run sees whatever the first one stored -- confirm this is intended.
    newdsk = graphchain_optimize(dsk, keys=["top1"])  # type: ignore
    result = dask.get(newdsk, ["top1"])

    # Check the final result
    assert result == (-14, )
def test_groupby_tasks():
    """Check partition disjointness of ``groupby(..., method='tasks')``.

    For three bags of different size and partitioning, the first elements of
    the items in any two distinct output partitions must not overlap.  The
    second case additionally bounds the size of the resulting graph.
    """

    def assert_keys_disjoint(parts):
        # Compare every ordered pair of distinct partitions.
        for left in parts:
            for right in parts:
                if left is not right:
                    assert not set(pluck(0, left)) & set(pluck(0, right))

    bag = db.from_sequence(range(160), npartitions=4)
    grouped = bag.groupby(lambda x: x % 10, max_branch=4, method='tasks')
    assert_keys_disjoint(dask.get(grouped.dask, grouped._keys()))

    bag = db.from_sequence(range(1000), npartitions=100)
    grouped = bag.groupby(lambda x: x % 123, method='tasks')
    assert len(grouped.dask) < 100**2
    assert_keys_disjoint(dask.get(grouped.dask, grouped._keys()))

    bag = db.from_sequence(range(10000), npartitions=345)
    grouped = bag.groupby(lambda x: x % 2834, max_branch=24, method='tasks')
    assert_keys_disjoint(dask.get(grouped.dask, grouped._keys()))
def test_run_smaller_sections(abcde):
    r"""Check the execution order of a graph with a small side section.

            aa
           / |
      b   d  bb dd
     / \ /|  | /
    a   c e  cc

    Prefer to run acb first because then we can get that out of the way

    The docstring is a raw string because the diagram contains a backslash,
    which would otherwise be an invalid escape sequence.
    """
    a, b, c, d, e = abcde
    aa, bb, cc, dd = [x * 2 for x in [a, b, c, d]]

    expected = [a, c, b, e, d, cc, bb, aa, dd]

    log = []

    def f(x):
        # Build a task function that records its key when it is executed.
        def _(*args):
            log.append(x)
        return _

    dsk = {a: (f(a),),
           c: (f(c),),
           e: (f(e),),
           cc: (f(cc),),
           b: (f(b), a, c),
           d: (f(d), c, e),
           bb: (f(bb), cc),
           aa: (f(aa), d, bb),
           dd: (f(dd), cc)}

    dask.get(dsk, [aa, b, dd])  # trigger computation

    assert log == expected
def test_str_graph():
    """Check how ``str_graph`` rewrites keys and key references.

    Covers string keys, tuple keys, tasks that refer to tuple keys, and the
    equality of ``dask.get`` results before and after the conversion.
    """
    # String keys: the converted graph compares equal to the input.
    plain = {"x": 1}
    assert str_graph(plain) == plain

    # A tuple key is replaced by its ``str`` form.
    single = {("x", 1): (inc, 1)}
    assert str_graph(single) == {str(("x", 1)): (inc, 1)}

    # A tuple key used as a task argument is replaced as well.
    chained = {("x", 1): (inc, 1), ("x", 2): (inc, ("x", 1))}
    expected = {
        str(("x", 1)): (inc, 1),
        str(("x", 2)): (inc, str(("x", 1))),
    }
    assert str_graph(chained) == expected

    graphs = [
        {"x": 1},
        {("x", 1): (inc, 1), ("x", 2): (inc, ("x", 1))},
        {("x", 1): (sum, [1, 2, 3]), ("x", 2): (sum, [("x", 1), ("x", 1)])},
    ]
    for graph in graphs:
        converted = str_graph(graph)
        original_keys = list(graph)
        string_keys = [str(key) for key in original_keys]
        assert all(isinstance(key, str) for key in converted)
        assert dask.get(graph, original_keys) == dask.get(converted, string_keys)
def test_ephemeral_locking(zk, dsk2):
    """Taking the same ephemeral lock twice must time out.

    Two ``Lock`` contexts with the same name are entered one inside the
    other and ``LockTimeout`` is expected.  Afterwards reading
    ``/epos/dsk2`` must raise ``NoNodeError``.
    """
    lock_options = dict(name="dsk2", timeout=1, ephemeral=True)

    with pytest.raises(LockTimeout):
        with Lock(zk, **lock_options):
            with Lock(zk, **lock_options):
                get(dsk2, 'f')

    with pytest.raises(NoNodeError):
        zk.get("/epos/dsk2")
def test_turn_off_fusion():
    """Optimizing under ``fuse_ave_width=0`` keeps results, enlarges graph.

    The same array expression is optimized once outside and once inside
    ``dask.set_options(fuse_ave_width=0)``.  Both graphs must compute equal
    results and the first graph must have fewer entries than the second.
    """
    ones = da.ones(10, chunks=(5, ))
    total = da.sum(ones + 1 + 2 + 3)

    default_graph = total._optimize(total.dask, total._keys())

    with dask.set_options(fuse_ave_width=0):
        unfused_graph = total._optimize(total.dask, total._keys())

    default_result = dask.get(default_graph, total._keys())
    unfused_result = dask.get(unfused_graph, total._keys())
    assert default_result == unfused_result
    assert len(default_graph) < len(unfused_graph)
def test_turn_off_fusion():
    """Optimizing under ``fuse_ave_width=0`` keeps results, enlarges graph.

    The same array expression is optimized once outside and once inside
    ``dask.config.set(fuse_ave_width=0)``.  Both graphs must compute equal
    results and the first graph must have fewer entries than the second.
    """
    ones = da.ones(10, chunks=(5,))
    total = da.sum(ones + 1 + 2 + 3)

    default_graph = total.__dask_optimize__(total.dask, total.__dask_keys__())

    with dask.config.set(fuse_ave_width=0):
        unfused_graph = total.__dask_optimize__(total.dask, total.__dask_keys__())

    default_result = dask.get(default_graph, total.__dask_keys__())
    unfused_result = dask.get(unfused_graph, total.__dask_keys__())
    assert default_result == unfused_result
    assert len(default_graph) < len(unfused_graph)
def f(c, a, b):
    """Coroutine: ``_get`` against ``c`` must agree with ``dask.get``.

    Two small graphs are evaluated through ``_get`` (with ``gather=True``)
    and each result is compared with a local ``dask.get`` of the same graph
    and key.
    """
    cases = [
        ({'x': 1, 'y': (inc, 'x'), 'z': (inc, 'y')}, 'z'),
        ({'x': 1, 'y': (inc, 'x'), 'z': (inc, 'y'),
          'a': (inc, 'z'), 'b': (add, 'a', 'x')}, 'b'),
    ]
    for graph, target in cases:
        remote = yield _get(c.ip, c.port, graph, target, gather=True)
        assert remote == dask.get(graph, target)
def test_turn_off_fusion():
    """Optimizing with fuse ave-width 0 keeps results, enlarges the graph.

    The same array expression is optimized once outside and once inside
    ``dask.config.set({"optimization.fuse.ave-width": 0})``.  Both graphs
    must compute equal results and the first graph must have fewer entries
    than the second.
    """
    ones = da.ones(10, chunks=(5, ))
    total = da.sum(ones + 1 + 2 + 3)

    default_graph = total.__dask_optimize__(total.dask, total.__dask_keys__())

    with dask.config.set({"optimization.fuse.ave-width": 0}):
        unfused_graph = total.__dask_optimize__(total.dask, total.__dask_keys__())

    default_result = dask.get(default_graph, total.__dask_keys__())
    unfused_result = dask.get(unfused_graph, total.__dask_keys__())
    assert default_result == unfused_result
    assert len(default_graph) < len(unfused_graph)
def test_targeted_callback(fun, tgt, dsk):
    """``fun`` is called when the target is missing and skipped otherwise.

    Inside a ``TargetedCallback`` context the key ``'a'`` is computed twice:
    first with ``tgt.exists`` returning ``False`` (``fun`` must be called
    exactly once), then with it returning ``True`` (``fun`` must not be
    called).
    """
    from dask import get

    with TargetedCallback():
        # Target missing.
        tgt.exists.return_value = False
        get(dsk, 'a')
        fun.assert_called_once()

        # Start the second half with a clean call record.
        fun.reset_mock()

        # Target present.
        tgt.exists.return_value = True
        get(dsk, 'a')
        fun.assert_not_called()
def test_shuffle(shuffle):
    """Check type, partition count, disjointness and naming of a shuffle.

    ``d`` is shuffled on its ``b`` column.  The values of ``b`` in the first
    two output partitions must not overlap, the output graph must contain the
    input graph, and shuffling the same input twice must give the same name.
    """
    shuffled = shuffle_func(d, d.b, shuffle=shuffle)
    assert isinstance(shuffled, dd.DataFrame)
    assert shuffled.npartitions == d.npartitions

    first = dask.get(shuffled.dask, (shuffled._name, 0))
    second = dask.get(shuffled.dask, (shuffled._name, 1))

    shared_values = set(first.b) & set(second.b)
    assert not shared_values  # disjoint

    assert set(shuffled.dask).issuperset(d.dask)

    assert shuffle_func(d, d.b)._name == shuffle_func(d, d.b)._name
def test_order_cycle():
    """Cyclic graphs must raise ``RuntimeError`` matching "Cycle detected".

    The first check goes through ``dask.get``; the remaining ones call
    ``order`` directly on a series of cyclic graphs.
    """
    with pytest.raises(RuntimeError, match="Cycle detected"):
        dask.get({"a": (f, "a")}, "a")  # we encounter this in `get`

    cyclic_graphs = [
        {"a": (f, "a")},  # trivial self-loop
        {("a", 0): (f, ("a", 0))},  # non-string
        {"a": (f, "b"), "b": (f, "c"), "c": (f, "a")},  # non-trivial loop
        {"a": (f, "b"), "b": (f, "c"), "c": (f, "a", "d"), "d": 1},
        {"a": (f, "b"), "b": (f, "c"), "c": (f, "a", "d"), "d": (f, "b")},
    ]
    for graph in cyclic_graphs:
        with pytest.raises(RuntimeError, match="Cycle detected"):
            order(graph)
def test_persisting(zk, dsk1):
    """Results stored by ``Persist`` are reused and outlive the context.

    The first computation of ``'w'`` runs steps ``z`` and ``w``; the second
    runs no steps.  The stored values are read back from ``/epos/dsk1``
    both inside and after the ``Persist`` context.
    """

    def stored(path):
        # Decode the payload held at a znode path.
        return loads(zk.get(path)[0])

    with Persist(zk, name="dsk1"):
        with Ran() as first_run:
            assert get(dsk1, 'w') == 6
            assert first_run.steps == ['z', 'w']

        with Ran() as second_run:
            assert get(dsk1, 'w') == 6
            assert second_run.steps == []

        assert stored("/epos/dsk1/z") == 3
        assert stored("/epos/dsk1/w") == 6

    # tests ephemeral=False, znode still exists after context handler
    assert stored("/epos/dsk1/w") == 6
def test_stringify():
    """Check ``stringify`` on strings, bytes, graphs and nested callables.

    ``str`` and ``bytes`` inputs are returned as the same object.  Other
    objects are turned into their ``str`` form unless ``exclusive`` is
    given, in which case only the listed keys are converted where they are
    referenced.
    """
    for text in ("Hello", b"Hello"):
        assert stringify(text) is text

    plain = {"x": 1}
    assert stringify(plain) == str(plain)
    assert stringify(plain, exclusive=()) == plain

    single = {("x", 1): (inc, 1)}
    assert stringify(single) == str({("x", 1): (inc, 1)})
    assert stringify(single, exclusive=()) == {("x", 1): (inc, 1)}

    chained = {("x", 1): (inc, 1), ("x", 2): (inc, ("x", 1))}
    expected = {
        ("x", 1): (inc, 1),
        ("x", 2): (inc, str(("x", 1))),
    }
    assert stringify(chained, exclusive=chained) == expected

    graphs = [
        {"x": 1},
        {("x", 1): (inc, 1), ("x", 2): (inc, ("x", 1))},
        {("x", 1): (sum, [1, 2, 3]), ("x", 2): (sum, [("x", 1), ("x", 1)])},
    ]
    for graph in graphs:
        converted = {
            stringify(key): stringify(task, exclusive=graph)
            for key, task in graph.items()
        }
        original_keys = list(graph)
        string_keys = [str(key) for key in original_keys]
        assert all(isinstance(key, str) for key in converted)
        assert get(graph, original_keys) == get(converted, string_keys)

    # Keys inside a SubgraphCallable and an extra key given via ``exclusive``.
    nested = {
        ("y", 1): (
            SubgraphCallable({"x": ("y", 1)}, "x", (("y", 1),)),
            (("z", 1),),
        )
    }
    nested = stringify(nested, exclusive=set(nested) | {("z", 1)})
    assert nested[("y", 1)][0].dsk["x"] == "('y', 1)"
    assert nested[("y", 1)][1][0] == "('z', 1)"
def test_exec_only_nodes(
    dask_graph: Dict[Hashable, Any],
    optimizer_exec_only_nodes: Tuple[
        str,
        Callable[[Dict[Hashable, Any], Union[Hashable, Iterable[Hashable]]], Dict[Hashable, Any]],
    ],
) -> None:
    """Test skipping some tasks.

    Checks that execution-only nodes run when dependencies of their parent
    nodes in the dask graph are modified: the cache directory is emptied,
    the graph is optimized and computed, the function behind ``goo1`` is
    replaced, and the graph is optimized and computed again.
    """
    dsk = dask_graph
    cache_dir, graphchain_optimize = optimizer_exec_only_nodes

    # Cleanup temporary directory
    for entry in os.listdir(cache_dir):
        entrypath = os.path.join(cache_dir, entry)
        if not os.path.isdir(entrypath):
            os.remove(entrypath)
            continue
        shutil.rmtree(entrypath, ignore_errors=True)
    assert not os.listdir(cache_dir)

    # Run optimizer first time
    first_graph = graphchain_optimize(dsk, ["top1"])
    assert dask.get(first_graph, ["top1"]) == (-14,)

    # Modify function
    def goo(*args: int) -> int:
        # hash miss this!
        return sum(args) + 1

    previous_args = dsk["goo1"][1:]
    dsk["goo1"] = (goo, *previous_args)

    # Run optimizer a second time
    second_graph = graphchain_optimize(dsk, ["top1"])

    # Check the final result:
    # The output of node 'boo1' is needed at node 'baz2'
    # because 'goo1' was modified. A matching result indicates
    # that the boo1 node was executed, its dependencies loaded
    # which is the desired behaviour in such cases.
    assert dask.get(second_graph, ["top1"]) == (-14,)
def say_hello():
    """Run the fixed AOI graph on the posted GeoJSON and return JSON.

    Reads ``geojson`` from the request's JSON body (serializing it when it
    is a dict), builds the graph with ``create_dag_from_json``, computes the
    ``dissolved-geom`` and ``aoi-area`` outputs and returns them with
    status 200.  Dict results that contain a ``features`` entry are passed
    through ``analysis_funcs.ogr2json`` first.
    """
    geojson = request.json['geojson']
    if isinstance(geojson, dict):
        geojson = json.dumps(geojson)

    graph_spec = {
        "aoi": ["geojson", geojson],
        "aoi-dissolved": ["dissolve", "aoi"],
        "dissolved-geom": ["split", "aoi-dissolved"],
        "aoi-prj": ["project_local", "dissolved-geom"],
        "aoi-area": ["get_area", "aoi-prj"]
    }
    graph = create_dag_from_json(graph_spec)

    outputs = ['dissolved-geom', 'aoi-area']
    results = dask.get(graph, outputs)

    final_output = {}
    for name, result in zip(outputs, results):
        has_features = isinstance(result, dict) and 'features' in result
        final_output[name] = (
            analysis_funcs.ogr2json(result) if has_features else result
        )

    return jsonify(final_output), 200
def test_second_run(
        dask_graph: Dict[Hashable, Any],
        optimizer: Tuple[
            str,
            Callable[[Dict[Hashable, Any]], Dict[Hashable, Any]]]) \
        -> None:
    """Second run.

    Runs the graphchain ``optimize`` function on the graph, checks the
    computed result for ``top1`` and checks that every task in the optimized
    graph is a tuple whose first element is a ``CachedComputation``.
    """
    dsk = dask_graph
    graphchain_optimize = optimizer[1]

    # Run optimizer
    newdsk = graphchain_optimize(dsk, keys=["top1"])  # type: ignore

    # Check the final result
    assert dask.get(newdsk, ["top1"]) == (-14, )

    # Check that the functions are wrapped for loading
    for key in dsk:
        wrapped = newdsk[key]
        assert isinstance(wrapped, tuple)
        assert isinstance(wrapped[0], CachedComputation)
def f(c, a, b):
    """Coroutine: arrays backed by futures must match local arrays.

    Random blocks are scattered through an ``Executor``.  For several array
    expressions the result computed locally with ``dask.get`` is compared
    with the result computed through the executor from the scattered
    futures.
    """
    executor = Executor((c.ip, c.port), start=False, loop=loop)
    yield executor._start()

    x_dsk = {('x', i, j): np.random.random((3, 3))
             for i in range(3) for j in range(2)}
    y_dsk = {('y', i, j): np.random.random((3, 3))
             for i in range(2) for j in range(3)}
    x_futures = yield executor._scatter(x_dsk)
    y_futures = yield executor._scatter(y_dsk)

    dt = np.random.random(0).dtype
    x_chunks = ((3, 3, 3), (3, 3))
    y_chunks = ((3, 3), (3, 3, 3))
    x_local = da.Array(x_dsk, 'x', x_chunks, dt)
    y_local = da.Array(y_dsk, 'y', y_chunks, dt)
    x_remote = da.Array(x_futures, 'x', x_chunks, dt)
    y_remote = da.Array(y_futures, 'y', y_chunks, dt)

    exprs = [
        lambda x, y: x.T + y,
        lambda x, y: x.mean() + y.mean(),
        lambda x, y: x.dot(y).std(axis=0),
        lambda x, y: x - x.mean(axis=1)[:, None],
    ]

    for expr in exprs:
        local = expr(x_local, y_local)
        local_results = dask.get(local.dask, local._keys())
        local_result = da.Array._finalize(local, local_results)

        remote = expr(x_remote, y_remote)
        remote_results = yield executor._get(remote.dask, remote._keys())
        remote_result = da.Array._finalize(remote, remote_results)

        assert np.all(local_result == remote_result)

    yield executor._shutdown()
def test_chunked_dot_product():
    """A blockwise dot product must equal ``np.dot`` on the full arrays.

    A 20x20 array and a 20x20 array of ones are split into 5x5 blocks,
    combined with ``top(dotmany, ...)`` over the shared index ``j``, and the
    concatenated 4x4 grid of output blocks is compared with ``np.dot``.
    """
    data = np.arange(400).reshape((20, 20))
    ones = np.ones((20, 20))

    arrays = {'x': data, 'o': ones}
    data_blocks = getem('x', (5, 5), shape=(20, 20))
    ones_blocks = getem('o', (5, 5), shape=(20, 20))

    numblocks = {'x': (4, 4), 'o': (4, 4)}
    product = top(dotmany, 'out', 'ik', 'x', 'ij', 'o', 'jk',
                  numblocks=numblocks)

    dsk = merge(arrays, data_blocks, ones_blocks, product)
    out_keys = [[('out', i, j) for j in range(4)] for i in range(4)]
    out = dask.get(dsk, out_keys)

    assert eq(np.dot(data, ones), concatenate3(out))
def __call__(self, input_buffer):
    """Generator: feed buffers into ``self.dsk`` and yield its outputs.

    On each pass the ``'input'`` entry of ``self.dsk`` is set either to the
    next item of ``input_buffer`` or, when a previous output was a ``More``
    instance, to ``Stream.NoNewData``.  All keys are then computed with
    ``dask.get``.  ``More`` outputs are replaced by their ``output``
    attribute.  The generator returns once every output equals
    ``Stream.EndOfStream``; otherwise it adds ``'t'`` (``self.t_.end``) to
    the outputs and yields them.
    """
    keys = sorted(['input'] + list(self.dsk.keys()))
    more = False
    while True:
        if not more:
            buf = next(input_buffer)
            # Only real data buffers extend the tracked extent.
            if buf not in (Stream.EndOfStream, Stream.NoNewData):
                self.t_ |= buf.getExtent()
            self.dsk['input'] = buf
        else:
            # A node asked to be called again without new input.
            self.dsk['input'] = Stream.NoNewData
            more = False

        outputs = dict(zip(keys, dask.get(self.dsk, keys)))

        for key in keys:
            value = outputs[key]
            if isinstance(value, More):
                more = True
                outputs[key] = value.output

        if all(Stream.EndOfStream == o for o in outputs.values()):
            return

        outputs['t'] = self.t_.end
        yield outputs
def test_ephemeral_persisting(zk, dsk2):
    """Ephemeral persisted results vanish after the ``Persist`` context.

    Inside the context, steps are only run the first time they are needed
    and the stored value of ``e`` can be read from ``/test/dags/dsk2``.
    After the context, reading that node must raise ``NoNodeError``.
    """
    stored_e = "/test/dags/dsk2/e"

    with Persist(zk, name="dsk2", ns="/test/dags", ephemeral=True):
        with Ran() as first_run:
            assert get(dsk2, 'e') == 10
            assert first_run.steps == ['e']

        with Ran() as second_run:
            assert get(dsk2, 's') == 0.4
            assert second_run.steps == ['f', 's']

        with Ran() as third_run:
            assert get(dsk2, 's') == 0.4
            assert third_run.steps == []

        assert loads(zk.get(stored_e)[0]) == 10

    with pytest.raises(NoNodeError):
        zk.get(stored_e)
def NO_test_single_run_s3(
        dask_graph: dict,
        optimizer_s3: Tuple[str, Callable]) -> None:
    """Run on S3.

    Tests a single run of the graphchain optimization function ``optimize``
    using Amazon S3 as a persistency layer. It checks the final result, that
    all function calls are wrapped - for execution and output storing - and
    that one ``.pickle.lz4`` file per task of the graph is present in the
    cache location.

    The ``NO_`` prefix keeps this function from being collected as a test.
    """
    dsk = dask_graph
    cache_dir, graphchain_optimize = optimizer_s3

    # Run optimizer
    newdsk = graphchain_optimize(dsk, keys=["top1"])

    # Check the final result
    result = dask.get(newdsk, ["top1"])
    assert result == (-14, )

    data_ext = ".pickle.lz4"

    # Check that all functions have been wrapped.  The original computed
    # the isinstance() result and threw it away, so nothing was checked.
    # NOTE(review): the wrapped callable is assumed to be the first element
    # of each task tuple, as in the other graphchain tests -- confirm.
    for key in dsk:
        newtask = newdsk[key]
        assert isinstance(newtask[0], CachedComputation)

    # Check that the hash files are written: one data file per task
    # (the association of hash <-> DAG tasks is not tested)
    storage = fs.open_fs(cache_dir)
    try:
        filelist = storage.listdir("/")
    finally:
        # Release the filesystem handle even if listing fails.
        storage.close()
    nfiles = sum(name.endswith(data_ext) for name in filelist)
    assert nfiles == len(dsk)
def graph_sizes(arr: dask.array.Array) -> T.Dict[T.Hashable, T.Dict]:
    """
    Build graphviz attributes holding the size of each node in arr's graph

    Every key of the Dask graph is computed. A key whose value is a
    ``numpy.ndarray`` maps to ``{"label": <formatted byte count>}``; any
    other key maps to an empty dict.

    >>> import dask.dot
    >>> a = dask.array.zeros((10,10), chunks=(5,5))
    >>> sizes = graph_sizes(a)
    >>> dask.dot.to_graphviz(a.dask, data_attributes=sizes)  # doctest: +ELLIPSIS
    <graphviz.dot.Digraph object ...>

    Note: All nodes will be computed to calculate the size
    """
    keys = list(arr.dask.keys())
    values = dask.get(arr.dask, keys)

    sizes = {}
    for key, value in zip(keys, values):
        if isinstance(value, numpy.ndarray):
            sizes[key] = {"label": dask.utils.format_bytes(value.nbytes)}
        else:
            sizes[key] = {}
    return sizes
def featurize_single_ts(ts, features_to_use, custom_script_path=None,
                        custom_functions=None, raise_exceptions=True):
    """Compute feature values for a given single time-series.

    Parameters
    ----------
    ts : TimeSeries object
        Single time series to be featurized.
    features_to_use : list of str
        List of feature names to be generated.
    custom_script_path : optional
        Accepted for interface compatibility; not used by this function.
    custom_functions : dict, optional
        Dictionary of custom feature functions to be evaluated for the given
        time series, or a dictionary representing a dask graph of function
        evaluations.  Dictionaries of functions should have keys
        `feature_name` and values functions that take arguments (t, m, e);
        in the case of a dask graph, these arrays should be referenced as
        't', 'm', 'e', respectively, and any values with keys present in
        `features_to_use` will be computed.
    raise_exceptions : bool, optional
        If True, exceptions during feature computation are raised
        immediately; if False, exceptions are suppressed and `np.nan` is
        returned for the given feature and any dependent features.
        Defaults to True.

    Returns
    -------
    pd.Series
        Feature values indexed by a two-level ``('feature', 'channel')``
        MultiIndex built from `features_to_use` and
        ``range(ts.n_channels)``.
    """
    # Initialize empty feature array for all channels
    feature_values = np.empty((len(features_to_use), ts.n_channels))

    # The exception callback does not depend on the channel, so pick it once.
    if raise_exceptions:
        raise_callback = reraise
    else:
        def raise_callback(e, tb):
            return None

    for (t_i, m_i, e_i), i in zip(ts.channels(), range(ts.n_channels)):
        feature_graph = generate_dask_graph(t_i, m_i, e_i)
        feature_graph.update(ts.meta_features)

        if custom_functions:
            # If values in custom_functions are functions, add calls to graph
            if all(callable(v) for v in custom_functions.values()):
                feature_graph.update({feat: f(t_i, m_i, e_i)
                                      for feat, f in custom_functions.items()})
            # Otherwise, custom_functions is another dask graph
            else:
                feature_graph.update(custom_functions)

        # Do not execute in parallel; parallelization has already taken place
        # at the level of time series, so we compute features for a single
        # time series in serial.
        dask_values = dask.get(feature_graph, features_to_use,
                               raise_exception=raise_callback,
                               pack_exception=pack_exception)
        # Results that come back as exception objects are recorded as NaN.
        feature_values[:, i] = [x if not isinstance(x, Exception) else np.nan
                                for x in dask_values]

    index = pd.MultiIndex.from_product((features_to_use, range(ts.n_channels)),
                                       names=('feature', 'channel'))
    return pd.Series(feature_values.ravel(), index=index)
def test_first_run(dask_graph: dict, optimizer: Tuple[str, Callable]) -> None:
    """First run.

    Runs the graphchain ``optimize`` function on the graph, checks the
    computed result for ``top1``, checks that the first element of every
    optimized task is a ``CachedComputation`` and that the cache directory
    holds at least one file per task.
    """
    dsk = dask_graph
    cache_dir, graphchain_optimize = optimizer

    # Run optimizer
    newdsk = graphchain_optimize(dsk, keys=["top1"])

    # Check the final result
    assert dask.get(newdsk, ["top1"]) == (-14, )

    # Check that all functions have been wrapped
    for key in dsk:
        wrapped = newdsk[key]
        assert isinstance(wrapped[0], CachedComputation)

    # Check that the hash files are written
    # (the association of hash <-> DAG tasks is not tested)
    storage = fs.osfs.OSFS(cache_dir)
    cached_files = storage.listdir("/")
    assert len(cached_files) >= len(dsk)
    storage.close()
def __call__(self, input_buffer):
    """Generator: drive ``self.dsk`` from ``input_buffer``, yield outputs.

    Each pass stores a value under ``'input'`` in ``self.dsk`` -- the next
    buffer, or ``Stream.NoNewData`` when a previous output was a ``More``
    instance -- and computes all keys with ``dask.get``.  ``More`` outputs
    are unwrapped to their ``output`` attribute.  Iteration ends when every
    output equals ``Stream.EndOfStream``; otherwise the outputs, extended
    with ``'t'`` set to ``self.t_.end``, are yielded.
    """
    keys = sorted(['input'] + list(self.dsk.keys()))
    sentinels = (Stream.EndOfStream, Stream.NoNewData)
    more = False

    while True:
        if more:
            # Re-run the graph without consuming a new buffer.
            next_input = Stream.NoNewData
            more = False
        else:
            next_input = next(input_buffer)
            if next_input not in sentinels:
                self.t_ |= next_input.getExtent()
        self.dsk['input'] = next_input

        computed = dask.get(self.dsk, keys)
        outputs = dict(zip(keys, computed))

        for key in keys:
            if isinstance(outputs[key], More):
                more = True
                outputs[key] = outputs[key].output

        finished = all(Stream.EndOfStream == o for o in outputs.values())
        if finished:
            return

        outputs['t'] = self.t_.end
        yield outputs
def test_repartition_npartitions(nin, nout):
    """Repartitioning a bag keeps its contents and yields ``nout`` parts.

    Also checks that every computed partition of the result is truthy.
    """
    source = db.from_sequence(range(100), npartitions=nin)
    repartitioned = source.repartition(npartitions=nout)

    assert repartitioned.npartitions == nout
    assert_eq(source, repartitioned)

    partitions = dask.get(repartitioned.dask, repartitioned.__dask_keys__())
    assert all(partitions)
def f(c, a, b):
    """Coroutine: results from ``_get`` must equal those of ``dask.get``.

    A three-task chain and a five-task graph are each evaluated through
    ``_get`` on ``c`` (with ``gather=True``) and compared with the local
    ``dask.get`` result for the same key.
    """
    chain = {'x': 1, 'y': (inc, 'x'), 'z': (inc, 'y')}
    remote = yield _get(c.ip, c.port, chain, 'z', gather=True)
    local = dask.get(chain, 'z')
    assert remote == local

    extended = {
        'x': 1,
        'y': (inc, 'x'),
        'z': (inc, 'y'),
        'a': (inc, 'z'),
        'b': (add, 'a', 'x')
    }
    remote = yield _get(c.ip, c.port, extended, 'b', gather=True)
    local = dask.get(extended, 'b')
    assert remote == local
def test_dask_workflow_with_explicit_parallel_sub_tasks(self):
    """
    We do explicit Parallel call for some tasks

    Builds a dask graph with five tasks for each of two pools, where the
    ``positions`` task wraps ten delayed calls in ``dfp.ParallelJobs``, runs
    it with the synchronous scheduler and checks the collected job ids.
    """
    import dask

    # runner = GlobalFakeRunner()
    runner = FakeRunner()

    # decorate functions...
    generate_pricedata = dfp.job_delayed(runner)(self.generate_pricedata)
    generate_fundata = dfp.job_delayed(runner)(self.generate_fundata)
    generate_riskdata = dfp.job_delayed(runner)(self.generate_riskdata)
    generate_predictors = dfp.job_delayed(runner)(self.generate_predictors)
    generate_positions = dfp.delayed(self.generate_positions)

    # `async` is a reserved word since Python 3.7, so the former
    # `from dask.async import ...` no longer parses; the synchronous
    # scheduler is importable from `dask.local`.  Releases that only have
    # the old module name are reached through importlib.
    try:
        from dask.local import get_sync as get
    except ImportError:
        import importlib
        get = importlib.import_module("dask.async").get_sync

    # declare the dataflow
    dsk = dict()
    pools = ['pool1', 'pool2']
    for pool in pools:
        # The trailing commas are deliberate: every value is a task tuple.
        dsk[(pool, 'pricedata')] = generate_pricedata(pool),
        dsk[(pool, 'fundata')] = generate_fundata(pool),
        dsk[(pool, 'riskdata')] = generate_riskdata(pool, 'risk'), (pool, 'pricedata')
        dsk[(pool, 'pred')] = generate_predictors(pool, 'risk'), [
            (pool, t) for t in ['pricedata', 'fundata', 'riskdata']
        ]
        dsk[(pool, 'positions')] = dfp.ParallelJobs(runner)([
            generate_positions(pool, 'risk', 'momentum', 'markowitz_aversion',
                               max_risk=max_risk)
            for max_risk in range(10)
        ]), (pool, 'pred')

    # Hand the scheduler a real list of keys; on Python 3 `dsk.keys()` is a
    # view object, not a list.
    keys = list(dsk)
    jobids = dict(zip(keys, get(dsk, keys)))
    assert len(jobids) == 10
    assert jobids[('pool2', 'positions')] == [
        19, 20, 21, 22, 23, 24, 25, 26, 27, 28
    ]
    get(dsk, ('pool2', 'positions'))
    get(dsk, ('pool2', 'positions'))
def test_repartition(nin, nout):
    """Repartitioning a bag keeps its contents and yields ``nout`` parts.

    Contents are compared through ``compute(get=dask.get)``; every computed
    partition of the result must also be truthy.
    """
    source = db.from_sequence(range(100), npartitions=nin)
    repartitioned = source.repartition(npartitions=nout)

    assert repartitioned.npartitions == nout

    before = source.compute(get=dask.get)
    after = repartitioned.compute(get=dask.get)
    assert before == after

    partitions = dask.get(repartitioned.dask, repartitioned.__dask_keys__())
    assert all(partitions)
def test_local_parents_of_reduction(abcde):
    """Check the execution order of three linked chains.

            c1
            |
        b1  c2
        |  /|
    a1  b2  c3
    |  /|
    a2  b3
    |
    a3

    Prefer to finish a1 stack before proceeding to b2
    """
    a, b, c, d, e = abcde
    a1, a2, a3 = [a + suffix for suffix in '123']
    b1, b2, b3 = [b + suffix for suffix in '123']
    c1, c2, c3 = [c + suffix for suffix in '123']

    expected = [a3, a2, a1, b3, b2, b1, c3, c2, c1]

    log = []

    def recording(key):
        # Build a task function that records its key when it is executed.
        def task(*args):
            log.append(key)
        return task

    dsk = {
        a3: (recording(a3), ),
        a2: (recording(a2), a3),
        a1: (recording(a1), a2),
        b3: (recording(b3), ),
        b2: (recording(b2), b3, a2),
        b1: (recording(b1), b2),
        c3: (recording(c3), ),
        c2: (recording(c2), c3, b2),
        c1: (recording(c1), c2)
    }

    order(dsk)
    dask.get(dsk, [a1, b1, c1])  # trigger computation

    assert log == expected
def test_repartition(nin, nout):
    """Repartitioning a bag keeps its contents and yields ``nout`` parts.

    Contents are compared through ``compute(get=dask.get)``; every computed
    partition of the result (keys taken from ``_keys()``) must be truthy.
    """
    source = db.from_sequence(range(100), npartitions=nin)
    repartitioned = source.repartition(npartitions=nout)

    assert repartitioned.npartitions == nout

    before = source.compute(get=dask.get)
    after = repartitioned.compute(get=dask.get)
    assert before == after

    partitions = dask.get(repartitioned.dask, repartitioned._keys())
    assert all(partitions)
def __call__(self, dsk, keys, **kwargs):
    """Count this computation and run it with ``dask.get``.

    Increments ``self.total_computes`` and raises ``RuntimeError`` when the
    count exceeds ``self.max_computes``; otherwise forwards ``dsk``,
    ``keys`` and any keyword arguments to ``dask.get`` and returns its
    result.
    """
    import dask

    self.total_computes += 1
    within_budget = self.total_computes <= self.max_computes
    if within_budget:
        return dask.get(dsk, keys, **kwargs)

    message = ("Too many dask computations were scheduled: "
               "{}".format(self.total_computes))
    raise RuntimeError(message)
def test_local_parents_of_reduction(abcde):
    """Check the execution order of three linked chains.

            c1
            |
        b1  c2
        |  /|
    a1  b2  c3
    |  /|
    a2  b3
    |
    a3

    Prefer to finish a1 stack before proceeding to b2
    """
    a, b, c, d, e = abcde
    a1, a2, a3 = (a + digit for digit in '123')
    b1, b2, b3 = (b + digit for digit in '123')
    c1, c2, c3 = (c + digit for digit in '123')

    expected = [a3, a2, a1, b3, b2, b1, c3, c2, c1]

    log = []

    def logged(name):
        # Build a task function that records its key when it is executed.
        def run(*args):
            log.append(name)
        return run

    dsk = {a3: (logged(a3),),
           a2: (logged(a2), a3),
           a1: (logged(a1), a2),
           b3: (logged(b3),),
           b2: (logged(b2), b3, a2),
           b1: (logged(b1), b2),
           c3: (logged(c3),),
           c2: (logged(c2), c3, b2),
           c1: (logged(c1), c2)}

    order(dsk)
    dask.get(dsk, [a1, b1, c1])  # trigger computation

    assert log == expected
def test_str_graph():
    """Check how ``str_graph`` rewrites keys and key references.

    Covers string keys, tuple keys, tasks that refer to tuple keys, and the
    equality of ``dask.get`` results before and after the conversion.
    """
    # String keys: the converted graph compares equal to the input.
    plain = {'x': 1}
    assert str_graph(plain) == plain

    # A tuple key is replaced by its ``str`` form.
    single = {('x', 1): (inc, 1)}
    assert str_graph(single) == {str(('x', 1)): (inc, 1)}

    # A tuple key used as a task argument is replaced as well.
    chained = {('x', 1): (inc, 1), ('x', 2): (inc, ('x', 1))}
    expected = {
        str(('x', 1)): (inc, 1),
        str(('x', 2)): (inc, str(('x', 1))),
    }
    assert str_graph(chained) == expected

    graphs = [
        {'x': 1},
        {('x', 1): (inc, 1), ('x', 2): (inc, ('x', 1))},
        {('x', 1): (sum, [1, 2, 3]), ('x', 2): (sum, [('x', 1), ('x', 1)])},
    ]
    for graph in graphs:
        converted = str_graph(graph)
        original_keys = list(graph)
        string_keys = [str(key) for key in original_keys]
        assert all(isinstance(key, str) for key in converted)
        assert dask.get(graph, original_keys) == dask.get(converted, string_keys)
def test_chunked_transpose_plus_one():
    """A blockwise ``x.T + 1`` must equal the same operation on the array.

    A 20x20 array is split into 5x5 blocks, ``top`` applies the function
    with the input indices swapped, and the concatenated 4x4 grid of output
    blocks is compared with ``x.T + 1`` on the whole array.
    """
    data = np.arange(400).reshape((20, 20))

    arrays = {'x': data}
    blocks = getem('x', (5, 5), shape=(20, 20))

    comp = top(lambda x: x.T + 1, 'out', 'ij', 'x', 'ji',
               numblocks={'x': (4, 4)})

    dsk = merge(arrays, blocks, comp)
    out_keys = [[('out', i, j) for j in range(4)] for i in range(4)]
    out = dask.get(dsk, out_keys)

    assert eq(concatenate3(out), data.T + 1)
def test_chunked_dot_product():
    """A blockwise dot product must equal ``np.dot`` on the full arrays.

    Both 20x20 operands are split into 5x5 blocks and combined with
    ``top(dotmany, ...)`` over the shared index ``j``; the concatenated
    output blocks are compared with ``np.dot``.
    """
    left = np.arange(400).reshape((20, 20))
    right = np.ones((20, 20))

    sources = {'x': left, 'o': right}
    left_blocks = getem('x', (5, 5), shape=(20, 20))
    right_blocks = getem('o', (5, 5), shape=(20, 20))

    product = top(dotmany, 'out', 'ik', 'x', 'ij', 'o', 'jk',
                  numblocks={'x': (4, 4), 'o': (4, 4)})

    dsk = merge(sources, left_blocks, right_blocks, product)
    block_keys = [[('out', i, j) for j in range(4)] for i in range(4)]
    out = dask.get(dsk, block_keys)

    assert eq(np.dot(left, right), concatenate3(out))
def test_minimize_data_transfer():
    """The source array must be referenced only by four slicing tasks.

    After optimizing ``from_array(x, chunks=25) + 1``, the key whose
    computed value is ``x`` itself must have exactly four dependents, each a
    ``getitem``/``getter`` task whose first argument is that key.
    """
    x = np.ones(100)
    chunked = da.from_array(x, chunks=25)
    z = chunked + 1
    dsk = z.__dask_optimize__(z.dask, z.__dask_keys__())

    keys = list(dsk)
    results = dask.get(dsk, keys)
    # Find the key that holds the original array object.
    holders = [key for key, value in zip(keys, results) if value is x]
    big_key = holders[0]

    _, dependents = dask.core.get_deps(dsk)
    deps = dependents[big_key]

    assert len(deps) == 4
    for dep in deps:
        task = dsk[dep]
        assert task[0] in (getitem, getter)
        assert task[1] == big_key
def myget(dsk, keys, **kwargs):
    """Bump the counter in ``var[0]`` and delegate to ``dask.get``."""
    var[0] += 1
    result = dask.get(dsk, keys, **kwargs)
    return result
def counting_get(*args, **kwargs):
    """Bump the counter in ``count[0]`` and delegate to ``dask.get``."""
    count[0] = count[0] + 1
    result = dask.get(*args, **kwargs)
    return result
def generate_features(t, m, e, features_to_use):
    """Compute the requested features from the dask graph for (t, m, e).

    Returns a dict that maps each name in ``features_to_use`` to the value
    computed for it.
    """
    feature_graph = generate_dask_graph(t, m, e)
    computed = dask.get(feature_graph, features_to_use)
    feature_pairs = zip(features_to_use, computed)
    return dict(feature_pairs)
def my_get(dsk, keys):
    """Assert the graph equals ``dict(y.dask)``, then run ``dask.get``."""
    expected_graph = dict(y.dask)
    # NOTE(review): the original inline comment here read "but they
    # aren't"; its intent is unclear -- confirm with the author.
    assert dsk == expected_graph
    return dask.get(dsk, keys)
def get(dsk, keys, *args, **kwargs):
    """Record the call in ``called[0]`` and compute with ``dask.get``.

    Extra positional and keyword arguments are accepted but not forwarded.
    """
    called[0] = True
    result = dask.get(dsk, keys)
    return result