def test_read_bytes_delimited(s3, blocksize):
    _, values = read_bytes(
        "s3://" + test_bucket_name + "/test/accounts*",
        blocksize=blocksize,
        delimiter=b"\n",
    )
    _, values2 = read_bytes(
        "s3://" + test_bucket_name + "/test/accounts*",
        blocksize=blocksize,
        delimiter=b"foo",
    )
    assert [a.key for a in concat(values)] != [b.key for b in concat(values2)]

    results = compute(*concat(values))
    res = [r for r in results if r]
    assert all(r.endswith(b"\n") for r in res)
    ourlines = b"".join(res).split(b"\n")
    testlines = b"".join(files[k] for k in sorted(files)).split(b"\n")
    assert ourlines == testlines

    # delimiter not at the end
    d = b"}"
    _, values = read_bytes(
        "s3://" + test_bucket_name + "/test/accounts*",
        blocksize=blocksize,
        delimiter=d,
    )
    results = compute(*concat(values))
    res = [r for r in results if r]
    # All should end in } except EOF
    assert sum(r.endswith(b"}") for r in res) == len(res) - 2
    ours = b"".join(res)
    test = b"".join(files[v] for v in sorted(files))
    assert ours == test
def test_read_bytes_delimited():
    with filetexts(files, mode="b"):
        for bs in [5, 15, 45, "1.5 kB"]:
            _, values = read_bytes(".test.accounts*", blocksize=bs, delimiter=b"\n")
            _, values2 = read_bytes(".test.accounts*", blocksize=bs, delimiter=b"foo")
            assert [a.key for a in concat(values)] != [b.key for b in concat(values2)]

            results = compute(*concat(values))
            res = [r for r in results if r]
            assert all(r.endswith(b"\n") for r in res)
            ourlines = b"".join(res).split(b"\n")
            testlines = b"".join(files[k] for k in sorted(files)).split(b"\n")
            assert ourlines == testlines

            # delimiter not at the end
            d = b"}"
            _, values = read_bytes(".test.accounts*", blocksize=bs, delimiter=d)
            results = compute(*concat(values))
            res = [r for r in results if r]
            # All should end in } except EOF
            assert sum(r.endswith(b"}") for r in res) == len(res) - 2
            ours = b"".join(res)
            test = b"".join(files[v] for v in sorted(files))
            assert ours == test
def test_modification_time_read_bytes():
    with s3_context("compress", files):
        _, a = read_bytes("s3://compress/test/accounts.*", anon=True)
        _, b = read_bytes("s3://compress/test/accounts.*", anon=True)
        assert [aa._key for aa in concat(a)] == [bb._key for bb in concat(b)]

    with s3_context("compress", valmap(double, files)):
        _, c = read_bytes("s3://compress/test/accounts.*", anon=True)

    assert [aa._key for aa in concat(a)] != [cc._key for cc in concat(c)]
def test_deterministic_key_names(hdfs):
    data = b"abc\n" * int(1e3)
    fn = "%s/file" % basedir
    with hdfs.open(fn, "wb", replication=1) as fil:
        fil.write(data)

    _, x = read_bytes("hdfs://%s/*" % basedir, delimiter=b"\n", sample=False)
    _, y = read_bytes("hdfs://%s/*" % basedir, delimiter=b"\n", sample=False)
    _, z = read_bytes("hdfs://%s/*" % basedir, delimiter=b"c", sample=False)

    assert [f.key for f in concat(x)] == [f.key for f in concat(y)]
    assert [f.key for f in concat(x)] != [f.key for f in concat(z)]
def _dict(self):
    if hasattr(self, "_cached_dict"):
        return self._cached_dict["dsk"]
    else:
        keys = tuple(map(blockwise_token, range(len(self.indices))))
        dsk, _ = fuse(self.dsk, [self.output])
        func = SubgraphCallable(dsk, self.output, keys)
        dsk = make_blockwise_graph(
            func,
            self.output,
            self.output_indices,
            *list(toolz.concat(self.indices)),
            new_axes=self.new_axes,
            numblocks=self.numblocks,
            concatenate=self.concatenate,
            output_blocks=self.output_blocks,
            dims=self.dims,
        )
        # Handle IO Subgraph
        dsk = _inject_io_tasks(dsk, self.io_deps, self.output_indices, self.new_axes)
        self._cached_dict = {"dsk": dsk}
    return self._cached_dict["dsk"]
def _dict(self):
    if hasattr(self, "_cached_dict"):
        return self._cached_dict["dsk"]
    else:
        keys = tuple(map(blockwise_token, range(len(self.indices))))
        dsk, _ = fuse(self.dsk, [self.output])
        func = SubgraphCallable(dsk, self.output, keys)
        dsk = make_blockwise_graph(
            func,
            self.output,
            self.output_indices,
            *list(toolz.concat(self.indices)),
            new_axes=self.new_axes,
            numblocks=self.numblocks,
            concatenate=self.concatenate,
            output_blocks=self.output_blocks,
            dims=self.dims,
        )

        if self.io_subgraph:
            # This is an IO layer.
            for k in dsk:
                io_key = (self.io_name,) + tuple([k[i] for i in range(1, len(k))])
                if io_key in dsk[k]:
                    # Inject IO-function arguments into the blockwise graph
                    # as a single (packed) tuple.
                    io_item = self.io_subgraph.get(io_key)
                    io_item = list(io_item[1:]) if len(io_item) > 1 else []
                    new_task = [io_item if v == io_key else v for v in dsk[k]]
                    dsk[k] = tuple(new_task)

        self._cached_dict = {"dsk": dsk}
    return self._cached_dict["dsk"]
def call_function(func, func_token, args, kwargs, pure=None, nout=None):
    dask_key_name = kwargs.pop("dask_key_name", None)
    pure = kwargs.pop("pure", pure)

    if dask_key_name is None:
        name = "%s-%s" % (
            funcname(func),
            tokenize(func_token, *args, pure=pure, **kwargs),
        )
    else:
        name = dask_key_name

    args2, collections = unzip(map(unpack_collections, args), 2)
    collections = list(concat(collections))

    if kwargs:
        dask_kwargs, collections2 = unpack_collections(kwargs)
        collections.extend(collections2)
        task = (apply, func, list(args2), dask_kwargs)
    else:
        task = (func,) + args2

    graph = HighLevelGraph.from_collections(
        name, {name: task}, dependencies=collections
    )
    nout = nout if nout is not None else None
    return Delayed(name, graph, length=nout)
def test_repeat():
    x = np.random.random((10, 11, 13))
    d = da.from_array(x, chunks=(4, 5, 3))

    repeats = [0, 1, 2, 5]
    axes = [-3, -2, -1, 0, 1, 2]

    for r in repeats:
        for a in axes:
            assert_eq(x.repeat(r, axis=a), d.repeat(r, axis=a))

    assert_eq(d.repeat(2, 0), da.repeat(d, 2, 0))

    with pytest.raises(NotImplementedError):
        da.repeat(d, np.arange(10))

    with pytest.raises(NotImplementedError):
        da.repeat(d, 2, None)

    with pytest.raises(NotImplementedError):
        da.repeat(d, 2)

    for invalid_axis in [3, -4]:
        with pytest.raises(ValueError):
            da.repeat(d, 2, axis=invalid_axis)

    x = np.arange(5)
    d = da.arange(5, chunks=(2,))
    assert_eq(x.repeat(3), d.repeat(3))

    for r in [1, 2, 3, 4]:
        assert all(concat(d.repeat(r).chunks))
def _dict(self):
    if hasattr(self, "_cached_dict"):
        return self._cached_dict["dsk"]
    else:
        keys = tuple(map(blockwise_token, range(len(self.indices))))
        dsk, _ = fuse(self.dsk, [self.output])
        func = SubgraphCallable(dsk, self.output, keys)
        key_deps = {}
        non_blockwise_keys = set()
        dsk = make_blockwise_graph(
            func,
            self.output,
            self.output_indices,
            *list(toolz.concat(self.indices)),
            new_axes=self.new_axes,
            numblocks=self.numblocks,
            concatenate=self.concatenate,
            key_deps=key_deps,
            non_blockwise_keys=non_blockwise_keys,
        )
        self._cached_dict = {
            "dsk": dsk,
            "basic_layer": BasicLayer(dsk, key_deps, non_blockwise_keys),
        }
    return self._cached_dict["dsk"]
async def assert_balanced(inp, expected, c, s, *workers):
    steal = s.extensions["stealing"]
    steal._pc.stop()

    counter = itertools.count()
    tasks = list(concat(inp))
    data_seq = itertools.count()

    futures = []
    for w, ts in zip(workers, inp):
        for t in sorted(ts, reverse=True):
            if t:
                [dat] = await c.scatter([next(data_seq)], workers=w.address)
                ts = s.tasks[dat.key]

                # Ensure scheduler state stays consistent
                old_nbytes = ts.nbytes
                ts.nbytes = s.bandwidth * t
                for ws in ts.who_has:
                    ws.nbytes += ts.nbytes - old_nbytes
            else:
                dat = 123
            i = next(counter)
            f = c.submit(
                func,
                dat,
                key="%d-%d" % (int(t), i),
                workers=w.address,
                allow_other_workers=True,
                pure=False,
                priority=-i,
            )
            futures.append(f)

    while len(s.rprocessing) < len(futures):
        await asyncio.sleep(0.001)

    for i in range(10):
        steal.balance()

        while steal.in_flight:
            await asyncio.sleep(0.001)

        result = [
            sorted([int(key_split(k)) for k in s.processing[w.address]], reverse=True)
            for w in workers
        ]

        result2 = sorted(result, reverse=True)
        expected2 = sorted(expected, reverse=True)

        if config.get("pdb-on-err"):
            if result2 != expected2:
                import pdb

                pdb.set_trace()

        if result2 == expected2:
            return
    raise Exception("Expected: {}; got: {}".format(str(expected2), str(result2)))
def test_read_bytes_blocksize_types(blocksize):
    with filetexts(files, mode="b"):
        sample, vals = read_bytes(".test.account*", blocksize=blocksize)
        results = compute(*concat(vals))
        ourlines = b"".join(results).split(b"\n")
        testlines = b"".join(files.values()).split(b"\n")
        assert set(ourlines) == set(testlines)
def test_compression(fmt, blocksize):
    if fmt not in compress:
        pytest.skip("compression function not provided")
    files2 = valmap(compress[fmt], files)
    with filetexts(files2, mode="b"):
        if fmt and blocksize:
            with pytest.raises(ValueError):
                read_bytes(
                    ".test.accounts.*.json",
                    blocksize=blocksize,
                    delimiter=b"\n",
                    compression=fmt,
                )
            return
        sample, values = read_bytes(
            ".test.accounts.*.json",
            blocksize=blocksize,
            delimiter=b"\n",
            compression=fmt,
        )
        assert sample[:5] == files[sorted(files)[0]][:5]
        assert sample.endswith(b"\n")

        results = compute(*concat(values))
        assert b"".join(results) == b"".join([files[k] for k in sorted(files)])
def test_compression(s3, fmt, blocksize, s3so):
    if fmt not in compress:
        pytest.skip("compression function not provided")
    s3._cache.clear()
    with s3_context("compress", valmap(compress[fmt], files)):
        if fmt and blocksize:
            with pytest.raises(ValueError):
                read_bytes(
                    "s3://compress/test/accounts.*",
                    compression=fmt,
                    blocksize=blocksize,
                    **s3so
                )
            return
        sample, values = read_bytes(
            "s3://compress/test/accounts.*",
            compression=fmt,
            blocksize=blocksize,
            **s3so
        )
        assert sample.startswith(files[sorted(files)[0]][:10])
        assert sample.endswith(b"\n")

        results = compute(*concat(values))
        assert b"".join(results) == b"".join([files[k] for k in sorted(files)])
def __dask_distributed_pack__(self, client):
    from distributed.worker import dumps_function
    from distributed.utils import CancelledError
    from distributed.utils_comm import unpack_remotedata

    keys = tuple(map(blockwise_token, range(len(self.indices))))
    dsk, _ = fuse(self.dsk, [self.output])

    dsk = (SubgraphCallable(dsk, self.output, keys),)
    dsk, dsk_unpacked_futures = unpack_remotedata(dsk, byte_keys=True)
    func = dumps_function(dsk[0])
    func_future_args = dsk[1:]

    indices = list(toolz.concat(self.indices))
    indices, indices_unpacked_futures = unpack_remotedata(indices, byte_keys=True)

    # Check the legality of the unpacked futures
    for future in itertools.chain(dsk_unpacked_futures, indices_unpacked_futures):
        if future.client is not client:
            raise ValueError(
                "Inputs contain futures that were created by another client."
            )
        if stringify(future.key) not in client.futures:
            raise CancelledError(stringify(future.key))

    # All blockwise tasks will depend on the futures in `indices`
    global_dependencies = tuple(stringify(f.key) for f in indices_unpacked_futures)

    ret = {
        "output": self.output,
        "output_indices": self.output_indices,
        "func": func,
        "func_future_args": func_future_args,
        "global_dependencies": global_dependencies,
        "indices": indices,
        "numblocks": self.numblocks,
        "concatenate": self.concatenate,
        "new_axes": self.new_axes,
        "io_subgraph": (self.io_name, self.io_subgraph)
        if self.io_name
        else (None, None),
        "output_blocks": self.output_blocks,
        "dims": self.dims,
    }
    return ret
def test_names():
    with filetexts(files, mode="b"):
        _, a = read_bytes(".test.accounts.*")
        _, b = read_bytes(".test.accounts.*")
        a = list(concat(a))
        b = list(concat(b))
        assert [aa._key for aa in a] == [bb._key for bb in b]

        sleep(1)
        for fn in files:
            with open(fn, "ab") as f:
                f.write(b"x")

        _, c = read_bytes(".test.accounts.*")
        c = list(concat(c))
        assert [aa._key for aa in a] != [cc._key for cc in c]
def broadcast_dimensions(argpairs, numblocks, sentinels=(1, (1,)), consolidate=None):
    """Find block dimensions from arguments

    Parameters
    ----------
    argpairs: iterable
        name, ijk index pairs
    numblocks: dict
        maps {name: number of blocks}
    sentinels: iterable (optional)
        values for singleton dimensions
    consolidate: func (optional)
        use this to reduce each set of common blocks into a smaller set

    Examples
    --------
    >>> argpairs = [('x', 'ij'), ('y', 'ji')]
    >>> numblocks = {'x': (2, 3), 'y': (3, 2)}
    >>> broadcast_dimensions(argpairs, numblocks)
    {'i': 2, 'j': 3}

    Supports numpy broadcasting rules

    >>> argpairs = [('x', 'ij'), ('y', 'ij')]
    >>> numblocks = {'x': (2, 1), 'y': (1, 3)}
    >>> broadcast_dimensions(argpairs, numblocks)
    {'i': 2, 'j': 3}

    Works in other contexts too

    >>> argpairs = [('x', 'ij'), ('y', 'ij')]
    >>> d = {'x': ('Hello', 1), 'y': (1, (2, 3))}
    >>> broadcast_dimensions(argpairs, d)
    {'i': 'Hello', 'j': (2, 3)}
    """
    # List like [('i', 2), ('j', 1), ('i', 1), ('j', 2)]
    argpairs2 = [(a, ind) for a, ind in argpairs if ind is not None]
    L = toolz.concat(
        [
            zip(inds, dims)
            for (x, inds), (x, dims) in toolz.join(
                toolz.first, argpairs2, toolz.first, numblocks.items()
            )
        ]
    )

    g = toolz.groupby(0, L)
    g = dict((k, set([d for i, d in v])) for k, v in g.items())

    g2 = dict((k, v - set(sentinels) if len(v) > 1 else v) for k, v in g.items())

    if consolidate:
        return toolz.valmap(consolidate, g2)

    if g2 and not set(map(len, g2.values())) == set([1]):
        raise ValueError("Shapes do not align %s" % g)

    return toolz.valmap(toolz.first, g2)
def coarsen(reduction, x, axes, trim_excess=False, **kwargs):
    """Coarsen array by applying reduction to fixed size neighborhoods

    Parameters
    ----------
    reduction: function
        Function like np.sum, np.mean, etc...
    x: np.ndarray
        Array to be coarsened
    axes: dict
        Mapping of axis to coarsening factor

    Examples
    --------
    >>> x = np.array([1, 2, 3, 4, 5, 6])
    >>> coarsen(np.sum, x, {0: 2})
    array([ 3,  7, 11])
    >>> coarsen(np.max, x, {0: 3})
    array([3, 6])

    Provide dictionary of scale per dimension

    >>> x = np.arange(24).reshape((4, 6))
    >>> x
    array([[ 0,  1,  2,  3,  4,  5],
           [ 6,  7,  8,  9, 10, 11],
           [12, 13, 14, 15, 16, 17],
           [18, 19, 20, 21, 22, 23]])
    >>> coarsen(np.min, x, {0: 2, 1: 3})
    array([[ 0,  3],
           [12, 15]])

    You must avoid excess elements explicitly

    >>> x = np.array([1, 2, 3, 4, 5, 6, 7, 8])
    >>> coarsen(np.min, x, {0: 3}, trim_excess=True)
    array([1, 4])
    """
    # Insert singleton dimensions if they don't exist already
    for i in range(x.ndim):
        if i not in axes:
            axes[i] = 1

    if trim_excess:
        ind = tuple(
            slice(0, -(d % axes[i])) if d % axes[i] else slice(None, None)
            for i, d in enumerate(x.shape)
        )
        x = x[ind]

    # (10, 10) -> (5, 2, 5, 2)
    newshape = tuple(concat([(x.shape[i] // axes[i], axes[i]) for i in range(x.ndim)]))

    return reduction(x.reshape(newshape), axis=tuple(range(1, x.ndim * 2, 2)), **kwargs)
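# A minimal illustration (not from the library source) of the reshape trick that
# coarsen relies on: each coarsened axis is split into (size // factor, factor)
# and the reduction runs over the interleaved "factor" axes (1, 3, ...). The
# array and factors below are chosen to match the docstring example.
import numpy as np

x = np.arange(24).reshape((4, 6))
blocked = x.reshape((2, 2, 2, 3))  # (4 // 2, 2, 6 // 3, 3)
assert np.array_equal(blocked.min(axis=(1, 3)), np.array([[0, 3], [12, 15]]))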
def test_open_files_write(s3):
    paths = ["s3://" + test_bucket_name + "/more/" + f for f in files]
    fils = open_files(paths, mode="wb")
    for fil, data in zip(fils, files.values()):
        with fil as f:
            f.write(data)
    sample, values = read_bytes("s3://" + test_bucket_name + "/more/test/accounts.*")
    results = compute(*concat(values))
    assert set(list(files.values())) == set(results)
def _check_dsk(dsk):
    """Check that graph is well named and non-overlapping"""
    if not isinstance(dsk, HighLevelGraph):
        return

    assert all(isinstance(k, (tuple, str)) for k in dsk.layers)
    freqs = frequencies(concat(dsk.dicts.values()))
    non_one = {k: v for k, v in freqs.items() if v != 1}
    assert not non_one, non_one
def split_at_breaks(array, breaks, axis=0):
    """Split an array into a list of arrays (using slices) at the given breaks

    >>> split_at_breaks(np.arange(6), [3, 5])
    [array([0, 1, 2]), array([3, 4]), array([5])]
    """
    padded_breaks = concat([[None], breaks, [None]])
    slices = [slice(i, j) for i, j in sliding_window(2, padded_breaks)]
    preslice = (slice(None),) * axis
    split_array = [array[preslice + (s,)] for s in slices]
    return split_array
def test_read_bytes_block():
    with filetexts(files, mode="b"):
        for bs in [5, 15, 45, 1500]:
            sample, vals = read_bytes(".test.account*", blocksize=bs)
            assert list(map(len, vals)) == [(len(v) // bs + 1) for v in files.values()]

            results = compute(*concat(vals))
            assert sum(len(r) for r in results) == sum(len(v) for v in files.values())

            ourlines = b"".join(results).split(b"\n")
            testlines = b"".join(files.values()).split(b"\n")
            assert set(ourlines) == set(testlines)
def test_read_bytes_block(s3, blocksize):
    _, vals = read_bytes(
        "s3://" + test_bucket_name + "/test/account*", blocksize=blocksize
    )
    assert list(map(len, vals)) == [(len(v) // blocksize + 1) for v in files.values()]

    results = compute(*concat(vals))
    assert sum(len(r) for r in results) == sum(len(v) for v in files.values())

    ourlines = b"".join(results).split(b"\n")
    testlines = b"".join(files.values()).split(b"\n")
    assert set(ourlines) == set(testlines)
def test_open_files_write(hdfs):
    path = "hdfs://%s/" % basedir
    data = [b"test data %i" % i for i in range(5)]

    files = open_files(path, num=len(data), mode="wb")
    for fil, b in zip(files, data):
        with fil as f:
            f.write(b)

    sample, vals = read_bytes("hdfs://%s/*.part" % basedir)
    (results,) = dask.compute(list(concat(vals)))
    assert data == results
def test_read_bytes(s3):
    sample, values = read_bytes("s3://" + test_bucket_name + "/test/accounts.*")
    assert isinstance(sample, bytes)
    assert sample[:5] == files[sorted(files)[0]][:5]
    assert sample.endswith(b"\n")

    assert isinstance(values, (list, tuple))
    assert isinstance(values[0], (list, tuple))
    assert hasattr(values[0][0], "dask")

    assert sum(map(len, values)) >= len(files)
    results = compute(*concat(values))
    assert set(results) == set(files.values())
def test_read_bytes():
    with filetexts(files, mode="b"):
        sample, values = read_bytes(".test.accounts.*")
        assert isinstance(sample, bytes)
        assert sample[:5] == files[sorted(files)[0]][:5]
        assert sample.endswith(b"\n")

        assert isinstance(values, (list, tuple))
        assert isinstance(values[0], (list, tuple))
        assert hasattr(values[0][0], "dask")

        assert sum(map(len, values)) >= len(files)
        results = compute(*concat(values))
        assert set(results) == set(files.values())
async def test_stress_communication(c, s, *workers):
    s.validate = False  # very slow otherwise
    da = pytest.importorskip("dask.array")

    # Test consumes many file descriptors and can hang if the limit is too low
    resource = pytest.importorskip("resource")
    bump_rlimit(resource.RLIMIT_NOFILE, 8192)

    n = 20
    xs = [da.random.random((100, 100), chunks=(5, 5)) for i in range(n)]
    ys = [x + x.T for x in xs]
    z = da.atop(vsum, "ij", *concat(zip(ys, ["ij"] * n)), dtype="float64")

    future = c.compute(z.sum())

    result = await future
    assert isinstance(result, float)
def _dict(self):
    if hasattr(self, "_cached_dict"):
        return self._cached_dict
    else:
        keys = tuple(map(blockwise_token, range(len(self.indices))))
        dsk, _ = fuse(self.dsk, [self.output])
        func = SubgraphCallable(dsk, self.output, keys)
        self._cached_dict = make_blockwise_graph(
            func,
            self.output,
            self.output_indices,
            *list(toolz.concat(self.indices)),
            new_axes=self.new_axes,
            numblocks=self.numblocks,
            concatenate=self.concatenate,
        )
    return self._cached_dict
async def scatter_to_workers(nthreads, data, rpc=rpc, report=True, serializers=None):
    """Scatter data directly to workers

    This distributes data in a round-robin fashion to a set of workers based on
    how many cores they have.  nthreads should be a dictionary mapping worker
    identities to numbers of cores.

    See scatter for parameter docstring
    """
    assert isinstance(nthreads, dict)
    assert isinstance(data, dict)

    workers = list(concat([w] * nc for w, nc in nthreads.items()))
    names, data = list(zip(*data.items()))

    worker_iter = drop(_round_robin_counter[0] % len(workers), cycle(workers))
    _round_robin_counter[0] += len(data)

    L = list(zip(worker_iter, names, data))
    d = groupby(0, L)
    d = {worker: {key: value for _, key, value in v} for worker, v in d.items()}

    rpcs = {addr: rpc(addr) for addr in d}
    try:
        out = await All(
            [
                rpcs[address].update_data(
                    data=v, report=report, serializers=serializers
                )
                for address, v in d.items()
            ]
        )
    finally:
        for r in rpcs.values():
            await r.close_rpc()

    nbytes = merge(o["nbytes"] for o in out)

    who_has = {k: [w for w, _, _ in v] for k, v in groupby(1, L).items()}

    return (names, who_has, nbytes)
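# A small, self-contained sketch (not part of the library) of the round-robin
# assignment that scatter_to_workers performs: each worker appears once per core
# in an expanded list, and data keys are dealt out by cycling over that list.
# The worker addresses and data below are hypothetical.
from itertools import cycle
from toolz import concat, drop, groupby

nthreads = {"tcp://w1:1234": 2, "tcp://w2:1234": 1}  # worker -> number of cores
data = {"x": 1, "y": 2, "z": 3}

workers = list(concat([w] * nc for w, nc in nthreads.items()))
names, values = zip(*data.items())
worker_iter = drop(0, cycle(workers))  # 0 stands in for the global counter offset
assignment = groupby(0, zip(worker_iter, names, values))
print({w: [k for _, k, _ in v] for w, v in assignment.items()})
# {'tcp://w1:1234': ['x', 'y'], 'tcp://w2:1234': ['z']}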
def _dict(self):
    if hasattr(self, "_cached_dict"):
        return self._cached_dict["dsk"]
    else:
        keys = tuple(map(blockwise_token, range(len(self.indices))))
        dsk, _ = fuse(self.dsk, [self.output])
        func = SubgraphCallable(dsk, self.output, keys)
        key_deps = {}
        non_blockwise_keys = set()
        dsk = make_blockwise_graph(
            func,
            self.output,
            self.output_indices,
            *list(toolz.concat(self.indices)),
            new_axes=self.new_axes,
            numblocks=self.numblocks,
            concatenate=self.concatenate,
            key_deps=key_deps,
            non_blockwise_keys=non_blockwise_keys,
        )

        if self.io_subgraph:
            # This is an IO layer.
            for k in dsk:
                io_key = (self.io_name,) + tuple([k[i] for i in range(1, len(k))])
                if io_key in dsk[k]:
                    # Inject IO-function arguments into the blockwise graph
                    # as a single (packed) tuple.
                    io_item = self.io_subgraph.get(io_key)
                    io_item = list(io_item[1:]) if len(io_item) > 1 else []
                    new_task = [io_item if v == io_key else v for v in dsk[k]]
                    dsk[k] = tuple(new_task)

            # Clear IO "placeholder" dependencies
            for k in key_deps:
                if k[0] == self.output:
                    key_deps[k] = set()

        self._cached_dict = {
            "dsk": dsk,
            "basic_layer": BasicLayer(dsk, key_deps, non_blockwise_keys),
        }
    return self._cached_dict["dsk"]
def test_read_text(fmt, bs, encoding, include_path):
    if fmt not in utils.compress:
        pytest.skip("compress function not provided for %s" % fmt)
    compress = utils.compress[fmt]
    files2 = {k: compress(v.encode(encoding)) for k, v in files.items()}
    with filetexts(files2, mode="b"):
        b = read_text(
            ".test.accounts.*.json", compression=fmt, blocksize=bs, encoding=encoding
        )
        (L,) = compute(b)
        assert "".join(L) == expected

        o = read_text(
            sorted(files),
            compression=fmt,
            blocksize=bs,
            encoding=encoding,
            include_path=include_path,
        )
        b = o.pluck(0) if include_path else o
        (L,) = compute(b)
        assert "".join(L) == expected

        if include_path:
            (paths,) = compute(o.pluck(1))
            expected_paths = list(
                concat([[k] * v.count("\n") for k, v in files.items()])
            )
            assert len(paths) == len(expected_paths)
            for path, expected_path in zip(paths, expected_paths):
                assert path.endswith(expected_path)

        blocks = read_text(
            ".test.accounts.*.json",
            compression=fmt,
            blocksize=bs,
            encoding=encoding,
            collection=False,
        )
        L = compute(*blocks)
        assert "".join(line for block in L for line in block) == expected