def test_pure():
    v1 = do(add, pure=True)(1, 2)
    v2 = do(add, pure=True)(1, 2)
    assert v1.key == v2.key

    myrand = do(random)
    assert myrand().key != myrand().key

def test_kwargs():
    def mysum(a, b, c=(), **kwargs):
        return a + b + sum(c) + sum(kwargs.values())

    dmysum = do(mysum)
    ten = dmysum(1, 2, c=[value(3), 0], four=dmysum(2, 2))
    assert ten.compute() == 10

    dmysum = do(mysum, pure=True)
    ten = dmysum(1, 2, c=[value(3), 0], four=dmysum(2, 2))
    assert ten.compute() == 10

def test_sync_compute(loop):
    with cluster() as (c, [a, b]):
        with Executor(('127.0.0.1', c['port'])) as e:
            from dask.imperative import do, value
            x = value(1)
            y = do(inc)(x)
            z = do(dec)(x)
            yy, zz = e.compute(y, z, sync=True)
            assert (yy, zz) == (2, 0)

def fit(self, X, y=None):
    X = value(X)
    if y is not None:
        y = value(y)
    new_ests = []
    for name, est in self.steps:
        new_est = do(fit)(est, X, y)
        X = do(transform)(new_est, X)
        new_ests.append(new_est)
    return Pipeline([(name, new_est)
                     for (name, old_est), new_est in zip(self.steps, new_ests)])

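# The `fit` and `transform` callables wrapped by `do` above are assumed to be
# plain module-level helpers; a minimal sketch of what they might look like
# (the real implementations may differ, e.g. in how non-transformer steps are
# handled):
def fit(est, X, y=None):
    # Fit a copy so that building the lazy graph has no side effects on `est`.
    from sklearn.base import clone
    est = clone(est)
    return est.fit(X) if y is None else est.fit(X, y)


def transform(est, X):
    return est.transform(X)
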
def test_iterators():
    a = value(1)
    b = value(2)
    c = do(sum)(iter([a, b]))
    assert c.compute() == 3

    def f(seq):
        assert isinstance(seq, Iterator)
        return sum(seq)

    c = do(f)(iter([a, b]))
    assert c.compute() == 3

def test_do():
    add2 = do(add)
    assert add2(1, 2).compute() == 3
    assert (add2(1, 2) + 3).compute() == 6
    assert add2(add2(1, 2), 3).compute() == 6

    a = value(1)
    b = add2(add2(a, 2), 3)
    assert a.key in b.dask

def test_async_compute(s, a, b):
    e = Executor((s.ip, s.port), start=False)
    yield e._start()

    from dask.imperative import do, value
    x = value(1)
    y = do(inc)(x)
    z = do(dec)(x)

    yy, zz, aa = e.compute(y, z, 3, sync=False)
    assert isinstance(yy, Future)
    assert isinstance(zz, Future)
    assert aa == 3

    result = yield e._gather([yy, zz])
    assert result == [2, 0]

    yield e._shutdown()

def test_array_bag_imperative():
    arr1 = np.arange(100).reshape((10, 10))
    arr2 = arr1.dot(arr1.T)

    darr1 = da.from_array(arr1, chunks=(5, 5))
    darr2 = da.from_array(arr2, chunks=(5, 5))
    b = db.from_sequence([1, 2, 3])

    seq = [arr1, arr2, darr1, darr2, b]
    out = do(sum)([i.sum() for i in seq])

    assert out.compute() == 2 * arr1.sum() + 2 * arr2.sum() + sum([1, 2, 3])

def f(c, a, b):
    e = Executor((c.ip, c.port), start=False, loop=loop)
    yield e._start()

    from dask.imperative import do, value
    x = value(1)
    y = do(inc)(x)
    z = do(dec)(x)

    yy, zz, aa = e.compute(y, z, 3, sync=False)
    assert isinstance(yy, Future)
    assert isinstance(zz, Future)
    assert aa == 3

    result = yield e._gather([yy, zz])
    assert result == [2, 0]

    yield e._shutdown()

def test_from_imperative():
    from dask.imperative import value, do
    a, b, c = value([1, 2, 3]), value([4, 5, 6]), value([7, 8, 9])
    bb = from_imperative([a, b, c])
    assert bb.name == from_imperative([a, b, c]).name
    assert isinstance(bb, Bag)
    assert list(bb) == [1, 2, 3, 4, 5, 6, 7, 8, 9]

    asum_value = do(lambda X: sum(X))(a)
    asum_item = db.Item.from_imperative(asum_value)
    assert asum_value.compute() == asum_item.compute() == 6

def test_array_imperative():
    arr = np.arange(100).reshape((10, 10))
    darr = da.from_array(arr, chunks=(5, 5))

    val = do(sum)([arr, darr, 1])
    assert isinstance(val, Value)
    assert np.allclose(val.compute(), arr + arr + 1)
    assert val.sum().compute() == (arr + arr + 1).sum()
    assert val[0, 0].compute() == (arr + arr + 1)[0, 0]

    task, dasks = to_task_dasks(darr)
    assert len(dasks) == 1
    orig = set(darr.dask)
    final = set(dasks[0])
    assert orig.issubset(final)
    diff = final.difference(orig)
    assert len(diff) == 1

def best_parameters(estimator, cv, X, y, parameter_iterable, scorer,
                    fit_params, iid):
    """ Lazily apply fit-and-score to the data for all parameters / folds

    This function does little input checking and doesn't trigger computation.

    Returns a lazy value object; the call itself should return almost
    immediately.
    """
    _X, _y = X, y
    X = value(X)
    y = y if y is None else value(y)
    cv = [(value(train), value(test)) for train, test in cv]

    out = [_fit_and_score(estimator, X, y, scorer, train, test,
                          parameters, fit_params)
           for parameters in parameter_iterable
           for train, test in cv]

    return do(pick_best_parameters)(out, len(cv), iid)

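# Usage sketch; everything below (data, estimator, folds, scorer) is
# illustrative and not part of the code above.  Nothing is fitted or scored
# until `.compute()` is called on the returned lazy value.
import numpy as np
from sklearn.svm import SVC
from sklearn.metrics import make_scorer, accuracy_score

X = np.random.rand(20, 3)
y = np.array([0, 1] * 10)
idx = np.arange(len(X))
cv = [(idx[10:], idx[:10]), (idx[:10], idx[10:])]   # two hand-rolled folds
grid = [{'C': 0.1}, {'C': 1.0}, {'C': 10.0}]        # the parameter_iterable

lazy_best = best_parameters(SVC(), cv, X, y, grid,
                            make_scorer(accuracy_score),
                            fit_params={}, iid=True)
best = lazy_best.compute()   # fitting and scoring happen here
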
def test_do_method_descriptor():
    do(bytes.decode)(b'')  # does not err

def predict(self, X):
    for name, est in self.steps[:-1]:
        X = do(transform)(est, X)
    y = do(predict)(self.steps[-1][1], X)
    return y

def transform(self, X):
    for name, est in self.steps:
        X = do(transform)(est, X)
    return X

def test_lists_are_concrete():
    a = value(1)
    b = value(2)
    c = do(max)([[a, 10], [b, 20]], key=lambda x: x[0])[1]

    assert c.compute() == 20

def score(self, X, y):
    X = value(X)
    y = value(y)
    y_predicted = self.predict(X)
    return do(accuracy_score)(y_predicted, y)

def to_sklearn(self):
    """ Create an sklearn pipeline object wrapped in a value

    >>> pipeline.to_sklearn().compute()  # doctest: +SKIP
    """
    return do(sklearn.pipeline.Pipeline)(self.steps)

def test_lists():
    a = value(1)
    b = value(2)
    c = do(sum)([a, b])
    assert c.compute() == 3

def read_text(fn, keyname=None, encoding='utf-8', errors='strict',
              lineterminator='\n', executor=None, fs=None, lazy=True,
              collection=True, blocksize=2**27, compression=None):
    """ Read text lines from S3

    Parameters
    ----------
    fn: string
        Path of files on S3, given as bucket and key, or as a globstring
    keyname: string, optional
        If ``fn`` is only the bucket name, provide the key name as the second
        argument
    collection: boolean, optional
        Whether or not to return a high level collection
    lazy: boolean, optional
        Whether to delay reading until compute time rather than starting
        immediately
    blocksize: int, optional
        Number of bytes per partition.  Use ``None`` for no blocking.
        Silently ignored if data is compressed with a non-splittable format
        like gzip.
    lineterminator: str, optional
        The endline string used to delineate line breaks
    compression: str, optional
        Compression format to use.  Options include: gzip.
        Using compression suppresses blocking.

    Examples
    --------
    Provide bucket and keyname joined by slash.

    >>> b = read_text('bucket/key-directory/')  # doctest: +SKIP

    Globstrings are also supported

    >>> b = read_text('bucket/key-directory/2015-*.json').map(json.loads)  # doctest: +SKIP

    Or separate bucket and keyname

    >>> b = read_text('bucket', 'key-directory/2015-*.json').map(json.loads)  # doctest: +SKIP

    Optionally provide a blocksize and line terminator to chunk up large files

    >>> b = read_text('bucket', 'key-directory/2015-*.json',
    ...               lineterminator='\\n', blocksize=2**25)  # doctest: +SKIP

    Specify compression; blocking is not allowed

    >>> b = read_text('bucket/my-data.*.json.gz',
    ...               compression='gzip', blocksize=None)  # doctest: +SKIP

    Returns
    -------
    Dask bag if collection is True, otherwise futures or dask values
    """
    if keyname is not None:
        if not keyname.startswith('/'):
            keyname = '/' + keyname
        fn = fn + keyname
    fs = fs or S3FileSystem()
    executor = default_executor(executor)

    if compression:
        blocksize = None
        decompress = decompressors[compression]

    filenames = sorted(fs.glob(fn))
    blocks = [block for fn in filenames
              for block in read_bytes(fn, executor, fs, lazy=True,
                                      delimiter=lineterminator.encode(),
                                      blocksize=blocksize)]
    if compression:
        blocks = [do(decompress)(b) for b in blocks]
    strings = [do(bytes.decode)(b, encoding, errors) for b in blocks]
    lines = [do(unicode.split)(s, lineterminator) for s in strings]

    ensure_default_get(executor)
    from dask.bag import from_imperative
    if collection:
        result = from_imperative(lines)
    else:
        result = lines

    if not lazy:
        if collection:
            result = executor.persist(result)
        else:
            result = executor.compute(result)
    return result

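# `decompressors` is referenced above but not defined here; a minimal sketch
# of the mapping it is assumed to be, covering the gzip option named in the
# docstring (wbits=16 + MAX_WBITS makes zlib accept the gzip framing):
import zlib

def gzip_decompress(data):
    # bytes -> bytes over a whole (unblocked) S3 object
    return zlib.decompress(data, 16 + zlib.MAX_WBITS)

decompressors = {'gzip': gzip_decompress}
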
def test_key_names_include_function_names():
    def myfunc(x):
        return x + 1
    assert do(myfunc)(1).key.startswith('myfunc')
