def test_bag_array_conversion():
    import dask.bag as db
    b = db.range(10, npartitions=1)
    x, = b.map_partitions(np.asarray).to_delayed()
    x, = [da.from_delayed(a, shape=(10,), dtype=int) for a in [x]]
    z = da.concatenate([x])
    assert_eq(z, np.arange(10), check_graph=False)

def test_temporary_directory(tmpdir):
    b = db.range(10, npartitions=4)
    with dask.set_options(temporary_directory=str(tmpdir)):
        b2 = b.groupby(lambda x: x % 2)
        b2.compute()
        assert any(fn.endswith('.partd') for fn in os.listdir(str(tmpdir)))

def test_temporary_directory():
    b = db.range(10, npartitions=4)
    with dask.set_options(temporary_directory=os.getcwd()):
        b2 = b.groupby(lambda x: x % 2)
        b2.compute()
        assert any(fn.endswith('.partd') for fn in os.listdir(os.getcwd()))

def test_groupby_tasks_3():
    func = lambda x: x % 10
    b = db.range(20, npartitions=5).groupby(func, shuffle="tasks", max_branch=2)
    result = b.compute(scheduler="sync")
    assert dict(result) == groupby(func, range(20))

def test_temporary_directory(tmpdir):
    b = db.range(10, npartitions=4)
    with dask.config.set(temporary_directory=str(tmpdir)):
        b2 = b.groupby(lambda x: x % 2)
        b2.compute()
        assert any(fn.endswith('.partd') for fn in os.listdir(str(tmpdir)))

def test_KFold_bag():
    X = db.range(1000, npartitions=10)
    y = db.range(1000, npartitions=10)
    cv = list(KFold(4).split(X, y))
    for x_train, y_train, x_test, y_test in cv:
        train, test = dask.compute(x_train, x_test)
        assert len(train) + len(test) == 1000
        assert set(train) | set(test) == set(range(1000))
    assert (first(KFold(3).split(X, y))[0].name ==
            first(KFold(3).split(X, y))[0].name)
    assert (first(KFold(4).split(X, y))[0].name !=
            first(KFold(3).split(X, y))[0].name)
    with pytest.raises(ValueError):
        list(KFold(11).split(X, y))

def test_optimize_fuse_keys():
    x = db.range(10, npartitions=2)
    y = x.map(inc)
    z = y.map(inc)
    dsk = z._optimize(z.dask, z._keys())
    assert not set(y.dask) & set(dsk)
    dsk = z._optimize(z.dask, z._keys(), fuse_keys=y._keys())
    assert all(k in dsk for k in y._keys())

def test_optimize_fuse_keys():
    x = db.range(10, npartitions=2)
    y = x.map(inc)
    z = y.map(inc)
    dsk = z.__dask_optimize__(z.dask, z.__dask_keys__())
    assert not y.dask.keys() & dsk.keys()
    dsk = z.__dask_optimize__(z.dask, z.__dask_keys__(),
                              fuse_keys=y.__dask_keys__())
    assert all(k in dsk for k in y.__dask_keys__())

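# A minimal sketch of the dask collections protocol exercised by the second
# test_optimize_fuse_keys variant above: the public __dask_graph__ /
# __dask_keys__ / __dask_optimize__ hooks replaced the private _optimize and
# _keys methods seen in the first variant. Names assume a recent dask
# release; this is an illustration, not part of the test suite.
import dask.bag as db

bag = db.range(10, npartitions=2).map(lambda x: x + 1)
graph = bag.__dask_graph__()                    # the underlying task graph
keys = bag.__dask_keys__()                      # output keys, grouped per partition
optimized = bag.__dask_optimize__(graph, keys)  # bag-specific graph optimizer
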
def test_temporary_directory(tmpdir):
    b = db.range(10, npartitions=4)

    # We use a pool to avoid a race condition between the pool close
    # cleaning up files, and the assert below.
    with ProcessPoolExecutor(4) as pool:
        with dask.config.set(temporary_directory=str(tmpdir), pool=pool):
            b2 = b.groupby(lambda x: x % 2)
            b2.compute()
            assert any(fn.endswith(".partd") for fn in os.listdir(str(tmpdir)))

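# The test_temporary_directory variants above track dask's configuration API
# over time: dask.set_options was superseded by dask.config.set. A minimal
# standalone sketch of the behavior they test -- a disk-backed bag shuffle
# spills .partd files into the configured scratch directory:
import tempfile

import dask
import dask.bag as db

with tempfile.TemporaryDirectory() as scratch:
    with dask.config.set(temporary_directory=scratch):
        grouped = db.range(10, npartitions=4).groupby(lambda x: x % 2)
        print(dict(grouped.compute()))
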
def test_random_split_errors():
    b = db.range(1000, npartitions=10)
    with pytest.raises(ValueError):
        random_split(b, 2)
    with pytest.raises(ValueError):
        random_split(b, -1)
    with pytest.raises(ValueError):
        random_split(b, 0.5, "not-a-seed-or-RandomState")
    with pytest.raises(TypeError):
        random_split("not-a-dask-object", 0.5)

def test_random_split_bag():
    b = db.range(1000, npartitions=10)
    train, test = random_split(b, 0.2, 123)
    assert random_split(b, 0.2, 123)[0].name == train.name
    assert random_split(b, 0.3, 123)[0].name != train.name
    assert random_split(b, 0.2)[0].name != random_split(b, 0.2)[0].name
    train_c, test_c = dask.compute(train, test)
    assert 0.75 < len(train_c) / 1000 < 0.85
    assert len(train_c) + len(test_c) == 1000
    assert set(train_c) | set(test_c) == set(range(1000))

def test_train_test_split():
    x = np.arange(1000)
    a = da.from_array(x, chunks=100)
    m = dm.from_array(a)
    b = db.range(1000, npartitions=10)

    train_a, test_a = train_test_split(a, test_size=0.2, random_state=123)
    train_a2, test_a2, train, test = train_test_split(a, a + 10, test_size=0.2,
                                                      random_state=123)
    assert train_a2.name == train_a.name
    assert test_a2.name == test_a.name
    assert train_a2.name != train.name
    assert train_a.chunks == train.chunks
    assert test_a.chunks == test.chunks

    train_b, test_b, train_m, test_m = train_test_split(b, m, random_state=123)
    parts_b = train_b._get(train_b.dask, train_b._keys())
    parts_m = train_m._get(train_m.dask, train_m._keys())
    for p_b, p_m in zip(parts_b, parts_m):
        assert set(p_b) == set(p_m)

    with pytest.raises(ValueError):
        train_test_split(a, invalid_option=1)  # invalid kwargs
    with pytest.raises(ValueError):
        train_test_split(test_size=0.2)  # no arrays
    with pytest.raises(ValueError):
        train_test_split(a, b)  # not all da.Array
    with pytest.raises(ValueError):
        train_test_split(a, da.from_array(x, chunks=10))  # Not aligned
    with pytest.raises(ValueError):
        train_test_split(m, db.range(1000, npartitions=12))  # Not aligned

def test_optimize_globals():
    da = pytest.importorskip("dask.array")

    x = da.ones(10, chunks=(5,))

    def optimize_double(dsk, keys):
        return {k: (mul, 2, v) for k, v in dsk.items()}

    from dask.array.utils import assert_eq

    assert_eq(x + 1, np.ones(10) + 1)

    with dask.config.set(array_optimize=optimize_double):
        assert_eq(x + 1, (np.ones(10) * 2 + 1) * 2, check_chunks=False)

    assert_eq(x + 1, np.ones(10) + 1)

    b = db.range(10, npartitions=2)

    with dask.config.set(array_optimize=optimize_double):
        xx, bb = dask.compute(x + 1, b.map(inc), scheduler="single-threaded")
        assert_eq(xx, (np.ones(10) * 2 + 1) * 2)

def test_to_textfiles_empty_partitions():
    with tmpdir() as d:
        b = db.range(5, npartitions=5).filter(lambda x: x == 1).map(str)
        b.to_textfiles(os.path.join(d, '*.txt'))
        assert len(os.listdir(d)) == 5

def test_groupby_tasks_3():
    func = lambda x: x % 10
    b = db.range(20, npartitions=5).groupby(func, method='tasks', max_branch=2)
    result = b.compute(get=dask.get)
    assert dict(result) == groupby(func, range(20))

def test_aggregation(npartitions):
    L = list(range(15))
    b = db.range(15, npartitions=npartitions)
    assert b.mean().compute(get=dask.get) == sum(L) / len(L)
    assert b.sum().compute(get=dask.get) == sum(L)
    assert b.count().compute(get=dask.get) == len(L)

def test_aggregation(npartitions):
    L = list(range(15))
    b = db.range(15, npartitions=npartitions)
    assert_eq(b.mean(), sum(L) / len(L))
    assert_eq(b.sum(), sum(L))
    assert_eq(b.count(), len(L))

def test_groupby_tasks_2(size, npartitions, groups):
    func = lambda x: x % groups
    b = db.range(size, npartitions=npartitions).groupby(func, shuffle="tasks")
    result = b.compute(scheduler="sync")
    assert dict(result) == groupby(func, range(size))

def test_bag_groupby_tasks_default(e, s, a, b):
    with dask.set_options(get=e.get):
        b = db.range(100, npartitions=10)
        b2 = b.groupby(lambda x: x % 13)
        assert not any('partd' in k[0] for k in b2.dask)

def test_range():
    for npartitions in [1, 7, 10, 28]:
        b = db.range(100, npartitions=npartitions)
        assert len(b.dask) == npartitions
        assert b.npartitions == npartitions
        assert list(b) == list(range(100))

def test_bag_groupby_tasks_default(c, s, a, b):
    b = db.range(100, npartitions=10)
    b2 = b.groupby(lambda x: x % 13)
    assert not any('partd' in k[0] for k in b2.dask)

def test_repeated_groupby():
    b = db.range(10, npartitions=4)
    c = b.groupby(lambda x: x % 3)
    assert valmap(len, dict(c)) == valmap(len, dict(c))

def test_fit(self):
    d = from_sklearn(self.sk)
    b = db.from_sequence(self.raw_X)
    fit = d.fit(b, db.range(len(self.raw_X), len(self.raw_X)))
    assert fit is d

def test_groupby_tasks_2(size, npartitions, groups):
    func = lambda x: x % groups
    b = db.range(size, npartitions=npartitions).groupby(func, method='tasks')
    result = b.compute(get=dask.get)
    assert dict(result) == groupby(func, range(size))

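# This variant predates two renames visible in the earlier variants above:
# groupby's method= keyword became shuffle=, and .compute(get=...) became
# .compute(scheduler=...). A small runnable sketch of the modern spelling,
# mirroring the assertion style of these tests (assumes a recent dask):
import dask.bag as db
from toolz import groupby as toolz_groupby

mod3 = lambda x: x % 3
result = (db.range(20, npartitions=5)
          .groupby(mod3, shuffle="tasks")
          .compute(scheduler="sync"))
assert dict(result) == toolz_groupby(mod3, range(20))
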
from dask import bag as db
from dask.diagnostics import ProgressBar
import sprite2.aws
import sprite2.dask

import math


# primality test
def is_prime(n):
    if n < 2:  # 0 and 1 are not prime
        return False
    if n % 2 == 0 and n > 2:
        return False
    return all(n % i for i in range(3, int(math.sqrt(n)) + 1, 2))


# 100 million numbers in 100 chunks of 1 million each
d = (db.range(100000000, npartitions=100)
     .filter(is_prime)
     .count())

# compute the result, showing a progress bar
with ProgressBar():
    result = d.compute(get=sprite2.dask.get)
print(result)