def test_groupby_tasks():
    b = db.from_sequence(range(160), npartitions=4)
    out = b.groupby(lambda x: x % 10, max_branch=4, method='tasks')
    partitions = dask.get(out.dask, out._keys())

    for a in partitions:
        for b in partitions:
            if a is not b:
                assert not set(pluck(0, a)) & set(pluck(0, b))

    b = db.from_sequence(range(1000), npartitions=100)
    out = b.groupby(lambda x: x % 123, method='tasks')
    assert len(out.dask) < 100**2
    partitions = dask.get(out.dask, out._keys())

    for a in partitions:
        for b in partitions:
            if a is not b:
                assert not set(pluck(0, a)) & set(pluck(0, b))

    b = db.from_sequence(range(10000), npartitions=345)
    out = b.groupby(lambda x: x % 2834, max_branch=24, method='tasks')
    partitions = dask.get(out.dask, out._keys())

    for a in partitions:
        for b in partitions:
            if a is not b:
                assert not set(pluck(0, a)) & set(pluck(0, b))

def test_to_dataframe():
    try:
        import dask.dataframe
        import pandas as pd
    except ImportError:
        return
    b = db.from_sequence([(1, 2), (10, 20), (100, 200)], npartitions=2)

    df = b.to_dataframe()
    assert list(df.columns) == list(pd.DataFrame(list(b)).columns)

    df = b.to_dataframe(columns=['a', 'b'])
    assert df.npartitions == b.npartitions
    assert list(df.columns) == ['a', 'b']

    assert df.a.compute().values.tolist() == list(b.pluck(0))
    assert df.b.compute().values.tolist() == list(b.pluck(1))

    b = db.from_sequence([{'a': 1, 'b': 2},
                          {'a': 10, 'b': 20},
                          {'a': 100, 'b': 200}], npartitions=2)
    df2 = b.to_dataframe()

    assert (df2.compute().values == df.compute().values).all()

def test_to_dataframe():
    pytest.importorskip('dask.dataframe')
    pd = pytest.importorskip('pandas')
    b = db.from_sequence([(1, 2), (10, 20), (100, 200)], npartitions=2)

    df = b.to_dataframe()
    assert list(df.columns) == list(pd.DataFrame(list(b)).columns)

    df = b.to_dataframe(columns=['a', 'b'])
    assert df.npartitions == b.npartitions
    assert list(df.columns) == ['a', 'b']

    assert df.a.compute().values.tolist() == list(b.pluck(0))
    assert df.b.compute().values.tolist() == list(b.pluck(1))

    b = db.from_sequence([{'a': 1, 'b': 2},
                          {'a': 10, 'b': 20},
                          {'a': 100, 'b': 200}], npartitions=2)
    df2 = b.to_dataframe()
    assert (df2.compute().values == df.compute().values).all()

    assert df2._name == b.to_dataframe()._name
    assert df2._name != df._name

    meta = pd.DataFrame({'a': [1], 'b': [2]}).iloc[0:0]
    df3 = b.to_dataframe(columns=meta)
    assert df2._name == df3._name
    assert (df3.compute().values == df2.compute().values).all()

    b = db.from_sequence([1, 2, 3, 4, 5], npartitions=2)
    df4 = b.to_dataframe()
    assert len(df4.columns) == 1
    assert list(df4.compute()) == list(pd.DataFrame(list(b)))

def test_map_partitions_args_kwargs():
    x = [random.randint(-100, 100) for i in range(100)]
    y = [random.randint(-100, 100) for i in range(100)]

    dx = db.from_sequence(x, npartitions=10)
    dy = db.from_sequence(y, npartitions=10)

    def maximum(x, y=0):
        y = repeat(y) if isinstance(y, int) else y
        return [max(a, b) for (a, b) in zip(x, y)]

    sol = maximum(x, y=10)
    assert db.map_partitions(maximum, dx, y=10).compute() == sol
    assert dx.map_partitions(maximum, y=10).compute() == sol
    assert dx.map_partitions(maximum, 10).compute() == sol

    sol = maximum(x, y)
    assert db.map_partitions(maximum, dx, dy).compute() == sol
    assert dx.map_partitions(maximum, y=dy).compute() == sol
    assert dx.map_partitions(maximum, dy).compute() == sol

    dy_mean = dy.mean().apply(int)
    sol = maximum(x, int(sum(y) / len(y)))
    assert dx.map_partitions(maximum, y=dy_mean).compute() == sol
    assert dx.map_partitions(maximum, dy_mean).compute() == sol

    dy_mean = dask.delayed(dy_mean)
    assert dx.map_partitions(maximum, y=dy_mean).compute() == sol
    assert dx.map_partitions(maximum, dy_mean).compute() == sol

def test_map_method():
    b = db.from_sequence(range(100), npartitions=10)
    b2 = db.from_sequence(range(100, 200), npartitions=10)
    x = b.compute()
    x2 = b2.compute()

    def myadd(a, b=2, c=3):
        return a + b + c

    assert b.map(myadd).compute() == list(map(myadd, x))
    assert b.map(myadd, b2).compute() == list(map(myadd, x, x2))
    assert b.map(myadd, 10).compute() == [myadd(i, 10) for i in x]
    assert b.map(myadd, b=10).compute() == [myadd(i, b=10) for i in x]
    assert (b.map(myadd, b2, c=10).compute() ==
            [myadd(i, j, 10) for (i, j) in zip(x, x2)])
    x_sum = sum(x)
    assert (b.map(myadd, b.sum(), c=10).compute() ==
            [myadd(i, x_sum, 10) for i in x])

    # check that map works with multiarg functions. Can be removed after
    # deprecated behavior is removed
    assert b.map(add, b2).compute() == list(map(add, x, x2))

    # check that map works with vararg functions. Can be removed after
    # deprecated behavior is removed
    def vararg_inc(*args):
        return inc(*args)

    assert b.map(vararg_inc).compute(get=dask.get) == list(map(inc, x))

def test_fold():
    c = b.fold(add)
    assert c.compute() == sum(L)
    assert c.key == b.fold(add).key

    c2 = b.fold(add, initial=10)
    assert c2.key != c.key
    assert c2.compute() == sum(L) + 10 * b.npartitions
    assert c2.key == b.fold(add, initial=10).key

    c = db.from_sequence(range(5), npartitions=3)

    def binop(acc, x):
        acc = acc.copy()
        acc.add(x)
        return acc

    d = c.fold(binop, set.union, initial=set())
    assert d.compute() == set(c)
    assert d.key == c.fold(binop, set.union, initial=set()).key

    d = db.from_sequence('hello')
    assert set(d.fold(lambda a, b: ''.join([a, b]), initial='').compute()) == set('hello')

    e = db.from_sequence([[1], [2], [3]], npartitions=2)
    with dask.set_options(get=get_sync):
        assert set(e.fold(add, initial=[]).compute()) == set([1, 2, 3])

def test_dags(executor):
    # build dags by using itemgetter and dicts
    scope = dict(
        a=db.from_sequence(range(0, 10), npartitions=3),
        b=db.from_sequence(range(10, 20), npartitions=3),
        c=db.from_sequence(range(20, 30), npartitions=3),
    )

    graph = chained(
        apply_concat([
            chained(op.itemgetter('a'), sum, seq),
            chained(op.itemgetter('b'), sum, seq),
            chained(op.itemgetter('c'), sum, seq),
        ]),
        apply_concat([
            chained(max, seq),
            chained(min, seq),
            chained(sum, seq),
        ]),
    )

    actual = executor(graph, scope)

    assert sorted(actual) == sorted([
        sum(range(20, 30)),
        sum(range(0, 10)),
        sum(range(0, 30)),
    ])

def test_concat():
    a = db.from_sequence([1, 2, 3])
    b = db.from_sequence([4, 5, 6])
    c = db.concat([a, b])
    assert list(c) == [1, 2, 3, 4, 5, 6]

    b = db.from_sequence([1, 2, 3]).map(lambda x: x * [1, 2, 3])
    assert list(b.concat()) == [1, 2, 3] * sum([1, 2, 3])

def test_reduction_empty_aggregate(npartitions):
    b = db.from_sequence([0, 0, 0, 1], npartitions=npartitions).filter(None)
    assert_eq(b.min(split_every=2), 1)
    vals = db.compute(b.min(split_every=2), b.max(split_every=2),
                      scheduler='sync')
    assert vals == (1, 1)
    with pytest.raises(ValueError):
        b = db.from_sequence([0, 0, 0, 0], npartitions=npartitions)
        b.filter(None).min(split_every=2).compute(scheduler='sync')

def test_reduction_empty_aggregate(npartitions):
    b = db.from_sequence([0, 0, 0, 1], npartitions=npartitions).filter(None)
    assert b.min(split_every=2).compute(get=dask.get) == 1
    vals = db.compute(b.min(split_every=2), b.max(split_every=2),
                      get=dask.get)
    assert vals == (1, 1)
    with pytest.raises(ValueError):
        b = db.from_sequence([0, 0, 0, 0], npartitions=npartitions)
        b.filter(None).min(split_every=2).compute(get=dask.get)

def test_product():
    b2 = b.product(b)
    assert b2.npartitions == b.npartitions**2
    assert set(b2) == set([(i, j) for i in L for j in L])

    x = db.from_sequence([1, 2, 3, 4])
    y = db.from_sequence([10, 20, 30])
    z = x.product(y)
    assert set(z) == set([(i, j) for i in [1, 2, 3, 4] for j in [10, 20, 30]])

def test_to_delayed():
    b = db.from_sequence([1, 2, 3, 4, 5, 6], npartitions=3)
    a, b, c = b.map(inc).to_delayed()
    assert all(isinstance(x, Delayed) for x in [a, b, c])
    assert b.compute() == [4, 5]

    b = db.from_sequence([1, 2, 3, 4, 5, 6], npartitions=3)
    t = b.sum().to_delayed()
    assert isinstance(t, Delayed)
    assert t.compute() == 21

def test_map_with_constructors():
    assert db.from_sequence([[1, 2, 3]]).map(A).compute()
    assert db.from_sequence([1, 2, 3]).map(B).compute()
    assert db.from_sequence([[1, 2, 3]]).map(B).compute()

    failed = False
    try:
        db.from_sequence([[1]]).map(A).compute()
    except TypeError:
        failed = True
    assert failed

def test_map_with_builtins():
    b = db.from_sequence(range(3))
    assert ' '.join(b.map(str)) == '0 1 2'
    assert b.map(str).map(tuple).compute() == [('0',), ('1',), ('2',)]
    assert b.map(str).map(tuple).map(any).compute() == [True, True, True]

    b2 = b.map(lambda n: [(n, n + 1), (2 * (n - 1), -n)])
    assert b2.map(dict).compute() == [{0: 1, -2: 0}, {1: 2, 0: -1}, {2: -2}]
    assert b.map(lambda n: (n, n + 1)).map(pow).compute() == [0, 1, 8]
    assert b.map(bool).compute() == [False, True, True]
    assert db.from_sequence([(1, 'real'), ('1', 'real')]).map(hasattr).compute() == \
        [True, False]

def test_concat():
    a = db.from_sequence([1, 2, 3])
    b = db.from_sequence([4, 5, 6])
    c = db.concat([a, b])
    assert list(c) == [1, 2, 3, 4, 5, 6]
    assert c.name == db.concat([a, b]).name
    assert b.concat().name != a.concat().name
    assert b.concat().name == b.concat().name

    b = db.from_sequence([1, 2, 3]).map(lambda x: x * [1, 2, 3])
    assert list(b.concat()) == [1, 2, 3] * sum([1, 2, 3])

def test_to_imperative():
    from dask.imperative import Value
    b = db.from_sequence([1, 2, 3, 4, 5, 6], npartitions=3)
    a, b, c = b.map(inc).to_imperative()
    assert all(isinstance(x, Value) for x in [a, b, c])
    assert b.compute() == [4, 5]

    b = db.from_sequence([1, 2, 3, 4, 5, 6], npartitions=3)
    t = b.sum().to_imperative()
    assert isinstance(t, Value)
    assert t.compute() == 21

def test_bag_map():
    b = db.from_sequence(range(100), npartitions=10)
    b2 = db.from_sequence(range(100, 200), npartitions=10)
    x = b.compute()
    x2 = b2.compute()

    def myadd(a=1, b=2, c=3):
        return a + b + c

    assert db.map(myadd, b).compute() == list(map(myadd, x))
    assert db.map(myadd, a=b).compute() == list(map(myadd, x))
    assert db.map(myadd, b, b2).compute() == list(map(myadd, x, x2))
    assert db.map(myadd, b, 10).compute() == [myadd(i, 10) for i in x]
    assert db.map(myadd, 10, b=b).compute() == [myadd(10, b=i) for i in x]

    sol = [myadd(i, b=j, c=100) for (i, j) in zip(x, x2)]
    assert db.map(myadd, b, b=b2, c=100).compute() == sol

    sol = [myadd(i, c=100) for (i, j) in zip(x, x2)]
    assert db.map(myadd, b, c=100).compute() == sol

    x_sum = sum(x)
    sol = [myadd(x_sum, b=i, c=100) for i in x2]
    assert db.map(myadd, b.sum(), b=b2, c=100).compute() == sol

    sol = [myadd(i, b=x_sum, c=100) for i in x2]
    assert db.map(myadd, b2, b.sum(), c=100).compute() == sol

    sol = [myadd(a=100, b=x_sum, c=i) for i in x2]
    assert db.map(myadd, a=100, b=b.sum(), c=b2).compute() == sol

    a = dask.delayed(10)
    assert db.map(myadd, b, a).compute() == [myadd(i, 10) for i in x]
    assert db.map(myadd, b, b=a).compute() == [myadd(i, b=10) for i in x]

    # Mismatched npartitions
    fewer_parts = db.from_sequence(range(100), npartitions=5)
    with pytest.raises(ValueError):
        db.map(myadd, b, fewer_parts)

    # No bags
    with pytest.raises(ValueError):
        db.map(myadd, b.sum(), 1, 2)

    # Unequal partitioning
    unequal = db.from_sequence(range(110), npartitions=10)
    with pytest.raises(ValueError):
        db.map(myadd, b, unequal, c=b2).compute()
    with pytest.raises(ValueError):
        db.map(myadd, b, b=unequal, c=b2).compute()

def test_distinct():
    assert sorted(b.distinct()) == [0, 1, 2, 3, 4]
    assert b.distinct().name == b.distinct().name
    assert 'distinct' in b.distinct().name
    assert b.distinct().count().compute() == 5

    bag = db.from_sequence([0] * 50, npartitions=50)
    assert bag.filter(None).distinct().compute() == []

def test_frequencies():
    c = b.frequencies()
    assert dict(c) == {0: 3, 1: 3, 2: 3, 3: 3, 4: 3}

    c2 = b.frequencies(split_every=2)
    assert dict(c2) == {0: 3, 1: 3, 2: 3, 3: 3, 4: 3}
    assert c.name == b.frequencies().name
    assert c.name != c2.name
    assert c2.name == b.frequencies(split_every=2).name

    # test bag with empty partitions
    b2 = db.from_sequence(range(20), partition_size=2)
    b2 = b2.filter(lambda x: x < 10)
    d = b2.frequencies()
    assert dict(d) == dict(zip(range(10), [1] * 10))

    bag = db.from_sequence([0, 0, 0, 0], npartitions=4)
    bag2 = bag.filter(None).frequencies(split_every=2)
    assert dict(bag2.compute(get=dask.get)) == {}

def apply_to_local(
        transform, obj, npartitions=None, get=None, rules=None, rewrites=(),
):
    """Distribute obj, then apply the transform, finally compute the result.

    :param Callable[Any,Any] transform:
        the transformation to apply.

    :param Sequence[Any] obj:
        the list of objects to transform.

    :param Optional[int] npartitions:
        the number of partitions to split the original sequence into.

    :param Callable[Any,Any,Any] get:
        the get function to use when computing the resulting dask object.
        To execute in parallel, use the ``get`` method of
        ``distributed.Client``. See :class:`flowly.dst.LocalCluster` for a
        simple way to start a local cluster and to construct the client.

    :param Optional[Iterable] rules:
        See :func:`flowly.dsk.apply`.

    :param Iterable[Callable[Callable[Any,Any],Callable[Any,Any]]] rewrites:
        See :func:`flowly.tz.apply`.
    """
    obj = db.from_sequence(obj, npartitions=npartitions)
    obj = apply(transform, obj, rewrites=rewrites, rules=rules)
    return obj.compute(get=get)

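# Hedged illustration (not part of the original module): the three steps that
# apply_to_local performs, spelled out for a single assumed transform object.
# ``my_transform`` is a placeholder name; real transforms are built with the
# flowly combinators (``chained``, ``apply_concat``, ...) used elsewhere here.
def _apply_to_local_by_hand(my_transform, items):
    bag = db.from_sequence(items, npartitions=None)  # 1. distribute the data
    transformed = apply(my_transform, bag)           # 2. build the dask graph
    return transformed.compute(get=dask.get)         # 3. run it synchronously
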
def test__futures_to_dask_bag(s, a, b):
    import dask.bag as db
    e = Executor((s.ip, s.port), start=False)
    yield e._start()

    L = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
    futures = yield e._scatter(L)

    rb = yield _futures_to_dask_bag(futures)
    assert isinstance(rb, db.Bag)
    assert rb.npartitions == len(L)

    lb = db.from_sequence([1, 2, 3, 4, 5, 6, 7, 8, 9], npartitions=3)

    exprs = [lambda x: x.map(lambda x: x + 1).sum(),
             lambda x: x.filter(lambda x: x % 2)]

    for expr in exprs:
        local = expr(lb).compute(get=dask.get)
        remote = e.compute(expr(rb))
        remote = yield remote._result()

        assert local == remote

    yield e._shutdown()

def test_grid_dimensions(self):
    from neuronunit.optimization.model_parameters import model_params
    provided_keys = list(model_params.keys())
    USE_CACHED_GS = False
    from neuronunit.optimization import exhaustive_search
    from neuronunit.optimization.optimization_management import map_wrapper
    import dask.bag as db
    npoints = 2
    nparams = 3
    for i in range(1, 10):
        for j in range(1, 10):
            grid_points = exhaustive_search.create_grid(npoints=i, nparams=j)
            b0 = db.from_sequence(grid_points[0:2], npartitions=8)
            dtcpop = list(db.map(exhaustive_search.update_dtc_grid, b0).compute())
            self.assertEqual(i * j, len(dtcpop))
            self.assertNotEqual(dtcpop, None)
            dtcpop_compare = map_wrapper(exhaustive_search.update_dtc_grid,
                                         grid_points[0:2])
            self.assertNotEqual(dtcpop_compare, None)
            self.assertEqual(len(dtcpop_compare), len(dtcpop))
            for i, j in enumerate(dtcpop):
                for k, v in dtcpop_compare[i].attrs.items():
                    print(k, v, i, j)
                    self.assertEqual(j.attrs[k], v)
    return True

def test_unzip():
    b = db.from_sequence(range(100)).map(lambda x: (x, x + 1, x + 2))
    one, two, three = b.unzip(3)
    assert list(one) == list(range(100))
    assert list(three) == [i + 2 for i in range(100)]
    assert one.name == b.unzip(3)[0].name
    assert one.name != two.name

def test_to_textfiles_endlines():
    b = db.from_sequence(['a', 'b', 'c'], npartitions=1)
    with tmpfile() as fn:
        b.to_textfiles([fn])
        with open(fn, 'r') as f:
            result = f.readlines()
        assert result == ['a\n', 'b\n', 'c']

def test_non_splittable_reductions(npartitions):
    np = pytest.importorskip('numpy')
    data = list(range(100))
    c = db.from_sequence(data, npartitions=npartitions)

    assert_eq(c.mean(), np.mean(data))
    assert_eq(c.std(), np.std(data))

def test_01a_compute_score(dtcpop):
    from neuronunit.optimization import get_neab
    from neuronunit.optimization.optimization_management import dtc_to_rheo
    from neuronunit.optimization.optimization_management import nunit_evaluation
    from neuronunit.optimization.optimization_management import format_test
    # dtcpop = grid_points()
    dtclist = list(map(dtc_to_rheo, dtcpop))
    for d in dtclist:
        assert len(list(d.attrs.values())) > 0
    import dask.bag as db
    b0 = db.from_sequence(dtclist, npartitions=8)
    dtclist = list(db.map(format_test, b0).compute())

    b0 = db.from_sequence(dtclist, npartitions=8)
    dtclist = list(db.map(nunit_evaluation, b0).compute())
    return dtclist

def main(kind):
    input_array = np.random.random(5000)

    getter = {'processes': dask.multiprocessing.get,
              'threads': dask.threaded.get}[kind]

    # sets the scheduler
    with dask.set_options(get=getter):
        # set ``partition_size`` to ensure each partition has enough work
        bag = db.from_sequence(input_array, partition_size=1000)

        # compute elemwise cosine on the gpu within each partition
        bag_cos = bag.map_partitions(
            lambda x: gpu_cos(np.asarray(x, dtype=np.float32)))

        # apply partial sum-reduce on each partition
        # then, finish it on the host
        got = bag_cos.reduction(sum_parts, sum).compute()

        # cross validate with numpy
        expected = np.sum(np.cos(input_array))

        print('Got: ', got)
        print('Expected:', expected)
        correct = np.allclose(got, expected)
        print('Correct: ', correct)

    sys.exit(0 if correct else 1)

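# Hedged sketches of the two helpers assumed by ``main`` above; their real
# definitions are not part of this snippet. The actual ``gpu_cos`` presumably
# runs on the GPU, but a NumPy stand-in keeps the example runnable on the host.
def gpu_cos(arr):
    # placeholder: elementwise cosine on the host instead of the device
    return np.cos(arr)


def sum_parts(partition):
    # partial reduction: sum the cosine values within one partition
    return sum(partition)
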
def test_roundtrip(tmpdir, codec):
    if codec == 'snappy':
        pytest.importorskip('snappy')
    fn = os.path.join(tmpdir, 'out*.avro')
    b = db.from_sequence(expected, npartitions=3)
    b.to_avro(fn, schema=schema, codec=codec)
    b2 = db.read_avro(fn)
    assert b.compute() == b2.compute()

def test_random_sample_random_state():
    """
    Sampling with fixed random seed generates identical results.
    """
    a = db.from_sequence(range(50), npartitions=5)
    b = a.random_sample(0.5, 1234)
    c = a.random_sample(0.5, 1234)
    assert list(b) == list(c)

def test_to_textfiles_name_function_preserves_order():
    seq = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
           'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p']
    b = db.from_sequence(seq, npartitions=16)
    with tmpdir() as dn:
        b.to_textfiles(dn)

        out = db.read_text(os.path.join(dn, "*"),
                           encoding='ascii').map(str).map(str.strip).compute()
        assert seq == out

def test_bag_paths():
    b = db.from_sequence(['abc', '123', 'xyz'], npartitions=2)
    assert b.to_textfiles('foo*') == ['foo0', 'foo1']
    os.remove('foo0')
    os.remove('foo1')

def test_concat():
    a = db.from_sequence([1, 2, 3])
    b = db.from_sequence([4, 5, 6])
    c = db.concat([a, b])
    assert list(c) == [1, 2, 3, 4, 5, 6]
    assert c.name == db.concat([a, b]).name

def test_to_dataframe():
    dd = pytest.importorskip('dask.dataframe')
    pd = pytest.importorskip('pandas')

    def check_parts(df, sol):
        assert all((p.dtypes == sol.dtypes).all()
                   for p in dask.compute(*df.to_delayed()))

    dsk = {('test', 0): [(1, 2)],
           ('test', 1): [],
           ('test', 2): [(10, 20), (100, 200)]}
    b = Bag(dsk, 'test', 3)
    sol = pd.DataFrame(b.compute(), columns=['a', 'b'])

    # Elements are tuples
    df = b.to_dataframe()
    dd.utils.assert_eq(df, sol.rename(columns={'a': 0, 'b': 1}),
                       check_index=False)
    df = b.to_dataframe(columns=['a', 'b'])
    dd.utils.assert_eq(df, sol, check_index=False)
    check_parts(df, sol)
    df = b.to_dataframe(meta=[('a', 'i8'), ('b', 'i8')])
    dd.utils.assert_eq(df, sol, check_index=False)
    check_parts(df, sol)

    # Elements are dictionaries
    b = b.map(lambda x: dict(zip(['a', 'b'], x)))
    df = b.to_dataframe()
    dd.utils.assert_eq(df, sol, check_index=False)
    check_parts(df, sol)
    assert df._name == b.to_dataframe()._name

    # With metadata specified
    for meta in [sol, [('a', 'i8'), ('b', 'i8')]]:
        df = b.to_dataframe(meta=meta)
        dd.utils.assert_eq(df, sol, check_index=False)
        check_parts(df, sol)

    # Error to specify both columns and meta
    with pytest.raises(ValueError):
        b.to_dataframe(columns=['a', 'b'], meta=sol)

    # Inference fails if empty first partition
    b2 = b.filter(lambda x: x['a'] > 200)
    with pytest.raises(ValueError):
        b2.to_dataframe()

    # Single column
    b = b.pluck('a')
    sol = sol[['a']]
    df = b.to_dataframe(meta=sol)
    dd.utils.assert_eq(df, sol, check_index=False)
    check_parts(df, sol)

    # Works with iterators and tuples
    sol = pd.DataFrame({'a': range(100)})
    b = db.from_sequence(range(100), npartitions=5)
    for f in [iter, tuple]:
        df = b.map_partitions(f).to_dataframe(meta=sol)
        dd.utils.assert_eq(df, sol, check_index=False)
        check_parts(df, sol)

def bag_to_iterator(x, **kwargs):
    return db.from_sequence(x, **filter_kwargs(db.from_sequence, kwargs))

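# Hedged sketch of the ``filter_kwargs`` helper used above (its definition is
# not included in this snippet): keep only the keyword arguments that the
# target callable actually accepts, so unrelated kwargs are dropped silently.
import inspect


def filter_kwargs(func, kwargs):
    allowed = inspect.signature(func).parameters
    return {k: v for k, v in kwargs.items() if k in allowed}
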
def test_reduction_empty():
    b = db.from_sequence(range(10), npartitions=100)
    assert b.filter(lambda x: x % 2 == 0).max().compute(get=dask.get) == 8
    assert b.filter(lambda x: x % 2 == 0).min().compute(get=dask.get) == 0

def test_empty():
    assert list(db.from_sequence([])) == []

def test_concat_after_map():
    a = db.from_sequence([1, 2])
    b = db.from_sequence([4, 5])
    result = db.concat([a.map(inc), b])
    assert list(result) == [2, 3, 4, 5]

def test_string_namespace_with_unicode():
    b = db.from_sequence([u'Alice Smith', u'Bob Jones', 'Charlie Smith'],
                         npartitions=2)
    assert list(b.str.lower()) == ['alice smith', 'bob jones', 'charlie smith']

def test_str_empty_split():
    b = db.from_sequence([u'Alice Smith', u'Bob Jones', 'Charlie Smith'],
                         npartitions=2)
    assert list(b.str.split()) == [['Alice', 'Smith'],
                                   ['Bob', 'Jones'],
                                   ['Charlie', 'Smith']]

def test_bag_class_extend():
    dictbag = BagOfDicts(*db.from_sequence([{'a': {'b': 'c'}}])._args)
    assert dictbag.get('a').get('b').compute()[0] == 'c'
    assert dictbag.get('a').set('d', 'EXTENSIBILITY!!!').compute()[0] == \
        {'b': 'c', 'd': 'EXTENSIBILITY!!!'}
    assert isinstance(dictbag.get('a').get('b'), BagOfDicts)

def test_bag_compute_forward_kwargs():
    x = db.from_sequence([1, 2, 3]).map(lambda a: a + 1)
    x.compute(bogus_keyword=10)

def test_zip(npartitions, hi=1000):
    evens = db.from_sequence(range(0, hi, 2), npartitions=npartitions)
    odds = db.from_sequence(range(1, hi, 2), npartitions=npartitions)
    pairs = db.zip(evens, odds)
    assert pairs.npartitions == npartitions
    assert list(pairs) == list(zip(range(0, hi, 2), range(1, hi, 2)))

def test_empty_bag():
    b = db.from_sequence([])
    assert b.map(inc).all().compute(get=dask.get)
    assert not b.map(inc).any().compute(get=dask.get)
    assert not b.map(inc).sum().compute(get=dask.get)
    assert not b.map(inc).count().compute(get=dask.get)

def test_string_namespace_with_unicode(): b = db.from_sequence([u"Alice Smith", u"Bob Jones", "Charlie Smith"], npartitions=2) assert list(b.str.lower()) == ["alice smith", "bob jones", "charlie smith"]
def test_msgpack_unicode():
    b = db.from_sequence([{"a": 1}]).groupby("a")
    result = b.compute(get=dask.async.get_sync)
    assert dict(result) == {1: [{'a': 1}]}

def test_from_long_sequence():
    L = list(range(1001))
    b = db.from_sequence(L)
    assert set(b) == set(L)

def test_from_sequence():
    b = db.from_sequence([1, 2, 3, 4, 5], npartitions=3)
    assert len(b.dask) == 3
    assert set(b) == set([1, 2, 3, 4, 5])

def test_map_function_with_multiple_arguments():
    b = db.from_sequence([(1, 10), (2, 20), (3, 30)], npartitions=3)
    assert list(b.map(lambda x, y: x + y).compute(get=dask.get)) == [11, 22, 33]
    assert list(b.map(list).compute()) == [[1, 10], [2, 20], [3, 30]]

def test_reduction_with_non_comparable_objects():
    b = db.from_sequence([StrictReal(x) for x in range(10)], partition_size=2)
    assert b.fold(max, max).compute(get=dask.get) == StrictReal(9)

def test_non_splittable_reductions():
    np = pytest.importorskip('numpy')
    data = list(range(100))
    c = db.from_sequence(data, npartitions=10)
    assert c.mean().compute() == np.mean(data)
    assert c.std().compute(get=dask.get) == np.std(data)

def n_random_resamples(
    *args, samples, n_repeats, function=None, function_kwargs=None,
    bundle_args=True, replace=True, with_dask=True
):
    """
    Repeatedly randomly resample xarray args and return results passed through function.

    Parameters
    ----------
    *args : xarray DataArray or Dataset
        Objects containing data to be resampled. The coordinates of the first
        object are used for resampling. The same resampling is applied to all
        objects.
    samples : dict
        Dictionary containing dimensions to subsample, number of samples and
        continuous block size within the sample. Of the form
        {'dim1': (n_samples, block_size), 'dim2': (n_samples, block_size)}.
        The first object in args must contain all dimensions listed in
        samples, but subsequent objects need not.
    n_repeats : int
        Number of times to repeat the resampling process.
    function : function object, optional
        Function to reduce the subsampled data.
    function_kwargs : dict, optional
        Keyword arguments to provide to function.
    bundle_args : bool, default True
        If True, pass all resampled objects to function together. Otherwise
        pass each object through function separately.
    replace : bool, default True
        Whether the sample is with or without replacement.
    with_dask : bool, default True
        If True, use dask to parallelize across n_repeats using dask.delayed.

    Returns
    -------
    sample : xarray DataArray or Dataset
        Array containing the results of passing the subsampled data through
        function.
    """
    if with_dask & (n_repeats > 500):
        n_args = itertools.repeat(args[0], times=n_repeats)
        b = db.from_sequence(n_args, npartitions=100)
        rs_list = b.map(
            random_resample,
            *(args[1:]),
            **{
                "samples": samples,
                "function": function,
                "function_kwargs": function_kwargs,
                "replace": replace,
            }
        ).compute()
    else:
        resample_ = dask.delayed(random_resample) if with_dask else random_resample
        rs_list = [
            resample_(
                *args,
                samples=samples,
                function=function,
                function_kwargs=function_kwargs,
                bundle_args=bundle_args,
                replace=replace
            )
            for _ in range(n_repeats)
        ]

    if with_dask:
        rs_list = dask.compute(rs_list)[0]

    if all(isinstance(r, tuple) for r in rs_list):
        return tuple(
            [xr.concat([r.unify_chunks() for r in rs], dim="k") for rs in zip(*rs_list)]
        )
    else:
        return xr.concat([r.unify_chunks() for r in rs_list], dim="k")

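# Hedged usage sketch (illustrative, not from the original source): bootstrap
# the mean of a toy DataArray by resampling it 100 times along ``time``.  It
# assumes the companion ``random_resample`` helper referenced above is
# importable; the dimension name and sample sizes are made up for the example.
import numpy as np
import xarray as xr

da = xr.DataArray(np.random.rand(40), dims=["time"])
boot_means = n_random_resamples(
    da,
    samples={"time": (40, 1)},          # 40 samples of block size 1 along time
    n_repeats=100,
    function=lambda x: x.mean("time"),  # reduce each resample to its mean
    with_dask=False,                    # keep the example serial and small
)
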
def test_topk_with_multiarg_lambda():
    b = db.from_sequence([(1, 10), (2, 9), (3, 8)], npartitions=2)
    assert list(b.topk(2, key=lambda a, b: b)) == [(1, 10), (2, 9)]

def test_to_textfiles_name_function_warn():
    seq = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
           'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p']
    a = db.from_sequence(seq, npartitions=16)
    with tmpdir() as dn:
        with pytest.warns(None):
            a.to_textfiles(dn, name_function=str)

def test_topk_with_non_callable_key():
    b = db.from_sequence([(1, 10), (2, 9), (3, 8)], npartitions=2)
    assert list(b.topk(2, key=1)) == [(1, 10), (2, 9)]
    assert list(b.topk(2, key=0)) == [(3, 8), (2, 9)]
    assert b.topk(2, key=1).name == b.topk(2, key=1).name

def test_flatten():
    b = db.from_sequence([[1], [2, 3]])
    assert list(b.flatten()) == [1, 2, 3]
    assert b.flatten().name == b.flatten().name

def test_pluck_with_default():
    b = db.from_sequence(['Hello', '', 'World'])
    pytest.raises(IndexError, lambda: list(b.pluck(0)))
    assert list(b.pluck(0, None)) == ['H', None, 'W']
    assert b.pluck(0, None).name == b.pluck(0, None).name
    assert b.pluck(0).name != b.pluck(0, None).name

def test_groupby_with_indexer():
    b = db.from_sequence([[1, 2, 3], [1, 4, 9], [2, 3, 4]])
    result = dict(b.groupby(0))
    assert valmap(sorted, result) == {1: [[1, 2, 3], [1, 4, 9]],
                                      2: [[2, 3, 4]]}

def test_bag_with_single_callable():
    f = lambda: None
    b = db.from_sequence([f])
    assert list(b.compute(get=dask.get)) == [f]

def test_ensure_compute_output_is_concrete():
    b = db.from_sequence([1, 2, 3])
    result = b.map(lambda x: x + 1).compute()
    assert not isinstance(result, Iterator)

def test_std():
    assert_eq(b.std(), math.sqrt(2.0))
    assert float(b.std()) == math.sqrt(2.0)


def test_var():
    assert_eq(b.var(), 2.0)
    assert float(b.var()) == 2.0


@pytest.mark.parametrize("transform", [identity,
                                       dask.delayed,
                                       lambda x: db.from_sequence(x, npartitions=1)])
def test_join(transform):
    other = transform([1, 2, 3])
    c = b.join(other, on_self=isodd, on_other=iseven)
    assert_eq(c, list(join(iseven, [1, 2, 3], isodd, list(b))))
    assert_eq(b.join(other, isodd),
              list(join(isodd, [1, 2, 3], isodd, list(b))))
    assert c.name == b.join(other, on_self=isodd, on_other=iseven).name


def test_foldby():
    c = b.foldby(iseven, add, 0, add, 0)
    assert (reduceby, iseven, add, (b.name, 0), 0) in list(c.dask.values())
    assert set(c) == set(reduceby(iseven, lambda acc, x: acc + x, L, 0).items())
    assert c.name == b.foldby(iseven, add, 0, add, 0).name