Example #1
def test_groupby_tasks():
    b = db.from_sequence(range(160), npartitions=4)
    out = b.groupby(lambda x: x % 10, max_branch=4, method='tasks')
    partitions = dask.get(out.dask, out._keys())

    # each group key should land in exactly one output partition
    for p1 in partitions:
        for p2 in partitions:
            if p1 is not p2:
                assert not set(pluck(0, p1)) & set(pluck(0, p2))


    b = db.from_sequence(range(1000), npartitions=100)
    out = b.groupby(lambda x: x % 123, method='tasks')
    assert len(out.dask) < 100**2
    partitions = dask.get(out.dask, out._keys())

    for p1 in partitions:
        for p2 in partitions:
            if p1 is not p2:
                assert not set(pluck(0, p1)) & set(pluck(0, p2))


    b = db.from_sequence(range(10000), npartitions=345)
    out = b.groupby(lambda x: x % 2834, max_branch=24, method='tasks')
    partitions = dask.get(out.dask, out._keys())

    for p1 in partitions:
        for p2 in partitions:
            if p1 is not p2:
                assert not set(pluck(0, p1)) & set(pluck(0, p2))
Example #2
def test_to_dataframe():
    try:
        import dask.dataframe
        import pandas as pd
    except ImportError:
        return
    b = db.from_sequence([(1, 2), (10, 20), (100, 200)], npartitions=2)

    df = b.to_dataframe()
    assert list(df.columns) == list(pd.DataFrame(list(b)).columns)

    df = b.to_dataframe(columns=['a', 'b'])
    assert df.npartitions == b.npartitions
    assert list(df.columns) == ['a', 'b']

    assert df.a.compute().values.tolist() == list(b.pluck(0))
    assert df.b.compute().values.tolist() == list(b.pluck(1))

    b = db.from_sequence([{'a':   1, 'b':   2},
                          {'a':  10, 'b':  20},
                          {'a': 100, 'b': 200}], npartitions=2)

    df2 = b.to_dataframe()

    assert (df2.compute().values == df.compute().values).all()
Example #3
def test_to_dataframe():
    pytest.importorskip('dask.dataframe')
    pd = pytest.importorskip('pandas')
    b = db.from_sequence([(1, 2), (10, 20), (100, 200)], npartitions=2)

    df = b.to_dataframe()
    assert list(df.columns) == list(pd.DataFrame(list(b)).columns)

    df = b.to_dataframe(columns=['a', 'b'])
    assert df.npartitions == b.npartitions
    assert list(df.columns) == ['a', 'b']

    assert df.a.compute().values.tolist() == list(b.pluck(0))
    assert df.b.compute().values.tolist() == list(b.pluck(1))

    b = db.from_sequence([{'a':   1, 'b':   2},
                          {'a':  10, 'b':  20},
                          {'a': 100, 'b': 200}], npartitions=2)

    df2 = b.to_dataframe()

    assert (df2.compute().values == df.compute().values).all()

    assert df2._name == b.to_dataframe()._name
    assert df2._name != df._name

    meta = pd.DataFrame({'a': [1], 'b': [2]}).iloc[0:0]
    df3 = b.to_dataframe(columns=meta)
    assert df2._name == df3._name
    assert (df3.compute().values == df2.compute().values).all()

    b = db.from_sequence([1, 2, 3, 4, 5], npartitions=2)
    df4 = b.to_dataframe()
    assert len(df4.columns) == 1
    assert list(df4.compute()) == list(pd.DataFrame(list(b)))
Example #4
def test_map_partitions_args_kwargs():
    x = [random.randint(-100, 100) for _ in range(100)]
    y = [random.randint(-100, 100) for _ in range(100)]

    dx = db.from_sequence(x, npartitions=10)
    dy = db.from_sequence(y, npartitions=10)

    def maximum(x, y=0):
        y = repeat(y) if isinstance(y, int) else y
        return [max(a, b) for (a, b) in zip(x, y)]

    sol = maximum(x, y=10)
    assert db.map_partitions(maximum, dx, y=10).compute() == sol
    assert dx.map_partitions(maximum, y=10).compute() == sol
    assert dx.map_partitions(maximum, 10).compute() == sol

    sol = maximum(x, y)
    assert db.map_partitions(maximum, dx, dy).compute() == sol
    assert dx.map_partitions(maximum, y=dy).compute() == sol
    assert dx.map_partitions(maximum, dy).compute() == sol

    dy_mean = dy.mean().apply(int)
    sol = maximum(x, int(sum(y) / len(y)))
    assert dx.map_partitions(maximum, y=dy_mean).compute() == sol
    assert dx.map_partitions(maximum, dy_mean).compute() == sol

    dy_mean = dask.delayed(dy_mean)
    assert dx.map_partitions(maximum, y=dy_mean).compute() == sol
    assert dx.map_partitions(maximum, dy_mean).compute() == sol
Example #5
def test_map_method():
    b = db.from_sequence(range(100), npartitions=10)
    b2 = db.from_sequence(range(100, 200), npartitions=10)
    x = b.compute()
    x2 = b2.compute()

    def myadd(a, b=2, c=3):
        return a + b + c

    assert b.map(myadd).compute() == list(map(myadd, x))
    assert b.map(myadd, b2).compute() == list(map(myadd, x, x2))
    assert b.map(myadd, 10).compute() == [myadd(i, 10) for i in x]
    assert b.map(myadd, b=10).compute() == [myadd(i, b=10) for i in x]
    assert (b.map(myadd, b2, c=10).compute() ==
            [myadd(i, j, 10) for (i, j) in zip(x, x2)])
    x_sum = sum(x)
    assert (b.map(myadd, b.sum(), c=10).compute() ==
            [myadd(i, x_sum, 10) for i in x])

    # check that map works with multiarg functions. Can be removed after
    # deprecated behavior is removed
    assert b.map(add, b2).compute() == list(map(add, x, x2))

    # check that map works with vararg functions. Can be removed after
    # deprecated behavior is removed
    def vararg_inc(*args):
        return inc(*args)

    assert b.map(vararg_inc).compute(get=dask.get) == list(map(inc, x))
Example #6
def test_fold():
    c = b.fold(add)
    assert c.compute() == sum(L)
    assert c.key == b.fold(add).key

    c2 = b.fold(add, initial=10)
    assert c2.key != c.key
    assert c2.compute() == sum(L) + 10 * b.npartitions
    assert c2.key == b.fold(add, initial=10).key

    c = db.from_sequence(range(5), npartitions=3)

    def binop(acc, x):
        acc = acc.copy()
        acc.add(x)
        return acc

    d = c.fold(binop, set.union, initial=set())
    assert d.compute() == set(c)
    assert d.key == c.fold(binop, set.union, initial=set()).key

    d = db.from_sequence('hello')
    assert set(d.fold(lambda a, b: ''.join([a, b]), initial='').compute()) == set('hello')

    e = db.from_sequence([[1], [2], [3]], npartitions=2)
    with dask.set_options(get=get_sync):
        assert set(e.fold(add, initial=[]).compute()) == set([1, 2, 3])
Example #7
def test_dags(executor):
    # build dags by using itemgetter and dicts
    scope = dict(
        a=db.from_sequence(range(0, 10), npartitions=3),
        b=db.from_sequence(range(10, 20), npartitions=3),
        c=db.from_sequence(range(20, 30), npartitions=3),
    )

    graph = chained(
        apply_concat([
            chained(op.itemgetter('a'), sum, seq),
            chained(op.itemgetter('b'), sum, seq),
            chained(op.itemgetter('c'), sum, seq),
        ]),
        apply_concat([
            chained(max, seq),
            chained(min, seq),
            chained(sum, seq),
        ])
    )

    actual = executor(graph, scope)
    assert sorted(actual) == sorted([
        sum(range(20, 30)),
        sum(range(0, 10)),
        sum(range(0, 30)),
    ])
Example #8
def test_concat():
    a = db.from_sequence([1, 2, 3])
    b = db.from_sequence([4, 5, 6])
    c = db.concat([a, b])
    assert list(c) == [1, 2, 3, 4, 5, 6]

    b = db.from_sequence([1, 2, 3]).map(lambda x: x * [1, 2, 3])
    assert list(b.concat()) == [1, 2, 3] * sum([1, 2, 3])
Example #9
def test_reduction_empty_aggregate(npartitions):
    # filter(None) keeps only truthy elements, so only the single 1 survives
    b = db.from_sequence([0, 0, 0, 1], npartitions=npartitions).filter(None)
    assert_eq(b.min(split_every=2), 1)
    vals = db.compute(b.min(split_every=2), b.max(split_every=2), scheduler='sync')
    assert vals == (1, 1)
    with pytest.raises(ValueError):
        b = db.from_sequence([0, 0, 0, 0], npartitions=npartitions)
        b.filter(None).min(split_every=2).compute(scheduler='sync')
Example #10
def test_reduction_empty_aggregate(npartitions):
    b = db.from_sequence([0, 0, 0, 1], npartitions=npartitions).filter(None)
    assert b.min(split_every=2).compute(get=dask.get) == 1
    vals = db.compute(b.min(split_every=2), b.max(split_every=2), get=dask.get)
    assert vals == (1, 1)
    with pytest.raises(ValueError):
        b = db.from_sequence([0, 0, 0, 0], npartitions=npartitions)
        b.filter(None).min(split_every=2).compute(get=dask.get)
Example #11
def test_product():
    b2 = b.product(b)
    assert b2.npartitions == b.npartitions**2
    assert set(b2) == set([(i, j) for i in L for j in L])

    x = db.from_sequence([1, 2, 3, 4])
    y = db.from_sequence([10, 20, 30])
    z = x.product(y)
    assert set(z) == set([(i, j) for i in [1, 2, 3, 4] for j in [10, 20, 30]])
Example #12
def test_to_delayed():
    b = db.from_sequence([1, 2, 3, 4, 5, 6], npartitions=3)
    a, b, c = b.map(inc).to_delayed()  # note: rebinds b to the middle partition
    assert all(isinstance(x, Delayed) for x in [a, b, c])
    assert b.compute() == [4, 5]

    b = db.from_sequence([1, 2, 3, 4, 5, 6], npartitions=3)
    t = b.sum().to_delayed()
    assert isinstance(t, Delayed)
    assert t.compute() == 21
Example #13
def test_map_with_constructors():
    assert db.from_sequence([[1, 2, 3]]).map(A).compute()
    assert db.from_sequence([1, 2, 3]).map(B).compute()
    assert db.from_sequence([[1, 2, 3]]).map(B).compute()

    with pytest.raises(TypeError):
        db.from_sequence([[1]]).map(A).compute()
Example #14
def test_map_with_builtins():
    b = db.from_sequence(range(3))
    assert ' '.join(b.map(str)) == '0 1 2'
    assert b.map(str).map(tuple).compute() == [('0',), ('1',), ('2',)]
    assert b.map(str).map(tuple).map(any).compute() == [True, True, True]

    b2 = b.map(lambda n: [(n, n+1), (2*(n-1), -n)])
    assert b2.map(dict).compute() == [{0: 1, -2: 0}, {1: 2, 0: -1}, {2: -2}]
    assert b.map(lambda n: (n, n+1)).map(pow).compute() == [0, 1, 8]
    assert b.map(bool).compute() == [False, True, True]
    assert db.from_sequence([(1, 'real'), ('1', 'real')]).map(hasattr).compute() == \
        [True, False]
Example #15
def test_concat():
    a = db.from_sequence([1, 2, 3])
    b = db.from_sequence([4, 5, 6])
    c = db.concat([a, b])
    assert list(c) == [1, 2, 3, 4, 5, 6]

    assert c.name == db.concat([a, b]).name
    assert b.concat().name != a.concat().name
    assert b.concat().name == b.concat().name

    b = db.from_sequence([1, 2, 3]).map(lambda x: x * [1, 2, 3])
    assert list(b.concat()) == [1, 2, 3] * sum([1, 2, 3])
Example #16
def test_to_imperative():
    from dask.imperative import Value

    b = db.from_sequence([1, 2, 3, 4, 5, 6], npartitions=3)
    a, b, c = b.map(inc).to_imperative()
    assert all(isinstance(x, Value) for x in [a, b, c])
    assert b.compute() == [4, 5]

    b = db.from_sequence([1, 2, 3, 4, 5, 6], npartitions=3)
    t = b.sum().to_imperative()
    assert isinstance(t, Value)
    assert t.compute() == 21
Example #17
def test_bag_map():
    b = db.from_sequence(range(100), npartitions=10)
    b2 = db.from_sequence(range(100, 200), npartitions=10)
    x = b.compute()
    x2 = b2.compute()

    def myadd(a=1, b=2, c=3):
        return a + b + c

    assert db.map(myadd, b).compute() == list(map(myadd, x))
    assert db.map(myadd, a=b).compute() == list(map(myadd, x))
    assert db.map(myadd, b, b2).compute() == list(map(myadd, x, x2))
    assert db.map(myadd, b, 10).compute() == [myadd(i, 10) for i in x]
    assert db.map(myadd, 10, b=b).compute() == [myadd(10, b=i) for i in x]

    sol = [myadd(i, b=j, c=100) for (i, j) in zip(x, x2)]
    assert db.map(myadd, b, b=b2, c=100).compute() == sol

    sol = [myadd(i, c=100) for (i, j) in zip(x, x2)]
    assert db.map(myadd, b, c=100).compute() == sol

    x_sum = sum(x)
    sol = [myadd(x_sum, b=i, c=100) for i in x2]
    assert db.map(myadd, b.sum(), b=b2, c=100).compute() == sol

    sol = [myadd(i, b=x_sum, c=100) for i in x2]
    assert db.map(myadd, b2, b.sum(), c=100).compute() == sol

    sol = [myadd(a=100, b=x_sum, c=i) for i in x2]
    assert db.map(myadd, a=100, b=b.sum(), c=b2).compute() == sol

    a = dask.delayed(10)
    assert db.map(myadd, b, a).compute() == [myadd(i, 10) for i in x]
    assert db.map(myadd, b, b=a).compute() == [myadd(i, b=10) for i in x]

    # Mismatched npartitions
    fewer_parts = db.from_sequence(range(100), npartitions=5)
    with pytest.raises(ValueError):
        db.map(myadd, b, fewer_parts)

    # No bags
    with pytest.raises(ValueError):
        db.map(myadd, b.sum(), 1, 2)

    # Unequal partitioning
    unequal = db.from_sequence(range(110), npartitions=10)
    with pytest.raises(ValueError):
        db.map(myadd, b, unequal, c=b2).compute()
    with pytest.raises(ValueError):
        db.map(myadd, b, b=unequal, c=b2).compute()
Example #18
def test_distinct():
    assert sorted(b.distinct()) == [0, 1, 2, 3, 4]
    assert b.distinct().name == b.distinct().name
    assert 'distinct' in b.distinct().name
    assert b.distinct().count().compute() == 5
    bag = db.from_sequence([0] * 50, npartitions=50)
    assert bag.filter(None).distinct().compute() == []
Example #19
def test_frequencies():
    c = b.frequencies()
    assert dict(c) == {0: 3, 1: 3, 2: 3, 3: 3, 4: 3}
    c2 = b.frequencies(split_every=2)
    assert dict(c2) == {0: 3, 1: 3, 2: 3, 3: 3, 4: 3}
    assert c.name == b.frequencies().name
    assert c.name != c2.name
    assert c2.name == b.frequencies(split_every=2).name
    # test bag with empty partitions
    b2 = db.from_sequence(range(20), partition_size=2)
    b2 = b2.filter(lambda x: x < 10)
    d = b2.frequencies()
    assert dict(d) == dict(zip(range(10), [1] * 10))
    bag = db.from_sequence([0, 0, 0, 0], npartitions=4)
    bag2 = bag.filter(None).frequencies(split_every=2)
    assert dict(bag2.compute(get=dask.get)) == {}
Example #20
File: dsk.py Project: chmp/flowly
def apply_to_local(
        transform, obj,
        npartitions=None,
        get=None,
        rules=None,
        rewrites=(),
):
    """Distribute obj, then apply the transform, finally compute the result.

    :param Callable[Any,Any] transform:
        the transformation to apply.

    :param Sequence[Any] obj:
        the list of objects to transform.

    :param Optional[int] npartitions:
        the number of partitions to split the original sequence into.

    :param Callable[Any,Any,Any] get:
        the get function to use when computing the resulting dask object.
        To execute in parallel, use the ``get`` method of
        ``distributed.Client``.
        See :class:`flowly.dst.LocalCluster` for a simple way to start a local
        cluster and to construct the client.

    :param Optional[Iterable] rules:
        See :func:`flowly.dsk.apply`.

    :param Iterable[Callable[Callable[Any,Any],Callable[Any,Any]]] rewrites:
        See :func:`flowly.tz.apply`.
    """
    obj = db.from_sequence(obj, npartitions=npartitions)
    obj = apply(transform, obj, rewrites=rewrites, rules=rules)
    return obj.compute(get=get)
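
# Hypothetical usage sketch (not from the source): assuming
# ``flowly.dsk.apply`` falls back to calling a plain callable on the
# distributed bag, this sums the squares of 0..9 over four partitions
# with the synchronous scheduler for a deterministic local run.
import dask

result = apply_to_local(
    lambda bag: bag.map(lambda x: x * x).sum(),
    range(10),
    npartitions=4,
    get=dask.get,
)
assert result == 285  # 0**2 + 1**2 + ... + 9**2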
Example #21
def test__futures_to_dask_bag(s, a, b):
    import dask.bag as db
    e = Executor((s.ip, s.port), start=False)
    yield e._start()

    L = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
    futures = yield e._scatter(L)

    rb = yield _futures_to_dask_bag(futures)
    assert isinstance(rb, db.Bag)
    assert rb.npartitions == len(L)

    lb = db.from_sequence([1, 2, 3, 4, 5, 6, 7, 8, 9], npartitions=3)

    exprs = [lambda x: x.map(lambda x: x + 1).sum(),
             lambda x: x.filter(lambda x: x % 2)]

    for expr in exprs:
        local = expr(lb).compute(get=dask.get)
        remote = e.compute(expr(rb))
        remote = yield remote._result()

        assert local == remote

    yield e._shutdown()
Example #22
    def test_grid_dimensions(self):
        from neuronunit.optimization.model_parameters import model_params
        provided_keys = list(model_params.keys())
        USE_CACHED_GS = False
        from neuronunit.optimization import exhaustive_search
        from neuronunit.optimization.optimization_management import map_wrapper
        import dask.bag as db
        npoints = 2
        nparams = 3
        for i in range(1, 10):
            for j in range(1, 10):
                grid_points = exhaustive_search.create_grid(npoints=i, nparams=j)
                b0 = db.from_sequence(grid_points[0:2], npartitions=8)
                dtcpop = list(db.map(exhaustive_search.update_dtc_grid, b0).compute())
                self.assertEqual(i * j, len(dtcpop))
                self.assertNotEqual(dtcpop, None)
                dtcpop_compare = map_wrapper(exhaustive_search.update_dtc_grid, grid_points[0:2])
                self.assertNotEqual(dtcpop_compare, None)
                self.assertEqual(len(dtcpop_compare), len(dtcpop))
                # renamed to avoid shadowing the outer loop variables i and j
                for idx, dtc in enumerate(dtcpop):
                    for k, v in dtcpop_compare[idx].attrs.items():
                        print(k, v, idx, dtc)
                        self.assertEqual(dtc.attrs[k], v)

        return True
Example #23
def test_unzip():
    b = db.from_sequence(range(100)).map(lambda x: (x, x + 1, x + 2))
    one, two, three = b.unzip(3)
    assert list(one) == list(range(100))
    assert list(three) == [i + 2 for i in range(100)]
    assert one.name == b.unzip(3)[0].name
    assert one.name != two.name
Example #24
def test_to_textfiles_endlines():
    b = db.from_sequence(['a', 'b', 'c'], npartitions=1)
    with tmpfile() as fn:
        b.to_textfiles([fn])
        with open(fn, 'r') as f:
            result = f.readlines()
        assert result == ['a\n', 'b\n', 'c']
Example #25
def test_non_splittable_reductions(npartitions):
    np = pytest.importorskip('numpy')
    data = list(range(100))
    c = db.from_sequence(data, npartitions=npartitions)

    assert_eq(c.mean(), np.mean(data))
    assert_eq(c.std(), np.std(data))
Example #26
def test_01a_compute_score(dtcpop):
    from neuronunit.optimization import get_neab
    from neuronunit.optimization.optimization_management import dtc_to_rheo
    from neuronunit.optimization.optimization_management import nunit_evaluation
    from neuronunit.optimization.optimization_management import format_test
    #dtcpop = grid_points()
    dtclist = list(map(dtc_to_rheo, dtcpop))
    for d in dtclist:
        assert len(list(d.attrs.values())) > 0
    import dask.bag as db
    b0 = db.from_sequence(dtclist, npartitions=8)
    dtclist = list(db.map(format_test, b0).compute())

    b0 = db.from_sequence(dtclist, npartitions=8)
    dtclist = list(db.map(nunit_evaluation, b0).compute())
    return dtclist
Example #27
def main(kind):
    input_array = np.random.random(5000)

    getter = {'processes': dask.multiprocessing.get,
              'threads': dask.threaded.get}[kind]

    # sets the scheduler
    with dask.set_options(get=getter):

        # set ``partition_size`` to ensure each partition has enough work
        bag = db.from_sequence(input_array, partition_size=1000)

        # compute elementwise cosine on the GPU within each partition
        bag_cos = bag.map_partitions(
            lambda x: gpu_cos(np.asarray(x, dtype=np.float32)))

        # apply partial sum-reduce on each partition
        # then, finish it on the host
        got = bag_cos.reduction(sum_parts, sum).compute()

        # cross validate with numpy
        expected = np.sum(np.cos(input_array))

        print('Got:     ', got)
        print('Expected:', expected)
        correct = np.allclose(got, expected)
        print('Correct: ', correct)
        sys.exit(0 if correct else 1)
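
# ``gpu_cos`` and ``sum_parts`` are defined elsewhere in the original
# script; a CPU-only stand-in consistent with how they are used above
# (an assumption, not the source implementation):
def gpu_cos(x):
    # elementwise cosine computed on the host instead of the GPU
    return np.cos(x)


def sum_parts(part):
    # partial reduction: sum the cosine values of one partition
    return np.sum(part)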
Example #28
def test_roundtrip(tmpdir, codec):
    if codec == 'snappy':
        pytest.importorskip('snappy')
    fn = os.path.join(tmpdir, 'out*.avro')
    b = db.from_sequence(expected, npartitions=3)
    b.to_avro(fn, schema=schema, codec=codec)
    b2 = db.read_avro(fn)
    assert b.compute() == b2.compute()
Example #29
def test_random_sample_random_state():
    """
    Sampling with fixed random seed generates identical results.
    """
    a = db.from_sequence(range(50), npartitions=5)
    b = a.random_sample(0.5, 1234)
    c = a.random_sample(0.5, 1234)
    assert list(b) == list(c)
Example #30
def test_to_textfiles_name_function_preserves_order():
    seq = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p']
    b = db.from_sequence(seq, npartitions=16)
    with tmpdir() as dn:
        b.to_textfiles(dn)

        out = db.read_text(os.path.join(dn, "*"), encoding='ascii').map(str).map(str.strip).compute()
        assert seq == out
Example #31
def test_bag_paths():
    b = db.from_sequence(['abc', '123', 'xyz'], npartitions=2)
    assert b.to_textfiles('foo*') == ['foo0', 'foo1']
    os.remove('foo0')
    os.remove('foo1')
Example #32
def test_concat():
    a = db.from_sequence([1, 2, 3])
    b = db.from_sequence([4, 5, 6])
    c = db.concat([a, b])
    assert list(c) == [1, 2, 3, 4, 5, 6]
    assert c.name == db.concat([a, b]).name
Example #33
def test_to_dataframe():
    dd = pytest.importorskip('dask.dataframe')
    pd = pytest.importorskip('pandas')

    def check_parts(df, sol):
        assert all((p.dtypes == sol.dtypes).all() for p in
                   dask.compute(*df.to_delayed()))

    dsk = {('test', 0): [(1, 2)],
           ('test', 1): [],
           ('test', 2): [(10, 20), (100, 200)]}
    b = Bag(dsk, 'test', 3)
    sol = pd.DataFrame(b.compute(), columns=['a', 'b'])

    # Elements are tuples
    df = b.to_dataframe()
    dd.utils.assert_eq(df, sol.rename(columns={'a': 0, 'b': 1}),
                       check_index=False)
    df = b.to_dataframe(columns=['a', 'b'])
    dd.utils.assert_eq(df, sol, check_index=False)
    check_parts(df, sol)
    df = b.to_dataframe(meta=[('a', 'i8'), ('b', 'i8')])
    dd.utils.assert_eq(df, sol, check_index=False)
    check_parts(df, sol)

    # Elements are dictionaries
    b = b.map(lambda x: dict(zip(['a', 'b'], x)))
    df = b.to_dataframe()
    dd.utils.assert_eq(df, sol, check_index=False)
    check_parts(df, sol)
    assert df._name == b.to_dataframe()._name

    # With metadata specified
    for meta in [sol, [('a', 'i8'), ('b', 'i8')]]:
        df = b.to_dataframe(meta=meta)
        dd.utils.assert_eq(df, sol, check_index=False)
        check_parts(df, sol)

    # Error to specify both columns and meta
    with pytest.raises(ValueError):
        b.to_dataframe(columns=['a', 'b'], meta=sol)

    # Inference fails if empty first partition
    b2 = b.filter(lambda x: x['a'] > 200)
    with pytest.raises(ValueError):
        b2.to_dataframe()

    # Single column
    b = b.pluck('a')
    sol = sol[['a']]
    df = b.to_dataframe(meta=sol)
    dd.utils.assert_eq(df, sol, check_index=False)
    check_parts(df, sol)

    # Works with iterators and tuples
    sol = pd.DataFrame({'a': range(100)})
    b = db.from_sequence(range(100), npartitions=5)
    for f in [iter, tuple]:
        df = b.map_partitions(f).to_dataframe(meta=sol)
        dd.utils.assert_eq(df, sol, check_index=False)
        check_parts(df, sol)
Example #34
def bag_to_iterator(x, **kwargs):
    return db.from_sequence(x, **filter_kwargs(db.from_sequence, kwargs))


def test_reduction_empty():
    b = db.from_sequence(range(10), npartitions=100)
    assert b.filter(lambda x: x % 2 == 0).max().compute(get=dask.get) == 8
    assert b.filter(lambda x: x % 2 == 0).min().compute(get=dask.get) == 0


def test_empty():
    assert list(db.from_sequence([])) == []


def test_concat_after_map():
    a = db.from_sequence([1, 2])
    b = db.from_sequence([4, 5])
    result = db.concat([a.map(inc), b])
    assert list(result) == [2, 3, 4, 5]


def test_string_namespace_with_unicode():
    b = db.from_sequence([u'Alice Smith', u'Bob Jones', 'Charlie Smith'],
                         npartitions=2)
    assert list(b.str.lower()) == ['alice smith', 'bob jones', 'charlie smith']


def test_str_empty_split():
    b = db.from_sequence([u'Alice Smith', u'Bob Jones', 'Charlie Smith'],
                         npartitions=2)
    assert list(b.str.split()) == [['Alice', 'Smith'], ['Bob', 'Jones'],
                                   ['Charlie', 'Smith']]


def test_bag_class_extend():
    dictbag = BagOfDicts(*db.from_sequence([{'a': {'b': 'c'}}])._args)
    assert dictbag.get('a').get('b').compute()[0] == 'c'
    assert dictbag.get('a').set('d', 'EXTENSIBILITY!!!').compute()[0] == \
        {'b': 'c', 'd': 'EXTENSIBILITY!!!'}
    assert isinstance(dictbag.get('a').get('b'), BagOfDicts)


def test_bag_compute_forward_kwargs():
    # the bag should forward unknown keyword arguments without raising
    x = db.from_sequence([1, 2, 3]).map(lambda a: a + 1)
    x.compute(bogus_keyword=10)


def test_zip(npartitions, hi=1000):
    evens = db.from_sequence(range(0, hi, 2), npartitions=npartitions)
    odds = db.from_sequence(range(1, hi, 2), npartitions=npartitions)
    pairs = db.zip(evens, odds)
    assert pairs.npartitions == npartitions
    assert list(pairs) == list(zip(range(0, hi, 2), range(1, hi, 2)))
Example #43
def test_empty_bag():
    b = db.from_sequence([])
    assert b.map(inc).all().compute(get=dask.get)
    assert not b.map(inc).any().compute(get=dask.get)
    assert not b.map(inc).sum().compute(get=dask.get)
    assert not b.map(inc).count().compute(get=dask.get)
Example #44
def test_string_namespace_with_unicode():
    b = db.from_sequence([u"Alice Smith", u"Bob Jones", "Charlie Smith"],
                         npartitions=2)
    assert list(b.str.lower()) == ["alice smith", "bob jones", "charlie smith"]


def test_msgpack_unicode():
    b = db.from_sequence([{"a": 1}]).groupby("a")
    # originally dask.async.get_sync; dask.async was renamed dask.local
    # once ``async`` became a reserved word in Python
    result = b.compute(get=dask.local.get_sync)
    assert dict(result) == {1: [{'a': 1}]}


def test_from_long_sequence():
    L = list(range(1001))
    b = db.from_sequence(L)
    assert set(b) == set(L)


def test_from_sequence():
    b = db.from_sequence([1, 2, 3, 4, 5], npartitions=3)
    assert len(b.dask) == 3
    assert set(b) == set([1, 2, 3, 4, 5])


def test_map_function_with_multiple_arguments():
    b = db.from_sequence([(1, 10), (2, 20), (3, 30)], npartitions=3)
    assert list(
        b.map(lambda x, y: x + y).compute(get=dask.get)) == [11, 22, 33]
    assert list(b.map(list).compute()) == [[1, 10], [2, 20], [3, 30]]


def test_reduction_with_non_comparable_objects():
    b = db.from_sequence([StrictReal(x) for x in range(10)], partition_size=2)
    assert b.fold(max, max).compute(get=dask.get) == StrictReal(9)


def test_non_splittable_reductions():
    np = pytest.importorskip('numpy')
    data = list(range(100))
    c = db.from_sequence(data, npartitions=10)
    assert c.mean().compute() == np.mean(data)
    assert c.std().compute(get=dask.get) == np.std(data)
Example #51
def n_random_resamples(
    *args,
    samples,
    n_repeats,
    function=None,
    function_kwargs=None,
    bundle_args=True,
    replace=True,
    with_dask=True
):
    """
    Repeatedly randomly resample xarray args and return results passed through function.

    Parameters
    ----------
    *args : xarray DataArray or Dataset
        Objects containing data to be resampled.
        The coordinates of the first object are used for resampling.
        The same resampling is applied to all objects.
    samples : dict
        Dictionary containing dimensions to subsample, number of samples and continuous block size within the sample.
        Of the form {'dim1': (n_samples, block_size), 'dim2': (n_samples, block_size)}.
        The first object in args must contain all dimensions listed in samples, but subsequent objects need not.
    n_repeats : int
        Number of times to repeat the resampling process.
    function : function object, optional
        Function to reduce the subsampled data.
    function_kwargs : dict, optional
        Keyword arguments to provide to function.
    bundle_args : bool, default True
        If True, pass all resampled objects to function together.
        Otherwise pass each object through function separately.
    replace : bool, default True
        Whether the sample is with or without replacement.
    with_dask : bool, default True
        If True, use dask to parallelize across n_repeats using dask.delayed

    Returns
    -------
    sample : xarray DataArray or Dataset
        Array containing the results of passing the subsampled data through function
    """

    if with_dask and (n_repeats > 500):
        n_args = itertools.repeat(args[0], times=n_repeats)
        b = db.from_sequence(n_args, npartitions=100)
        rs_list = b.map(
            random_resample,
            *(args[1:]),
            samples=samples,
            function=function,
            function_kwargs=function_kwargs,
            replace=replace,
        ).compute()
    else:
        resample_ = dask.delayed(random_resample) if with_dask else random_resample
        rs_list = [
            resample_(
                *args,
                samples=samples,
                function=function,
                function_kwargs=function_kwargs,
                bundle_args=bundle_args,
                replace=replace
            )
            for _ in range(n_repeats)
        ]
        if with_dask:
            rs_list = dask.compute(rs_list)[0]

    if all(isinstance(r, tuple) for r in rs_list):
        return tuple(
            [xr.concat([r.unify_chunks() for r in rs], dim="k") for rs in zip(*rs_list)]
        )
    else:
        return xr.concat([r.unify_chunks() for r in rs_list], dim="k")
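
# Hypothetical usage sketch (the array shape, sample sizes and reducer
# are illustrative assumptions, not from the source): bootstrap 20
# means of a 100-step series, drawing 50 time steps per sample with a
# block size of 1.
import numpy as np
import xarray as xr

da = xr.DataArray(np.random.rand(100), dims=['time'])
boot = n_random_resamples(
    da,
    samples={'time': (50, 1)},
    n_repeats=20,
    function=lambda x: x.mean('time'),
    with_dask=False,
)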


def test_topk_with_multiarg_lambda():
    b = db.from_sequence([(1, 10), (2, 9), (3, 8)], npartitions=2)
    assert list(b.topk(2, key=lambda a, b: b)) == [(1, 10), (2, 9)]
Example #53
def test_to_textfiles_name_function_warn():
    seq = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p']
    a = db.from_sequence(seq, npartitions=16)
    with tmpdir() as dn:
        with pytest.warns(None):
            a.to_textfiles(dn, name_function=str)


def test_topk_with_non_callable_key():
    b = db.from_sequence([(1, 10), (2, 9), (3, 8)], npartitions=2)
    assert list(b.topk(2, key=1)) == [(1, 10), (2, 9)]
    assert list(b.topk(2, key=0)) == [(3, 8), (2, 9)]
    assert b.topk(2, key=1).name == b.topk(2, key=1).name
Example #55
def test_flatten():
    b = db.from_sequence([[1], [2, 3]])
    assert list(b.flatten()) == [1, 2, 3]
    assert b.flatten().name == b.flatten().name


def test_pluck_with_default():
    b = db.from_sequence(['Hello', '', 'World'])
    pytest.raises(IndexError, lambda: list(b.pluck(0)))
    assert list(b.pluck(0, None)) == ['H', None, 'W']
    assert b.pluck(0, None).name == b.pluck(0, None).name
    assert b.pluck(0).name != b.pluck(0, None).name
Example #57
def test_groupby_with_indexer():
    b = db.from_sequence([[1, 2, 3], [1, 4, 9], [2, 3, 4]])
    result = dict(b.groupby(0))
    assert valmap(sorted, result) == {1: [[1, 2, 3], [1, 4, 9]],
                                      2: [[2, 3, 4]]}


def test_bag_with_single_callable():
    f = lambda: None
    b = db.from_sequence([f])
    assert list(b.compute(get=dask.get)) == [f]
Example #59
def test_ensure_compute_output_is_concrete():
    b = db.from_sequence([1, 2, 3])
    result = b.map(lambda x: x + 1).compute()
    assert not isinstance(result, Iterator)
Example #60
def test_std():
    assert_eq(b.std(), math.sqrt(2.0))
    assert float(b.std()) == math.sqrt(2.0)


def test_var():
    assert_eq(b.var(), 2.0)
    assert float(b.var()) == 2.0


@pytest.mark.parametrize(
    "transform",
    [identity, dask.delayed, lambda x: db.from_sequence(x, npartitions=1)])
def test_join(transform):
    other = transform([1, 2, 3])
    c = b.join(other, on_self=isodd, on_other=iseven)
    assert_eq(c, list(join(iseven, [1, 2, 3], isodd, list(b))))
    assert_eq(b.join(other, isodd), list(join(isodd, [1, 2, 3], isodd,
                                              list(b))))
    assert c.name == b.join(other, on_self=isodd, on_other=iseven).name


def test_foldby():
    c = b.foldby(iseven, add, 0, add, 0)
    assert (reduceby, iseven, add, (b.name, 0), 0) in list(c.dask.values())
    assert set(c) == set(
        reduceby(iseven, lambda acc, x: acc + x, L, 0).items())
    assert c.name == b.foldby(iseven, add, 0, add, 0).name