Example #1
def test_map_method():
    b = db.from_sequence(range(100), npartitions=10)
    b2 = db.from_sequence(range(100, 200), npartitions=10)
    x = b.compute()
    x2 = b2.compute()

    def myadd(a, b=2, c=3):
        return a + b + c

    assert b.map(myadd).compute() == list(map(myadd, x))
    assert b.map(myadd, b2).compute() == list(map(myadd, x, x2))
    assert b.map(myadd, 10).compute() == [myadd(i, 10) for i in x]
    assert b.map(myadd, b=10).compute() == [myadd(i, b=10) for i in x]
    assert (b.map(myadd, b2, c=10).compute() ==
            [myadd(i, j, 10) for (i, j) in zip(x, x2)])
    x_sum = sum(x)
    assert (b.map(myadd, b.sum(), c=10).compute() ==
            [myadd(i, x_sum, 10) for i in x])

    # check that map works with multiarg functions. Can be removed after
    # deprecated behavior is removed
    assert b.map(add, b2).compute() == list(map(add, x, x2))

    # check that map works with vararg functions. Can be removed after
    # deprecated behavior is removed
    def vararg_inc(*args):
        return inc(*args)

    assert_eq(b.map(vararg_inc), list(map(inc, x)))
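
These snippets appear to come from dask's test suite for dask.bag, so they rely on module-level helpers that are not shown here: db is dask.bag, b and b2 are small bags, and assert_eq, inc and add are test utilities. As a self-contained illustration of the Bag.map pattern being exercised (the names and data below are illustrative, not part of the original tests):

import dask.bag as db

def combine(a, b=2, c=3):
    return a + b + c

left = db.from_sequence(range(5), npartitions=2)
right = db.from_sequence(range(10, 15), npartitions=2)

# Plain values and keyword arguments are broadcast to every element.
assert left.map(combine, 10).compute() == [combine(i, 10) for i in range(5)]
assert left.map(combine, b=10).compute() == [combine(i, b=10) for i in range(5)]

# A second bag with the same npartitions is zipped element-wise.
assert left.map(combine, right).compute() == [
    combine(i, j) for i, j in zip(range(5), range(10, 15))
]

# An aggregate such as left.sum() is computed once and passed to every call.
total = sum(range(5))
assert left.map(combine, left.sum()).compute() == [combine(i, total) for i in range(5)]
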
Example #2
def test_join(transform):
    other = transform([1, 2, 3])
    c = b.join(other, on_self=isodd, on_other=iseven)
    assert_eq(c, list(join(iseven, [1, 2, 3], isodd, list(b))))
    assert_eq(b.join(other, isodd),
              list(join(isodd, [1, 2, 3], isodd, list(b))))
    assert c.name == b.join(other, on_self=isodd, on_other=iseven).name
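
isodd, iseven and join are unshown fixtures of the test module (join is presumably toolz.join). A minimal, self-contained sketch of Bag.join with key functions, using made-up data; as the expected values above imply, the joined pairs come out as (other_element, bag_element) tuples:

import dask.bag as db

b = db.from_sequence(range(6), npartitions=2)

# Pair up elements whose key functions agree: on_other is applied to the
# sequence passed in, on_self to the bag's own elements.
pairs = b.join([0, 1], on_self=lambda x: x % 2, on_other=lambda x: x % 2).compute()

assert set(pairs) == {(0, 0), (1, 1), (0, 2), (1, 3), (0, 4), (1, 5)}
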
Example #3
def test_fold_bag():
    def binop(tot, x):
        tot.add(x)
        return tot
    c = b.fold(binop, combine=set.union, initial=set(), out_type=Bag)
    assert isinstance(c, Bag)
    assert_eq(c, list(set(range(5))))
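
A self-contained sketch of the same fold pattern, assuming a bag built with from_sequence rather than the module-level test bag: binop folds each partition into a set, combine merges the per-partition sets, and out_type=Bag keeps the result a Bag rather than a single Item.

import dask.bag as db
from dask.bag import Bag

def binop(acc, x):
    acc.add(x)
    return acc

b = db.from_sequence(list(range(5)) * 3, npartitions=3)

# Fold each partition into a set, then merge the partial sets with set.union.
unique_values = b.fold(binop, combine=set.union, initial=set(), out_type=Bag)

assert isinstance(unique_values, Bag)
assert sorted(unique_values.compute()) == [0, 1, 2, 3, 4]
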
Example #4
def test_repartition_npartitions(nin, nout):
    b = db.from_sequence(range(100), npartitions=nin)
    c = b.repartition(npartitions=nout)
    assert c.npartitions == nout
    assert_eq(b, c)
    results = dask.get(c.dask, c.__dask_keys__())
    assert all(results)
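
A minimal, self-contained sketch of repartitioning by partition count; the sizes are illustrative:

import dask.bag as db

b = db.from_sequence(range(100), npartitions=10)

# Repartitioning changes how the data is split, not the data itself.
c = b.repartition(npartitions=4)
assert c.npartitions == 4
assert c.compute() == list(range(100))
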
Example #5
def test_bagged_array_delayed():
    da = pytest.importorskip("dask.array")

    obj = da.ones(10, chunks=5).to_delayed()[0]
    bag = db.from_delayed(obj)
    b = bag.compute()
    assert_eq(b, [1.0, 1.0, 1.0, 1.0, 1.0])
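
da.ones(10, chunks=5).to_delayed() yields one Delayed object per chunk, and db.from_delayed turns such objects back into a bag. A self-contained version, assuming dask.array and numpy are installed:

import dask.array as da
import dask.bag as db

# to_delayed() gives one Delayed object per 5-element chunk of the array.
first_chunk = da.ones(10, chunks=5).to_delayed()[0]

bag = db.from_delayed(first_chunk)
assert list(bag.compute()) == [1.0, 1.0, 1.0, 1.0, 1.0]
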
Example #6
def test_non_splittable_reductions(npartitions):
    np = pytest.importorskip('numpy')
    data = list(range(100))
    c = db.from_sequence(data, npartitions=npartitions)

    assert_eq(c.mean(), np.mean(data))
    assert_eq(c.std(), np.std(data))
Example #7
def test_reduction_empty_aggregate(npartitions):
    b = db.from_sequence([0, 0, 0, 1], npartitions=npartitions).filter(None)
    assert_eq(b.min(split_every=2), 1)
    vals = db.compute(b.min(split_every=2), b.max(split_every=2), scheduler='sync')
    assert vals == (1, 1)
    with pytest.raises(ValueError):
        b = db.from_sequence([0, 0, 0, 0], npartitions=npartitions)
        b.filter(None).min(split_every=2).compute(scheduler='sync')
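
filter(None) keeps only truthy elements, so a bag of zeros becomes entirely empty and min() has nothing to reduce, which is why a ValueError is expected; split_every=2 only limits how many partial results are aggregated at a time. A self-contained sketch of the same behavior:

import dask.bag as db

b = db.from_sequence([0, 0, 0, 1], npartitions=4).filter(None)
# Only the single truthy element (1) survives the filter.
assert b.min(split_every=2).compute(scheduler="sync") == 1

empty = db.from_sequence([0, 0, 0, 0], npartitions=4).filter(None)
try:
    empty.min(split_every=2).compute(scheduler="sync")
except ValueError:
    pass  # expected: there is nothing to take the minimum of
else:
    raise AssertionError("expected ValueError for an empty reduction")
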
Example #8
def test_repartition(nin, nout):
    b = db.from_sequence(range(100), npartitions=nin)
    c = b.repartition(npartitions=nout)

    assert c.npartitions == nout
    assert_eq(b, c)
    results = dask.get(c.dask, c.__dask_keys__())
    assert all(results)
Example #9
def test_map_partitions_arg():
    def append_str(partition, s):
        return [x + s for x in partition]

    mybag = db.from_sequence(["a", "b", "c"])

    assert_eq(mybag.map_partitions(append_str, "foo"),
              ['afoo', 'bfoo', 'cfoo'])
    assert_eq(mybag.map_partitions(append_str, dask.delayed("foo")),
              ['afoo', 'bfoo', 'cfoo'])
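
map_partitions hands each whole partition (a list) to the function; extra arguments are passed through, and a dask.delayed argument is computed first. A self-contained sketch with illustrative data:

import dask
import dask.bag as db

def append_suffix(partition, suffix):
    # partition is the full list of elements in one partition
    return [x + suffix for x in partition]

bag = db.from_sequence(["a", "b", "c"], npartitions=1)

assert bag.map_partitions(append_suffix, "!").compute() == ["a!", "b!", "c!"]
# A dask.delayed argument is resolved before being handed to the function.
assert bag.map_partitions(append_suffix, dask.delayed("!")).compute() == ["a!", "b!", "c!"]
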
Example #10
def test_repartition_partition_size_complex_dtypes():
    np = pytest.importorskip("numpy")

    b = db.from_sequence([np.array(range(100)) for _ in range(4)], npartitions=1)
    total_mem = sum(b.map_partitions(total_mem_usage).compute())

    new_partition_size = total_mem // 4
    c = b.repartition(partition_size=new_partition_size)
    assert c.npartitions >= 4
    assert_eq(b, c)
Example #11
def test_multiple_repartition_partition_size():
    b = db.from_sequence(range(1, 100), npartitions=1)
    total_mem = sum(b.map_partitions(total_mem_usage).compute())

    c = b.repartition(partition_size=(total_mem // 2))
    assert c.npartitions >= 2
    assert_eq(b, c)

    d = c.repartition(partition_size=(total_mem // 5))
    assert d.npartitions >= 5
    assert_eq(c, d)
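
total_mem_usage is an unshown helper from the test module that measures a partition's in-memory size; repartition(partition_size=...) splits the bag so each partition stays under roughly that many bytes. A rough, self-contained sketch with an arbitrary 1 kB budget (the resulting partition count depends on actual object sizes, so nothing precise is asserted about it):

import dask.bag as db

b = db.from_sequence(range(1, 100), npartitions=1)

# Ask for partitions of roughly 1 kB each; the contents and their order are
# unchanged, only the partitioning differs.
c = b.repartition(partition_size=1024)
assert c.compute() == list(range(1, 100))
assert c.npartitions >= 1
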
Example #12
def test_frequencies():
    c = b.frequencies()
    assert dict(c) == {0: 3, 1: 3, 2: 3, 3: 3, 4: 3}
    c2 = b.frequencies(split_every=2)
    assert dict(c2) == {0: 3, 1: 3, 2: 3, 3: 3, 4: 3}
    assert c.name == b.frequencies().name
    assert c.name != c2.name
    assert c2.name == b.frequencies(split_every=2).name
    # test bag with empty partitions
    b2 = db.from_sequence(range(20), partition_size=2)
    b2 = b2.filter(lambda x: x < 10)
    d = b2.frequencies()
    assert dict(d) == dict(zip(range(10), [1] * 10))
    bag = db.from_sequence([0, 0, 0, 0], npartitions=4)
    bag2 = bag.filter(None).frequencies(split_every=2)
    assert_eq(bag2, [])
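
frequencies() returns (value, count) pairs, and split_every only changes the shape of the aggregation tree (hence the collection name), not the result. A self-contained sketch using the same data the test bag apparently holds:

import dask.bag as db

b = db.from_sequence(list(range(5)) * 3, npartitions=3)

counts = dict(b.frequencies().compute())
assert counts == {0: 3, 1: 3, 2: 3, 3: 3, 4: 3}

# split_every changes how partial counts are combined, not the answer.
assert dict(b.frequencies(split_every=2).compute()) == counts
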
Example #13
def test_reductions_are_lazy():
    current = [None]

    def part():
        for i in range(10):
            current[0] = i
            yield i

    def func(part):
        assert current[0] == 0
        return sum(part)

    b = Bag({("foo", 0): part()}, "foo", 1)

    res = b.reduction(func, sum)

    assert_eq(res, sum(range(10)))
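
Bag.reduction(perpartition, aggregate) applies one function to every partition and a second to the collected partial results; the assertion current[0] == 0 above verifies that only the first element of the generator has been pulled by the time the per-partition function runs, i.e. the partition is not drained eagerly. A minimal, self-contained sketch of the same API:

import dask.bag as db

b = db.from_sequence(range(10), npartitions=2)

# Sum each partition, then sum the two partial results.
total = b.reduction(sum, sum)
assert total.compute() == sum(range(10))
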
Example #14
def test_var():
    assert_eq(b.var(), 2.0)
    assert float(b.var()) == 2.0
Example #15
def test_empty_bag():
    b = db.from_sequence([])
    assert_eq(b.map(inc).all(), True)
    assert_eq(b.map(inc).any(), False)
    # sum() and count() of an empty bag are 0, which compares equal to False
    assert_eq(b.map(inc).sum(), False)
    assert_eq(b.map(inc).count(), False)
Example #16
def test_bag_with_single_callable():
    f = lambda: None
    b = db.from_sequence([f])
    assert_eq(b, [f])
Example #17
def test_reduction_with_non_comparable_objects():
    b = db.from_sequence([StrictReal(x) for x in range(10)], partition_size=2)
    assert_eq(b.fold(max, max), StrictReal(9))
Example #18
def test_std():
    assert_eq(b.std(), math.sqrt(2.0))
    assert float(b.std()) == math.sqrt(2.0)
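
The expected values in the var and std examples follow from a test bag that apparently holds the values 0 through 4, three times each: the mean is 2, the squared deviations are 4, 1, 0, 1 and 4, and their average is 10/5 = 2, so the population variance is 2.0 and the standard deviation is sqrt(2). A self-contained check on equivalent data:

import math

import dask.bag as db

b = db.from_sequence(list(range(5)) * 3, npartitions=3)

assert b.var().compute() == 2.0             # population variance (ddof=0)
assert b.std().compute() == math.sqrt(2.0)  # square root of the variance
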
Example #19
def test_repartition_partition_size(nin, nout):
    b = db.from_sequence(range(1, 100), npartitions=nin)
    total_mem = sum(b.map_partitions(total_mem_usage).compute())
    c = b.repartition(partition_size=(total_mem // nout))
    assert c.npartitions >= nout
    assert_eq(b, c)
Example #20
def test_reduction_empty():
    b = db.from_sequence(range(10), npartitions=100)
    assert_eq(b.filter(lambda x: x % 2 == 0).max(), 8)
    assert_eq(b.filter(lambda x: x % 2 == 0).min(), 0)
Example #21
def test_map_partitions_args_kwargs():
    x = [random.randint(-100, 100) for i in range(100)]
    y = [random.randint(-100, 100) for i in range(100)]

    dx = db.from_sequence(x, npartitions=10)
    dy = db.from_sequence(y, npartitions=10)

    def maximum(x, y=0):
        y = repeat(y) if isinstance(y, int) else y
        return [max(a, b) for (a, b) in zip(x, y)]

    sol = maximum(x, y=10)
    assert_eq(db.map_partitions(maximum, dx, y=10), sol)
    assert_eq(dx.map_partitions(maximum, y=10), sol)
    assert_eq(dx.map_partitions(maximum, 10), sol)

    sol = maximum(x, y)
    assert_eq(db.map_partitions(maximum, dx, dy), sol)
    assert_eq(dx.map_partitions(maximum, y=dy), sol)
    assert_eq(dx.map_partitions(maximum, dy), sol)

    dy_mean = dy.mean().apply(int)
    sol = maximum(x, int(sum(y) / len(y)))
    assert_eq(dx.map_partitions(maximum, y=dy_mean), sol)
    assert_eq(dx.map_partitions(maximum, dy_mean), sol)

    dy_mean = dask.delayed(dy_mean)
    assert_eq(dx.map_partitions(maximum, y=dy_mean), sol)
    assert_eq(dx.map_partitions(maximum, dy_mean), sol)
Example #22
def test_distinct_with_key():
    seq = [{"a": i} for i in [0, 1, 2, 1, 2, 3, 2, 3, 4, 5]]
    bag = db.from_sequence(seq, npartitions=3)
    expected = list(unique(seq, key=lambda x: x["a"]))
    assert_eq(bag.distinct(key="a"), expected)
    assert_eq(bag.distinct(key=lambda x: x["a"]), expected)
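
unique is presumably toolz.unique. distinct(key=...) accepts either a field name (for dict elements) or a callable and keeps one element per distinct key. A self-contained sketch (sorted before comparing, to avoid relying on output order):

import dask.bag as db

records = [{"a": i} for i in [0, 1, 2, 1, 2, 3]]
bag = db.from_sequence(records, npartitions=2)

def by_key(d):
    return d["a"]

expected = [{"a": 0}, {"a": 1}, {"a": 2}, {"a": 3}]
assert sorted(bag.distinct(key="a").compute(), key=by_key) == expected
assert sorted(bag.distinct(key=by_key).compute(), key=by_key) == expected
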
Example #23
def test_aggregation(npartitions):
    L = list(range(15))
    b = db.range(15, npartitions=npartitions)
    assert_eq(b.mean(), sum(L) / len(L))
    assert_eq(b.sum(), sum(L))
    assert_eq(b.count(), len(L))
Example #24
def test_bag_map():
    b = db.from_sequence(range(100), npartitions=10)
    b2 = db.from_sequence(range(100, 200), npartitions=10)
    x = b.compute()
    x2 = b2.compute()

    def myadd(a=1, b=2, c=3):
        return a + b + c

    assert_eq(db.map(myadd, b), list(map(myadd, x)))
    assert_eq(db.map(myadd, a=b), list(map(myadd, x)))
    assert_eq(db.map(myadd, b, b2), list(map(myadd, x, x2)))
    assert_eq(db.map(myadd, b, 10), [myadd(i, 10) for i in x])
    assert_eq(db.map(myadd, 10, b=b), [myadd(10, b=i) for i in x])

    sol = [myadd(i, b=j, c=100) for (i, j) in zip(x, x2)]
    assert_eq(db.map(myadd, b, b=b2, c=100), sol)

    sol = [myadd(i, c=100) for (i, j) in zip(x, x2)]
    assert_eq(db.map(myadd, b, c=100), sol)

    x_sum = sum(x)
    sol = [myadd(x_sum, b=i, c=100) for i in x2]
    assert_eq(db.map(myadd, b.sum(), b=b2, c=100), sol)

    sol = [myadd(i, b=x_sum, c=100) for i in x2]
    assert_eq(db.map(myadd, b2, b.sum(), c=100), sol)

    sol = [myadd(a=100, b=x_sum, c=i) for i in x2]
    assert_eq(db.map(myadd, a=100, b=b.sum(), c=b2), sol)

    a = dask.delayed(10)
    assert_eq(db.map(myadd, b, a), [myadd(i, 10) for i in x])
    assert_eq(db.map(myadd, b, b=a), [myadd(i, b=10) for i in x])

    # Mismatched npartitions
    fewer_parts = db.from_sequence(range(100), npartitions=5)
    with pytest.raises(ValueError):
        db.map(myadd, b, fewer_parts)

    # No bags
    with pytest.raises(ValueError):
        db.map(myadd, b.sum(), 1, 2)

    # Unequal partitioning
    unequal = db.from_sequence(range(110), npartitions=10)
    with pytest.raises(ValueError):
        db.map(myadd, b, unequal, c=b2).compute()
    with pytest.raises(ValueError):
        db.map(myadd, b, b=unequal, c=b2).compute()
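
db.map is the module-level counterpart of Bag.map: it maps a function across one or more bags, broadcasting scalars, Items such as b.sum(), and delayed values, and it raises ValueError when the bags' npartitions differ. A condensed, self-contained sketch:

import dask
import dask.bag as db

def combine(a=1, b=2, c=3):
    return a + b + c

left = db.from_sequence(range(5), npartitions=5)
right = db.from_sequence(range(10, 15), npartitions=5)

# Bags are zipped element-wise; scalars and delayed values are broadcast.
assert db.map(combine, left, right).compute() == [
    a + b + 3 for a, b in zip(range(5), range(10, 15))
]
assert db.map(combine, left, c=dask.delayed(100)).compute() == [
    a + 2 + 100 for a in range(5)
]

# Bags with different npartitions cannot be combined.
mismatched = db.from_sequence(range(5), npartitions=1)
try:
    db.map(combine, left, mismatched)
except ValueError:
    pass  # expected
else:
    raise AssertionError("expected ValueError for mismatched npartitions")
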