Example 1
def test_bag_array_conversion():
    import dask.bag as db
    b = db.range(10, npartitions=1)
    x, = b.map_partitions(np.asarray).to_delayed()
    x, = [da.from_delayed(a, shape=(10,), dtype=int) for a in [x]]
    z = da.concatenate([x])
    assert_eq(z, np.arange(10), check_graph=False)
Example 2
def test_temporary_directory(tmpdir):
    b = db.range(10, npartitions=4)

    with dask.set_options(temporary_directory=str(tmpdir)):
        b2 = b.groupby(lambda x: x % 2)
        b2.compute()
        assert any(fn.endswith('.partd') for fn in os.listdir(str(tmpdir)))
Example 3
def test_temporary_directory():
    b = db.range(10, npartitions=4)

    with dask.set_options(temporary_directory=os.getcwd()):
        b2 = b.groupby(lambda x: x % 2)
        b2.compute()
        assert any(fn.endswith('.partd') for fn in os.listdir(os.getcwd()))
Example 4
def test_groupby_tasks_3():
    func = lambda x: x % 10
    b = db.range(20, npartitions=5).groupby(func,
                                            shuffle="tasks",
                                            max_branch=2)
    result = b.compute(scheduler="sync")
    assert dict(result) == groupby(func, range(20))
Example 5
def test_temporary_directory(tmpdir):
    b = db.range(10, npartitions=4)

    with dask.config.set(temporary_directory=str(tmpdir)):
        b2 = b.groupby(lambda x: x % 2)
        b2.compute()
        assert any(fn.endswith('.partd') for fn in os.listdir(str(tmpdir)))
Example 6
def test_KFold_bag():
    X = db.range(1000, npartitions=10)
    y = db.range(1000, npartitions=10)
    cv = list(KFold(4).split(X, y))

    for x_train, y_train, x_test, y_test in cv:
        train, test = dask.compute(x_train, x_test)
        assert len(train) + len(test) == 1000
        assert set(train) | set(test) == set(range(1000))

    assert (first(KFold(3).split(X, y))[0].name ==
            first(KFold(3).split(X, y))[0].name)
    assert (first(KFold(4).split(X, y))[0].name !=
            first(KFold(3).split(X, y))[0].name)

    with pytest.raises(ValueError):
        list(KFold(11).split(X, y))
Example 7
def test_optimize_fuse_keys():
    x = db.range(10, npartitions=2)
    y = x.map(inc)
    z = y.map(inc)

    dsk = z._optimize(z.dask, z._keys())
    assert not set(y.dask) & set(dsk)

    dsk = z._optimize(z.dask, z._keys(), fuse_keys=y._keys())
    assert all(k in dsk for k in y._keys())
Example 8
def test_optimize_fuse_keys():
    x = db.range(10, npartitions=2)
    y = x.map(inc)
    z = y.map(inc)

    dsk = z.__dask_optimize__(z.dask, z.__dask_keys__())
    assert not y.dask.keys() & dsk.keys()

    dsk = z.__dask_optimize__(z.dask, z.__dask_keys__(), fuse_keys=y.__dask_keys__())
    assert all(k in dsk for k in y.__dask_keys__())
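Examples 7 and 8 are the same test written against two Dask versions: the private _optimize/_keys hooks were later replaced by the public collection protocol (__dask_graph__, __dask_keys__, __dask_optimize__). A minimal sketch of driving that protocol by hand on a bag (the lambda and the final assert are illustrative, not taken from the tests above):

import dask.bag as db

b = db.range(10, npartitions=2).map(lambda x: x + 1)

keys = b.__dask_keys__()                      # one key per partition
graph = dict(b.__dask_graph__())              # task graph as a plain dict
optimized = b.__dask_optimize__(graph, keys)  # culls and fuses tasks for these keys

# The requested output keys survive optimization, so they remain computable.
assert all(k in optimized for k in keys)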
Example 9
def test_temporary_directory(tmpdir):
    b = db.range(10, npartitions=4)

    # We use a pool to avoid a race condition between the pool close
    # cleaning up files, and the assert below.
    with ProcessPoolExecutor(4) as pool:
        with dask.config.set(temporary_directory=str(tmpdir), pool=pool):
            b2 = b.groupby(lambda x: x % 2)
            b2.compute()
            assert any(fn.endswith(".partd") for fn in os.listdir(str(tmpdir)))
Example 10
def test_random_split_errors():
    b = db.range(1000, npartitions=10)
    with pytest.raises(ValueError):
        random_split(b, 2)
    with pytest.raises(ValueError):
        random_split(b, -1)
    with pytest.raises(ValueError):
        random_split(b, 0.5, "not-a-seed-or-RandomState")
    with pytest.raises(TypeError):
        random_split("not-a-dask-object", 0.5)
Example 11
def test_random_split_bag():
    b = db.range(1000, npartitions=10)
    train, test = random_split(b, 0.2, 123)

    assert random_split(b, 0.2, 123)[0].name == train.name
    assert random_split(b, 0.3, 123)[0].name != train.name
    assert random_split(b, 0.2)[0].name != random_split(b, 0.2)[0].name

    train_c, test_c = dask.compute(train, test)
    assert 0.75 < len(train_c) / 1000 < 0.85
    assert len(train_c) + len(test_c) == 1000
    assert set(train_c) | set(test_c) == set(range(1000))
Example 12
def test_train_test_split():
    x = np.arange(1000)
    a = da.from_array(x, chunks=100)
    m = dm.from_array(a)
    b = db.range(1000, npartitions=10)

    train_a, test_a = train_test_split(a, test_size=0.2, random_state=123)

    train_a2, test_a2, train, test = train_test_split(a, a + 10, test_size=0.2,
                                                      random_state=123)
    assert train_a2.name == train_a.name
    assert test_a2.name == test_a.name
    assert train_a2.name != train.name
    assert train_a.chunks == train.chunks
    assert test_a.chunks == test.chunks

    train_b, test_b, train_m, test_m = train_test_split(b, m, random_state=123)

    parts_b = train_b._get(train_b.dask, train_b._keys())
    parts_m = train_m._get(train_m.dask, train_m._keys())
    for p_b, p_m in zip(parts_b, parts_m):
        assert set(p_b) == set(p_m)

    with pytest.raises(ValueError):
        train_test_split(a, invalid_option=1)  # invalid kwargs

    with pytest.raises(ValueError):
        train_test_split(test_size=0.2)  # no arrays

    with pytest.raises(ValueError):
        train_test_split(a, b)  # not all da.Array

    with pytest.raises(ValueError):
        train_test_split(a, da.from_array(x, chunks=10))  # Not aligned

    with pytest.raises(ValueError):
        train_test_split(m, db.range(1000, npartitions=12))  # Not aligned
Example 13
def test_optimize_globals():
    da = pytest.importorskip("dask.array")

    x = da.ones(10, chunks=(5, ))

    def optimize_double(dsk, keys):
        return {k: (mul, 2, v) for k, v in dsk.items()}

    from dask.array.utils import assert_eq

    assert_eq(x + 1, np.ones(10) + 1)

    with dask.config.set(array_optimize=optimize_double):
        assert_eq(x + 1, (np.ones(10) * 2 + 1) * 2, check_chunks=False)

    assert_eq(x + 1, np.ones(10) + 1)

    b = db.range(10, npartitions=2)

    with dask.config.set(array_optimize=optimize_double):
        xx, bb = dask.compute(x + 1, b.map(inc), scheduler="single-threaded")
        assert_eq(xx, (np.ones(10) * 2 + 1) * 2)
Example 14
def test_to_textfiles_empty_partitions():
    with tmpdir() as d:
        b = db.range(5, npartitions=5).filter(lambda x: x == 1).map(str)
        b.to_textfiles(os.path.join(d, '*.txt'))
        assert len(os.listdir(d)) == 5
Example 15
def test_groupby_tasks_3():
    func = lambda x: x % 10
    b = db.range(20, npartitions=5).groupby(func, method='tasks', max_branch=2)
    result = b.compute(get=dask.get)
    assert dict(result) == groupby(func, range(20))
Example 16
def test_aggregation(npartitions):
    L = list(range(15))
    b = db.range(15, npartitions=npartitions)
    assert b.mean().compute(get=dask.get) == sum(L) / len(L)
    assert b.sum().compute(get=dask.get) == sum(L)
    assert b.count().compute(get=dask.get) == len(L)
Example 17
def test_aggregation(npartitions):
    L = list(range(15))
    b = db.range(15, npartitions=npartitions)
    assert_eq(b.mean(), sum(L) / len(L))
    assert_eq(b.sum(), sum(L))
    assert_eq(b.count(), len(L))
Example 18
def test_groupby_tasks_2(size, npartitions, groups):
    func = lambda x: x % groups
    b = db.range(size, npartitions=npartitions).groupby(func, shuffle="tasks")
    result = b.compute(scheduler="sync")
    assert dict(result) == groupby(func, range(size))
Example 19
def test_groupby_tasks_3():
    func = lambda x: x % 10
    b = db.range(20, npartitions=5).groupby(func, shuffle='tasks', max_branch=2)
    result = b.compute(scheduler='sync')
    assert dict(result) == groupby(func, range(20))
Example 20
def test_bag_groupby_tasks_default(e, s, a, b):
    with dask.set_options(get=e.get):
        b = db.range(100, npartitions=10)
        b2 = b.groupby(lambda x: x % 13)
        assert not any('partd' in k[0] for k in b2.dask)
Example 21
def test_range():
    for npartitions in [1, 7, 10, 28]:
        b = db.range(100, npartitions=npartitions)
        assert len(b.dask) == npartitions
        assert b.npartitions == npartitions
        assert list(b) == list(range(100))
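As the test above shows, db.range mirrors the built-in range while splitting the sequence across npartitions pieces; a minimal interactive sketch:

import dask.bag as db

b = db.range(100, npartitions=7)   # lazy bag over 0..99 in 7 partitions
assert b.npartitions == 7
assert b.compute() == list(range(100))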
Example 22
def test_bag_groupby_tasks_default(c, s, a, b):
    b = db.range(100, npartitions=10)
    b2 = b.groupby(lambda x: x % 13)
    assert not any('partd' in k[0] for k in b2.dask)
Example 23
def test_groupby_tasks_2(size, npartitions, groups):
    func = lambda x: x % groups
    b = db.range(size, npartitions=npartitions).groupby(func, shuffle='tasks')
    result = b.compute(scheduler='sync')
    assert dict(result) == groupby(func, range(size))
Example 24
def test_repeated_groupby():
    b = db.range(10, npartitions=4)
    c = b.groupby(lambda x: x % 3)
    assert valmap(len, dict(c)) == valmap(len, dict(c))
Example 25
def test_fit(self):
    d = from_sklearn(self.sk)
    b = db.from_sequence(self.raw_X)
    fit = d.fit(b, db.range(len(self.raw_X), len(self.raw_X)))
    assert fit is d
Example 26
def test_groupby_tasks_2(size, npartitions, groups):
    func = lambda x: x % groups
    b = db.range(size, npartitions=npartitions).groupby(func, method='tasks')
    result = b.compute(get=dask.get)
    assert dict(result) == groupby(func, range(size))
Example 27
from dask import bag as db
from dask.diagnostics import ProgressBar
import sprite2.aws
import sprite2.dask
import math


# primality test
def is_prime(n):
    if n < 2:
        return False  # 0 and 1 are not prime; db.range starts at 0
    if n % 2 == 0 and n > 2:
        return False
    return all(n % i for i in range(3, int(math.sqrt(n)) + 1, 2))


# count primes below 100 million in 100 partitions (1 million elements each)
d = db.range(100000000, npartitions=100).filter(is_prime).count()

# compute result
with ProgressBar():
    result = d.compute(get=sprite2.dask.get)
print(result)
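For reference, the same prime count runs on a stock Dask install by swapping the sprite2 get function for a built-in scheduler. A sketch assuming only dask is installed, using the multiprocessing scheduler via scheduler="processes":

import math

import dask.bag as db
from dask.diagnostics import ProgressBar


def is_prime(n):
    # trial division, with the n < 2 guard from the corrected version above
    if n < 2:
        return False
    if n % 2 == 0 and n > 2:
        return False
    return all(n % i for i in range(3, int(math.sqrt(n)) + 1, 2))


d = db.range(100000000, npartitions=100).filter(is_prime).count()

with ProgressBar():
    result = d.compute(scheduler="processes")  # built-in multiprocessing scheduler
print(result)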