Example #1
def test_to_task_dasks():
    a = delayed(1, name='a')
    b = delayed(2, name='b')
    task, dasks = to_task_dasks([a, b, 3])
    assert task == ['a', 'b', 3]
    assert len(dasks) == 2
    assert a.dask in dasks
    assert b.dask in dasks

    task, dasks = to_task_dasks((a, b, 3))
    assert task == (tuple, ['a', 'b', 3])
    assert len(dasks) == 2
    assert a.dask in dasks
    assert b.dask in dasks

    task, dasks = to_task_dasks({a: 1, b: 2})
    assert (task == (dict, [['b', 2], ['a', 1]]) or
            task == (dict, [['a', 1], ['b', 2]]))
    assert len(dasks) == 2
    assert a.dask in dasks
    assert b.dask in dasks

    f = namedtuple('f', ['x', 'y'])
    x = f(1, 2)
    task, dasks = to_task_dasks(x)
    assert task == x
    assert dasks == []
Example #2
def test_to_task_dask():
    a = delayed(1, name='a')
    b = delayed(2, name='b')
    task, dask = to_task_dask([a, b, 3])
    assert task == ['a', 'b', 3]

    task, dask = to_task_dask((a, b, 3))
    assert task == (tuple, ['a', 'b', 3])
    assert dict(dask) == merge(a.dask, b.dask)

    task, dask = to_task_dask({a: 1, b: 2})
    assert (task == (dict, [['b', 2], ['a', 1]]) or
            task == (dict, [['a', 1], ['b', 2]]))
    assert dict(dask) == merge(a.dask, b.dask)

    f = namedtuple('f', ['x', 'y'])
    x = f(1, 2)
    task, dask = to_task_dask(x)
    assert task == x
    assert dict(dask) == {}

    # Issue https://github.com/dask/dask/issues/2107
    class MyClass(dict):
        pass

    task, dask = to_task_dask(MyClass())
    assert type(task) is MyClass
    assert dict(dask) == {}
Example #3
def test_grid_search_dask_inputs():
    # Numpy versions
    np_X, np_y = make_classification(n_samples=15, n_classes=2, random_state=0)
    np_groups = np.random.RandomState(0).randint(0, 3, 15)
    # Dask array versions
    da_X = da.from_array(np_X, chunks=5)
    da_y = da.from_array(np_y, chunks=5)
    da_groups = da.from_array(np_groups, chunks=5)
    # Delayed versions
    del_X = delayed(np_X)
    del_y = delayed(np_y)
    del_groups = delayed(np_groups)

    cv = GroupKFold()
    clf = SVC(random_state=0)
    grid = {'C': [1]}

    sol = SVC(C=1, random_state=0).fit(np_X, np_y).support_vectors_

    for X, y, groups in product([np_X, da_X, del_X],
                                [np_y, da_y, del_y],
                                [np_groups, da_groups, del_groups]):
        gs = dcv.GridSearchCV(clf, grid, cv=cv)

        with pytest.raises(ValueError) as exc:
            gs.fit(X, y)
        assert "The groups parameter should not be None" in str(exc.value)

        gs.fit(X, y, groups=groups)
        np.testing.assert_allclose(sol, gs.best_estimator_.support_vectors_)
Example #4
def test_pure():
    v1 = delayed(add, pure=True)(1, 2)
    v2 = delayed(add, pure=True)(1, 2)
    assert v1.key == v2.key

    myrand = delayed(random)
    assert myrand().key != myrand().key
Example #5
def test_custom_delayed():
    x = Tuple({'a': 1, 'b': 2, 'c': (add, 'a', 'b')}, ['a', 'b', 'c'])
    x2 = delayed(add, pure=True)(x, (4, 5, 6))
    n = delayed(len, pure=True)(x)
    assert delayed(len, pure=True)(x).key == n.key
    assert x2.compute() == (1, 2, 3, 4, 5, 6)
    assert compute(n, x2, x) == (3, (1, 2, 3, 4, 5, 6), (1, 2, 3))
Example #6
def test_delayed():
    add2 = delayed(add)
    assert add2(1, 2).compute() == 3
    assert (add2(1, 2) + 3).compute() == 6
    assert add2(add2(1, 2), 3).compute() == 6
    a = delayed(1)
    b = add2(add2(a, 2), 3)
    assert a.key in b.dask
Example #7
def test_np_dtype_of_delayed():
    # This used to result in a segfault due to recursion, see
    # https://github.com/dask/dask/pull/4374#issuecomment-454381465
    np = pytest.importorskip('numpy')
    x = delayed(1)
    with pytest.raises(TypeError):
        np.dtype(x)
    assert delayed(np.array([1], dtype='f8')).dtype.compute() == np.dtype('f8')
Example #8
def test_from_delayed_sorted():
    a = pd.DataFrame({'x': [1, 2]}, index=[1, 10])
    b = pd.DataFrame({'x': [4, 1]}, index=[100, 200])

    A = dd.from_delayed([delayed(a), delayed(b)], divisions='sorted')
    assert A.known_divisions

    assert A.divisions == (1, 100, 200)
Example #9
def test_delayed_name():
    assert delayed(1)._key.startswith('int-')
    assert delayed(1, pure=True)._key.startswith('int-')
    assert delayed(1, name='X')._key == 'X'

    def myfunc(x):
        return x + 1

    assert delayed(myfunc)(1).key.startswith('myfunc')
Example #10
def test_kwargs():
    def mysum(a, b, c=(), **kwargs):
        return a + b + sum(c) + sum(kwargs.values())
    dmysum = delayed(mysum)
    ten = dmysum(1, 2, c=[delayed(3), 0], four=dmysum(2, 2))
    assert ten.compute() == 10
    dmysum = delayed(mysum, pure=True)
    ten = dmysum(1, 2, c=[delayed(3), 0], four=dmysum(2, 2))
    assert ten.compute() == 10
Example #11
def test_operators():
    a = delayed([1, 2, 3])
    assert a[0].compute() == 1
    assert (a + a).compute() == [1, 2, 3, 1, 2, 3]

    a = delayed(10)
    assert (a + 1).compute() == 11
    assert (1 + a).compute() == 11
    assert (a >> 1).compute() == 5
    assert (a > 2).compute()
    assert (a ** 2).compute() == 100
Example #12
def test_iterators():
    a = delayed(1)
    b = delayed(2)
    c = delayed(sum)(iter([a, b]))

    assert c.compute() == 3

    def f(seq):
        return sum(seq)

    c = delayed(f)(iter([a, b]))
    assert c.compute() == 3
Example #13
def test_from_delayed():
    from dask.delayed import delayed
    a, b, c = delayed([1, 2, 3]), delayed([4, 5, 6]), delayed([7, 8, 9])
    bb = from_delayed([a, b, c])
    assert bb.name == from_delayed([a, b, c]).name

    assert isinstance(bb, Bag)
    assert list(bb) == [1, 2, 3, 4, 5, 6, 7, 8, 9]

    asum_value = delayed(lambda X: sum(X))(a)
    asum_item = db.Item.from_delayed(asum_value)
    assert asum_value.compute() == asum_item.compute() == 6
Example #14
def test_from_delayed():
    df = pd.DataFrame(data=np.random.normal(size=(10, 4)), columns=list('abcd'))
    parts = [df.iloc[:1], df.iloc[1:3], df.iloc[3:6], df.iloc[6:10]]
    dfs = [delayed(parts.__getitem__)(i) for i in range(4)]
    meta = dfs[0].compute()

    my_len = lambda x: pd.Series([len(x)])

    for divisions in [None, [0, 1, 3, 6, 10]]:
        ddf = dd.from_delayed(dfs, meta=meta, divisions=divisions)
        assert_eq(ddf, df)
        assert list(ddf.map_partitions(my_len).compute()) == [1, 2, 3, 4]
        assert ddf.known_divisions == (divisions is not None)

        s = dd.from_delayed([d.a for d in dfs], meta=meta.a,
                            divisions=divisions)
        assert_eq(s, df.a)
        assert list(s.map_partitions(my_len).compute()) == [1, 2, 3, 4]
        assert ddf.known_divisions == (divisions is not None)

    meta2 = [(c, 'f8') for c in df.columns]
    assert_eq(dd.from_delayed(dfs, meta=meta2), df)
    assert_eq(dd.from_delayed([d.a for d in dfs], meta=('a', 'f8')), df.a)

    with pytest.raises(ValueError):
        dd.from_delayed(dfs, meta=meta, divisions=[0, 1, 3, 6])

    with pytest.raises(ValueError) as e:
        dd.from_delayed(dfs, meta=meta.a).compute()
    assert str(e.value).startswith('Metadata mismatch found in `from_delayed`')
Example #15
def test_methods():
    a = delayed("a b c d e")
    assert a.split(' ').compute() == ['a', 'b', 'c', 'd', 'e']
    assert a.upper().replace('B', 'A').split().count('A').compute() == 2
    assert a.split(' ', pure=True).key == a.split(' ', pure=True).key
    o = a.split(' ', dask_key_name='test')
    assert o.key == 'test'
Example #16
def test_compute():
    a = delayed(1) + 5
    b = a + 1
    c = a + 2
    assert compute(b, c) == (7, 8)
    assert compute(b) == (7,)
    assert compute([a, b], c) == ([6, 7], 8)
Example #17
def compute_n_splits(cv, X, y=None, groups=None):
    """Return the number of splits.

    Parameters
    ----------
    cv : BaseCrossValidator
    X, y, groups : array_like, dask object, or None

    Returns
    -------
    n_splits : int
    """
    if not any(is_dask_collection(i) for i in (X, y, groups)):
        return cv.get_n_splits(X, y, groups)

    if isinstance(cv, (_BaseKFold, BaseShuffleSplit)):
        return cv.n_splits

    elif isinstance(cv, PredefinedSplit):
        return len(cv.unique_folds)

    elif isinstance(cv, _CVIterableWrapper):
        return len(cv.cv)

    elif isinstance(cv, (LeaveOneOut, LeavePOut)) and not is_dask_collection(X):
        # Only `X` is referenced for these classes
        return cv.get_n_splits(X, None, None)

    elif (isinstance(cv, (LeaveOneGroupOut, LeavePGroupsOut)) and not
          is_dask_collection(groups)):
        # Only `groups` is referenced for these classes
        return cv.get_n_splits(None, None, groups)

    else:
        return delayed(cv).get_n_splits(X, y, groups).compute()
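A minimal usage sketch for the helper above, assuming ``compute_n_splits`` is in scope together with scikit-learn and dask.array:

import dask.array as da
from sklearn.model_selection import KFold

X = da.ones((20, 4), chunks=5)
# KFold subclasses _BaseKFold, so the split count is read straight off the
# estimator and the dask array is never computed:
assert compute_n_splits(KFold(n_splits=4), X) == 4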
Example #18
def test_delayed_callable():
    f = delayed(add, pure=True)
    v = f(1, 2)
    assert v.dask == {v.key: (add, 1, 2)}

    assert f.dask == {f.key: add}
    assert f.compute() == add
Example #19
def open_files(path, hdfs=None, lazy=None, **auth):
    if lazy is not None:
        raise DeprecationWarning("Lazy keyword has been deprecated. "
                                 "Now always lazy")
    hdfs = hdfs or HDFileSystem(**auth)
    filenames = sorted(hdfs.glob(path))
    myopen = delayed(hdfs_open_file)
    return [myopen(fn, auth) for fn in filenames]
Example #20
def test_keys_from_array():
    da = pytest.importorskip('dask.array')
    from dask.array.utils import _check_dsk

    X = da.ones((10, 10), chunks=5).to_delayed().flatten()
    xs = [delayed(inc)(x) for x in X]

    _check_dsk(xs[0].dask)
Example #21
def test_mesos_is_delayed():
    def add(x, y):
        return x + y

    add1 = delayed(add)
    add2 = mesos(add)

    assert isinstance(add2, add1.__class__)
    assert add1(2, 3).compute() == add2(2, 3).compute()
Example #22
def test_nout():
    func = delayed(lambda x: (x, -x), nout=2, pure=True)
    x = func(1)
    assert len(x) == 2
    a, b = x
    assert compute(a, b) == (1, -1)
    assert a._length is None
    assert b._length is None
    pytest.raises(TypeError, lambda: len(a))
    pytest.raises(TypeError, lambda: list(a))

    pytest.raises(ValueError, lambda: delayed(add, nout=-1))
    pytest.raises(ValueError, lambda: delayed(add, nout=True))

    func = delayed(add, nout=1)
    a = func(1)
    assert a._length is None
    pytest.raises(TypeError, lambda: list(a))
    pytest.raises(TypeError, lambda: len(a))
Example #23
def test_to_task_dask():
    a = delayed(1, name='a')
    b = delayed(2, name='b')
    task, dask = to_task_dask([a, b, 3])
    assert task == ['a', 'b', 3]

    task, dask = to_task_dask((a, b, 3))
    assert task == (tuple, ['a', 'b', 3])
    assert dict(dask) == merge(a.dask, b.dask)

    task, dask = to_task_dask({a: 1, b: 2})
    assert (task == (dict, [['b', 2], ['a', 1]]) or
            task == (dict, [['a', 1], ['b', 2]]))
    assert dict(dask) == merge(a.dask, b.dask)

    f = namedtuple('f', ['x', 'y'])
    x = f(1, 2)
    task, dask = to_task_dask(x)
    assert task == x
    assert dict(dask) == {}
Example #24
def test_to_task_dask():
    with warnings.catch_warnings(record=True):
        a = delayed(1, name='a')
        b = delayed(2, name='b')
        task, dask = to_task_dask([a, b, 3])
        assert task == ['a', 'b', 3]

        task, dask = to_task_dask((a, b, 3))
        assert task == (tuple, ['a', 'b', 3])
        assert dict(dask) == merge(a.dask, b.dask)

        task, dask = to_task_dask({a: 1, b: 2})
        assert (task == (dict, [['b', 2], ['a', 1]]) or
                task == (dict, [['a', 1], ['b', 2]]))
        assert dict(dask) == merge(a.dask, b.dask)

        f = namedtuple('f', ['x', 'y'])
        x = f(1, 2)
        task, dask = to_task_dask(x)
        assert task == x
        assert dict(dask) == {}

        task, dask = to_task_dask(slice(a, b, 3))
        assert task == (slice, 'a', 'b', 3)
        assert dict(dask) == merge(a.dask, b.dask)

        # Issue https://github.com/dask/dask/issues/2107
        class MyClass(dict):
            pass

        task, dask = to_task_dask(MyClass())
        assert type(task) is MyClass
        assert dict(dask) == {}

        # Custom dask objects
        x = Tuple({'a': 1, 'b': 2, 'c': (add, 'a', 'b')}, ['a', 'b', 'c'])
        task, dask = to_task_dask(x)
        assert task in dask
        f = dask.pop(task)
        assert f == (tuple, ['a', 'b', 'c'])
        assert dask == x._dask
Example #25
def test_finalize_name():
    import dask.array as da
    x = da.ones(10, chunks=5)
    v = delayed([x])
    assert set(x.dask).issubset(v.dask)

    def key(s):
        if isinstance(s, tuple):
            s = s[0]
        return s.split('-')[0]

    assert all(key(k).isalpha() for k in v.dask)
Example #26
def test_delayed_errors():
    a = delayed([1, 2, 3])
    # Immutable
    pytest.raises(TypeError, lambda: setattr(a, 'foo', 1))
    pytest.raises(TypeError, lambda: setitem(a, 1, 0))
    # Can't iterate, or check if contains
    pytest.raises(TypeError, lambda: 1 in a)
    pytest.raises(TypeError, lambda: list(a))
    # No dynamic generation of magic/hidden methods
    pytest.raises(AttributeError, lambda: a._hidden())
    # Truth of delayed forbidden
    pytest.raises(TypeError, lambda: bool(a))
Example #27
def test_array_delayed():
    np = pytest.importorskip('numpy')
    da = pytest.importorskip('dask.array')

    arr = np.arange(100).reshape((10, 10))
    darr = da.from_array(arr, chunks=(5, 5))
    val = delayed(sum)([arr, darr, 1])
    assert isinstance(val, Delayed)
    assert np.allclose(val.compute(), arr + arr + 1)
    assert val.sum().compute() == (arr + arr + 1).sum()
    assert val[0, 0].compute() == (arr + arr + 1)[0, 0]

    task, dsk = to_task_dask(darr)
    orig = set(darr.dask)
    final = set(dsk)
    assert orig.issubset(final)
    diff = final.difference(orig)
    assert len(diff) == 1

    delayed_arr = delayed(darr)
    assert (delayed_arr.compute() == arr).all()
Example #28
def test_delayed_picklable():
    # Delayed
    x = delayed(divmod, nout=2, pure=True)(1, 2)
    y = pickle.loads(pickle.dumps(x))
    assert x.dask == y.dask
    assert x._key == y._key
    assert x._length == y._length
    # DelayedLeaf
    x = delayed(1j + 2)
    y = pickle.loads(pickle.dumps(x))
    assert x.dask == y.dask
    assert x._key == y._key
    assert x._nout == y._nout
    assert x._pure == y._pure
    # DelayedAttr
    x = x.real
    y = pickle.loads(pickle.dumps(x))
    assert x._obj._key == y._obj._key
    assert x._obj.dask == y._obj.dask
    assert x._attr == y._attr
    assert x._key == y._key
Example #29
def test_traverse_false():
    # Create a list with a dask value, and test that it's not computed
    def fail(*args):
        raise ValueError("shouldn't have computed")

    a = delayed(fail)()

    # list
    x = [a, 1, 2, 3]
    res = delayed(x, traverse=False).compute()
    assert len(res) == 4
    assert res[0] is a
    assert res[1:] == x[1:]

    # tuple that looks like a task
    x = (fail, a, (fail, a))
    res = delayed(x, traverse=False).compute()
    assert isinstance(res, tuple)
    assert res[0] == fail
    assert res[1] is a

    # list containing task-like-things
    x = [1, (fail, a), a]
    res = delayed(x, traverse=False).compute()
    assert isinstance(res, list)
    assert res[0] == 1
    assert res[1][0] == fail and res[1][1] is a
    assert res[2] is a

    # traverse=False still hits top level
    b = delayed(1)
    x = delayed(b, traverse=False)
    assert x.compute() == 1
Example #30
def test_callable_obj():
    class Foo(object):
        def __init__(self, a):
            self.a = a

        def __call__(self):
            return 2

    foo = Foo(1)
    f = delayed(foo)
    assert f.compute() is foo
    assert f.a.compute() == 1
    assert f().compute() == 2
Example #31
def test_key_names_include_function_names():
    def myfunc(x):
        return x + 1
    assert delayed(myfunc)(1).key.startswith('myfunc')
Example #32
def test_method_getattr_call_same_task():
    a = delayed([1, 2, 3])
    o = a.index(1)
    # Don't getattr the method, then call in separate task
    assert getattr not in set(v[0] for v in o.__dask_graph__().values())
Example #33
def test_key_names_include_type_names():
    assert delayed(1).key.startswith('int')
Example #34
def compute(*args, **kwargs):
    """Compute several dask collections at once.

    Parameters
    ----------
    args : object
        Any number of objects. If it is a dask object, it's computed and the
        result is returned. By default, python builtin collections are also
        traversed to look for dask objects (for more information see the
        ``traverse`` keyword). Non-dask arguments are passed through unchanged.
    traverse : bool, optional
        By default dask traverses builtin python collections looking for dask
        objects passed to ``compute``. For large collections this can be
        expensive. If none of the arguments contain any dask objects, set
        ``traverse=False`` to avoid doing this traversal.
    get : callable, optional
        A scheduler ``get`` function to use. If not provided, the default is
        to check the global settings first, and then fall back to defaults for
        the collections.
    optimize_graph : bool, optional
        If True [default], the optimizations for each collection are applied
        before computation. Otherwise the graph is run as is. This can be
        useful for debugging.
    kwargs
        Extra keywords to forward to the scheduler ``get`` function.

    Examples
    --------
    >>> import dask.array as da
    >>> a = da.arange(10, chunks=2).sum()
    >>> b = da.arange(10, chunks=2).mean()
    >>> compute(a, b)
    (45, 4.5)

    By default, dask objects inside python collections will also be computed:

    >>> compute({'a': a, 'b': b, 'c': 1})  # doctest: +SKIP
    ({'a': 45, 'b': 4.5, 'c': 1},)
    """
    from dask.delayed import delayed
    traverse = kwargs.pop('traverse', True)
    if traverse:
        args = tuple(delayed(a)
                     if isinstance(a, (list, set, tuple, dict, Iterator))
                     else a for a in args)

    optimize_graph = kwargs.pop('optimize_graph', True)
    variables = [a for a in args if isinstance(a, Base)]
    if not variables:
        return args

    get = kwargs.pop('get', None) or _globals['get']

    if not get:
        get = variables[0]._default_get
        if not all(a._default_get == get for a in variables):
            raise ValueError("Compute called on multiple collections with "
                             "differing default schedulers. Please specify a "
                             "scheduler `get` function using either "
                             "the `get` kwarg or globally with `set_options`.")

    dsk = collections_to_dsk(variables, optimize_graph, **kwargs)
    keys = [var._keys() for var in variables]
    results = get(dsk, keys, **kwargs)

    results_iter = iter(results)
    return tuple(a if not isinstance(a, Base)
                 else a._finalize(next(results_iter))
                 for a in args)
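A small sketch of the ``traverse`` keyword documented above (values are made up):

from dask import compute, delayed

parts = [delayed(1), delayed(2), 3]
# By default the list is traversed and its delayed contents computed:
assert compute(parts) == ([1, 2, 3],)
# With traverse=False the list is treated as a plain non-dask argument
# and passes through unchanged, Delayed objects and all:
res, = compute(parts, traverse=False)
assert res[1] is parts[1]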
Example #35
def test_literates_keys():
    a = delayed(1)
    b = a + 1
    lit = (a, b, 3)
    assert delayed(lit).key != delayed(lit).key
    assert delayed(lit, pure=True).key == delayed(lit, pure=True).key
Example #36
def test_named_value():
    assert 'X' in delayed(1, name='X').dask
Example #37
        return tokenize(self.based_on)

    __dask_scheduler__ = staticmethod(get2)

    __dask_optimize__ = globalmethod(
        dont_optimize,
        key="collection_optim",
        falsey=dont_optimize,
    )


def increment_(x: int) -> int:
    return x + 1


increment: Delayed = delayed(increment_)


def assert_isinstance(coll: DaskCollection, protocol: Any) -> None:
    assert isinstance(coll, protocol)


@pytest.mark.parametrize("protocol", [DaskCollection, HLGDaskCollection])
def test_isinstance_core(protocol: Any) -> None:
    from dask.array import Array
    from dask.bag import Bag
    from dask.dataframe import DataFrame

    arr: Array = da.ones(10)
    bag: Bag = db.from_sequence([1, 2, 3, 4, 5], npartitions=2)
    df: DataFrame = dds.timeseries()

    assert_isinstance(arr, protocol)
    assert_isinstance(bag, protocol)
    assert_isinstance(df, protocol)
Example #38
def test_sensitive_to_partials():
    assert (delayed(partial(add, 10), pure=True)(2)._key != delayed(
        partial(add, 20), pure=True)(2)._key)
Example #39
def test_delayed_picklable():
    x = delayed(1)
    y = pickle.loads(pickle.dumps(x))
    assert x.dask == y.dask
    assert x._key == y._key
Example #40
def test_delayed_method_descriptor():
    delayed(bytes.decode)(b"")  # does not err
Example #41
def test_delayed_name_on_call():
    f = delayed(add, pure=True)
    assert f(1, 2, dask_key_name="foo")._key == "foo"
Example #42
def test_delayed_compute_forward_kwargs():
    x = delayed(1) + 2
    x.compute(bogus_keyword=10)
Example #43
def test_nout_with_tasks(x):
    length = len(x)
    d = delayed(x, nout=length)
    assert len(d) == len(list(d)) == length
    assert d.compute() == x
Example #44
def test_lists_are_concrete():
    a = delayed(1)
    b = delayed(2)
    c = delayed(max)([[a, 10], [b, 20]], key=lambda x: x[0])[1]

    assert c.compute() == 20
Example #45
def test_value():
    v = delayed(1)
    assert v.compute() == 1
    assert 1 in v.dask.values()
Example #46
def test_attr_optimize():
    # Check that attribute access is inlined
    a = delayed([1, 2, 3])
    o = a.index(1)
    dsk = o._optimize(o.dask, o._keys())
    assert getattr not in set(v[0] for v in dsk.values())
Example #47
def test_common_subexpressions():
    a = delayed([1, 2, 3])
    res = a[0] + a[0]
    assert a[0].key in res.dask
    assert a.key in res.dask
    assert len(res.dask) == 3
Example #48
def test_attributes():
    a = delayed(2 + 1j)
    assert a.real.compute() == 2
    assert a.imag.compute() == 1
Example #49

@delayed
def modlevel_delayed1(x):
    return x + 1


@delayed(pure=False)
def modlevel_delayed2(x):
    return x + 1


@pytest.mark.parametrize(
    "f",
    [
        delayed(modlevel_eager),
        pytest.param(modlevel_delayed1,
                     marks=pytest.mark.xfail(reason="#3369")),
        pytest.param(modlevel_delayed2,
                     marks=pytest.mark.xfail(reason="#3369")),
    ],
)
def test_pickle(f):
    d = f(2)
    d = pickle.loads(pickle.dumps(d, protocol=pickle.HIGHEST_PROTOCOL))
    assert d.compute() == 3


@pytest.mark.parametrize(
    "f", [delayed(modlevel_eager), modlevel_delayed1, modlevel_delayed2])
def test_cloudpickle(f):
    d = f(2)
    d = cloudpickle.loads(cloudpickle.dumps(d, protocol=pickle.HIGHEST_PROTOCOL))
    assert d.compute() == 3
Example #50
async def run():
    number_of_cores_per_node = 16  # DAS-5 features 2x8 NUMA cores per compute node
    reservation_length = "08:00:00"  # 8 hours is more than enough... probably
    cluster = SLURMCluster(cores=number_of_cores_per_node,
                           memory="64 GB",
                           processes=4,
                           scheduler_options={"dashboard_address": ":6868"},
                           local_directory="./aip-logs",
                           interface='ib0',
                           walltime=reservation_length)

    # Grab 5 execution nodes -> 80 cores
    print("Scaling up, getting 5 nodes")
    cluster.scale_up(5)
    client = Client(cluster)

    print("Client is ready, parsing data files...")

    file_locations = "/var/scratch/lvs215/aip_tmp"
    data_files = []

    # Create a list of all the files we want to parse. Skip the compressed sources if they are still lingering around
    for path, subdirs, files in os.walk(file_locations):
        for name in files:
            if isfile(os.path.join(path, name)) and not name.endswith(
                ("gz", "zip", "tar")):
                data_files.append(os.path.join(path, name))

    client.run(clear_all_files)

    # Create one task per file.
    print(data_files)
    print("Creating and executing tasks...")
    tasks = list(map(delayed(process_file), data_files))
    true_false_array = db.from_delayed(tasks)

    # DEBUG CODE
    # future = client.compute(true_false_array)
    # client.recreate_error_locally(future)

    # Time to compute them!
    start = datetime.datetime.now()
    res = true_false_array.compute()
    end = datetime.datetime.now()
    print(true_false_array)
    print(res)
    print("Tasks ran to completion! Copying databases.")
    if False not in res:  # If everything went all right, let all nodes copy their databases to the home dir.
        client.run(copy_database_to_home_folder)
        client.run(clear_all_files)
    else:
        print("Parsing one of the files went horribly wrong, quitting!")
        exit(-1)

    print("Beginning assembling of all databases into one!")
    # Now, each of the nodes has a local database file, we will now combine these databases into one.
    # We do this process sequentially, because we are not sure yet if SQLite likes it if all nodes do this in parallel.
    # TODO: test if we can do this procedure in each node through the copy_database_to_home_folder, would save copying data
    database_manager = DatabaseManager()  # This creates an empty aip.db if it doesn't exist.
    con3 = database_manager.db  # Reuse the connection

    # based on https://stackoverflow.com/a/37138506
    os.makedirs(db_files_location, exist_ok=True)
    for file in [
            os.path.join(db_files_location, f)
            for f in os.listdir(db_files_location)
            if isfile(os.path.join(db_files_location, f)) and f.endswith(".db")
    ]:
        con3.execute("ATTACH '{}' as dba".format(file))

        con3.execute("BEGIN")
        for row in con3.execute(
                "SELECT * FROM dba.sqlite_master WHERE type='table'"):
            combine = "INSERT INTO " + row[1] + " SELECT * FROM dba." + row[1]
            print(combine)
            con3.execute(combine)
        con3.execute("detach database dba")
        con3.commit()
        # Now, delete the database as it has been copied.
        # os.remove("{}.db".format(hash(worker)))
    print("All done. Releasing all nodes.")
    await cluster.scale_down(cluster.workers)
    print("Nodes released.")
    print(end - start)
Example #51
File: sql.py Project: m-rossi/dask
def to_sql(
    df,
    name: str,
    uri: str,
    schema=None,
    if_exists: str = "fail",
    index: bool = True,
    index_label=None,
    chunksize=None,
    dtype=None,
    method=None,
    compute=True,
    parallel=False,
    engine_kwargs=None,
):
    """Store Dask Dataframe to a SQL table

    An empty table is first created based on the "meta" DataFrame (conforming
    to the caller's "if_exists" preference); each block then calls
    pd.DataFrame.to_sql with ``if_exists="append"``.

    Databases supported by SQLAlchemy [1]_ are supported. Tables can be
    newly created, appended to, or overwritten.

    Parameters
    ----------
    name : str
        Name of SQL table.
    uri : string
        Full sqlalchemy URI for the database connection
    schema : str, optional
        Specify the schema (if database flavor supports this). If None, use
        default schema.
    if_exists : {'fail', 'replace', 'append'}, default 'fail'
        How to behave if the table already exists.

        * fail: Raise a ValueError.
        * replace: Drop the table before inserting new values.
        * append: Insert new values to the existing table.

    index : bool, default True
        Write DataFrame index as a column. Uses `index_label` as the column
        name in the table.
    index_label : str or sequence, default None
        Column label for index column(s). If None is given (default) and
        `index` is True, then the index names are used.
        A sequence should be given if the DataFrame uses MultiIndex.
    chunksize : int, optional
        Specify the number of rows in each batch to be written at a time.
        By default, all rows will be written at once.
    dtype : dict or scalar, optional
        Specifying the datatype for columns. If a dictionary is used, the
        keys should be the column names and the values should be the
        SQLAlchemy types or strings for the sqlite3 legacy mode. If a
        scalar is provided, it will be applied to all columns.
    method : {None, 'multi', callable}, optional
        Controls the SQL insertion clause used:

        * None : Uses standard SQL ``INSERT`` clause (one per row).
        * 'multi': Pass multiple values in a single ``INSERT`` clause.
        * callable with signature ``(pd_table, conn, keys, data_iter)``.

        Details and a sample callable implementation can be found in the
        section :ref:`insert method <io.sql.method>`.
    compute : bool, default True
        When true, call dask.compute and perform the load into SQL; otherwise, return a Dask object (or array of
        per-block objects when parallel=True)
    parallel : bool, default False
        When true, have each block append itself to the DB table concurrently. This can result in DB rows being in a
        different order than the source DataFrame's corresponding rows. When false, load each block into the SQL DB in
        sequence.
    engine_kwargs : dict or None
        Specific db engine parameters for sqlalchemy

    Raises
    ------
    ValueError
        When the table already exists and `if_exists` is 'fail' (the
        default).

    See Also
    --------
    read_sql : Read a DataFrame from a table.

    Notes
    -----
    Timezone aware datetime columns will be written as
    ``Timestamp with timezone`` type with SQLAlchemy if supported by the
    database. Otherwise, the datetimes will be stored as timezone unaware
    timestamps local to the original timezone.

    .. versionadded:: 0.24.0

    References
    ----------
    .. [1] https://docs.sqlalchemy.org
    .. [2] https://www.python.org/dev/peps/pep-0249/

    Examples
    --------
    Create a table from scratch with 4 rows.

    >>> import pandas as pd
    >>> import dask.dataframe as dd
    >>> df = pd.DataFrame([ {'i':i, 's':str(i)*2 } for i in range(4) ])
    >>> ddf = dd.from_pandas(df, npartitions=2)
    >>> ddf  # doctest: +SKIP
    Dask DataFrame Structure:
                       i       s
    npartitions=2
    0              int64  object
    2                ...     ...
    3                ...     ...
    Dask Name: from_pandas, 2 tasks

    >>> from dask.utils import tmpfile
    >>> from sqlalchemy import create_engine
    >>> with tmpfile() as f:
    ...     db = 'sqlite:///%s' %f
    ...     ddf.to_sql('test', db)
    ...     engine = create_engine(db, echo=False)
    ...     result = engine.execute("SELECT * FROM test").fetchall()
    >>> result
    [(0, 0, '00'), (1, 1, '11'), (2, 2, '22'), (3, 3, '33')]
    """
    if not isinstance(uri, str):
        raise ValueError(f"Expected URI to be a string, got {type(uri)}.")

    # This is the only argument we add on top of what Pandas supports
    kwargs = dict(
        name=name,
        uri=uri,
        engine_kwargs=engine_kwargs,
        schema=schema,
        if_exists=if_exists,
        index=index,
        index_label=index_label,
        chunksize=chunksize,
        dtype=dtype,
        method=method,
    )

    meta_task = delayed(_to_sql_chunk)(df._meta, **kwargs)

    # Partitions should always append to the empty table created from `meta` above
    worker_kwargs = dict(kwargs, if_exists="append")

    if parallel:
        # Perform the meta insert, then one task that inserts all blocks concurrently:
        result = [
            _extra_deps(
                _to_sql_chunk,
                d,
                extras=meta_task,
                **worker_kwargs,
                dask_key_name="to_sql-%s" % tokenize(d, **worker_kwargs),
            ) for d in df.to_delayed()
        ]
    else:
        # Chain the "meta" insert and each block's insert
        result = []
        last = meta_task
        for d in df.to_delayed():
            result.append(
                _extra_deps(
                    _to_sql_chunk,
                    d,
                    extras=last,
                    **worker_kwargs,
                    dask_key_name="to_sql-%s" % tokenize(d, **worker_kwargs),
                ))
            last = result[-1]
    result = delayed(result)

    if compute:
        dask_compute(result)
    else:
        return result
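A hedged sketch of the ``compute=False`` path described above (the sqlite file name and table name are made up):

import pandas as pd
import dask.dataframe as dd

ddf = dd.from_pandas(pd.DataFrame({"i": range(4)}), npartitions=2)
# Returns a Delayed chaining the "meta" insert and both block inserts;
# nothing is written to the database until it is explicitly computed:
task = ddf.to_sql("test", "sqlite:///example.db", compute=False)
task.compute()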
Example #52
def test_attributes():
    a = delayed(2 + 1j)
    assert a.real._key == a.real._key
    assert a.real.compute() == 2
    assert a.imag.compute() == 1
    assert (a.real + a.imag).compute() == 3
Example #53
def test_value_name():
    assert delayed(1)._key.startswith('int-')
    assert delayed(1, pure=True)._key.startswith('int-')
Example #54
        gs.visualize(filename=os.path.join(d, "mydask"))
        assert os.path.exists(os.path.join(d, "mydask.png"))

    # Doesn't work if not fitted
    gs = dcv.GridSearchCV(clf, grid)
    with pytest.raises(NotFittedError):
        gs.visualize()


np_X = np.random.normal(size=(20, 3))
np_y = np.random.randint(2, size=20)
np_groups = np.random.permutation(list(range(5)) * 4)
da_X = da.from_array(np_X, chunks=(3, 3))
da_y = da.from_array(np_y, chunks=3)
da_groups = da.from_array(np_groups, chunks=3)
del_X = delayed(np_X)
del_y = delayed(np_y)
del_groups = delayed(np_groups)


@pytest.mark.parametrize(
    ["cls", "has_shuffle"],
    [
        (KFold, True),
        (GroupKFold, False),
        (StratifiedKFold, True),
        (TimeSeriesSplit, False),
    ],
)
def test_kfolds(cls, has_shuffle):
    assert tokenize(cls(n_splits=3)) == tokenize(cls(n_splits=3))
Example #55
 def _get_random_state(self):
     i_subs = next(substreams)
     return delayed(get_substream_state, pure=True)(self.seed, i_subs)
Example #56
        for i in block_size:  # changing blocks
            for j in range(1, 6):  # changing files (5 files per block size)
                # Create a new file
                longXTC1 = 'newtraj{}.xtc'.format(ii)
                copyfile(longXTC, longXTC1)
                # Provide the path to my file to all processes
                my_path = os.path.normpath(os.path.join(os.getcwd(), longXTC1))
                #                print (my_path)
                longXTC1 = os.path.abspath(my_path)
                # Define a new universe with the new trajectory
                u = mda.Universe(PSF, longXTC1)
                print(u)
                print("frames in trajectory ", u.trajectory.n_frames)
                print(len(u.trajectory))
                mobile = u.select_atoms(
                    "(resid 1:29 or resid 60:121 or resid 160:214) and name CA"
                )
                index = mobile.indices
                total = com_parallel_dask_distributed(mobile, index, i)
                total = delayed(total)
                start = time.time()
                output = total.compute(get=c.get)
                tot_time = time.time() - start
                file.write("XTC{} {} {} {} {} {} {} {}\n".format(
                    k, i, j, output[1], output[2], output[3], output[4],
                    tot_time))
                file.flush()
                # Deleting all files
                os.remove('newtraj{}.xtc'.format(ii))
                ii = ii + 1
Example #57
def test_attribute_of_attribute():
    x = delayed(123)
    assert isinstance(x.a, Delayed)
    assert isinstance(x.a.b, Delayed)
    assert isinstance(x.a.b.c, Delayed)
Example #58
def test_persist_delayedleaf():
    x = delayed(1)
    (xx,) = persist(x)
    assert isinstance(xx, Delayed)
    assert xx.compute() == 1
Example #59
def test_lists():
    a = delayed(1)
    b = delayed(2)
    c = delayed(sum)([a, b])
    assert c.compute() == 3
Example #60
 def read_data(self, node_name, sl):
     name = node_name + "-data"
     key = make_key(name, sl)
     return delayed(self._read_data(node_name, sl), name=key, pure=True)