def test_to_task_dasks():
    a = delayed(1, name='a')
    b = delayed(2, name='b')

    task, dasks = to_task_dasks([a, b, 3])
    assert task == ['a', 'b', 3]
    assert len(dasks) == 2
    assert a.dask in dasks
    assert b.dask in dasks

    task, dasks = to_task_dasks((a, b, 3))
    assert task == (tuple, ['a', 'b', 3])
    assert len(dasks) == 2
    assert a.dask in dasks
    assert b.dask in dasks

    task, dasks = to_task_dasks({a: 1, b: 2})
    assert (task == (dict, [['b', 2], ['a', 1]]) or
            task == (dict, [['a', 1], ['b', 2]]))
    assert len(dasks) == 2
    assert a.dask in dasks
    assert b.dask in dasks

    f = namedtuple('f', ['x', 'y'])
    x = f(1, 2)
    task, dasks = to_task_dasks(x)
    assert task == x
    assert dasks == []
def test_to_task_dask():
    a = delayed(1, name='a')
    b = delayed(2, name='b')

    task, dask = to_task_dask([a, b, 3])
    assert task == ['a', 'b', 3]

    task, dask = to_task_dask((a, b, 3))
    assert task == (tuple, ['a', 'b', 3])
    assert dict(dask) == merge(a.dask, b.dask)

    task, dask = to_task_dask({a: 1, b: 2})
    assert (task == (dict, [['b', 2], ['a', 1]]) or
            task == (dict, [['a', 1], ['b', 2]]))
    assert dict(dask) == merge(a.dask, b.dask)

    f = namedtuple('f', ['x', 'y'])
    x = f(1, 2)
    task, dask = to_task_dask(x)
    assert task == x
    assert dict(dask) == {}

    # Issue https://github.com/dask/dask/issues/2107
    class MyClass(dict):
        pass

    task, dask = to_task_dask(MyClass())
    assert type(task) is MyClass
    assert dict(dask) == {}
def test_grid_search_dask_inputs():
    # Numpy versions
    np_X, np_y = make_classification(n_samples=15, n_classes=2, random_state=0)
    np_groups = np.random.RandomState(0).randint(0, 3, 15)
    # Dask array versions
    da_X = da.from_array(np_X, chunks=5)
    da_y = da.from_array(np_y, chunks=5)
    da_groups = da.from_array(np_groups, chunks=5)
    # Delayed versions
    del_X = delayed(np_X)
    del_y = delayed(np_y)
    del_groups = delayed(np_groups)

    cv = GroupKFold()
    clf = SVC(random_state=0)
    grid = {'C': [1]}

    sol = SVC(C=1, random_state=0).fit(np_X, np_y).support_vectors_

    for X, y, groups in product([np_X, da_X, del_X],
                                [np_y, da_y, del_y],
                                [np_groups, da_groups, del_groups]):
        gs = dcv.GridSearchCV(clf, grid, cv=cv)

        with pytest.raises(ValueError) as exc:
            gs.fit(X, y)
        assert "The groups parameter should not be None" in str(exc.value)

        gs.fit(X, y, groups=groups)
        np.testing.assert_allclose(sol, gs.best_estimator_.support_vectors_)
def test_pure():
    v1 = delayed(add, pure=True)(1, 2)
    v2 = delayed(add, pure=True)(1, 2)
    assert v1.key == v2.key

    myrand = delayed(random)
    assert myrand().key != myrand().key
def test_custom_delayed():
    x = Tuple({'a': 1, 'b': 2, 'c': (add, 'a', 'b')}, ['a', 'b', 'c'])
    x2 = delayed(add, pure=True)(x, (4, 5, 6))
    n = delayed(len, pure=True)(x)
    assert delayed(len, pure=True)(x).key == n.key
    assert x2.compute() == (1, 2, 3, 4, 5, 6)
    assert compute(n, x2, x) == (3, (1, 2, 3, 4, 5, 6), (1, 2, 3))
def test_delayed():
    add2 = delayed(add)
    assert add2(1, 2).compute() == 3
    assert (add2(1, 2) + 3).compute() == 6
    assert add2(add2(1, 2), 3).compute() == 6

    a = delayed(1)
    b = add2(add2(a, 2), 3)
    assert a.key in b.dask
def test_np_dtype_of_delayed():
    # This used to result in a segfault due to recursion, see
    # https://github.com/dask/dask/pull/4374#issuecomment-454381465
    np = pytest.importorskip('numpy')
    x = delayed(1)
    with pytest.raises(TypeError):
        np.dtype(x)
    assert delayed(np.array([1], dtype='f8')).dtype.compute() == np.dtype('f8')
def test_from_delayed_sorted():
    a = pd.DataFrame({'x': [1, 2]}, index=[1, 10])
    b = pd.DataFrame({'x': [4, 1]}, index=[100, 200])

    A = dd.from_delayed([delayed(a), delayed(b)], divisions='sorted')
    assert A.known_divisions
    assert A.divisions == (1, 100, 200)
def test_delayed_name():
    assert delayed(1)._key.startswith('int-')
    assert delayed(1, pure=True)._key.startswith('int-')
    assert delayed(1, name='X')._key == 'X'

    def myfunc(x):
        return x + 1

    assert delayed(myfunc)(1).key.startswith('myfunc')
def test_kwargs():
    def mysum(a, b, c=(), **kwargs):
        return a + b + sum(c) + sum(kwargs.values())

    dmysum = delayed(mysum)
    ten = dmysum(1, 2, c=[delayed(3), 0], four=dmysum(2, 2))
    assert ten.compute() == 10

    dmysum = delayed(mysum, pure=True)
    ten = dmysum(1, 2, c=[delayed(3), 0], four=dmysum(2, 2))
    assert ten.compute() == 10
def test_operators():
    a = delayed([1, 2, 3])
    assert a[0].compute() == 1
    assert (a + a).compute() == [1, 2, 3, 1, 2, 3]

    a = delayed(10)
    assert (a + 1).compute() == 11
    assert (1 + a).compute() == 11
    assert (a >> 1).compute() == 5
    assert (a > 2).compute()
    assert (a ** 2).compute() == 100
def test_iterators():
    a = delayed(1)
    b = delayed(2)
    c = delayed(sum)(iter([a, b]))
    assert c.compute() == 3

    def f(seq):
        return sum(seq)

    c = delayed(f)(iter([a, b]))
    assert c.compute() == 3
def test_from_delayed():
    from dask.delayed import delayed
    a, b, c = delayed([1, 2, 3]), delayed([4, 5, 6]), delayed([7, 8, 9])
    bb = from_delayed([a, b, c])
    assert bb.name == from_delayed([a, b, c]).name
    assert isinstance(bb, Bag)
    assert list(bb) == [1, 2, 3, 4, 5, 6, 7, 8, 9]

    asum_value = delayed(lambda X: sum(X))(a)
    asum_item = db.Item.from_delayed(asum_value)
    assert asum_value.compute() == asum_item.compute() == 6
def test_from_delayed():
    df = pd.DataFrame(data=np.random.normal(size=(10, 4)), columns=list('abcd'))
    parts = [df.iloc[:1], df.iloc[1:3], df.iloc[3:6], df.iloc[6:10]]
    dfs = [delayed(parts.__getitem__)(i) for i in range(4)]
    meta = dfs[0].compute()

    my_len = lambda x: pd.Series([len(x)])

    for divisions in [None, [0, 1, 3, 6, 10]]:
        ddf = dd.from_delayed(dfs, meta=meta, divisions=divisions)
        assert_eq(ddf, df)
        assert list(ddf.map_partitions(my_len).compute()) == [1, 2, 3, 4]
        assert ddf.known_divisions == (divisions is not None)

        s = dd.from_delayed([d.a for d in dfs], meta=meta.a,
                            divisions=divisions)
        assert_eq(s, df.a)
        assert list(s.map_partitions(my_len).compute()) == [1, 2, 3, 4]
        assert ddf.known_divisions == (divisions is not None)

    meta2 = [(c, 'f8') for c in df.columns]
    assert_eq(dd.from_delayed(dfs, meta=meta2), df)
    assert_eq(dd.from_delayed([d.a for d in dfs], meta=('a', 'f8')), df.a)

    with pytest.raises(ValueError):
        dd.from_delayed(dfs, meta=meta, divisions=[0, 1, 3, 6])

    with pytest.raises(ValueError) as e:
        dd.from_delayed(dfs, meta=meta.a).compute()
    assert str(e.value).startswith('Metadata mismatch found in `from_delayed`')
def test_methods():
    a = delayed("a b c d e")
    assert a.split(' ').compute() == ['a', 'b', 'c', 'd', 'e']
    assert a.upper().replace('B', 'A').split().count('A').compute() == 2
    assert a.split(' ', pure=True).key == a.split(' ', pure=True).key
    o = a.split(' ', dask_key_name='test')
    assert o.key == 'test'
def test_compute():
    a = delayed(1) + 5
    b = a + 1
    c = a + 2
    assert compute(b, c) == (7, 8)
    assert compute(b) == (7,)
    assert compute([a, b], c) == ([6, 7], 8)
def compute_n_splits(cv, X, y=None, groups=None):
    """Return the number of splits.

    Parameters
    ----------
    cv : BaseCrossValidator
    X, y, groups : array_like, dask object, or None

    Returns
    -------
    n_splits : int
    """
    if not any(is_dask_collection(i) for i in (X, y, groups)):
        return cv.get_n_splits(X, y, groups)

    if isinstance(cv, (_BaseKFold, BaseShuffleSplit)):
        return cv.n_splits

    elif isinstance(cv, PredefinedSplit):
        return len(cv.unique_folds)

    elif isinstance(cv, _CVIterableWrapper):
        return len(cv.cv)

    elif isinstance(cv, (LeaveOneOut, LeavePOut)) and not is_dask_collection(X):
        # Only `X` is referenced for these classes
        return cv.get_n_splits(X, None, None)

    elif (isinstance(cv, (LeaveOneGroupOut, LeavePGroupsOut)) and not
          is_dask_collection(groups)):
        # Only `groups` is referenced for these classes
        return cv.get_n_splits(None, None, groups)

    else:
        return delayed(cv).get_n_splits(X, y, groups).compute()
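# Hedged usage sketch for compute_n_splits above (not part of the original
# source): for a KFold-style cross-validator the answer comes straight from
# the `n_splits` attribute, so passing a dask collection triggers no
# computation. Assumes dask.array and scikit-learn are installed;
# `example_compute_n_splits` is an illustrative name.
import dask.array as da
from sklearn.model_selection import KFold


def example_compute_n_splits():
    X = da.ones((100, 5), chunks=20)  # dask collection, never materialized
    assert compute_n_splits(KFold(n_splits=4), X) == 4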
def test_delayed_callable():
    f = delayed(add, pure=True)
    v = f(1, 2)
    assert v.dask == {v.key: (add, 1, 2)}

    assert f.dask == {f.key: add}
    assert f.compute() == add
def open_files(path, hdfs=None, lazy=None, **auth):
    if lazy is not None:
        raise DeprecationWarning("Lazy keyword has been deprecated. "
                                 "Now always lazy")
    hdfs = hdfs or HDFileSystem(**auth)
    filenames = sorted(hdfs.glob(path))
    myopen = delayed(hdfs_open_file)
    return [myopen(fn, auth) for fn in filenames]
def test_keys_from_array():
    da = pytest.importorskip('dask.array')
    from dask.array.utils import _check_dsk

    X = da.ones((10, 10), chunks=5).to_delayed().flatten()
    xs = [delayed(inc)(x) for x in X]

    _check_dsk(xs[0].dask)
def test_mesos_is_delayed():
    def add(x, y):
        return x + y

    add1 = delayed(add)
    add2 = mesos(add)

    assert isinstance(add2, add1.__class__)
    assert add1(2, 3).compute() == add2(2, 3).compute()
def test_nout():
    func = delayed(lambda x: (x, -x), nout=2, pure=True)
    x = func(1)
    assert len(x) == 2
    a, b = x
    assert compute(a, b) == (1, -1)
    assert a._length is None
    assert b._length is None
    pytest.raises(TypeError, lambda: len(a))
    pytest.raises(TypeError, lambda: list(a))

    pytest.raises(ValueError, lambda: delayed(add, nout=-1))
    pytest.raises(ValueError, lambda: delayed(add, nout=True))

    func = delayed(add, nout=1)
    a = func(1)
    assert a._length is None
    pytest.raises(TypeError, lambda: list(a))
    pytest.raises(TypeError, lambda: len(a))
def test_to_task_dask():
    with warnings.catch_warnings(record=True):
        a = delayed(1, name='a')
        b = delayed(2, name='b')

        task, dask = to_task_dask([a, b, 3])
        assert task == ['a', 'b', 3]

        task, dask = to_task_dask((a, b, 3))
        assert task == (tuple, ['a', 'b', 3])
        assert dict(dask) == merge(a.dask, b.dask)

        task, dask = to_task_dask({a: 1, b: 2})
        assert (task == (dict, [['b', 2], ['a', 1]]) or
                task == (dict, [['a', 1], ['b', 2]]))
        assert dict(dask) == merge(a.dask, b.dask)

        f = namedtuple('f', ['x', 'y'])
        x = f(1, 2)
        task, dask = to_task_dask(x)
        assert task == x
        assert dict(dask) == {}

        task, dask = to_task_dask(slice(a, b, 3))
        assert task == (slice, 'a', 'b', 3)
        assert dict(dask) == merge(a.dask, b.dask)

        # Issue https://github.com/dask/dask/issues/2107
        class MyClass(dict):
            pass

        task, dask = to_task_dask(MyClass())
        assert type(task) is MyClass
        assert dict(dask) == {}

        # Custom dask objects
        x = Tuple({'a': 1, 'b': 2, 'c': (add, 'a', 'b')}, ['a', 'b', 'c'])
        task, dask = to_task_dask(x)
        assert task in dask
        f = dask.pop(task)
        assert f == (tuple, ['a', 'b', 'c'])
        assert dask == x._dask
def test_finalize_name():
    import dask.array as da
    x = da.ones(10, chunks=5)
    v = delayed([x])
    assert set(x.dask).issubset(v.dask)

    def key(s):
        if isinstance(s, tuple):
            s = s[0]
        return s.split('-')[0]

    assert all(key(k).isalpha() for k in v.dask)
def test_delayed_errors():
    a = delayed([1, 2, 3])
    # Immutable
    pytest.raises(TypeError, lambda: setattr(a, 'foo', 1))
    pytest.raises(TypeError, lambda: setitem(a, 1, 0))
    # Can't iterate, or check if contains
    pytest.raises(TypeError, lambda: 1 in a)
    pytest.raises(TypeError, lambda: list(a))
    # No dynamic generation of magic/hidden methods
    pytest.raises(AttributeError, lambda: a._hidden())
    # Truth of delayed forbidden
    pytest.raises(TypeError, lambda: bool(a))
def test_array_delayed():
    np = pytest.importorskip('numpy')
    da = pytest.importorskip('dask.array')

    arr = np.arange(100).reshape((10, 10))
    darr = da.from_array(arr, chunks=(5, 5))
    val = delayed(sum)([arr, darr, 1])
    assert isinstance(val, Delayed)
    assert np.allclose(val.compute(), arr + arr + 1)
    assert val.sum().compute() == (arr + arr + 1).sum()
    assert val[0, 0].compute() == (arr + arr + 1)[0, 0]

    task, dsk = to_task_dask(darr)
    orig = set(darr.dask)
    final = set(dsk)
    assert orig.issubset(final)
    diff = final.difference(orig)
    assert len(diff) == 1

    delayed_arr = delayed(darr)
    assert (delayed_arr.compute() == arr).all()
def test_delayed_picklable():
    # Delayed
    x = delayed(divmod, nout=2, pure=True)(1, 2)
    y = pickle.loads(pickle.dumps(x))
    assert x.dask == y.dask
    assert x._key == y._key
    assert x._length == y._length
    # DelayedLeaf
    x = delayed(1j + 2)
    y = pickle.loads(pickle.dumps(x))
    assert x.dask == y.dask
    assert x._key == y._key
    assert x._nout == y._nout
    assert x._pure == y._pure
    # DelayedAttr
    x = x.real
    y = pickle.loads(pickle.dumps(x))
    assert x._obj._key == y._obj._key
    assert x._obj.dask == y._obj.dask
    assert x._attr == y._attr
    assert x._key == y._key
def test_traverse_false():
    # Create a list with a dask value, and test that it's not computed
    def fail(*args):
        raise ValueError("shouldn't have computed")

    a = delayed(fail)()

    # list
    x = [a, 1, 2, 3]
    res = delayed(x, traverse=False).compute()
    assert len(res) == 4
    assert res[0] is a
    assert res[1:] == x[1:]

    # tuple that looks like a task
    x = (fail, a, (fail, a))
    res = delayed(x, traverse=False).compute()
    assert isinstance(res, tuple)
    assert res[0] == fail
    assert res[1] is a

    # list containing task-like-things
    x = [1, (fail, a), a]
    res = delayed(x, traverse=False).compute()
    assert isinstance(res, list)
    assert res[0] == 1
    assert res[1][0] == fail and res[1][1] is a
    assert res[2] is a

    # traverse=False still hits top level
    b = delayed(1)
    x = delayed(b, traverse=False)
    assert x.compute() == 1
def test_callable_obj():
    class Foo(object):
        def __init__(self, a):
            self.a = a

        def __call__(self):
            return 2

    foo = Foo(1)
    f = delayed(foo)
    assert f.compute() is foo
    assert f.a.compute() == 1
    assert f().compute() == 2
def test_key_names_include_function_names():
    def myfunc(x):
        return x + 1

    assert delayed(myfunc)(1).key.startswith('myfunc')
def test_method_getattr_call_same_task():
    a = delayed([1, 2, 3])
    o = a.index(1)
    # Don't getattr the method, then call in separate task
    assert getattr not in set(v[0] for v in o.__dask_graph__().values())
def test_key_names_include_type_names():
    assert delayed(1).key.startswith('int')
def compute(*args, **kwargs):
    """Compute several dask collections at once.

    Parameters
    ----------
    args : object
        Any number of objects. If it is a dask object, it's computed and the
        result is returned. By default, python builtin collections are also
        traversed to look for dask objects (for more information see the
        ``traverse`` keyword). Non-dask arguments are passed through
        unchanged.
    traverse : bool, optional
        By default dask traverses builtin python collections looking for dask
        objects passed to ``compute``. For large collections this can be
        expensive. If none of the arguments contain any dask objects, set
        ``traverse=False`` to avoid doing this traversal.
    get : callable, optional
        A scheduler ``get`` function to use. If not provided, the default is
        to check the global settings first, and then fall back to defaults
        for the collections.
    optimize_graph : bool, optional
        If True [default], the optimizations for each collection are applied
        before computation. Otherwise the graph is run as is. This can be
        useful for debugging.
    kwargs
        Extra keywords to forward to the scheduler ``get`` function.

    Examples
    --------
    >>> import dask.array as da
    >>> a = da.arange(10, chunks=2).sum()
    >>> b = da.arange(10, chunks=2).mean()
    >>> compute(a, b)
    (45, 4.5)

    By default, dask objects inside python collections will also be computed:

    >>> compute({'a': a, 'b': b, 'c': 1})  # doctest: +SKIP
    ({'a': 45, 'b': 4.5, 'c': 1},)
    """
    from dask.delayed import delayed
    traverse = kwargs.pop('traverse', True)
    if traverse:
        args = tuple(delayed(a)
                     if isinstance(a, (list, set, tuple, dict, Iterator))
                     else a for a in args)

    optimize_graph = kwargs.pop('optimize_graph', True)
    variables = [a for a in args if isinstance(a, Base)]
    if not variables:
        return args

    get = kwargs.pop('get', None) or _globals['get']

    if not get:
        get = variables[0]._default_get
        if not all(a._default_get == get for a in variables):
            raise ValueError("Compute called on multiple collections with "
                             "differing default schedulers. Please specify a "
                             "scheduler `get` function using either "
                             "the `get` kwarg or globally with `set_options`.")

    dsk = collections_to_dsk(variables, optimize_graph, **kwargs)
    keys = [var._keys() for var in variables]
    results = get(dsk, keys, **kwargs)

    results_iter = iter(results)
    return tuple(a if not isinstance(a, Base)
                 else a._finalize(next(results_iter))
                 for a in args)
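# Hedged usage sketch for the `traverse` keyword documented above (not part
# of the original source). Traversal lets `compute` find dask objects inside
# builtin containers; skipping it returns non-dask arguments unchanged, which
# is cheaper for large collections known to hold no dask objects.
# `example_traverse` is an illustrative name.
from dask import delayed


def example_traverse():
    d = delayed(1) + 1
    # Traversed (default): the list is wrapped in `delayed`, so the inner
    # delayed object is computed along with its container.
    (computed_list,) = compute([d, 2, 3])
    assert computed_list == [2, 2, 3]
    # Not traversed: no dask objects are found at the top level, so the
    # argument comes back as-is without building a graph.
    (untouched,) = compute([1, 2, 3], traverse=False)
    assert untouched == [1, 2, 3]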
def test_literates_keys():
    a = delayed(1)
    b = a + 1
    lit = (a, b, 3)
    assert delayed(lit).key != delayed(lit).key
    assert delayed(lit, pure=True).key == delayed(lit, pure=True).key
def test_named_value():
    assert 'X' in delayed(1, name='X').dask
        return tokenize(self.based_on)

    __dask_scheduler__ = staticmethod(get2)
    __dask_optimize__ = globalmethod(
        dont_optimize,
        key="collection_optim",
        falsey=dont_optimize,
    )


def increment_(x: int) -> int:
    return x + 1


increment: Delayed = delayed(increment_)


def assert_isinstance(coll: DaskCollection, protocol: Any) -> None:
    assert isinstance(coll, protocol)


@pytest.mark.parametrize("protocol", [DaskCollection, HLGDaskCollection])
def test_isinstance_core(protocol: Any) -> None:
    from dask.array import Array
    from dask.bag import Bag
    from dask.dataframe import DataFrame

    arr: Array = da.ones(10)
    bag: Bag = db.from_sequence([1, 2, 3, 4, 5], npartitions=2)
    df: DataFrame = dds.timeseries()
def test_sensitive_to_partials():
    assert (delayed(partial(add, 10), pure=True)(2)._key !=
            delayed(partial(add, 20), pure=True)(2)._key)
def test_delayed_picklable():
    x = delayed(1)
    y = pickle.loads(pickle.dumps(x))
    assert x.dask == y.dask
    assert x._key == y._key
def test_delayed_method_descriptor():
    delayed(bytes.decode)(b"")  # does not err
def test_delayed_name_on_call():
    f = delayed(add, pure=True)
    assert f(1, 2, dask_key_name="foo")._key == "foo"
def test_delayed_compute_forward_kwargs():
    x = delayed(1) + 2
    x.compute(bogus_keyword=10)
def test_nout_with_tasks(x):
    length = len(x)
    d = delayed(x, nout=length)
    assert len(d) == len(list(d)) == length
    assert d.compute() == x
def test_lists_are_concrete():
    a = delayed(1)
    b = delayed(2)
    c = delayed(max)([[a, 10], [b, 20]], key=lambda x: x[0])[1]

    assert c.compute() == 20
def test_value():
    v = delayed(1)
    assert v.compute() == 1
    assert 1 in v.dask.values()
def test_attr_optimize():
    # Check that attribute access is inlined
    a = delayed([1, 2, 3])
    o = a.index(1)
    dsk = o._optimize(o.dask, o._keys())
    assert getattr not in set(v[0] for v in dsk.values())
def test_common_subexpressions():
    a = delayed([1, 2, 3])
    res = a[0] + a[0]
    assert a[0].key in res.dask
    assert a.key in res.dask
    assert len(res.dask) == 3
@delayed
def modlevel_delayed1(x):
    return x + 1


@delayed(pure=False)
def modlevel_delayed2(x):
    return x + 1


@pytest.mark.parametrize(
    "f",
    [
        delayed(modlevel_eager),
        pytest.param(modlevel_delayed1, marks=pytest.mark.xfail(reason="#3369")),
        pytest.param(modlevel_delayed2, marks=pytest.mark.xfail(reason="#3369")),
    ],
)
def test_pickle(f):
    d = f(2)
    d = pickle.loads(pickle.dumps(d, protocol=pickle.HIGHEST_PROTOCOL))
    assert d.compute() == 3


@pytest.mark.parametrize(
    "f", [delayed(modlevel_eager), modlevel_delayed1, modlevel_delayed2])
def test_cloudpickle(f):
async def run():
    number_of_cores_per_node = 16  # DAS-5 features 2x8 NUMA cores per compute node
    reservation_length = "08:00:00"  # 8 hours is more than enough... probably
    cluster = SLURMCluster(cores=number_of_cores_per_node,
                           memory="64 GB",
                           processes=4,
                           scheduler_options={"dashboard_address": ":6868"},
                           local_directory="./aip-logs",
                           interface='ib0',
                           walltime=reservation_length)

    # Grab 5 execution nodes -> 80 cores
    print("Scaling up, getting 5 nodes")
    cluster.scale_up(5)
    client = Client(cluster)

    print("Client is ready, parsing data files...")

    file_locations = "/var/scratch/lvs215/aip_tmp"
    data_files = []

    # Create a list of all the files we want to parse. Skip the compressed
    # sources if they are still lingering around.
    for path, subdirs, files in os.walk(file_locations):
        for name in files:
            if isfile(os.path.join(path, name)) and not name.endswith(("gz", "zip", "tar")):
                data_files.append(os.path.join(path, name))

    client.run(clear_all_files)

    # Create one task per file.
    print(data_files)
    print("Creating and executing tasks...")
    tasks = list(map(delayed(process_file), data_files))
    true_false_array = db.from_delayed(tasks)

    # DEBUG CODE
    # future = client.compute(true_false_array)
    # client.recreate_error_locally(future)

    # Time to compute them!
    start = datetime.datetime.now()
    res = true_false_array.compute()
    end = datetime.datetime.now()
    print(true_false_array)
    print(res)
    print("Tasks ran to completion! Copying databases.")

    # Check the computed results rather than the bag itself, which would
    # trigger a second computation.
    if False not in res:
        # If everything went alright, let all nodes copy their databases to the home dir.
        client.run(copy_database_to_home_folder)
        client.run(clear_all_files)
    else:
        print("Parsing one of the files went horribly wrong, quitting!")
        exit(-1)

    print("Beginning assembling of all databases into one!")
    # Now, each of the nodes has a local database file; we will combine these
    # databases into one. We do this sequentially, because we are not sure yet
    # whether SQLite likes it if all nodes do this in parallel.
    # TODO: test if we can do this procedure in each node through
    # copy_database_to_home_folder; it would save copying data.
    database_manager = DatabaseManager()  # This creates an empty aip.db if it doesn't exist.
    con3 = database_manager.db  # Reuse the connection

    # based on https://stackoverflow.com/a/37138506
    os.makedirs(db_files_location, exist_ok=True)
    for file in [os.path.join(db_files_location, f)
                 for f in os.listdir(db_files_location)
                 if isfile(os.path.join(db_files_location, f)) and f.endswith(".db")]:
        con3.execute("ATTACH '{}' as dba".format(file))
        con3.execute("BEGIN")
        for row in con3.execute("SELECT * FROM dba.sqlite_master WHERE type='table'"):
            combine = "INSERT INTO " + row[1] + " SELECT * FROM dba." + row[1]
            print(combine)
            con3.execute(combine)
        con3.execute("detach database dba")
        con3.commit()
        # Now, delete the database as it has been copied.
        # os.remove("{}.db".format(hash(worker)))

    print("All done. Releasing all nodes.")
    await cluster.scale_down(cluster.workers)
    print("Nodes released.")
    print(end - start)
def to_sql(
    df,
    name: str,
    uri: str,
    schema=None,
    if_exists: str = "fail",
    index: bool = True,
    index_label=None,
    chunksize=None,
    dtype=None,
    method=None,
    compute=True,
    parallel=False,
    engine_kwargs=None,
):
    """Store Dask Dataframe to a SQL table

    An empty table is created based on the "meta" DataFrame (and conforming
    to the caller's "if_exists" preference), and then each block calls
    pd.DataFrame.to_sql (with `if_exists="append"`).

    Databases supported by SQLAlchemy [1]_ are supported. Tables can be
    newly created, appended to, or overwritten.

    Parameters
    ----------
    name : str
        Name of SQL table.
    uri : string
        Full sqlalchemy URI for the database connection
    schema : str, optional
        Specify the schema (if database flavor supports this). If None, use
        default schema.
    if_exists : {'fail', 'replace', 'append'}, default 'fail'
        How to behave if the table already exists.

        * fail: Raise a ValueError.
        * replace: Drop the table before inserting new values.
        * append: Insert new values to the existing table.

    index : bool, default True
        Write DataFrame index as a column. Uses `index_label` as the column
        name in the table.
    index_label : str or sequence, default None
        Column label for index column(s). If None is given (default) and
        `index` is True, then the index names are used.
        A sequence should be given if the DataFrame uses MultiIndex.
    chunksize : int, optional
        Specify the number of rows in each batch to be written at a time.
        By default, all rows will be written at once.
    dtype : dict or scalar, optional
        Specifying the datatype for columns. If a dictionary is used, the
        keys should be the column names and the values should be the
        SQLAlchemy types or strings for the sqlite3 legacy mode. If a
        scalar is provided, it will be applied to all columns.
    method : {None, 'multi', callable}, optional
        Controls the SQL insertion clause used:

        * None : Uses standard SQL ``INSERT`` clause (one per row).
        * 'multi': Pass multiple values in a single ``INSERT`` clause.
        * callable with signature ``(pd_table, conn, keys, data_iter)``.

        Details and a sample callable implementation can be found in the
        section :ref:`insert method <io.sql.method>`.
    compute : bool, default True
        When true, call dask.compute and perform the load into SQL; otherwise,
        return a Dask object (or array of per-block objects when
        parallel=True)
    parallel : bool, default False
        When true, have each block append itself to the DB table concurrently.
        This can result in DB rows being in a different order than the source
        DataFrame's corresponding rows. When false, load each block into the
        SQL DB in sequence.
    engine_kwargs : dict or None
        Specific db engine parameters for sqlalchemy

    Raises
    ------
    ValueError
        When the table already exists and `if_exists` is 'fail' (the
        default).

    See Also
    --------
    read_sql : Read a DataFrame from a table.

    Notes
    -----
    Timezone aware datetime columns will be written as
    ``Timestamp with timezone`` type with SQLAlchemy if supported by the
    database. Otherwise, the datetimes will be stored as timezone unaware
    timestamps local to the original timezone.

    .. versionadded:: 0.24.0

    References
    ----------
    .. [1] https://docs.sqlalchemy.org
    .. [2] https://www.python.org/dev/peps/pep-0249/

    Examples
    --------
    Create a table from scratch with 4 rows.

    >>> import pandas as pd
    >>> import dask.dataframe as dd
    >>> df = pd.DataFrame([{'i': i, 's': str(i) * 2} for i in range(4)])
    >>> ddf = dd.from_pandas(df, npartitions=2)
    >>> ddf  # doctest: +SKIP
    Dask DataFrame Structure:
                       i       s
    npartitions=2
    0              int64  object
    2                ...     ...
    3                ...     ...
    Dask Name: from_pandas, 2 tasks

    >>> from dask.utils import tmpfile
    >>> from sqlalchemy import create_engine
    >>> with tmpfile() as f:
    ...     db = 'sqlite:///%s' % f
    ...     ddf.to_sql('test', db)
    ...     engine = create_engine(db, echo=False)
    ...     result = engine.execute("SELECT * FROM test").fetchall()
    >>> result
    [(0, 0, '00'), (1, 1, '11'), (2, 2, '22'), (3, 3, '33')]
    """
    if not isinstance(uri, str):
        raise ValueError(f"Expected URI to be a string, got {type(uri)}.")

    # This is the only argument we add on top of what Pandas supports
    kwargs = dict(
        name=name,
        uri=uri,
        engine_kwargs=engine_kwargs,
        schema=schema,
        if_exists=if_exists,
        index=index,
        index_label=index_label,
        chunksize=chunksize,
        dtype=dtype,
        method=method,
    )

    meta_task = delayed(_to_sql_chunk)(df._meta, **kwargs)

    # Partitions should always append to the empty table created from `meta`
    # above
    worker_kwargs = dict(kwargs, if_exists="append")

    if parallel:
        # Perform the meta insert, then one task that inserts all blocks
        # concurrently:
        result = [
            _extra_deps(
                _to_sql_chunk,
                d,
                extras=meta_task,
                **worker_kwargs,
                dask_key_name="to_sql-%s" % tokenize(d, **worker_kwargs),
            )
            for d in df.to_delayed()
        ]
    else:
        # Chain the "meta" insert and each block's insert
        result = []
        last = meta_task
        for d in df.to_delayed():
            result.append(
                _extra_deps(
                    _to_sql_chunk,
                    d,
                    extras=last,
                    **worker_kwargs,
                    dask_key_name="to_sql-%s" % tokenize(d, **worker_kwargs),
                )
            )
            last = result[-1]
    result = delayed(result)

    if compute:
        dask_compute(result)
    else:
        return result
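# Hedged usage sketch for the parallel/sequential distinction above (not part
# of the original source). With parallel=False each partition's insert waits
# on the previous one, so rows land in partition order; with parallel=True the
# inserts run concurrently and row order is not guaranteed. The sqlite URI and
# table names here are illustrative only.
import pandas as pd
import dask.dataframe as dd


def example_to_sql(uri="sqlite:///example.db"):
    df = pd.DataFrame({"i": range(8), "s": [str(i) * 2 for i in range(8)]})
    ddf = dd.from_pandas(df, npartitions=4)
    # Sequential (default): partition inserts are chained one after another.
    ddf.to_sql("seq_table", uri, if_exists="replace")
    # Concurrent: potentially faster, but table row order may differ from df.
    ddf.to_sql("par_table", uri, if_exists="replace", parallel=True)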
def test_attributes():
    a = delayed(2 + 1j)
    assert a.real._key == a.real._key
    assert a.real.compute() == 2
    assert a.imag.compute() == 1
    assert (a.real + a.imag).compute() == 3
def test_value_name():
    assert delayed(1)._key.startswith('int-')
    assert delayed(1, pure=True)._key.startswith('int-')
    gs.visualize(filename=os.path.join(d, "mydask"))
    assert os.path.exists(os.path.join(d, "mydask.png"))

    # Doesn't work if not fitted
    gs = dcv.GridSearchCV(clf, grid)
    with pytest.raises(NotFittedError):
        gs.visualize()


np_X = np.random.normal(size=(20, 3))
np_y = np.random.randint(2, size=20)
np_groups = np.random.permutation(list(range(5)) * 4)
da_X = da.from_array(np_X, chunks=(3, 3))
da_y = da.from_array(np_y, chunks=3)
da_groups = da.from_array(np_groups, chunks=3)
del_X = delayed(np_X)
del_y = delayed(np_y)
del_groups = delayed(np_groups)


@pytest.mark.parametrize(
    ["cls", "has_shuffle"],
    [
        (KFold, True),
        (GroupKFold, False),
        (StratifiedKFold, True),
        (TimeSeriesSplit, False),
    ],
)
def test_kfolds(cls, has_shuffle):
    assert tokenize(cls(n_splits=3)) == tokenize(cls(n_splits=3))
def _get_random_state(self):
    i_subs = next(substreams)
    return delayed(get_substream_state, pure=True)(self.seed, i_subs)
for i in block_size:  # changing blocks
    for j in range(1, 6):  # changing files (5 files per block size)
        # Create a new file
        longXTC1 = 'newtraj{}.xtc'.format(ii)
        copyfile(longXTC, longXTC1)

        # Provide the path to my file to all processes
        my_path = os.path.normpath(os.path.join(os.getcwd(), longXTC1))
        # print (my_path)
        longXTC1 = os.path.abspath(my_path)

        # Define a new universe with the new trajectory
        u = mda.Universe(PSF, longXTC1)
        print(u)
        print("frames in trajectory ", u.trajectory.n_frames)
        print(len(u.trajectory))
        mobile = u.select_atoms(
            "(resid 1:29 or resid 60:121 or resid 160:214) and name CA")
        index = mobile.indices

        total = com_parallel_dask_distributed(mobile, index, i)
        total = delayed(total)
        start = time.time()
        output = total.compute(get=c.get)
        tot_time = time.time() - start
        file.write("XTC{} {} {} {} {} {} {} {}\n".format(
            k, i, j, output[1], output[2], output[3], output[4], tot_time))
        file.flush()

        # Deleting all files
        os.remove('newtraj{}.xtc'.format(ii))
        ii = ii + 1
def test_attribute_of_attribute():
    x = delayed(123)
    assert isinstance(x.a, Delayed)
    assert isinstance(x.a.b, Delayed)
    assert isinstance(x.a.b.c, Delayed)
def test_persist_delayedleaf():
    x = delayed(1)
    (xx,) = persist(x)
    assert isinstance(xx, Delayed)
    assert xx.compute() == 1
def test_lists():
    a = delayed(1)
    b = delayed(2)
    c = delayed(sum)([a, b])
    assert c.compute() == 3
def read_data(self, node_name, sl):
    name = node_name + "-data"
    key = make_key(name, sl)
    return delayed(self._read_data(node_name, sl), name=key, pure=True)