Code Example #1
def test_str_graph():
    dsk = {"x": 1}
    assert str_graph(dsk) == dsk

    dsk = {("x", 1): (inc, 1)}
    assert str_graph(dsk) == {str(("x", 1)): (inc, 1)}

    dsk = {("x", 1): (inc, 1), ("x", 2): (inc, ("x", 1))}
    assert str_graph(dsk) == {
        str(("x", 1)): (inc, 1),
        str(("x", 2)): (inc, str(("x", 1))),
    }

    dsks = [
        {"x": 1},
        {("x", 1): (inc, 1), ("x", 2): (inc, ("x", 1))},
        {("x", 1): (sum, [1, 2, 3]), ("x", 2): (sum, [("x", 1), ("x", 1)])},
    ]
    for dsk in dsks:
        sdsk = str_graph(dsk)
        keys = list(dsk)
        skeys = [str(k) for k in keys]
        assert all(isinstance(k, str) for k in sdsk)
        assert dask.get(dsk, keys) == dask.get(sdsk, skeys)

    dsk = {("y", 1): (SubgraphCallable({"x": ("y", 1)}, "x", (("y", 1),)), (("z", 1),))}
    dsk = str_graph(dsk, extra_values=(("z", 1),))
    assert dsk["('y', 1)"][0].dsk["x"] == "('y', 1)"
    assert dsk["('y', 1)"][1][0] == "('z', 1)"
Code Example #2
def test_identical_nodes(
    optimizer: Tuple[
        str,
        Callable[[Dict[Hashable, Any], Union[Hashable, Iterable[Hashable]]], Dict[Hashable, Any]],
    ]
) -> None:
    """Small test for the presence of identical nodes."""
    cache_dir, graphchain_optimize = optimizer

    def foo(x: int) -> int:
        return x + 1

    def bar(*args: int) -> int:
        return sum(args)

    dsk = {"foo1": (foo, 1), "foo2": (foo, 1), "top1": (bar, "foo1", "foo2")}

    # First run
    newdsk = graphchain_optimize(dsk, ["top1"])  # type: ignore[arg-type]
    result = dask.get(newdsk, ["top1"])
    assert result == (4,)

    # Second run
    newdsk = graphchain_optimize(dsk, ["top1"])  # type: ignore[arg-type]
    result = dask.get(newdsk, ["top1"])
    assert result == (4,)
Code Example #3
File: test_utils.py  Project: wanjinchang/distributed
def test_str_graph():
    dsk = {b'x': 1}
    assert str_graph(dsk) == dsk

    dsk = {('x', 1): (inc, 1)}
    assert str_graph(dsk) == {str(('x', 1)): (inc, 1)}

    dsk = {('x', 1): (inc, 1), ('x', 2): (inc, ('x', 1))}
    assert str_graph(dsk) == {
        str(('x', 1)): (inc, 1),
        str(('x', 2)): (inc, str(('x', 1)))
    }

    dsks = [{
        'x': 1
    }, {
        ('x', 1): (inc, 1),
        ('x', 2): (inc, ('x', 1))
    }, {
        ('x', 1): (sum, [1, 2, 3]),
        ('x', 2): (sum, [('x', 1), ('x', 1)])
    }]
    for dsk in dsks:
        sdsk = str_graph(dsk)
        keys = list(dsk)
        skeys = [str(k) for k in keys]
        assert all(isinstance(k, (str, bytes)) for k in sdsk)
        assert dask.get(dsk, keys) == dask.get(sdsk, skeys)
Code Example #4
File: test_graphchain.py  Project: pnyczyk/graphchain
def test_cache_deletion(
        dask_graph: Dict[Hashable, Any],
        optimizer: Tuple[
            str,
            Callable[[Dict[Hashable, Any]], Dict[Hashable, Any]]]) \
        -> None:
    """Test cache deletion.

    Tests the ability to obtain results in the event that cache files are
    deleted (in the event of a cache-miss, the exec-store wrapper should be
    re-run by the load-wrapper).
    """
    dsk = dask_graph
    cache_dir, graphchain_optimize = optimizer
    storage = fs.osfs.OSFS(cache_dir)

    # Cleanup first
    storage.removetree("/")

    # Run optimizer (first time)
    newdsk = graphchain_optimize(dsk, keys=["top1"])  # type: ignore
    result = dask.get(newdsk, ["top1"])

    newdsk = graphchain_optimize(dsk, keys=["top1"])  # type: ignore
    result = dask.get(newdsk, ["top1"])

    # Check the final result
    assert result == (-14, )
Code Example #5
File: test_bag.py  Project: serazing/dask
def test_groupby_tasks():
    b = db.from_sequence(range(160), npartitions=4)
    out = b.groupby(lambda x: x % 10, max_branch=4, method='tasks')
    partitions = dask.get(out.dask, out._keys())

    for a in partitions:
        for b in partitions:
            if a is not b:
                assert not set(pluck(0, a)) & set(pluck(0, b))

    b = db.from_sequence(range(1000), npartitions=100)
    out = b.groupby(lambda x: x % 123, method='tasks')
    assert len(out.dask) < 100**2
    partitions = dask.get(out.dask, out._keys())

    for a in partitions:
        for b in partitions:
            if a is not b:
                assert not set(pluck(0, a)) & set(pluck(0, b))

    b = db.from_sequence(range(10000), npartitions=345)
    out = b.groupby(lambda x: x % 2834, max_branch=24, method='tasks')
    partitions = dask.get(out.dask, out._keys())

    for a in partitions:
        for b in partitions:
            if a is not b:
                assert not set(pluck(0, a)) & set(pluck(0, b))
Code Example #6
File: test_order.py  Project: martindurant/dask
def test_run_smaller_sections(abcde):
    """
            aa
           / |
      b   d  bb dd
     / \ /|  | /
    a   c e  cc

    Prefer to run acb first because then we can get that out of the way
    """
    a, b, c, d, e = abcde
    aa, bb, cc, dd = [x * 2 for x in [a, b, c, d]]

    expected = [a, c, b, e, d, cc, bb, aa, dd]

    log = []

    def f(x):
        def _(*args):
            log.append(x)
        return _

    dsk = {a: (f(a),),
           c: (f(c),),
           e: (f(e),),
           cc: (f(cc),),
           b: (f(b), a, c),
           d: (f(d), c, e),
           bb: (f(bb), cc),
           aa: (f(aa), d, bb),
           dd: (f(dd), cc)}

    dask.get(dsk, [aa, b, dd])  # trigger computation

    assert log == expected
Code Example #7
def test_run_smaller_sections(abcde):
    """
            aa
           / |
      b   d  bb dd
     / \ /|  | /
    a   c e  cc

    Prefer to run acb first because then we can get that out of the way
    """
    a, b, c, d, e = abcde
    aa, bb, cc, dd = [x * 2 for x in [a, b, c, d]]

    expected = [a, c, b, e, d, cc, bb, aa, dd]

    log = []

    def f(x):
        def _(*args):
            log.append(x)
        return _

    dsk = {a: (f(a),),
           c: (f(c),),
           e: (f(e),),
           cc: (f(cc),),
           b: (f(b), a, c),
           d: (f(d), c, e),
           bb: (f(bb), cc),
           aa: (f(aa), d, bb),
           dd: (f(dd), cc)}

    dask.get(dsk, [aa, b, dd])  # trigger computation

    assert log == expected
Code Example #8
File: test_bag.py  Project: dukebody/dask
def test_groupby_tasks():
    b = db.from_sequence(range(160), npartitions=4)
    out = b.groupby(lambda x: x % 10, max_branch=4, method='tasks')
    partitions = dask.get(out.dask, out._keys())

    for a in partitions:
        for b in partitions:
            if a is not b:
                assert not set(pluck(0, a)) & set(pluck(0, b))


    b = db.from_sequence(range(1000), npartitions=100)
    out = b.groupby(lambda x: x % 123, method='tasks')
    assert len(out.dask) < 100**2
    partitions = dask.get(out.dask, out._keys())

    for a in partitions:
        for b in partitions:
            if a is not b:
                assert not set(pluck(0, a)) & set(pluck(0, b))


    b = db.from_sequence(range(10000), npartitions=345)
    out = b.groupby(lambda x: x % 2834, max_branch=24, method='tasks')
    partitions = dask.get(out.dask, out._keys())

    for a in partitions:
        for b in partitions:
            if a is not b:
                assert not set(pluck(0, a)) & set(pluck(0, b))
Code Example #9
def test_str_graph():
    dsk = {"x": 1}
    assert str_graph(dsk) == dsk

    dsk = {("x", 1): (inc, 1)}
    assert str_graph(dsk) == {str(("x", 1)): (inc, 1)}

    dsk = {("x", 1): (inc, 1), ("x", 2): (inc, ("x", 1))}
    assert str_graph(dsk) == {
        str(("x", 1)): (inc, 1),
        str(("x", 2)): (inc, str(("x", 1))),
    }

    dsks = [
        {
            "x": 1
        },
        {
            ("x", 1): (inc, 1),
            ("x", 2): (inc, ("x", 1))
        },
        {
            ("x", 1): (sum, [1, 2, 3]),
            ("x", 2): (sum, [("x", 1), ("x", 1)])
        },
    ]
    for dsk in dsks:
        sdsk = str_graph(dsk)
        keys = list(dsk)
        skeys = [str(k) for k in keys]
        assert all(isinstance(k, str) for k in sdsk)
        assert dask.get(dsk, keys) == dask.get(sdsk, skeys)
Code Example #10
def test_ephemeral_locking(zk, dsk2):
    with pytest.raises(LockTimeout):
        with Lock(zk, name="dsk2", timeout=1, ephemeral=True), \
                Lock(zk, name="dsk2", timeout=1, ephemeral=True):
            get(dsk2, 'f')

    with pytest.raises(NoNodeError):
        zk.get("/epos/dsk2")
Code Example #11
def test_turn_off_fusion():
    x = da.ones(10, chunks=(5, ))
    y = da.sum(x + 1 + 2 + 3)

    a = y._optimize(y.dask, y._keys())

    with dask.set_options(fuse_ave_width=0):
        b = y._optimize(y.dask, y._keys())

    assert dask.get(a, y._keys()) == dask.get(b, y._keys())
    assert len(a) < len(b)
Code Example #12
def test_turn_off_fusion():
    x = da.ones(10, chunks=(5,))
    y = da.sum(x + 1 + 2 + 3)

    a = y.__dask_optimize__(y.dask, y.__dask_keys__())

    with dask.config.set(fuse_ave_width=0):
        b = y.__dask_optimize__(y.dask, y.__dask_keys__())

    assert dask.get(a, y.__dask_keys__()) == dask.get(b, y.__dask_keys__())
    assert len(a) < len(b)
Code Example #13
File: test_dask.py  Project: thrasibule/distributed
    def f(c, a, b):
        dsk = {'x': 1, 'y': (inc, 'x'), 'z': (inc, 'y')}
        keys = 'z'
        result = yield _get(c.ip, c.port, dsk, keys, gather=True)
        assert result == dask.get(dsk, keys)

        dsk = {'x': 1, 'y': (inc, 'x'), 'z': (inc, 'y'),
               'a': (inc, 'z'), 'b': (add, 'a', 'x')}
        keys = 'b'
        result = yield _get(c.ip, c.port, dsk, keys, gather=True)
        assert result == dask.get(dsk, keys)
Code Example #14
def test_turn_off_fusion():
    x = da.ones(10, chunks=(5, ))
    y = da.sum(x + 1 + 2 + 3)

    a = y.__dask_optimize__(y.dask, y.__dask_keys__())

    with dask.config.set({"optimization.fuse.ave-width": 0}):
        b = y.__dask_optimize__(y.dask, y.__dask_keys__())

    assert dask.get(a, y.__dask_keys__()) == dask.get(b, y.__dask_keys__())
    assert len(a) < len(b)
Code Example #15
File: test_simple.py  Project: nbren12/dask.targeted
def test_targeted_callback(fun, tgt, dsk):
    from dask import get

    with TargetedCallback():
        tgt.exists.return_value = False
        get(dsk, 'a')
        fun.assert_called_once()

        fun.reset_mock()
        tgt.exists.return_value = True
        get(dsk, 'a')
        fun.assert_not_called()
Code Example #16
def test_shuffle(shuffle):
    s = shuffle_func(d, d.b, shuffle=shuffle)
    assert isinstance(s, dd.DataFrame)
    assert s.npartitions == d.npartitions

    x = dask.get(s.dask, (s._name, 0))
    y = dask.get(s.dask, (s._name, 1))

    assert not (set(x.b) & set(y.b))  # disjoint
    assert set(s.dask).issuperset(d.dask)

    assert shuffle_func(d, d.b)._name == shuffle_func(d, d.b)._name
Code Example #17
File: test_shuffle.py  Project: fortizc/dask
def test_shuffle(shuffle):
    s = shuffle_func(d, d.b, shuffle=shuffle)
    assert isinstance(s, dd.DataFrame)
    assert s.npartitions == d.npartitions

    x = dask.get(s.dask, (s._name, 0))
    y = dask.get(s.dask, (s._name, 1))

    assert not (set(x.b) & set(y.b))  # disjoint
    assert set(s.dask).issuperset(d.dask)

    assert shuffle_func(d, d.b)._name == shuffle_func(d, d.b)._name
Code Example #18
def test_order_cycle():
    with pytest.raises(RuntimeError, match="Cycle detected"):
        dask.get({"a": (f, "a")}, "a")  # we encounter this in `get`
    with pytest.raises(RuntimeError, match="Cycle detected"):
        order({"a": (f, "a")})  # trivial self-loop
    with pytest.raises(RuntimeError, match="Cycle detected"):
        order({("a", 0): (f, ("a", 0))})  # non-string
    with pytest.raises(RuntimeError, match="Cycle detected"):
        order({"a": (f, "b"), "b": (f, "c"), "c": (f, "a")})  # non-trivial loop
    with pytest.raises(RuntimeError, match="Cycle detected"):
        order({"a": (f, "b"), "b": (f, "c"), "c": (f, "a", "d"), "d": 1})
    with pytest.raises(RuntimeError, match="Cycle detected"):
        order({"a": (f, "b"), "b": (f, "c"), "c": (f, "a", "d"), "d": (f, "b")})
Code Example #19
def test_persisting(zk, dsk1):
    with Persist(zk, name="dsk1"):
        with Ran() as r:
            assert get(dsk1, 'w') == 6
            assert r.steps == ['z', 'w']
        with Ran() as r:
            assert get(dsk1, 'w') == 6
            assert r.steps == []

        assert loads(zk.get("/epos/dsk1/z")[0]) == 3
        assert loads(zk.get("/epos/dsk1/w")[0]) == 6

    # tests ephemeral=False, znode still exists after context handler
    assert loads(zk.get("/epos/dsk1/w")[0]) == 6
Code Example #20
def test_stringify():
    obj = "Hello"
    assert stringify(obj) is obj
    obj = b"Hello"
    assert stringify(obj) is obj
    dsk = {"x": 1}

    assert stringify(dsk) == str(dsk)
    assert stringify(dsk, exclusive=()) == dsk

    dsk = {("x", 1): (inc, 1)}
    assert stringify(dsk) == str({("x", 1): (inc, 1)})
    assert stringify(dsk, exclusive=()) == {("x", 1): (inc, 1)}

    dsk = {("x", 1): (inc, 1), ("x", 2): (inc, ("x", 1))}
    assert stringify(dsk, exclusive=dsk) == {
        ("x", 1): (inc, 1),
        ("x", 2): (inc, str(("x", 1))),
    }

    dsks = [
        {
            "x": 1
        },
        {
            ("x", 1): (inc, 1),
            ("x", 2): (inc, ("x", 1))
        },
        {
            ("x", 1): (sum, [1, 2, 3]),
            ("x", 2): (sum, [("x", 1), ("x", 1)])
        },
    ]
    for dsk in dsks:
        sdsk = {
            stringify(k): stringify(v, exclusive=dsk)
            for k, v in dsk.items()
        }
        keys = list(dsk)
        skeys = [str(k) for k in keys]
        assert all(isinstance(k, str) for k in sdsk)
        assert get(dsk, keys) == get(sdsk, skeys)

    dsk = {
        ("y", 1): (SubgraphCallable({"x": ("y", 1)}, "x",
                                    (("y", 1), )), (("z", 1), ))
    }
    dsk = stringify(dsk, exclusive=set(dsk) | {("z", 1)})
    assert dsk[("y", 1)][0].dsk["x"] == "('y', 1)"
    assert dsk[("y", 1)][1][0] == "('z', 1)"
Code Example #21
def test_exec_only_nodes(
    dask_graph: Dict[Hashable, Any],
    optimizer_exec_only_nodes: Tuple[
        str,
        Callable[[Dict[Hashable, Any], Union[Hashable, Iterable[Hashable]]], Dict[Hashable, Any]],
    ],
) -> None:
    """Test skipping some tasks.

    Tests that execution-only nodes execute in the event that dependencies of
    their parent nodes (i.e. in the dask graph) get modified.
    """
    dsk = dask_graph
    cache_dir, graphchain_optimize = optimizer_exec_only_nodes

    # Cleanup temporary directory
    filelist = os.listdir(cache_dir)
    for entry in filelist:
        entrypath = os.path.join(cache_dir, entry)
        if os.path.isdir(entrypath):
            shutil.rmtree(entrypath, ignore_errors=True)
        else:
            os.remove(entrypath)
    filelist = os.listdir(cache_dir)
    assert not filelist

    # Run optimizer first time
    newdsk = graphchain_optimize(dsk, ["top1"])
    result = dask.get(newdsk, ["top1"])
    assert result == (-14,)

    # Modify function
    def goo(*args: int) -> int:
        # hash miss this!
        return sum(args) + 1

    dsk["goo1"] = (goo, *dsk["goo1"][1:])

    # Run optimizer a second time
    newdsk = graphchain_optimize(dsk, ["top1"])

    # Check the final result:
    # The output of node 'boo1' is needed at node 'baz2'
    # because 'goo1' was modified. A matching result indicates
    # that the 'boo1' node was executed and its dependencies loaded,
    # which is the desired behaviour in such cases.
    result = dask.get(newdsk, ["top1"])
    assert result == (-14,)
Code Example #22
def say_hello():
    geojson = request.json['geojson']
    geojson = json.dumps(geojson) if isinstance(geojson, dict) else geojson

    graph = {
        "aoi": ["geojson", geojson],
        "aoi-dissolved": ["dissolve", "aoi"],
        "dissolved-geom": ["split", "aoi-dissolved"],
        "aoi-prj": ["project_local", "dissolved-geom"],
        "aoi-area": ["get_area", "aoi-prj"]
    }

    graph = create_dag_from_json(graph)
    outputs = ['dissolved-geom', 'aoi-area']
    results = dask.get(graph, outputs)

    final_output = {}

    for result, name in zip(results, outputs):
        if isinstance(result, dict) and 'features' in result.keys():
            final_output[name] = analysis_funcs.ogr2json(result)
        else:
            final_output[name] = result

    return jsonify(final_output), 200
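The graph above is written as plain JSON-style lists; `create_dag_from_json` is not shown in this excerpt, and presumably maps each `[function_name, *args]` list onto a dask task tuple. A hypothetical re-implementation, with made-up functions, only to illustrate that translation:

import dask

FUNCS = {"add_one": lambda x: x + 1, "double": lambda x: x * 2}

def create_dag_from_json(spec):
    # Hypothetical: look up each function name and build a (callable, *args) task.
    return {key: (FUNCS[value[0]], *value[1:]) for key, value in spec.items()}

graph = create_dag_from_json({"a": ["add_one", 1], "b": ["double", "a"]})
assert dask.get(graph, ["a", "b"]) == (2, 4)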
Code Example #23
File: test_graphchain.py  Project: pnyczyk/graphchain
def test_second_run(
        dask_graph: Dict[Hashable, Any],
        optimizer: Tuple[
            str,
            Callable[[Dict[Hashable, Any]], Dict[Hashable, Any]]]) \
        -> None:
    """Second run.

    Tests a second run of the graphchain optimization function `optimize`. It
    checks the final result, that all function calls are wrapped for loading,
    and that the result key has no dependencies.
    """
    dsk = dask_graph
    _, graphchain_optimize = optimizer

    # Run optimizer
    newdsk = graphchain_optimize(dsk, keys=["top1"])  # type: ignore

    # Check the final result
    result = dask.get(newdsk, ["top1"])
    assert result == (-14, )

    # Check that the functions are wrapped for loading
    for key in dsk.keys():
        newtask = newdsk[key]
        assert isinstance(newtask, tuple)
        assert isinstance(newtask[0], CachedComputation)
Code Example #24
    def f(c, a, b):
        e = Executor((c.ip, c.port), start=False, loop=loop)
        yield e._start()

        x_dsk = {('x', i, j): np.random.random((3, 3)) for i in range(3)
                                                       for j in range(2)}
        y_dsk = {('y', i, j): np.random.random((3, 3)) for i in range(2)
                                                       for j in range(3)}
        x_futures = yield e._scatter(x_dsk)
        y_futures = yield e._scatter(y_dsk)

        dt = np.random.random(0).dtype
        x_local = da.Array(x_dsk, 'x', ((3, 3, 3), (3, 3)), dt)
        y_local = da.Array(y_dsk, 'y', ((3, 3), (3, 3, 3)), dt)

        x_remote = da.Array(x_futures, 'x', ((3, 3, 3), (3, 3)), dt)
        y_remote = da.Array(y_futures, 'y', ((3, 3), (3, 3, 3)), dt)

        exprs = [lambda x, y: x.T + y,
                 lambda x, y: x.mean() + y.mean(),
                 lambda x, y: x.dot(y).std(axis=0),
                 lambda x, y: x - x.mean(axis=1)[:, None]]

        for expr in exprs:
            local = expr(x_local, y_local)
            local_results = dask.get(local.dask, local._keys())
            local_result = da.Array._finalize(local, local_results)

            remote = expr(x_remote, y_remote)
            remote_results = yield e._get(remote.dask, remote._keys())
            remote_result = da.Array._finalize(remote, remote_results)

            assert np.all(local_result == remote_result)

        yield e._shutdown()
Code Example #25
def test_chunked_dot_product():
    x = np.arange(400).reshape((20, 20))
    o = np.ones((20, 20))

    d = {'x': x, 'o': o}

    getx = getem('x', (5, 5), shape=(20, 20))
    geto = getem('o', (5, 5), shape=(20, 20))

    result = top(dotmany,
                 'out',
                 'ik',
                 'x',
                 'ij',
                 'o',
                 'jk',
                 numblocks={
                     'x': (4, 4),
                     'o': (4, 4)
                 })

    dsk = merge(d, getx, geto, result)
    out = dask.get(dsk, [[('out', i, j) for j in range(4)] for i in range(4)])

    assert eq(np.dot(x, o), concatenate3(out))
Code Example #26
    def __call__(self, input_buffer):

        keys = sorted(['input'] + list(self.dsk.keys()))
        more = False

        while True:

            if more:
                self.dsk['input'] = Stream.NoNewData
                more = False
            else:
                buf = next(input_buffer)
                if buf not in [Stream.EndOfStream, Stream.NoNewData]:
                    self.t_ |= buf.getExtent()
                self.dsk['input'] = buf

            outputs = {
                key: output
                for key, output in zip(keys, dask.get(self.dsk, keys))
            }

            for key in keys:
                if isinstance(outputs[key], More):
                    more = True
                    outputs[key] = outputs[key].output

            if all(Stream.EndOfStream == o for o in outputs.values()):
                return

            outputs['t'] = self.t_.end

            yield outputs
Code Example #27
def test_ephemeral_persisting(zk, dsk2):
    with Persist(zk, name="dsk2", ns="/test/dags", ephemeral=True):
        with Ran() as r:
            assert get(dsk2, 'e') == 10
            assert r.steps == ['e']
        with Ran() as r:
            assert get(dsk2, 's') == 0.4
            assert r.steps == ['f', 's']
        with Ran() as r:
            assert get(dsk2, 's') == 0.4
            assert r.steps == []

        assert loads(zk.get("/test/dags/dsk2/e")[0]) == 10

    with pytest.raises(NoNodeError):
        zk.get("/test/dags/dsk2/e")
Code Example #28
def NO_test_single_run_s3(
        dask_graph: dict,
        optimizer_s3: Tuple[str, Callable]) -> None:
    """Run on S3.

    Tests a single run of the graphchain optimization function ``optimize``
    using Amazon S3 as a persistency layer. It checks the final result, that
    all function calls are wrapped - for execution and output storing, that the
    hashchain is created, that hashed outputs (the <hash>.pickle[.lz4] files)
    are generated and that the name of each file is a key in the hashchain.
    """
    dsk = dask_graph
    cache_dir, graphchain_optimize = optimizer_s3

    # Run optimizer
    newdsk = graphchain_optimize(dsk, keys=["top1"])

    # Check the final result
    result = dask.get(newdsk, ["top1"])
    assert result == (-14, )

    data_ext = ".pickle.lz4"

    # Check that all functions have been wrapped
    for key, _task in dsk.items():
        newtask = newdsk[key]
        assert isinstance(newtask[0], CachedComputation)

    # Check that the hash files are written and that each
    # filename can be found as a key in the hashchain
    # (the association of hash <-> DAG tasks is not tested)
    storage = fs.open_fs(cache_dir)
    filelist = storage.listdir("/")
    nfiles = sum(map(lambda x: x.endswith(data_ext), filelist))
    assert nfiles == len(dsk)
Code Example #29
File: daskutil.py  Project: ScottWales/climtas
def graph_sizes(arr: dask.array.Array) -> T.Dict[T.Hashable, T.Dict]:
    """
    Get the node sizes for each node in arr's Dask graph, to be used in
    visualisation functions

    Sizes are returned using the 'label' graphviz attribute

    >>> import dask.dot
    >>> a = dask.array.zeros((10,10), chunks=(5,5))
    >>> sizes = graph_sizes(a)
    >>> dask.dot.to_graphviz(a.dask, data_attributes=sizes) # doctest: +ELLIPSIS
    <graphviz.dot.Digraph object ...>

    Note: All nodes will be computed to calculate the size
    """

    keys = list(arr.dask.keys())
    sizes = dict(
        zip(
            keys,
            [
                {"label": dask.utils.format_bytes(x.nbytes)}
                if isinstance(x, numpy.ndarray)
                else {}
                for x in dask.get(arr.dask, keys)
            ],
        )
    )

    return sizes
Code Example #30
File: featurize.py  Project: acrellin/cesium
def featurize_single_ts(ts, features_to_use, custom_script_path=None,
                        custom_functions=None, raise_exceptions=True):
    """Compute feature values for a given single time-series. Data is
    returned as dictionaries/lists of lists.

    Parameters
    ----------
    ts : TimeSeries object
        Single time series to be featurized.
    features_to_use : list of str
        List of feature names to be generated.
    custom_functions : dict, optional
        Dictionary of custom feature functions to be evaluated for the given
        time series, or a dictionary representing a dask graph of function
        evaluations. Dictionaries of functions should have keys `feature_name`
        and values functions that take arguments (t, m, e); in the case of a
        dask graph, these arrays should be referenced as 't', 'm', 'e',
        respectively, and any values with keys present in `features_to_use`
        will be computed.
    raise_exceptions : bool, optional
        If True, exceptions during feature computation are raised immediately;
        if False, exceptions are suppressed and `np.nan` is returned for the
        given feature and any dependent features. Defaults to True.

    Returns
    -------
    pandas.Series
        Series of feature values (one per channel) indexed by a
        (feature, channel) MultiIndex.
    """
    # Initialize empty feature array for all channels
    feature_values = np.empty((len(features_to_use), ts.n_channels))
    for (t_i, m_i, e_i), i in zip(ts.channels(), range(ts.n_channels)):
        feature_graph = generate_dask_graph(t_i, m_i, e_i)
        feature_graph.update(ts.meta_features)

        if custom_functions:
            # If values in custom_functions are functions, add calls to graph
            if all(hasattr(v, '__call__') for v in custom_functions.values()):
                feature_graph.update({feat: f(t_i, m_i, e_i)
                                      for feat, f in custom_functions.items()})
            # Otherwise, custom_functions is another dask graph
            else:
                feature_graph.update(custom_functions)

        # Do not execute in parallel; parallelization has already taken place
        # at the level of time series, so we compute features for a single time
        # series in serial.
        if raise_exceptions:
            raise_callback = reraise
        else:
            raise_callback = lambda e, tb: None
        dask_values = dask.get(feature_graph, features_to_use,
                               raise_exception=raise_callback,
                               pack_exception=pack_exception)
        feature_values[:, i] = [x if not isinstance(x, Exception) else np.nan
                                for x in dask_values]
    index = pd.MultiIndex.from_product((features_to_use, range(ts.n_channels)),
                                       names=('feature', 'channel'))
    return pd.Series(feature_values.ravel(), index=index)
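The docstring above allows `custom_functions` in two shapes; a small hedged illustration of both (the feature name and formula are made up, not part of cesium):

import numpy as np

# 1) Plain callables taking (t, m, e); each is called with the channel's arrays
#    and its result is stored under the feature name.
custom_callables = {"amplitude": lambda t, m, e: np.max(m) - np.min(m)}

# 2) A dask-graph fragment whose tasks reference the arrays by the keys 't', 'm', 'e'.
custom_graph = {"amplitude": (lambda m: np.max(m) - np.min(m), "m")}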
Code Example #31
def test_first_run(dask_graph: dict, optimizer: Tuple[str, Callable]) -> None:
    """First run.

    Tests a first run of the graphchain optimization function ``optimize``. It
    checks the final result, that all function calls are wrapped - for
    execution and output storing, that the hashchain is created, that hashed
    outputs (the <hash>.pickle[.lz4] files) are generated and that the name of
    each file is a key in the hashchain.
    """
    dsk = dask_graph
    cache_dir, graphchain_optimize = optimizer

    # Run optimizer
    newdsk = graphchain_optimize(dsk, keys=["top1"])

    # Check the final result
    result = dask.get(newdsk, ["top1"])
    assert result == (-14, )

    # Check that all functions have been wrapped
    for key, _task in dsk.items():
        newtask = newdsk[key]
        assert isinstance(newtask[0], CachedComputation)

    # Check that the hash files are written and that each
    # filename can be found as a key in the hashchain
    # (the association of hash <-> DAG tasks is not tested)
    storage = fs.osfs.OSFS(cache_dir)
    filelist = storage.listdir("/")
    nfiles = len(filelist)
    assert nfiles >= len(dsk)
    storage.close()
Code Example #32
File: stream.py  Project: instinct2k18/pyannote-audio
    def __call__(self, input_buffer):

        keys = sorted(['input'] + list(self.dsk.keys()))
        more = False

        while True:

            if more:
                self.dsk['input'] = Stream.NoNewData
                more = False
            else:
                buf = next(input_buffer)
                if buf not in [Stream.EndOfStream, Stream.NoNewData]:
                    self.t_ |= buf.getExtent()
                self.dsk['input'] = buf

            outputs = {key: output
                       for key, output in zip(keys, dask.get(self.dsk, keys))}

            for key in keys:
                if isinstance(outputs[key], More):
                    more = True
                    outputs[key] = outputs[key].output

            if all(Stream.EndOfStream == o for o in outputs.values()):
                return

            outputs['t'] = self.t_.end

            yield outputs
Code Example #33
File: test_bag.py  Project: PhanidharJammula/py
def test_repartition_npartitions(nin, nout):
    b = db.from_sequence(range(100), npartitions=nin)
    c = b.repartition(npartitions=nout)
    assert c.npartitions == nout
    assert_eq(b, c)
    results = dask.get(c.dask, c.__dask_keys__())
    assert all(results)
Code Example #34
    def f(c, a, b):
        dsk = {'x': 1, 'y': (inc, 'x'), 'z': (inc, 'y')}
        keys = 'z'
        result = yield _get(c.ip, c.port, dsk, keys, gather=True)
        assert result == dask.get(dsk, keys)

        dsk = {
            'x': 1,
            'y': (inc, 'x'),
            'z': (inc, 'y'),
            'a': (inc, 'z'),
            'b': (add, 'a', 'x')
        }
        keys = 'b'
        result = yield _get(c.ip, c.port, dsk, keys, gather=True)
        assert result == dask.get(dsk, keys)
Code Example #35
    def test_dask_workflow_with_explicit_parallel_sub_tasks(self):
        """
        We do explicit Parrallel call for some tasks
        """
        import dask
        # runner = GlobalFakeRunner()
        runner = FakeRunner()
        # decorate functions...

        generate_pricedata = dfp.job_delayed(runner)(self.generate_pricedata)
        generate_fundata = dfp.job_delayed(runner)(self.generate_fundata)
        generate_riskdata = dfp.job_delayed(runner)(self.generate_riskdata)
        generate_predictors = dfp.job_delayed(runner)(self.generate_predictors)
        generate_positions = dfp.delayed(self.generate_positions)

        # from dask.multiprocessing import get
        # from dask.threaded import get
        from dask.async import get_sync as get  # dask.async was later renamed dask.local

        # declare the dataflow
        dsk = dict()
        pools = ['pool1', 'pool2']
        for pool in pools:
            dsk[(pool, 'pricedata')] = generate_pricedata(pool),
            dsk[(pool, 'fundata')] = generate_fundata(pool),
            dsk[(pool, 'riskdata')] = generate_riskdata(pool, 'risk'), (pool, 'pricedata')
            dsk[(pool, 'pred')] = generate_predictors(pool, 'risk'), [
                (pool, t) for t in ['pricedata', 'fundata', 'riskdata']
            ]
            dsk[(pool, 'positions')] = dfp.ParallelJobs(runner)([
                generate_positions(pool,
                                   'risk',
                                   'momentum',
                                   'markowitz_aversion',
                                   max_risk=max_risk) for max_risk in range(10)
            ]), (pool, 'pred')
        # get(dsk, [(pool,'pred') for pool in pools])  # executes in parallel
        # results = get(dsk, dsk.keys())
        jobids = dict(zip(dsk.keys(), get(dsk, dsk.keys())))
        assert len(jobids) == 10
        assert jobids[('pool2', 'positions')] == [
            19, 20, 21, 22, 23, 24, 25, 26, 27, 28
        ]
        get(dsk, ('pool2', 'positions'))
        get(dsk, ('pool2', 'positions'))
Code Example #36
File: test_bag.py  Project: postelrich/dask
def test_repartition(nin, nout):
    b = db.from_sequence(range(100), npartitions=nin)
    c = b.repartition(npartitions=nout)

    assert c.npartitions == nout
    assert b.compute(get=dask.get) == c.compute(get=dask.get)
    results = dask.get(c.dask, c.__dask_keys__())
    assert all(results)
Code Example #37
def test_local_parents_of_reduction(abcde):
    """

            c1
            |
        b1  c2
        |  /|
    a1  b2  c3
    |  /|
    a2  b3
    |
    a3

    Prefer to finish the a1 stack before proceeding to b2
    """
    a, b, c, d, e = abcde
    a1, a2, a3 = [a + i for i in '123']
    b1, b2, b3 = [b + i for i in '123']
    c1, c2, c3 = [c + i for i in '123']

    expected = [a3, a2, a1, b3, b2, b1, c3, c2, c1]

    log = []

    def f(x):
        def _(*args):
            log.append(x)

        return _

    dsk = {
        a3: (f(a3), ),
        a2: (f(a2), a3),
        a1: (f(a1), a2),
        b3: (f(b3), ),
        b2: (f(b2), b3, a2),
        b1: (f(b1), b2),
        c3: (f(c3), ),
        c2: (f(c2), c3, b2),
        c1: (f(c1), c2)
    }

    order(dsk)
    dask.get(dsk, [a1, b1, c1])  # trigger computation

    assert log == expected
Code Example #38
def test_repartition(nin, nout):
    b = db.from_sequence(range(100), npartitions=nin)
    c = b.repartition(npartitions=nout)

    assert c.npartitions == nout
    assert b.compute(get=dask.get) == c.compute(get=dask.get)
    results = dask.get(c.dask, c._keys())
    assert all(results)
Code Example #39
 def __call__(self, dsk, keys, **kwargs):
     """Compute dask task and keep track of number of times we do so."""
     import dask
     self.total_computes += 1
     if self.total_computes > self.max_computes:
         raise RuntimeError("Too many dask computations were scheduled: "
                            "{}".format(self.total_computes))
     return dask.get(dsk, keys, **kwargs)
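This snippet shows only the method; a self-contained sketch of the same idea, with a made-up class name, shows how such a callable can be handed straight to `compute(scheduler=...)`, since dask accepts any `scheduler(dsk, keys, **kwargs)` callable:

import dask
import dask.array as da

class CountingScheduler:
    """Synchronous scheduler that fails after too many compute calls."""

    def __init__(self, max_computes=1):
        self.max_computes = max_computes
        self.total_computes = 0

    def __call__(self, dsk, keys, **kwargs):
        # Count every scheduled computation and delegate to the synchronous scheduler.
        self.total_computes += 1
        if self.total_computes > self.max_computes:
            raise RuntimeError("Too many dask computations were scheduled: "
                               "{}".format(self.total_computes))
        return dask.get(dsk, keys, **kwargs)

scheduler = CountingScheduler(max_computes=1)
x = da.ones(6, chunks=3)
assert x.sum().compute(scheduler=scheduler) == 6
assert scheduler.total_computes == 1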
Code Example #40
File: test_order.py  Project: martindurant/dask
def test_local_parents_of_reduction(abcde):
    """

            c1
            |
        b1  c2
        |  /|
    a1  b2  c3
    |  /|
    a2  b3
    |
    a3

    Prefer to finish the a1 stack before proceeding to b2
    """
    a, b, c, d, e = abcde
    a1, a2, a3 = [a + i for i in '123']
    b1, b2, b3 = [b + i for i in '123']
    c1, c2, c3 = [c + i for i in '123']

    expected = [a3, a2, a1,
                b3, b2, b1,
                c3, c2, c1]

    log = []

    def f(x):
        def _(*args):
            log.append(x)
        return _

    dsk = {a3: (f(a3),),
           a2: (f(a2), a3),
           a1: (f(a1), a2),
           b3: (f(b3),),
           b2: (f(b2), b3, a2),
           b1: (f(b1), b2),
           c3: (f(c3),),
           c2: (f(c2), c3, b2),
           c1: (f(c1), c2)}

    order(dsk)
    dask.get(dsk, [a1, b1, c1])  # trigger computation

    assert log == expected
Code Example #41
File: test_utils.py  Project: tomMoral/distributed
def test_str_graph():
    dsk = {'x': 1}
    assert str_graph(dsk) == dsk

    dsk = {('x', 1): (inc, 1)}
    assert str_graph(dsk) == {str(('x', 1)): (inc, 1)}

    dsk = {('x', 1): (inc, 1), ('x', 2): (inc, ('x', 1))}
    assert str_graph(dsk) == {str(('x', 1)): (inc, 1),
                              str(('x', 2)): (inc, str(('x', 1)))}

    dsks = [{'x': 1},
            {('x', 1): (inc, 1), ('x', 2): (inc, ('x', 1))},
            {('x', 1): (sum, [1, 2, 3]),
             ('x', 2): (sum, [('x', 1), ('x', 1)])}]
    for dsk in dsks:
        sdsk = str_graph(dsk)
        keys = list(dsk)
        skeys = [str(k) for k in keys]
        assert all(isinstance(k, str) for k in sdsk)
        assert dask.get(dsk, keys) == dask.get(sdsk, skeys)
Code Example #42
File: test_array_core.py  Project: hc10024/dask
def test_chunked_transpose_plus_one():
    x = np.arange(400).reshape((20, 20))

    d = {'x': x}

    getx = getem('x', (5, 5), shape=(20, 20))

    f = lambda x: x.T + 1
    comp = top(f, 'out', 'ij', 'x', 'ji', numblocks={'x': (4, 4)})

    dsk = merge(d, getx, comp)
    out = dask.get(dsk, [[('out', i, j) for j in range(4)] for i in range(4)])

    assert eq(concatenate3(out), x.T + 1)
Code Example #43
File: test_array_core.py  Project: hc10024/dask
def test_chunked_dot_product():
    x = np.arange(400).reshape((20, 20))
    o = np.ones((20, 20))

    d = {'x': x, 'o': o}

    getx = getem('x', (5, 5), shape=(20, 20))
    geto = getem('o', (5, 5), shape=(20, 20))

    result = top(dotmany, 'out', 'ik', 'x', 'ij', 'o', 'jk',
                 numblocks={'x': (4, 4), 'o': (4, 4)})

    dsk = merge(d, getx, geto, result)
    out = dask.get(dsk, [[('out', i, j) for j in range(4)] for i in range(4)])

    assert eq(np.dot(x, o), concatenate3(out))
Code Example #44
def test_minimize_data_transfer():
    x = np.ones(100)
    y = da.from_array(x, chunks=25)
    z = y + 1
    dsk = z.__dask_optimize__(z.dask, z.__dask_keys__())

    keys = list(dsk)
    results = dask.get(dsk, keys)
    big_key = [k for k, r in zip(keys, results) if r is x][0]
    dependencies, dependents = dask.core.get_deps(dsk)
    deps = dependents[big_key]

    assert len(deps) == 4
    for dep in deps:
        assert dsk[dep][0] in (getitem, getter)
        assert dsk[dep][1] == big_key
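The test leans on `dask.core.get_deps`, which returns two dicts mapping each key to its dependencies and to its dependents; a minimal sketch on a toy graph:

import dask.core

def inc(x):
    return x + 1

dsk = {"a": 1, "b": (inc, "a"), "c": (inc, "b")}
dependencies, dependents = dask.core.get_deps(dsk)

assert dependencies["b"] == {"a"}   # 'b' reads from 'a'
assert dependents["a"] == {"b"}     # 'a' feeds into 'b'
assert dependencies["a"] == set()   # leaf data has no dependencies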
Code Example #45
File: test_context.py  Project: chrislaing/dask
 def myget(dsk, keys, **kwargs):
     var[0] = var[0] + 1
     return dask.get(dsk, keys, **kwargs)
Code Example #46
File: test_dask.py  Project: jjhelmus/xray
 def counting_get(*args, **kwargs):
     count[0] += 1
     return dask.get(*args, **kwargs)
Code Example #47
File: util.py  Project: acrellin/cesium
def generate_features(t, m, e, features_to_use):
    """Utility function that generates features from a dask DAG."""
    graph = generate_dask_graph(t, m, e)
    values = dask.get(graph, features_to_use)
    return dict(zip(features_to_use, values))
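`generate_dask_graph` is cesium-specific and not reproduced here, but the surrounding pattern (compute a list of keys with `dask.get`, then zip the names back onto the values) works on any graph; a toy sketch:

import dask

graph = {
    "data": [1, 2, 3],
    "mean": (lambda xs: sum(xs) / len(xs), "data"),
    "maximum": (max, "data"),
}
features_to_use = ["mean", "maximum"]
values = dask.get(graph, features_to_use)
assert dict(zip(features_to_use, values)) == {"mean": 2.0, "maximum": 3}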
Code Example #48
File: test_base.py  Project: fortizc/dask
 def my_get(dsk, keys):
     assert dsk == dict(y.dask)  # but they aren't
     return dask.get(dsk, keys)
Code Example #49
File: test_base.py  Project: caseyclements/dask
 def get(dsk, keys, *args, **kwargs):
     called[0] = True
     return dask.get(dsk, keys)