Example 1
def test_blockwise_non_blockwise_output():
    x = da.ones(10, chunks=(5, ))
    y = ((x + 1) + 2) + 3
    w = y.sum()
    z = ((y * 2) * 3) * 4

    z_top_before = tuple(z.dask.dicts[z.name].indices)
    (zz, ) = dask.optimize(z)
    z_top_after = tuple(z.dask.dicts[z.name].indices)
    assert z_top_before == z_top_after, "z_top mutated"

    dsk = optimize_blockwise(z.dask,
                             keys=list(dask.core.flatten(z.__dask_keys__())))
    assert isinstance(dsk, HighLevelGraph)
    assert (len([
        layer for layer in dsk.dicts.values() if isinstance(layer, Blockwise)
    ]) == 1)

    dsk = optimize_blockwise(
        HighLevelGraph.merge(w.dask, z.dask),
        keys=list(dask.core.flatten([w.__dask_keys__(),
                                     z.__dask_keys__()])),
    )
    assert isinstance(dsk, HighLevelGraph)
    assert (len([
        layer
        for layer in z.dask.dicts.values() if isinstance(layer, Blockwise)
    ]) >= 1)
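The test above checks that merging the graphs of two related collections and running optimize_blockwise leaves the original layers untouched. A minimal sketch of the same merge-and-inspect pattern outside a test harness, assuming dask and dask.array are installed:

import dask.array as da
from dask.highlevelgraph import HighLevelGraph

x = da.ones(10, chunks=(5,))
w = (x + 1).sum()
z = (x + 1) * 2

# Merge the two collections' graphs into a single HighLevelGraph
merged = HighLevelGraph.merge(w.__dask_graph__(), z.__dask_graph__())
print(type(merged).__name__)  # HighLevelGraph
# .layers (aliased as .dicts in the test above) maps layer name -> layer
print(list(merged.layers))    # layer names contributed by both collections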
Example 2
def collections_to_dsk(collections, optimize_graph=True, optimizations=(), **kwargs):
    """
    Convert many collections into a single dask graph, after optimization
    """
    from dask.highlevelgraph import HighLevelGraph

    optimizations = tuple(optimizations) + tuple(config.get("optimizations", ()))

    if optimize_graph:
        groups = groupby(optimization_function, collections)

        graphs = []
        for opt, val in groups.items():
            dsk, keys = _extract_graph_and_keys(val)
            dsk = opt(dsk, keys, **kwargs)

            for opt_inner in optimizations:
                dsk = opt_inner(dsk, keys, **kwargs)

            graphs.append(dsk)

        # Merge all graphs
        if any(isinstance(graph, HighLevelGraph) for graph in graphs):
            dsk = HighLevelGraph.merge(*graphs)
        else:
            dsk = merge(*map(ensure_dict, graphs))
    else:
        dsk, _ = _extract_graph_and_keys(collections)

    return dsk
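collections_to_dsk is a private helper in dask.base, so the import below is an assumption that may break across versions; the sketch only illustrates how the function collapses several collections into one optimized graph:

import dask.array as da
from dask.base import collections_to_dsk  # private helper; location may change

x = da.ones(10, chunks=(5,))
y = x + 1

# One graph covering both collections, with each collection's optimizer applied
dsk = collections_to_dsk([x, y], optimize_graph=True)
print(type(dsk).__name__)  # typically HighLevelGraph in recent dask releases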
Example 3
    def __dask_graph__(self):
        graphs = {k: v.__dask_graph__() for k, v in self.data_vars.items()}
        graphs = {k: v for k, v in graphs.items() if v is not None}

        if len(graphs) > 0:
            return HighLevelGraph.merge(*graphs.values())

        return None
Example 4
        def __dask_graph__(self):
            graphs = {k: v.__dask_graph__() for k, v in self.data_vars.items()}
            # Excise anything that is not a dask collection
            graphs = {k: v for k, v in graphs.items() if v is not None}

            if len(graphs) > 0:
                return HighLevelGraph.merge(*graphs.values())

            return None
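Examples 3 and 4 show the xarray-style protocol: a container reports a dask graph only when at least one of its members is dask-backed. A minimal self-contained sketch with a hypothetical Bundle class (only the data_vars attribute name is taken from the examples; the rest is illustrative):

import dask.array as da
from dask.highlevelgraph import HighLevelGraph

class Bundle:
    """Hypothetical container of named, possibly dask-backed, arrays."""

    def __init__(self, data_vars):
        self.data_vars = data_vars

    def __dask_graph__(self):
        graphs = {k: v.__dask_graph__() for k, v in self.data_vars.items()
                  if hasattr(v, "__dask_graph__")}
        # Excise anything that is not a dask collection
        graphs = {k: v for k, v in graphs.items() if v is not None}
        if graphs:
            return HighLevelGraph.merge(*graphs.values())
        return None

b = Bundle({"a": da.ones(4, chunks=2), "b": da.zeros(4, chunks=2)})
print(b.__dask_graph__() is not None)  # True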
Example 5
def compute_as_if_collection(cls, dsk, keys, scheduler=None, get=None, **kwargs):
    """Compute a graph as if it were of type cls.

    Allows for applying the same optimizations and default scheduler."""
    from dask.highlevelgraph import HighLevelGraph

    schedule = get_scheduler(scheduler=scheduler, cls=cls, get=get)
    dsk2 = optimization_function(cls)(dsk, keys, **kwargs)
    # see https://github.com/dask/dask/issues/8991.
    # This merge should be removed once the underlying issue is fixed.
    dsk2 = HighLevelGraph.merge(dsk2)
    return schedule(dsk2, keys, **kwargs)
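compute_as_if_collection is likewise private to dask.base, so treat the import as an assumption; the sketch feeds it a raw graph plus keys so that dask.array's optimizations and default scheduler are applied:

import dask.array as da
from dask.base import compute_as_if_collection  # private helper; location may change

x = da.arange(6, chunks=3) + 1
blocks = compute_as_if_collection(da.Array, x.__dask_graph__(), x.__dask_keys__())
# `blocks` mirrors the key structure: one computed numpy chunk per key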
Example 6
def rearrange_by_column_disk(df, column, npartitions=None, compute=False):
    """Shuffle using local disk

    See Also
    --------
    rearrange_by_column_tasks:
        Same function, but using tasks rather than partd
        Has a more informative docstring
    """
    if npartitions is None:
        npartitions = df.npartitions

    token = tokenize(df, column, npartitions)
    always_new_token = uuid.uuid1().hex

    p = ("zpartd-" + always_new_token, )
    dsk1 = {p: (maybe_buffered_partd(), )}

    # Partition data on disk
    name = "shuffle-partition-" + always_new_token
    dsk2 = {(name, i): (shuffle_group_3, key, column, npartitions, p)
            for i, key in enumerate(df.__dask_keys__())}

    dependencies = []
    if compute:
        graph = HighLevelGraph.merge(df.dask, dsk1, dsk2)
        graph = HighLevelGraph.from_collections(name, graph, dependencies=[df])
        keys = [p, sorted(dsk2)]
        pp, values = compute_as_if_collection(DataFrame, graph, keys)
        dsk1 = {p: pp}
        dsk2 = dict(zip(sorted(dsk2), values))
    else:
        dependencies.append(df)

    # Barrier
    barrier_token = "barrier-" + always_new_token
    dsk3 = {barrier_token: (barrier, list(dsk2))}

    # Collect groups
    name = "shuffle-collect-" + token
    dsk4 = {(name, i): (collect, p, i, df._meta, barrier_token)
            for i in range(npartitions)}

    divisions = (None, ) * (npartitions + 1)

    layer = toolz.merge(dsk1, dsk2, dsk3, dsk4)
    graph = HighLevelGraph.from_collections(name,
                                            layer,
                                            dependencies=dependencies)
    return new_dd_object(graph, name, df._meta, divisions)
Example 7
def _extract_graph_and_keys(vals):
    """Given a list of dask vals, return a single graph and a list of keys such
    that ``get(dsk, keys)`` is equivalent to ``[v.compute() for v in vals]``."""
    from dask.highlevelgraph import HighLevelGraph

    graphs, keys = [], []
    for v in vals:
        graphs.append(v.__dask_graph__())
        keys.append(v.__dask_keys__())

    if any(isinstance(graph, HighLevelGraph) for graph in graphs):
        graph = HighLevelGraph.merge(*graphs)
    else:
        graph = merge(*map(ensure_dict, graphs))

    return graph, keys
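A small sketch of the contract stated in the docstring: with the merged graph and the per-collection keys, a single scheduler get call is equivalent to computing each collection on its own (the threaded scheduler is used here as an assumed example):

import dask.array as da
import dask.threaded
from dask.highlevelgraph import HighLevelGraph

x = da.ones(4, chunks=2)
y = x + 1

graph = HighLevelGraph.merge(x.__dask_graph__(), y.__dask_graph__())
keys = [x.__dask_keys__(), y.__dask_keys__()]
res_x, res_y = dask.threaded.get(graph, keys)
# res_x holds x's computed blocks, res_y holds y's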
Example 8
def cascaded_compute(callback, arrays, batch_size=None, optimize=True):
    """Dask helper function for iterating over computed dask arrays.

    Args:
        callback (callable): Called with a single numpy array computed from
                             the provided dask arrays.
        arrays (list, tuple): Dask arrays to pass to callback.
        batch_size (int): Group computation into this many arrays at a time.
        optimize (bool): Whether to try to optimize the dask graphs of the
                         provided arrays.

    Yields: `dask.Delayed` objects to be computed, one per batch of arrays

    """
    def _callback_wrapper(arr, previous_call, cb=callback):
        del previous_call  # used only for task ordering
        return cb(arr)

    array_batches = []
    if not batch_size:
        array_batches.append(arrays)
    else:
        arr_gens = iter(arrays)
        array_batches = (arrs
                         for arrs in zip_longest(*([arr_gens] * batch_size)))

    for batch_arrs in array_batches:
        batch_arrs = [x for x in batch_arrs if x is not None]
        if optimize:
            # optimize Dask graph over all objects
            dsk = da.Array.__dask_optimize__(
                # combine all Dask Array graphs
                HighLevelGraph.merge(*[e.__dask_graph__()
                                       for e in batch_arrs]),
                # get Dask Array keys in result
                list(dask.core.flatten([e.__dask_keys__()
                                        for e in batch_arrs])))
            # rebuild Dask Arrays
            batch_arrs = [
                da.Array(dsk, e.name, e.chunks, e.dtype) for e in batch_arrs
            ]

        current_write = None
        for dask_arr in batch_arrs:
            current_write = dask.delayed(_callback_wrapper)(dask_arr,
                                                            current_write)
        yield current_write
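A hedged usage sketch for cascaded_compute, assuming the names it relies on (dask, dask.array as da, HighLevelGraph, zip_longest) are already imported in the module: each yielded Delayed, once computed, calls the callback with one computed numpy array per dask array in its batch.

import dask
import dask.array as da

results = []
arrays = [da.ones((4, 4), chunks=2) * i for i in range(3)]
delayeds = list(cascaded_compute(results.append, arrays, batch_size=2))
dask.compute(*delayeds)
print(len(results))  # 3 computed numpy arrays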
Example 9
def _checkpoint_one(collection, split_every) -> Delayed:
    tok = tokenize(collection)
    name = "checkpoint-" + tok

    keys_iter = flatten(collection.__dask_keys__())
    try:
        next(keys_iter)
        next(keys_iter)
    except StopIteration:
        # Collection has 0 or 1 keys; no need for a map step
        layer = {name: (chunks.checkpoint, collection.__dask_keys__())}
        dsk = HighLevelGraph.from_collections(name,
                                              layer,
                                              dependencies=(collection, ))
        return Delayed(name, dsk)

    # Collection has 2+ keys; apply a two-step map->reduce algorithm so that we
    # transfer over the network and store in RAM only a handful of None's instead of
    # the full computed collection's contents
    dsks = []
    map_names = set()
    map_keys = []

    for prev_name in get_collection_names(collection):
        map_name = "checkpoint_map-" + tokenize(prev_name, tok)
        map_names.add(map_name)
        map_layer = _build_map_layer(chunks.checkpoint, prev_name, map_name,
                                     collection)
        map_keys += list(map_layer.get_output_keys())
        dsks.append(
            HighLevelGraph.from_collections(map_name,
                                            map_layer,
                                            dependencies=(collection, )))

    # recursive aggregation
    reduce_layer: dict = {}
    while split_every and len(map_keys) > split_every:
        k = (name, len(reduce_layer))
        reduce_layer[k] = (chunks.checkpoint, map_keys[:split_every])
        map_keys = map_keys[split_every:] + [k]
    reduce_layer[name] = (chunks.checkpoint, map_keys)

    dsks.append(
        HighLevelGraph({name: reduce_layer}, dependencies={name: map_names}))
    dsk = HighLevelGraph.merge(*dsks)

    return Delayed(name, dsk)
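_checkpoint_one is the per-collection worker behind dask's public checkpoint utility; a hedged sketch of the public entry point it serves (available in dask.graph_manipulation in recent releases):

import dask.array as da
from dask.graph_manipulation import checkpoint

x = da.ones(10, chunks=2)
done = checkpoint(x, split_every=4)  # Delayed that resolves to None
done.compute()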
Example 10
def block_one(coll):
    tok = tokenize(coll, blocker)
    dsks = []
    rename = {}
    for prev_name in get_collection_names(coll):
        new_name = "wait_on-" + tokenize(prev_name, tok)
        rename[prev_name] = new_name
        layer = _build_map_layer(
            chunks.bind, prev_name, new_name, coll, dependencies=(blocker,)
        )
        dsks.append(
            HighLevelGraph.from_collections(
                new_name, layer, dependencies=(coll, blocker)
            )
        )
    dsk = HighLevelGraph.merge(*dsks)
    rebuild, args = coll.__dask_postpersist__()
    return rebuild(dsk, *args, rename=rename)
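block_one is an inner helper of the public wait_on utility; a hedged sketch of the call it supports (also in dask.graph_manipulation in recent releases): the returned clones compute to the same values as the inputs, but none of their chunks become available until every input collection has been computed in full.

import dask.array as da
from dask.graph_manipulation import wait_on

x = da.ones(10, chunks=2)
y = da.zeros(10, chunks=2) + 1
x2, y2 = wait_on(x, y)  # same values, with an added synchronization barrier
print(x2.sum().compute(), y2.sum().compute())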
Example 11
def test_blockwise_non_blockwise_output():
    x = da.ones(10, chunks=(5,))
    y = (((x + 1) + 2) + 3)
    w = y.sum()
    z = (((y * 2) * 3) * 4)

    z_top_before = tuple(z.dask.dicts[z.name].indices)
    (zz,) = dask.optimize(z)
    z_top_after = tuple(z.dask.dicts[z.name].indices)
    assert z_top_before == z_top_after, "z_top mutated"

    dsk = optimize_blockwise(z.dask, keys=list(dask.core.flatten(z.__dask_keys__())))
    assert isinstance(dsk, HighLevelGraph)
    assert len([layer for layer in dsk.dicts.values() if isinstance(layer, Blockwise)]) == 1

    dsk = optimize_blockwise(HighLevelGraph.merge(w.dask, z.dask),
                             keys=list(dask.core.flatten([w.__dask_keys__(), z.__dask_keys__()])))
    assert isinstance(dsk, HighLevelGraph)
    assert len([layer for layer in z.dask.dicts.values() if isinstance(layer, Blockwise)]) >= 1
Example 12
    def warp(self, dem=None, proj="EPSG:4326", **kwargs):
        """Delayed warp across an entire AOI or Image

        Creates a new dask image by deferring calls to warp_geometry on each chunk

        Args:
            dem (ndarray): optional. A DEM for warping to specific elevation planes
            proj (str): optional. An EPSG proj string to project the image data into ("EPSG:32612")

        Returns:
            daskarray: a warped image as deferred image array
        """
        try:
            img_md = self.rda.metadata["image"]
            x_size = img_md["tileXSize"]
            y_size = img_md["tileYSize"]
        except (AttributeError, KeyError):
            x_size = kwargs.get("chunk_size", 256)
            y_size = kwargs.get("chunk_size", 256)

        # Create an affine transform to convert between real-world and pixels
        if self.proj is None:
            from_proj = "EPSG:4326"
        else:
            from_proj = self.proj

        try:
            # NOTE: this only works on images that have rda rpcs metadata
            center = wkt.loads(self.rda.metadata["image"]["imageBoundsWGS84"]).centroid
            g = box(*(center.buffer(self.rda.metadata["rpcs"]["gsd"] / 2).bounds))
            tfm = partial(pyproj.transform, pyproj.Proj(init="EPSG:4326"), pyproj.Proj(init=proj))
            gsd = kwargs.get("gsd", ops.transform(tfm, g).area ** 0.5)
            current_bounds = wkt.loads(self.rda.metadata["image"]["imageBoundsWGS84"]).bounds
        except (AttributeError, KeyError, TypeError):
            tfm = partial(pyproj.transform, pyproj.Proj(init=self.proj), pyproj.Proj(init=proj))
            gsd = kwargs.get("gsd", (ops.transform(tfm, shape(self)).area / (self.shape[1] * self.shape[2])) ** 0.5 )
            current_bounds = self.bounds

        tfm = partial(pyproj.transform, pyproj.Proj(init=from_proj), pyproj.Proj(init=proj))
        itfm = partial(pyproj.transform, pyproj.Proj(init=proj), pyproj.Proj(init=from_proj))
        output_bounds = ops.transform(tfm, box(*current_bounds)).bounds
        gtf = Affine.from_gdal(output_bounds[0], gsd, 0.0, output_bounds[3], 0.0, -1 * gsd)

        ll = ~gtf * (output_bounds[:2])
        ur = ~gtf * (output_bounds[2:])
        x_chunks = int((ur[0] - ll[0]) / x_size) + 1
        y_chunks = int((ll[1] - ur[1]) / y_size) + 1

        num_bands = self.shape[0]

        try:
            dtype = RDA_TO_DTYPE[img_md["dataType"]]
        except:
            dtype = 'uint8'

        daskmeta = {
            "dask": {},
            "chunks": (num_bands, y_size, x_size),
            "dtype": dtype,
            "name": "warp-{}".format(self.name),
            "shape": (num_bands, y_chunks * y_size, x_chunks * x_size)
        }

        def px_to_geom(xmin, ymin):
            xmax = int(xmin + x_size)
            ymax = int(ymin + y_size)
            bounds = list((gtf * (xmin, ymax)) + (gtf * (xmax, ymin)))
            return box(*bounds)

        full_bounds = box(*output_bounds)

        dasks = []
        if isinstance(dem, GeoDaskImage):
            if dem.proj != proj:
                dem = dem.warp(proj=proj, dem=dem)
            dasks.append(dem.dask)

        for y in xrange(y_chunks):
            for x in xrange(x_chunks):
                xmin = x * x_size
                ymin = y * y_size
                geometry = px_to_geom(xmin, ymin)
                daskmeta["dask"][(daskmeta["name"], 0, y, x)] = (self._warp, geometry, gsd, dem, proj, dtype, 5)
        daskmeta["dask"], _ = optimization.cull(HighLevelGraph.merge(daskmeta["dask"], *dasks), list(daskmeta["dask"].keys()))

        gi = mapping(full_bounds)
        gt = AffineTransform(gtf, proj)
        image = GeoDaskImage(daskmeta, __geo_interface__ = gi, __geo_transform__ = gt)
        return image[box(*output_bounds)]
Example 13
    def warp(self, dem=None, proj="EPSG:4326", **kwargs):
        """Delayed warp across an entire AOI or Image

        Creates a new dask image by deferring calls to warp_geometry on each chunk

        Args:
            dem (ndarray): optional. A DEM for warping to specific elevation planes
            proj (str): optional. An EPSG proj string to project the image data into ("EPSG:32612")

        Returns:
            daskarray: a warped image as deferred image array
        """
        try:
            img_md = self.rda.metadata["image"]
            x_size = img_md["tileXSize"]
            y_size = img_md["tileYSize"]
        except (AttributeError, KeyError):
            x_size = kwargs.get("chunk_size", 256)
            y_size = kwargs.get("chunk_size", 256)

        # Create an affine transform to convert between real-world and pixels
        if self.proj is None:
            from_proj = "EPSG:4326"
        else:
            from_proj = self.proj

        try:
            # NOTE: this only works on images that have rda rpcs metadata
            center = wkt.loads(self.rda.metadata["image"]["imageBoundsWGS84"]).centroid
            g = box(*center.buffer(self.rda.metadata["rpcs"]["gsd"] / 2).bounds)
            tfm = partial(pyproj.transform, pyproj.Proj(init="EPSG:4326"), pyproj.Proj(init=proj))
            gsd = kwargs.get("gsd", ops.transform(tfm, g).area ** 0.5)
            current_bounds = wkt.loads(self.rda.metadata["image"]["imageBoundsWGS84"]).bounds
        except (AttributeError, KeyError, TypeError):
            tfm = partial(pyproj.transform, pyproj.Proj(init=self.proj), pyproj.Proj(init=proj))
            gsd = kwargs.get("gsd", (ops.transform(tfm, shape(self)).area / (self.shape[1] * self.shape[2])) ** 0.5)
            current_bounds = self.bounds

        tfm = partial(pyproj.transform, pyproj.Proj(init=from_proj), pyproj.Proj(init=proj))
        itfm = partial(pyproj.transform, pyproj.Proj(init=proj), pyproj.Proj(init=from_proj))
        output_bounds = ops.transform(tfm, box(*current_bounds)).bounds
        gtf = Affine.from_gdal(output_bounds[0], gsd, 0.0, output_bounds[3], 0.0, -1 * gsd)

        ll = ~gtf * (output_bounds[:2])
        ur = ~gtf * (output_bounds[2:])
        x_chunks = int((ur[0] - ll[0]) / x_size) + 1
        y_chunks = int((ll[1] - ur[1]) / y_size) + 1

        num_bands = self.shape[0]

        try:
            dtype = img_md["dataType"]
        except:
            dtype = 'uint8'

        daskmeta = {
            "dask": {},
            "chunks": (num_bands, y_size, x_size),
            "dtype": dtype,
            "name": "warp-{}".format(self.name),
            "shape": (num_bands, y_chunks * y_size, x_chunks * x_size)
        }

        def px_to_geom(xmin, ymin):
            xmax = int(xmin + x_size)
            ymax = int(ymin + y_size)
            bounds = list((gtf * (xmin, ymax)) + (gtf * (xmax, ymin)))
            return box(*bounds)

        full_bounds = box(*output_bounds)

        dasks = []
        if isinstance(dem, GeoDaskImage):
            if dem.proj != proj:
                dem = dem.warp(proj=proj, dem=dem)
            dasks.append(dem.dask)

        for y in range(y_chunks):
            for x in range(x_chunks):
                xmin = x * x_size
                ymin = y * y_size
                geometry = px_to_geom(xmin, ymin)
                daskmeta["dask"][(daskmeta["name"], 0, y, x)] = (self._warp, geometry, gsd, dem, proj, dtype, 5)
        daskmeta["dask"], _ = optimization.cull(HighLevelGraph.merge(daskmeta["dask"], *dasks),
                                                list(daskmeta["dask"].keys()))

        gi = mapping(full_bounds)
        gt = AffineTransform(gtf, proj)
        image = GeoDaskImage(daskmeta, __geo_interface__=gi, __geo_transform__=gt)
        return image[box(*output_bounds)]
Example 14
def fit(model,
        x,
        y,
        compute=True,
        shuffle_blocks=True,
        random_state=None,
        **kwargs):
    """ Fit scikit learn model against dask arrays

    Model must support the ``partial_fit`` interface for online or batch
    learning.

    Ideally your rows are independent and identically distributed. By default,
    this function will step through chunks of the arrays in random order.

    Parameters
    ----------
    model: sklearn model
        Any model supporting partial_fit interface
    x: dask Array
        Two dimensional array, likely tall and skinny
    y: dask Array
        One dimensional array with same chunks as x's rows
    compute : bool
        Whether to compute this result
    shuffle_blocks : bool
        Whether to shuffle the blocks with ``random_state`` or not
    random_state : int or numpy.random.RandomState
        Random state to use when shuffling blocks
    kwargs:
        options to pass to partial_fit

    Examples
    --------
    >>> import dask.array as da
    >>> X = da.random.random((10, 3), chunks=(5, 3))
    >>> y = da.random.randint(0, 2, 10, chunks=(5,))

    >>> from sklearn.linear_model import SGDClassifier
    >>> sgd = SGDClassifier()

    >>> sgd = da.learn.fit(sgd, X, y, classes=[1, 0])
    >>> sgd  # doctest: +SKIP
    SGDClassifier(alpha=0.0001, class_weight=None, epsilon=0.1, eta0=0.0,
           fit_intercept=True, l1_ratio=0.15, learning_rate='optimal',
           loss='hinge', n_iter=5, n_jobs=1, penalty='l2', power_t=0.5,
           random_state=None, shuffle=False, verbose=0, warm_start=False)

    This passes all of X and y through the classifier sequentially.  We can use
    the classifier as normal on in-memory data

    >>> import numpy as np
    >>> sgd.predict(np.random.random((4, 3)))  # doctest: +SKIP
    array([1, 0, 0, 1])

    Or predict on a larger dataset

    >>> z = da.random.random((400, 3), chunks=(100, 3))
    >>> da.learn.predict(sgd, z)  # doctest: +SKIP
    dask.array<x_11, shape=(400,), chunks=((100, 100, 100, 100),), dtype=int64>
    """
    if not hasattr(x, "chunks") and hasattr(x, "to_dask_array"):
        x = x.to_dask_array()
    assert x.ndim == 2
    if y is not None:
        if not hasattr(y, "chunks") and hasattr(y, "to_dask_array"):
            y = y.to_dask_array()

        assert y.ndim == 1
        assert x.chunks[0] == y.chunks[0]

    assert hasattr(model, "partial_fit")
    if len(x.chunks[1]) > 1:
        x = x.rechunk(chunks=(x.chunks[0], sum(x.chunks[1])))

    nblocks = len(x.chunks[0])
    order = list(range(nblocks))
    if shuffle_blocks:
        rng = sklearn.utils.check_random_state(random_state)
        rng.shuffle(order)

    name = "fit-" + dask.base.tokenize(model, x, y, kwargs, order)
    dsk = {(name, -1): model}
    dsk.update({(name, i): (
        _partial_fit,
        (name, i - 1),
        (x.name, order[i], 0),
        (getattr(y, "name", ""), order[i]),
        kwargs,
    )
                for i in range(nblocks)})

    graphs = {x.name: x.__dask_graph__(), name: dsk}
    if hasattr(y, "__dask_graph__"):
        graphs[y.name] = y.__dask_graph__()

    try:
        from dask.highlevelgraph import HighLevelGraph

        new_dsk = HighLevelGraph.merge(*graphs.values())
    except ImportError:
        from dask import sharedict

        new_dsk = sharedict.merge(*graphs.values())

    value = Delayed((name, nblocks - 1), new_dsk)

    if compute:
        return value.compute()
    else:
        return value