Example #1
def optimize(dsk, keys, **kwargs):
    flatkeys = list(flatten(keys)) if isinstance(keys, list) else [keys]
    dsk, dependencies = cull(dsk, flatkeys)
    dsk, dependencies = fuse(dsk, keys, dependencies=dependencies,
                             ave_width=_globals.get('fuse_ave_width', 1))
    dsk, _ = cull(dsk, keys)
    return dsk
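For reference, a minimal sketch of what the cull/fuse pipeline above does on a toy graph (the inc task and the key names are illustrative, not part of the original code):

from dask.optimization import cull, fuse

def inc(x):
    return x + 1

dsk = {'w': 1, 'x': (inc, 'w'), 'y': (inc, 'x'), 'unused': (inc, 'w')}
culled, deps = cull(dsk, ['y'])                       # drops 'unused'
fused, deps = fuse(culled, ['y'], dependencies=deps)  # merges the inc chain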
Example #2
def inlined_array(a, inline_arrays=None):
    """ Flatten underlying graph """
    agraph = a.__dask_graph__()
    akeys = set(flatten(a.__dask_keys__()))

    # Inline everything except the output keys
    if inline_arrays is None:
        inline_keys = set(agraph.keys()) - akeys
        dsk2 = inline(agraph, keys=inline_keys, inline_constants=True)
        dsk3, _ = cull(dsk2, akeys)

        graph = HighLevelGraph.from_collections(a.name, dsk3, [])
        return da.Array(graph, a.name, a.chunks, dtype=a.dtype)

    # We're given specific arrays to inline, promote to list
    if isinstance(inline_arrays, da.Array):
        inline_arrays = [inline_arrays]
    elif isinstance(inline_arrays, tuple):
        inline_arrays = list(inline_arrays)

    if not isinstance(inline_arrays, list):
        raise TypeError("Invalid inline_arrays, must be "
                        "(None, list, tuple, dask.array.Array)")

    inline_names = set(a.name for a in inline_arrays)
    layers = agraph.layers.copy()
    deps = {k: v.copy() for k, v in agraph.dependencies.items()}
    # We want to inline layers that depend on the inlined arrays
    inline_layers = set(k for k, v in deps.items()
                        if len(inline_names.intersection(v)) > 0)

    for layer_name in inline_layers:
        dsk = dict(layers[layer_name])
        layer_keys = set(dsk.keys())
        inline_keys = set()

        for array in inline_arrays:
            dsk.update(layers[array.name])
            deps.pop(array.name, None)
            deps[layer_name].discard(array.name)
            inline_keys.update(layers[array.name].keys())

        dsk2 = inline(dsk, keys=inline_keys, inline_constants=True)
        layers[layer_name], _ = cull(dsk2, layer_keys)

    # Remove layers containing the inlined arrays
    for inline_name in inline_names:
        layers.pop(inline_name)

    return da.Array(HighLevelGraph(layers, deps), a.name, a.chunks, a.dtype)
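A hypothetical usage sketch, assuming the names used above (dask.array as da, plus flatten, inline, cull and HighLevelGraph from dask) are in scope:

import dask.array as da

x = da.arange(10, chunks=5)
y = (x + 1) * 2
flat = inlined_array(y)        # inline every intermediate into y's own layer
partial = inlined_array(y, x)  # inline only the layers of the given arrays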
Example #3
def inlined_array(a, inline_arrays=None):
    """ Flatten underlying graph """
    agraph = a.__dask_graph__()
    akeys = set(flatten(a.__dask_keys__()))

    # Inline everything except the output keys
    if inline_arrays is None:
        inline_keys = set(agraph.keys()) - akeys
        dsk2 = inline(agraph, keys=inline_keys, inline_constants=True)
        dsk3, _ = cull(dsk2, akeys)

        graph = HighLevelGraph.from_collections(a.name, dsk3, [])
        return da.Array(graph, a.name, a.chunks, dtype=a.dtype)

    # We're given specific arrays to inline, promote to list
    if isinstance(inline_arrays, da.Array):
        inline_arrays = [inline_arrays]
    elif isinstance(inline_arrays, tuple):
        inline_arrays = list(inline_arrays)

    if not isinstance(inline_arrays, list):
        raise TypeError("Invalid inline_arrays, must be "
                        "(None, list, tuple, dask.array.Array)")

    layers = agraph.layers.copy()
    deps = agraph.dependencies.copy()
    inline_keys = set()
    dsk = dict(layers[a.name])

    # Inline specified arrays
    for array in inline_arrays:
        # Remove array from layers and dependencies
        try:
            dsk.update(layers.pop(array.name))
            del deps[array.name]
        except KeyError:
            raise ValueError("%s is not a valid dependency of a"
                             % array.name)

        # Record keys to inline
        inline_keys.update(flatten(array.__dask_keys__()))

    dsk2 = inline(dsk, keys=inline_keys, inline_constants=True)
    dsk3, _ = cull(dsk2, akeys)

    layers[a.name] = dsk3
    graph = HighLevelGraph(layers, deps)

    return da.Array(graph, a.name, a.chunks, a.dtype)
Example #4
def cached_array(array):
    """
    Return a new array that functionally has the same values as array,
    but flattens the underlying graph and introduces a cache lookup
    when the individual array chunks are accessed.

    Useful for caching data that can fit in-memory for the duration
    of the graph's execution.
    """
    dsk = dict(array.__dask_graph__())
    keys = set(flatten(array.__dask_keys__()))

    # Inline + cull everything except the current array
    inline_keys = set(dsk.keys() - keys)
    dsk2 = inline(dsk, inline_keys, inline_constants=True)
    dsk3, _ = cull(dsk2, keys)

    # Create a cache used to store array values
    cache = ArrayCache(uuid.uuid4().hex)

    for k in keys:
        dsk3[k] = (cache_entry, cache, Key(k), dsk3.pop(k))

    graph = HighLevelGraph.from_collections(array.name, dsk3, [])

    return da.Array(graph, array.name, array.chunks, array.dtype)
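ArrayCache, cache_entry and Key are defined elsewhere in the project this snippet comes from; a hypothetical minimal sketch of the roles they play:

class Key:
    # Wraps a task key so dask does not treat it as a dependency
    # of the cache_entry task.
    def __init__(self, key):
        self.key = key

class ArrayCache:
    # Dict-backed chunk store identified by a unique token.
    def __init__(self, token):
        self.token = token
        self.store = {}

def cache_entry(cache, key, value):
    # Return the cached chunk if present, otherwise store the
    # freshly computed value and return it.
    return cache.store.setdefault(key.key, value)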
Example #5
def optimize(dsk, keys, **kwargs):
    if not isinstance(keys, (list, set)):
        keys = [keys]
    keys = list(core.flatten(keys))

    if not isinstance(dsk, HighLevelGraph):
        dsk = HighLevelGraph.from_collections(id(dsk), dsk, dependencies=())
    else:
        # Perform Blockwise optimizations for HLG input
        dsk = optimize_dataframe_getitem(dsk, keys=keys)
        dsk = optimize_blockwise(dsk, keys=keys)
        dsk = fuse_roots(dsk, keys=keys)
    dsk = dsk.cull(set(keys))

    # Do not perform low-level fusion unless the user has
    # specified True explicitly. The configuration will
    # be None by default.
    if not config.get("optimization.fuse.active"):
        return dsk

    dependencies = dsk.get_all_dependencies()
    dsk = ensure_dict(dsk)

    fuse_subgraphs = config.get("optimization.fuse.subgraphs")
    if fuse_subgraphs is None:
        fuse_subgraphs = True
    dsk, _ = fuse(
        dsk,
        keys,
        dependencies=dependencies,
        fuse_subgraphs=fuse_subgraphs,
    )
    dsk, _ = cull(dsk, keys)
    return dsk
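The low-level fusion step above is gated on dask's configuration system; a short sketch of toggling the two config keys the function reads:

import dask

with dask.config.set({"optimization.fuse.active": True,
                      "optimization.fuse.subgraphs": False}):
    ...  # optimize() now runs low-level fuse() without subgraph fusion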
Example #6
def inline_pattern(dsk: dict, pat_ls: List[str], inline_constants: bool) -> dict:
    """
    Inline tasks whose keys match certain patterns.

    Parameters
    ----------
    dsk : dict
        Input dask graph.
    pat_ls : List[str]
        List of patterns to check.
    inline_constants : bool
        Whether to inline constants.

    Returns
    -------
    dsk : dict
        Dask graph with keys inlined.

    See Also
    --------
    dask.optimization.inline
    """
    keys = [k for k in dsk.keys() if check_pat(k, pat_ls)]
    if keys:
        dsk = inline(dsk, keys, inline_constants=inline_constants)
        for k in keys:
            del dsk[k]
        if inline_constants:
            dsk, dep = cull(dsk, set(list(flatten(keys))))
    return dsk
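check_pat is project-specific and not shown here; a plausible sketch, assuming fnmatch-style patterns are matched against the name part of each key:

from fnmatch import fnmatch

def check_pat(key, pat_ls):
    # Dask keys are either strings or tuples such as ("name", i, j).
    name = key[0] if isinstance(key, tuple) else key
    return any(fnmatch(str(name), pat) for pat in pat_ls)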
Example #7
    def get(self, key=None):
        """Execute and return the result of the computation *key*.

        Only *key* and its dependencies are computed.

        Parameters
        ----------
        key : str, optional
            If not provided, :attr:`default_key` is used.

        Raises
        ------
        ValueError
            If `key` and :attr:`default_key` are both :obj:`None`.
        """
        if key is None:
            if self.default_key is not None:
                key = self.default_key
            else:
                raise ValueError('no default reporting key set')

        # Cull the graph, leaving only those needed to compute *key*
        dsk, deps = cull(self.graph, key)
        log.debug('Cull {} -> {} keys'.format(len(self.graph), len(dsk)))

        try:
            return dask_get(dsk, key)
        except Exception as exc:
            raise ComputationError from exc
Example #8
def test_SubgraphCallable():
    non_hashable = [1, 2, 3]

    dsk = {
        'a': (apply, add, ['in1', 2]),
        'b': (apply, partial_by_order, ['in2'], {
            'function': func_with_kwargs,
            'other': [(1, 20)],
            'c': 4
        }),
        'c': (apply, partial_by_order, ['in2', 'in1'], {
            'function': func_with_kwargs,
            'other': [(1, 20)]
        }),
        'd': (inc, 'a'),
        'e': (add, 'c', 'd'),
        'f': ['a', 2, 'b', (add, 'b', (sum, non_hashable))],
        'h': (add, (sum, 'f'), (sum, ['a', 'b']))
    }

    f = SubgraphCallable(dsk, 'h', ['in1', 'in2'], name='test')
    assert f.name == 'test'
    assert repr(f) == 'test'

    dsk2 = dsk.copy()
    dsk2.update({'in1': 1, 'in2': 2})
    assert f(1, 2) == get_sync(cull(dsk2, ['h'])[0], ['h'])[0]
    assert f(1, 2) == f(1, 2)

    f2 = pickle.loads(pickle.dumps(f))
    assert f2(1, 2) == f(1, 2)
Example #9
def test_SubgraphCallable():
    non_hashable = [1, 2, 3]

    dsk = {'a': (apply, add, ['in1', 2]),
           'b': (apply, partial_by_order, ['in2'],
                 {'function': func_with_kwargs, 'other': [(1, 20)], 'c': 4}),
           'c': (apply, partial_by_order, ['in2', 'in1'],
                 {'function': func_with_kwargs, 'other': [(1, 20)]}),
           'd': (inc, 'a'),
           'e': (add, 'c', 'd'),
           'f': ['a', 2, 'b', (add, 'b', (sum, non_hashable))],
           'g': (dontcall, 'in1'),
           'h': (add, (sum, 'f'), (sum, ['a', 'b']))}

    f = SubgraphCallable(dsk, 'h', ['in1', 'in2'], name='test')
    assert f.name == 'test'
    assert repr(f) == 'test'

    dsk2 = dsk.copy()
    dsk2.update({'in1': 1, 'in2': 2})
    assert f(1, 2) == get_sync(cull(dsk2, ['h'])[0], ['h'])[0]
    assert f(1, 2) == f(1, 2)

    f2 = pickle.loads(pickle.dumps(f))
    assert f2(1, 2) == f(1, 2)
Example #10
def test_SubgraphCallable():
    non_hashable = [1, 2, 3]

    dsk = {
        "a": (apply, add, ["in1", 2]),
        "b": (
            apply,
            partial_by_order,
            ["in2"],
            {"function": func_with_kwargs, "other": [(1, 20)], "c": 4},
        ),
        "c": (
            apply,
            partial_by_order,
            ["in2", "in1"],
            {"function": func_with_kwargs, "other": [(1, 20)]},
        ),
        "d": (inc, "a"),
        "e": (add, "c", "d"),
        "f": ["a", 2, "b", (add, "b", (sum, non_hashable))],
        "h": (add, (sum, "f"), (sum, ["a", "b"])),
    }

    f = SubgraphCallable(dsk, "h", ["in1", "in2"], name="test")
    assert f.name == "test"
    assert repr(f) == "test"

    dsk2 = dsk.copy()
    dsk2.update({"in1": 1, "in2": 2})
    assert f(1, 2) == get_sync(cull(dsk2, ["h"])[0], ["h"])[0]
    assert f(1, 2) == f(1, 2)

    f2 = pickle.loads(pickle.dumps(f))
    assert f2(1, 2) == f(1, 2)
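The tests above exercise dask's SubgraphCallable, which packages a graph plus an ordered list of input keys into a plain callable; a minimal demonstration:

from operator import add
from dask.optimization import SubgraphCallable

f = SubgraphCallable({'out': (add, 'a', 'b')}, 'out', ['a', 'b'])
assert f(1, 2) == 3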
Example #11
def test_shuffle_hlg_layer():
    # This test checks that the `ShuffleLayer` HLG Layer
    # is used (as expected) for a multi-stage shuffle.
    ddf = dd.from_pandas(pd.DataFrame({"a": np.random.randint(0, 10, 100)}),
                         npartitions=10)
    ddf_shuffled = ddf.shuffle("a", max_branch=3, shuffle="tasks")
    keys = [(ddf_shuffled._name, i) for i in range(ddf_shuffled.npartitions)]

    # Make sure HLG culling reduces the graph size
    dsk = ddf_shuffled.__dask_graph__()
    dsk_culled = dsk.cull(set(keys))
    assert len(dsk_culled) < len(dsk)
    assert isinstance(dsk_culled, dask.highlevelgraph.HighLevelGraph)

    # Ensure we have ShuffleLayers
    assert any(
        isinstance(layer, dd.shuffle.ShuffleLayer)
        for layer in dsk.layers.values())
    # Check ShuffleLayer names
    for name, layer in dsk.layers.items():
        if isinstance(layer, dd.shuffle.ShuffleLayer):
            assert name.startswith("shuffle-")

    # Since we already culled the HLG,
    # culling the dictionary should not change the graph
    dsk_dict = dict(dsk_culled)
    dsk_dict_culled, _ = cull(dsk_dict, keys)
    assert dsk_dict_culled == dsk_dict
Example #12
def test_fuse_getitem():
    def load(*args):
        pass

    dsk = {"x": (load, "store", "part", ["a", "b"]), "y": (getitem, "x", "a")}
    dsk2 = fuse_getitem(dsk, load, 3)
    dsk2, dependencies = cull(dsk2, "y")
    assert dsk2 == {"y": (load, "store", "part", "a")}
Example #13
def test_fuse_getitem():
    def load(*args):
        pass
    dsk = {'x': (load, 'store', 'part', ['a', 'b']),
           'y': (getitem, 'x', 'a')}
    dsk2 = fuse_getitem(dsk, load, 3)
    dsk2, dependencies = cull(dsk2, 'y')
    assert dsk2 == {'y': (load, 'store', 'part', 'a')}
Example #14
def _generate_dask_graph(data, keys):
    """
    Generate a dask graph from a subset of REGISTRY.
    """
    tasks = cull(REGISTRY, keys)[0]
    for k in unresolved(tasks, keys):
        tasks[k] = data[k]
    return tasks
Example #16
def test_fuse_selections():
    def load(*args):
        pass
    dsk = {'x': (load, 'store', 'part', ['a', 'b']),
           'y': (getitem, 'x', 'a')}
    merge = lambda t1, t2: (load, t2[1], t2[2], t1[2])
    dsk2 = fuse_selections(dsk, getitem, load, merge)
    dsk2, dependencies = cull(dsk2, 'y')
    assert dsk2 == {'y': (load, 'store', 'part', 'a')}
Example #17
def test_inline_cull_dependencies():
    d = {'a': 1,
         'b': 'a',
         'c': 'b',
         'd': ['a', 'b', 'c'],
         'e': (add, (len, 'd'), 'a')}

    d2, dependencies = cull(d, ['d', 'e'])
    inline(d2, {'b'}, dependencies=dependencies)
Example #20
def test_fuse_selections():
    def load(*args):
        pass

    dsk = {"x": (load, "store", "part", ["a", "b"]), "y": (getitem, "x", "a")}
    merge = lambda t1, t2: (load, t2[1], t2[2], t1[2])
    dsk2 = fuse_selections(dsk, getitem, load, merge)
    dsk2, dependencies = cull(dsk2, "y")
    assert dsk2 == {"y": (load, "store", "part", "a")}
Example #21
 def __dask_optimize__(dsk, keys, **kwargs):
     """
     Notes
     -----
     The dask default optimizer induces too many (unnecessary)
     IO calls -- we turn this feature off by default and only apply culling.
     """
     from dask.optimization import cull
     dsk2, dependencies = cull(dsk, keys)
     return dsk2
Example #22
def test_inline_cull_dependencies():
    d = {
        "a": 1,
        "b": "a",
        "c": "b",
        "d": ["a", "b", "c"],
        "e": (add, (len, "d"), "a")
    }

    d2, dependencies = cull(d, ["d", "e"])
    inline(d2, {"b"}, dependencies=dependencies)
Example #23
def get(graph, key, local=True):
    # New function: from version 19.1 on, dask no longer culls graphs itself
    # (apparently only distributed does), so this combines a get call with
    # prior culling, i.e. trimming the graph so that only the part relevant
    # to *key* remains.

    import dask
    from dask.optimization import cull
    cgraph = cull(graph, key)[0]

    # with ProgressBar():
    if local:
        return dask.local.get_sync(cgraph, key)
    else:
        return dask.threaded.get(cgraph, key)
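A hypothetical usage sketch:

graph = {'a': 1, 'b': (sum, ['a', 'a']), 'unused': (print, 'a')}
assert get(graph, 'b') == 2  # 'unused' is culled away before execution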
Example #24
def test_cull():
    # 'out' depends on 'x' and 'y', but not 'z'
    d = {'x': 1, 'y': (inc, 'x'), 'z': (inc, 'x'), 'out': (add, 'y', 10)}
    culled, dependencies = cull(d, 'out')
    assert culled == {'x': 1, 'y': (inc, 'x'), 'out': (add, 'y', 10)}
    assert dependencies == {'x': [], 'y': ['x'], 'out': ['y']}

    assert cull(d, 'out') == cull(d, ['out'])
    assert cull(d, ['out', 'z'])[0] == d
    assert cull(d, [['out'], ['z']]) == cull(d, ['out', 'z'])
    pytest.raises(KeyError, lambda: cull(d, 'badkey'))
Example #26
def test_cull():
    # 'out' depends on 'x' and 'y', but not 'z'
    d = {"x": 1, "y": (inc, "x"), "z": (inc, "x"), "out": (add, "y", 10)}
    culled, dependencies = cull(d, "out")
    assert culled == {"x": 1, "y": (inc, "x"), "out": (add, "y", 10)}
    assert dependencies == {"x": [], "y": ["x"], "out": ["y"]}

    assert cull(d, "out") == cull(d, ["out"])
    assert cull(d, ["out", "z"])[0] == d
    assert cull(d, [["out"], ["z"]]) == cull(d, ["out", "z"])
    pytest.raises(KeyError, lambda: cull(d, "badkey"))
Example #27
    def __dask_optimize__(dsk, keys, **kwargs):
        """
        Optimize the dask object.

        .. note::

            The dask default optimizer induces too many (unnecessary)
            IO calls. We turn this feature off by default and only apply culling.

        """
        from dask.optimization import cull
        dsk2, dependencies = cull(dsk, keys)
        return dsk2
Example #28
 def __dask_optimize__(cls, dsk, keys):
     dsk1, _ = optimization.cull(dsk, keys)
     dsk2 = {}
     coll = []
     for key, val in dsk1.items():
         if isinstance(key, tuple) and key[0].startswith('image'):
             name, z, x, y = key
             dfn, url, token, chunk = val
             dsk2[key] = (operator.getitem, "load_urls", (z, x, y))
             coll.append([url, token, (z, x, y)])
         else:
             dsk2[key] = val
     dsk2['load_urls'] = (cls.__fetch__, coll)
     return dsk2
Example #30
    def get(self, key=None):
        """Execute and return the result of the computation *key*.

        Only *key* and its dependencies are computed.

        Parameters
        ----------
        key : str, optional
            If not provided, :attr:`default_key` is used.

        Raises
        ------
        ValueError
            If `key` and :attr:`default_key` are both :obj:`None`.
        """
        if key is None:
            if self.default_key is not None:
                key = self.default_key
            else:
                raise ValueError('no default reporting key set')

        # Cull the graph, leaving only those needed to compute *key*
        dsk, deps = cull(self.graph, key)
        log.debug('Cull {} -> {} keys'.format(len(self.graph), len(dsk)))

        try:
            # Protect 'config' dict, so that dask schedulers do not try to
            # interpret its contents as further tasks. Workaround for
            # https://github.com/dask/dask/issues/3523
            dsk['config'] = dask.core.quote(dsk['config'])
        except KeyError:
            pass

        try:
            return dask_get(dsk, key)
        except Exception as exc:
            # Print the exception in case ComputationError.__str__ fails;
            # workaround for https://github.com/iiasa/ixmp/issues/206
            print(exc)
            raise ComputationError from exc
Example #31
def cached_array(array, token=None):
    """
    Return a new array that functionally has the same values as array,
    but flattens the underlying graph and introduces a cache lookup
    when the individual array chunks are accessed.

    Useful for caching data that can fit in-memory for the duration
    of the graph's execution.

    Parameters
    ----------
    array : :class:`dask.array.Array`
        dask array to cache.
    token : str, optional
        A unique token for identifying the internal cache.
        If None, it will be automatically generated.
    """
    dsk = dict(array.__dask_graph__())
    keys = set(flatten(array.__dask_keys__()))

    if token is None:
        token = uuid.uuid4().hex

    # Inline + cull everything except the current array
    inline_keys = set(dsk.keys() - keys)
    dsk2 = inline(dsk, inline_keys, inline_constants=True)
    dsk3, _ = cull(dsk2, keys)

    # Create a cache used to store array values
    cache = ArrayCache(token)

    assert len(dsk3) == len(keys)

    for k in keys:
        dsk3[k] = (cache_entry, cache, Key(k), dsk3.pop(k))

    graph = HighLevelGraph.from_collections(array.name, dsk3, [])

    return da.Array(graph, array.name, array.chunks, array.dtype)
Example #32
def optimize(dsk, keys, **kwargs):
    dsk, _ = cull(dsk, keys)
    return dsk
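This is the minimal collection optimizer: culling alone. A sketch of wiring it into a custom dask collection through the __dask_optimize__ hook (MyCollection is hypothetical):

class MyCollection:
    # dask calls this with the graph and output keys during compute()
    __dask_optimize__ = staticmethod(optimize)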
Example #33
 def cull2(dsk, keys):  # type: ignore
     return cull(dsk if type(dsk) is dict else dict(dsk), keys)
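(cull expects a materialized mapping, hence the dict(dsk) conversion for inputs such as HighLevelGraph that are not already plain dicts.)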
Example #34
 def from_darray(cls, darr, new=tuple.__new__, len=len):
     dsk, _ = optimization.cull(darr.dask, darr.__dask_keys__())
     itr = [dsk, darr.name, darr.chunks, darr.dtype, darr.shape]
     return cls._make(itr)
Example #35
    def warp(self, dem=None, proj="EPSG:4326", **kwargs):
        """Delayed warp across an entire AOI or Image

        Creates a new dask image by deferring calls to the warp_geometry on chunks

        Args:
            dem (ndarray): optional. A DEM for warping to specific elevation planes
            proj (str): optional. An EPSG proj string to project the image data into ("EPSG:32612")

        Returns:
            daskarray: a warped image as deferred image array
        """
        try:
            img_md = self.rda.metadata["image"]
            x_size = img_md["tileXSize"]
            y_size = img_md["tileYSize"]
        except (AttributeError, KeyError):
            x_size = kwargs.get("chunk_size", 256)
            y_size = kwargs.get("chunk_size", 256)

        # Create an affine transform to convert between real-world and pixels
        if self.proj is None:
            from_proj = "EPSG:4326"
        else:
            from_proj = self.proj

        try:
            # NOTE: this only works on images that have rda rpcs metadata
            center = wkt.loads(self.rda.metadata["image"]["imageBoundsWGS84"]).centroid
            g = box(*(center.buffer(self.rda.metadata["rpcs"]["gsd"] / 2).bounds))
            tfm = partial(pyproj.transform, pyproj.Proj(init="EPSG:4326"), pyproj.Proj(init=proj))
            gsd = kwargs.get("gsd", ops.transform(tfm, g).area ** 0.5)
            current_bounds = wkt.loads(self.rda.metadata["image"]["imageBoundsWGS84"]).bounds
        except (AttributeError, KeyError, TypeError):
            tfm = partial(pyproj.transform, pyproj.Proj(init=self.proj), pyproj.Proj(init=proj))
            gsd = kwargs.get("gsd", (ops.transform(tfm, shape(self)).area / (self.shape[1] * self.shape[2])) ** 0.5 )
            current_bounds = self.bounds

        tfm = partial(pyproj.transform, pyproj.Proj(init=from_proj), pyproj.Proj(init=proj))
        itfm = partial(pyproj.transform, pyproj.Proj(init=proj), pyproj.Proj(init=from_proj))
        output_bounds = ops.transform(tfm, box(*current_bounds)).bounds
        gtf = Affine.from_gdal(output_bounds[0], gsd, 0.0, output_bounds[3], 0.0, -1 * gsd)

        ll = ~gtf * (output_bounds[:2])
        ur = ~gtf * (output_bounds[2:])
        x_chunks = int((ur[0] - ll[0]) / x_size) + 1
        y_chunks = int((ll[1] - ur[1]) / y_size) + 1

        num_bands = self.shape[0]

        try:
            dtype = RDA_TO_DTYPE[img_md["dataType"]]
        except Exception:
            dtype = 'uint8'

        daskmeta = {
            "dask": {},
            "chunks": (num_bands, y_size, x_size),
            "dtype": dtype,
            "name": "warp-{}".format(self.name),
            "shape": (num_bands, y_chunks * y_size, x_chunks * x_size)
        }

        def px_to_geom(xmin, ymin):
            xmax = int(xmin + x_size)
            ymax = int(ymin + y_size)
            bounds = list((gtf * (xmin, ymax)) + (gtf * (xmax, ymin)))
            return box(*bounds)

        full_bounds = box(*output_bounds)

        dasks = []
        if isinstance(dem, GeoDaskImage):
            if dem.proj != proj:
                dem = dem.warp(proj=proj, dem=dem)
            dasks.append(dem.dask)

        for y in range(y_chunks):
            for x in range(x_chunks):
                xmin = x * x_size
                ymin = y * y_size
                geometry = px_to_geom(xmin, ymin)
                daskmeta["dask"][(daskmeta["name"], 0, y, x)] = (self._warp, geometry, gsd, dem, proj, dtype, 5)
        daskmeta["dask"], _ = optimization.cull(HighLevelGraph.merge(daskmeta["dask"], *dasks), list(daskmeta["dask"].keys()))

        gi = mapping(full_bounds)
        gt = AffineTransform(gtf, proj)
        image = GeoDaskImage(daskmeta, __geo_interface__ = gi, __geo_transform__ = gt)
        return image[box(*output_bounds)]
Example #36
def featurize_single_ts(ts,
                        features_to_use,
                        custom_script_path=None,
                        custom_functions=None,
                        raise_exceptions=True):
    """Compute feature values for a given single time-series. Data is
    returned as dictionaries/lists of lists.

    Parameters
    ----------
    ts : TimeSeries object
        Single time series to be featurized.
    features_to_use : list of str
        List of feature names to be generated.
    custom_functions : dict, optional
        Dictionary of custom feature functions to be evaluated for the given
        time series, or a dictionary representing a dask graph of function
        evaluations. Dictionaries of functions should have keys `feature_name`
        and values functions that take arguments (t, m, e); in the case of a
        dask graph, these arrays should be referenced as 't', 'm', 'e',
        respectively, and any values with keys present in `features_to_use`
        will be computed.
    raise_exceptions : bool, optional
        If True, exceptions during feature computation are raised immediately;
        if False, exceptions are suppressed and `np.nan` is returned for the
        given feature and any dependent features. Defaults to True.

    Returns
    -------
    dict
        Dictionary with feature names as keys, lists of feature values (one per
        channel) as values.
    """
    # Initialize empty feature array for all channels
    feature_values = np.empty((len(features_to_use), ts.n_channels))
    for (t_i, m_i, e_i), i in zip(ts.channels(), range(ts.n_channels)):
        feature_graph = generate_dask_graph(t_i, m_i, e_i)
        feature_graph.update(ts.meta_features)

        if custom_functions:
            # If values in custom_functions are functions, add calls to graph
            if all(hasattr(v, '__call__') for v in custom_functions.values()):
                feature_graph.update({
                    feat: f(t_i, m_i, e_i)
                    for feat, f in custom_functions.items()
                })
            # Otherwise, custom_functions is another dask graph
            else:
                feature_graph.update(custom_functions)

        # Do not execute in parallel; parallelization has already taken place
        # at the level of time series, so we compute features for a single time
        # series in serial.
        if raise_exceptions:
            raise_callback = reraise
        else:
            raise_callback = lambda e, tb: None
        culled_feature_graph, _ = cull(feature_graph, features_to_use)
        dask_values = dask.get(culled_feature_graph,
                               features_to_use,
                               raise_exception=raise_callback,
                               pack_exception=pack_exception)
        feature_values[:, i] = [
            x if not isinstance(x, Exception) else np.nan for x in dask_values
        ]
    index = pd.MultiIndex.from_product((features_to_use, range(ts.n_channels)),
                                       names=('feature', 'channel'))
    return pd.Series(feature_values.ravel(), index=index)
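A hypothetical custom_functions graph of the kind the docstring describes, whose values reference the per-channel arrays through the keys 't', 'm', 'e' (ts is assumed to be an existing TimeSeries instance):

import numpy as np

custom = {'mean_mag': (np.mean, 'm'),
          'max_err': (np.max, 'e')}
features = featurize_single_ts(ts, ['mean_mag', 'max_err'],
                               custom_functions=custom)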
Example #37
    def warp(self, dem=None, proj="EPSG:4326", **kwargs):
        """Delayed warp across an entire AOI or Image

        Creates a new dask image by deferring calls to the warp_geometry on chunks

        Args:
            dem (ndarray): optional. A DEM for warping to specific elevation planes
            proj (str): optional. An EPSG proj string to project the image data into ("EPSG:32612")

        Returns:
            daskarray: a warped image as deferred image array
        """
        try:
            img_md = self.rda.metadata["image"]
            x_size = img_md["tileXSize"]
            y_size = img_md["tileYSize"]
        except (AttributeError, KeyError):
            x_size = kwargs.get("chunk_size", 256)
            y_size = kwargs.get("chunk_size", 256)

        # Create an affine transform to convert between real-world and pixels
        if self.proj is None:
            from_proj = "EPSG:4326"
        else:
            from_proj = self.proj

        try:
            # NOTE: this only works on images that have rda rpcs metadata
            center = wkt.loads(
                self.rda.metadata["image"]["imageBoundsWGS84"]).centroid
            g = box(*(center.buffer(self.rda.metadata["rpcs"]["gsd"] /
                                    2).bounds))
            tfm = partial(pyproj.transform, pyproj.Proj(init="EPSG:4326"),
                          pyproj.Proj(init=proj))
            gsd = kwargs.get("gsd", ops.transform(tfm, g).area**0.5)
            current_bounds = wkt.loads(
                self.rda.metadata["image"]["imageBoundsWGS84"]).bounds
        except (AttributeError, KeyError, TypeError):
            tfm = partial(pyproj.transform, pyproj.Proj(init=self.proj),
                          pyproj.Proj(init=proj))
            gsd = kwargs.get("gsd", (ops.transform(tfm, shape(self)).area /
                                     (self.shape[1] * self.shape[2]))**0.5)
            current_bounds = self.bounds

        tfm = partial(pyproj.transform, pyproj.Proj(init=from_proj),
                      pyproj.Proj(init=proj))
        itfm = partial(pyproj.transform, pyproj.Proj(init=proj),
                       pyproj.Proj(init=from_proj))
        output_bounds = ops.transform(tfm, box(*current_bounds)).bounds
        gtf = Affine.from_gdal(output_bounds[0], gsd, 0.0, output_bounds[3],
                               0.0, -1 * gsd)

        ll = ~gtf * (output_bounds[:2])
        ur = ~gtf * (output_bounds[2:])
        x_chunks = int((ur[0] - ll[0]) / x_size) + 1
        y_chunks = int((ll[1] - ur[1]) / y_size) + 1

        num_bands = self.shape[0]

        try:
            dtype = RDA_TO_DTYPE[img_md["dataType"]]
        except Exception:
            dtype = 'uint8'

        daskmeta = {
            "dask": {},
            "chunks": (num_bands, y_size, x_size),
            "dtype": dtype,
            "name": "warp-{}".format(self.name),
            "shape": (num_bands, y_chunks * y_size, x_chunks * x_size)
        }

        def px_to_geom(xmin, ymin):
            xmax = int(xmin + x_size)
            ymax = int(ymin + y_size)
            bounds = list((gtf * (xmin, ymax)) + (gtf * (xmax, ymin)))
            return box(*bounds)

        full_bounds = box(*output_bounds)

        dasks = []
        if isinstance(dem, GeoDaskImage):
            if dem.proj != proj:
                dem = dem.warp(proj=proj, dem=dem)
            dasks.append(dem.dask)

        for y in range(y_chunks):
            for x in range(x_chunks):
                xmin = x * x_size
                ymin = y * y_size
                geometry = px_to_geom(xmin, ymin)
                daskmeta["dask"][(daskmeta["name"], 0, y,
                                  x)] = (self._warp, geometry, gsd, dem, proj,
                                         dtype, 5)
        daskmeta["dask"], _ = optimization.cull(
            sharedict.merge(daskmeta["dask"], *dasks),
            list(daskmeta["dask"].keys()))

        gi = mapping(full_bounds)
        gt = AffineTransform(gtf, proj)
        image = GeoDaskImage(daskmeta,
                             __geo_interface__=gi,
                             __geo_transform__=gt)
        return image[box(*output_bounds)]