def optimize(dsk, keys, **kwargs):
    flatkeys = list(flatten(keys)) if isinstance(keys, list) else [keys]
    dsk, dependencies = cull(dsk, flatkeys)
    dsk, dependencies = fuse(dsk, keys, dependencies=dependencies,
                             ave_width=_globals.get('fuse_ave_width', 1))
    dsk, _ = cull(dsk, keys)
    return dsk
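# A minimal sketch (not taken from the snippet above) of what the cull + fuse
# combination does to a hand-built graph: cull drops tasks that the requested
# keys do not depend on, and fuse then collapses linear chains of tasks.
from operator import add

from dask.optimization import cull, fuse

dsk = {'w': 1,
       'x': (add, 'w', 1),
       'y': (add, 'x', 1),
       'unused': (add, 'w', 10)}
culled, dependencies = cull(dsk, ['y'])                     # 'unused' is removed
fused, dependencies = fuse(culled, ['y'], dependencies=dependencies)
assert 'unused' not in culled and 'y' in fused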
def inlined_array(a, inline_arrays=None):
    """ Flatten underlying graph """
    agraph = a.__dask_graph__()
    akeys = set(flatten(a.__dask_keys__()))

    # Inline everything except the output keys
    if inline_arrays is None:
        inline_keys = set(agraph.keys()) - akeys
        dsk2 = inline(agraph, keys=inline_keys, inline_constants=True)
        dsk3, _ = cull(dsk2, akeys)

        graph = HighLevelGraph.from_collections(a.name, dsk3, [])
        return da.Array(graph, a.name, a.chunks, dtype=a.dtype)

    # We're given specific arrays to inline, promote to list
    if isinstance(inline_arrays, da.Array):
        inline_arrays = [inline_arrays]
    elif isinstance(inline_arrays, tuple):
        inline_arrays = list(inline_arrays)

    if not isinstance(inline_arrays, list):
        raise TypeError("Invalid inline_arrays, must be "
                        "(None, list, tuple, dask.array.Array)")

    inline_names = set(a.name for a in inline_arrays)
    layers = agraph.layers.copy()
    deps = {k: v.copy() for k, v in agraph.dependencies.items()}
    # We want to inline layers that depend on the inlined arrays
    inline_layers = set(k for k, v in deps.items()
                        if len(inline_names.intersection(v)) > 0)

    for layer_name in inline_layers:
        dsk = dict(layers[layer_name])
        layer_keys = set(dsk.keys())
        inline_keys = set()

        for array in inline_arrays:
            dsk.update(layers[array.name])
            deps.pop(array.name, None)
            deps[layer_name].discard(array.name)
            inline_keys.update(layers[array.name].keys())

        dsk2 = inline(dsk, keys=inline_keys, inline_constants=True)
        layers[layer_name], _ = cull(dsk2, layer_keys)

    # Remove layers containing the inlined arrays
    for inline_name in inline_names:
        layers.pop(inline_name)

    return da.Array(HighLevelGraph(layers, deps), a.name, a.chunks, a.dtype)
def inlined_array(a, inline_arrays=None):
    """ Flatten underlying graph """
    agraph = a.__dask_graph__()
    akeys = set(flatten(a.__dask_keys__()))

    # Inline everything except the output keys
    if inline_arrays is None:
        inline_keys = set(agraph.keys()) - akeys
        dsk2 = inline(agraph, keys=inline_keys, inline_constants=True)
        dsk3, _ = cull(dsk2, akeys)

        graph = HighLevelGraph.from_collections(a.name, dsk3, [])
        return da.Array(graph, a.name, a.chunks, dtype=a.dtype)

    # We're given specific arrays to inline, promote to list
    if isinstance(inline_arrays, da.Array):
        inline_arrays = [inline_arrays]
    elif isinstance(inline_arrays, tuple):
        inline_arrays = list(inline_arrays)

    if not isinstance(inline_arrays, list):
        raise TypeError("Invalid inline_arrays, must be "
                        "(None, list, tuple, dask.array.Array)")

    layers = agraph.layers.copy()
    deps = agraph.dependencies.copy()
    inline_keys = set()
    dsk = dict(layers[a.name])

    # Inline specified arrays
    for array in inline_arrays:
        # Remove array from layers and dependencies
        try:
            dsk.update(layers.pop(array.name))
            del deps[array.name]
        except KeyError:
            raise ValueError("%s is not a valid dependency of a"
                             % array.name)

        # Record keys to inline
        inline_keys.update(flatten(array.__dask_keys__()))

    dsk2 = inline(dsk, keys=inline_keys, inline_constants=True)
    dsk3, _ = cull(dsk2, akeys)

    layers[a.name] = dsk3
    graph = HighLevelGraph(layers, deps)

    return da.Array(graph, a.name, a.chunks, a.dtype)
def cached_array(array):
    """
    Return a new array that functionally has the same values as array,
    but flattens the underlying graph and introduces a cache lookup
    when the individual array chunks are accessed.

    Useful for caching data that can fit in-memory for the
    duration of the graph's execution.
    """
    dsk = dict(array.__dask_graph__())
    keys = set(flatten(array.__dask_keys__()))

    # Inline + cull everything except the current array
    inline_keys = set(dsk.keys() - keys)
    dsk2 = inline(dsk, inline_keys, inline_constants=True)
    dsk3, _ = cull(dsk2, keys)

    # Create a cache used to store array values
    cache = ArrayCache(uuid.uuid4().hex)

    for k in keys:
        dsk3[k] = (cache_entry, cache, Key(k), dsk3.pop(k))

    graph = HighLevelGraph.from_collections(array.name, dsk3, [])

    return da.Array(graph, array.name, array.chunks, array.dtype)
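# Hypothetical usage sketch for the helper above, assuming the ArrayCache /
# cache_entry / Key machinery it references is importable from the same
# project: every chunk of `x` is computed once, and later reads of the same
# chunk within the graph are served from the cache.
#
#   import dask.array as da
#   x = da.random.random((1000, 1000), chunks=(500, 500))
#   cx = cached_array(x)
#   result = (cx + cx.T).sum().compute()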
def optimize(dsk, keys, **kwargs):
    if not isinstance(keys, (list, set)):
        keys = [keys]
    keys = list(core.flatten(keys))

    if not isinstance(dsk, HighLevelGraph):
        dsk = HighLevelGraph.from_collections(id(dsk), dsk, dependencies=())
    else:
        # Perform Blockwise optimizations for HLG input
        dsk = optimize_dataframe_getitem(dsk, keys=keys)
        dsk = optimize_blockwise(dsk, keys=keys)
        dsk = fuse_roots(dsk, keys=keys)
    dsk = dsk.cull(set(keys))

    # Do not perform low-level fusion unless the user has
    # specified True explicitly. The configuration will
    # be None by default.
    if not config.get("optimization.fuse.active"):
        return dsk

    dependencies = dsk.get_all_dependencies()
    dsk = ensure_dict(dsk)

    fuse_subgraphs = config.get("optimization.fuse.subgraphs")
    if fuse_subgraphs is None:
        fuse_subgraphs = True
    dsk, _ = fuse(
        dsk,
        keys,
        dependencies=dependencies,
        fuse_subgraphs=fuse_subgraphs,
    )
    dsk, _ = cull(dsk, keys)
    return dsk
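# A minimal sketch (independent of the function above, assuming a recent dask
# where HighLevelGraph.cull exists) of the two culling levels it combines:
# HighLevelGraph.cull prunes layer by layer, while dask.optimization.cull
# prunes a plain task dict.
import dask.array as da
from dask.core import flatten
from dask.optimization import cull

x = da.ones((4, 4), chunks=(2, 2))
y = (x + 1)[:2, :2]                        # only some chunks of x are needed
keys = list(flatten(y.__dask_keys__()))
hlg_culled = y.__dask_graph__().cull(set(keys))   # layer-level culling
low_culled, deps = cull(dict(hlg_culled), keys)   # dict-level culling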
def inline_pattern(dsk: dict, pat_ls: List[str], inline_constants: bool) -> dict:
    """Inline tasks whose keys match certain patterns.

    Parameters
    ----------
    dsk : dict
        Input dask graph.
    pat_ls : List[str]
        List of patterns to check.
    inline_constants : bool
        Whether to inline constants.

    Returns
    -------
    dsk : dict
        Dask graph with keys inlined.

    See Also
    --------
    dask.optimization.inline
    """
    keys = [k for k in dsk.keys() if check_pat(k, pat_ls)]
    if keys:
        dsk = inline(dsk, keys, inline_constants=inline_constants)
        for k in keys:
            del dsk[k]
        if inline_constants:
            dsk, dep = cull(dsk, set(list(flatten(keys))))
    return dsk
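# Hypothetical helper assumed by inline_pattern above (check_pat is not shown
# in the original snippet): treat each pattern as an fnmatch-style glob and
# match it against the key, or against the name part of a tuple key.
from fnmatch import fnmatch

def check_pat(key, pat_ls):
    name = key[0] if isinstance(key, tuple) else key
    return any(fnmatch(str(name), pat) for pat in pat_ls)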
def get(self, key=None):
    """Execute and return the result of the computation *key*.

    Only *key* and its dependencies are computed.

    Parameters
    ----------
    key : str, optional
        If not provided, :attr:`default_key` is used.

    Raises
    ------
    ValueError
        If `key` and :attr:`default_key` are both :obj:`None`.
    """
    if key is None:
        if self.default_key is not None:
            key = self.default_key
        else:
            raise ValueError('no default reporting key set')

    # Cull the graph, leaving only those needed to compute *key*
    dsk, deps = cull(self.graph, key)
    log.debug('Cull {} -> {} keys'.format(len(self.graph), len(dsk)))

    try:
        return dask_get(dsk, key)
    except Exception as exc:
        raise ComputationError from exc
def test_SubgraphCallable():
    non_hashable = [1, 2, 3]

    dsk = {
        'a': (apply, add, ['in1', 2]),
        'b': (apply, partial_by_order, ['in2'], {
            'function': func_with_kwargs,
            'other': [(1, 20)],
            'c': 4
        }),
        'c': (apply, partial_by_order, ['in2', 'in1'], {
            'function': func_with_kwargs,
            'other': [(1, 20)]
        }),
        'd': (inc, 'a'),
        'e': (add, 'c', 'd'),
        'f': ['a', 2, 'b', (add, 'b', (sum, non_hashable))],
        'h': (add, (sum, 'f'), (sum, ['a', 'b']))
    }

    f = SubgraphCallable(dsk, 'h', ['in1', 'in2'], name='test')
    assert f.name == 'test'
    assert repr(f) == 'test'

    dsk2 = dsk.copy()
    dsk2.update({'in1': 1, 'in2': 2})
    assert f(1, 2) == get_sync(cull(dsk2, ['h'])[0], ['h'])[0]
    assert f(1, 2) == f(1, 2)

    f2 = pickle.loads(pickle.dumps(f))
    assert f2(1, 2) == f(1, 2)
def test_SubgraphCallable():
    non_hashable = [1, 2, 3]

    dsk = {'a': (apply, add, ['in1', 2]),
           'b': (apply, partial_by_order, ['in2'],
                 {'function': func_with_kwargs,
                  'other': [(1, 20)],
                  'c': 4}),
           'c': (apply, partial_by_order, ['in2', 'in1'],
                 {'function': func_with_kwargs,
                  'other': [(1, 20)]}),
           'd': (inc, 'a'),
           'e': (add, 'c', 'd'),
           'f': ['a', 2, 'b', (add, 'b', (sum, non_hashable))],
           'g': (dontcall, 'in1'),
           'h': (add, (sum, 'f'), (sum, ['a', 'b']))}

    f = SubgraphCallable(dsk, 'h', ['in1', 'in2'], name='test')
    assert f.name == 'test'
    assert repr(f) == 'test'

    dsk2 = dsk.copy()
    dsk2.update({'in1': 1, 'in2': 2})
    assert f(1, 2) == get_sync(cull(dsk2, ['h'])[0], ['h'])[0]
    assert f(1, 2) == f(1, 2)

    f2 = pickle.loads(pickle.dumps(f))
    assert f2(1, 2) == f(1, 2)
def test_SubgraphCallable():
    non_hashable = [1, 2, 3]

    dsk = {
        "a": (apply, add, ["in1", 2]),
        "b": (
            apply,
            partial_by_order,
            ["in2"],
            {"function": func_with_kwargs, "other": [(1, 20)], "c": 4},
        ),
        "c": (
            apply,
            partial_by_order,
            ["in2", "in1"],
            {"function": func_with_kwargs, "other": [(1, 20)]},
        ),
        "d": (inc, "a"),
        "e": (add, "c", "d"),
        "f": ["a", 2, "b", (add, "b", (sum, non_hashable))],
        "h": (add, (sum, "f"), (sum, ["a", "b"])),
    }

    f = SubgraphCallable(dsk, "h", ["in1", "in2"], name="test")
    assert f.name == "test"
    assert repr(f) == "test"

    dsk2 = dsk.copy()
    dsk2.update({"in1": 1, "in2": 2})
    assert f(1, 2) == get_sync(cull(dsk2, ["h"])[0], ["h"])[0]
    assert f(1, 2) == f(1, 2)

    f2 = pickle.loads(pickle.dumps(f))
    assert f2(1, 2) == f(1, 2)
def test_shuffle_hlg_layer():
    # This test checks that the `ShuffleLayer` HLG Layer
    # is used (as expected) for a multi-stage shuffle.
    ddf = dd.from_pandas(
        pd.DataFrame({"a": np.random.randint(0, 10, 100)}), npartitions=10
    )
    ddf_shuffled = ddf.shuffle("a", max_branch=3, shuffle="tasks")
    keys = [(ddf_shuffled._name, i) for i in range(ddf_shuffled.npartitions)]

    # Make sure HLG culling reduces the graph size
    dsk = ddf_shuffled.__dask_graph__()
    dsk_culled = dsk.cull(set(keys))
    assert len(dsk_culled) < len(dsk)
    assert isinstance(dsk_culled, dask.highlevelgraph.HighLevelGraph)

    # Ensure we have ShuffleLayers
    assert any(
        isinstance(layer, dd.shuffle.ShuffleLayer) for layer in dsk.layers.values()
    )

    # Check ShuffleLayer names
    for name, layer in dsk.layers.items():
        if isinstance(layer, dd.shuffle.ShuffleLayer):
            assert name.startswith("shuffle-")

    # Since we already culled the HLG,
    # culling the dictionary should not change the graph
    dsk_dict = dict(dsk_culled)
    dsk_dict_culled, _ = cull(dsk_dict, keys)
    assert dsk_dict_culled == dsk_dict
def test_fuse_getitem():
    def load(*args):
        pass

    dsk = {"x": (load, "store", "part", ["a", "b"]), "y": (getitem, "x", "a")}
    dsk2 = fuse_getitem(dsk, load, 3)
    dsk2, dependencies = cull(dsk2, "y")
    assert dsk2 == {"y": (load, "store", "part", "a")}
def test_fuse_getitem():
    def load(*args):
        pass

    dsk = {'x': (load, 'store', 'part', ['a', 'b']),
           'y': (getitem, 'x', 'a')}
    dsk2 = fuse_getitem(dsk, load, 3)
    dsk2, dependencies = cull(dsk2, 'y')
    assert dsk2 == {'y': (load, 'store', 'part', 'a')}
def _generate_dask_graph(data, keys):
    """ Generate a dask graph from a subset of REGISTRY. """
    tasks = cull(REGISTRY, keys)[0]
    for k in unresolved(tasks, keys):
        tasks[k] = data[k]
    return tasks
def test_fuse_selections():
    def load(*args):
        pass

    dsk = {'x': (load, 'store', 'part', ['a', 'b']),
           'y': (getitem, 'x', 'a')}
    merge = lambda t1, t2: (load, t2[1], t2[2], t1[2])
    dsk2 = fuse_selections(dsk, getitem, load, merge)
    dsk2, dependencies = cull(dsk2, 'y')
    assert dsk2 == {'y': (load, 'store', 'part', 'a')}
def test_inline_cull_dependencies():
    d = {'a': 1,
         'b': 'a',
         'c': 'b',
         'd': ['a', 'b', 'c'],
         'e': (add, (len, 'd'), 'a')}

    d2, dependencies = cull(d, ['d', 'e'])
    inline(d2, {'b'}, dependencies=dependencies)
def test_fuse_selections():
    def load(*args):
        pass

    dsk = {"x": (load, "store", "part", ["a", "b"]), "y": (getitem, "x", "a")}
    merge = lambda t1, t2: (load, t2[1], t2[2], t1[2])
    dsk2 = fuse_selections(dsk, getitem, load, merge)
    dsk2, dependencies = cull(dsk2, "y")
    assert dsk2 == {"y": (load, "store", "part", "a")}
def __dask_optimize__(dsk, keys, **kwargs):
    """
    Notes
    -----
    The dask default optimizer induces too many (unnecessary) IO calls -- we
    turn this feature off by default and only apply a culling.
    """
    from dask.optimization import cull

    dsk2, dependencies = cull(dsk, keys)
    return dsk2
def test_inline_cull_dependencies():
    d = {
        "a": 1,
        "b": "a",
        "c": "b",
        "d": ["a", "b", "c"],
        "e": (add, (len, "d"), "a")
    }

    d2, dependencies = cull(d, ["d", "e"])
    inline(d2, {"b"}, dependencies=dependencies)
def get(graph, key, local=True):
    # New function because, from version 19.1 on, dask no longer culls graphs
    # itself (apparently only distributed does it), so this combines a get
    # call with an explicit cull, i.e. the graph is first trimmed down to the
    # part that is actually relevant to *key*.
    import dask
    from dask.optimization import cull

    cgraph = cull(graph, key)[0]
    # with ProgressBar():
    if local:
        return dask.local.get_sync(cgraph, key)
    else:
        return dask.threaded.get(cgraph, key)
def test_cull():
    # 'out' depends on 'x' and 'y', but not 'z'
    d = {'x': 1, 'y': (inc, 'x'), 'z': (inc, 'x'), 'out': (add, 'y', 10)}
    culled, dependencies = cull(d, 'out')
    assert culled == {'x': 1, 'y': (inc, 'x'), 'out': (add, 'y', 10)}
    assert dependencies == {'x': [], 'y': ['x'], 'out': ['y']}

    assert cull(d, 'out') == cull(d, ['out'])
    assert cull(d, ['out', 'z'])[0] == d
    assert cull(d, [['out'], ['z']]) == cull(d, ['out', 'z'])
    pytest.raises(KeyError, lambda: cull(d, 'badkey'))
def test_cull():
    # 'out' depends on 'x' and 'y', but not 'z'
    d = {"x": 1, "y": (inc, "x"), "z": (inc, "x"), "out": (add, "y", 10)}
    culled, dependencies = cull(d, "out")
    assert culled == {"x": 1, "y": (inc, "x"), "out": (add, "y", 10)}
    assert dependencies == {"x": [], "y": ["x"], "out": ["y"]}

    assert cull(d, "out") == cull(d, ["out"])
    assert cull(d, ["out", "z"])[0] == d
    assert cull(d, [["out"], ["z"]]) == cull(d, ["out", "z"])
    pytest.raises(KeyError, lambda: cull(d, "badkey"))
def __dask_optimize__(dsk, keys, **kwargs):
    """
    Optimize the dask object.

    .. note::
        The dask default optimizer induces too many (unnecessary) IO calls.
        We turn this feature off by default, and only apply a culling.
    """
    from dask.optimization import cull

    dsk2, dependencies = cull(dsk, keys)
    return dsk2
def __dask_optimize__(cls, dsk, keys):
    dsk1, _ = optimization.cull(dsk, keys)
    dsk2 = {}
    coll = []
    for key, val in dsk1.items():
        if isinstance(key, tuple) and key[0].startswith('image'):
            name, z, x, y = key
            dfn, url, token, chunk = val
            dsk2[key] = (operator.getitem, "load_urls", (z, x, y))
            coll.append([url, token, (z, x, y)])
        else:
            dsk2[key] = val
    dsk2['load_urls'] = (cls.__fetch__, coll)
    return dsk2
def get(self, key=None):
    """Execute and return the result of the computation *key*.

    Only *key* and its dependencies are computed.

    Parameters
    ----------
    key : str, optional
        If not provided, :attr:`default_key` is used.

    Raises
    ------
    ValueError
        If `key` and :attr:`default_key` are both :obj:`None`.
    """
    if key is None:
        if self.default_key is not None:
            key = self.default_key
        else:
            raise ValueError('no default reporting key set')

    # Cull the graph, leaving only those needed to compute *key*
    dsk, deps = cull(self.graph, key)
    log.debug('Cull {} -> {} keys'.format(len(self.graph), len(dsk)))

    try:
        # Protect 'config' dict, so that dask schedulers do not try to
        # interpret its contents as further tasks. Workaround for
        # https://github.com/dask/dask/issues/3523
        dsk['config'] = dask.core.quote(dsk['config'])
    except KeyError:
        pass

    try:
        return dask_get(dsk, key)
    except Exception as exc:
        # Print the exception in case ComputationError.__str__ fails;
        # workaround for https://github.com/iiasa/ixmp/issues/206
        print(exc)
        raise ComputationError from exc
def cached_array(array, token=None):
    """
    Return a new array that functionally has the same values as array,
    but flattens the underlying graph and introduces a cache lookup
    when the individual array chunks are accessed.

    Useful for caching data that can fit in-memory for the
    duration of the graph's execution.

    Parameters
    ----------
    array : :class:`dask.array.Array`
        dask array to cache.
    token : str, optional
        A unique token for identifying the internal cache.
        If None, it will be automatically generated.
    """
    dsk = dict(array.__dask_graph__())
    keys = set(flatten(array.__dask_keys__()))

    if token is None:
        token = uuid.uuid4().hex

    # Inline + cull everything except the current array
    inline_keys = set(dsk.keys() - keys)
    dsk2 = inline(dsk, inline_keys, inline_constants=True)
    dsk3, _ = cull(dsk2, keys)

    # Create a cache used to store array values
    cache = ArrayCache(token)

    assert len(dsk3) == len(keys)

    for k in keys:
        dsk3[k] = (cache_entry, cache, Key(k), dsk3.pop(k))

    graph = HighLevelGraph.from_collections(array.name, dsk3, [])

    return da.Array(graph, array.name, array.chunks, array.dtype)
def optimize(dsk, keys, **kwargs):
    dsk, _ = cull(dsk, keys)
    return dsk
def cull2(dsk, keys):  # type: ignore
    return cull(dsk if type(dsk) is dict else dict(dsk), keys)
def from_darray(cls, darr, new=tuple.__new__, len=len):
    dsk, _ = optimization.cull(darr.dask, darr.__dask_keys__())
    itr = [dsk, darr.name, darr.chunks, darr.dtype, darr.shape]
    return cls._make(itr)
def warp(self, dem=None, proj="EPSG:4326", **kwargs):
    """Delayed warp across an entire AOI or Image

    Creates a new dask image by deferring calls to the warp_geometry on chunks

    Args:
        dem (ndarray): optional. A DEM for warping to specific elevation planes
        proj (str): optional. An EPSG proj string to project the image data into ("EPSG:32612")

    Returns:
        daskarray: a warped image as deferred image array
    """
    try:
        img_md = self.rda.metadata["image"]
        x_size = img_md["tileXSize"]
        y_size = img_md["tileYSize"]
    except (AttributeError, KeyError):
        x_size = kwargs.get("chunk_size", 256)
        y_size = kwargs.get("chunk_size", 256)

    # Create an affine transform to convert between real-world and pixels
    if self.proj is None:
        from_proj = "EPSG:4326"
    else:
        from_proj = self.proj

    try:
        # NOTE: this only works on images that have rda rpcs metadata
        center = wkt.loads(self.rda.metadata["image"]["imageBoundsWGS84"]).centroid
        g = box(*(center.buffer(self.rda.metadata["rpcs"]["gsd"] / 2).bounds))
        tfm = partial(pyproj.transform, pyproj.Proj(init="EPSG:4326"), pyproj.Proj(init=proj))
        gsd = kwargs.get("gsd", ops.transform(tfm, g).area ** 0.5)
        current_bounds = wkt.loads(self.rda.metadata["image"]["imageBoundsWGS84"]).bounds
    except (AttributeError, KeyError, TypeError):
        tfm = partial(pyproj.transform, pyproj.Proj(init=self.proj), pyproj.Proj(init=proj))
        gsd = kwargs.get("gsd", (ops.transform(tfm, shape(self)).area /
                                 (self.shape[1] * self.shape[2])) ** 0.5)
        current_bounds = self.bounds

    tfm = partial(pyproj.transform, pyproj.Proj(init=from_proj), pyproj.Proj(init=proj))
    itfm = partial(pyproj.transform, pyproj.Proj(init=proj), pyproj.Proj(init=from_proj))
    output_bounds = ops.transform(tfm, box(*current_bounds)).bounds
    gtf = Affine.from_gdal(output_bounds[0], gsd, 0.0, output_bounds[3], 0.0, -1 * gsd)

    ll = ~gtf * (output_bounds[:2])
    ur = ~gtf * (output_bounds[2:])
    x_chunks = int((ur[0] - ll[0]) / x_size) + 1
    y_chunks = int((ll[1] - ur[1]) / y_size) + 1
    num_bands = self.shape[0]

    try:
        dtype = RDA_TO_DTYPE[img_md["dataType"]]
    except:
        dtype = 'uint8'

    daskmeta = {
        "dask": {},
        "chunks": (num_bands, y_size, x_size),
        "dtype": dtype,
        "name": "warp-{}".format(self.name),
        "shape": (num_bands, y_chunks * y_size, x_chunks * x_size)
    }

    def px_to_geom(xmin, ymin):
        xmax = int(xmin + x_size)
        ymax = int(ymin + y_size)
        bounds = list((gtf * (xmin, ymax)) + (gtf * (xmax, ymin)))
        return box(*bounds)

    full_bounds = box(*output_bounds)

    dasks = []
    if isinstance(dem, GeoDaskImage):
        if dem.proj != proj:
            dem = dem.warp(proj=proj, dem=dem)
        dasks.append(dem.dask)

    for y in xrange(y_chunks):
        for x in xrange(x_chunks):
            xmin = x * x_size
            ymin = y * y_size
            geometry = px_to_geom(xmin, ymin)
            daskmeta["dask"][(daskmeta["name"], 0, y, x)] = (self._warp, geometry, gsd, dem, proj, dtype, 5)
    daskmeta["dask"], _ = optimization.cull(HighLevelGraph.merge(daskmeta["dask"], *dasks),
                                            list(daskmeta["dask"].keys()))

    gi = mapping(full_bounds)
    gt = AffineTransform(gtf, proj)
    image = GeoDaskImage(daskmeta, __geo_interface__=gi, __geo_transform__=gt)
    return image[box(*output_bounds)]
def featurize_single_ts(ts, features_to_use, custom_script_path=None,
                        custom_functions=None, raise_exceptions=True):
    """Compute feature values for a given single time-series. Data is
    returned as dictionaries/lists of lists.

    Parameters
    ----------
    ts : TimeSeries object
        Single time series to be featurized.
    features_to_use : list of str
        List of feature names to be generated.
    custom_functions : dict, optional
        Dictionary of custom feature functions to be evaluated for the given
        time series, or a dictionary representing a dask graph of function
        evaluations. Dictionaries of functions should have keys `feature_name`
        and values functions that take arguments (t, m, e); in the case of a
        dask graph, these arrays should be referenced as 't', 'm', 'e',
        respectively, and any values with keys present in `features_to_use`
        will be computed.
    raise_exceptions : bool, optional
        If True, exceptions during feature computation are raised immediately;
        if False, exceptions are suppressed and `np.nan` is returned for the
        given feature and any dependent features. Defaults to True.

    Returns
    -------
    dict
        Dictionary with feature names as keys, lists of feature values (one
        per channel) as values.
    """
    # Initialize empty feature array for all channels
    feature_values = np.empty((len(features_to_use), ts.n_channels))

    for (t_i, m_i, e_i), i in zip(ts.channels(), range(ts.n_channels)):
        feature_graph = generate_dask_graph(t_i, m_i, e_i)
        feature_graph.update(ts.meta_features)

        if custom_functions:
            # If values in custom_functions are functions, add calls to graph
            if all(hasattr(v, '__call__') for v in custom_functions.values()):
                feature_graph.update({feat: f(t_i, m_i, e_i)
                                      for feat, f in custom_functions.items()})
            # Otherwise, custom_functions is another dask graph
            else:
                feature_graph.update(custom_functions)

        # Do not execute in parallel; parallelization has already taken place
        # at the level of time series, so we compute features for a single
        # time series in serial.
        if raise_exceptions:
            raise_callback = reraise
        else:
            raise_callback = lambda e, tb: None
        culled_feature_graph, _ = cull(feature_graph, features_to_use)
        dask_values = dask.get(culled_feature_graph, features_to_use,
                               raise_exception=raise_callback,
                               pack_exception=pack_exception)
        feature_values[:, i] = [x if not isinstance(x, Exception) else np.nan
                                for x in dask_values]
    index = pd.MultiIndex.from_product((features_to_use, range(ts.n_channels)),
                                       names=('feature', 'channel'))
    return pd.Series(feature_values.ravel(), index=index)
def warp(self, dem=None, proj="EPSG:4326", **kwargs):
    """Delayed warp across an entire AOI or Image

    Creates a new dask image by deferring calls to the warp_geometry on chunks

    Args:
        dem (ndarray): optional. A DEM for warping to specific elevation planes
        proj (str): optional. An EPSG proj string to project the image data into ("EPSG:32612")

    Returns:
        daskarray: a warped image as deferred image array
    """
    try:
        img_md = self.rda.metadata["image"]
        x_size = img_md["tileXSize"]
        y_size = img_md["tileYSize"]
    except (AttributeError, KeyError):
        x_size = kwargs.get("chunk_size", 256)
        y_size = kwargs.get("chunk_size", 256)

    # Create an affine transform to convert between real-world and pixels
    if self.proj is None:
        from_proj = "EPSG:4326"
    else:
        from_proj = self.proj

    try:
        # NOTE: this only works on images that have rda rpcs metadata
        center = wkt.loads(
            self.rda.metadata["image"]["imageBoundsWGS84"]).centroid
        g = box(*(center.buffer(self.rda.metadata["rpcs"]["gsd"] / 2).bounds))
        tfm = partial(pyproj.transform, pyproj.Proj(init="EPSG:4326"),
                      pyproj.Proj(init=proj))
        gsd = kwargs.get("gsd", ops.transform(tfm, g).area**0.5)
        current_bounds = wkt.loads(
            self.rda.metadata["image"]["imageBoundsWGS84"]).bounds
    except (AttributeError, KeyError, TypeError):
        tfm = partial(pyproj.transform, pyproj.Proj(init=self.proj),
                      pyproj.Proj(init=proj))
        gsd = kwargs.get("gsd", (ops.transform(tfm, shape(self)).area /
                                 (self.shape[1] * self.shape[2]))**0.5)
        current_bounds = self.bounds

    tfm = partial(pyproj.transform, pyproj.Proj(init=from_proj),
                  pyproj.Proj(init=proj))
    itfm = partial(pyproj.transform, pyproj.Proj(init=proj),
                   pyproj.Proj(init=from_proj))
    output_bounds = ops.transform(tfm, box(*current_bounds)).bounds
    gtf = Affine.from_gdal(output_bounds[0], gsd, 0.0, output_bounds[3], 0.0, -1 * gsd)

    ll = ~gtf * (output_bounds[:2])
    ur = ~gtf * (output_bounds[2:])
    x_chunks = int((ur[0] - ll[0]) / x_size) + 1
    y_chunks = int((ll[1] - ur[1]) / y_size) + 1
    num_bands = self.shape[0]

    try:
        dtype = RDA_TO_DTYPE[img_md["dataType"]]
    except:
        dtype = 'uint8'

    daskmeta = {
        "dask": {},
        "chunks": (num_bands, y_size, x_size),
        "dtype": dtype,
        "name": "warp-{}".format(self.name),
        "shape": (num_bands, y_chunks * y_size, x_chunks * x_size)
    }

    def px_to_geom(xmin, ymin):
        xmax = int(xmin + x_size)
        ymax = int(ymin + y_size)
        bounds = list((gtf * (xmin, ymax)) + (gtf * (xmax, ymin)))
        return box(*bounds)

    full_bounds = box(*output_bounds)

    dasks = []
    if isinstance(dem, GeoDaskImage):
        if dem.proj != proj:
            dem = dem.warp(proj=proj, dem=dem)
        dasks.append(dem.dask)

    for y in xrange(y_chunks):
        for x in xrange(x_chunks):
            xmin = x * x_size
            ymin = y * y_size
            geometry = px_to_geom(xmin, ymin)
            daskmeta["dask"][(daskmeta["name"], 0, y, x)] = (self._warp, geometry, gsd, dem, proj, dtype, 5)
    daskmeta["dask"], _ = optimization.cull(
        sharedict.merge(daskmeta["dask"], *dasks),
        list(daskmeta["dask"].keys()))

    gi = mapping(full_bounds)
    gt = AffineTransform(gtf, proj)
    image = GeoDaskImage(daskmeta, __geo_interface__=gi, __geo_transform__=gt)
    return image[box(*output_bounds)]