def _dask_unique(x, return_index=True):
    from dask.array.core import Array
    from dask import sharedict
    from numpy import cumsum, concatenate, unique
    from numpy.testing import assert_

    # Only the return_index=True variant is supported.
    assert_(return_index)
    name = "unique-" + x.name

    def _unique(x):
        return unique(x, return_index=return_index)

    # Run numpy's unique on every chunk, then combine the partial results.
    dsk = dict(((name, i), (_unique, key)) for i, key in enumerate(x._keys()))
    parts = Array._get(sharedict.merge((name, dsk), x.dask), list(dsk.keys()))
    arrs = [a[0] for a in parts]
    chunks = x.chunks[0]
    # Offset each chunk's local first-occurrence indices so they refer to
    # positions in the full array.
    offset = cumsum((0, ) + chunks)[:-1]
    idxs = [parts[i][1] + offset[i] for i in range(len(parts))]
    arr = concatenate(arrs)
    idx = concatenate(idxs)
    u, i = unique(arr, return_index=True)
    return u, idx[i]
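# A minimal usage sketch for _dask_unique above (hypothetical example, not
# from the original source; assumes an older dask that still provides
# dask.sharedict and Array._get, and a 1-D dask array input):
#
# >>> import numpy as np
# >>> import dask.array as da
# >>> x = da.from_array(np.array([3, 1, 3, 2, 1]), chunks=2)
# >>> values, first_idx = _dask_unique(x)
# >>> values
# array([1, 2, 3])
# >>> first_idx  # global position of each value's first occurrence
# array([1, 3, 0])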
def elemwise(op, *args, **kwargs):
    # See also da.core.elemwise. Note: dask seems to be able to convert Python
    # and numpy objects in this function, thus supporting operations between
    # dask objects and others. This would be useful for us as well.

    # Do not support mismatching chunking for now.
    n_chunk = None
    slice_dim = None
    for arg in args:
        if isinstance(arg, DatasetCollection):
            if n_chunk is not None:
                assert n_chunk == arg.n_chunk
                assert slice_dim == arg.slice_dim
            else:
                n_chunk = arg.n_chunk
                slice_dim = arg.slice_dim

    out = '{}-{}'.format(funcname(op), tokenize(op, *args))
    out_ind = (0, )  # Handling only 1D chunking here, so everything is (0,).
    arginds = list((a, (0, ) if isinstance(a, DatasetCollection) else None)
                   for a in args)
    numblocks = {a.name: a.numblocks for a, ind in arginds if ind is not None}
    argindsstr = list(
        concat([(a if ind is None else a.name, ind) for a, ind in arginds]))
    dsk = top(op, out, out_ind, *argindsstr, numblocks=numblocks, **kwargs)
    dsks = [a.dask for a, ind in arginds if ind is not None]
    return DatasetCollection(sharedict.merge((out, dsk), *dsks), out, n_chunk,
                             slice_dim)
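# A hedged usage sketch for elemwise above (not from the original source).
# `DatasetCollection`, `collection_a`, and `collection_b` come from the
# surrounding project; `operator.add` is just an illustrative operand:
#
# >>> import operator
# >>> total = elemwise(operator.add, collection_a, collection_b)
# >>> # Inputs must agree on n_chunk and slice_dim; the result is a new
# >>> # DatasetCollection whose graph applies the op block-by-block.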
def _wrap(self, func, *args, **kwargs):
    """Wrap numpy random function to produce dask.array random function

    extra_chunks should be a chunks tuple to append to the end of chunks
    """
    size = kwargs.pop('size', None)
    chunks = kwargs.pop('chunks')
    extra_chunks = kwargs.pop('extra_chunks', ())

    if size is not None and not isinstance(size, (tuple, list)):
        size = (size, )

    args_shapes = {
        ar.shape
        for ar in args if isinstance(ar, (Array, np.ndarray))
    }
    # set.union returns a new set, so assign the result back; calling it
    # without assignment would silently drop the kwarg shapes.
    args_shapes = args_shapes.union({
        ar.shape
        for ar in kwargs.values() if isinstance(ar, (Array, np.ndarray))
    })

    shapes = list(args_shapes)
    if size is not None:
        shapes += [size]
    # broadcast to the final size(shape)
    size = broadcast_shapes(*shapes)
    chunks = normalize_chunks(chunks, size)
    slices = slices_from_chunks(chunks)

    def _broadcast_any(ar, shape, chunks):
        if isinstance(ar, Array):
            return broadcast_to(ar, shape).rechunk(chunks)
        if isinstance(ar, np.ndarray):
            return np.ascontiguousarray(np.broadcast_to(ar, shape))

    # Broadcast all arguments, get tiny versions as well.
    # Start adding the relevant bits to the graph.
    dsk = {}
    dsks = []
    lookup = {}
    small_args = []
    for i, ar in enumerate(args):
        if isinstance(ar, (np.ndarray, Array)):
            res = _broadcast_any(ar, size, chunks)
            if isinstance(res, Array):
                dsks.append(res.dask)
                lookup[i] = res.name
            elif isinstance(res, np.ndarray):
                name = 'array-{}'.format(tokenize(res))
                lookup[i] = name
                dsk[name] = res
            small_args.append(ar[tuple(0 for _ in ar.shape)])
        else:
            small_args.append(ar)

    small_kwargs = {}
    for key, ar in kwargs.items():
        if isinstance(ar, (np.ndarray, Array)):
            res = _broadcast_any(ar, size, chunks)
            if isinstance(res, Array):
                dsks.append(res.dask)
                lookup[key] = res.name
            elif isinstance(res, np.ndarray):
                name = 'array-{}'.format(tokenize(res))
                lookup[key] = name
                dsk[name] = res
            small_kwargs[key] = ar[tuple(0 for _ in ar.shape)]
        else:
            small_kwargs[key] = ar

    # Draw a zero-length sample to learn the output dtype.
    small_kwargs['size'] = (0, )
    dtype = func(xoroshiro128plus.RandomState(), *small_args,
                 **small_kwargs).dtype

    sizes = list(product(*chunks))
    state_data = random_state_data(len(sizes), self._numpy_state)
    token = tokenize(state_data, size, chunks, args, kwargs)
    name = 'da.random.{0}-{1}'.format(func.__name__, token)

    keys = product([name], *([range(len(bd)) for bd in chunks] +
                             [[0]] * len(extra_chunks)))
    blocks = product(*[range(len(bd)) for bd in chunks])
    vals = []
    for state, size, slc, block in zip(state_data, sizes, slices, blocks):
        arg = []
        for i, ar in enumerate(args):
            if i not in lookup:
                arg.append(ar)
            elif isinstance(ar, Array):
                arg.append((lookup[i], ) + block)
            else:  # np.ndarray
                arg.append((getitem, lookup[i], slc))
        kwrg = {}
        for k, ar in kwargs.items():
            if k not in lookup:
                kwrg[k] = ar
            elif isinstance(ar, Array):
                kwrg[k] = (lookup[k], ) + block
            else:  # np.ndarray
                kwrg[k] = (getitem, lookup[k], slc)
        vals.append((_apply_random, func.__name__, state, size, arg, kwrg))

    dsk.update(dict(zip(keys, vals)))
    dsk = sharedict.merge((name, dsk), *dsks)
    return Array(dsk, name, chunks + extra_chunks, dtype=dtype)
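# A minimal sketch of how _wrap is typically called (assuming this method
# lives on a RandomState-style class, as in dask.array.random; `normal` is a
# hypothetical public wrapper, not part of the original source):
#
# >>> def normal(self, loc=0.0, scale=1.0, size=None, chunks=None):
# ...     # Pass the unbound method so _wrap can call it on a fresh
# ...     # xoroshiro128plus.RandomState per chunk.
# ...     return self._wrap(xoroshiro128plus.RandomState.normal, loc, scale,
# ...                       size=size, chunks=chunks)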
def choice(self, a, size=None, replace=True, p=None, chunks=None):
    dsks = []
    # Normalize and validate `a`
    if isinstance(a, Integral):
        # On windows the output dtype differs if p is provided or
        # absent, see https://github.com/numpy/numpy/issues/9867
        dummy_p = np.array([1]) if p is not None else p
        dtype = np.random.choice(1, size=(), p=dummy_p).dtype
        len_a = a
        if a < 0:
            raise ValueError("a must be greater than 0")
    else:
        a = asarray(a).rechunk(a.shape)
        dtype = a.dtype
        if a.ndim != 1:
            raise ValueError("a must be one dimensional")
        len_a = len(a)
        dsks.append(a.dask)
        a = a.__dask_keys__()[0]

    # Normalize and validate `p`
    if p is not None:
        if not isinstance(p, Array):
            # If p is not a dask array, first check the sum is close
            # to 1 before converting.
            p = np.asarray(p)
            if not np.isclose(p.sum(), 1, rtol=1e-7, atol=0):
                raise ValueError("probabilities do not sum to 1")
            p = asarray(p)
        else:
            p = p.rechunk(p.shape)

        if p.ndim != 1:
            raise ValueError("p must be one dimensional")
        if len(p) != len_a:
            raise ValueError("a and p must have the same size")

        dsks.append(p.dask)
        p = p.__dask_keys__()[0]

    if size is None:
        size = ()
    elif not isinstance(size, (tuple, list)):
        size = (size, )

    chunks = normalize_chunks(chunks, size)
    sizes = list(product(*chunks))
    state_data = random_state_data(len(sizes), self._numpy_state)

    name = 'da.random.choice-%s' % tokenize(state_data, size, chunks, a,
                                            replace, p)
    keys = product([name], *(range(len(bd)) for bd in chunks))
    dsk = {
        k: (_choice, state, a, size, replace, p)
        for k, state, size in zip(keys, state_data, sizes)
    }

    return Array(sharedict.merge((name, dsk), *dsks), name, chunks,
                 dtype=dtype)
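# A hedged usage sketch (mirrors dask.array.random's choice; `RandomState`
# here is assumed to be the class defining this method, seeded for
# illustration; not from the original source):
#
# >>> rs = RandomState(5)
# >>> draws = rs.choice(10, size=10000, chunks=1000)
# >>> len(draws.chunks[0])  # ten blocks of 1000 samples each
# 10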
def warp(self, dem=None, proj="EPSG:4326", **kwargs):
    """Delayed warp across an entire AOI or Image

    Creates a new dask image by deferring calls to the warp_geometry on chunks

    Args:
        dem (ndarray): optional. A DEM for warping to specific elevation planes
        proj (str): optional. An EPSG proj string to project the image data into ("EPSG:32612")

    Returns:
        daskarray: a warped image as deferred image array
    """
    try:
        img_md = self.rda.metadata["image"]
        x_size = img_md["tileXSize"]
        y_size = img_md["tileYSize"]
    except (AttributeError, KeyError):
        x_size = kwargs.get("chunk_size", 256)
        y_size = kwargs.get("chunk_size", 256)

    # Create an affine transform to convert between real-world and pixels
    if self.proj is None:
        from_proj = "EPSG:4326"
    else:
        from_proj = self.proj

    try:
        # NOTE: this only works on images that have rda rpcs metadata
        center = wkt.loads(
            self.rda.metadata["image"]["imageBoundsWGS84"]).centroid
        g = box(*(center.buffer(self.rda.metadata["rpcs"]["gsd"] / 2).bounds))
        tfm = partial(pyproj.transform, pyproj.Proj(init="EPSG:4326"),
                      pyproj.Proj(init=proj))
        gsd = kwargs.get("gsd", ops.transform(tfm, g).area**0.5)
        current_bounds = wkt.loads(
            self.rda.metadata["image"]["imageBoundsWGS84"]).bounds
    except (AttributeError, KeyError, TypeError):
        tfm = partial(pyproj.transform, pyproj.Proj(init=self.proj),
                      pyproj.Proj(init=proj))
        gsd = kwargs.get("gsd", (ops.transform(tfm, shape(self)).area /
                                 (self.shape[1] * self.shape[2]))**0.5)
        current_bounds = self.bounds

    tfm = partial(pyproj.transform, pyproj.Proj(init=from_proj),
                  pyproj.Proj(init=proj))
    itfm = partial(pyproj.transform, pyproj.Proj(init=proj),
                   pyproj.Proj(init=from_proj))
    output_bounds = ops.transform(tfm, box(*current_bounds)).bounds
    gtf = Affine.from_gdal(output_bounds[0], gsd, 0.0, output_bounds[3], 0.0,
                           -1 * gsd)

    ll = ~gtf * (output_bounds[:2])
    ur = ~gtf * (output_bounds[2:])
    x_chunks = int((ur[0] - ll[0]) / x_size) + 1
    y_chunks = int((ll[1] - ur[1]) / y_size) + 1
    num_bands = self.shape[0]

    try:
        dtype = RDA_TO_DTYPE[img_md["dataType"]]
    except (KeyError, NameError):
        # img_md may be unbound (first try above failed) or lack "dataType".
        dtype = 'uint8'

    daskmeta = {
        "dask": {},
        "chunks": (num_bands, y_size, x_size),
        "dtype": dtype,
        "name": "warp-{}".format(self.name),
        "shape": (num_bands, y_chunks * y_size, x_chunks * x_size)
    }

    def px_to_geom(xmin, ymin):
        xmax = int(xmin + x_size)
        ymax = int(ymin + y_size)
        bounds = list((gtf * (xmin, ymax)) + (gtf * (xmax, ymin)))
        return box(*bounds)

    full_bounds = box(*output_bounds)

    dasks = []
    if isinstance(dem, GeoDaskImage):
        if dem.proj != proj:
            dem = dem.warp(proj=proj, dem=dem)
        dasks.append(dem.dask)

    for y in xrange(y_chunks):
        for x in xrange(x_chunks):
            xmin = x * x_size
            ymin = y * y_size
            geometry = px_to_geom(xmin, ymin)
            daskmeta["dask"][(daskmeta["name"], 0, y, x)] = (
                self._warp, geometry, gsd, dem, proj, dtype, 5)
    daskmeta["dask"], _ = optimization.cull(
        sharedict.merge(daskmeta["dask"], *dasks),
        list(daskmeta["dask"].keys()))

    gi = mapping(full_bounds)
    gt = AffineTransform(gtf, proj)
    image = GeoDaskImage(daskmeta, __geo_interface__=gi, __geo_transform__=gt)
    return image[box(*output_bounds)]
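# A minimal usage sketch (hypothetical, not from the original source; `img`
# is assumed to be a GeoDaskImage-like object exposing this method, e.g. a
# gbdxtools CatalogImage):
#
# >>> warped = img.warp(proj="EPSG:32612", gsd=15.0)
# >>> warped.shape  # (bands, y_chunks * tile_y, x_chunks * tile_x)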
def fit(model, x, y, compute=True, shuffle_blocks=True, random_state=None,
        **kwargs):
    """Fit scikit learn model against dask arrays

    Model must support the ``partial_fit`` interface for online or batch
    learning.

    Ideally your rows are independent and identically distributed. By
    default, this function will step through chunks of the arrays in random
    order.

    Parameters
    ----------
    model: sklearn model
        Any model supporting partial_fit interface
    x: dask Array
        Two dimensional array, likely tall and skinny
    y: dask Array
        One dimensional array with same chunks as x's rows
    compute : bool
        Whether to compute this result
    shuffle_blocks : bool
        Whether to shuffle the blocks with ``random_state`` or not
    random_state : int or numpy.random.RandomState
        Random state to use when shuffling blocks
    kwargs:
        options to pass to partial_fit

    Examples
    --------
    >>> import dask.array as da
    >>> X = da.random.random((10, 3), chunks=(5, 3))
    >>> y = da.random.randint(0, 2, 10, chunks=(5,))

    >>> from sklearn.linear_model import SGDClassifier
    >>> sgd = SGDClassifier()
    >>> sgd = da.learn.fit(sgd, X, y, classes=[1, 0])
    >>> sgd  # doctest: +SKIP
    SGDClassifier(alpha=0.0001, class_weight=None, epsilon=0.1, eta0=0.0,
            fit_intercept=True, l1_ratio=0.15, learning_rate='optimal',
            loss='hinge', n_iter=5, n_jobs=1, penalty='l2', power_t=0.5,
            random_state=None, shuffle=False, verbose=0, warm_start=False)

    This passes all of X and y through the classifier sequentially. We can
    use the classifier as normal on in-memory data

    >>> import numpy as np
    >>> sgd.predict(np.random.random((4, 3)))  # doctest: +SKIP
    array([1, 0, 0, 1])

    Or predict on a larger dataset

    >>> z = da.random.random((400, 3), chunks=(100, 3))
    >>> da.learn.predict(sgd, z)  # doctest: +SKIP
    dask.array<x_11, shape=(400,), chunks=((100, 100, 100, 100),), dtype=int64>
    """
    if not hasattr(x, "chunks") and hasattr(x, "to_dask_array"):
        x = x.to_dask_array()
    assert x.ndim == 2
    if y is not None:
        if not hasattr(y, "chunks") and hasattr(y, "to_dask_array"):
            y = y.to_dask_array()
        assert y.ndim == 1
        assert x.chunks[0] == y.chunks[0]
    assert hasattr(model, "partial_fit")

    if len(x.chunks[1]) > 1:
        x = x.rechunk(chunks=(x.chunks[0], sum(x.chunks[1])))

    nblocks = len(x.chunks[0])
    order = list(range(nblocks))
    if shuffle_blocks:
        rng = sklearn.utils.check_random_state(random_state)
        rng.shuffle(order)

    name = "fit-" + dask.base.tokenize(model, x, y, kwargs, order)

    dsk = {(name, -1): model}
    dsk.update({(name, i): (
        _partial_fit,
        (name, i - 1),
        (x.name, order[i], 0),
        (getattr(y, "name", ""), order[i]),
        kwargs,
    ) for i in range(nblocks)})

    graphs = {x.name: x.__dask_graph__(), name: dsk}
    if hasattr(y, "__dask_graph__"):
        graphs[y.name] = y.__dask_graph__()

    try:
        from dask.highlevelgraph import HighLevelGraph
        new_dsk = HighLevelGraph.merge(*graphs.values())
    except ImportError:
        from dask import sharedict
        new_dsk = sharedict.merge(*graphs.values())

    value = Delayed((name, nblocks - 1), new_dsk)

    if compute:
        return value.compute()
    else:
        return value
def warp(self, dem=None, proj="EPSG:4326", **kwargs):
    """Delayed warp across an entire AOI or Image

    Creates a new dask image by deferring calls to the warp_geometry on chunks
    """
    try:
        img_md = self.ipe.metadata["image"]
        x_size = img_md["tileXSize"]
        y_size = img_md["tileYSize"]
    except (AttributeError, KeyError):
        x_size = kwargs.get("chunk_size", 256)
        y_size = kwargs.get("chunk_size", 256)

    # Create an affine transform to convert between real-world and pixels
    if self.proj is None:
        from_proj = "EPSG:4326"
    else:
        from_proj = self.proj

    try:
        # NOTE: this only works on images that have IPE rpcs metadata
        center = wkt.loads(
            self.ipe.metadata["image"]["imageBoundsWGS84"]).centroid
        g = box(*(center.buffer(self.ipe.metadata["rpcs"]["gsd"] / 2).bounds))
        # print "Input GSD (deg):", self.ipe.metadata["rpcs"]["gsd"]
        tfm = partial(pyproj.transform, pyproj.Proj(init="EPSG:4326"),
                      pyproj.Proj(init=proj))
        gsd = kwargs.get("gsd", ops.transform(tfm, g).area**0.5)
        current_bounds = wkt.loads(
            self.ipe.metadata["image"]["imageBoundsWGS84"]).bounds
    except (AttributeError, KeyError, TypeError):
        tfm = partial(pyproj.transform, pyproj.Proj(init=self.proj),
                      pyproj.Proj(init=proj))
        gsd = kwargs.get("gsd", (ops.transform(tfm, shape(self)).area /
                                 (self.shape[1] * self.shape[2]))**0.5)
        current_bounds = self.bounds

    tfm = partial(pyproj.transform, pyproj.Proj(init=from_proj),
                  pyproj.Proj(init=proj))
    itfm = partial(pyproj.transform, pyproj.Proj(init=proj),
                   pyproj.Proj(init=from_proj))
    output_bounds = ops.transform(tfm, box(*current_bounds)).bounds
    gtf = Affine.from_gdal(output_bounds[0], gsd, 0.0, output_bounds[3], 0.0,
                           -1 * gsd)

    ll = ~gtf * (output_bounds[:2])
    ur = ~gtf * (output_bounds[2:])
    x_chunks = int((ur[0] - ll[0]) / x_size) + 1
    y_chunks = int((ll[1] - ur[1]) / y_size) + 1

    daskmeta = {
        "dask": {},
        "chunks": (img_md["numBands"], y_size, x_size),
        "dtype": IPE_TO_DTYPE[img_md["dataType"]],
        "name": "warp-{}".format(self.ipe_id),
        "shape": (img_md["numBands"], y_chunks * y_size, x_chunks * x_size)
    }

    def px_to_geom(xmin, ymin):
        xmax = int(xmin + x_size)
        ymax = int(ymin + y_size)
        bounds = list((gtf * (xmin, ymax)) + (gtf * (xmax, ymin)))
        return box(*bounds)

    full_bounds = box(*output_bounds)

    dasks = []
    if isinstance(dem, GeoImage):
        if dem.proj != proj:
            dem = dem.warp(proj=proj)
        dasks.append(dem.dask)

    for y in xrange(y_chunks):
        for x in xrange(x_chunks):
            xmin = x * x_size
            ymin = y * y_size
            geometry = px_to_geom(xmin, ymin)
            full_bounds = box(*full_bounds.union(geometry).bounds)
            daskmeta["dask"][(daskmeta["name"], 0, y, x)] = (
                self._warp, geometry, gsd, dem, proj, 5)
    daskmeta["dask"], _ = optimize.cull(
        sharedict.merge(daskmeta["dask"], *dasks),
        list(daskmeta["dask"].keys()))

    result = GeoDaskWrapper(daskmeta, self)
    result.__geo_interface__ = mapping(full_bounds)
    result.__geo_transform__ = AffineTransform(gtf, proj)
    return GeoImage.__getitem__(result, box(*output_bounds))