def paths_to_tiled_image(paths, context=None, tile_size=(256, 256), padding=(0, 0), backend=backends[-1], skip_chunk=False, **kwargs):
    """
    Build a tiled ND image from a collection of image paths.

    :param paths: List[str] / RDD[str] a list or RDD of strings containing
        image paths
    :param context: SparkContext used to build the RDD when ``paths`` is a
        plain list (ignored when ``paths`` is already an RDD)
    :param tile_size: the size of tiles to cut
    :param padding: the padding to use
    :param backend: the LazyImageBackend used to read image data (defaults to
        the last registered backend)
    :param skip_chunk: developer-only flag that delays the actual subchunking
        step and returns the unchunked array instead
    :param kwargs: extra arguments forwarded to ``context.parallelize``
    :return: a ChunkedArray containing the image data as tiles
        (use .unchunk to make into a normal RDD)
    """
    if isinstance(paths, RDD):
        path_rdd = paths
    else:
        path_rdd = context.parallelize(paths, **kwargs)

    def _as_lazy_image(file_path):
        # Wrap a path in a lazily-read image backed by `backend`.
        return DiskMappedLazyImage(file_path, backend)

    indexed_rdd = path_rdd.zipWithIndex().map(
        lambda pair: (pair[1], _as_lazy_image(pair[0])))  # type: RDD[(int, DiskMappedLazyImage)]

    # Read one image eagerly to learn the per-image shape and dtype.
    sample_image = _as_lazy_image(path_rdd.first())
    chunked = ChunkedArray(indexed_rdd,
                           shape=(path_rdd.count(),) + sample_image.shape,
                           split=1,
                           dtype=sample_image[0, 0].dtype)
    if skip_chunk:
        return chunked
    return chunked._chunk(size=tile_size, axis=None, padding=padding)
def chunk(self, size="150", axis=None):
    """
    Chunk the records of a distributed array.

    Chunking breaks arrays into subarrays, using a specified number of
    chunks along each value dimension. Alternatively an average chunk
    size (in megabytes) can be given, and the number of chunks will be
    computed automatically.

    Parameters
    ----------
    size : tuple, int, or str, optional, default = "150"
        A string giving the size in megabytes, or a tuple with the
        number of chunks along each dimension.

    axis : int or tuple, optional, default = None
        One or more axes to chunk the array along; if None all axes
        are used.

    Returns
    -------
    ChunkedArray
    """
    # A string size is passed through untouched; anything else is
    # normalized into a tuple.
    if type(size) is not str:
        size = tupleize(size)
    axis = tupleize(axis)

    from bolt.spark.chunk import ChunkedArray
    staged = ChunkedArray(rdd=self._rdd,
                          shape=self._shape,
                          split=self._split,
                          dtype=self._dtype)
    return staged.chunk(size, axis)
def chunk(self, size="150", axis=None):
    """
    Chunk the records of a distributed array.

    Chunking breaks arrays into subarrays, using a specified size of
    chunks along each value dimension. Alternatively an average chunk
    byte size (in megabytes) can be given, and the size of chunks (as
    ints) will be computed automatically.

    Parameters
    ----------
    size : tuple, int, or str, optional, default = "150"
        A string giving the size in megabytes, or a tuple with the size
        of chunks along each dimension.

    axis : int or tuple, optional, default = None
        One or more axes to chunk the array along; if None all axes are
        used.

    Returns
    -------
    ChunkedArray
    """
    # Only non-string sizes need normalization into a tuple; a string
    # size (megabytes) is interpreted downstream.
    if type(size) is not str:
        size = tupleize(size)
    axis = tupleize(axis)

    from bolt.spark.chunk import ChunkedArray
    intermediate = ChunkedArray(rdd=self._rdd,
                                shape=self._shape,
                                split=self._split,
                                dtype=self._dtype)
    return intermediate.chunk(size, axis)
def swap(self, kaxes, vaxes, size="150"):
    """
    Swap axes from keys to values.

    This is the core operation underlying shape manipulation on the
    Spark bolt array. It exchanges an arbitrary set of axes between the
    keys and the values. If either is None, will only move axes in one
    direction (from keys to values, or values to keys). Keys moved to
    values will be placed immediately after the split; values moved to
    keys will be placed immediately before the split.

    Parameters
    ----------
    kaxes : tuple
        Axes from keys to move to values

    vaxes : tuple
        Axes from values to move to keys

    size : tuple or int, optional, default = "150"
        Can either provide a string giving the size in megabytes,
        or a tuple with the number of chunks along each value dimension
        being moved

    Returns
    -------
    BoltArraySpark
    """
    kaxes = asarray(tupleize(kaxes), 'int')
    vaxes = asarray(tupleize(vaxes), 'int')
    if type(size) is not str:
        size = tupleize(size)

    # Moving every key axis out while bringing nothing in would leave a
    # single degenerate key, which is not representable.
    if len(kaxes) == self.keys.ndim and len(vaxes) == 0:
        raise ValueError('Cannot perform a swap that would '
                         'end up with all data on a single key')

    # Nothing to move in either direction.
    if len(kaxes) == 0 and len(vaxes) == 0:
        return self

    # Scalar values cannot be chunked, so temporarily promote them to 1-d.
    promoted = self.values.ndim == 0
    if promoted:
        rdd = self._rdd.mapValues(lambda v: array(v, ndmin=1))
        shape = self._shape + (1,)
    else:
        rdd = self._rdd
        shape = self._shape

    from bolt.spark.chunk import ChunkedArray
    chunked = ChunkedArray(rdd, shape=shape, split=self._split, dtype=self._dtype)
    result = chunked.chunk(size, axis=vaxes).move(kaxes, vaxes)

    if promoted:
        # Undo the promotion: drop the synthetic trailing dimension.
        result._rdd = result._rdd.mapValues(lambda v: v.squeeze())
        result._shape = result._shape[:-1]

    return result
def swap(self, kaxes, vaxes, size="150"):
    """
    Swap axes from keys to values.

    This is the core operation underlying shape manipulation on the
    Spark bolt array. It exchanges an arbitrary set of axes between the
    keys and the values. If either is None, will only move axes in one
    direction (from keys to values, or values to keys). Keys moved to
    values will be placed immediately after the split; values moved to
    keys will be placed immediately before the split.

    Parameters
    ----------
    kaxes : tuple
        Axes from keys to move to values

    vaxes : tuple
        Axes from values to move to keys

    size : tuple or int, optional, default = "150"
        Can either provide a string giving the size in kilobytes,
        or a tuple with the number of chunks along each value dimension
        being moved

    Returns
    -------
    BoltArraySpark
    """
    kaxes = asarray(tupleize(kaxes), 'int')
    vaxes = asarray(tupleize(vaxes), 'int')
    if type(size) is not str:
        size = tupleize(size)

    # Moving every key axis out while bringing nothing in would leave a
    # single degenerate key, which is not representable.
    if len(kaxes) == self.keys.ndim and len(vaxes) == 0:
        raise ValueError('Cannot perform a swap that would '
                         'end up with all data on a single key')
    # Nothing to move in either direction.
    if len(kaxes) == 0 and len(vaxes) == 0:
        return self

    from bolt.spark.chunk import ChunkedArray
    chunked = ChunkedArray(self._rdd,
                           shape=self._shape,
                           split=self._split,
                           dtype=self._dtype)._chunk(size, axis=vaxes)
    # After kaxes land in the values, the value axes to promote shift
    # right by len(kaxes).
    shifted_vaxes = [v + len(kaxes) for v in vaxes]
    swapped = chunked.keys_to_values(kaxes).values_to_keys(shifted_vaxes)
    return swapped.unchunk()
def single_chunky_image(in_ds, context, tile_size=(256, 256), padding=(0, 0)):
    """
    Create a tiled representation of a single ND image.

    :param in_ds: an array-like image (indexable, exposing ``shape`` and an
        element ``dtype``) to cut into tiles
    :param context: SparkContext used to build the one-element RDD
    :param tile_size: the size of tiles to cut
    :param padding: the padding to use
    :return: a chunked array containing the image data as tiles
    """
    in_rdd = context.parallelize([((0, ), in_ds)])
    return ChunkedArray(
        in_rdd,  # type - RDD[(int, np.ndarray)]
        # BUGFIX: was `in_ds.size`, which for an ndarray is a scalar element
        # count and cannot be concatenated to a tuple; `shape` matches the
        # equivalent construction in paths_to_tiled_image.
        shape=(in_rdd.count(), ) + in_ds.shape,
        split=1,
        dtype=in_ds[0, 0].dtype)._chunk(size=tile_size, axis=None, padding=padding)
def chunk(self, size="150", axis=None, padding=None):
    """
    Chunk the records of a distributed array.

    Chunking breaks arrays into subarrays, using a specified size of
    chunks along each value dimension. Alternatively an average chunk
    byte size (in kilobytes) can be given, and the size of chunks (as
    ints) will be computed automatically.

    Parameters
    ----------
    size : tuple, int, or str, optional, default = "150"
        A string giving the size in kilobytes, or a tuple with the size
        of chunks along each dimension.

    axis : int or tuple, optional, default = None
        One or more axes to chunk the array along; if None all axes are
        used.

    padding : tuple or int, default = None
        Number of elements per dimension that will overlap with the
        adjacent chunk. If a tuple, specifies padding along each chunked
        dimension; if an int, the same padding is applied to all chunked
        dimensions.

    Returns
    -------
    ChunkedArray
    """
    # A string size (kilobytes) is interpreted downstream; anything else
    # is normalized into a tuple here.
    if type(size) is not str:
        size = tupleize(size)
    axis = tupleize(axis)
    padding = tupleize(padding)

    from bolt.spark.chunk import ChunkedArray
    staged = ChunkedArray(rdd=self._rdd,
                          shape=self._shape,
                          split=self._split,
                          dtype=self._dtype)
    return staged._chunk(size, axis, padding)