def chunk(self, size="150", axis=None):
    """
    Chunk the records of a distributed array.

    Chunking breaks arrays into subarrays. The chunking is described
    either by a per-dimension specification (number of chunks along each
    value dimension) or by an average chunk size in megabytes, in which
    case the number of chunks is computed automatically.

    Parameters
    ----------
    size : tuple, int, or str, optional, default = "150"
        A string giving the size in megabytes, or a tuple with the
        number of chunks along each dimension. The value is forwarded
        to ``ChunkedArray.chunk``.
        NOTE(review): confirm against ``ChunkedArray.chunk`` whether a
        tuple is a chunk count or a per-chunk length.

    axis : int or tuple, optional, default = None
        One or more axes to chunk the array along; if None, all axes
        are used.

    Returns
    -------
    ChunkedArray
    """
    # A plain ``str`` is forwarded untouched; anything else goes through
    # ``tupleize`` first.
    size = size if type(size) is str else tupleize(size)
    axis = tupleize(axis)

    # Imported inside the method, exactly where the result is needed.
    from bolt.spark.chunk import ChunkedArray

    wrapped = ChunkedArray(
        rdd=self._rdd,
        shape=self._shape,
        split=self._split,
        dtype=self._dtype,
    )
    return wrapped.chunk(size, axis)
def chunk(self, size="150", axis=None):
    """
    Chunk the records of a distributed array.

    Chunking breaks arrays into subarrays, using a specified size of
    chunks along each value dimension. Alternatively, an average chunk
    byte size (in megabytes) can be given, and the size of chunks
    (as ints) will be computed automatically.

    Parameters
    ----------
    size : tuple, int, or str, optional, default = "150"
        A string giving the size in megabytes, or a tuple with the size
        of chunks along each dimension. The value is forwarded to
        ``ChunkedArray.chunk``.

    axis : int or tuple, optional, default = None
        One or more axes to chunk the array along; if None, all axes
        are used.

    Returns
    -------
    ChunkedArray
    """
    # Only a plain ``str`` (the megabyte form) skips ``tupleize``.
    given_as_megabytes = type(size) is str
    if not given_as_megabytes:
        size = tupleize(size)
    axis = tupleize(axis)

    # Imported inside the method, right before it is used.
    from bolt.spark.chunk import ChunkedArray

    chunkable = ChunkedArray(
        rdd=self._rdd,
        shape=self._shape,
        split=self._split,
        dtype=self._dtype,
    )
    return chunkable.chunk(size, axis)
def swap(self, kaxes, vaxes, size="150"):
    """
    Swap axes from keys to values.

    This is the core operation underlying shape manipulation
    on the Spark bolt array. It exchanges an arbitrary set of axes
    between the keys and the values. If either is None, will only
    move axes in one direction (from keys to values, or values to keys).
    Keys moved to values will be placed immediately after the split;
    values moved to keys will be placed immediately before the split.

    Parameters
    ----------
    kaxes : tuple or None
        Axes from keys to move to values. None means no key axes
        are moved.

    vaxes : tuple or None
        Axes from values to move to keys. None means no value axes
        are moved.

    size : tuple, int, or str, optional, default = "150"
        Either a string giving the size in megabytes, or a tuple
        describing the chunking along each value dimension being
        moved. The value is forwarded to ``ChunkedArray.chunk``.

    Returns
    -------
    BoltArraySpark
        ``self`` when there is nothing to move, otherwise the array
        produced by moving the requested axes.

    Raises
    ------
    ValueError
        If every key axis would be moved to the values while no value
        axis is moved to the keys.
    """
    # The contract above allows None for either direction, but None cannot
    # be converted to an integer array; treat it as "no axes".
    if kaxes is None:
        kaxes = ()
    if vaxes is None:
        vaxes = ()

    kaxes = asarray(tupleize(kaxes), 'int')
    vaxes = asarray(tupleize(vaxes), 'int')
    if type(size) is not str:
        size = tupleize(size)

    if len(kaxes) == self.keys.ndim and len(vaxes) == 0:
        raise ValueError('Cannot perform a swap that would '
                         'end up with all data on a single key')

    if len(kaxes) == 0 and len(vaxes) == 0:
        return self

    # Zero-dimensional values are given a temporary trailing axis of
    # length 1 for the move, which is removed again afterwards.
    scalar_values = self.values.ndim == 0
    if scalar_values:
        rdd = self._rdd.mapValues(lambda v: array(v, ndmin=1))
        shape = self._shape + (1,)
    else:
        rdd = self._rdd
        shape = self._shape

    from bolt.spark.chunk import ChunkedArray

    c = ChunkedArray(rdd, shape=shape, split=self._split, dtype=self._dtype)
    chunks = c.chunk(size, axis=vaxes)
    barray = chunks.move(kaxes, vaxes)

    if scalar_values:
        # Undo the temporary axis added above.
        barray._rdd = barray._rdd.mapValues(lambda v: v.squeeze())
        barray._shape = barray._shape[:-1]

    return barray
def swap(self, kaxes, vaxes, size="150"):
    """
    Swap axes from keys to values.

    This is the core operation underlying shape manipulation
    on the Spark bolt array. It exchanges an arbitrary set of axes
    between the keys and the values. If either is None, will only
    move axes in one direction (from keys to values, or values to keys).
    Keys moved to values will be placed immediately after the split;
    values moved to keys will be placed immediately before the split.

    Parameters
    ----------
    kaxes : tuple or None
        Axes from keys to move to values. None means no key axes
        are moved.

    vaxes : tuple or None
        Axes from values to move to keys. None means no value axes
        are moved.

    size : tuple, int, or str, optional, default = "150"
        Either a string giving the size in megabytes, or a tuple
        describing the chunking along each value dimension being
        moved. The value is forwarded to ``ChunkedArray.chunk``.

    Returns
    -------
    BoltArraySpark
        ``self`` when there is nothing to move, otherwise the array
        produced by moving the requested axes.

    Raises
    ------
    ValueError
        If every key axis would be moved to the values while no value
        axis is moved to the keys.
    """
    # The contract above allows None for either direction, but None cannot
    # be converted to an integer array; treat it as "no axes".
    if kaxes is None:
        kaxes = ()
    if vaxes is None:
        vaxes = ()

    kaxes = asarray(tupleize(kaxes), 'int')
    vaxes = asarray(tupleize(vaxes), 'int')
    if type(size) is not str:
        size = tupleize(size)

    if len(kaxes) == self.keys.ndim and len(vaxes) == 0:
        raise ValueError('Cannot perform a swap that would '
                         'end up with all data on a single key')

    if len(kaxes) == 0 and len(vaxes) == 0:
        return self

    # Zero-dimensional values are given a temporary trailing axis of
    # length 1 for the move, which is removed again afterwards.
    scalar_values = self.values.ndim == 0
    if scalar_values:
        rdd = self._rdd.mapValues(lambda v: array(v, ndmin=1))
        shape = self._shape + (1,)
    else:
        rdd = self._rdd
        shape = self._shape

    from bolt.spark.chunk import ChunkedArray

    c = ChunkedArray(rdd, shape=shape, split=self._split, dtype=self._dtype)
    chunks = c.chunk(size, axis=vaxes)
    barray = chunks.move(kaxes, vaxes)

    if scalar_values:
        # Undo the temporary axis added above.
        barray._rdd = barray._rdd.mapValues(lambda v: v.squeeze())
        barray._shape = barray._shape[:-1]

    return barray