def np_rdd_to_bolt(in_rdd):
    """
    Wrap an RDD of (key, ndarray) records as a BoltArraySpark,
    distributed along the first axis.
    """
    import numpy as np
    from bolt.spark.array import BoltArraySpark

    f_key, f_val = in_rdd.first()
    bolt_shape = tuple([in_rdd.count()] + list(f_val.shape))
    out_bolt = BoltArraySpark(rdd=in_rdd, shape=bolt_shape, split=1,
                              dtype=f_val.dtype)
    out_bolt._mode = '4Quant IQAE Engine'
    # monkey-patch flatten to collapse all value axes into one
    out_bolt.flatten = lambda: out_bolt.reshape(
        (out_bolt.shape[0], np.prod(out_bolt.shape[1:])))
    return out_bolt
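# A minimal usage sketch for np_rdd_to_bolt (illustrative, not from the
# source), assuming a live SparkContext `sc` and fixed-shape values.
import numpy as np

rdd = sc.parallelize([((i,), np.zeros((4, 4))) for i in range(3)])
bolt = np_rdd_to_bolt(rdd)
assert bolt.shape == (3, 4, 4)
assert bolt.flatten().shape == (3, 16)   # the monkey-patched flatten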
def unchunk(self):
    """
    Convert a chunked array back into a full array with (key, value)
    pairs where key is a tuple of indices, and value is an ndarray.
    """
    plan, padding, vshape = self.plan, self.padding, self.vshape
    nchunks = self.getnumber(plan, vshape)
    full_shape = concatenate((nchunks, plan))
    n = len(vshape)
    perm = concatenate(list(zip(range(n), range(n, 2 * n))))

    if self.uniform:
        def _unchunk(v):
            idx, data = v
            sorted_idx = tuplesort(idx)
            return asarray(data)[sorted_idx].reshape(full_shape).transpose(perm).reshape(vshape)
    else:
        def _unchunk(v):
            idx, data = v
            arr = empty(nchunks, dtype='object')
            for (i, d) in zip(idx, data):
                arr[i] = d
            return allstack(arr.tolist())

    # remove padding
    if self.padded:
        removepad = self.removepad
        rdd = self._rdd.map(lambda kv: (kv[0], removepad(
            kv[0][1], kv[1], nchunks, padding, axes=range(n))))
    else:
        rdd = self._rdd

    # undo chunking
    switch = self.switch
    rdd = rdd.map(switch)

    # skip groupByKey if there is not actually any chunking
    if array_equal(self.plan, self.vshape):
        rdd = rdd.mapValues(lambda v: zip(*(v,)))
    else:
        rdd = rdd.groupByKey().mapValues(lambda v: zip(*v.data))
    rdd = rdd.mapValues(_unchunk)

    if array_equal(self.vshape, [1]):
        rdd = rdd.mapValues(lambda v: squeeze(v))
        newshape = self.shape[:-1]
    else:
        newshape = self.shape

    return BoltArraySpark(rdd, shape=newshape, split=self._split,
                          dtype=self.dtype, ordered=False)
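# A chunk/unchunk round-trip sketch (an assumption, not part of the source),
# using bolt's top-level constructor and a live SparkContext `sc`.
from numpy import arange, allclose
from bolt import array

x = arange(4 * 6).reshape(4, 6)
b = array(x, sc, axis=(0,))        # keys span axis 0, values are (6,) rows
c = b.chunk(size=(3,))             # split each value into length-3 chunks
assert allclose(c.unchunk().toarray(), x)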
def transpose(self, *axes):
    """
    Transpose just the keys of a BoltArraySpark, returning a
    new BoltArraySpark.

    Parameters
    ----------
    axes : tuple
        New proposed axes.
    """
    new = argpack(axes)
    old = tuple(range(self.ndim))  # tuple, so the no-op check below can match
    istransposeable(new, old)

    if new == old:
        return self._barray

    def f(k):
        return tuple(k[i] for i in new)

    newrdd = self._barray._rdd.map(lambda kv: (f(kv[0]), kv[1]))
    newshape = tuple(self.shape[i] for i in new) + self._barray.values.shape

    return BoltArraySpark(newrdd, shape=newshape).__finalize__(self._barray)
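# A key-transpose sketch (hypothetical shapes), assuming a live SparkContext
# `sc` and the `keys` shaping property: only key axes are permuted.
from numpy import ones
from bolt import array

b = array(ones((2, 3, 4)), sc, axis=(0, 1))   # keys span axes 0 and 1
t = b.keys.transpose(1, 0)                    # swap the two key axes
assert t.shape == (3, 2, 4)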
def reshape(self, *shape):
    """
    Reshape just the keys of a BoltArraySpark, returning a
    new BoltArraySpark.

    Parameters
    ----------
    shape : tuple
        New proposed shape.
    """
    new = argpack(shape)
    old = self.shape
    isreshapeable(new, old)

    if new == old:
        return self._barray

    def f(k):
        return unravel_index(ravel_multi_index(k, old), new)

    newrdd = self._barray._rdd.map(lambda kv: (f(kv[0]), kv[1]))
    newsplit = len(new)
    newshape = new + self._barray.values.shape

    return BoltArraySpark(newrdd, shape=newshape, split=newsplit)
def transpose(self, *axes):
    """
    Transpose just the values of a BoltArraySpark, returning a
    new BoltArraySpark.

    Parameters
    ----------
    axes : tuple
        New proposed axes.
    """
    new = argpack(axes)
    old = tuple(range(self.ndim))  # tuple, so the no-op check below can match
    istransposeable(new, old)

    if new == old:
        return self._barray

    def f(v):
        return v.transpose(new)

    newrdd = self._barray._rdd.mapValues(f)
    newshape = self._barray.keys.shape + tuple(self.shape[i] for i in new)

    return BoltArraySpark(newrdd, shape=newshape).__finalize__(self._barray)
def reshape(self, *shape):
    """
    Reshape just the values of a BoltArraySpark, returning a
    new BoltArraySpark.

    Parameters
    ----------
    shape : tuple
        New proposed shape.
    """
    new = argpack(shape)
    old = self.shape
    isreshapeable(new, old)

    if new == old:
        return self._barray

    def f(v):
        return v.reshape(new)

    newrdd = self._barray._rdd.mapValues(f)
    newshape = self._barray.keys.shape + new

    return BoltArraySpark(newrdd, shape=newshape).__finalize__(self._barray)
def unchunk(self):
    """
    Convert a chunked array back into a full array with (key, value)
    pairs where key is a tuple of indices, and value is an ndarray.
    """
    plan, vshape = self.plan, self.vshape
    nchunks = self.getnumber(plan, vshape)
    full_shape = concatenate((nchunks, plan))
    n = len(vshape)
    perm = concatenate(list(zip(range(n), range(n, 2 * n))))

    if self.uniform:
        def _unchunk(v):
            idx, data = zip(*v.data)
            sorted_idx = tuplesort(idx)
            return asarray(data)[sorted_idx].reshape(full_shape).transpose(perm).reshape(vshape)
    else:
        def _unchunk(v):
            idx, data = zip(*v.data)
            arr = empty(nchunks, dtype='object')
            for (i, d) in zip(idx, data):
                arr[i] = d
            return allstack(arr.tolist())

    switch = self.switch
    rdd = self._rdd.map(switch).groupByKey().mapValues(_unchunk)

    return BoltArraySpark(rdd, shape=self.shape, split=self._split)
def fromrdd(rdd, nrecords=None, shape=None, index=None, labels=None, dtype=None, ordered=False):
    """
    Load series data from a Spark RDD.

    Assumes keys are tuples with increasing and unique indices,
    and values are 1d ndarrays. Will try to infer properties
    that are not explicitly provided.

    Parameters
    ----------
    rdd : SparkRDD
        An RDD containing series data.

    nrecords : int, optional, default = None
        Number of records (if provided will avoid check).

    shape : tuple or array, optional, default = None
        Total shape of data (if provided will avoid check).

    index : array, optional, default = None
        Index for records, if not provided will use (0, 1, ...).

    labels : array, optional, default = None
        Labels for records. If provided, should have shape of shape[:-1].

    dtype : string, default = None
        Data numerical type (if provided will avoid check).

    ordered : boolean, optional, default = False
        Whether or not the rdd is ordered by key.
    """
    from .series import Series
    from bolt.spark.array import BoltArraySpark

    if index is None or dtype is None:
        item = rdd.values().first()
        if index is None:
            index = range(len(item))
        if dtype is None:
            dtype = item.dtype

    if nrecords is None and shape is not None:
        nrecords = prod(shape[:-1])

    if nrecords is None:
        nrecords = rdd.count()

    if shape is None:
        shape = (nrecords, asarray(index).shape[0])

    def process_keys(record):
        k, v = record
        if isinstance(k, int):
            k = (k,)
        return k, v

    values = BoltArraySpark(rdd.map(process_keys), shape=shape, dtype=dtype,
                            split=len(shape)-1, ordered=ordered)
    return Series(values, index=index, labels=labels)
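# A loading sketch with illustrative records (not from the source), assuming
# a live SparkContext `sc`: keys are singleton tuples, values are 1d ndarrays.
from numpy import array as nparray

rdd = sc.parallelize([((i,), nparray([1.0, 2.0, 3.0])) for i in range(4)])
series = fromrdd(rdd, nrecords=4, dtype='float64', ordered=True)
assert series.shape == (4, 3)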
def array(a, context=None, axis=(0,), dtype=None, npartitions=None):
    """
    Create a Spark bolt array from a local array.

    Parameters
    ----------
    a : array-like
        An array, any object exposing the array interface, an object whose
        __array__ method returns an array, or any (nested) sequence.

    context : SparkContext
        A context running Spark. (see pyspark)

    axis : tuple, optional, default=(0,)
        Which axes to distribute the array along. The resulting distributed
        object will use keys to represent these axes, with the remaining
        axes represented by values.

    dtype : data-type, optional, default=None
        The desired data-type for the array. If None, will be determined
        from the data. (see numpy)

    npartitions : int
        Number of partitions for parallelization.

    Returns
    -------
    BoltArraySpark
    """
    if dtype is None:
        arry = asarray(a)
        dtype = arry.dtype
    else:
        arry = asarray(a, dtype)
    shape = arry.shape
    ndim = len(shape)

    # handle the axes specification and transpose if necessary
    axes = ConstructSpark._format_axes(axis, arry.shape)
    key_axes, value_axes = get_kv_axes(arry.shape, axes)
    permutation = key_axes + value_axes
    arry = arry.transpose(*permutation)
    split = len(axes)

    if split < 1:
        raise ValueError("split axis must be greater than 0, got %g" % split)
    if split > len(shape):
        raise ValueError("split axis must not exceed number of axes %g, got %g" % (ndim, split))

    key_shape = shape[:split]
    val_shape = shape[split:]

    keys = zip(*unravel_index(arange(0, int(prod(key_shape))), key_shape))
    vals = arry.reshape((prod(key_shape),) + val_shape)

    rdd = context.parallelize(zip(keys, vals), npartitions)
    return BoltArraySpark(rdd, shape=shape, split=split, dtype=dtype)
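# A construction sketch assuming a live SparkContext `sc`: distribute a local
# array along its first axis and read it back to verify the round trip.
from numpy import arange

x = arange(2 * 3 * 4).reshape(2, 3, 4)
b = array(x, sc, axis=(0,), npartitions=2)
assert b.shape == (2, 3, 4) and b.split == 1
assert (b.toarray() == x).all()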
def _2D_stackable_preamble(sc, num_partitions=2):
    # build a 10x10 array whose rows are constant, distribute it along
    # axis 0, and repartition so stacking spans a known partition count
    dims = (10, 10)
    arr = vstack([[x] * dims[1] for x in arange(dims[0])])
    barr = array(arr, sc, axis=0)
    barr = BoltArraySpark(barr._rdd.partitionBy(num_partitions),
                          shape=barr.shape, split=barr.split)
    return barr
def fromrdd(rdd, dims=None, nrecords=None, dtype=None, labels=None, ordered=False):
    """
    Load images from a Spark RDD.

    Input RDD must be a collection of key-value pairs where keys are
    singleton tuples indexing images, and values are 2d or 3d ndarrays.

    Parameters
    ----------
    rdd : SparkRDD
        An RDD containing the images.

    dims : tuple or array, optional, default = None
        Image dimensions (if provided will avoid check).

    nrecords : int, optional, default = None
        Number of images (if provided will avoid check).

    dtype : string, default = None
        Data numerical type (if provided will avoid check).

    labels : array, optional, default = None
        Labels for records. If provided, should be one-dimensional.

    ordered : boolean, optional, default = False
        Whether or not the rdd is ordered by key.
    """
    from .images import Images
    from bolt.spark.array import BoltArraySpark

    if dims is None or dtype is None:
        item = rdd.values().first()
        # only infer what was not explicitly provided
        if dtype is None:
            dtype = item.dtype
        if dims is None:
            dims = item.shape

    if nrecords is None:
        nrecords = rdd.count()

    def process_keys(record):
        k, v = record
        if isinstance(k, int):
            k = (k,)
        return k, v

    values = BoltArraySpark(rdd.map(process_keys), shape=(nrecords,) + tuple(dims),
                            dtype=dtype, split=1, ordered=ordered)
    return Images(values, labels=labels)
def _3D_stackable_preamble(sc, num_partitions=2):
    # build a 10x10x10 array whose planes are constant, distribute it along
    # axis 0, and repartition so stacking spans a known partition count
    dims = (10, 10, 10)
    area = dims[0] * dims[1]
    arr = asarray([repeat(x, area).reshape(dims[0], dims[1])
                   for x in range(dims[2])])
    barr = array(arr, sc, axis=0)
    barr = BoltArraySpark(barr._rdd.partitionBy(num_partitions),
                          shape=barr.shape, split=barr.split)
    return barr
def unstack(self):
    """
    Unstack array and return a new BoltArraySpark via flatMap().
    """
    from bolt.spark.array import BoltArraySpark

    if self._rekeyed:
        rdd = self._rdd
    else:
        rdd = self._rdd.flatMap(lambda kv: zip(kv[0], list(kv[1])))

    return BoltArraySpark(rdd, shape=self.shape, split=self.split)
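# A stack/unstack round-trip sketch (assumes BoltArraySpark's `stack` method
# produces the stacked object this `unstack` lives on), with a live `sc`.
from numpy import ones
from bolt import array

b = array(ones((10, 5)), sc, axis=(0,))
s = b.stack(size=2)                  # group records within each partition
assert (s.unstack().toarray() == b.toarray()).all()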
def _wrap(func, shape, context=None, axis=(0,), dtype=None, npartitions=None):
    """
    Wrap an existing numpy constructor in a parallelized construction.
    """
    if isinstance(shape, int):
        shape = (shape,)
    key_shape, value_shape = get_kv_shape(shape, ConstructSpark._format_axes(axis, shape))
    split = len(key_shape)

    # make the keys
    rdd = context.parallelize(list(product(*[arange(x) for x in key_shape])), npartitions)

    # use a map to make the arrays in parallel
    rdd = rdd.map(lambda x: (x, func(value_shape, dtype, order='C')))
    return BoltArraySpark(rdd, shape=shape, split=split, dtype=dtype)
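# A sketch of the wrapped-constructor pattern (parameters are illustrative):
# build a distributed array of zeros without materializing it locally.
from numpy import zeros

b = ConstructSpark._wrap(zeros, (100, 50), context=sc, axis=(0,),
                         dtype='float64', npartitions=4)
assert b.shape == (100, 50)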
def reshape(self, *shape):
    # reshape just the values of a BoltArraySpark; keys are untouched
    new = argpack(shape)
    old = self.shape
    isreshapeable(new, old)

    if new == old:
        return self._barray

    def f(v):
        return v.reshape(new)

    newrdd = self._barray._rdd.mapValues(f)
    newshape = self._barray.keys.shape + new

    return BoltArraySpark(newrdd, shape=newshape).__finalize__(self._barray)
def transpose(self, *axes):
    # transpose just the values of a BoltArraySpark; keys are untouched
    new = argpack(axes)
    old = tuple(range(self.ndim))  # tuple, so the no-op check below can match
    istransposeable(new, old)

    if new == old:
        return self._barray

    def f(v):
        return v.transpose(new)

    newrdd = self._barray._rdd.mapValues(f)
    newshape = self._barray.keys.shape + tuple(self.shape[i] for i in new)

    return BoltArraySpark(newrdd, shape=newshape).__finalize__(self._barray)
def element_wise(self, other, op):
    """
    Apply an elementwise operation to data.

    Both self and other data must have the same mode.
    If self is in local mode, other can also be a numpy array.
    Self and other must have the same shape, or other must be a scalar.

    Parameters
    ----------
    other : Data or numpy array
        Data to apply elementwise operation to.

    op : function
        Binary operator to use for elementwise operations, e.g. add, subtract.
    """
    if not isscalar(other) and not self.shape == other.shape:
        raise ValueError("shapes %s and %s must be equal" % (self.shape, other.shape))

    if not isscalar(other) and isinstance(other, Data) and not self.mode == other.mode:
        raise NotImplementedError

    if isscalar(other):
        return self.map(lambda x: op(x, other))

    if self.mode == 'local' and isinstance(other, ndarray):
        return self._constructor(op(self.values, other)).__finalize__(self)

    if self.mode == 'local' and isinstance(other, Data):
        return self._constructor(op(self.values, other.values)).__finalize__(self)

    if self.mode == 'spark' and isinstance(other, Data):
        def func(record):
            (k1, x), (k2, y) = record
            return k1, op(x, y)

        rdd = self.tordd().zip(other.tordd()).map(func)
        barray = BoltArraySpark(rdd, shape=self.shape, dtype=self.dtype,
                                split=self.values.split)
        return self._constructor(barray).__finalize__(self)
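# An elementwise sketch (`data` is a hypothetical Data instance): operands
# must share mode and shape, or `other` may be a scalar.
from operator import add

total = data.element_wise(data, add)     # same result as data + data
shifted = data.element_wise(2, add)      # scalar broadcasts via map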
def reshape(self, *shape):
    # reshape just the keys of a BoltArraySpark; values are untouched
    new = argpack(shape)
    old = self.shape
    isreshapeable(new, old)

    if new == old:
        return self._barray

    def f(k):
        return unravel_index(ravel_multi_index(k, old), new)

    newrdd = self._barray._rdd.map(lambda kv: (f(kv[0]), kv[1]))
    newsplit = len(new)
    newshape = new + self._barray.values.shape

    return BoltArraySpark(newrdd, shape=newshape, split=newsplit)
def transpose(self, *axes):
    # transpose just the keys of a BoltArraySpark; values are untouched
    new = argpack(axes)
    old = tuple(range(self.ndim))  # tuple, so the no-op check below can match
    istransposeable(new, old)

    if new == old:
        return self._barray

    def f(k):
        return tuple(k[i] for i in new)

    newrdd = self._barray._rdd.map(lambda kv: (f(kv[0]), kv[1]))
    newshape = tuple(self.shape[i] for i in new) + self._barray.values.shape

    return BoltArraySpark(newrdd, shape=newshape).__finalize__(self._barray)
def map_generic(self, func):
    """
    Apply a generic array -> object function to each subarray.

    The resulting object is a BoltArraySpark of dtype object where the
    blocked dimensions are replaced with indices indicating the block ID.
    """
    def process_record(val):
        newval = empty(1, dtype="object")
        newval[0] = func(val)
        return newval

    rdd = self._rdd.mapValues(process_record)

    nchunks = self.getnumber(self.plan, self.vshape)
    newshape = tuple([int(s) for s in r_[self.kshape, nchunks]])
    newsplit = len(self.shape)

    return BoltArraySpark(rdd, shape=newshape, split=newsplit,
                          ordered=self._ordered, dtype="object")
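# A map_generic sketch (illustrative chunking, assuming bolt's constructor
# and chunk method with a live `sc`): each chunk collapses to one Python
# object, keyed by (original key..., chunk index...).
from numpy import ones
from bolt import array

b = array(ones((4, 9)), sc, axis=(0,))
c = b.chunk(size=(3,))
sums = c.map_generic(lambda block: float(block.sum()))   # object-dtype result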
def move(self, kaxes=(), vaxes=()):
    """
    Move some of the dimensions in the values into the keys.

    Along with chunking, this is required as part of the swap method
    on the BoltArraySpark, see there for further details.

    Parameters
    ----------
    kaxes : tuple, optional, default=()
        Axes from keys to move to values.

    vaxes : tuple, optional, default=()
        Axes from values to move to keys.
    """
    kmask, vmask = self.kmask(kaxes), self.vmask(vaxes)

    # make sure chunking is done only on the moving dimensions
    nchunks = asarray(self.getnumber(self.plan, self.vshape))
    if any(logical_and(nchunks != 1, ~vmask)):
        raise NotImplementedError("Chunking along non-swapped axes is not supported")

    def _move(record):
        k, chk, data = asarray(record[0]), asarray(record[1][0]), asarray(record[1][1])
        stationary_keys, moving_keys = tuple(k[~kmask]), tuple(k[kmask])
        moving_values, stationary_values = tuple(chk[vmask]), tuple(chk[~vmask])
        return (stationary_keys, moving_values), (moving_keys + stationary_values, data)

    switch = self.switch
    rdd = self._rdd.map(switch).map(_move)

    moving_kshape = tuple(self.kshape[kmask])

    def _rebuild(v):
        idx, data = zip(*v.data)
        valshape = data[0].shape
        fullshape = moving_kshape + valshape
        sorted_idx = tuplesort(idx)
        return asarray(data)[sorted_idx].reshape(fullshape)

    rdd = rdd.groupByKey().mapValues(_rebuild)

    mask = [False for _ in moving_kshape]
    mask.extend([True if vmask[k] else False for k in range(len(vmask))])
    mask = asarray(mask)

    slices = [slice(0, i, 1) for i in moving_kshape]
    slices.extend([None if vmask[i] else slice(0, self.vshape[i], 1) for i in range(len(vmask))])
    slices = asarray(slices)

    sizes = self.plan

    def _extract(record):
        k, v = record[0], record[1]
        stationary_key, chunk = k[0], k[1]
        key_offsets = prod([asarray(chunk), asarray(sizes)[vmask]], axis=0)
        bounds = asarray(v.shape[len(kaxes):])[vmask]
        indices = list(product(*map(lambda x: arange(x), bounds)))

        for b in indices:
            s = slices.copy()
            s[mask] = b
            yield (tuple(asarray(r_[stationary_key, key_offsets + b], dtype='int')),
                   v[tuple(s)])

    rdd = rdd.flatMap(_extract)

    split = self._split - len(kaxes) + len(vaxes)
    shape = tuple(r_[self.kshape[~kmask], self.vshape[vmask],
                     self.kshape[kmask], self.vshape[~vmask]].astype('int'))

    return BoltArraySpark(rdd, shape=shape, split=split)
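# A swap sketch (assumes BoltArraySpark's `swap`, which chunks and then calls
# `move`): exchange key axis 0 with value axis 1 of a (2, 3, 4) array. The
# resulting shape follows the reordering computed at the end of `move` above.
from numpy import ones
from bolt import array

b = array(ones((2, 3, 4)), sc, axis=(0,))   # keys (2,), values (3, 4)
s = b.swap((0,), (1,))                      # value axis 1 moves into the keys
assert s.shape == (4, 2, 3)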
def unstack(self):
    # flatten each stacked record back into one (key, value) pair per key
    from bolt.spark.array import BoltArraySpark

    return BoltArraySpark(self._rdd.flatMap(lambda kv: zip(kv[0], list(kv[1]))),
                          shape=self.shape, split=self.split)
def unchunk(self):
    """
    Convert a chunked array back into a full array with (key, value)
    pairs where key is a tuple of indices, and value is an ndarray.
    """
    plan, padding, vshape, split = self.plan, self.padding, self.vshape, self.split
    nchunks = self.getnumber(plan, vshape)
    full_shape = concatenate((nchunks, plan))
    n = len(vshape)
    perm = concatenate(list(zip(range(n), range(n, 2 * n))))

    if self.uniform:
        def _unchunk(it):
            ordered = sorted(it, key=lambda kv: kv[0][split:])
            keys, values = zip(*ordered)
            yield keys[0][:split], asarray(values).reshape(full_shape).transpose(perm).reshape(vshape)
    else:
        def _unchunk(it):
            ordered = sorted(it, key=lambda kv: kv[0][split:])
            keys, values = zip(*ordered)
            k_chks = [k[split:] for k in keys]
            arr = empty(nchunks, dtype='object')
            for (i, d) in zip(k_chks, values):
                arr[i] = d
            yield keys[0][:split], allstack(arr.tolist())

    # remove padding
    if self.padded:
        removepad = self.removepad
        rdd = self._rdd.map(lambda kv: (kv[0], removepad(
            kv[0][split:], kv[1], nchunks, padding, axes=range(n))))
    else:
        rdd = self._rdd

    # skip partitionBy if there is not actually any chunking
    if array_equal(self.plan, self.vshape):
        rdd = rdd.map(lambda kv: (kv[0][:split], kv[1]))
        ordered = self._ordered
    else:
        ranges = self.kshape
        npartitions = int(prod(ranges))
        if len(self.kshape) == 0:
            partitioner = lambda k: 0
        else:
            partitioner = lambda k: ravel_multi_index(k[:split], ranges)
        rdd = rdd.partitionBy(numPartitions=npartitions,
                              partitionFunc=partitioner).mapPartitions(_unchunk)
        ordered = True

    if array_equal(self.vshape, [1]):
        rdd = rdd.mapValues(lambda v: squeeze(v))
        newshape = self.shape[:-1]
    else:
        newshape = self.shape

    return BoltArraySpark(rdd, shape=newshape, split=self._split,
                          dtype=self.dtype, ordered=ordered)