Example #1
import numpy as np
from bolt.spark.array import BoltArraySpark

def np_rdd_to_bolt(in_rdd):
    # infer the value shape and dtype from the first record
    f_key, f_val = in_rdd.first()
    bolt_shape = tuple([in_rdd.count()] + list(f_val.shape))
    out_bolt = BoltArraySpark(rdd=in_rdd,
                              shape=bolt_shape,
                              split=1,
                              dtype=f_val.dtype)
    out_bolt._mode = '4Quant IQAE Engine'
    # attach a flatten shortcut that collapses all value axes into one
    out_bolt.flatten = lambda: out_bolt.reshape(
        (out_bolt.shape[0], np.prod(out_bolt.shape[1:])))
    return out_bolt
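
A minimal usage sketch (not from the source), assuming a live SparkContext `sc`; the input RDD must hold (tuple-key, ndarray) pairs with uniformly shaped values:

import numpy as np

# hypothetical input: four records, each a 2x3 array keyed by a 1-tuple
rdd = sc.parallelize([((i,), np.ones((2, 3))) for i in range(4)])
bolt = np_rdd_to_bolt(rdd)
print(bolt.shape)       # (4, 2, 3)
flat = bolt.flatten()   # collapses the value axes: shape (4, 6)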
Example #2
    def unchunk(self):
        """
        Convert a chunked array back into a full array with (key,value) pairs
        where key is a tuple of indices, and value is an ndarray.
        """
        plan, padding, vshape = self.plan, self.padding, self.vshape
        nchunks = self.getnumber(plan, vshape)
        full_shape = concatenate((nchunks, plan))
        n = len(vshape)
        perm = concatenate(list(zip(range(n), range(n, 2 * n))))

        if self.uniform:

            def _unchunk(v):
                # v is a pair of (chunk indices, data blocks) tuples
                idx, data = v
                sorted_idx = tuplesort(idx)
                return asarray(data)[sorted_idx].reshape(full_shape).transpose(
                    perm).reshape(vshape)
        else:

            def _unchunk(v):
                # v is a pair of (chunk indices, data blocks) tuples
                idx, data = v
                arr = empty(nchunks, dtype='object')
                for (i, d) in zip(idx, data):
                    arr[i] = d
                return allstack(arr.tolist())

        # remove padding
        if self.padded:
            removepad = self.removepad
            rdd = self._rdd.map(lambda kv: (
                kv[0],
                removepad(kv[0][1], kv[1], nchunks, padding, axes=range(n))))
        else:
            rdd = self._rdd

        # undo chunking
        switch = self.switch
        rdd = rdd.map(switch)

        # skip groupByKey if there is not actually any chunking
        if array_equal(self.plan, self.vshape):
            # wrap the single (index, block) pair to match the grouped layout
            rdd = rdd.mapValues(lambda v: zip(*(v, )))
        else:
            rdd = rdd.groupByKey().mapValues(lambda v: zip(*v.data))
        rdd = rdd.mapValues(_unchunk)

        if array_equal(self.vshape, [1]):
            rdd = rdd.mapValues(lambda v: squeeze(v))
            newshape = self.shape[:-1]
        else:
            newshape = self.shape

        return BoltArraySpark(rdd,
                              shape=newshape,
                              split=self._split,
                              dtype=self.dtype,
                              ordered=False)
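
For orientation, a round-trip sketch through the public bolt API (assuming a SparkContext `sc`); `chunk` builds the ChunkedArray on which this `unchunk` is defined:

import numpy as np
from bolt import array

barr = array(np.ones((4, 100, 100)), sc, axis=(0,))  # keys (4,), values (100, 100)
chunked = barr.chunk(size=(50, 50))                  # 2x2 chunks per value block
restored = chunked.unchunk()                         # reassemble the full values
assert restored.toarray().shape == (4, 100, 100)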
Example #3
    def transpose(self, *axes):
        """
        Transpose just the keys of a BoltArraySpark, returning a
        new BoltArraySpark.

        Parameters
        ----------
        axes : tuple
            New proposed axes.
        """
        new = argpack(axes)
        old = tuple(range(self.ndim))
        istransposeable(new, old)

        if new == old:
            return self._barray

        def f(k):
            return tuple(k[i] for i in new)

        newrdd = self._barray._rdd.map(lambda kv: (f(kv[0]), kv[1]))
        newshape = tuple(self.shape[i]
                         for i in new) + self._barray.values.shape

        return BoltArraySpark(newrdd,
                              shape=newshape).__finalize__(self._barray)
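
A usage sketch (assuming a SparkContext `sc`); the `keys` property exposes this wrapper on a BoltArraySpark:

import numpy as np
from bolt import array

barr = array(np.ones((2, 3, 4)), sc, axis=(0, 1))  # key axes (2, 3), values (4,)
flipped = barr.keys.transpose((1, 0))              # reorder only the key axes
print(flipped.shape)                               # (3, 2, 4)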
Example #4
    def reshape(self, *shape):
        """
        Reshape just the keys of a BoltArraySpark, returning a
        new BoltArraySpark.

        Parameters
        ----------
        shape : tuple
            New proposed shape.
        """
        new = argpack(shape)
        old = self.shape
        isreshapeable(new, old)

        if new == old:
            return self._barray

        def f(k):
            return unravel_index(ravel_multi_index(k, old), new)

        newrdd = self._barray._rdd.map(lambda kv: (f(kv[0]), kv[1]))
        newsplit = len(new)
        newshape = new + self._barray.values.shape

        return BoltArraySpark(newrdd, shape=newshape, split=newsplit)
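
Continuing the same assumptions, a sketch that fuses the two key axes into one:

import numpy as np
from bolt import array

barr = array(np.ones((2, 3, 4)), sc, axis=(0, 1))
merged = barr.keys.reshape((6,))     # keys (2, 3) -> (6,); values untouched
print(merged.shape, merged.split)    # (6, 4) 1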
Example #5
    def transpose(self, *axes):
        """
        Transpose just the values of a BoltArraySpark, returning a
        new BoltArraySpark.

        Parameters
        ----------
        axes : tuple
            New proposed axes.
        """
        new = argpack(axes)
        old = tuple(range(self.ndim))
        istransposeable(new, old)

        if new == old:
            return self._barray

        def f(v):
            return v.transpose(new)

        newrdd = self._barray._rdd.mapValues(f)
        newshape = self._barray.keys.shape + tuple(self.shape[i] for i in new)

        return BoltArraySpark(newrdd,
                              shape=newshape).__finalize__(self._barray)
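
The analogous sketch for the values side (same assumptions):

import numpy as np
from bolt import array

barr = array(np.ones((2, 3, 4)), sc, axis=(0,))  # keys (2,), values (3, 4)
flipped = barr.values.transpose((1, 0))          # transpose each value block
print(flipped.shape)                             # (2, 4, 3)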
Example #6
    def reshape(self, *shape):
        """
        Reshape just the values of a BoltArraySpark, returning a
        new BoltArraySpark.

        Parameters
        ----------
        shape : tuple
            New proposed shape.
        """
        new = argpack(shape)
        old = self.shape
        isreshapeable(new, old)

        if new == old:
            return self._barray

        def f(v):
            return v.reshape(new)

        newrdd = self._barray._rdd.mapValues(f)
        newshape = self._barray.keys.shape + new

        return BoltArraySpark(newrdd,
                              shape=newshape).__finalize__(self._barray)
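
And for reshaping values (same assumptions):

import numpy as np
from bolt import array

barr = array(np.ones((2, 3, 4)), sc, axis=(0,))
flat = barr.values.reshape((12,))   # flatten each record's value block
print(flat.shape)                   # (2, 12)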
Example #7
    def unchunk(self):
        """
        Convert a chunked array back into a full array with (key,value) pairs
        where key is a tuple of indices, and value is an ndarray.
        """
        plan, vshape = self.plan, self.vshape
        nchunks = self.getnumber(plan, vshape)

        full_shape = concatenate((nchunks, plan))
        n = len(vshape)
        perm = concatenate(list(zip(range(n), range(n, 2*n))))

        if self.uniform:
            def _unchunk(v):
                idx, data = zip(*v.data)
                sorted_idx = tuplesort(idx)
                return asarray(data)[sorted_idx].reshape(full_shape).transpose(perm).reshape(vshape)
        else:
            def _unchunk(v):
                idx, data = zip(*v.data)
                arr = empty(nchunks, dtype='object')
                for (i, d) in zip(idx, data):
                    arr[i] = d
                return allstack(arr.tolist())

        switch = self.switch
        rdd = self._rdd.map(switch).groupByKey().mapValues(_unchunk)
        return BoltArraySpark(rdd, shape=self.shape, split=self._split)
Example #8
def fromrdd(rdd, nrecords=None, shape=None, index=None, labels=None, dtype=None, ordered=False):
    """
    Load series data from a Spark RDD.

    Assumes keys are tuples with increasing and unique indices,
    and values are 1d ndarrays. Will try to infer properties
    that are not explicitly provided.

    Parameters
    ----------
    rdd : SparkRDD
        An RDD containing series data.

    shape : tuple or array, optional, default = None
        Total shape of data (if provided will avoid check).

    nrecords : int, optional, default = None
        Number of records (if provided will avoid check).

    index : array, optional, default = None
        Index for records, if not provided will use (0, 1, ...)

    labels : array, optional, default = None
        Labels for records. If provided, should have shape of shape[:-1].

    dtype : string, default = None
        Data numerical type (if provided will avoid check).

    ordered : boolean, optional, default = False
        Whether or not the rdd is ordered by key.
    """
    from .series import Series
    from bolt.spark.array import BoltArraySpark

    if index is None or dtype is None:
        item = rdd.values().first()

    if index is None:
        index = range(len(item))

    if dtype is None:
        dtype = item.dtype

    if nrecords is None and shape is not None:
        nrecords = prod(shape[:-1])

    if nrecords is None:
        nrecords = rdd.count()

    if shape is None:
        shape = (nrecords, asarray(index).shape[0])

    def process_keys(record):
        k, v = record
        if isinstance(k, int):
            k = (k,)
        return k, v

    values = BoltArraySpark(rdd.map(process_keys), shape=shape, dtype=dtype, split=len(shape)-1, ordered=ordered)
    return Series(values, index=index, labels=labels)
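
A usage sketch, assuming this is thunder's series reader and `sc` is a SparkContext:

import numpy as np

# ten length-5 records keyed by 1-tuples
rdd = sc.parallelize([((i,), np.arange(5, dtype='float64')) for i in range(10)])
series = fromrdd(rdd, nrecords=10, dtype='float64', ordered=True)
print(series.shape)   # (10, 5)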
Example #9
    def array(a, context=None, axis=(0,), dtype=None, npartitions=None):
        """
        Create a spark bolt array from a local array.

        Parameters
        ----------
        a : array-like
            An array, any object exposing the array interface, an
            object whose __array__ method returns an array, or any
            (nested) sequence.

        context : SparkContext
            A context running Spark. (see pyspark)

        axis : tuple, optional, default=(0,)
            Which axes to distribute the array along. The resulting
            distributed object will use keys to represent these axes,
            with the remaining axes represented by values.

        dtype : data-type, optional, default=None
            The desired data-type for the array. If None, will
            be determined from the data. (see numpy)

        npartitions : int
            Number of partitions for parallelization.

        Returns
        -------
        BoltArraySpark
        """
        if dtype is None:
            arry = asarray(a)
            dtype = arry.dtype
        else:
            arry = asarray(a, dtype)
        shape = arry.shape
        ndim = len(shape)

        # handle the axes specification and transpose if necessary
        axes = ConstructSpark._format_axes(axis, arry.shape)
        key_axes, value_axes = get_kv_axes(arry.shape, axes)
        permutation = key_axes + value_axes
        arry = arry.transpose(*permutation)
        split = len(axes)

        if split < 1:
            raise ValueError("split axis must be greater than 0, got %g" % split)
        if split > len(shape):
            raise ValueError("split axis must not exceed number of axes %g, got %g" % (ndim, split))

        key_shape = shape[:split]
        val_shape = shape[split:]

        keys = zip(*unravel_index(arange(0, int(prod(key_shape))), key_shape))
        vals = arry.reshape((prod(key_shape),) + val_shape)

        rdd = context.parallelize(zip(keys, vals), npartitions)
        return BoltArraySpark(rdd, shape=shape, split=split, dtype=dtype)
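
The top-level `bolt.array` factory dispatches here when a context is passed; a round-trip sketch (assuming a SparkContext `sc`):

import numpy as np
from bolt import array

x = np.arange(24).reshape(2, 3, 4)
barr = array(x, sc, axis=(0,), npartitions=2)
print(barr.shape, barr.split)   # (2, 3, 4) 1
assert np.array_equal(barr.toarray(), x)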
Example #10
def _2D_stackable_preamble(sc, num_partitions=2):

    dims = (10, 10)
    arr = vstack([[x] * dims[1] for x in arange(dims[0])])
    barr = array(arr, sc, axis=0)
    barr = BoltArraySpark(barr._rdd.partitionBy(num_partitions),
                          shape=barr.shape,
                          split=barr.split)
    return barr
Example #11
def fromrdd(rdd,
            dims=None,
            nrecords=None,
            dtype=None,
            labels=None,
            ordered=False):
    """
    Load images from a Spark RDD.

    Input RDD must be a collection of key-value pairs
    where keys are singleton tuples indexing images,
    and values are 2d or 3d ndarrays.

    Parameters
    ----------
    rdd : SparkRDD
        An RDD containing the images.

    dims : tuple or array, optional, default = None
        Image dimensions (if provided will avoid check).

    nrecords : int, optional, default = None
        Number of images (if provided will avoid check).

    dtype : string, default = None
        Data numerical type (if provided will avoid check).

    labels : array, optional, default = None
        Labels for records. If provided, should be one-dimensional.

    ordered : boolean, optional, default = False
        Whether or not the rdd is ordered by key.
    """
    from .images import Images
    from bolt.spark.array import BoltArraySpark

    if dims is None or dtype is None:
        item = rdd.values().first()
        dtype = item.dtype
        dims = item.shape

    if nrecords is None:
        nrecords = rdd.count()

    def process_keys(record):
        k, v = record
        if isinstance(k, int):
            k = (k, )
        return k, v

    values = BoltArraySpark(rdd.map(process_keys),
                            shape=(nrecords, ) + tuple(dims),
                            dtype=dtype,
                            split=1,
                            ordered=ordered)
    return Images(values, labels=labels)
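
A usage sketch, assuming this is thunder's images reader and `sc` is a SparkContext:

import numpy as np

# eight 64x64 images keyed by 1-tuples
rdd = sc.parallelize([((i,), np.zeros((64, 64))) for i in range(8)])
imgs = fromrdd(rdd, dims=(64, 64), nrecords=8, dtype='float64')
print(imgs.shape)   # (8, 64, 64)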
Example #12
def _3D_stackable_preamble(sc, num_partitions=2):

    dims = (10, 10, 10)
    area = dims[0] * dims[1]
    arr = asarray(
        [repeat(x, area).reshape(dims[0], dims[1]) for x in range(dims[2])])
    barr = array(arr, sc, axis=0)
    barr = BoltArraySpark(barr._rdd.partitionBy(num_partitions),
                          shape=barr.shape,
                          split=barr.split)
    return barr
Example #13
    def unstack(self):
        """
        Unstack array and return a new BoltArraySpark via flatMap().
        """
        from bolt.spark.array import BoltArraySpark

        if self._rekeyed:
            rdd = self._rdd
        else:
            rdd = self._rdd.flatMap(lambda kv: zip(kv[0], list(kv[1])))

        return BoltArraySpark(rdd, shape=self.shape, split=self.split)
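
A round-trip sketch through the public API (assuming a SparkContext `sc`); `stack` builds the StackedArray this method inverts:

import numpy as np
from bolt import array

barr = array(np.ones((8, 5)), sc, axis=(0,))
stacked = barr.stack(size=2)    # bundle pairs of records into 2x5 blocks
restored = stacked.unstack()    # back to one record per key
assert restored.toarray().shape == (8, 5)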
Example #14
    def _wrap(func, shape, context=None, axis=(0,), dtype=None, npartitions=None):
        """
        Wrap an existing numpy constructor in a parallelized construction.
        """
        if isinstance(shape, int):
            shape = (shape,)
        key_shape, value_shape = get_kv_shape(shape, ConstructSpark._format_axes(axis, shape))
        split = len(key_shape)

        # make the keys
        rdd = context.parallelize(list(product(*[arange(x) for x in key_shape])), npartitions)

        # use a map to make the arrays in parallel
        rdd = rdd.map(lambda x: (x, func(value_shape, dtype, order='C')))
        return BoltArraySpark(rdd, shape=shape, split=split, dtype=dtype)
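
This helper backs bolt's `ones` and `zeros` factories; a usage sketch (assuming a SparkContext `sc`):

from bolt import ones, zeros

a = ones((4, 3, 2), sc, axis=(0,))       # numpy.ones called inside the map, per key
print(a.shape, a.split)                  # (4, 3, 2) 1
b = zeros((4, 3, 2), sc, npartitions=2)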
Example #15
    def reshape(self, *shape):

        new = argpack(shape)
        old = self.shape
        isreshapeable(new, old)

        if new == old:
            return self._barray

        def f(v):
            return v.reshape(new)

        newrdd = self._barray._rdd.mapValues(f)
        newshape = self._barray.keys.shape + new

        return BoltArraySpark(newrdd,
                              shape=newshape).__finalize__(self._barray)
Example #16
    def transpose(self, *axes):

        new = argpack(axes)
        old = tuple(range(self.ndim))
        istransposeable(new, old)

        if new == old:
            return self._barray

        def f(v):
            return v.transpose(new)

        newrdd = self._barray._rdd.mapValues(f)
        newshape = self._barray.keys.shape + tuple(self.shape[i] for i in new)

        return BoltArraySpark(newrdd,
                              shape=newshape).__finalize__(self._barray)
Example #17
    def element_wise(self, other, op):
        """
        Apply an elementwise operation to data.

        Both self and other data must have the same mode.
        If self is in local mode, other can also be a numpy array.
        Self and other must have the same shape, or other must be a scalar.

        Parameters
        ----------
        other : Data or numpy array
            Data to apply elementwise operation to

        op : function
            Binary operator to use for elementwise operations, e.g. add, subtract
        """
        if not isscalar(other) and not self.shape == other.shape:
            raise ValueError("shapes %s and %s must be equal" %
                             (self.shape, other.shape))

        if not isscalar(other) and isinstance(
                other, Data) and not self.mode == other.mode:
            raise NotImplementedError

        if isscalar(other):
            return self.map(lambda x: op(x, other))

        if self.mode == 'local' and isinstance(other, ndarray):
            return self._constructor(op(self.values, other)).__finalize__(self)

        if self.mode == 'local' and isinstance(other, Data):
            return self._constructor(op(self.values,
                                        other.values)).__finalize__(self)

        if self.mode == 'spark' and isinstance(other, Data):

            def func(record):
                (k1, x), (k2, y) = record
                return k1, op(x, y)

            rdd = self.tordd().zip(other.tordd()).map(func)
            barray = BoltArraySpark(rdd,
                                    shape=self.shape,
                                    dtype=self.dtype,
                                    split=self.values.split)
            return self._constructor(barray).__finalize__(self)
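
A usage sketch, assuming this is thunder's Data base class (its arithmetic operators route through this method); equal partitioning of the two RDDs is assumed for the spark-mode zip:

import numpy as np
import thunder as td

a = td.series.fromarray(np.ones((4, 5)), engine=sc)      # spark mode
b = td.series.fromarray(2 * np.ones((4, 5)), engine=sc)
summed = a.element_wise(b, np.add)                       # equivalent to a + b
print(summed.toarray().sum())                            # 60.0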
Example #18
    def reshape(self, *shape):

        new = argpack(shape)
        old = self.shape
        isreshapeable(new, old)

        if new == old:
            return self._barray

        def f(k):
            return unravel_index(ravel_multi_index(k, old), new)

        newrdd = self._barray._rdd.map(lambda kv: (f(kv[0]), kv[1]))
        newsplit = len(new)
        newshape = new + self._barray.values.shape

        return BoltArraySpark(newrdd, shape=newshape, split=newsplit)
Example #19
    def transpose(self, *axes):

        new = argpack(axes)
        old = tuple(range(self.ndim))
        istransposeable(new, old)

        if new == old:
            return self._barray

        def f(k):
            return tuple(k[i] for i in new)

        newrdd = self._barray._rdd.map(lambda kv: (f(kv[0]), kv[1]))
        newshape = tuple(self.shape[i]
                         for i in new) + self._barray.values.shape

        return BoltArraySpark(newrdd,
                              shape=newshape).__finalize__(self._barray)
Example #20
File: chunk.py Project: kmader/bolt
    def map_generic(self, func):
        """
        Apply a generic array -> object to each subarray

        The resulting object is a BoltArraySpark of dtype object where the
        blocked dimensions are replaced with indices indication block ID.
        """
        def process_record(val):
            newval = empty(1, dtype="object")
            newval[0] = func(val)
            return newval

        rdd = self._rdd.mapValues(process_record)

        nchunks = self.getnumber(self.plan, self.vshape)
        newshape = tuple([int(s) for s in r_[self.kshape, nchunks]])
        newsplit = len(self.shape)
        return BoltArraySpark(rdd,
                              shape=newshape,
                              split=newsplit,
                              ordered=self._ordered,
                              dtype="object")
Example #21
    def move(self, kaxes=(), vaxes=()):
        """
        Move some of the dimensions in the values into the keys.

        Along with chunking, this is required as part of the swap method
        on the BoltArraySpark, see there for further details.

        Parameters
        ----------
        kaxes : tuple, optional, default=()
            Axes from keys to move to values.

        vaxes : tuple, optional, default=()
            Axes from values to move to keys.
        """
        kmask, vmask = self.kmask(kaxes), self.vmask(vaxes)
        
        # make sure chunking is done only on the moving dimensions 
        nchunks = asarray(self.getnumber(self.plan, self.vshape))
        if any(logical_and(nchunks != 1, ~vmask)):
            raise NotImplementedError("Chunking along non-swapped axes is not supported")

        def _move(record):
            k, chk, data = asarray(record[0]), asarray(record[1][0]), asarray(record[1][1])
            stationary_keys, moving_keys = tuple(k[~kmask]), tuple(k[kmask])
            moving_values, stationary_values = tuple(chk[vmask]), tuple(chk[~vmask])
            return (stationary_keys, moving_values), (moving_keys + stationary_values, data)

        switch = self.switch
        rdd = self._rdd.map(switch).map(_move)

        moving_kshape = tuple(self.kshape[kmask])

        def _rebuild(v):
            idx, data = zip(*v.data)
            valshape = data[0].shape
            fullshape = moving_kshape + valshape 
            sorted_idx = tuplesort(idx)
            return asarray(data)[sorted_idx].reshape(fullshape)

        rdd = rdd.groupByKey().mapValues(_rebuild)
            
        mask = [False for _ in moving_kshape]
        mask.extend([True if vmask[k] else False for k in range(len(vmask))])
        mask = asarray(mask)

        slices = [slice(0, i, 1) for i in moving_kshape]
        slices.extend([None if vmask[i] else slice(0, self.vshape[i], 1) for i in range(len(vmask))])
        slices = asarray(slices)

        sizes = self.plan

        def _extract(record):

            k, v = record[0], record[1]

            stationary_key, chunk = k[0], k[1]
            key_offsets = prod([asarray(chunk), asarray(sizes)[vmask]], axis=0)

            bounds = asarray(v.shape[len(kaxes):])[vmask]
            indices = list(product(*map(lambda x: arange(x), bounds)))

            for b in indices:
                s = slices.copy()
                s[mask] = b
                yield (tuple(asarray(r_[stationary_key, key_offsets + b], dtype='int')), v[tuple(s)])
            
        rdd = rdd.flatMap(_extract)
        split = self._split - len(kaxes) + len(vaxes)
        shape = tuple(r_[self.kshape[~kmask], self.vshape[vmask],
                         self.kshape[kmask], self.vshape[~vmask]].astype('int'))
        return BoltArraySpark(rdd, shape=shape, split=split)
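
`move` is an internal step of `swap` on BoltArraySpark; a sketch through the public method (assuming a SparkContext `sc`):

import numpy as np
from bolt import array

barr = array(np.ones((2, 3, 4)), sc, axis=(0,))  # keys (2,), values (3, 4)
swapped = barr.swap((0,), (0,))                  # exchange key axis 0 and value axis 0
print(swapped.shape)                             # (3, 2, 4)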
Example #22
    def unstack(self):
        from bolt.spark.array import BoltArraySpark
        return BoltArraySpark(self._rdd.flatMap(lambda kv: zip(kv[0], list(kv[1]))),
                              shape=self.shape, split=self.split)
Example #23
File: chunk.py Project: kmader/bolt
    def unchunk(self):
        """
        Convert a chunked array back into a full array with (key,value) pairs
        where key is a tuple of indices, and value is an ndarray.
        """
        plan, padding, vshape, split = self.plan, self.padding, self.vshape, self.split
        nchunks = self.getnumber(plan, vshape)
        full_shape = concatenate((nchunks, plan))
        n = len(vshape)
        perm = concatenate(list(zip(range(n), range(n, 2 * n))))

        if self.uniform:

            def _unchunk(it):
                ordered = sorted(it, key=lambda kv: kv[0][split:])
                keys, values = zip(*ordered)
                yield keys[0][:split], asarray(values).reshape(
                    full_shape).transpose(perm).reshape(vshape)
        else:

            def _unchunk(it):
                ordered = sorted(it, key=lambda kv: kv[0][split:])
                keys, values = zip(*ordered)
                k_chks = [k[split:] for k in keys]
                arr = empty(nchunks, dtype='object')
                for (i, d) in zip(k_chks, values):
                    arr[i] = d
                yield keys[0][:split], allstack(arr.tolist())

        # remove padding
        if self.padded:
            removepad = self.removepad
            rdd = self._rdd.map(lambda kv: (
                kv[0],
                removepad(
                    kv[0][split:], kv[1], nchunks, padding, axes=range(n))))
        else:
            rdd = self._rdd

        # skip partitionBy if there is not actually any chunking
        if array_equal(self.plan, self.vshape):
            rdd = rdd.map(lambda kv: (kv[0][:split], kv[1]))
            ordered = self._ordered
        else:
            ranges = self.kshape
            npartitions = int(prod(ranges))
            if len(self.kshape) == 0:
                partitioner = lambda k: 0
            else:
                partitioner = lambda k: ravel_multi_index(k[:split], ranges)
            rdd = rdd.partitionBy(
                numPartitions=npartitions,
                partitionFunc=partitioner).mapPartitions(_unchunk)
            ordered = True

        if array_equal(self.vshape, [1]):
            rdd = rdd.mapValues(lambda v: squeeze(v))
            newshape = self.shape[:-1]
        else:
            newshape = self.shape

        return BoltArraySpark(rdd,
                              shape=newshape,
                              split=self._split,
                              dtype=self.dtype,
                              ordered=ordered)