Beispiel #1
0
    def map(self, func):
        """
        Apply a function on each subarray.

        Before touching the full RDD, the function is evaluated on two
        small test arrays built from the first value (one holding a single
        copy, one holding two copies) so that its effect on the shape can
        be inferred.

        Parameters
        ----------
        func : function
             This is applied to each value in the intermediate RDD.

        Returns
        -------
        StackedArray

        Raises
        ------
        RuntimeError
            If the function fails on either test array.
        ValueError
            If the function does not return an ndarray, or if its effect
            on the shape cannot be inferred from the test arrays.
        """
        value_shape = self.shape[self.split:]
        sample = self._rdd.values().first()

        # two probes that differ only in their length along the first dimension
        if sample.shape == value_shape:
            single = asarray([sample])
            double = asarray([sample, sample])
        else:
            single = sample
            double = concatenate((sample, sample))

        try:
            out_single = func(single)
            out_double = func(double)
        except Exception as e:
            raise RuntimeError(
                "Error evaluating function on test array, got error:\n %s" % e)

        if not isinstance(out_single, ndarray) or not isinstance(out_double, ndarray):
            raise ValueError("Function must return ndarray")

        def on_value(kv):
            # keep the key, transform the value
            return kv[0], func(kv[1])

        if out_single.shape == out_double.shape:
            # both probe lengths give the same output shape, so every record
            # ends up with that shape and records are indexed by one linear key
            if self._rekeyed is True:
                # keys were already linearized by an earlier call
                rdd = self._rdd.map(on_value)
                leading = self.shape[0]
            else:
                # replace the existing keys with a running index
                count, indexed = zip_with_index(self._rdd.values())
                rdd = indexed.map(lambda kv: ((kv[1], ), func(kv[0])))
                leading = count
            shape = (leading, ) + out_single.shape
            split, rekeyed = 1, True

        elif (out_single.shape[0] == single.shape[0]
              and out_double.shape[0] == double.shape[0]):
            # the first dimension of the output follows the input, so the
            # existing keys and split remain valid
            shape = self.shape[0:self.split] + out_single.shape[1:]
            split = self.split
            rdd = self._rdd.map(on_value)
            rekeyed = self._rekeyed

        else:
            raise ValueError("Cannot infer effect of function on shape")

        result = self._constructor(rdd,
                                   rekeyed=rekeyed,
                                   shape=shape,
                                   split=split)
        return result.__finalize__(self)
Beispiel #2
0
    def map(self, func):
        """
        Apply a function on each subarray.

        The function is tried on two test arrays derived from the first
        value in the RDD (one copy and two copies of it); comparing the
        two results determines the shape, split and keying of the output.

        Parameters
        ----------
        func : function
             This is applied to each value in the intermediate RDD.

        Returns
        -------
        StackedArray

        Raises
        ------
        RuntimeError
            If evaluating the function on a test array raises.
        ValueError
            If a test result is not an ndarray, or the effect of the
            function on the shape cannot be inferred.
        """
        expected = self.shape[self.split:]
        first = self._rdd.values().first()

        # probe inputs: one block and two blocks along the first dimension
        if first.shape == expected:
            one, two = asarray([first]), asarray([first, first])
        else:
            one, two = first, concatenate((first, first))

        try:
            res_one = func(one)
            res_two = func(two)
        except Exception as e:
            raise RuntimeError("Error evaluating function on test array, got error:\n %s" % e)

        both_arrays = isinstance(res_one, ndarray) and isinstance(res_two, ndarray)
        if not both_arrays:
            raise ValueError("Function must return ndarray")

        if res_one.shape != res_two.shape:
            # output shape varies with input length: it must vary along the
            # first dimension exactly as the input does
            follows_input = (res_one.shape[0] == one.shape[0]
                             and res_two.shape[0] == two.shape[0])
            if not follows_input:
                raise ValueError("Cannot infer effect of function on shape")
            shape = self.shape[0:self.split] + res_one.shape[1:]
            split = self.split
            rdd = self._rdd.map(lambda kv: (kv[0], func(kv[1])))
            rekeyed = self._rekeyed
        else:
            # output shape is independent of input length
            if self._rekeyed is True:
                # keys are already a linear index; keep them
                rdd = self._rdd.map(lambda kv: (kv[0], func(kv[1])))
                n_records = self.shape[0]
            else:
                # discard the old keys and number the records instead
                n_records, numbered = zip_with_index(self._rdd.values())
                rdd = numbered.map(lambda kv: ((kv[1],), func(kv[0])))
            shape = (n_records,) + res_one.shape
            split = 1
            rekeyed = True

        mapped = self._constructor(rdd, rekeyed=rekeyed, shape=shape, split=split)
        return mapped.__finalize__(self)
Beispiel #3
0
    def filter(self, func, axis=(0, ), sort=False):
        """
        Filter array along an axis.

        The array is first aligned so that the requested axes are in the
        keys (which may incur a swap). The function is then called on the
        value of each record and only records for which it evaluates to
        true are kept. Surviving records are re-keyed with a running index.

        Parameters
        ----------
        func : function
            Function to apply, should return boolean

        axis : tuple or int, optional, default=(0,)
            Axis or multiple axes to filter along.

        sort: bool, optional, default=False
            Whether or not to sort by key before reindexing

        Returns
        -------
        BoltArraySpark
        """
        axis = tupleize(axis)
        aligned = self._align(axis)

        # records are (key, value) pairs; the predicate only sees the value
        kept = aligned._rdd.filter(lambda record: func(record[1]))
        values = kept.sortByKey().values() if sort else kept.values()

        # number the surviving values so they can be re-keyed linearly
        count, numbered = zip_with_index(values)
        if not count:
            # no usable count came back from the helper; count explicitly
            count = numbered.count()
        reindexed = numbered.map(lambda kv: (tupleize(kv[1]), kv[0]))

        # dimensions after the first len(axis) are carried over unchanged
        trailing = tuple(aligned.shape[len(axis):])
        shape = (count, ) + trailing if count != 0 else (0, )

        filtered = self._constructor(reindexed, shape=shape,
                                     split=aligned.split)
        return filtered.__finalize__(aligned)
Beispiel #4
0
    def filter(self, func, axis=(0,), sort=False):
        """
        Filter array along an axis.

        Aligns the array so the requested axes are in the keys (which may
        incur a swap), keeps only the records whose value makes the
        function evaluate to true, and re-keys the survivors with a
        running index.

        Parameters
        ----------
        func : function
            Function to apply, should return boolean

        axis : tuple or int, optional, default=(0,)
            Axis or multiple axes to filter along.

        sort: bool, optional, default=False
            Whether or not to sort by key before reindexing

        Returns
        -------
        BoltArraySpark
        """
        axis = tupleize(axis)
        swapped = self._align(axis)

        def keep(record):
            # record is a (key, value) pair; test the value only
            return func(record[1])

        survivors = swapped._rdd.filter(keep)
        if sort:
            survivors = survivors.sortByKey()
        values = survivors.values()

        # attach a running index to each surviving value
        total, indexed = zip_with_index(values)
        if not total:
            # fall back to an explicit count when none was reported
            total = indexed.count()
        reindexed = indexed.map(lambda kv: (tupleize(kv[1]), kv[0]))

        # the leading len(axis) dimensions are replaced by the new count
        tail = list(swapped.shape[len(axis):])
        if total == 0:
            shape = (0,)
        else:
            shape = tuple([total] + tail)

        out = self._constructor(reindexed, shape=shape, split=swapped.split)
        return out.__finalize__(swapped)
Beispiel #5
0
    def filter(self, func, axis=(0, )):
        """
        Filter array along an axis.

        Applies a function which should evaluate to boolean along a
        single axis. Array will be aligned so that the desired axis is
        in the keys, which may incur a swap. Records that pass are
        re-keyed with a running index.

        Parameters
        ----------
        func : function
            Function to apply, should return boolean

        axis : tuple or int, optional, default=(0,)
            Axis to filter along. Only a single axis is accepted.

        Returns
        -------
        BoltArraySpark

        Raises
        ------
        NotImplementedError
            If more than one axis is given.
        """
        axis = tupleize(axis)
        if len(axis) != 1:
            raise NotImplementedError(
                "Filtering over multiple axes will not be "
                "supported until SparseBoltArray is implemented.")

        aligned = self._align(axis)
        passing = aligned._rdd.values().filter(func)

        # number the passing values; the index becomes the new key
        count, numbered = zip_with_index(passing)
        if not count:
            # no usable count came back from the helper; count explicitly
            count = numbered.count()
        reindexed = numbered.map(lambda kv: (kv[1], kv[0]))

        # everything after the first dimension is carried over unchanged
        trailing = tuple(aligned.shape[1:])
        shape = (count, ) + trailing if count != 0 else (0, )

        filtered = self._constructor(reindexed, shape=shape,
                                     split=aligned.split)
        return filtered.__finalize__(aligned)
Beispiel #6
0
    def filter(self, func, axis=(0, )):
        """
        Filter array along an axis.

        Applies a function which should evaluate to boolean along a
        single axis. The array is aligned on that axis first, the
        function is applied to each value, and the values that pass are
        re-keyed with a running index.

        Parameters
        ----------
        func : function
            Function to apply, should return boolean

        axis : tuple or int, optional, default=(0,)
            Axis to filter along. Only a single axis is accepted.

        Returns
        -------
        BoltSparkArray

        Raises
        ------
        NotImplementedError
            If more than one axis is given.
        """
        axis = tupleize(axis)
        if len(axis) != 1:
            raise NotImplementedError(
                "Filtering over multiple axes will not be "
                "supported until SparseBoltArray is implemented.")

        swapped = self._align(axis)
        rdd = swapped._rdd.values().filter(func)

        # count the resulting array in order to reindex (linearize) the keys
        count, zipped = zip_with_index(rdd)
        if not count:
            count = zipped.count()
        reindexed = zipped.map(lambda kv: (kv[1], kv[0]))

        # The shape must be read relative to the aligned array: the filtered
        # axis is now the leading dimension of `swapped`, so the surviving
        # dimensions are everything after it. Indexing `swapped.shape` with
        # the original axis number dropped the wrong dimension whenever
        # axis != (0,).
        remaining = list(swapped.shape[1:])
        if count != 0:
            shape = tuple([count] + remaining)
        else:
            shape = (0, )

        return self._constructor(reindexed, shape=shape,
                                 split=swapped.split).__finalize__(swapped)
Beispiel #7
0
    def filter(self, func, axis=(0,)):
        """
        Filter array along an axis.

        Applies a function which should evaluate to boolean along a
        single axis. Array will be aligned so that the desired axis is
        in the keys, which may incur a swap. Values for which the
        function is true are kept and re-keyed with a running index.

        Parameters
        ----------
        func : function
            Function to apply, should return boolean

        axis : tuple or int, optional, default=(0,)
            Axis to filter along. Only a single axis is accepted.

        Returns
        -------
        BoltArraySpark

        Raises
        ------
        NotImplementedError
            If more than one axis is given.
        """
        axis = tupleize(axis)
        single_axis = len(axis) == 1
        if not single_axis:
            raise NotImplementedError("Filtering over multiple axes will not be "
                                      "supported until SparseBoltArray is implemented.")

        swapped = self._align(axis)
        values = swapped._rdd.values()
        survivors = values.filter(func)

        # attach a running index to each surviving value and use it as the key
        total, indexed = zip_with_index(survivors)
        if not total:
            # fall back to an explicit count when none was reported
            total = indexed.count()
        reindexed = indexed.map(lambda kv: (kv[1], kv[0]))

        # the leading dimension is replaced by the number of survivors
        tail = list(swapped.shape[1:])
        if total == 0:
            shape = (0,)
        else:
            shape = tuple([total] + tail)

        out = self._constructor(reindexed, shape=shape, split=swapped.split)
        return out.__finalize__(swapped)
Beispiel #8
0
    def filter(self, func, axis=(0,)):
        """
        Filter array along an axis.

        Applies a function which should evaluate to boolean along a
        single axis. The array is aligned on that axis first, the
        function is applied to each value, and the values that pass are
        re-keyed with a running index.

        Parameters
        ----------
        func : function
            Function to apply, should return boolean

        axis : tuple or int, optional, default=(0,)
            Axis to filter along. Only a single axis is accepted.

        Returns
        -------
        BoltSparkArray

        Raises
        ------
        NotImplementedError
            If more than one axis is given.
        """
        axis = tupleize(axis)
        if len(axis) != 1:
            raise NotImplementedError("Filtering over multiple axes will not be "
                                      "supported until SparseBoltArray is implemented.")

        swapped = self._align(axis)
        rdd = swapped._rdd.values().filter(func)

        # count the resulting array in order to reindex (linearize) the keys
        count, zipped = zip_with_index(rdd)
        if not count:
            count = zipped.count()
        reindexed = zipped.map(lambda kv: (kv[1], kv[0]))

        # After alignment the filtered axis is the leading dimension of
        # `swapped`, so the remaining dimensions are simply the ones that
        # follow it. Looking up the original axis number in `swapped.shape`
        # removed the wrong dimension for any axis other than 0.
        remaining = list(swapped.shape[1:])
        if count != 0:
            shape = tuple([count] + remaining)
        else:
            shape = (0,)

        return self._constructor(reindexed, shape=shape, split=swapped.split).__finalize__(swapped)