Example #1
    def test_fancy_assignment(self):
        a = np.zeros((4, 4, 4), 'd')
        twoByTwo = np.ones((2, 2), 'd')

        #Notes from the commit message motivating why we need this:
        # a = np.zeros((3,3,3))
        # a[:,1:2,1:3].shape == (3,1,2) # good!
        # a[0,:,1:3].shape == (3,2) #good!
        # a[0,:,[1,2]].shape == (2,3) # ?? (broadcasting ':' makes this like a[0,[1,2]])
        # a[:,[1,2],[1,2]].shape == (3,2) # ?? not (3,2,2) b/c lists broadcast
        # a[:,[1],[1,2]].shape == (3,2) # ?? not (3,1,2) b/c lists broadcast
        # a[:,[1,2],[0,1,2]].shape == ERROR b/c [1,2] can't broadcast to [0,1,2]!

        #simple integer indices
        mt._fas(a, (0, 0, 0), 4.5)  # a[0,0,0] = 4.5
        self.assertAlmostEqual(a[0, 0, 0], 4.5)

        mt._fas(a, (0, 0, 0), 4.5, add=True)  # a[0,0,0] += 4.5
        self.assertAlmostEqual(a[0, 0, 0], 9.0)

        #still simple: mix of slices and integers
        mt._fas(a, (slice(0, 2), slice(0, 2), 0),
                twoByTwo)  # a[0:2,0:2,0] = twoByTwo
        self.assertArraysAlmostEqual(a[0:2, 0:2, 0], twoByTwo)

        #complex case: some/all indices are integer arrays
        mt._fas(
            a, ([0, 1], [0, 1], 0), twoByTwo[:, :]
        )  # a[0:2,0:2,0] = twoByTwo - but a[[0,1],[0,1],0] wouldn't do this!
        self.assertArraysAlmostEqual(a[0:2, 0:2, 0], twoByTwo)

        mt._fas(
            a, ([0, 1], [0, 1], 0), twoByTwo[:, :], add=True
        )  # a[0:2,0:2,0] += twoByTwo - but a[[0,1],[0,1],0] wouldn't do this!
        self.assertArraysAlmostEqual(a[0:2, 0:2, 0], 2 * twoByTwo)

        # Fancy indexing (without assignment)
        self.assertEqual(mt._findx(a, (0, 0, 0)).shape, ())  # all-integer index -> scalar, shape ()
        self.assertEqual(
            mt._findx(a, (slice(0, 2), slice(0, 2), slice(0, 2))).shape,
            (2, 2, 2))
        self.assertEqual(
            mt._findx(a, (slice(0, 2), slice(0, 2), 0)).shape, (2, 2))
        self.assertEqual(mt._findx(a, ([0, 1], [0, 1], 0)).shape, (2, 2))
        self.assertEqual(mt._findx(a, ([], [0, 1], 0)).shape, (0, 2))
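
A quick standalone check (plain numpy, nothing pyGSTi-specific) of the broadcasting
behavior described in the commit-message notes above:

import numpy as np

a = np.zeros((3, 3, 3))
assert a[:, 1:2, 1:3].shape == (3, 1, 2)     # basic slicing keeps dims in place
assert a[0, :, [1, 2]].shape == (2, 3)       # list dims jump to the front when split by ':'
assert a[:, [1, 2], [1, 2]].shape == (3, 2)  # adjacent lists broadcast together
assert a[:, [1], [1, 2]].shape == (3, 2)     # [1] broadcasts against [1, 2]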
Example #2
def gather_slices_by_owner(current_slices, ar_to_fill, ar_to_fill_inds,
                           axes, comm, max_buffer_size=None):
    """
    Gathers data within a numpy array, `ar_to_fill`, according to given slices.

    Upon entry it is assumed that the different processors within `comm` have
    computed different parts of `ar_to_fill`, namely different slices of the
    axes indexed by `axes`. At exit, data has been gathered such that all processors
    have the results for the entire `ar_to_fill` (or at least for all the slices
    given).

    Parameters
    ----------
    current_slices : list
        A list of all the slices computed by the *current* processor.
        Each element of `current_slices` may be either a single slice or a
        tuple of slices (when gathering across multiple dimensions).

    ar_to_fill : numpy.ndarray
        The array which contains partial data upon entry and the gathered
        data upon exit.

    ar_to_fill_inds : list
        A list of slice or index-arrays specifying the (fixed) sub-array of
        `ar_to_fill` that should be gathered into.  The elements of
        `ar_to_fill_inds` are taken to be indices for the leading dimension
        first, and any unspecified dimensions or `None` elements are
        assumed to be unrestricted (as if `slice(None,None)`).  Note that
        the combination of `ar_to_fill` and `ar_to_fill_inds` is essentially like
        passing `ar_to_fill[ar_to_fill_inds]` to this function, except it will
        work with index arrays as well as slices.

    axes : int or tuple of ints
        The axis or axes of `ar_to_fill` to which the slices apply (which
        axes do the slices in `current_slices` refer to?).  Note that
        `len(axes)` must equal the number of slices (i.e. the tuple length)
        of each element of `current_slices`.

    comm : mpi4py.MPI.Comm or None
        The communicator specifying the processors involved and used
        to perform the gather operation.

    max_buffer_size : int or None
        The maximum buffer size in bytes that is allowed to be used
        for gathering data.  If None, there is no limit.

    Returns
    -------
    None
    """

    #Note: same beginning as gather_slices (TODO: consolidate?)
    if comm is None: return  # no gathering needed!

    #Perform broadcasts for each slice in order
    my_rank = comm.Get_rank()
    arIndx = [slice(None, None)] * ar_to_fill.ndim
    arIndx[0:len(ar_to_fill_inds)] = ar_to_fill_inds

    axes = (axes,) if _compat.isint(axes) else axes

    max_indices = [None] * len(axes)
    if max_buffer_size is not None:  # a buffer-size limit was given
        chunkBytes = ar_to_fill.nbytes  # start with the entire array as the "chunk"
        for iaxis, axis in enumerate(axes):
            # Consider restricting the chunk size along the iaxis-th axis.
            #  If we can achieve the desired max_buffer_size by restricting
            #  just along this axis, great.  Otherwise, restrict to at most
            #  1 index along this axis and keep going.
            bytes_per_index = chunkBytes / ar_to_fill.shape[axis]
            max_inds = int(max_buffer_size / bytes_per_index)
            if max_inds == 0:
                max_indices[iaxis] = 1
                chunkBytes /= ar_to_fill.shape[axis]
            else:
                max_indices[iaxis] = max_inds
                break
        else:
            _warnings.warn("gather_slices_by_owner: Could not achieve max_buffer_size")
    # -- end part that is the same as gather_slices
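
    # Worked example with hypothetical numbers: for a (1000, 1000) float64 array
    # (8e6 bytes), axes=(0,) and max_buffer_size=1e6 give bytes_per_index =
    # 8e6 / 1000 = 8000 and max_indices[0] = int(1e6 / 8000) = 125, so each
    # broadcast below moves at most 125 rows (~1MB) at a time.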

    #Get a list of the slices to broadcast, indexed by the rank of the owner proc
    slices_by_owner = comm.allgather(current_slices)
    for owner, slices in enumerate(slices_by_owner):
        for slcOrSlcTup in slices:
            slcTup = (slcOrSlcTup,) if isinstance(slcOrSlcTup, slice) else slcOrSlcTup
            assert(len(slcTup) == len(axes))

            #Get a list of the (sub-)slices along each axis whose product
            # (over the specified axes) gives the entire block given by slcTup
            axisSlices = []
            for iaxis, axis in enumerate(axes):
                slc = slcTup[iaxis]
                if max_indices[iaxis] is None or max_indices[iaxis] >= _slct.length(slc):
                    axisSlices.append([slc])  # this whole slice fits in the buffer
                else:
                    axisSlices.append(_slct.divide(slc, max_indices[iaxis]))

            for axSlcs in _itertools.product(*axisSlices):
                #create arIndx from per-axis (sub-)slices and broadcast
                for iaxis, axis in enumerate(axes):
                    arIndx[axis] = axSlcs[iaxis]

                #broadcast arIndx slice
                buf = _findx(ar_to_fill, arIndx, True) if (my_rank == owner) \
                    else _np.empty(_findx_shape(ar_to_fill, arIndx), ar_to_fill.dtype)
                comm.Bcast(buf, root=owner)
                if my_rank != owner: _fas(ar_to_fill, arIndx, buf)
                buf = None  # free buffer mem asap
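
A hedged usage sketch (assuming exactly two MPI ranks, e.g. `mpiexec -n 2`; the
array and variable names below are illustrative, not part of the library API):
each rank fills only the row-slice it "owns", then gathers everyone else's rows.

from mpi4py import MPI
import numpy as np

comm = MPI.COMM_WORLD
ar = np.zeros((4, 6))
my_slice = slice(0, 2) if comm.Get_rank() == 0 else slice(2, 4)
ar[my_slice, :] = comm.Get_rank() + 1.0  # stand-in for locally computed data

# each rank passes *its own* slices; ownership is implied by the allgather inside
gather_slices_by_owner([my_slice], ar, [], axes=0, comm=comm)
assert ar[0, 0] == 1.0 and ar[3, 0] == 2.0  # all ranks now hold the full array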
Example #3
def gather_indices(indices, index_owners, ar_to_fill, ar_to_fill_inds,
                   axes, comm, max_buffer_size=None):
    """
    Gathers data within a numpy array, `ar_to_fill`, according to given indices.

    Upon entry it is assumed that the different processors within `comm` have
    computed different parts of `ar_to_fill`, namely different slices or
    index-arrays of the `axis`-th axis.  At exit, data has been gathered such
    that all processors have the results for the entire `ar_to_fill` (or at least
    for all the indices given).

    Parameters
    ----------
    indices : list
        A list of all the integer-arrays or slices (computed by *any* of
        the processors, not just the current one).  Each element of `indices`
        may be either a single slice/index-array or a tuple of such
        elements (when gathering across multiple dimensions).

    index_owners : dict
        A dictionary mapping the position of an element within `indices` to
        the integer rank of the processor responsible for communicating that
        slice/index-array's data to the rest of the processors.

    ar_to_fill : numpy.ndarray
        The array which contains partial data upon entry and the gathered
        data upon exit.

    ar_to_fill_inds : list
        A list of slice or index-arrays specifying the (fixed) sub-array of
        `ar_to_fill` that should be gathered into.  The elements of
        `ar_to_fill_inds` are taken to be indices for the leading dimension
        first, and any unspecified dimensions or `None` elements are
        assumed to be unrestricted (as if `slice(None,None)`).  Note that
        the combination of `ar_to_fill` and `ar_to_fill_inds` is essentially like
        passing `ar_to_fill[ar_to_fill_inds]` to this function, except it will
        work with index arrays as well as slices.

    axes : int or tuple of ints
        The axis or axes of `ar_to_fill` to which the indices apply (which
        axes do the elements of `indices` refer to?).  Note that `len(axes)`
        must equal the number of sub-indices (i.e. the tuple length) of each
        element of `indices`.

    comm : mpi4py.MPI.Comm or None
        The communicator specifying the processors involved and used
        to perform the gather operation.

    max_buffer_size : int or None
        The maximum buffer size in bytes that is allowed to be used
        for gathering data.  If None, there is no limit.

    Returns
    -------
    None
    """
    if comm is None: return  # no gathering needed!

    #Perform broadcasts for each slice in order
    my_rank = comm.Get_rank()
    arIndx = [slice(None, None)] * ar_to_fill.ndim
    arIndx[0:len(ar_to_fill_inds)] = ar_to_fill_inds

    axes = (axes,) if _compat.isint(axes) else axes

    max_indices = [None] * len(axes)
    if max_buffer_size is not None:  # a buffer-size limit was given
        chunkBytes = ar_to_fill.nbytes  # start with the entire array as the "chunk"
        for iaxis, axis in enumerate(axes):
            # Consider restricting the chunk size along the iaxis-th axis.
            #  If we can achieve the desired max_buffer_size by restricting
            #  just along this axis, great.  Otherwise, restrict to at most
            #  1 index along this axis and keep going.
            bytes_per_index = chunkBytes / ar_to_fill.shape[axis]
            max_inds = int(max_buffer_size / bytes_per_index)
            if max_inds == 0:
                max_indices[iaxis] = 1
                chunkBytes /= ar_to_fill.shape[axis]
            else:
                max_indices[iaxis] = max_inds
                break
        else:
            _warnings.warn("gather_indices: Could not achieve max_buffer_size")

    for iIndex, indOrIndTup in enumerate(indices):
        owner = index_owners[iIndex]  # owner's rank
        indTup = (indOrIndTup,) if not isinstance(indOrIndTup, tuple) else indOrIndTup
        assert(len(indTup) == len(axes))

        def to_slice_list(index_array_or_slice):
            """Breaks a slice or index array into a list of slices"""
            if isinstance(index_array_or_slice, slice):
                return [index_array_or_slice]  # easy!

            lst = index_array_or_slice
            if len(lst) == 0: return [slice(0, 0)]

            slc_lst = []
            i = 0; N = len(lst)
            while i < N:
                start = lst[i]
                step = lst[i + 1] - lst[i] if i + 1 < N else None
                while i + 1 < N and lst[i + 1] - lst[i] == step: i += 1
                stop = lst[i] + 1
                slc_lst.append(slice(start, stop, None if step == 1 else step))
                i += 1

            return slc_lst
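        # e.g. to_slice_list([0, 1, 2, 5, 7, 9]) == [slice(0, 3, None), slice(5, 10, 2)]
        #  (runs with a constant stride collapse into a single strided slice each)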

        #Get a list of the (sub-)indices along each axis whose product
        # (over the specified axes) gives the entire block given by indTup
        axisSlices = []
        for iaxis, axis in enumerate(axes):
            ind = indTup[iaxis]
            sub_slices = []

            #break `ind`, which may be either a single slice or an index array,
            # into a list of slices that are broadcast one at a time (sometimes
            # these `ind_slice` slices themselves need to be broken up further
            # to obey max_buffer_size).
            for islice in to_slice_list(ind):
                if max_indices[iaxis] is None or max_indices[iaxis] >= _slct.length(islice):
                    sub_slices.append(islice)  # this whole (sub-)slice fits in the buffer
                else:
                    sub_slices.extend(_slct.divide(islice, max_indices[iaxis]))
            axisSlices.append(sub_slices)

        for axSlcs in _itertools.product(*axisSlices):
            #create arIndx from per-axis (sub-)slices and broadcast
            for iaxis, axis in enumerate(axes):
                arIndx[axis] = axSlcs[iaxis]

            #broadcast arIndx slice
            buf = _findx(ar_to_fill, arIndx, True) if (my_rank == owner) \
                else _np.empty(_findx_shape(ar_to_fill, arIndx), ar_to_fill.dtype)
            comm.Bcast(buf, root=owner)
            if my_rank != owner: _fas(ar_to_fill, arIndx, buf)
            buf = None  # free buffer mem asap
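
Note the calling-convention difference from `gather_slices_by_owner` above: here
every rank passes the *same* global `indices` list plus an `index_owners` map, and
index arrays are allowed alongside slices.  A hedged sketch with hypothetical
two-rank values:

# indices      = [np.array([0, 2]), np.array([1, 3])]  # all ranks' index-arrays
# index_owners = {0: 0, 1: 1}                          # list position -> owner rank
# gather_indices(indices, index_owners, ar, [], axes=(0,), comm=comm)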
Example #4
def gather_slices(slices, slice_owners, ar_to_fill,
                  ar_to_fill_inds, axes, comm, max_buffer_size=None):
    """
    Gathers data within a numpy array, `ar_to_fill`, according to given slices.

    Upon entry it is assumed that the different processors within `comm` have
    computed different parts of `ar_to_fill`, namely different slices of the
    `axis`-th axis.  At exit, data has been gathered such that all processors
    have the results for the entire `ar_to_fill` (or at least for all the slices
    given).

    Parameters
    ----------
    slices : list
        A list of all the slices (computed by *any* of the processors, not
        just the current one).  Each element of `slices` may be either a
        single slice or a tuple of slices (when gathering across multiple
        dimensions).

    slice_owners : dict
        A dictionary mapping the index of a slice (or tuple of slices)
        within `slices` to an integer rank of the processor responsible
        for communicating that slice's data to the rest of the processors.

    ar_to_fill : numpy.ndarray
        The array which contains partial data upon entry and the gathered
        data upon exit.

    ar_to_fill_inds : list
        A list of slice or index-arrays specifying the (fixed) sub-array of
        `ar_to_fill` that should be gathered into.  The elements of
        `ar_to_fill_inds` are taken to be indices for the leading dimension
        first, and any unspecified dimensions or `None` elements are
        assumed to be unrestricted (as if `slice(None,None)`).  Note that
        the combination of `ar_to_fill` and `ar_to_fill_inds` is essentially like
        passing `ar_to_fill[ar_to_fill_inds]` to this function, except it will
        work with index arrays as well as slices.

    axes : int or tuple of ints
        The axis or axes of `ar_to_fill` to which the slices apply (which
        axes do the slices in `slices` refer to?).  Note that `len(axes)`
        must equal the number of slices (i.e. the tuple length) of each
        element of `slices`.

    comm : mpi4py.MPI.Comm or ResourceAllocation or None
        The communicator specifying the processors involved and used
        to perform the gather operation.  If a :class:`ResourceAllocation`
        is provided, then inter-host communication is used when available
        to facilitate use of shared intra-host memory.

    max_buffer_size : int or None
        The maximum buffer size in bytes that is allowed to be used
        for gathering data.  If None, there is no limit.

    Returns
    -------
    None
    """
    from ..baseobjs.resourceallocation import ResourceAllocation as _ResourceAllocation
    if isinstance(comm, _ResourceAllocation):
        ralloc = comm
        comm = ralloc.comm

        #For use with shared intra-host (intra-node) memory:
        # my_interhost_ranks = ranks of comm, 1 per host, that this processor uses to send/receive data between hosts
        # broadcast_comm = the comm of my_interhost_ranks used to send/receive data.
        if ralloc.interhost_ranks is not None:
            my_interhost_ranks = set(ralloc.interhost_ranks)
            broadcast_rank_map = {comm_rank: broadcast_comm_rank
                                  for broadcast_comm_rank, comm_rank in enumerate(ralloc.interhost_ranks)}
            broadcast_comm = ralloc.interhost_comm
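            # e.g. (hypothetical values) 2 hosts x 4 ranks with interhost_ranks == (0, 4)
            # gives broadcast_rank_map == {0: 0, 4: 1} (comm rank -> interhost-comm rank)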
        else:
            my_interhost_ranks = None
            broadcast_rank_map = {i: i for i in range(comm.Get_size())} if (comm is not None) else {0: 0}  # trivial map
            broadcast_comm = comm
    else:
        ralloc = None
        my_interhost_ranks = None
        broadcast_rank_map = {i: i for i in range(comm.Get_size())} if (comm is not None) else {0: 0}  # trivial map
        broadcast_comm = comm

    if comm is None: return  # no gathering needed!

    # To be safe: using broadcast_comm below means ranks don't all synchronize here,
    # so a front-running rank could read values from a shared ar_to_fill that another
    # rank is still updating.  Barrier first so no rank runs ahead.
    comm.barrier()

    #Perform broadcasts for each slice in order
    my_rank = comm.Get_rank()

    axes = (axes,) if _compat.isint(axes) else axes

    #print("DB: Rank %d (%d): BEGIN GATHER SLICES: interhost=%s, group=%s" %
    #      (my_rank, broadcast_comm.rank, str(my_interhost_ranks), str(broadcast_comm.Get_group())))

    # # if ar_to_fill_inds only contains slices (or is empty), then we can slice ar_to_fill once up front
    # # and not use generic arIndx in loop below (slower, especially with lots of procs)
    # if all([isinstance(indx, slice) for indx in ar_to_fill_inds]):
    #     ar_to_fill = ar_to_fill[tuple(ar_to_fill_inds)]  # Note: this *doesn't* reduce its .ndim
    #     ar_to_fill_inds = ()  # now ar_to_fill requires no further indexing

    arIndx = [slice(None, None)] * ar_to_fill.ndim
    arIndx[0:len(ar_to_fill_inds)] = ar_to_fill_inds
    max_indices = [None] * len(axes)
    if max_buffer_size is not None:  # a buffer-size limit was given
        chunkBytes = ar_to_fill.nbytes  # start with the entire array as the "chunk"
        for iaxis, axis in enumerate(axes):
            # Consider restricting the chunk size along the iaxis-th axis.
            #  If we can achieve the desired max_buffer_size by restricting
            #  just along this axis, great.  Otherwise, restrict to at most
            #  1 index along this axis and keep going.
            bytes_per_index = chunkBytes / ar_to_fill.shape[axis]
            max_inds = int(max_buffer_size / bytes_per_index)
            if max_inds == 0:
                max_indices[iaxis] = 1
                chunkBytes /= ar_to_fill.shape[axis]
            else:
                max_indices[iaxis] = max_inds
                break
        else:
            _warnings.warn("gather_slices: Could not achieve max_buffer_size")

    # NOTE: Tried doing something faster (Allgatherv) when slices elements are simple slices (not tuples of slices).
    # This ultimately showed that our repeated use of Bcast isn't any slower than fewer calls to Allgatherv,
    # and since the Allgatherv case complicates the code and ignores the memory limit, it's best to just drop it.

    # Broadcast slices one-by-one (slower, but more general):
    for iSlice, slcOrSlcTup in enumerate(slices):
        owner = slice_owners[iSlice]  # owner's rank
        if my_interhost_ranks is not None and owner not in my_interhost_ranks:
            # if the "source" (owner) of the data isn't a part of my "circle" of ranks, then we
            # don't need to send or receive this data - other ranks on the same hosts will do it.
            continue

        slcTup = (slcOrSlcTup,) if isinstance(slcOrSlcTup, slice) else slcOrSlcTup
        assert(len(slcTup) == len(axes))

        #Get a list of the (sub-)slices along each axis whose product
        # (over the specified axes) gives the entire block given by slcTup
        axisSlices = []
        for iaxis, axis in enumerate(axes):
            slc = slcTup[iaxis]
            if max_indices[iaxis] is None or max_indices[iaxis] >= _slct.length(slc):
                axisSlices.append([slc])  # this whole slice fits in the buffer
            else:
                axisSlices.append(_slct.divide(slc, max_indices[iaxis]))

        for axSlcs in _itertools.product(*axisSlices):
            #create arIndx from per-axis (sub-)slices and broadcast
            for iaxis, axis in enumerate(axes):
                arIndx[axis] = axSlcs[iaxis]

            #broadcast arIndx slice
            buf = _findx(ar_to_fill, arIndx, True) if (my_rank == owner) \
                else _np.empty(_findx_shape(ar_to_fill, arIndx), ar_to_fill.dtype)
            if my_interhost_ranks is None or len(my_interhost_ranks) > 1:
                #print("DB: Rank %d (%d) Broadcast: arIndx = %s, owner=%d root=%d" %
                #      (my_rank, broadcast_comm.rank, str(arIndx), owner, broadcast_rank_map[owner]))
                broadcast_comm.Bcast(buf, root=broadcast_rank_map[owner])
                if my_rank != owner: _fas(ar_to_fill, arIndx, buf)
            buf = None  # free buffer mem asap
    #print("DB: Rank %d: END GATHER SLICES" % my_rank)

    # Important: wait for everything to finish before proceeding
    #  (when broadcast_comm != comm some procs may run ahead - see comment above)
    comm.barrier()
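
A hedged multi-axis sketch (again assuming exactly two MPI ranks; all names are
illustrative): each element of `slices` is a tuple with one slice per entry of
`axes`, and `slice_owners` maps list positions to owner ranks.

from mpi4py import MPI
import numpy as np

comm = MPI.COMM_WORLD
ar = np.zeros((4, 4))
slices = [(slice(0, 2), slice(0, 4)), (slice(2, 4), slice(0, 4))]
slice_owners = {0: 0, 1: 1}
ar[slices[comm.Get_rank()]] = comm.Get_rank() + 1.0  # fill only the owned block

# max_buffer_size=64 caps each broadcast at 2 rows (2 rows * 4 cols * 8 bytes = 64)
gather_slices(slices, slice_owners, ar, [], axes=(0, 1), comm=comm, max_buffer_size=64)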