def _mapfill_hprobs_atom(self, array_to_fill, dest_indices, dest_param_indices1,
                         dest_param_indices2, layout_atom, param_indices1, param_indices2,
                         resource_alloc, eps):
    """
    Helper function for populating hessian values by block.
    """
    shared_mem_leader = resource_alloc.is_host_leader if (resource_alloc is not None) else True

    if param_indices1 is None:
        param_indices1 = list(range(self.model.num_params))
    if param_indices2 is None:
        param_indices2 = list(range(self.model.num_params))
    if dest_param_indices1 is None:
        dest_param_indices1 = list(range(_slct.length(param_indices1)))
    if dest_param_indices2 is None:
        dest_param_indices2 = list(range(_slct.length(param_indices2)))

    param_indices1 = _slct.to_array(param_indices1)
    dest_param_indices1 = _slct.to_array(dest_param_indices1)
    #dest_param_indices2 = _slct.to_array(dest_param_indices2)  # OK if a slice

    #Get a map from global parameter indices to the desired
    # final index within mx_to_fill (fpoffset = final parameter offset)
    iParamToFinal = {i: dest_index for i, dest_index in zip(param_indices1, dest_param_indices1)}

    nEls = layout_atom.num_elements
    nP2 = _slct.length(param_indices2) if isinstance(param_indices2, slice) else len(param_indices2)
    dprobs, shm = _smt.create_shared_ndarray(resource_alloc, (nEls, nP2), 'd')
    dprobs2, shm2 = _smt.create_shared_ndarray(resource_alloc, (nEls, nP2), 'd')
    self.calclib.mapfill_dprobs_atom(self, dprobs, slice(0, nEls), None, layout_atom,
                                     param_indices2, resource_alloc, eps)

    orig_vec = self.model.to_vector().copy()
    for i in range(self.model.num_params):
        if i in iParamToFinal:
            iFinal = iParamToFinal[i]
            vec = orig_vec.copy(); vec[i] += eps
            self.model.from_vector(vec, close=True)
            self.calclib.mapfill_dprobs_atom(self, dprobs2, slice(0, nEls), None, layout_atom,
                                             param_indices2, resource_alloc, eps)
            if shared_mem_leader:
                _fas(array_to_fill, [dest_indices, iFinal, dest_param_indices2],
                     (dprobs2 - dprobs) / eps)
    self.model.from_vector(orig_vec)
    _smt.cleanup_shared_ndarray(shm)
    _smt.cleanup_shared_ndarray(shm2)
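
# --- Example (illustration only, not part of the library) --------------------
# A minimal, self-contained sketch of the differencing scheme above: each
# Hessian block "slab" for parameter i is the forward difference of the
# Jacobian, H[:, i, :] ~= (J(theta + eps*e_i) - J(theta)) / eps.  The names
# `toy_probs` and `toy_jac` are hypothetical stand-ins for the model's
# circuit-probability map and for mapfill_dprobs_atom.

def _example_fd_hessian():
    import numpy as np

    def toy_probs(v):  # stand-in for the probability fill: shape (nEls,)
        return np.array([v[0] * v[1], v[0] ** 2 + v[1]])

    def toy_jac(v, eps):  # forward-difference Jacobian: shape (nEls, nP)
        f0 = toy_probs(v)
        return np.column_stack([(toy_probs(v + eps * e) - f0) / eps
                                for e in np.eye(len(v))])

    v, eps = np.array([0.3, 0.7]), 1e-4
    j0 = toy_jac(v, eps)
    # One slab per parameter i, just as the loop above fills
    # array_to_fill[:, iFinal, :] with (dprobs2 - dprobs) / eps:
    hess = np.stack([(toy_jac(v + eps * e, eps) - j0) / eps
                     for e in np.eye(len(v))], axis=1)  # (nEls, nP, nP)
    assert np.allclose(hess[0], [[0, 1], [1, 0]], atol=1e-2)  # Hessian of v0*v1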
def mapfill_timedep_dterms(fwdsim, array_to_fill, dest_indices, dest_param_indices,
                           num_outcomes, layout_atom, dataset_rows, fillfn, wrt_slice, comm):
    eps = 1e-7  # hardcoded?

    #Compute finite difference derivatives, one parameter at a time.
    param_indices = range(fwdsim.model.num_params) if (wrt_slice is None) else _slct.indices(wrt_slice)

    nEls = layout_atom.num_elements
    vals = _np.empty(nEls, 'd')
    vals2 = _np.empty(nEls, 'd')

    assert(layout_atom.cache_size == 0)  # so all elements have None as start and remainder[0] is a prep label

    orig_vec = fwdsim.model.to_vector().copy()
    fwdsim.model.from_vector(orig_vec, close=False)  # ensure we call with close=False first

    fillfn(vals, slice(0, nEls), num_outcomes, layout_atom, dataset_rows, comm)

    all_slices, my_slice, owners, subComm = \
        _mpit.distribute_slice(slice(0, len(param_indices)), comm)

    my_param_indices = param_indices[my_slice]
    st = my_slice.start  # beginning of where my_param_indices results
    # get placed into dpr_cache

    #Get a map from global parameter indices to the desired
    # final index within dpr_cache
    iParamToFinal = {i: st + ii for ii, i in enumerate(my_param_indices)}

    for i in range(fwdsim.model.num_params):
        #print("dprobs cache %d of %d" % (i, fwdsim.model.num_params))
        if i in iParamToFinal:
            iFinal = iParamToFinal[i]
            vec = orig_vec.copy(); vec[i] += eps
            fwdsim.model.from_vector(vec, close=True)
            fillfn(vals2, slice(0, nEls), num_outcomes, layout_atom, dataset_rows, subComm)
            _fas(array_to_fill, [dest_indices, iFinal], (vals2 - vals) / eps)

    fwdsim.model.from_vector(orig_vec, close=True)

    #Now each processor has filled the relevant parts of dpr_cache,
    # so gather together:
    _mpit.gather_slices(all_slices, owners, array_to_fill, [], axes=1, comm=comm)
def mapfill_dprobs_atom(fwdsim, mx_to_fill, dest_indices, dest_param_indices, layout_atom,
                        param_indices, resource_alloc, eps):
    #eps = 1e-7
    #shared_mem_leader = resource_alloc.is_host_leader if (resource_alloc is not None) else True

    if param_indices is None:
        param_indices = list(range(fwdsim.model.num_params))
    if dest_param_indices is None:
        dest_param_indices = list(range(_slct.length(param_indices)))

    param_indices = _slct.to_array(param_indices)
    dest_param_indices = _slct.to_array(dest_param_indices)

    #Get a map from global parameter indices to the desired
    # final index within mx_to_fill (fpoffset = final parameter offset)
    iParamToFinal = {i: dest_index for i, dest_index in zip(param_indices, dest_param_indices)}

    orig_vec = fwdsim.model.to_vector().copy()
    fwdsim.model.from_vector(orig_vec, close=False)  # ensure we call with close=False first

    #Note: no real need for using shared memory here except so that we can pass
    # `resource_alloc` to mapfill_probs_block and have it potentially use multiple procs.
    nEls = layout_atom.num_elements
    probs, shm = _smt.create_shared_ndarray(resource_alloc, (nEls,), 'd', memory_tracker=None)
    probs2, shm2 = _smt.create_shared_ndarray(resource_alloc, (nEls,), 'd', memory_tracker=None)
    mapfill_probs_atom(fwdsim, probs, slice(0, nEls), layout_atom, resource_alloc)  # probs != shared

    for i in range(fwdsim.model.num_params):
        #print("dprobs cache %d of %d" % (i, self.Np))
        if i in iParamToFinal:
            iFinal = iParamToFinal[i]
            vec = orig_vec.copy(); vec[i] += eps
            fwdsim.model.from_vector(vec, close=True)
            mapfill_probs_atom(fwdsim, probs2, slice(0, nEls), layout_atom, resource_alloc)
            _fas(mx_to_fill, [dest_indices, iFinal], (probs2 - probs) / eps)

    fwdsim.model.from_vector(orig_vec, close=True)
    _smt.cleanup_shared_ndarray(shm)
    _smt.cleanup_shared_ndarray(shm2)
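
# --- Example (illustration only) ----------------------------------------------
# Sketch of the param_indices -> dest_param_indices remapping used above:
# derivatives are taken only w.r.t. a subset of the global parameters, and each
# lands in a caller-chosen destination column of the output.  All names below
# are hypothetical.

def _example_param_remap():
    import numpy as np

    num_params = 5
    param_indices = np.array([1, 3])       # global params to differentiate
    dest_param_indices = np.array([0, 1])  # output columns to fill
    iParamToFinal = {i: d for i, d in zip(param_indices, dest_param_indices)}

    out = np.zeros((2, len(param_indices)))  # (nEls, n_selected_params)
    for i in range(num_params):              # same loop shape as above
        if i in iParamToFinal:
            out[:, iParamToFinal[i]] = i     # stand-in for (probs2 - probs) / eps
    assert np.array_equal(out, [[1., 3.], [1., 3.]])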
def test_fancy_assignment(self):
    a = np.zeros((4, 4, 4), 'd')
    twoByTwo = np.ones((2, 2), 'd')

    #NOTEs from commit message motivating why we need this:
    # a = np.zeros((3,3,3))
    # a[:,1:2,1:3].shape == (3,1,2)  # good!
    # a[0,:,1:3].shape == (3,2)  # good!
    # a[0,:,[1,2]].shape == (2,3)  # ?? (broadcasting ':' makes this like a[0,[1,2]])
    # a[:,[1,2],[1,2]].shape == (3,2)  # ?? not (3,2,2) b/c lists broadcast
    # a[:,[1],[1,2]].shape == (3,2)  # ?? not (3,1,2) b/c lists broadcast
    # a[:,[1,2],[0,1,2]].shape == ERROR  # b/c [1,2] can't broadcast to [0,1,2]!

    #simple integer indices
    mt._fas(a, (0, 0, 0), 4.5)  # a[0,0,0] = 4.5
    self.assertAlmostEqual(a[0, 0, 0], 4.5)

    mt._fas(a, (0, 0, 0), 4.5, add=True)  # a[0,0,0] += 4.5
    self.assertAlmostEqual(a[0, 0, 0], 9.0)

    #still simple: mix of slices and integers
    mt._fas(a, (slice(0, 2), slice(0, 2), 0), twoByTwo)  # a[0:2,0:2,0] = twoByTwo
    self.assertArraysAlmostEqual(a[0:2, 0:2, 0], twoByTwo)

    #complex case: some/all indices are integer arrays
    mt._fas(a, ([0, 1], [0, 1], 0), twoByTwo[:, :])
    # a[0:2,0:2,0] = twoByTwo - but a[[0,1],[0,1],0] wouldn't do this!
    self.assertArraysAlmostEqual(a[0:2, 0:2, 0], twoByTwo)

    mt._fas(a, ([0, 1], [0, 1], 0), twoByTwo[:, :], add=True)
    # a[0:2,0:2,0] += twoByTwo - but a[[0,1],[0,1],0] wouldn't do this!
    self.assertArraysAlmostEqual(a[0:2, 0:2, 0], 2 * twoByTwo)

    # Fancy indexing (without assignment)
    self.assertEqual(mt._findx(a, (0, 0, 0)).shape, ())  # (1,1,1))
    self.assertEqual(mt._findx(a, (slice(0, 2), slice(0, 2), slice(0, 2))).shape, (2, 2, 2))
    self.assertEqual(mt._findx(a, (slice(0, 2), slice(0, 2), 0)).shape, (2, 2))
    self.assertEqual(mt._findx(a, ([0, 1], [0, 1], 0)).shape, (2, 2))
    self.assertEqual(mt._findx(a, ([], [0, 1], 0)).shape, (0, 2))
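
# --- Example (illustration only) ----------------------------------------------
# The pitfall the commit-message notes describe, in runnable form: when several
# index lists appear together, NumPy broadcasts them pairwise instead of taking
# their outer product.  np.ix_ restores the outer-product behavior that
# _fas/_findx provide for assignment.

def _example_fancy_indexing_pitfall():
    import numpy as np

    a = np.zeros((3, 3, 3))
    assert a[:, [1, 2], [1, 2]].shape == (3, 2)  # lists broadcast: NOT (3,2,2)
    assert a[np.ix_(range(3), [1, 2], [1, 2])].shape == (3, 2, 2)  # outer product
    a[np.ix_([0, 1], [0, 1], [0])] = 1.0  # behaves like a[0:2,0:2,0:1] = 1.0
    assert a[0:2, 0:2, 0].sum() == 4.0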
def gather_indices(indices, index_owners, ar_to_fill, ar_to_fill_inds,
                   axes, comm, max_buffer_size=None):
    """
    Gathers data within a numpy array, `ar_to_fill`, according to given indices.

    Upon entry it is assumed that the different processors within `comm` have
    computed different parts of `ar_to_fill`, namely different slices or
    index-arrays of the `axis`-th axis.  At exit, data has been gathered such
    that all processors have the results for the entire `ar_to_fill` (or at
    least for all the indices given).

    Parameters
    ----------
    indices : list
        A list of all the integer-arrays or slices (computed by *any* of
        the processors, not just the current one).  Each element of `indices`
        may be either a single slice/index-array or a tuple of such
        elements (when gathering across multiple dimensions).

    index_owners : dict
        A dictionary mapping the index of an element within `indices` to an
        integer rank of the processor responsible for communicating that
        slice/index-array's data to the rest of the processors.

    ar_to_fill : numpy.ndarray
        The array which contains partial data upon entry and the gathered
        data upon exit.

    ar_to_fill_inds : list
        A list of slice or index-arrays specifying the (fixed) sub-array of
        `ar_to_fill` that should be gathered into.  The elements of
        `ar_to_fill_inds` are taken to be indices for the leading dimension
        first, and any unspecified dimensions or `None` elements are
        assumed to be unrestricted (as if `slice(None,None)`).  Note that
        the combination of `ar_to_fill` and `ar_to_fill_inds` is essentially
        like passing `ar_to_fill[ar_to_fill_inds]` to this function, except it
        will work with index arrays as well as slices.

    axes : int or tuple of ints
        The axis or axes of `ar_to_fill` on which the slices apply (which axis
        do the elements of `indices` refer to?).  Note that `len(axes)` must
        be equal to the number of sub-indices (i.e. the tuple length) of each
        element of `indices`.

    comm : mpi4py.MPI.Comm or None
        The communicator specifying the processors involved and used
        to perform the gather operation.

    max_buffer_size : int or None
        The maximum buffer size in bytes that is allowed to be used for
        gathering data.  If None, there is no limit.

    Returns
    -------
    None
    """
    if comm is None: return  # no gathering needed!

    #Perform broadcasts for each slice in order
    my_rank = comm.Get_rank()
    arIndx = [slice(None, None)] * ar_to_fill.ndim
    arIndx[0:len(ar_to_fill_inds)] = ar_to_fill_inds

    axes = (axes,) if _compat.isint(axes) else axes

    max_indices = [None] * len(axes)
    if max_buffer_size is not None:  # a maximum buffer size was given
        chunkBytes = ar_to_fill.nbytes  # start with the entire array as the "chunk"
        for iaxis, axis in enumerate(axes):
            # Consider restricting the chunk size along the iaxis-th axis.
            # If we can achieve the desired max_buffer_size by restricting
            # just along this axis, great.  Otherwise, restrict to at most
            # 1 index along this axis and keep going.
            bytes_per_index = chunkBytes / ar_to_fill.shape[axis]
            max_inds = int(max_buffer_size / bytes_per_index)
            if max_inds == 0:
                max_indices[iaxis] = 1
                chunkBytes /= ar_to_fill.shape[axis]
            else:
                max_indices[iaxis] = max_inds
                break
        else:
            _warnings.warn("gather_indices: Could not achieve max_buffer_size")

    for iIndex, indOrIndTup in enumerate(indices):
        owner = index_owners[iIndex]  # owner's rank
        indTup = (indOrIndTup,) if not isinstance(indOrIndTup, tuple) else indOrIndTup
        assert(len(indTup) == len(axes))

        def to_slice_list(index_array_or_slice):
            """Breaks a slice or index array into a list of slices"""
            if isinstance(index_array_or_slice, slice):
                return [index_array_or_slice]  # easy!

            lst = index_array_or_slice
            if len(lst) == 0: return [slice(0, 0)]

            slc_lst = []
            i = 0; N = len(lst)
            while i < N:
                start = lst[i]
                step = lst[i + 1] - lst[i] if i + 1 < N else None
                while i + 1 < N and lst[i + 1] - lst[i] == step:
                    i += 1
                stop = lst[i] + 1
                slc_lst.append(slice(start, stop, None if step == 1 else step))
                i += 1
            return slc_lst

        #Get a list of the (sub-)indices along each axis, whose product
        # (along the specified axes) gives the entire block given by indTup
        axisSlices = []
        for iaxis, axis in enumerate(axes):
            ind = indTup[iaxis]
            sub_slices = []

            #break `ind`, which may be either a single slice or an index array,
            # into a list of slices that are broadcast one at a time (sometimes
            # these `ind_slice` slices themselves need to be broken up further
            # to obey max_buffer_size).
            for islice in to_slice_list(ind):
                if max_indices[iaxis] is None or max_indices[iaxis] >= _slct.length(islice):
                    sub_slices.append(islice)  # arIndx[axis] = slc
                else:
                    sub_slices.extend(_slct.divide(islice, max_indices[iaxis]))
            axisSlices.append(sub_slices)

        for axSlcs in _itertools.product(*axisSlices):
            #create arIndx from per-axis (sub-)slices and broadcast
            for iaxis, axis in enumerate(axes):
                arIndx[axis] = axSlcs[iaxis]

            #broadcast arIndx slice
            buf = _findx(ar_to_fill, arIndx, True) if (my_rank == owner) \
                else _np.empty(_findx_shape(ar_to_fill, arIndx), ar_to_fill.dtype)
            comm.Bcast(buf, root=owner)
            if my_rank != owner: _fas(ar_to_fill, arIndx, buf)
            buf = None  # free buffer mem asap
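
# --- Example (illustration only) ----------------------------------------------
# What to_slice_list above computes: an index array is greedily split into
# maximal constant-stride runs, each becoming one slice, so data can be
# broadcast one contiguous (or strided) chunk at a time.

def _example_to_slice_list():
    import numpy as np

    indices = [0, 1, 2, 5, 7, 9, 12]
    # The decomposition to_slice_list produces for this input:
    runs = [slice(0, 3), slice(5, 10, 2), slice(12, 13)]
    recovered = np.concatenate([np.arange(13)[s] for s in runs])
    assert np.array_equal(recovered, indices)  # the runs cover exactly `indices`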
def gather_slices_by_owner(current_slices, ar_to_fill, ar_to_fill_inds, axes, comm,
                           max_buffer_size=None):
    """
    Gathers data within a numpy array, `ar_to_fill`, according to given slices.

    Upon entry it is assumed that the different processors within `comm` have
    computed different parts of `ar_to_fill`, namely different slices of the
    axes indexed by `axes`.  At exit, data has been gathered such that all
    processors have the results for the entire `ar_to_fill` (or at least for
    all the slices given).

    Parameters
    ----------
    current_slices : list
        A list of all the slices computed by the *current* processor.
        Each element of `current_slices` may be either a single slice or a
        tuple of slices (when gathering across multiple dimensions).

    ar_to_fill : numpy.ndarray
        The array which contains partial data upon entry and the gathered
        data upon exit.

    ar_to_fill_inds : list
        A list of slice or index-arrays specifying the (fixed) sub-array of
        `ar_to_fill` that should be gathered into.  The elements of
        `ar_to_fill_inds` are taken to be indices for the leading dimension
        first, and any unspecified dimensions or `None` elements are
        assumed to be unrestricted (as if `slice(None,None)`).  Note that
        the combination of `ar_to_fill` and `ar_to_fill_inds` is essentially
        like passing `ar_to_fill[ar_to_fill_inds]` to this function, except it
        will work with index arrays as well as slices.

    axes : int or tuple of ints
        The axis or axes of `ar_to_fill` on which the slices apply (which axis
        do the slices in `current_slices` refer to?).  Note that `len(axes)`
        must be equal to the number of slices (i.e. the tuple length) of each
        element of `current_slices`.

    comm : mpi4py.MPI.Comm or None
        The communicator specifying the processors involved and used
        to perform the gather operation.

    max_buffer_size : int or None
        The maximum buffer size in bytes that is allowed to be used for
        gathering data.  If None, there is no limit.

    Returns
    -------
    None
    """
    #Note: same beginning as gather_slices (TODO: consolidate?)
    if comm is None: return  # no gathering needed!

    #Perform broadcasts for each slice in order
    my_rank = comm.Get_rank()
    arIndx = [slice(None, None)] * ar_to_fill.ndim
    arIndx[0:len(ar_to_fill_inds)] = ar_to_fill_inds

    axes = (axes,) if _compat.isint(axes) else axes

    max_indices = [None] * len(axes)
    if max_buffer_size is not None:  # a maximum buffer size was given
        chunkBytes = ar_to_fill.nbytes  # start with the entire array as the "chunk"
        for iaxis, axis in enumerate(axes):
            # Consider restricting the chunk size along the iaxis-th axis.
            # If we can achieve the desired max_buffer_size by restricting
            # just along this axis, great.  Otherwise, restrict to at most
            # 1 index along this axis and keep going.
            bytes_per_index = chunkBytes / ar_to_fill.shape[axis]
            max_inds = int(max_buffer_size / bytes_per_index)
            if max_inds == 0:
                max_indices[iaxis] = 1
                chunkBytes /= ar_to_fill.shape[axis]
            else:
                max_indices[iaxis] = max_inds
                break
        else:
            _warnings.warn("gather_slices_by_owner: Could not achieve max_buffer_size")
    # -- end part that is the same as gather_slices

    #Get a list of the slices to broadcast, indexed by the rank of the owner proc
    slices_by_owner = comm.allgather(current_slices)
    for owner, slices in enumerate(slices_by_owner):
        for slcOrSlcTup in slices:
            slcTup = (slcOrSlcTup,) if isinstance(slcOrSlcTup, slice) else slcOrSlcTup
            assert(len(slcTup) == len(axes))

            #Get a list of the (sub-)slices along each axis, whose product
            # (along the specified axes) gives the entire block given by slcTup
            axisSlices = []
            for iaxis, axis in enumerate(axes):
                slc = slcTup[iaxis]
                if max_indices[iaxis] is None or max_indices[iaxis] >= _slct.length(slc):
                    axisSlices.append([slc])  # arIndx[axis] = slc
                else:
                    axisSlices.append(_slct.divide(slc, max_indices[iaxis]))

            for axSlcs in _itertools.product(*axisSlices):
                #create arIndx from per-axis (sub-)slices and broadcast
                for iaxis, axis in enumerate(axes):
                    arIndx[axis] = axSlcs[iaxis]

                #broadcast arIndx slice
                buf = _findx(ar_to_fill, arIndx, True) if (my_rank == owner) \
                    else _np.empty(_findx_shape(ar_to_fill, arIndx), ar_to_fill.dtype)
                comm.Bcast(buf, root=owner)
                if my_rank != owner: _fas(ar_to_fill, arIndx, buf)
                buf = None  # free buffer mem asap
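
# --- Usage sketch (illustration only) -----------------------------------------
# Unlike gather_slices, no global slice list is needed here: each rank passes
# only the slices *it* computed, and the allgather discovers everyone's slices.
# Assumes an mpi4py communicator; `_example_gather_by_owner` is hypothetical.

def _example_gather_by_owner(comm):
    rows = 4 * comm.Get_size()
    ar = _np.zeros((rows, 2), 'd')
    mine = slice(4 * comm.Get_rank(), 4 * (comm.Get_rank() + 1))
    ar[mine, :] = comm.Get_rank()  # fill only this rank's rows
    gather_slices_by_owner([mine], ar, [], axes=0, comm=comm)
    # now every rank holds the fully assembled `ar`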
def gather_slices(slices, slice_owners, ar_to_fill, ar_to_fill_inds,
                  axes, comm, max_buffer_size=None):
    """
    Gathers data within a numpy array, `ar_to_fill`, according to given slices.

    Upon entry it is assumed that the different processors within `comm` have
    computed different parts of `ar_to_fill`, namely different slices of the
    `axis`-th axis.  At exit, data has been gathered such that all processors
    have the results for the entire `ar_to_fill` (or at least for all the
    slices given).

    Parameters
    ----------
    slices : list
        A list of all the slices (computed by *any* of the processors, not
        just the current one).  Each element of `slices` may be either a
        single slice or a tuple of slices (when gathering across multiple
        dimensions).

    slice_owners : dict
        A dictionary mapping the index of a slice (or tuple of slices)
        within `slices` to an integer rank of the processor responsible
        for communicating that slice's data to the rest of the processors.

    ar_to_fill : numpy.ndarray
        The array which contains partial data upon entry and the gathered
        data upon exit.

    ar_to_fill_inds : list
        A list of slice or index-arrays specifying the (fixed) sub-array of
        `ar_to_fill` that should be gathered into.  The elements of
        `ar_to_fill_inds` are taken to be indices for the leading dimension
        first, and any unspecified dimensions or `None` elements are
        assumed to be unrestricted (as if `slice(None,None)`).  Note that
        the combination of `ar_to_fill` and `ar_to_fill_inds` is essentially
        like passing `ar_to_fill[ar_to_fill_inds]` to this function, except it
        will work with index arrays as well as slices.

    axes : int or tuple of ints
        The axis or axes of `ar_to_fill` on which the slices apply (which axis
        do the slices in `slices` refer to?).  Note that `len(axes)` must
        be equal to the number of slices (i.e. the tuple length) of each
        element of `slices`.

    comm : mpi4py.MPI.Comm or ResourceAllocation or None
        The communicator specifying the processors involved and used
        to perform the gather operation.  If a :class:`ResourceAllocation`
        is provided, then inter-host communication is used when available
        to facilitate use of shared intra-host memory.

    max_buffer_size : int or None
        The maximum buffer size in bytes that is allowed to be used for
        gathering data.  If None, there is no limit.

    Returns
    -------
    None
    """
    from ..baseobjs.resourceallocation import ResourceAllocation as _ResourceAllocation
    if isinstance(comm, _ResourceAllocation):
        ralloc = comm
        comm = ralloc.comm

        #For use with shared intra-host (intra-node) memory:
        # my_interhost_ranks = ranks of comm, 1 per host, that this processor uses to
        #   send/receive data between hosts
        # broadcast_comm = the comm of my_interhost_ranks used to send/receive data.
        if ralloc.interhost_ranks is not None:
            my_interhost_ranks = set(ralloc.interhost_ranks)
            broadcast_rank_map = {comm_rank: broadcast_comm_rank
                                  for broadcast_comm_rank, comm_rank in enumerate(ralloc.interhost_ranks)}
            broadcast_comm = ralloc.interhost_comm
        else:
            my_interhost_ranks = None
            broadcast_rank_map = {i: i for i in range(comm.Get_size())} \
                if (comm is not None) else {0: 0}  # trivial map
            broadcast_comm = comm
    else:
        ralloc = None
        my_interhost_ranks = None
        broadcast_rank_map = {i: i for i in range(comm.Get_size())} \
            if (comm is not None) else {0: 0}  # trivial map
        broadcast_comm = comm

    if comm is None: return  # no gathering needed!

    # Barrier to be safe: using broadcast_comm below means we don't otherwise wait for all
    # procs to finish what they were doing last, which could involve updating a shared
    # ar_to_fill whose values the already-finished front-running processors would then read!
    comm.barrier()

    #Perform broadcasts for each slice in order
    my_rank = comm.Get_rank()
    axes = (axes,) if _compat.isint(axes) else axes

    #print("DB: Rank %d (%d): BEGIN GATHER SLICES: interhost=%s, group=%s" %
    #      (my_rank, broadcast_comm.rank, str(my_interhost_ranks), str(broadcast_comm.Get_group())))

    # # if ar_to_fill_inds only contains slices (or is empty), then we can slice ar_to_fill
    # # once up front and not use generic arIndx in loop below (slower, especially with lots of procs)
    # if all([isinstance(indx, slice) for indx in ar_to_fill_inds]):
    #     ar_to_fill = ar_to_fill[tuple(ar_to_fill_inds)]  # Note: this *doesn't* reduce its .ndim
    #     ar_to_fill_inds = ()  # now ar_to_fill requires no further indexing

    arIndx = [slice(None, None)] * ar_to_fill.ndim
    arIndx[0:len(ar_to_fill_inds)] = ar_to_fill_inds

    max_indices = [None] * len(axes)
    if max_buffer_size is not None:  # a maximum buffer size was given
        chunkBytes = ar_to_fill.nbytes  # start with the entire array as the "chunk"
        for iaxis, axis in enumerate(axes):
            # Consider restricting the chunk size along the iaxis-th axis.
            # If we can achieve the desired max_buffer_size by restricting
            # just along this axis, great.  Otherwise, restrict to at most
            # 1 index along this axis and keep going.
            bytes_per_index = chunkBytes / ar_to_fill.shape[axis]
            max_inds = int(max_buffer_size / bytes_per_index)
            if max_inds == 0:
                max_indices[iaxis] = 1
                chunkBytes /= ar_to_fill.shape[axis]
            else:
                max_indices[iaxis] = max_inds
                break
        else:
            _warnings.warn("gather_slices: Could not achieve max_buffer_size")

    # NOTE: Tried doing something faster (Allgatherv) when slices elements are simple slices
    # (not tuples of slices).  This ultimately showed that our repeated use of Bcast isn't any
    # slower than fewer calls to Allgatherv, and since the Allgatherv case complicates the code
    # and ignores the memory limit, it's best to just drop it.

    # Broadcast slices one-by-one (slower, but more general):
    for iSlice, slcOrSlcTup in enumerate(slices):
        owner = slice_owners[iSlice]  # owner's rank
        if my_interhost_ranks is not None and owner not in my_interhost_ranks:
            # if the "source" (owner) of the data isn't a part of my "circle" of ranks, then we
            # don't need to send or receive this data - other ranks on the same hosts will do it.
            continue

        slcTup = (slcOrSlcTup,) if isinstance(slcOrSlcTup, slice) else slcOrSlcTup
        assert(len(slcTup) == len(axes))

        #Get a list of the (sub-)slices along each axis, whose product
        # (along the specified axes) gives the entire block given by slcTup
        axisSlices = []
        for iaxis, axis in enumerate(axes):
            slc = slcTup[iaxis]
            if max_indices[iaxis] is None or max_indices[iaxis] >= _slct.length(slc):
                axisSlices.append([slc])  # arIndx[axis] = slc
            else:
                axisSlices.append(_slct.divide(slc, max_indices[iaxis]))

        for axSlcs in _itertools.product(*axisSlices):
            #create arIndx from per-axis (sub-)slices and broadcast
            for iaxis, axis in enumerate(axes):
                arIndx[axis] = axSlcs[iaxis]

            #broadcast arIndx slice
            buf = _findx(ar_to_fill, arIndx, True) if (my_rank == owner) \
                else _np.empty(_findx_shape(ar_to_fill, arIndx), ar_to_fill.dtype)
            if my_interhost_ranks is None or len(my_interhost_ranks) > 1:
                #print("DB: Rank %d (%d) Broadcast: arIndx = %s, owner=%d root=%d" %
                #      (my_rank, broadcast_comm.rank, str(arIndx), owner, broadcast_rank_map[owner]))
                broadcast_comm.Bcast(buf, root=broadcast_rank_map[owner])
                if my_rank != owner: _fas(ar_to_fill, arIndx, buf)
            buf = None  # free buffer mem asap

    #print("DB: Rank %d: END GATHER SLICES" % my_rank)
    # Important: wait for everything to finish before proceeding
    # (when broadcast_comm != comm some procs may run ahead - see comment above)
    comm.barrier()
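
# --- Usage sketch (illustration only) -----------------------------------------
# How gather_slices is typically driven (cf. mapfill_timedep_dterms above),
# assuming mpi4py is available; run under e.g. `mpiexec -n 2 python script.py`.
# distribute_slice (this module) splits the row range among ranks and returns
# the slice list, this rank's slice, and the owner map gather_slices expects.
# `_example_gather_slices` is a hypothetical driver, not library code.

def _example_gather_slices(comm):  # comm: an mpi4py communicator
    rows, cols = 8, 4
    ar = _np.zeros((rows, cols), 'd')
    all_slices, my_slice, owners, _ = distribute_slice(slice(0, rows), comm)
    ar[my_slice, :] = comm.Get_rank()  # each rank fills only the rows it owns
    gather_slices(all_slices, owners, ar, [], axes=0, comm=comm)
    # now every rank holds the fully assembled `ar`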