def get_shift(self):
    """Return a 26*3 array of position shifts to apply to data received
    from each of the 26 neighbouring ranks when the domain is periodic."""
    _sfd = host.Array(ncomp=26 * 3, dtype=ctypes.c_double)

    dims = mpi.cartcomm_dims_xyz(self.comm)
    top = mpi.cartcomm_top_xyz(self.comm)
    periods = mpi.cartcomm_periods_xyz(self.comm)

    for dx in range(26):
        dirn = mpi.recv_modifiers[dx]
        for ix in range(3):
            # on the lower periodic boundary, data arriving from the -ve
            # neighbour wraps around and is shifted up by the domain extent
            if top[ix] == 0 and periods[ix] == 1 and dirn[ix] == -1:
                _sfd[dx * 3 + ix] = self.extent[ix]
            # on the upper periodic boundary, data arriving from the +ve
            # neighbour is shifted down by the domain extent
            elif top[ix] == dims[ix] - 1 and periods[ix] == 1 and dirn[ix] == 1:
                _sfd[dx * 3 + ix] = -1. * self.extent[ix]
            else:
                _sfd[dx * 3 + ix] = 0.0

    return _sfd
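# --- Illustrative sketch: the shift rule for one neighbour direction ---------
# A minimal, self-contained sketch (not part of the library; all values below
# are hypothetical) of the rule get_shift applies per direction: a rank on the
# lower periodic boundary of a dimension shifts data arriving from its -ve
# neighbour up by the domain extent, and a rank on the upper boundary shifts
# data from its +ve neighbour down by the extent.
extent = (10.0, 10.0, 10.0)
dims = (2, 1, 1)        # ranks per dimension in the Cartesian grid
top = (0, 0, 0)         # this rank sits on the lower x boundary
periods = (1, 1, 1)     # fully periodic domain
direction = (-1, 0, 0)  # offset of the neighbour we receive from

shift = [0.0, 0.0, 0.0]
for ix in range(3):
    if top[ix] == 0 and periods[ix] == 1 and direction[ix] == -1:
        shift[ix] = extent[ix]          # data wrapped from the far +x side
    elif top[ix] == dims[ix] - 1 and periods[ix] == 1 and direction[ix] == 1:
        shift[ix] = -1.0 * extent[ix]   # data wrapped from the far -x side
print(shift)  # [10.0, 0.0, 0.0]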
def create_halo_pairs(domain_in, slicexyz, direction):
    """
    Automatically create the pairs of cells for halos.
    """
    cell_array = domain_in.cell_array
    extent = domain_in.extent
    comm = domain_in.comm
    dims = mpi.cartcomm_dims_xyz(comm)
    top = mpi.cartcomm_top_xyz(comm)
    periods = mpi.cartcomm_periods_xyz(comm)

    # ranges of interior cells selected by the slice in each dimension
    xr = range(1, cell_array[0] - 1)[slicexyz[0]]
    yr = range(1, cell_array[1] - 1)[slicexyz[1]]
    zr = range(1, cell_array[2] - 1)[slicexyz[2]]

    # a scalar slice yields a single int; normalise to an iterable
    if not isinstance(xr, collections.abc.Iterable):
        xr = [xr]
    if not isinstance(yr, collections.abc.Iterable):
        yr = [yr]
    if not isinstance(zr, collections.abc.Iterable):
        zr = [zr]

    n = len(xr) * len(yr) * len(zr)
    b_cells = np.zeros(n, dtype=ctypes.c_int)
    h_cells = np.zeros(n, dtype=ctypes.c_int)

    i = 0
    for iz in zr:
        for iy in yr:
            for ix in xr:
                # linear index of the boundary cell, x-fastest ordering
                b_cells[i] = ix + (iy + iz * cell_array[1]) * cell_array[0]

                # matching halo cell: two layers along `direction`, wrapped
                # around the padded cell array
                _ix = (ix + direction[0] * 2) % cell_array[0]
                _iy = (iy + direction[1] * 2) % cell_array[1]
                _iz = (iz + direction[2] * 2) % cell_array[2]
                h_cells[i] = _ix + (_iy + _iz * cell_array[1]) * cell_array[0]

                i += 1

    # periodic position shift applied to data crossing the global boundary
    shift = np.zeros(3, dtype=ctypes.c_double)
    for ix in range(3):
        if top[ix] == 0 and periods[ix] == 1 and direction[ix] == -1:
            shift[ix] = extent[ix]
        if top[ix] == dims[ix] - 1 and periods[ix] == 1 and direction[ix] == 1:
            shift[ix] = -1. * extent[ix]

    return b_cells, h_cells, shift
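# --- Illustrative sketch: linear cell indexing and halo wrapping -------------
# A small sketch (hypothetical sizes, not library code) of the indexing used
# by create_halo_pairs: cells live in a padded array (interior plus a one-cell
# halo layer) in x-fastest order, so cell (ix, iy, iz) has the linear index
# ix + (iy + iz * ncy) * ncx. Stepping two layers along `direction` and taking
# the modulus maps a boundary cell onto the halo layer on the opposite face.
ncx, ncy, ncz = 6, 6, 6  # 4x4x4 interior cells plus the halo padding

def lin(ix, iy, iz):
    return ix + (iy + iz * ncy) * ncx

direction = (-1, 0, 0)
ix, iy, iz = 1, 3, 3                  # boundary cell on the low-x face
hx = (ix + direction[0] * 2) % ncx    # wraps to 5: the +x halo layer
print(lin(ix, iy, iz), lin(hx, iy, iz))  # 127 131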
def _distribute_domain(self):
    _top = mpi.cartcomm_top_xyz(self.comm)
    _dims = mpi.cartcomm_dims_xyz(self.comm)

    opt.PROFILE[self.__class__.__name__ + ':mpi_dims'] = _dims

    # local extent is the global extent divided evenly across ranks
    self._extent[0] = self._extent_global[0] / _dims[0]
    self._extent[1] = self._extent_global[1] / _dims[1]
    self._extent[2] = self._extent_global[2] / _dims[2]

    # (lo, hi) bounds of this rank's slab in each dimension
    _boundary = (
        -0.5 * self._extent_global[0] + _top[0] * self._extent[0],
        -0.5 * self._extent_global[0] + (_top[0] + 1.) * self._extent[0],
        -0.5 * self._extent_global[1] + _top[1] * self._extent[1],
        -0.5 * self._extent_global[1] + (_top[1] + 1.) * self._extent[1],
        -0.5 * self._extent_global[2] + _top[2] * self._extent[2],
        -0.5 * self._extent_global[2] + (_top[2] + 1.) * self._extent[2]
    )

    self._boundary = data.ScalarArray(_boundary, dtype=ctypes.c_double)
    self._boundary_outer = data.ScalarArray(_boundary, dtype=ctypes.c_double)
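# --- Illustrative sketch: decomposition arithmetic ---------------------------
# Worked numbers (hypothetical, not library code) for the slab arithmetic in
# _distribute_domain: a global extent of 12.0 split across 3 ranks in one
# dimension gives each rank a local extent of 4.0, and the rank coordinate
# selects the matching slab of [-6.0, 6.0].
extent_global = 12.0
ndims = 3
local_extent = extent_global / ndims  # 4.0
for top0 in range(ndims):
    lo = -0.5 * extent_global + top0 * local_extent
    hi = -0.5 * extent_global + (top0 + 1.0) * local_extent
    print(top0, (lo, hi))
# 0 (-6.0, -2.0)
# 1 (-2.0, 2.0)
# 2 (2.0, 6.0)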
def dims(self):
    """Dimensions of the Cartesian MPI communicator in xyz order."""
    return mpi.cartcomm_dims_xyz(self.comm)
def __call__(self):
    t0 = time.time()

    state = self.state
    comm = self.comm
    if comm.size == 1:
        return
    rank = comm.rank
    topo = mpi.cartcomm_top_xyz(comm)
    dims = mpi.cartcomm_dims_xyz(comm)
    extent = state.domain.extent
    boundary = state.domain.boundary

    # reciprocal width of each rank's slab, used to bin positions onto ranks
    dist_cell_widths = [1.0 / (ex / dx) for ex, dx in zip(extent, dims)]
    dist_cell_widths = np.array(dist_cell_widths, dtype=REAL)

    npart = state.npart_local
    pos = state.get_position_dat()
    pos = pos.view

    lcount = 0       # number of particles leaving this rank
    lrank_dict = {}  # destination rank -> list of local particle indices
    lpid = []        # all leaving particle indices

    # strides that convert (x, y, z) rank coordinates to a linear rank
    rk_offsets = (1, dims[0], dims[0] * dims[1])

    def to_mpi_rank(_p):
        # avoid send if possible
        if (_p[0] >= boundary[0]) and (_p[0] < boundary[1]) and \
                (_p[1] >= boundary[2]) and (_p[1] < boundary[3]) and \
                (_p[2] >= boundary[4]) and (_p[2] < boundary[5]):
            return rank

        # case where particle needs sending to another rank
        _rk = 0
        for dx in range(3):
            assert _p[dx] <= 0.5 * extent[dx], "outside domain"
            assert _p[dx] >= -0.5 * extent[dx], "outside domain"
            tint = int((_p[dx] + 0.5 * extent[dx]) * dist_cell_widths[dx])
            tint = min(dims[dx] - 1, tint)
            _rk += tint * rk_offsets[dx]
        return _rk

    # find the new remote rank for leaving particles
    t0_local = time.time()
    for px in range(npart):
        rk = to_mpi_rank(pos[px])
        if rk != rank:
            lcount += 1
            if rk not in lrank_dict:
                lrank_dict[rk] = [px]
            else:
                lrank_dict[rk].append(px)
            lpid.append(px)
    t_local = time.time() - t0_local

    num_rranks = len(lrank_dict)

    # for each remote rank, atomically reserve space in its recv buffer by
    # fetch-and-add (Get_accumulate) on the remote recv counter
    t1 = time.time()
    self._check_recv_count_win()

    # prevent sizes going out of scope
    _size_store = []
    #self._win_recv_count.Fence(0)

    lrind = np.zeros((num_rranks, 2), INT64)
    for rki, rk in enumerate(lrank_dict):
        lrind[rki, 0] = rk
        _size = np.array((len(lrank_dict[rk]),), INT64)
        _size_store.append(_size)
        self._win_recv_count.Lock(rk, MPI.LOCK_SHARED)
        self._win_recv_count.Get_accumulate(_size_store[-1], lrind[rki, 1:2], rk)
        self._win_recv_count.Unlock(rk)

    self.comm.Barrier()
    #self._win_recv_count.Fence(MPI.MODE_NOSTORE)
    del _size_store

    opt.PROFILE[self._key_rma1] += time.time() - t1
    opt.PROFILE[self._key_nsend] += lcount

    # pack the send buffer for all particles
    t0_local = time.time()

    # get the views before the copy to avoid excessive syncing in the case
    # of CUDA
    views = {}
    bytes_per_element = {}
    for dat in self.state.particle_dats:
        views[dat] = self._dat_obj(dat).view.view()
        bytes_per_element[dat] = self._byte_per_element(dat)

    nbytes = self._get_nbytes()
    self._check_send_buffer(lcount, nbytes)

    send_offset = 0
    for rk in lrind[:, 0]:
        for px in lrank_dict[rk]:
            s = 0
            for dat in self.state.particle_dats:
                w = bytes_per_element[dat] * self._dat_ncomp(dat)
                v = self._send[send_offset, s:s + w].view(self._dat_dtype(dat))
                s += w
                v[:] = views[dat][px, :].copy()
            send_offset += 1
    t_local += time.time() - t0_local

    # need to place the data in the remote buffers here
    self._check_recv_win()

    # RMA: Put each packed block at the offset reserved earlier
    t2 = time.time()
    send_offset = 0
    for rki in range(num_rranks):
        rk = lrind[rki, 0]
        ri = lrind[rki, 1]
        nsend = len(lrank_dict[rk])
        self._win_recv.Lock(rk, MPI.LOCK_SHARED)
        #self._win_recv.Lock(rk, MPI.LOCK_EXCLUSIVE)
        self._win_recv.Put(self._send[send_offset:send_offset + nsend, :], rk, ri)
        self._win_recv.Unlock(rk)
        send_offset += nsend

    self.comm.Barrier()
    opt.PROFILE[self._key_rma2] += time.time() - t2
    opt.PROFILE[self._key_nrecv] += self._recv_count.array[0]

    # unpack the data recv'd into dats
    old_npart_local = self.state.npart_local
    self.state.npart_local = old_npart_local + self._recv_count.array[0]

    # get the views before the copy to avoid excessive syncing in the case
    # of CUDA
    views = {}
    bytes_per_element = {}
    for dat in self.state.particle_dats:
        views[dat] = self._dat_obj(dat).view.view()
        bytes_per_element[dat] = self._byte_per_element(dat)

    t0_local = time.time()
    for px in range(self._recv_count.array[0]):
        s = 0
        for dat in self.state.particle_dats:
            w = bytes_per_element[dat] * self._dat_ncomp(dat)
            v = self._recv.array[px, s:s + w].view(self._dat_dtype(dat))
            s += w
            views[dat][old_npart_local + px, :] = v[:]

    # on some architectures the memory used for compute is different to the
    # exposed numpy view
    for dat in self.state.particle_dats:
        self._dat_obj(dat).sync_view_to_data()

    t_local += time.time() - t0_local
    opt.PROFILE[self._key_local] += t_local

    # remove the leaving particles from the local arrays
    t0_compress = time.time()
    self.state.remove_by_slot(lpid)
    opt.PROFILE[self._key_compress] += time.time() - t0_compress

    self._free_wins()

    opt.PROFILE[self._key_call] += time.time() - t0
    opt.PROFILE[self._key_call_count] += 1
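# --- Illustrative sketch: the RMA reserve-then-put pattern -------------------
# A standalone sketch (hypothetical buffer sizes; not library code) of the
# one-sided pattern used in __call__ above: each rank atomically reserves a
# slot range in a remote counter with Get_accumulate (a fetch-and-add, since
# the default op is MPI.SUM), and would then Put its payload starting at the
# returned offset. Run under MPI, e.g. `mpiexec -n 2 python sketch.py`.
import numpy as np
from mpi4py import MPI

comm = MPI.COMM_WORLD
rank = comm.rank

# one int64 recv counter per rank, exposed through an RMA window
win_count = MPI.Win.Allocate(8, 8, comm=comm)
count = np.frombuffer(win_count.tomemory(), dtype=np.int64)
count[0] = 0
comm.Barrier()

target = (rank + 1) % comm.size
nsend = np.array([4], dtype=np.int64)  # slots this rank wants to reserve
offset = np.zeros(1, dtype=np.int64)   # old counter value is returned here

win_count.Lock(target, MPI.LOCK_SHARED)
# atomically: offset <- remote counter; remote counter += nsend
win_count.Get_accumulate(nsend, offset, target)
win_count.Unlock(target)
comm.Barrier()

# a matching win.Put(payload, target, offset[0]) would now write the data
print(rank, "reserved slots", int(offset[0]), "to", int(offset[0] + nsend[0] - 1))
win_count.Free()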