def __init__(self, occ_matrix=OCCUPANCY_MATRIX, cutoff=None):
    self._occ_matrix = occ_matrix
    assert cutoff is not None, \
        "cuda_cell::NeighbourListLayerSplit.__init__ error: No cutoff passed."
    self._rc = cutoff

    self.max_neigbours_per_particle = None

    self.version_id_1 = 0
    self.version_id_2 = 0

    self.list1 = cuda_base.Matrix(nrow=1, ncol=1, dtype=ctypes.c_int)
    self.list2 = cuda_base.Matrix(nrow=1, ncol=1, dtype=ctypes.c_int)

    with open(str(ppmd.cuda.cuda_config.LIB_DIR) +
              '/cudaNeighbourListSplitSource.cu', 'r') as fh:
        _code = fh.read()
    with open(str(ppmd.cuda.cuda_config.LIB_DIR) +
              '/cudaNeighbourListSplitSource.h', 'r') as fh:
        _header = fh.read()

    _name1 = 'NeighbourList'
    _name2 = 'NeighbourList2'

    # one compiled library exposes both list-construction entry points
    _lib = cuda_build.simple_lib_creator(_header, _code, _name1)
    self._lib1 = _lib[_name1]
    self._lib2 = _lib[_name2]
def __init__(self, kernel, dat_dict, n=None, types_map=None):
    self._types_map = types_map
    self._kernel = kernel
    self._dat_dict = access.DatArgStore(self._get_allowed_types(), dat_dict)

    # set NVCC as the default compiler
    self._cc = cuda_build.NVCC

    self.loop_timer = ppmd.modules.code_timer.LoopTimer()

    self._components = {
        'LIB_PAIR_INDEX_0': '_i',
        'LIB_NAME': str(self._kernel.name) + '_wrapper'
    }

    # find the group from the first ParticleDat that carries one
    self._group = None
    for pd in self._dat_dict.items():
        if issubclass(type(pd[1][0]), cuda_data.ParticleDat):
            if pd[1][0].group is not None:
                self._group = pd[1][0].group
                break

    # start code creation
    self._generate()

    # create library
    self._lib = cuda_build.simple_lib_creator(
        self._generate_header_source(),
        self._components['LIB_SRC'],
        self._components['LIB_NAME'])
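# --- Illustrative sketch, not part of the module above -----------------------
# The group discovery in the constructor above scans the dat dict for the
# first ParticleDat that carries a group. The same pattern, standalone
# (`items` is assumed to yield (name, (dat, access_mode)) pairs, as the
# DatArgStore iteration above suggests):

def _first_group(items, dat_type):
    for _, (dat, _mode) in items:
        if issubclass(type(dat), dat_type) and dat.group is not None:
            return dat.group
    return None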
def _build(self):
    """
    Build the library to create the cell occupancy matrix.
    """
    assert self._setup is not False, \
        "Run CellOccupancyMatrix.setup() first."

    with open(str(ppmd.cuda.cuda_config.LIB_DIR) +
              '/cudaCellOccupancyMatrixSource.cu', 'r') as fh:
        _code = fh.read()
    with open(str(ppmd.cuda.cuda_config.LIB_DIR) +
              '/cudaCellOccupancyMatrixSource.h', 'r') as fh:
        _header = fh.read()

    _name = 'CellOccupancyMatrix'
    self._p1_lib = cuda_build.simple_lib_creator(_header, _code, _name)

    self._init = True
def _build_exchange_lib(dat):

    with open(str(ppmd.cuda.cuda_config.LIB_DIR) +
              '/cudaHaloExchangeSource.cu', 'r') as fh:
        code = fh.read()
    with open(str(ppmd.cuda.cuda_config.LIB_DIR) +
              '/cudaHaloExchangeSource.h', 'r') as fh:
        hcode = fh.read()
    assert code is not None, "Failure to read CUDA MPI packing code source"

    # substitute the dat's type information into the source templates
    d = dict()
    d['DTYPE'] = host.ctypes_map[dat.dtype]
    d['NCOMP'] = dat.ncomp
    d['MPI_DTYPE'] = host.mpi_type_map[dat.dtype]

    code = code % d
    hcode = hcode % d

    return cuda_build.simple_lib_creator(
        hcode, code, 'ParticleDat_exchange')['cudaHaloExchangePD']
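# --- Illustrative sketch, not part of the module above -----------------------
# The exchange lib is produced by substituting a type dictionary into CUDA
# source templates with Python's ``%`` formatting, which is why literal '%'
# in the templates must be written as '%%'. A minimal standalone example of
# the pattern (the template text here is hypothetical):

def _demo_type_substitution():
    template = 'MPI_Send(buf, %(NCOMP)s, %(MPI_DTYPE)s, rank, tag, comm);'
    d = {'NCOMP': 3, 'MPI_DTYPE': 'MPI_DOUBLE'}
    return template % d
    # -> 'MPI_Send(buf, 3, MPI_DOUBLE, rank, tag, comm);'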
def _build_norm_linf_lib(dtype):
    """
    Build the L-infinity norm lib for a ParticleDat.
    """
    with open(str(ppmd.cuda.cuda_config.LIB_DIR) +
              '/cudaLInfNormSource.cu', 'r') as fh:
        code = fh.read()
    with open(str(ppmd.cuda.cuda_config.LIB_DIR) +
              '/cudaLInfNormSource.h', 'r') as fh:
        hcode = fh.read()
    assert code is not None, \
        "Failure to read CUDA L-infinity norm code source"

    d = dict()
    d['TYPENAME'] = host.ctypes_map[dtype]

    code = code % d
    hcode = hcode % d

    return cuda_build.simple_lib_creator(
        hcode, code, 'ParticleDat_Linf_Norm')['cudaLInfNorm']
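# --- Illustrative sketch, not part of the module above -----------------------
# A host-side NumPy reference for the quantity an L-infinity norm denotes,
# i.e. the maximum absolute component, useful for checking the generated
# lib (sketch; assumes the dat's values are available as a NumPy array):

def _linf_norm_reference(values):
    import numpy as np
    return np.max(np.abs(values))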
def __init__(self, domain, cell_width, positions):
    self.domain = domain
    boundary = domain.boundary

    assert cell_width > 0, "bad cell width"
    assert boundary[1] > boundary[0], "nonsensical boundary"
    assert boundary[3] > boundary[2], "nonsensical boundary"
    assert boundary[5] > boundary[4], "nonsensical boundary"

    self.positions = positions
    self.cell_array = host.Array(ncomp=3, dtype=ctypes.c_int)
    self.cell_sizes = host.Array(ncomp=3, dtype=ctypes.c_double)

    # get sizes considering the interior only
    cell_array = [0, 0, 0]
    cell_array[0] = int(float(boundary[1] - boundary[0]) / cell_width)
    cell_array[1] = int(float(boundary[3] - boundary[2]) / cell_width)
    cell_array[2] = int(float(boundary[5] - boundary[4]) / cell_width)

    cell_sizes = [0, 0, 0]
    cell_sizes[0] = float(boundary[1] - boundary[0]) / cell_array[0]
    cell_sizes[1] = float(boundary[3] - boundary[2]) / cell_array[1]
    cell_sizes[2] = float(boundary[5] - boundary[4]) / cell_array[2]
    self.cell_sizes[:] = cell_sizes[:]

    # halo padding: enough sub-cell layers to cover one coarse cell edge,
    # plus one extra layer
    padx = int(math.ceil(
        self.domain.cell_edge_lengths[0] / cell_sizes[0])) + 1
    pady = int(math.ceil(
        self.domain.cell_edge_lengths[1] / cell_sizes[1])) + 1
    padz = int(math.ceil(
        self.domain.cell_edge_lengths[2] / cell_sizes[2])) + 1

    rpadx = padx * cell_sizes[0]
    rpady = pady * cell_sizes[1]
    rpadz = padz * cell_sizes[2]

    self.cell_array[0] = cell_array[0] + 2 * padx
    self.cell_array[1] = cell_array[1] + 2 * pady
    self.cell_array[2] = cell_array[2] + 2 * padz

    self.boundary = host.Array(ncomp=6, dtype=ctypes.c_double)
    self.boundary[0] = boundary[0] - rpadx
    self.boundary[1] = boundary[1] + rpadx
    self.boundary[2] = boundary[2] - rpady
    self.boundary[3] = boundary[3] + rpady
    self.boundary[4] = boundary[4] - rpadz
    self.boundary[5] = boundary[5] + rpadz

    self.cell_count = cell_array[0] * cell_array[1] * cell_array[2]

    self.particle_layers = cuda_base.Array(ncomp=1, dtype=ctypes.c_int)
    self.cell_reverse_lookup = cuda_base.Array(ncomp=1, dtype=ctypes.c_int)
    self.cell_contents_count = cuda_base.Array(ncomp=self.cell_count,
                                               dtype=ctypes.c_int)
    self.matrix = cuda_base.Matrix(nrow=self.cell_count, ncol=1,
                                   dtype=ctypes.c_int)
    self.num_layers = 0

    with open(str(ppmd.cuda.cuda_config.LIB_DIR) +
              '/cudaSubCellOccupancyMatrixSource.cu', 'r') as fh:
        _code = fh.read()
    with open(str(ppmd.cuda.cuda_config.LIB_DIR) +
              '/cudaSubCellOccupancyMatrixSource.h', 'r') as fh:
        _header = fh.read()

    _name = 'SubCellOccupancyMatrix'
    lib = cuda_build.simple_lib_creator(_header, _code, _name)
    self._sort_lib = lib['LayerSort']
    self._fill_lib = lib['PopMatrix']

    self.version_id = 0
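# --- Illustrative sketch, not part of the module above -----------------------
# Worked example of the per-axis sizing arithmetic in the constructor above
# (numbers are made up): the interior is cut into the largest number of
# cells at least `cell_width` wide, the cell size is stretched to fit
# exactly, and padding layers are added to cover one coarse domain cell
# edge plus one extra layer.

def _demo_axis_sizing(lo=0.0, hi=10.0, cell_width=0.55, coarse_edge=1.2):
    import math
    n = int(float(hi - lo) / cell_width)           # 18 interior cells
    size = float(hi - lo) / n                      # ~0.556 stretched width
    pad = int(math.ceil(coarse_edge / size)) + 1   # 4 halo layers
    return n + 2 * pad, size                       # (26, ~0.556)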
def _build_1p_halo_lib(self):

    _name = '_1p_halo_lib'

    _hargs = '''const int blocksize[3],
                const int threadsize[3],
                int * h_n_total,
                const int h_n,
                const int h_npc,
                const cuda_Array<int> d_b,
                const cuda_Array<int> d_h,
                const cuda_Array<int> d_bhc_map,
                cuda_Array<int> d_ccc,
                cuda_Matrix<int> d_occ_matrix,
                cuda_ParticleDat<%(TYPE)s> d_dat
                ''' % {'TYPE': host.ctypes_map[self.idtype]}

    _dargs = '''const cuda_Array<int> d_b,
                const cuda_Array<int> d_h,
                const cuda_Array<int> d_bhc_map,
                cuda_Array<int> d_ccc,
                cuda_Matrix<int> d_occ_matrix,
                cuda_ParticleDat<%(TYPE)s> d_dat
                ''' % {'TYPE': host.ctypes_map[self.idtype]}

    _d_call_args = '''d_b, d_h, d_bhc_map, d_ccc, d_occ_matrix, d_dat'''

    # PositionDat halos additionally need periodic-image position shifts
    if type(self) == PositionDat:
        _hargs += ''', const cuda_Array<double> d_shifts'''
        _dargs += ''', const cuda_Array<double> d_shifts'''
        _d_call_args += ''', d_shifts'''

        _shift_code = ''' + d_shifts.ptr[d_bhc_map.ptr[_cx]*3 + _comp]'''
        _occ_code = '''
            d_occ_matrix.ptr[d_npc * d_h.ptr[_cx] + _pi] = hpx;

            // if particle layer is zero write the cell contents count.
            if (_pi == 0){
                d_ccc.ptr[d_h.ptr[_cx]] = d_ccc.ptr[d_b.ptr[_cx]];
            }
        '''
    else:
        _shift_code = ''
        _occ_code = ''

    _header = '''
    #include <cuda_generic.h>
    extern "C" int %(NAME)s(%(HARGS)s);
    ''' % {'NAME': _name, 'HARGS': _hargs}

    _src = '''
    __constant__ int d_n_total;
    __constant__ int d_n;
    __constant__ int d_npc;

    __global__ void d_1p_halo_copy_shift(%(DARGS)s){

        // particle index
        const int idx = (threadIdx.x + blockIdx.x*blockDim.x)/%(NCOMP)s;

        if (idx < d_n){
            // component corresponding to thread.
            const int _comp = (threadIdx.x + blockIdx.x*blockDim.x) %% %(NCOMP)s;

            const int _cx = idx/d_npc;
            const int _bc = d_b.ptr[_cx];    // some boundary cell
            const int _pi = idx %% d_npc;    // particle layer

            if (_pi < d_ccc.ptr[_bc]){ // Do we need this thread to do anything?

                // local index of particle
                const int px = d_occ_matrix.ptr[_bc*d_npc + _pi];

                // halo index of particle
                const int hpx = d_n_total + idx;

                d_dat.ptr[hpx* %(NCOMP)s + _comp] =
                    d_dat.ptr[px * %(NCOMP)s + _comp] %(SHIFT_CODE)s ;

                %(OCC_CODE)s
            }
        }
        return;
    }

    int %(NAME)s(%(HARGS)s){
        checkCudaErrors(cudaMemcpyToSymbol(d_n_total, h_n_total, sizeof(int)));
        checkCudaErrors(cudaMemcpyToSymbol(d_n, &h_n, sizeof(h_n)));
        checkCudaErrors(cudaMemcpyToSymbol(d_npc, &h_npc, sizeof(h_npc)));

        dim3 bs; bs.x = blocksize[0]; bs.y = blocksize[1]; bs.z = blocksize[2];
        dim3 ts; ts.x = threadsize[0]; ts.y = threadsize[1]; ts.z = threadsize[2];

        d_1p_halo_copy_shift<<<bs,ts>>>(%(D_C_ARGS)s);

        checkCudaErrors(cudaDeviceSynchronize());
        getLastCudaError("1proc halo lib Execution failed. \\n");

        return 0;
    }
    ''' % {'NAME': _name,
           'HARGS': _hargs,
           'DARGS': _dargs,
           'D_C_ARGS': _d_call_args,
           'NCOMP': self.ncomp,
           'SHIFT_CODE': _shift_code,
           'OCC_CODE': _occ_code}

    self._1p_halo_lib = cuda_build.simple_lib_creator(
        _header, _src, _name)[_name]
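# --- Illustrative sketch, not part of the module above -----------------------
# Pure-Python mirror of the thread-index arithmetic inside
# d_1p_halo_copy_shift: one CUDA thread handles one (particle, component)
# pair, decomposed from the flat thread id.

def _demo_halo_thread_decompose(tid, ncomp, npc):
    idx = tid // ncomp    # flat (cell, layer) particle index
    comp = tid % ncomp    # vector component handled by this thread
    cx = idx // npc       # boundary-cell slot
    pi = idx % npc        # layer within the cell
    return cx, pi, comp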
def __init__(self, kernel=None, dat_dict=None, shell_cutoff=None,
             sub_divide=None):

    self._dat_dict = access.DatArgStore(
        self._get_allowed_types(), dat_dict)

    self._cc = cuda_build.NVCC
    self._kernel = kernel
    self.shell_cutoff = shell_cutoff

    # sub-cell width, default 5.0 if none passed
    if sub_divide is None:
        rs_default = 5.
    else:
        rs_default = sub_divide
    self.sub_divide_size = rs_default

    self.loop_timer = ppmd.modules.code_timer.LoopTimer()
    self.wrapper_timer = opt.SynchronizedTimer(runtime.TIMER)

    self._components = {'LIB_PAIR_INDEX_0': '_i',
                        'LIB_PAIR_INDEX_1': '_j',
                        'LIB_NAME': str(self._kernel.name) + '_wrapper'}
    self._gather_size_limit = 4
    self._generate()

    self._lib = cuda_build.simple_lib_creator(
        self._generate_header_source(),
        self._components['LIB_SRC'],
        self._kernel.name,
    )[self._components['LIB_NAME']]

    self._group = None
    for pd in self._dat_dict.items():
        if issubclass(type(pd[1][0]), cuda_data.PositionDat):
            self._group = pd[1][0].group
            break

    assert self._group is not None, "No cell to particle map found"

    new_decomp_flag = self._group.domain.cell_decompose(
        self.shell_cutoff
    )
    if new_decomp_flag:
        self._group.get_cell_to_particle_map().create()

    self._key = (self.shell_cutoff,
                 self._group.domain,
                 self._group.get_position_dat())

    _nd = PairLoopCellByCell._cell_lists
    if self._key not in _nd.keys() or new_decomp_flag:
        _nd[self._key] = cuda_cell.SubCellOccupancyMatrix(
            domain=self._group.domain,
            cell_width=self.sub_divide_size,
            positions=self._group.get_position_dat(),
        )
    self.cell_list = _nd[self._key]
    self._cell_list_count = 0
    self._invocations = 0

    # get the offset list
    oslist = cell.convert_offset_tuples(
        cell.radius_cell_decompose(shell_cutoff, self.cell_list.cell_sizes),
        self.cell_list.cell_array,
        remove_zero=True
    )
    self.offset_list = cuda_base.Array(ncomp=len(oslist), dtype=ctypes.c_int)
    self.offset_list[:] = oslist[:]
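# --- Illustrative sketch, not part of the module above -----------------------
# The class-level `_cell_lists` dict above memoises one sub-cell occupancy
# matrix per (cutoff, domain, positions) key, rebuilding only when the key
# is new or the domain was re-decomposed. The bare pattern:

def _get_or_rebuild(cache, key, rebuilt, factory):
    if key not in cache or rebuilt:
        cache[key] = factory()
    return cache[key]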
def __init__(self, kernel=None, dat_dict=None, shell_cutoff=None):

    self._dat_dict = access.DatArgStore(
        self._get_allowed_types(), dat_dict)

    self._cc = cuda_build.NVCC
    self._kernel = kernel
    self.shell_cutoff = shell_cutoff

    self.loop_timer = ppmd.modules.code_timer.LoopTimer()
    self.wrapper_timer = opt.SynchronizedTimer(runtime.TIMER)

    self._components = {'LIB_PAIR_INDEX_0': '_i',
                        'LIB_PAIR_INDEX_1': '_j',
                        'LIB_NAME': str(self._kernel.name) + '_wrapper'}
    self._gather_size_limit = 4
    self._generate()

    self._lib = cuda_build.simple_lib_creator(
        self._generate_header_source(),
        self._components['LIB_SRC'],
        self._kernel.name,
    )[self._components['LIB_NAME']]

    self._group = None
    for pd in self._dat_dict.items():
        if issubclass(type(pd[1][0]), cuda_data.PositionDat):
            self._group = pd[1][0].group
            break

    assert self._group is not None, "No cell to particle map found"

    new_decomp_flag = self._group.domain.cell_decompose(
        self.shell_cutoff
    )
    if new_decomp_flag:
        self._group.get_cell_to_particle_map().create()

    self._key = (self.shell_cutoff,
                 self._group.domain,
                 self._group.get_position_dat())

    _nd = PairLoopNeighbourListNSSplit._neighbour_list_dict_PNLNS_split
    if self._key not in _nd.keys() or new_decomp_flag:
        _nd[self._key] = cuda_cell.NeighbourListLayerSplit(
            occ_matrix=self._group.get_cell_to_particle_map(),
            cutoff=self.shell_cutoff
        )
    self.neighbour_list = _nd[self._key]

    self._neighbourlist_count = 0
    self._invocations = 0
def __init__(self, state, particle_dat_names):

    self._state = state
    self._names = particle_dat_names

    # launch template substituted once per ParticleDat; bs and ts are
    # declared in the wrapping function below
    call_template = '''
    bs.x = %(NCOMP)s * blocksize[0];
    bs.y = blocksize[1];
    bs.z = blocksize[2];
    ts.x = threadsize[0];
    ts.y = threadsize[1];
    ts.z = threadsize[2];
    compression_kernel<%(DTYPE)s><<<bs,ts>>>(%(PTR_NAME)s, %(NCOMP)s);
    '''

    extra_params = ''
    kernel_calls = '''
    '''

    for ix in self._names:
        dat = getattr(self._state, ix)
        sdtype = ppmd.host.ctypes_map[dat.dtype]
        extra_params += ', ' + sdtype + '* ' + ix
        kernel_calls += call_template % {
            'DTYPE': sdtype,
            'PTR_NAME': ix,
            'NCOMP': dat.ncomp
        }

    name = 'compression_lib'

    header_code = '''
    #include "cuda_generic.h"

    __constant__ int d_n_empty;
    __constant__ int* d_e_slots;
    __constant__ int* d_r_slots;

    template <typename T>
    __global__ void compression_kernel(T* __restrict__ d_ptr,
                                       const int ncomp){
        const int ix = threadIdx.x + blockIdx.x*blockDim.x;

        if (ix < d_n_empty * ncomp){
            const int sx = ix / ncomp;
            const int comp = ix %% ncomp;
            const int eslot = d_e_slots[sx];
            const int rslot = d_r_slots[sx];
            d_ptr[eslot*ncomp + comp] = d_ptr[rslot*ncomp + comp];
        }
        return;
    }

    extern "C" int compression_lib(const int blocksize[3],
                                   const int threadsize[3],
                                   const int h_n_empty,
                                   const int* d_e_slots_p,
                                   const int* d_r_slots_p
                                   %(EXTRA_PARAMS)s
                                   );
    ''' % {'EXTRA_PARAMS': extra_params}

    src_code = '''
    int compression_lib(const int blocksize[3],
                        const int threadsize[3],
                        const int h_n_empty,
                        const int* d_e_slots_p,
                        const int* d_r_slots_p
                        %(EXTRA_PARAMS)s
                        ){
        dim3 bs;
        dim3 ts;

        checkCudaErrors(cudaMemcpyToSymbol(d_n_empty, &h_n_empty, sizeof(int)));
        checkCudaErrors(cudaMemcpyToSymbol(d_e_slots, &d_e_slots_p, sizeof(int*)));
        checkCudaErrors(cudaMemcpyToSymbol(d_r_slots, &d_r_slots_p, sizeof(int*)));

        %(KERNEL_CALLS)s

        return (int) cudaDeviceSynchronize();
    }
    ''' % {'KERNEL_CALLS': kernel_calls, 'EXTRA_PARAMS': extra_params}

    self._lib = cuda_build.simple_lib_creator(header_code, src_code, name)[name]
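# --- Illustrative sketch, not part of the module above -----------------------
# Pure-Python mirror of what the generated compression_kernel does: for
# each empty slot, copy every component of the particle in the
# corresponding replacement slot into the hole, compacting the dat in place.

def _demo_compress(data, ncomp, e_slots, r_slots):
    for eslot, rslot in zip(e_slots, r_slots):
        for comp in range(ncomp):
            data[eslot * ncomp + comp] = data[rslot * ncomp + comp]
    return data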