def __init__(self, kernel=None, dat_dict=None, shell_cutoff=None):
    # validate and store the kernel data arguments
    self._dat_dict = access.DatArgStore(self._get_allowed_types(), dat_dict)
    self._cc = build.TMPCC
    self._kernel = kernel
    self.shell_cutoff = shell_cutoff

    self.loop_timer = modules.code_timer.LoopTimer()
    self.wrapper_timer = opt.Timer(runtime.TIMER)
    self.list_timer = opt.Timer(runtime.TIMER)

    self._gather_space = host.ThreadSpace(100, ctypes.c_uint8)
    self._generate()

    # linearised offsets of the 3x3x3 neighbour-cell stencil
    self._offset_list = host.Array(ncomp=27, dtype=ctypes.c_int)

    self._lib = build.simple_lib_creator(
        self._generate_header_source(),
        self._components['LIB_SRC'],
        self._kernel.name,
        CC=self._cc
    )

    # locate the owning group via any PositionDat argument
    self._group = None
    for pd in self._dat_dict.items():
        if issubclass(type(pd[1][0]), data.PositionDat):
            self._group = pd[1][0].group
            break

    #assert self._group is not None, "No cell to particle map found"
    if self._group is not None:
        self._make_cell_list(self._group)

    self._kernel_execution_count = INT64(0)
    self._invocations = 0

    # per-thread temporary index storage
    self._jstore = [host.Array(ncomp=100, dtype=ctypes.c_int)
                    for tx in range(runtime.NUM_THREADS)]
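
# Illustrative sketch (not part of the original source): the 27-component
# offset list above conventionally holds the linearised offsets of a
# 3x3x3 neighbour-cell stencil. Assuming a row-major cell grid of extent
# (ncx, ncy, ncz) (hypothetical names used only in this example), the
# offsets could be built as:
def _example_cell_offsets(ncx, ncy, ncz):
    return [iz * ncx * ncy + iy * ncx + ix
            for iz in (-1, 0, 1)
            for iy in (-1, 0, 1)
            for ix in (-1, 0, 1)]

# e.g. _example_cell_offsets(10, 10, 10)[13] == 0, the centre cell itself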
def __init__(self, dtype, tree, nlevel, a_arr, ar_arr, p_arr, e_arr,
             int_list, int_tlookup, int_plookup, int_radius, ipower_mtl,
             wigner_f, wigner_b, arn0):

    self.tree = tree
    self.L = nlevel
    ncomp = (self.L**2) * 2

    self.tree_plain = OctalCudaDataTree(tree=tree, mode='plain',
                                        dtype=dtype, ncomp=ncomp)
    self.tree_halo = OctalCudaDataTree(tree=tree, mode='halo',
                                       dtype=dtype, ncomp=ncomp)

    self._d_a = cuda_base.gpuarray.to_gpu(a_arr)
    self._d_ar = cuda_base.gpuarray.to_gpu(ar_arr)
    self._d_p = cuda_base.gpuarray.to_gpu(p_arr)
    self._d_e = cuda_base.gpuarray.to_gpu(e_arr)

    self._int_list = []
    for lx in int_list:
        if lx is not None:
            ne = cuda_base.gpuarray.to_gpu(lx)
        else:
            ne = None
        self._int_list.append(ne)

    self._d_int_tlookup = cuda_base.gpuarray.to_gpu(int_tlookup)
    self._d_int_plookup = cuda_base.gpuarray.to_gpu(int_plookup)
    self._d_int_radius = cuda_base.gpuarray.to_gpu(int_radius)
    self._ipower_mtl = cuda_base.gpuarray.to_gpu(ipower_mtl)

    # lookup tables mapping a flat coefficient index to (degree j, order k)
    jlookup = np.zeros(ncomp, dtype=INT64)
    klookup = np.zeros(ncomp, dtype=INT64)
    ind = 0
    for jx in range(nlevel):
        for kx in range(-1 * jx, jx + 1):
            jlookup[ind] = jx
            klookup[ind] = kx
            ind += 1

    self._jlookup = cuda_base.gpuarray.to_gpu(jlookup)
    self._klookup = cuda_base.gpuarray.to_gpu(klookup)

    # need tmp space to rotate moments
    self.tmp_plain0 = OctalCudaDataTree(tree=tree, mode='plain',
                                        dtype=dtype, ncomp=ncomp)
    self.tmp_plain1 = OctalCudaDataTree(tree=tree, mode='plain',
                                        dtype=dtype, ncomp=ncomp)

    self._wigner_real = np.zeros((7, 7, 7), dtype=ctypes.c_void_p)
    self._wigner_imag = np.zeros((7, 7, 7), dtype=ctypes.c_void_p)
    self._wigner_b_real = np.zeros((7, 7, 7), dtype=ctypes.c_void_p)
    self._wigner_b_imag = np.zeros((7, 7, 7), dtype=ctypes.c_void_p)

    self._dev_matrices = []
    self._dev_pointers = []

    def ffs_numpy_swap_memory(arr):
        # element-wise transpose of a square matrix, returned as a new
        # C-contiguous array
        out = np.zeros_like(arr)
        for ix in range(arr.shape[0]):
            for iy in range(arr.shape[1]):
                out[ix, iy] = arr[iy, ix]
        return out

    # convert host rotation matrices to device matrices
    for iz, pz in enumerate(range(-3, 4)):
        for iy, py in enumerate(range(-3, 4)):
            for ix, px in enumerate(range(-3, 4)):

                f = wigner_f[(pz, py, px)]

                # forward real
                pa = np.zeros(nlevel, dtype=ctypes.c_void_p)
                for p in range(nlevel):
                    o = ffs_numpy_swap_memory(f['real'][p])
                    nn = cuda_base.gpuarray.to_gpu(o)
                    self._dev_matrices.append(nn)
                    pa[p] = self._dev_matrices[-1].ptr
                # need array of pointers on gpu
                pa = cuda_base.gpuarray.to_gpu(pa)
                self._dev_pointers.append(pa)
                self._wigner_real[iz, iy, ix] = self._dev_pointers[-1].ptr

                # forward imag
                pa = np.zeros(nlevel, dtype=ctypes.c_void_p)
                for p in range(nlevel):
                    o = ffs_numpy_swap_memory(f['imag'][p])
                    nn = cuda_base.gpuarray.to_gpu(o)
                    self._dev_matrices.append(nn)
                    pa[p] = self._dev_matrices[-1].ptr
                # need array of pointers on gpu
                pa = cuda_base.gpuarray.to_gpu(pa)
                self._dev_pointers.append(pa)
                self._wigner_imag[iz, iy, ix] = self._dev_pointers[-1].ptr

                f = None

                b = wigner_b[(pz, py, px)]

                # backward real
                pa = np.zeros(nlevel, dtype=ctypes.c_void_p)
                for p in range(nlevel):
                    o = ffs_numpy_swap_memory(b['real'][p])
                    nn = cuda_base.gpuarray.to_gpu(o)
                    self._dev_matrices.append(nn)
                    pa[p] = self._dev_matrices[-1].ptr
                # need array of pointers on gpu
                pa = cuda_base.gpuarray.to_gpu(pa)
                self._dev_pointers.append(pa)
                self._wigner_b_real[iz, iy, ix] = self._dev_pointers[-1].ptr

                # backward imag
                pa = np.zeros(nlevel, dtype=ctypes.c_void_p)
                for p in range(nlevel):
                    o = ffs_numpy_swap_memory(b['imag'][p])
                    nn = cuda_base.gpuarray.to_gpu(o)
                    self._dev_matrices.append(nn)
                    pa[p] = self._dev_matrices[-1].ptr
                # need array of pointers on gpu
                pa = cuda_base.gpuarray.to_gpu(pa)
                self._dev_pointers.append(pa)
                self._wigner_b_imag[iz, iy, ix] = self._dev_pointers[-1].ptr

    # pointers to pointers on device
    self._wigner_real = cuda_base.gpuarray.to_gpu(self._wigner_real)
    self._wigner_imag = cuda_base.gpuarray.to_gpu(self._wigner_imag)
    self._wigner_b_real = cuda_base.gpuarray.to_gpu(self._wigner_b_real)
    self._wigner_b_imag = cuda_base.gpuarray.to_gpu(self._wigner_b_imag)

    self._arn0 = cuda_base.gpuarray.to_gpu(arn0)

    # load the multipole-to-local (MTL) translation lib
    with open(str(_SRC_DIR) + '/FMMSource/CudaTranslateMTLZ.cu') as fh:
        cpp = fh.read()
    with open(str(_SRC_DIR) + '/FMMSource/CudaTranslateMTLZ.h') as fh:
        hpp = fh.read()

    self._translate_mtl_lib = cuda_build.simple_lib_creator(
        hpp, cpp, 'fmm_translate_mtl')

    self.timer_mtl = opt.Timer(runtime.TIMER)
    self._lock = Lock()
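
# Illustrative sketch (not part of the original source). For square arrays
# the element-wise swap performed by ffs_numpy_swap_memory above is a
# transpose materialised as a fresh C-contiguous array; numpy expresses
# the same operation directly. This hypothetical standalone equivalent is
# included only to document the intent:
def _example_swap_memory(arr):
    import numpy as np
    # transpose, then force a C-contiguous copy in one step
    return np.ascontiguousarray(arr.T)

# e.g. for any square array a: (_example_swap_memory(a) == a.T).all()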
def __init__(self, width, domain, entry_data, entry_map, free_space,
             dtype, force_unit, energy_unit):

    self.width = width
    self.domain = domain
    self.entry_data = entry_data
    self.entry_map = entry_map
    self.free_space = free_space
    self.dtype = dtype

    self.sh = pairloop.state_handler.StateHandler(state=None,
                                                  shell_cutoff=width)

    with open(str(_SRC_DIR) + '/FMMSource/CudaLocalCells.cu') as fh:
        cpp = fh.read()
    with open(str(_SRC_DIR) + '/FMMSource/CudaLocalCells.h') as fh:
        hpp = fh.read()

    hpp = hpp % {
        'SUB_FORCE_UNIT': str(force_unit),
        'SUB_ENERGY_UNIT': str(energy_unit)
    }

    self._lib = cuda_build.simple_lib_creator(hpp, cpp, 'fmm_local')
    self._lib0 = self._lib['local_cell_by_cell_0']
    self._lib1 = self._lib['local_cell_by_cell_1']
    self._lib2 = self._lib['local_cell_by_cell_2']
    #print("CUDA LOCAL BUILT")

    self._global_size = np.zeros(3, dtype=INT64)
    self._global_size[:] = entry_map.cube_side_count

    self._ncells = (self._global_size[0] + 6) * \
                   (self._global_size[1] + 6) * \
                   (self._global_size[2] + 6)

    self._local_size = np.zeros(3, dtype=INT64)
    self._local_size[:] = self.entry_data.local_size[:]

    self._local_offset = np.zeros(3, dtype=INT64)
    self._local_offset[:] = self.entry_data.local_offset[:]

    self._u = np.zeros(1, dtype=self.dtype)
    self.last_u = 0.0

    self._ll_array = np.zeros(1, dtype=INT64)
    self._ll_ccc_array = np.zeros(self._ncells, dtype=INT64)
    self.d_ll_ccc_array = cuda_base.gpuarray.GPUArray(
        shape=self._ll_ccc_array.shape, dtype=INT64)

    self._ntotal = 100000
    self.d_positions = cuda_base.gpuarray.GPUArray(
        shape=(self._ntotal, 3), dtype=REAL)
    self.d_charges = cuda_base.gpuarray.GPUArray(
        shape=(self._ntotal, 1), dtype=REAL)
    self.d_forces = cuda_base.gpuarray.GPUArray(
        shape=(self._ntotal, 3), dtype=REAL)
    self.d_potential_array = cuda_base.gpuarray.GPUArray(
        shape=(self._ntotal, 1), dtype=REAL)
    self.h_forces = np.zeros(shape=(self._ntotal, 3), dtype=REAL)
    self.h_potential_array = np.zeros(shape=(self._ntotal, 1), dtype=REAL)

    self.exec_count = 0
    self.timer0 = opt.Timer(runtime.TIMER)
    self.timer1 = opt.Timer(runtime.TIMER)
    self.timer2 = opt.Timer(runtime.TIMER)
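
# Illustrative sketch (not part of the original source): _ncells above
# counts cells in the global grid padded by six cells per dimension. If
# that padding is split as three halo cells on each face (an assumption of
# this example), a row-major linear index into the padded grid would be:
def _example_padded_cell_index(cx, cy, cz, global_size):
    # cx, cy, cz are unpadded cell coordinates; +3 shifts past the halo
    nx = global_size[0] + 6
    ny = global_size[1] + 6
    return (cz + 3) * nx * ny + (cy + 3) * nx + (cx + 3)

# e.g. _example_padded_cell_index(0, 0, 0, (4, 4, 4)) == 333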
def _load(filename):
    try:
        lib = ctypes.cdll.LoadLibrary(str(filename))
        # record the loaded library (path without the 3-character extension)
        LOADED_LIBS.append(str(filename[:-3]))
        return lib
    except Exception as e:
        print("build:load error. Could not load the following library:",
              str(filename))
        ppmd.abort(e)


def _check_path_exists(abs_path):
    return os.path.exists(abs_path)


_load_timer = opt.Timer()


def simple_lib_creator(header_code, src_code, name='',
                       extensions=('.h', '.cpp'), dst_dir=None, CC=TMPCC,
                       prefix='HOST', inc_dirs=(runtime.LIB_DIR, )):
    if dst_dir is None:
        dst_dir = ppmd.runtime.BUILD_DIR
    # make build dir
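
# Illustrative sketch (not part of the original source): _load above is a
# thin wrapper around ctypes. A minimal standalone use of the same
# mechanism, assuming a typical Linux system where the C maths library is
# named 'libm.so.6' (the name is platform dependent):
#
#   import ctypes
#   libm = ctypes.cdll.LoadLibrary('libm.so.6')
#   libm.cos.restype = ctypes.c_double
#   libm.cos.argtypes = [ctypes.c_double]
#   assert abs(libm.cos(0.0) - 1.0) < 1e-12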