Example 1
    def __init__(self, kernel=None, dat_dict=None, shell_cutoff=None):

        self._dat_dict = access.DatArgStore(self._get_allowed_types(),
                                            dat_dict)

        self._cc = build.TMPCC
        self._kernel = kernel
        self.shell_cutoff = shell_cutoff

        self.loop_timer = modules.code_timer.LoopTimer()
        self.wrapper_timer = opt.Timer(runtime.TIMER)
        self.list_timer = opt.Timer(runtime.TIMER)

        self._gather_space = host.ThreadSpace(100, ctypes.c_uint8)
        self._generate()

        self._offset_list = host.Array(ncomp=27, dtype=ctypes.c_int)

        self._lib = build.simple_lib_creator(self._generate_header_source(),
                                             self._components['LIB_SRC'],
                                             self._kernel.name,
                                             CC=self._cc)
        self._group = None

        for pd in self._dat_dict.items():
            if isinstance(pd[1][0], data.PositionDat):
                self._group = pd[1][0].group
                break

        # the PositionDat's group provides the cell-to-particle map; the
        # cell list is only built when such a group was found
        if self._group is not None:
            self._make_cell_list(self._group)

        self._kernel_execution_count = INT64(0)
        self._invocations = 0

        self._jstore = [host.Array(ncomp=100, dtype=ctypes.c_int)
                        for tx in range(runtime.NUM_THREADS)]
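
The ncomp=27 array above holds the linearised offsets of the 3 x 3 x 3 cell stencil used by the pair loop. Below is a minimal, self-contained sketch of how such offsets can be computed; the grid extents and the x-fastest ordering are assumptions for illustration, not taken from the library:

import itertools
import numpy as np

def cell_offsets(nx, ny):
    # linearised offsets of the 27 cells in a 3x3x3 neighbourhood, for a
    # padded grid with x varying fastest and row extents nx, ny (assumed)
    return np.array(
        [ox + nx * (oy + ny * oz)
         for oz, oy, ox in itertools.product((-1, 0, 1), repeat=3)],
        dtype=np.int32)

assert cell_offsets(10, 10).size == 27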
Example 2
    def __init__(self, dtype, tree, nlevel, a_arr, ar_arr, p_arr, e_arr,
                 int_list, int_tlookup, int_plookup, int_radius, ipower_mtl,
                 wigner_f, wigner_b, arn0):
        self.tree = tree
        self.L = nlevel
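        # each tree node stores L**2 complex expansion coefficients,
        # i.e. 2 * L**2 real components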
        ncomp = (self.L**2) * 2
        self.tree_plain = OctalCudaDataTree(tree=tree,
                                            mode='plain',
                                            dtype=dtype,
                                            ncomp=ncomp)
        self.tree_halo = OctalCudaDataTree(tree=tree,
                                           mode='halo',
                                           dtype=dtype,
                                           ncomp=ncomp)

        self._d_a = cuda_base.gpuarray.to_gpu(a_arr)
        self._d_ar = cuda_base.gpuarray.to_gpu(ar_arr)
        self._d_p = cuda_base.gpuarray.to_gpu(p_arr)
        self._d_e = cuda_base.gpuarray.to_gpu(e_arr)

        self._int_list = []
        for lx in int_list:
            if lx is not None:
                ne = cuda_base.gpuarray.to_gpu(lx)
            else:
                ne = None
            self._int_list.append(ne)

        self._d_int_tlookup = cuda_base.gpuarray.to_gpu(int_tlookup)
        self._d_int_plookup = cuda_base.gpuarray.to_gpu(int_plookup)
        self._d_int_radius = cuda_base.gpuarray.to_gpu(int_radius)

        self._ipower_mtl = cuda_base.gpuarray.to_gpu(ipower_mtl)

        jlookup = np.zeros(ncomp, dtype=INT64)
        klookup = np.zeros(ncomp, dtype=INT64)

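        # enumerate the expansion indices (j, k), with k = -j, ..., j for
        # each level j; nlevel levels give nlevel**2 pairs in total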
        ind = 0
        for jx in range(nlevel):
            for kx in range(-1 * jx, jx + 1):
                jlookup[ind] = jx
                klookup[ind] = kx
                ind += 1

        self._jlookup = cuda_base.gpuarray.to_gpu(jlookup)
        self._klookup = cuda_base.gpuarray.to_gpu(klookup)

        # need tmp space to rotate moments
        self.tmp_plain0 = OctalCudaDataTree(tree=tree,
                                            mode='plain',
                                            dtype=dtype,
                                            ncomp=ncomp)
        self.tmp_plain1 = OctalCudaDataTree(tree=tree,
                                            mode='plain',
                                            dtype=dtype,
                                            ncomp=ncomp)

        self._wigner_real = np.zeros((7, 7, 7), dtype=ctypes.c_void_p)
        self._wigner_imag = np.zeros((7, 7, 7), dtype=ctypes.c_void_p)

        self._wigner_b_real = np.zeros((7, 7, 7), dtype=ctypes.c_void_p)
        self._wigner_b_imag = np.zeros((7, 7, 7), dtype=ctypes.c_void_p)

        self._dev_matrices = []
        self._dev_pointers = []

        def ffs_numpy_swap_memory(arr):
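            # element-wise transpose copy; for the 2D matrices used here
            # this is equivalent to arr.T.copy(), yielding a C-contiguous
            # result before the upload to the device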
            out = np.zeros_like(arr)
            for ix in range(arr.shape[0]):
                for iy in range(arr.shape[1]):
                    out[ix, iy] = arr[iy, ix]
            return out

        def _upload_pointer_table(mats):
            # upload one matrix per level, keeping a reference so the
            # device allocation stays alive, then upload the array of
            # device pointers itself and return its device address
            pa = np.zeros(nlevel, dtype=ctypes.c_void_p)
            for p in range(nlevel):
                o = ffs_numpy_swap_memory(mats[p])
                nn = cuda_base.gpuarray.to_gpu(o)
                self._dev_matrices.append(nn)
                pa[p] = self._dev_matrices[-1].ptr
            pa = cuda_base.gpuarray.to_gpu(pa)
            self._dev_pointers.append(pa)
            return self._dev_pointers[-1].ptr

        # convert the host rotation matrices to device pointer tables:
        # forward and backward, real and imaginary parts, for each of the
        # 7 x 7 x 7 translation directions
        for iz, pz in enumerate(range(-3, 4)):
            for iy, py in enumerate(range(-3, 4)):
                for ix, px in enumerate(range(-3, 4)):
                    f = wigner_f[(pz, py, px)]
                    b = wigner_b[(pz, py, px)]
                    self._wigner_real[iz, iy, ix] = \
                        _upload_pointer_table(f['real'])
                    self._wigner_imag[iz, iy, ix] = \
                        _upload_pointer_table(f['imag'])
                    self._wigner_b_real[iz, iy, ix] = \
                        _upload_pointer_table(b['real'])
                    self._wigner_b_imag[iz, iy, ix] = \
                        _upload_pointer_table(b['imag'])

        # pointers to pointers on device
        self._wigner_real = cuda_base.gpuarray.to_gpu(self._wigner_real)
        self._wigner_imag = cuda_base.gpuarray.to_gpu(self._wigner_imag)
        self._wigner_b_real = cuda_base.gpuarray.to_gpu(self._wigner_b_real)
        self._wigner_b_imag = cuda_base.gpuarray.to_gpu(self._wigner_b_imag)

        self._arn0 = cuda_base.gpuarray.to_gpu(arn0)

        # build the multipole-to-local (MTL) translation library from source
        with open(str(_SRC_DIR) + '/FMMSource/CudaTranslateMTLZ.cu') as fh:
            cpp = fh.read()
        with open(str(_SRC_DIR) + '/FMMSource/CudaTranslateMTLZ.h') as fh:
            hpp = fh.read()
        self._translate_mtl_lib = cuda_build.simple_lib_creator(
            hpp, cpp, 'fmm_translate_mtl')

        self.timer_mtl = opt.Timer(runtime.TIMER)

        self._lock = Lock()
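
The constructor above builds device pointer tables: each per-level matrix is uploaded separately, the resulting device addresses are gathered into a host array, and that array is itself uploaded so a kernel can consume it as a pointer-to-pointer. A minimal sketch of the pattern, assuming pycuda is what backs cuda_base.gpuarray here (an assumption):

import numpy as np
import pycuda.autoinit  # noqa: F401  creates a CUDA context
import pycuda.gpuarray as gpuarray

mats = [np.random.rand(4, 4) for _ in range(3)]
dev_mats = [gpuarray.to_gpu(m) for m in mats]  # keep references alive
# gather the raw device addresses, then upload the table itself; a kernel
# receives d_table's address and dereferences twice (e.g. as a double**)
table = np.array([d.ptr for d in dev_mats], dtype=np.uintp)
d_table = gpuarray.to_gpu(table)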
Example 3
    def __init__(self, width, domain, entry_data, entry_map, free_space, dtype,
                 force_unit, energy_unit):

        self.width = width
        self.domain = domain
        self.entry_data = entry_data
        self.entry_map = entry_map
        self.free_space = free_space
        self.dtype = dtype

        self.sh = pairloop.state_handler.StateHandler(state=None,
                                                      shell_cutoff=width)

        with open(str(_SRC_DIR) + '/FMMSource/CudaLocalCells.cu') as fh:
            cpp = fh.read()
        with open(str(_SRC_DIR) + '/FMMSource/CudaLocalCells.h') as fh:
            hpp = fh.read()

        hpp = hpp % {
            'SUB_FORCE_UNIT': str(force_unit),
            'SUB_ENERGY_UNIT': str(energy_unit)
        }

        self._lib = cuda_build.simple_lib_creator(hpp, cpp, 'fmm_local')
        self._lib0 = self._lib['local_cell_by_cell_0']
        self._lib1 = self._lib['local_cell_by_cell_1']
        self._lib2 = self._lib['local_cell_by_cell_2']

        self._global_size = np.zeros(3, dtype=INT64)
        self._global_size[:] = entry_map.cube_side_count

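        # pad each dimension by 6 cells (presumably three halo layers per
        # side) so neighbour interactions can reach outside the owned cells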
        self._ncells = ((self._global_size[0] + 6) *
                        (self._global_size[1] + 6) *
                        (self._global_size[2] + 6))

        self._local_size = np.zeros(3, dtype=INT64)
        self._local_size[:] = self.entry_data.local_size[:]

        self._local_offset = np.zeros(3, dtype=INT64)
        self._local_offset[:] = self.entry_data.local_offset[:]
        self._u = np.zeros(1, dtype=self.dtype)
        self.last_u = 0.0

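        # linked-list cell structures; "ccc" presumably abbreviates
        # "cell contents count", one entry per (padded) cell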
        self._ll_array = np.zeros(1, dtype=INT64)
        self._ll_ccc_array = np.zeros(self._ncells, dtype=INT64)
        self.d_ll_ccc_array = cuda_base.gpuarray.GPUArray(
            shape=self._ll_ccc_array.shape, dtype=INT64)

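        # initial per-particle device buffer capacity; presumably resized
        # elsewhere if more than this many particles are supplied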
        self._ntotal = 100000
        self.d_positions = cuda_base.gpuarray.GPUArray(shape=(self._ntotal, 3),
                                                       dtype=REAL)
        self.d_charges = cuda_base.gpuarray.GPUArray(shape=(self._ntotal, 1),
                                                     dtype=REAL)
        self.d_forces = cuda_base.gpuarray.GPUArray(shape=(self._ntotal, 3),
                                                    dtype=REAL)
        self.d_potential_array = cuda_base.gpuarray.GPUArray(
            shape=(self._ntotal, 1), dtype=REAL)

        self.h_forces = np.zeros(shape=(self._ntotal, 3), dtype=REAL)
        self.h_potential_array = np.zeros(shape=(self._ntotal, 1), dtype=REAL)

        self.exec_count = 0

        self.timer0 = opt.Timer(runtime.TIMER)
        self.timer1 = opt.Timer(runtime.TIMER)
        self.timer2 = opt.Timer(runtime.TIMER)
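
The header templating above bakes the force and energy unit conversion constants into the CUDA source before it is compiled. A minimal sketch of the same %-substitution technique; the macro names below are illustrative, not the library's:

template = """
#define FORCE_UNIT (%(SUB_FORCE_UNIT)s)
#define ENERGY_UNIT (%(SUB_ENERGY_UNIT)s)
"""
header = template % {'SUB_FORCE_UNIT': '1.0', 'SUB_ENERGY_UNIT': '0.5'}
assert '#define FORCE_UNIT (1.0)' in header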
Example 4
def _load(filename):
    try:
        lib = ctypes.cdll.LoadLibrary(str(filename))
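        # record the library path minus its suffix (assuming a
        # three-character extension such as ".so")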
        LOADED_LIBS.append(str(filename)[:-3])
        return lib
    except Exception as e:
        print("build:load error. Could not load following library,", \
            str(filename))
        ppmd.abort(e)


def _check_path_exists(abs_path):
    return os.path.exists(abs_path)


_load_timer = opt.Timer()


def simple_lib_creator(header_code,
                       src_code,
                       name='',
                       extensions=('.h', '.cpp'),
                       dst_dir=None,
                       CC=TMPCC,
                       prefix='HOST',
                       inc_dirs=(runtime.LIB_DIR, )):

    if dst_dir is None:
        dst_dir = ppmd.runtime.BUILD_DIR

    # make build dir
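
For reference, a self-contained sketch of the ctypes pattern that _load wraps: load a shared library and declare a function's signature before calling it (uses libm and assumes a POSIX system):

import ctypes
import ctypes.util

libm = ctypes.cdll.LoadLibrary(ctypes.util.find_library('m'))
libm.cos.restype = ctypes.c_double   # declare return and argument types
libm.cos.argtypes = (ctypes.c_double,)
assert libm.cos(0.0) == 1.0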