def allocate_buffered_data_arrays(self, **kwargs):
    """
    Allocate registered (pinned) host arrays for the lightcurve data.

    The array length comes from the ``n0`` keyword (default ``self.n0``);
    when ``self.buffered_transfer`` is truthy it comes from the
    ``n0_buffer`` keyword (default ``self.n0_buffer``) instead.

    ``self.t`` and ``self.y`` are always created.  ``self.dy``,
    ``self.mag_bwf`` and ``self.mag_bin_fracs`` are created only when
    ``self.weighted``, ``self.balanced_magbins`` and
    ``self.compute_log_prob`` are truthy, respectively.

    Returns ``self``.
    """
    length = kwargs.get('n0', self.n0)
    if self.buffered_transfer:
        length = kwargs.get('n0_buffer', self.n0_buffer)
    assert (length is not None)

    page_size = resource.getpagesize()
    real_opts = dict(dtype=self.real_type, alignment=page_size)

    def pinned(shape, **opts):
        # Page-aligned zeros, then registered with CUDA as host memory.
        return cuda.register_host_memory(
            cuda.aligned_zeros(shape=shape, **opts))

    self.t = pinned((length, ), **real_opts)
    # y has its own dtype (self.ytype) rather than self.real_type.
    self.y = pinned((length, ), dtype=self.ytype, alignment=page_size)

    if self.weighted:
        self.dy = pinned((length, ), **real_opts)

    if self.balanced_magbins:
        self.mag_bwf = pinned((self.mag_bins, ), **real_opts)

    if self.compute_log_prob:
        self.mag_bin_fracs = pinned((self.mag_bins, ), **real_opts)

    return self
def allocate_buffered_data_arrays(self, **kwargs):
    """
    Allocate pinned host memory for lightcurves so that this container
    can be reused.

    The array length comes from the ``n0`` keyword (default ``self.n0``),
    or from ``n0_buffer`` (default ``self.n0_buffer``) when
    ``self.buffered_transfer`` is truthy.  Creates ``self.t``,
    ``self.yw`` and ``self.w`` with dtype ``self.real_type`` and
    returns ``self``.
    """
    length = kwargs.get('n0', self.n0)
    if self.buffered_transfer:
        length = kwargs.get('n0_buffer', self.n0_buffer)
    assert (length is not None)

    page_size = resource.getpagesize()
    for attr in ('t', 'yw', 'w'):
        host = cuda.aligned_zeros(shape=(length, ),
                                  dtype=self.real_type,
                                  alignment=page_size)
        setattr(self, attr, cuda.register_host_memory(host))

    return self
def initialize(num_points):
    """
    Build the initial moment state on a grid of ``num_points`` points.

    Relies on the module-level names ``num_moments``, ``num_coords``,
    ``num_nodes``, ``indices``, ``jet_initialize_moments`` and
    ``projection``.

    Returns a tuple ``(state, grid_spacing)`` where ``state`` has shape
    ``(num_moments, num_points)`` and ``grid_spacing`` is
    ``1 / (num_points - 2)``.
    """
    # Note: it is important to keep the parallelizable (largest) index
    # on the innermost dimension.
    state = cuda.aligned_zeros((num_moments, num_points), dtype=np.float32)

    wts_left, wts_right, xi_left, xi_right = jet_initialize_moments(
        num_coords, num_nodes)

    n_pt = num_points - 2
    grid_spacing = 1 / n_pt
    disc_loc = 0.125
    disc_idx = int(n_pt * disc_loc) - 2
    print('Dislocation index is ', disc_idx, ' out of ', n_pt, ' points')

    # Project the left/right quadrature onto moment vectors.
    moments_left = projection(wts_left, xi_left, indices,
                              num_coords, num_nodes)
    moments_right = projection(wts_right, xi_right, indices,
                               num_coords, num_nodes)
    left_row = np.asarray([moments_left])
    right_row = np.asarray([moments_right])

    # Populate state: left moments in the first disc_idx columns, right
    # moments in the last disc_idx columns, then overwrite the two
    # boundary columns with the opposite side's moments.
    state[:, :disc_idx] = left_row.T
    state[:, -disc_idx:] = right_row.T
    state[:, 0] = right_row
    state[:, -1] = left_row

    return state, grid_spacing
def allocate(self, data):
    """
    Allocate host and device buffers for each ``(t, y, w, freqs)`` entry.

    Creates extra streams through ``self._create_streams`` when ``data``
    has more entries than ``self.streams``.

    Returns ``(gpu_data, pow_cpus)``: ``gpu_data`` is a list of
    ``(t_g, y_g, w_g, freqs_g, pow_g)`` tuples and ``pow_cpus`` is the
    list of registered host output arrays, one per entry.  ``t_g``,
    ``y_g`` and ``w_g`` are ``None`` for entries whose ``t`` is empty.
    """
    shortfall = len(data) - len(self.streams)
    if shortfall > 0:
        self._create_streams(shortfall)

    gpu_data = []
    pow_cpus = []
    page_size = resource.getpagesize()

    for t, _y, _w, freqs in data:
        # Host-side result buffer, one float32 per frequency.
        pow_cpu = cuda.register_host_memory(
            cuda.aligned_zeros(shape=(len(freqs), ),
                               dtype=np.float32,
                               alignment=page_size))

        if len(t) > 0:
            t_g, y_g, w_g = (gpuarray.zeros(len(t), dtype=np.float32)
                             for _ in range(3))
        else:
            t_g = y_g = w_g = None

        pow_g = gpuarray.zeros(len(pow_cpu), dtype=pow_cpu.dtype)
        freqs_g = gpuarray.to_gpu(np.asarray(freqs).astype(np.float32))

        gpu_data.append((t_g, y_g, w_g, freqs_g, pow_g))
        pow_cpus.append(pow_cpu)

    return gpu_data, pow_cpus
def _init_memory(self) -> None:
    '''
    Initialize GPU memory.

    Each GPU gets its own number of streams, and each stream gets its
    own memory allocation.  This method creates the per-device
    bookkeeping lists and the host output arrays, then runs
    ``self._init_thread_memory(i, ctx, size_per_thread)`` in one thread
    per entry of ``self.context_list`` and waits for all of them.
    '''
    # Per-device bookkeeping lists.  Each slot must be its own list:
    # ``[[]] * n`` would put the SAME list object in every slot, so an
    # in-place append for one device would show up under all devices.
    per_device_attrs = (
        'moments_device',
        'moment_chunk_host',
        'x_chunk_host', 'y_chunk_host', 'w_chunk_host',
        'x_device', 'y_device', 'w_device',
        'c_moments', 'mu', 'yf', 'm1',
        'x1', 'w1', 'x2', 'w2',
        'streams',
    )
    for attr in per_device_attrs:
        setattr(self, attr, [[] for _ in range(self.num_device)])

    # Host memory that stores the output
    self.w_out = cuda.aligned_zeros((4, self.in_size), dtype=np.float32)
    self.x_out = cuda.aligned_zeros((4, self.in_size), dtype=np.float32)
    self.y_out = cuda.aligned_zeros((4, self.in_size), dtype=np.float32)

    # Number of inputs allocated to each thread.  np.ceil returns a
    # float; it is passed on unchanged, as before.
    size_per_thread = np.ceil(self.in_size / self.num_device)

    mem_thread = []
    for i, ctx in enumerate(self.context_list):
        worker = threading.Thread(target=self._init_thread_memory,
                                  args=(i, ctx, size_per_thread))
        mem_thread.append(worker)
        worker.start()

    for worker in mem_thread:
        worker.join()
def init_moment_10(size: int):
    '''
    Initialize a dummy input of the specified size with 10 moments.

    Returns a ``(10, size)`` float32 array from ``cuda.aligned_zeros``
    in which every column holds the same fixed 10-moment vector.
    '''
    one_moment = np.asarray(
        [1, 1, 1, 1.01, 1, 1.01, 1.03, 1.03, 1.0603, 1.0603],
        dtype=np.float32)

    moments = cuda.aligned_zeros((10, size), dtype=np.float32)
    # Broadcast the column vector across all columns in one assignment
    # instead of looping over ``size`` columns in Python.
    moments[:] = one_moment[:, np.newaxis]
    return moments
def init_moment_6(size: int):
    '''
    Initialize a dummy input of specified size for Chyqmom4.

    Returns a ``(6, size)`` float32 array from ``cuda.aligned_zeros``
    in which every column holds the same fixed 6-moment vector.
    '''
    one_moment = np.asarray([1.0, 1.0, 1.0, 1.01, 1, 1.01],
                            dtype=np.float32)

    moments = cuda.aligned_zeros((6, size), dtype=np.float32)
    # Broadcast the column vector across all columns in one assignment
    # instead of looping over ``size`` columns in Python.
    moments[:] = one_moment[:, np.newaxis]
    return moments
def allocate_pinned_cpu(self, **kwargs):
    """
    Allocate the registered (pinned) host array ``self.ce_c``.

    Its length is the ``nf`` keyword (default ``self.nf``) and its dtype
    is ``self.real_type``.  Returns ``self``.
    """
    n_freqs = kwargs.get('nf', self.nf)
    assert (n_freqs is not None)

    host = cuda.aligned_zeros(shape=(n_freqs, ),
                              dtype=self.real_type,
                              alignment=resource.getpagesize())
    self.ce_c = cuda.register_host_memory(host)
    return self
def allocate_pinned_cpu(self, **kwargs):
    """
    Allocate pinned CPU memory for asynchronous transfer of the result.

    Creates ``self.lsp_c`` with length ``nf`` (keyword, default
    ``self.nf``) and dtype ``self.real_type``.  Returns ``self``.
    """
    n_freqs = kwargs.get('nf', self.nf)
    assert (n_freqs is not None)

    host = cuda.aligned_zeros(shape=(n_freqs,),
                              dtype=self.real_type,
                              alignment=resource.getpagesize())
    self.lsp_c = cuda.register_host_memory(host)
    return self
def allocate_pinned_arrays(self, nfreqs=None, ndata=None):
    """
    Allocate the registered (pinned) host arrays used by this object.

    ``nfreqs`` defaults to ``int(self.max_nfreqs)`` and ``ndata`` to
    ``int(self.max_ndata)``.

    Arrays of length ``nfreqs``: ``self.bls`` (dtype ``self.rtype``),
    ``self.nbins0`` and ``self.nbinsf`` (int32).
    Arrays of length ``ndata``: ``self.t``, ``self.yw`` and ``self.w``
    (dtype ``self.rtype``).
    """
    nfreqs = int(self.max_nfreqs) if nfreqs is None else nfreqs
    ndata = int(self.max_ndata) if ndata is None else ndata

    page_size = resource.getpagesize()

    def pinned(length, dtype):
        # Page-aligned zeros, then registered with CUDA as host memory.
        host = cuda.aligned_zeros(shape=(length,), dtype=dtype,
                                  alignment=page_size)
        return cuda.register_host_memory(host)

    # Per-frequency buffers.
    self.bls = pinned(nfreqs, self.rtype)
    self.nbins0 = pinned(nfreqs, np.int32)
    self.nbinsf = pinned(nfreqs, np.int32)

    # Per-observation buffers.
    self.t = pinned(ndata, self.rtype)
    self.yw = pinned(ndata, self.rtype)
    self.w = pinned(ndata, self.rtype)
import pycuda.autoinit
import numpy as np

if __name__ == "__main__":
    # Benchmark chyqmom9_pycuda over logarithmically spaced input sizes
    # and write one CSV row per size: column 0 is the input size,
    # columns 1..trial are the values returned by each trial.
    res_file_name = 'result_pycuda.csv'
    max_input_size_mag = 6
    num_points = 200
    trial = 5

    result = np.zeros((num_points, trial + 1))
    sizes = np.logspace(0, max_input_size_mag, num=num_points)
    for idx, in_size in enumerate(sizes):
        n_in = int(in_size)
        this_result = np.zeros(trial + 1)
        this_result[0] = n_in

        # Host output buffers handed to chyqmom9_pycuda.
        w = cuda.aligned_zeros((9, n_in), dtype=np.float32)
        x = cuda.aligned_zeros((9, n_in), dtype=np.float32)
        y = cuda.aligned_zeros((9, n_in), dtype=np.float32)
        this_moment = init_moment_10(n_in)

        # Fill all `trial` columns.  The previous range(1, trial) stopped
        # one short and left the last column of every row at zero.
        for i in range(1, trial + 1):
            this_result[i] = chyqmom9_pycuda(this_moment, n_in, w, x, y, 1)

        result[idx] = this_result
        print(n_in, ": ", this_result[1])

    np.savetxt(res_file_name, result, delimiter=',')
def chyqmom27(
    moments: np.ndarray,
    size: int):
    """
    Launch the CHyQMOM27 kernel sequence on the GPU and time it.

    Parameters
    ----------
    moments : np.ndarray
        Host array copied into a device buffer sized for ``16 * size``
        float32 values (presumably shaped ``(16, size)`` -- TODO confirm
        against callers).
    size : int
        Number of inputs.  Kernels are launched on
        ``ceil(size / 256)`` blocks of 256 threads.

    Returns
    -------
    tuple
        ``(elapsed_time, w_dev, x_dev, y_dev, z_dev)``.  ``elapsed_time``
        is ``Event.time_since`` between events recorded immediately
        before and after the kernel sequence.  The four device
        allocations each hold ``27 * size`` float32 values and are NOT
        freed here; the caller owns them.
    """
    float_bytes = np.dtype(np.float32).itemsize
    n_inputs = int(size)
    # Kernel arguments must be NumPy scalars, so keep an int32 copy.
    size = np.int32(size)

    BlockSize = (256, 1, 1)
    GridSize = ((n_inputs + BlockSize[0] - 1) // BlockSize[0], 1, 1)

    def alloc_rows(rows):
        # Byte counts use Python ints: the former int32 * int32 product
        # can exceed the int32 range for large `size`.
        return cuda.mem_alloc(float_bytes * n_inputs * rows)

    # compile kernel
    HYQ = SourceModule(HYQMOM)
    CHY27 = SourceModule(CHYQMOM27)

    hyqmom3 = HYQ.get_function('hyqmom3')
    c_kernel = CHY27.get_function('chyqmom27_cmoments')
    chyqmom27_rho_yf = CHY27.get_function('chyqmom27_rho_yf')
    chyqmom27_zf = CHY27.get_function('chyqmom27_zf')
    chyqmom27_mu = CHY27.get_function('chyqmom27_mu')
    float_value_set = CHY27.get_function('float_value_set')
    float_array_set = CHY27.get_function('float_array_set')
    chyqmom27_set_m = CHY27.get_function('chyqmom27_set_m')
    chyqmom27_wout = CHY27.get_function('chyqmom27_wout')
    chyqmom27_xout = CHY27.get_function('chyqmom27_xout')
    chyqmom27_yout = CHY27.get_function('chyqmom27_yout')
    chyqmom27_zout = CHY27.get_function('chyqmom27_zout')

    launch = dict(block=BlockSize, grid=GridSize)

    # Allocate device memory
    moments_device = alloc_rows(16)
    c_moments = alloc_rows(12)

    m = alloc_rows(10)
    float_value_set(m, np.float32(1), size, np.int32(0), **launch)
    float_value_set(m, np.float32(0), size, size, **launch)

    w1 = alloc_rows(3)
    x1 = alloc_rows(3)
    w2 = alloc_rows(9)
    x2 = alloc_rows(9)
    y2 = alloc_rows(9)

    rho = alloc_rows(9)
    yf = alloc_rows(3)
    yp = alloc_rows(9)
    zf = alloc_rows(3)
    w3 = alloc_rows(3)
    x3 = alloc_rows(3)
    mu = alloc_rows(3)

    # Output buffers, returned to the caller.
    w_dev = alloc_rows(27)
    x_dev = alloc_rows(27)
    y_dev = alloc_rows(27)
    z_dev = alloc_rows(27)

    cuda.memcpy_htod(moments_device, moments)

    time_before = cuda.Event()
    time_after = cuda.Event()

    time_before.record()

    c_kernel(moments_device, c_moments, size, **launch)

    float_array_set(m, c_moments, size,
                    np.int32(2) * size, np.int32(0), **launch)
    float_array_set(m, c_moments, size,
                    np.int32(3) * size, np.int32(6) * size, **launch)
    float_array_set(m, c_moments, size,
                    np.int32(4) * size, np.int32(9) * size, **launch)

    hyqmom3(m, x1, w1, size, **launch)

    chyqmom27_set_m(m, c_moments, size, **launch)
    chyqmom9(m, size, w2, x2, y2)

    chyqmom27_rho_yf(c_moments, y2, w2, rho, yf, yp, size, **launch)
    chyqmom27_zf(c_moments, x1, zf, size, **launch)
    chyqmom27_mu(c_moments, rho, zf, mu, size, **launch)

    float_array_set(m, mu, size,
                    np.int32(2) * size, np.int32(0), **launch)
    float_array_set(m, mu, size,
                    np.int32(3) * size, np.int32(1) * size, **launch)
    float_array_set(m, mu, size,
                    np.int32(4) * size, np.int32(2) * size, **launch)

    hyqmom3(m, x3, w3, size, **launch)

    chyqmom27_wout(moments_device, w1, rho, w3, w_dev, size, **launch)
    chyqmom27_xout(moments_device, x1, x_dev, size, **launch)
    chyqmom27_yout(moments_device, yf, yp, y_dev, size, **launch)
    # NOTE(review): this launch previously omitted `size`, unlike every
    # other kernel launched here.  It is passed now for parity with
    # wout/xout/yout -- confirm against the chyqmom27_zout signature in
    # the CHYQMOM27 kernel source.
    chyqmom27_zout(moments_device, zf, x3, z_dev, size, **launch)

    time_after.record()
    time_after.synchronize()
    elapsed_time = time_after.time_since(time_before)

    # The results stay on the device: the host arrays that used to be
    # filled here were never returned, so the allocation and the four
    # device-to-host copies were dropped.

    # Release the intermediate buffers; the *_dev outputs stay alive.
    for scratch in (moments_device, c_moments, m,
                    w1, x1, w2, x2, y2,
                    rho, yf, yp, zf,
                    w3, x3, mu):
        scratch.free()

    return elapsed_time, w_dev, x_dev, y_dev, z_dev
def single_advance_gpu(state, num_points, grid_space):
    """
    Compute the right-hand side for one advance step on the GPU.

    Runs ``chyqmom27`` on ``state`` and feeds the resulting device
    buffers through the ``domain_get_flux_3d``, ``fsum_3d`` and
    ``flux_3d`` kernels of the module-level ``QUAD``.  Also relies on the
    module-level ``indices``, ``num_moments``, ``num_nodes`` and
    ``sizeof_float``.

    Returns ``(rhs, total_time, quad_time)``: ``rhs`` is a
    ``(num_moments, num_points)`` float32 host array; ``total_time`` is
    measured from before the ``chyqmom27`` call to after the flux
    kernels; ``quad_time`` is measured from after the ``chyqmom27`` call
    to after the flux kernels.
    """
    rhs = cuda.aligned_zeros((num_moments, num_points), dtype=np.float32)

    start_evt = cuda.Event()
    inverted_evt = cuda.Event()
    done_evt = cuda.Event()

    # Allocate GPU memory.
    indices_device = cuda.mem_alloc_like(indices)
    cuda.memcpy_htod(indices_device, indices)

    node_flux_bytes = int(
        sizeof_float * num_moments * num_nodes * num_points)
    f_min = cuda.mem_alloc(node_flux_bytes)
    f_max = cuda.mem_alloc(node_flux_bytes)
    flux_1 = cuda.mem_alloc_like(state)
    flux_2 = cuda.mem_alloc_like(state)

    # Launch configuration and kernel handles.
    BlockSize = (256, 1, 1)
    n_blocks = int((num_points + BlockSize[0] - 1) / BlockSize[0])
    GridSize = (n_blocks, 1, 1)
    launch = dict(block=BlockSize, grid=GridSize)

    domain_get_flux = QUAD.get_function('domain_get_flux_3d')
    fsum = QUAD.get_function('fsum_3d')
    flux_out = QUAD.get_function('flux_3d')

    n_mom = np.int32(num_moments)
    n_nodes = np.int32(num_nodes)
    n_pts = np.int32(num_points)

    # Compute rhs.
    start_evt.record()

    # chyqmom27 returns pointer objects to GPU memory.
    _, w, x, y, z = chyqmom27(state, num_points)
    inverted_evt.record()

    domain_get_flux(w, x, y, z, indices_device, f_min, f_max,
                    n_mom, n_nodes, n_pts, **launch)
    fsum(flux_1, f_min, f_max, n_mom, n_nodes, n_pts, **launch)
    flux_out(flux_1, flux_2, np.float32(grid_space),
             n_mom, n_pts, **launch)
    done_evt.record()

    inverted_evt.synchronize()
    done_evt.synchronize()

    total_time = done_evt.time_since(start_evt)
    # NOTE(review): this interval starts AFTER the chyqmom27 call, so it
    # covers the flux kernels rather than the quadrature itself --
    # confirm that is what callers expect from "quad_time".
    quad_time = done_evt.time_since(inverted_evt)

    cuda.memcpy_dtoh(rhs, flux_2)

    # Release the quadrature outputs handed back by chyqmom27.
    for dev_buf in (w, x, y, z):
        dev_buf.free()

    return rhs, total_time, quad_time
moments = cuda.aligned_zeros((10, size), dtype=np.float32) for i in range(size): moments[:, i] = one_moment return moments if __name__ == '__main__': num_moments = 10000000 batch_size = 4 moments = init_moment_10(num_moments) # flatten to 1d array # moments = moments.flatten() # outputs w = cuda.aligned_zeros((9, num_moments), dtype=np.float32) x = cuda.aligned_zeros((9, num_moments), dtype=np.float32) y = cuda.aligned_zeros((9, num_moments), dtype=np.float32) time1 = chyqmom9_pycuda(moments, num_moments, w, x, y, batch_size) # time2 = chyqmom9_pycuda(moments, num_moments, w, x, y, batch_size) # time3 = chyqmom9_pycuda(moments, num_moments, w, x, y, batch_size) # time4 = chyqmom9_pycuda(moments, num_moments, w, x, y, batch_size) print("Done") # for j in range(num_moments): # try: # if np.abs(w[0, j] - 0.027791) > 1e-3: raise ValueError # if np.abs(w[1, j] - 0.111124) > 1e-3: raise ValueError # if np.abs(w[2, j] - 0.027791) > 1e-3: raise ValueError # if np.abs(w[3, j] - 0.111124) > 1e-3: raise ValueError