def __init__(self, r_cut, r_buff=0.5, cell_guess=50, n_guess=150):
    system = Ctx.get_active()
    if system is None:
        raise ValueError("No active system!")
    self.system = system
    self.cell_guess = cell_guess
    self.n_guess = n_guess
    self.r_cut2 = r_cut ** 2
    self.r_buff2 = (r_buff / 2) ** 2
    self.gpu = system.gpu
    self.tpb = 64
    self.bpg = int(self.system.N // self.tpb + 1)
    # self.situ_zero = np.zeros(1, dtype=np.int32)
    self.update_counts = 0
    self.dist_funcs = {}
    self.cu_nlist, self.cu_check_build = self._gen_func()
    with cuda.gpus[self.gpu]:
        self.p_n_max = cuda.pinned_array((1,), dtype=np.int32)
        self.p_situation = cuda.pinned_array((1,), dtype=np.int32)
        self.d_last_x = cuda.device_array_like(self.system.d_x)
        self.d_n_max = cuda.device_array(1, dtype=np.int32)
        self.d_nl = cuda.device_array((self.system.N, self.n_guess), dtype=np.int32)
        self.d_nc = cuda.device_array((self.system.N,), dtype=np.int32)
        self.d_situation = cuda.device_array(1, dtype=np.int32)
    self.clist = clist(r_cut, r_buff, cell_guess=self.cell_guess)
    self.neighbour_list()
    self.system.nlist = self  # register to system
def __init__(self, r_cut, r_buff=0.5, cell_guess=50):
    system = Ctx.get_active()
    if system is None:
        raise ValueError("Error, Initialize system first!")
    self.system = system
    self.ibox = np.asarray(np.floor(system.box / (r_cut + r_buff)), dtype=np.int32)
    self.n_cell = int(np.multiply.reduce(self.ibox))
    self.cell_adj = np.ones(self.system.n_dim, dtype=np.int32) * 3
    self.gpu = system.gpu
    self.tpb = 64
    self.bpg = int(self.system.N // self.tpb + 1)
    self.bpg_cell = int(self.n_cell // self.tpb + 1)
    self.cell_guess = cell_guess
    # self.situ_zero = np.zeros(1, dtype=np.int32)
    self.cu_cell_map, self.cu_cell_list = self._gen_func()
    self.p_cell_max = cuda.pinned_array((1,), dtype=np.int32)
    self.p_out_of_box = cuda.pinned_array((1,), dtype=self.system.dtype)
    with cuda.gpus[self.gpu]:
        self.d_cells = cuda.device_array(self.system.d_x.shape[0], dtype=np.int32)
        self.d_cell_map = cuda.device_array((self.n_cell, 3 ** system.n_dim), dtype=np.int32)
        self.d_ibox = cuda.to_device(self.ibox)
        self.d_cell_adj = cuda.to_device(self.cell_adj)
        self.cu_cell_map[self.bpg_cell, self.tpb](self.d_ibox, self.d_cell_adj, self.d_cell_map)
        self.d_cell_list = cuda.device_array((self.n_cell, self.cell_guess), dtype=np.int32)
        self.d_cell_counts = cuda.device_array(self.n_cell, dtype=np.int32)
        self.d_cell_max = cuda.device_array(1, dtype=np.int32)
        self.d_out_of_box = cuda.device_array(1, dtype=self.system.dtype)
    self.update()
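# Hedged sketch (not part of the original class) of the readback pattern the
# two constructors above set up: a kernel writes the required per-cell row
# size into d_cell_max, the host inspects it through the pinned p_cell_max
# buffer, and the cell list is grown when the guess was too small. The method
# name _check_and_grow and the rebuild contract are assumptions.
def _check_and_grow(self):
    # copy_to_host with no stream is synchronous, so p_cell_max is valid here
    self.d_cell_max.copy_to_host(self.p_cell_max)
    if self.p_cell_max[0] > self.cell_guess:
        # round the new guess up to a multiple of 8 to limit reallocations
        self.cell_guess = (int(self.p_cell_max[0]) + 7) // 8 * 8
        with cuda.gpus[self.gpu]:
            self.d_cell_list = cuda.device_array(
                (self.n_cell, self.cell_guess), dtype=np.int32)
        return True  # caller should rebuild the cell list
    return False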
def test_gpu_instrumentation():
    sdfg: dace.SDFG = axpy2GPU.to_sdfg(strict=True)
    sdfg.name = 'gpu_instrumentation'
    map1 = find_map_by_param(sdfg, 'i')
    map2 = find_map_by_param(sdfg, 'j')
    GPUTransformMap.apply_to(sdfg, _map_entry=map1, options={'gpu_id': 0})
    GPUTransformMap.apply_to(sdfg, _map_entry=map2, options={'gpu_id': 1})

    # Set instrumentation both on the state and the maps
    sdfg.start_state.instrument = dace.InstrumentationType.GPU_Events
    map1.instrument = dace.InstrumentationType.GPU_Events
    map2.instrument = dace.InstrumentationType.GPU_Events

    size = 256
    np.random.seed(0)
    A = cuda.pinned_array(shape=1, dtype=np_dtype)
    X = cuda.pinned_array(shape=size, dtype=np_dtype)
    Y = cuda.pinned_array(shape=size, dtype=np_dtype)
    A.fill(np.random.rand())
    X[:] = np.random.rand(size)[:]
    Y[:] = np.random.rand(size)[:]
    Z = np.copy(Y)

    sdfg(A=A[0], X=X, Y=Y, N=size)
    assert np.allclose(Y, A * X + Z)

    # Print instrumentation report
    if sdfg.is_instrumented():
        report = sdfg.get_latest_report()
        print(report)
async def async_cuda_fn(value_in: float) -> float:
    stream = cuda.stream()
    h_src, h_dst = cuda.pinned_array(8), cuda.pinned_array(8)
    h_src[:] = value_in
    d_ary = cuda.to_device(h_src, stream=stream)
    d_ary.copy_to_host(h_dst, stream=stream)
    await stream.async_done()
    return h_dst.mean()
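# A usage sketch for the coroutine above (an assumption, not from the source;
# requires a CUDA device; asyncio is standard library). stream.async_done()
# yields control back to the event loop until the stream's work finishes.
import asyncio

async def demo():
    # The value round-trips through pinned host buffers and the GPU,
    # so the awaited mean should equal the input.
    result = await async_cuda_fn(3.0)
    assert result == 3.0

asyncio.run(demo())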
def __init__(self, info, group):
    self.info = info
    self.group = group
    # self.ps = particle_set(info, group)
    self.ps = info.find_particle_set(group)
    if self.ps is None:
        self.ps = particle_set(info, group)
        info.particle_set.append(self.ps)
    self.last_ts = 0xffffffff
    self.temp = 0.0
    self.pressure = 0.0
    self.potential = 0.0
    self.momentum = 0.0
    self.block_size = 256
    self.nblocks = math.ceil(self.ps.nme / self.block_size)
    self.coll = np.zeros(self.nblocks * 6, dtype=np.float32)
    self.d_coll = cuda.to_device(self.coll)
    # self.result = np.zeros(3, dtype = nb.float32)
    self.result = cuda.pinned_array(16, dtype=np.float32)
    self.result[:6] = 0.0
    self.d_result = cuda.to_device(self.result)
def __init__(self, x_shape, n_samples, logger: logging.Logger = None):
    if logger is None:
        logger = logging.Logger('AutoMemoryBuilder')
        logger.addHandler(logging.NullHandler())
    self._logger = logger.getChild('AutoMemoryBuilder')
    self._logger.debug('Constructor')
    if len(x_shape) == 1:
        self._x_size = x_shape[0]
    else:
        assert len(x_shape) == 2
        self._x_size = x_shape[0] * x_shape[1]
    assert self._x_size >= n_samples
    self._n_samples = n_samples
    self._xmem_shape = (n_samples, self._x_size)
    self._i = 0
    # pinned_array defaults to float64, so this count is elements, not bytes
    self._logger.info('Allocating pinned array: {0} elements'.format(
        n_samples * self._x_size))
    self._pinned_mem = cuda.pinned_array(self._xmem_shape)
    self._stream = cuda.stream()
def predict(self, features_gen=None, as_cuda_array=False, flatten=True):
    if features_gen is None:
        features_gen = self._build_features()
    predicted_cva = torch.empty(
        (self.diffusion_engine.num_defs_per_path
         * self.diffusion_engine.num_paths, 1),
        dtype=torch.float32, device=self.device)
    with cuda.devices.gpus[self.device.index]:
        d_predicted_cva = cuda.as_cuda_array(
            predicted_cva.view(self.diffusion_engine.num_defs_per_path,
                               self.diffusion_engine.num_paths))
    if as_cuda_array:
        out = d_predicted_cva
    else:
        out = cuda.pinned_array(
            (self.diffusion_engine.num_defs_per_path,
             self.diffusion_engine.num_paths),
            dtype=np.float32)
    if flatten:
        out = out.reshape(-1)
    while True:
        t = yield
        self._predict(t, features_gen, predicted_cva)
        if not as_cuda_array:
            d_predicted_cva.copy_to_host(out)
        yield out
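# Hedged sketch of how a two-phase generator like predict() is driven; the
# names `predictor` and the time index 5 are illustrative, not from the source:
gen = predictor.predict()   # assumes `predictor` is an instance of the class
next(gen)                   # prime: advance to the first bare `t = yield`
cva = gen.send(5)           # send a coarse time step, run _predict, get `out`
next(gen)                   # loop back and wait for the next time step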
def test_host_alloc_pinned(self):
    ary = cuda.pinned_array(10, dtype=np.uint32)
    ary.fill(123)
    self.assertTrue(all(ary == 123))
    devary = cuda.to_device(ary)
    driver.device_memset(devary, 0, driver.device_memory_size(devary))
    self.assertTrue(all(ary == 123))
    devary.copy_to_host(ary)
    self.assertTrue(all(ary == 0))
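# A small sketch (an assumption, not part of the test above) of why pinned
# allocations matter: only page-locked host memory lets stream transfers run
# asynchronously instead of degrading to synchronous copies.
import numpy as np
from numba import cuda

stream = cuda.stream()
host = cuda.pinned_array(10, dtype=np.uint32)  # page-locked host buffer
host[:] = np.arange(10, dtype=np.uint32)
dev = cuda.to_device(host, stream=stream)      # asynchronous H2D copy
dev.copy_to_host(host, stream=stream)          # asynchronous D2H copy
stream.synchronize()                           # wait for both transfers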
def _build_labels_backward(self, as_cuda_tensor):
    d_spread_integral_now = self.diffusion_engine.d_spread_integrals[0, 1:]
    d_spread_integral_next = self.diffusion_engine.d_spread_integrals[1, 1:]
    d_mtm_next = self.diffusion_engine.d_mtm_by_cpty[0]
    d_rate_integral_now = self.diffusion_engine.d_dom_rate_integral[0]
    d_rate_integral_next = self.diffusion_engine.d_dom_rate_integral[1]
    d_def = self.diffusion_engine.d_def_indicators[0]
    d_labels_by_cpty = self.diffusion_engine.d_mtm_by_cpty[1]
    t_out = torch.empty(
        (self.diffusion_engine.num_defs_per_path,
         self.diffusion_engine.num_paths),
        dtype=torch.float32, device=self.device)
    with cuda.devices.gpus[self.device.index]:
        d_out = cuda.as_cuda_array(t_out)
    if as_cuda_tensor:
        out = t_out
    else:
        out = cuda.pinned_array(
            (self.diffusion_engine.num_defs_per_path,
             self.diffusion_engine.num_paths),
            dtype=np.float32)
    out[:] = 0
    if as_cuda_tensor:
        yield out.view(-1, 1)
    else:
        yield out.reshape(-1, 1)
    d_spread_integral_next.copy_to_device(
        self.diffusion_engine.spread_integrals[
            self.diffusion_engine.num_coarse_steps, 1:])
    d_rate_integral_next.copy_to_device(
        self.diffusion_engine.dom_rate_integral[
            self.diffusion_engine.num_coarse_steps])
    accumulate = False
    for t in range(self.diffusion_engine.num_coarse_steps - 1, -1, -1):
        d_spread_integral_now.copy_to_device(
            self.diffusion_engine.spread_integrals[t, 1:])
        d_rate_integral_now.copy_to_device(
            self.diffusion_engine.dom_rate_integral[t])
        d_mtm_next.copy_to_device(self.diffusion_engine.mtm_by_cpty[t + 1])
        d_def.copy_to_device(self.diffusion_engine.def_indicators[t])
        self.__cuda_build_labels_backward(
            d_spread_integral_now, d_spread_integral_next,
            d_rate_integral_now, d_rate_integral_next,
            d_mtm_next, d_labels_by_cpty, t > 0, accumulate)
        self.__cuda_aggregate_survival(d_labels_by_cpty, d_def, d_out)
        if as_cuda_tensor:
            yield out.view(-1, 1)
        else:
            d_out.copy_to_host(out)
            yield out.reshape(-1, 1)
        if not accumulate:
            accumulate = True
def test_host_operators(self):
    for ary in [
            cuda.mapped_array(10, dtype=np.uint32),
            cuda.pinned_array(10, dtype=np.uint32)
    ]:
        ary[:] = range(10)
        self.assertTrue(sum(ary + 1) == 55)
        self.assertTrue(sum((ary + 1) * 2 - 1) == 100)
        self.assertTrue(sum(ary < 5) == 5)
        self.assertTrue(sum(ary <= 5) == 6)
        self.assertTrue(sum(ary > 6) == 3)
        self.assertTrue(sum(ary >= 6) == 4)
        self.assertTrue(sum(ary**2) == 285)
        self.assertTrue(sum(ary // 2) == 20)
        self.assertTrue(sum(ary / 2.0) == 22.5)
        self.assertTrue(sum(ary % 2) == 5)
def __init__(self):
    self.device_array = cuda.device_array((1024, 2048), np.float32)
    self.host_array = cuda.pinned_array((1024, 2048), np.float32)
    self.stream = cuda.stream()
    self.zoom = 0.3
    self.pos = np.array([-1, 0], np.float32)
    dpi = mpl.rcParams['figure.dpi']
    figsize = 2048 / float(dpi), 1024 / float(dpi)
    self.fig = plt.figure(figsize=figsize)
    self.fig.canvas.mpl_connect('scroll_event', self.zoom_cb)
    self.ax = self.fig.add_axes([0, 0, 1, 1])
    self.ax.axis('off')
    self.calculate()
    self.img = self.ax.imshow(self.host_array)
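# calculate() and zoom_cb() are referenced above but not shown. A hedged
# sketch of the calculate() pattern this viewer implies: launch a kernel that
# fills device_array, then async-copy into the pinned host_array over the
# stream. The kernel body is a hypothetical stand-in, not the original image.
@cuda.jit
def _fill(out, zoom, x0, y0):
    i, j = cuda.grid(2)
    if i < out.shape[0] and j < out.shape[1]:
        out[i, j] = (x0 + i * zoom) * (y0 + j * zoom)

def calculate(self):
    tpb = (16, 16)
    bpg = ((1024 + tpb[0] - 1) // tpb[0], (2048 + tpb[1] - 1) // tpb[1])
    _fill[bpg, tpb, self.stream](self.device_array, self.zoom,
                                 self.pos[0], self.pos[1])
    # The async device-to-host copy is legal here because host_array is pinned.
    self.device_array.copy_to_host(self.host_array, stream=self.stream)
    self.stream.synchronize()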
def test_pinned_warn_on_host_array(self):
    @cuda.jit
    def foo(r, x):
        r[0] = x + 1

    N = 10
    ary = cuda.pinned_array(N, dtype=np.float32)
    with override_config('CUDA_WARN_ON_IMPLICIT_COPY', 1):
        with warnings.catch_warnings(record=True) as w:
            foo[1, N](ary, N)

    self.assertEqual(w[0].category, NumbaPerformanceWarning)
    self.assertIn('Host array used in CUDA kernel will incur',
                  str(w[0].message))
    self.assertIn('copy overhead', str(w[0].message))
def _build_features(self):
    features = cuda.pinned_array(
        (self.diffusion_engine.num_defs_per_path,
         self.diffusion_engine.num_paths, self.num_features),
        dtype=np.float32)
    while True:
        t = yield
        t_prev_reset = self.prev_reset_arr[t]
        features[:, :, :2 * self.diffusion_engine.num_rates - 1] = \
            self.diffusion_engine.X[
                t, :2 * self.diffusion_engine.num_rates - 1].T[None]
        np.maximum(
            self.diffusion_engine.X[
                t, 2 * self.diffusion_engine.num_rates:
                2 * self.diffusion_engine.num_rates
                + self.diffusion_engine.num_spreads - 1].T[None],
            0.,
            out=features[:, :, 2 * self.diffusion_engine.num_rates - 1:
                         2 * self.diffusion_engine.num_rates
                         + self.diffusion_engine.num_spreads - 2])
        if t_prev_reset > 0:
            features[:, :, 2 * self.diffusion_engine.num_rates
                     + self.diffusion_engine.num_spreads - 2:
                     3 * self.diffusion_engine.num_rates
                     + self.diffusion_engine.num_spreads - 2] = \
                self.diffusion_engine.X[
                    t_prev_reset, :self.diffusion_engine.num_rates].T[None]
        else:
            features[:, :, 2 * self.diffusion_engine.num_rates
                     + self.diffusion_engine.num_spreads - 2:
                     3 * self.diffusion_engine.num_rates
                     + self.diffusion_engine.num_spreads - 2] = 0
        self.__unpack(
            self.diffusion_engine.def_indicators[t],
            features[:, :, 3 * self.diffusion_engine.num_rates
                     + self.diffusion_engine.num_spreads - 2:])
        yield features.reshape(
            self.diffusion_engine.num_defs_per_path
            * self.diffusion_engine.num_paths, -1)
def vecadd(x, y):
    with time_region_cuda() as t_xfer:
        d_x = cuda.to_device(x)
        d_y = cuda.to_device(y)
        d_z = cuda.device_array_like(x)

    block_size = 128
    num_blocks = N // block_size
    if N % block_size:
        num_blocks += 1

    with time_region_cuda() as t_kernel:
        _vecadd_cuda[num_blocks, block_size](d_z, d_x, d_y)

    with time_region_cuda(t_xfer.elapsed_time()) as t_xfer:
        ret = cuda.pinned_array(N)
        d_z.copy_to_host(ret)

    print(f'  CUDA xfer overheads: {t_xfer.elapsed_time()} s')
    print(f'  CUDA kernel time:    {t_kernel.elapsed_time()} s')
    return ret
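# time_region_cuda is used above but not defined in this excerpt. A minimal
# sketch (an assumption, not the original helper) built on CUDA events, with
# the optional offset argument matching the call pattern above:
from numba import cuda

class time_region_cuda:
    def __init__(self, time_offset=0.0):
        self._offset = time_offset
        self._start = cuda.event()
        self._end = cuda.event()

    def __enter__(self):
        self._start.record()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self._end.record()
        self._end.synchronize()

    def elapsed_time(self):
        # event_elapsed_time returns milliseconds; report seconds
        return self._offset + 1e-3 * cuda.event_elapsed_time(
            self._start, self._end)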
def similarity_gpu(
        emx, clusmethod, corrmethod, preout, postout,
        minexpr, maxexpr, minsamp, minclus, maxclus, criterion,
        mincorr, maxcorr, gsize, lsize, outfile):
    # allocate device buffers
    W = gsize
    N = emx.shape[1]
    N_pow2 = next_power_2(N)
    K = maxclus

    in_emx = cuda.to_device(emx)
    in_index_cpu = cuda.pinned_array((W, 2), dtype=np.int32)
    in_index_gpu = cuda.device_array_like(in_index_cpu)
    work_x = cuda.device_array((W, N_pow2), dtype=np.float32)
    work_y = cuda.device_array((W, N_pow2), dtype=np.float32)
    work_gmm_data = cuda.device_array((W, N, 2), dtype=np.float32)
    work_gmm_labels = cuda.device_array((W, N), dtype=np.int8)
    work_gmm_pi = cuda.device_array((W, K), dtype=np.float32)
    work_gmm_mu = cuda.device_array((W, K, 2), dtype=np.float32)
    work_gmm_sigma = cuda.device_array((W, K, 2, 2), dtype=np.float32)
    work_gmm_sigmaInv = cuda.device_array((W, K, 2, 2), dtype=np.float32)
    work_gmm_normalizer = cuda.device_array((W, K), dtype=np.float32)
    work_gmm_MP = cuda.device_array((W, K, 2), dtype=np.float32)
    work_gmm_counts = cuda.device_array((W, K), dtype=np.int32)
    work_gmm_logpi = cuda.device_array((W, K), dtype=np.float32)
    work_gmm_xm = cuda.device_array((W, 2), dtype=np.float32)
    work_gmm_Sxm = cuda.device_array((W, 2), dtype=np.float32)
    work_gmm_gamma = cuda.device_array((W, N, K), dtype=np.float32)
    work_gmm_logL = cuda.device_array((W, 1), dtype=np.float32)
    work_gmm_entropy = cuda.device_array((W, 1), dtype=np.float32)
    out_K_cpu = cuda.pinned_array((W,), dtype=np.int8)
    out_K_gpu = cuda.device_array_like(out_K_cpu)
    out_labels_cpu = cuda.pinned_array((W, N), dtype=np.int8)
    out_labels_gpu = cuda.device_array_like(out_labels_cpu)
    out_correlations_cpu = cuda.pinned_array((W, K), dtype=np.float32)
    out_correlations_gpu = cuda.device_array_like(out_correlations_cpu)

    # iterate through global work blocks
    n_genes = emx.shape[0]
    n_total_pairs = n_genes * (n_genes - 1) // 2

    index_x = 1
    index_y = 0

    for i in range(0, n_total_pairs, gsize):
        # print("%8d %8d" % (i, n_total_pairs))

        # determine number of pairs
        n_pairs = min(gsize, n_total_pairs - i)

        # initialize index array
        index_x_ = index_x
        index_y_ = index_y
        for j in range(n_pairs):
            in_index_cpu[j] = index_x_, index_y_
            index_x_, index_y_ = pairwise_increment(index_x_, index_y_)

        # copy index array to device
        in_index_gpu.copy_to_device(in_index_cpu)

        # execute similarity kernel
        similarity_gpu_helper[gsize // lsize, lsize](
            n_pairs, in_emx, in_index_gpu, clusmethod, corrmethod,
            preout, postout, minexpr, maxexpr, minsamp, minclus, maxclus,
            criterion, work_x, work_y, work_gmm_data, work_gmm_labels,
            work_gmm_pi, work_gmm_mu, work_gmm_sigma, work_gmm_sigmaInv,
            work_gmm_normalizer, work_gmm_MP, work_gmm_counts,
            work_gmm_logpi, work_gmm_xm, work_gmm_Sxm, work_gmm_gamma,
            work_gmm_logL, work_gmm_entropy, out_K_gpu, out_labels_gpu,
            out_correlations_gpu
        )
        cuda.synchronize()

        # copy results from device
        out_K_gpu.copy_to_host(out_K_cpu)
        out_labels_gpu.copy_to_host(out_labels_cpu)
        out_correlations_gpu.copy_to_host(out_correlations_cpu)

        # save correlation matrix to output file
        index_x_ = index_x
        index_y_ = index_y
        for j in range(n_pairs):
            # extract pairwise results
            K = out_K_cpu[j]
            labels = out_labels_cpu[j]
            correlations = out_correlations_cpu[j, 0:K]

            # save pairwise results
            write_pair(index_x_, index_y_, K, labels, correlations,
                       mincorr, maxcorr, outfile)

            # increment pairwise index
            index_x_, index_y_ = pairwise_increment(index_x_, index_y_)

        # update local pairwise index
        index_x = index_x_
        index_y = index_y_
def run_gpu_loop(
        weights, delays, total_time=60e3, bold_tr=1800,
        coupling_scaling=0.01, r_sigma=1e-3, V_sigma=1e-3,
        I=1.0, Delta=1.0, eta=-5.0, tau=100.0, J=15.0, cr=0.01, cv=0.0,
        dt=1.0,
        nh=256,   # history buf len, must be power of 2 & greater than delays.max()/dt
        nto=16,   # num parts of nh for tavg, e.g. nh=256, nto=4: tavg over 64 steps
        progress=False, icfun=default_icfun, rng_seed=42):
    assert weights.shape == delays.shape and weights.shape[0] == weights.shape[1]
    nn = weights.shape[0]
    w = weights.astype(np.float32)
    d = (delays / dt).astype(np.int32)
    assert d.max() < nh
    assert nto <= nh, 'oversampling <= buffer size'
    make_loop = make_gpu_loop  # TODO
    block_dim_x = 96         # nodes
    grid_dim_x = 64, 16, 16  # subjects, noise, coupling
    nt = np.prod(grid_dim_x)

    # allocate workspace stuff
    print('allocating memory..')
    if True:  # TODO no dedent in lab editor..
        r, V = rV = np.zeros((2, nh, nn, nt), 'f')
        nrV = np.zeros((2, nt), 'f')  # no stack arrays in Numba
        print('creating rngs..', end='')
        rngs = create_xoroshiro128p_states(int(nt * nn * 2), rng_seed)
        print('done')
        tavg = np.zeros((nto, 2, nn, nt), 'f')   # buffer for temporal average
        bold_state = np.zeros((nn, 4, nt), 'f')  # buffer for bold state
        bold_state[:, 1:] = 1.0
        bold_out = np.zeros((nn, nt), 'f')       # buffer for bold output
        icfun(-np.r_[:nh] * dt, rV)
        I, Delta, eta, tau, J, cr, cv, r_sigma, V_sigma = [
            nb.float32(_)
            for _ in (I, Delta, eta, tau, J, cr, cv, r_sigma, V_sigma)
        ]
    print('workspace allocations done')

    # first call to jit the function
    cfpre, cfpost = make_linear_cfun(coupling_scaling)
    loop = make_loop(nh, nto, nn, dt, cfpre, cfpost, block_dim_x)

    # outer loop setup
    win_len = nto * dt
    total_wins = int(total_time / win_len)
    bold_skip = int(bold_tr / win_len)

    # pinned memory for speeding up kernel invocations
    from numba.cuda import to_device, pinned_array
    g_nrV, g_r, g_V, g_rngs, g_w, g_d, g_tavg, g_bold_state, g_bold_out = [
        to_device(_)
        for _ in (nrV, r, V, rngs, w, d, tavg, bold_state, bold_out)
    ]
    p_tavg = pinned_array(tavg.shape, dtype=np.float32)
    p_bold_out = pinned_array(bold_out.shape, dtype=np.float32)

    # TODO mem map this, since it will get too big
    # tavg_trace = np.zeros((total_wins, ) + tavg.shape, 'f')
    bold_trace = np.zeros((total_wins // bold_skip + 1, ) + bold_out.shape, 'f')

    # start time stepping
    print('starting time stepping')
    for t in (tqdm.trange if progress else range)(total_wins):
        loop[grid_dim_x, block_dim_x](g_nrV, g_r, g_V, g_rngs, g_w, g_d,
                                      g_tavg, g_bold_state, g_bold_out,
                                      I, Delta, eta, tau, J, cr, cv,
                                      r_sigma, V_sigma)
        g_tavg.copy_to_host(p_tavg)
        # print(p_tavg)
        # tavg_trace[t] = p_tavg
        if t % bold_skip == 0:
            g_bold_out.copy_to_host(p_bold_out)
            bold_trace[t // bold_skip] = p_bold_out
def _allocate_host_arrays(self):
    self.X = cuda.pinned_array(
        (self.num_coarse_steps + 1, self.num_diffusions, self.num_paths),
        np.float32)
    self.mtm_by_cpty = cuda.pinned_array(
        (self.num_coarse_steps + 1, self.num_spreads - 1, self.num_paths),
        np.float32)
    self.cash_flows_by_cpty = cuda.pinned_array(
        (self.num_coarse_steps + 1, self.num_spreads - 1, self.num_paths),
        np.float32)
    self.spread_integrals = cuda.pinned_array(
        (self.num_coarse_steps + 1, self.num_spreads, self.num_paths),
        np.float32)
    self.dom_rate_integral = cuda.pinned_array(
        (self.num_coarse_steps + 1, self.num_paths), np.float32)
    self.def_indicators = cuda.pinned_array(
        (self.num_coarse_steps + 1, (self.num_spreads - 1 + 7) // 8,
         self.num_defs_per_path, self.num_paths), np.int8)

    if not self.no_nested_cva:
        try:
            self.nested_cva = cuda.pinned_array(
                (self.num_coarse_steps + 1, self.num_defs_per_path,
                 self.num_paths), np.float32)
        except cuda.cudadrv.driver.CudaAPIError:
            print('couldn\'t allocate pinned array for nested_cva, '
                  'using the numpy allocator instead (non-pinned array).')
            self.nested_cva = np.empty(
                (self.num_coarse_steps + 1, self.num_defs_per_path,
                 self.num_paths), np.float32)
        try:
            self.nested_cva_sq = cuda.pinned_array(
                (self.num_coarse_steps + 1, self.num_defs_per_path,
                 self.num_paths), np.float32)
        except cuda.cudadrv.driver.CudaAPIError:
            print('couldn\'t allocate pinned array for nested_cva_sq, '
                  'using the numpy allocator instead (non-pinned array).')
            self.nested_cva_sq = np.empty(
                (self.num_coarse_steps + 1, self.num_defs_per_path,
                 self.num_paths), np.float32)

    if not self.no_nested_im:
        try:
            self.nested_im_by_cpty = cuda.pinned_array(
                (self.num_coarse_steps + 1, self.num_spreads - 1,
                 self.num_paths), np.float32)
        except cuda.cudadrv.driver.CudaAPIError:
            print('couldn\'t allocate pinned array for nested_im_by_cpty, '
                  'using the numpy allocator instead (non-pinned array).')
            self.nested_im_by_cpty = np.empty(
                (self.num_coarse_steps + 1, self.num_spreads - 1,
                 self.num_paths), np.float32)

    self.R = np.empty((self.num_diffusions, self.num_diffusions),
                      dtype=np.float32)
    # following matrices are flattened (and only upper-triangular entries are kept)
    self.g_R = np.empty(
        self.num_diffusions * (self.num_diffusions + 1) // 2, np.float32)
    self.g_L_T = np.empty(
        self.num_diffusions * (self.num_diffusions + 1) // 2, np.float32)
    self.g_diff_params = np.empty(
        5 * self.num_rates - 2 + 3 * self.num_spreads, np.float32)

    # storing copies of product specs for each warp
    self.vanillas_on_fx_f32 = np.empty(
        (self.vanilla_specs.size, 3), np.float32)  # mat, notional, stk
    self.vanillas_on_fx_i32 = np.empty(
        (self.vanilla_specs.size, 2), np.int32)  # cpty, fgn_ccy
    self.vanillas_on_fx_b8 = np.empty(
        (self.vanilla_specs.size, 1), np.bool8)  # call_put
    # first_reset, reset_freq, notional, swap_rate
    self.irs_f32 = np.empty((self.irs_specs.size, 4), np.float32)
    # num_resets, cpty, ccy
    self.irs_i32 = np.empty((self.irs_specs.size, 3), np.int32)
    self.zcs_f32 = np.empty((self.zcs_specs.size, 2), np.float32)  # mat, notional
    self.zcs_i32 = np.empty((self.zcs_specs.size, 2), np.int32)  # cpty, ccy
rte_off = 0.01
hp = 0.3
btm = 1

# Below are shape_next matrices transferred between host and device once in each iteration
u_shape_next = np.zeros((user, topic), dtype=np.float32)       # cuda.pinned_array((500000,500))
v_shape_next = np.zeros((author, topic), dtype=np.float32)     # cuda.pinned_array((5000,500))
y_shape_next = np.zeros((user, topic), dtype=np.float32)       # cuda.pinned_array((500000,500))
theta_shape_next = np.zeros((tweet, topic), dtype=np.float32)  # cuda.pinned_array((500000,500))
beta_shape_next = np.zeros((word, topic), dtype=np.float32)    # cuda.pinned_array((500000,500))

elogu = cuda.pinned_array((user, topic), dtype=np.float32)
elogv = cuda.pinned_array((author, topic), dtype=np.float32)
elogy = cuda.pinned_array((user, topic), dtype=np.float32)
elogtheta = cuda.pinned_array((tweet, topic), dtype=np.float32)
elogbeta = cuda.pinned_array((word, topic), dtype=np.float32)


def computlog(a, b):
    # digamma is assumed to come from scipy.special
    return np.float32(digamma(a)) - np.float32(np.log(b))


def random_offset(x, y, k):
    return np.random.rand(x, y) * k
    # tail of the saruprng device function; its definition is truncated in
    # this excerpt
    TWO_N32 = 0.232830643653869628906250e-9  # 2**-32
    return np.uint32(u32) * (TWO_N32 * (high - low)) + low


@cuda.jit("void(int32, int32, float32[:])")
def zero_pos(npa, seed, d_ran):
    i = cuda.grid(1)
    if i < npa:
        d_ran[i] = saruprng(np.uint32(i), np.uint32(npa - i),
                            np.uint32(seed), 5, -1.0, 1.0)


if __name__ == "__main__":
    # Create an image.
    npa = 100
    ran = cuda.pinned_array(npa, dtype=np.float32)
    d_ran = cuda.to_device(ran)

    # Record the starting time.
    start = time.time()

    block_size = 256
    nblocks = math.ceil(npa / block_size)
    seed = 12454
    zero_pos[nblocks, block_size](npa, seed, d_ran)
    cuda.synchronize()
    ran = d_ran.copy_to_host()

    # Record the ending time.
    end = time.time()

    print(ran)
    print(end - start)
def init_gpu(part, walls):
    global dim
    global pw_blk_rows, pw_blk_cols, pw_grd_rows, pw_grd_cols
    global pw_blk_shp, pw_grd_shp, pw_tot_shp
    global pp_blk_rows, pp_blk_cols, pp_grd_rows, pp_grd_cols
    global pp_blk_shp, pp_grd_shp, pp_tot_shp
    global get_pp_dt_gpu, get_pw_dt_gpu

    dim = part.dim

    bw = min(threads_per_block_max, part.num)
    pw_blk_rows = bw
    pw_blk_cols = 1
    gw = int(np.ceil(part.num / bw))
    pw_grd_rows = gw
    pw_grd_cols = len(walls)
    pw_blk_shp = (pw_blk_rows, pw_blk_cols)
    pw_grd_shp = (pw_grd_rows, pw_grd_cols)
    pw_tot_shp = (pw_grd_rows, pw_grd_cols, pw_blk_rows, pw_blk_cols)

    bp = min(optimal_part_num, part.num)
    pp_blk_rows = bp
    pp_blk_cols = bp
    gp = int(np.ceil(part.num / bp))
    pp_grd_rows = gp
    pp_grd_cols = gp
    pp_blk_shp = (pp_blk_rows, pp_blk_cols)
    pp_grd_shp = (pp_grd_rows, pp_grd_cols)
    pp_tot_shp = (pp_grd_rows, pp_grd_cols, pp_blk_rows, pp_blk_cols)

    part.walls_data = np.array([wall.data.copy() for wall in walls],
                               dtype=np_dtype)
    part.walls_data_dev = cuda.to_device(part.walls_data)
    part.pw_gap_min_dev = cuda.to_device(part.pw_gap_min)
    part.pp_gap_min_dev = cuda.to_device(part.pp_gap_min)
    part.pos_dev = cuda.to_device(part.pos)
    part.pos_loc_dev = cuda.to_device(part.pos_loc)
    part.vel_dev = cuda.to_device(part.vel)

    ## We wish to minimize data transfer between gpu and cpu for the sake of speed.
    ## We want to avoid passing all p x p times because these are floats.
    ## Instead, we will pass only the smallest time from each block via pp_dt_blk_gpu
    ## and a BOOLEAN array called pp_events.
    ## An entry of pp_events is True only if the dt for that pair of particles
    ## is within thresh of the min_dt on its block.
    ## Now, the true GLOBAL min_dt may be from a different block.
    ## So, once passed to the cpu, we find the global min_dt = min(pp_dt_blk)
    ## and set all entries of pp_events to False except those from
    ## blocks that achieve this global min_dt.
    ## Repeat for particle-wall.
    ## Finally, if the min_dt over p-p interactions is bigger than the min_dt over p-w,
    ## we set every entry of pp_events to False. Conversely for pw_events.
    part.pw_events_dev = cuda.to_device(part.pw_events_old)
    part.pp_events_dev = cuda.to_device(part.pp_events_old)

    part.pw_dt_blk_gpu = cuda.pinned_array(pw_grd_shp, dtype=np_dtype)
    part.pw_dt_blk_gpu[:] = np.inf
    part.pw_dt_blk_dev = cuda.to_device(part.pw_dt_blk_gpu)

    part.pp_dt_blk_gpu = cuda.pinned_array(pp_grd_shp, dtype=np_dtype)
    part.pp_dt_blk_gpu[:] = np.inf
    part.pp_dt_blk_dev = cuda.to_device(part.pp_dt_blk_gpu)

    # Though inefficient, we may occasionally want to pass all pxp and pxw dt's
    # back to CPU for validation and debugging. So, we create pp_dt_gpu and pw_dt_gpu
    # for this purpose. In other cases, each is set as a 1x1 matrix and ignored
    # (other than as a placeholder in function signatures and calls).
    if check_gpu_cpu:
        pw_sh = [part.num, len(walls)]
        pp_sh = [part.num, part.num]
    else:
        pw_sh = [1, 1]
        pp_sh = [1, 1]
    part.pw_dt_gpu = cuda.pinned_array(pw_sh, dtype=np_dtype)
    part.pw_dt_gpu[:] = np.inf
    part.pw_dt_dev = cuda.to_device(part.pw_dt_gpu)
    part.pp_dt_gpu = cuda.pinned_array(pp_sh, dtype=np_dtype)
    part.pp_dt_gpu[:] = np.inf
    part.pp_dt_dev = cuda.to_device(part.pp_dt_gpu)

    update_gpu(part)


@cuda.jit(device=False)
def get_pp_dt_kernel(pos, vel, pp_gap_min, pp_events_dev, pp_dt_blk_dev,
                     pp_dt_dev):
    row_loc = cuda.threadIdx.x
    col_loc = cuda.threadIdx.y
    idx_loc = row_loc * cuda.blockDim.y + col_loc

    blk_row = cuda.blockIdx.x
    blk_col = cuda.blockIdx.y

    row_glob = blk_row * cuda.blockDim.x + row_loc
    col_glob = blk_col * cuda.blockDim.y + col_loc
    idx_glob = row_glob * cuda.blockDim.y * cuda.gridDim.y + col_glob

    p = row_glob
    q = col_glob
    N = pos.shape[0]

    pos1_shr = cuda.shared.array(shape=(pp_blk_rows, dim), dtype=nb_dtype)
    pos2_shr = cuda.shared.array(shape=(pp_blk_cols, dim), dtype=nb_dtype)
    vel1_shr = cuda.shared.array(shape=(pp_blk_rows, dim), dtype=nb_dtype)
    vel2_shr = cuda.shared.array(shape=(pp_blk_cols, dim), dtype=nb_dtype)
    pp_dt_shr = cuda.shared.array(shape=(pp_blk_rows, pp_blk_cols),
                                  dtype=nb_dtype)
    temp_shr = cuda.shared.array(shape=(pp_blk_rows, pp_blk_cols),
                                 dtype=nb_dtype)

    d = col_loc
    if d < dim:
        pos1_shr[row_loc, d] = pos[p, d]
        vel1_shr[row_loc, d] = vel[p, d]
    d = row_loc
    if d < dim:
        pos2_shr[col_loc, d] = pos[q, d]
        vel2_shr[col_loc, d] = vel[q, d]
    cuda.syncthreads()

    if (p < N) and (q < N):
        c0 = 0.0
        c1 = 0.0
        c2 = 0.0
        for d in range(dim):
            dx = pos1_shr[row_loc, d] - pos2_shr[col_loc, d]
            dv = vel1_shr[row_loc, d] - vel2_shr[col_loc, d]
            c0 += (dx**2)
            c1 += (dx * dv * 2)
            c2 += (dv**2)
        c0 -= (pp_gap_min[p, q]**2)
        dt = solver_gpu(c2, c1, c0, pp_events_dev[p, q])
    else:
        dt = np.inf

    pp_dt_shr[row_loc, col_loc] = dt
    temp_shr[row_loc, col_loc] = dt
    cuda.syncthreads()

    min_gpu(temp_shr)
    min_dt = temp_shr[0, 0]

    if (p < N) and (q < N):
        pp_events_dev[p, q] = (dt < min_dt + thresh)
    if idx_loc == 0:
        pp_dt_blk_dev[blk_row, blk_col] = min_dt
    if check_gpu_cpu:
        pp_dt_dev[p, q] = dt


@cuda.jit(device=False)
def get_pw_dt_kernel(pos_loc, vel, pw_gap_min, walls, pw_events_dev,
                     pw_dt_blk_dev, pw_dt_dev):
    row_loc = cuda.threadIdx.x
    col_loc = cuda.threadIdx.y
    idx_loc = row_loc * cuda.blockDim.y + col_loc

    blk_row = cuda.blockIdx.x
    blk_col = cuda.blockIdx.y

    row_glob = blk_row * cuda.blockDim.x + row_loc
    col_glob = blk_col * cuda.blockDim.y + col_loc
    idx_glob = row_glob * cuda.blockDim.y * cuda.gridDim.y + col_glob

    p = row_glob
    w = col_glob
    N = pos_loc.shape[0]
    W = walls.shape[0]
    shape = walls[w, 0, 0]

    pw_dt_shr = cuda.shared.array(shape=(pw_blk_rows, pw_blk_cols),
                                  dtype=nb_dtype)
    temp_shr = cuda.shared.array(shape=(pw_blk_rows, pw_blk_cols),
                                 dtype=nb_dtype)

    if (p < N) and (w < W):
        c0 = 0.0
        c1 = 0.0
        c2 = 0.0
        if shape <= -0.5:  # I'm an ignore wall
            dt = np.inf  # dt must be bound on every path before use below
        else:
            point = cuda.shared.array(shape=dim, dtype=nb_dtype)
            normal = cuda.shared.array(shape=dim, dtype=nb_dtype)
            d = row_loc
            if d < dim:
                point[d] = walls[w, 1, d]
                normal[d] = walls[w, 2, d]
            cuda.syncthreads()
            if shape <= 0.5:  # I'm a flat wall
                for d in range(dim):
                    dx = pos_loc[p, d] - point[d]
                    dv = vel[p, d]
                    c0 += dx * normal[d]
                    c1 += dv * normal[d]
                c0 -= pw_gap_min[p, w]
            elif shape <= 1.5:  # I'm a sphere wall
                for d in range(dim):
                    dx = pos_loc[p, d] - point[d]
                    dv = vel[p, d]
                    c0 += dx * dx
                    c1 += dx * dv * 2
                    c2 += dv * dv
                c0 -= pw_gap_min[p, w]**2
            elif shape <= 2.5:  # I'm a cylinder wall
                dx_ax_mag = 0.0
                dv_ax_mag = 0.0
                for d in range(dim):
                    dx = pos_loc[p, d] - point[d]
                    dv = vel[p, d]
                    dx_ax_mag += dx * normal[d]
                    dv_ax_mag += dv * normal[d]
                for d in range(dim):
                    dx = pos_loc[p, d] - point[d]
                    dv = vel[p, d]
                    dx_ax = dx_ax_mag * normal[d]
                    dx_normal = dx - dx_ax
                    dv_ax = dv_ax_mag * normal[d]
                    dv_normal = dv - dv_ax
                    c0 += dx_normal * dx_normal
                    c1 += dx_normal * dv_normal * 2
                    c2 += dv_normal * dv_normal
                c0 -= pw_gap_min[p, w]**2
            else:
                raise Exception('Invalid wall type')
            dt = solver_gpu(c2, c1, c0, pw_events_dev[p, w])
    else:
        dt = np.inf

    pw_dt_shr[row_loc, col_loc] = dt
    temp_shr[row_loc, col_loc] = dt
    cuda.syncthreads()

    min_gpu(temp_shr)
    min_dt = temp_shr[0, 0]

    if (p < N) and (w < W):
        pw_events_dev[p, w] = (dt < min_dt + thresh)
    if idx_loc == 0:
        pw_dt_blk_dev[blk_row, blk_col] = min_dt
    if check_gpu_cpu:
        pw_dt_dev[p, w] = dt


def get_pp_dt_gpu(part, walls):
    get_pp_dt_kernel[pp_grd_shp, pp_blk_shp](part.pos_dev, part.vel_dev,
                                             part.pp_gap_min_dev,
                                             part.pp_events_dev,
                                             part.pp_dt_blk_dev,
                                             part.pp_dt_dev)
    if check_gpu_cpu:
        part.pp_dt_gpu = part.pp_dt_dev.copy_to_host()
    part.pp_events_new = part.pp_events_dev.copy_to_host()
    part.pp_dt_blk_gpu = part.pp_dt_blk_dev.copy_to_host()
    part.pp_dt = np.min(part.pp_dt_blk_gpu)  # min over blocks
    blk_idx = (part.pp_dt_blk_gpu < part.pp_dt + thresh)  # which blocks obtained that min
    # Need to remove True for p-p dts that were smaller than all others on their block
    # but are larger than the global dt from other blocks. The corresponding
    # entry of blk_idx is False and part.pp_events_new is True.
    # So, we reshape blk_idx and take the entry-wise AND.
    reps = np.prod(pp_blk_shp)  # number of threads on each block
    A = np.repeat(blk_idx.ravel(), reps)  # repetition for each thread on the block
    new_sh = part.pp_events_new.shape
    blk_events = A[:np.prod(new_sh)].reshape(new_sh)  # drop extra entries and reshape
    part.pp_events_new &= blk_events


def get_pw_dt_gpu(part, walls):
    ## See comments for get_pp_dt_gpu
    get_pw_dt_kernel[pw_grd_shp, pw_blk_shp](part.pos_loc_dev, part.vel_dev,
                                             part.pw_gap_min_dev,
                                             part.walls_data_dev,
                                             part.pw_events_dev,
                                             part.pw_dt_blk_dev,
                                             part.pw_dt_dev)
    if check_gpu_cpu:
        part.pw_dt_gpu = part.pw_dt_dev.copy_to_host()
    part.pw_events_new = part.pw_events_dev.copy_to_host()
    part.pw_dt_blk_gpu = part.pw_dt_blk_dev.copy_to_host()
    part.pw_dt = np.min(part.pw_dt_blk_gpu)
    blk_idx = (part.pw_dt_blk_gpu < part.pw_dt + thresh)
    reps = np.prod(pw_blk_shp)
    A = np.repeat(blk_idx.ravel(), reps)
    new_sh = part.pw_events_new.shape
    blk_events = A[:np.prod(new_sh)].reshape(new_sh)
    part.pw_events_new &= blk_events
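# min_gpu is used by both kernels above but not defined in this excerpt. A
# minimal device-side sketch (an assumption, not the original helper): an
# interleaved tree reduction that leaves the block minimum in temp_shr[0, 0].
# It assumes every thread in the block calls it and that the shared array's
# shape matches the block dimensions.
@cuda.jit(device=True)
def min_gpu(temp_shr):
    rows = temp_shr.shape[0]
    cols = temp_shr.shape[1]
    n = rows * cols
    idx = cuda.threadIdx.x * cuda.blockDim.y + cuda.threadIdx.y
    s = 1
    while s < n:
        j = idx + s
        if idx % (2 * s) == 0 and j < n:
            a = temp_shr[idx // cols, idx % cols]
            b = temp_shr[j // cols, j % cols]
            if b < a:
                temp_shr[idx // cols, idx % cols] = b
        s *= 2
        # uniform across the block: every thread runs the same iterations
        cuda.syncthreads()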