def test_spawn_concurrent_compilation(self):
    # force CUDA context init
    cuda.get_current_device()
    # use "spawn" to avoid inheriting the CUDA context
    ctx = multiprocessing.get_context('spawn')
    q = ctx.Queue()
    p = ctx.Process(target=spawn_process_entry, args=(q,))
    p.start()
    try:
        err = q.get()
    finally:
        p.join()
    if err is not None:
        raise AssertionError(err)
    self.assertEqual(p.exitcode, 0, 'test failed in child process')
def setup(self):
    for a, arg in enumerate(sys.argv):
        if arg.find('-c') > -1:
            self._config_path = sys.argv[a + 1]
        if arg.find('-hc') > -1:
            from numba import cuda
            gpu = cuda.get_current_device()
            print("name = %s" % gpu.name)
            print("maxThreadsPerBlock = %s" % str(gpu.MAX_THREADS_PER_BLOCK))
            print("maxBlockDimX = %s" % str(gpu.MAX_BLOCK_DIM_X))
            print("maxBlockDimY = %s" % str(gpu.MAX_BLOCK_DIM_Y))
            print("maxBlockDimZ = %s" % str(gpu.MAX_BLOCK_DIM_Z))
            print("maxGridDimX = %s" % str(gpu.MAX_GRID_DIM_X))
            print("maxGridDimY = %s" % str(gpu.MAX_GRID_DIM_Y))
            print("maxGridDimZ = %s" % str(gpu.MAX_GRID_DIM_Z))
            print("maxSharedMemoryPerBlock = %s" % str(gpu.MAX_SHARED_MEMORY_PER_BLOCK))
            print("asyncEngineCount = %s" % str(gpu.ASYNC_ENGINE_COUNT))
            print("canMapHostMemory = %s" % str(gpu.CAN_MAP_HOST_MEMORY))
            print("multiProcessorCount = %s" % str(gpu.MULTIPROCESSOR_COUNT))
            print("warpSize = %s" % str(gpu.WARP_SIZE))
            print("unifiedAddressing = %s" % str(gpu.UNIFIED_ADDRESSING))
            print("pciBusID = %s" % str(gpu.PCI_BUS_ID))
            print("pciDeviceID = %s" % str(gpu.PCI_DEVICE_ID))
            exit()
        if arg.find('-h') > -1:
            print('start the .py with -c and a path to a config file')
            exit()
    self.load_config()
def gpu_release():
    ''' Release gpu memory '''
    device = cuda.get_current_device()
    device.reset()
def init_server():
    _global_addr[0] = socket.gethostname()
    logger.info("host addr: %s", _global_addr[0])
    devnum = cuda.get_current_device().id
    th = threading.Thread(target=server_loop, args=[devnum])
    th.daemon = True
    th.start()
def cuda_factor(number, primes):
    device = cuda.get_current_device()
    ffactor = np.asarray([0] * len(primes))
    dfact = cuda.to_device(ffactor)
    d_primes = cuda.to_device(np.asarray(primes))
    limit = len(primes)
    getcontext().prec = 1000
    l = Decimal(limit) / Decimal(24)
    if l <= 1024:
        tpb = l
        bpg = 1
    else:
        tpb = 1024
        bpg = Decimal(l) / Decimal(tpb)
    itpb = int(math.ceil(tpb))
    ibpg = int(math.ceil(bpg))
    cu_fact[ibpg, itpb](d_primes, number, dfact)
    c = dfact.copy_to_host()
    k = []
    for d in c:
        if int(d) != 0:
            k.append(int(d))
    return k
def monte_carlo_pricer(paths, dt, interest, volatility):
    n = paths.shape[0]

    mm = MM(shape=n, dtype=np.double, prealloc=5)

    blksz = cuda.get_current_device().MAX_THREADS_PER_BLOCK
    gridsz = int(math.ceil(float(n) / blksz))

    stream = cuda.stream()
    prng = PRNG(PRNG.MRG32K3A, stream=stream)

    # Allocate device side array
    d_normdist = cuda.device_array(n, dtype=np.double, stream=stream)

    c0 = interest - 0.5 * volatility ** 2
    c1 = volatility * math.sqrt(dt)

    d_last = cuda.to_device(paths[:, 0], to=mm.get())
    for j in range(1, paths.shape[1]):
        prng.normal(d_normdist, mean=0, sigma=1)
        d_paths = cuda.to_device(paths[:, j], stream=stream, to=mm.get())
        step(d_last, dt, c0, c1, d_normdist, out=d_paths, stream=stream)
        d_paths.copy_to_host(paths[:, j], stream=stream)
        mm.free(d_last)
        d_last = d_paths

    stream.synchronize()
def cuda_args(shape):
    """
    Compute the blocks-per-grid and threads-per-block parameters for use when
    invoking cuda kernels

    Parameters
    ----------
    shape: int or tuple of ints
        The shape of the input array that the kernel will parallelize over

    Returns
    -------
    tuple
        Tuple of (blocks_per_grid, threads_per_block)
    """
    if isinstance(shape, int):
        shape = (shape,)

    max_threads = cuda.get_current_device().MAX_THREADS_PER_BLOCK
    # Note: We divide max_threads by 2.0 to leave room for the registers
    # occupied by the kernel. For some discussion, see
    # https://github.com/numba/numba/issues/3798.
    threads_per_block = int(ceil(max_threads / 2.0) ** (1.0 / len(shape)))
    tpb = (threads_per_block,) * len(shape)
    bpg = tuple(int(ceil(d / threads_per_block)) for d in shape)

    return bpg, tpb
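# Hedged usage sketch for cuda_args (not part of the original snippet):
# `square_inplace` and `arr` are hypothetical, used only to show how the
# returned (blocks_per_grid, threads_per_block) tuples are unpacked at launch.
from numba import cuda
import numpy as np


@cuda.jit
def square_inplace(a):
    # one thread per element of the 2D array
    i, j = cuda.grid(2)
    if i < a.shape[0] and j < a.shape[1]:
        a[i, j] *= a[i, j]


arr = cuda.to_device(np.ones((2048, 2048), dtype=np.float32))
bpg, tpb = cuda_args(arr.shape)
square_inplace[bpg, tpb](arr)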
def gpu_name():
    '''
    Query the GPU's properties via Numba to obtain the name of the device.
    '''
    device = cuda.get_current_device()
    name = device.name
    return name
def bloch_sim_batch_cuda3(Nexample, batch_size, Nk, PDr, T1r, T2r, dfr, b1r,
                          M0, trr, ter, far, ti):
    sim_out = np.zeros((Nexample, Nk), dtype=np.complex128)       # final output data
    batch_data = np.zeros((batch_size, Nk), dtype=np.complex128)  # batch output data
    bT1r = np.zeros(batch_size, dtype=np.float32)  # batch T1, T2, PD, df arrays
    bT2r = np.zeros(batch_size, dtype=np.float32)
    bPDr = np.zeros(batch_size, dtype=np.float32)
    bdfr = np.zeros(batch_size, dtype=np.float32)
    bb1r = np.zeros(batch_size, dtype=np.float32)

    # set total number of threads on GPU
    device = cuda.get_current_device()
    tpb = device.WARP_SIZE
    bpg = int(np.ceil(float(batch_size) / tpb))

    # batch loop
    for nb in range(Nexample // batch_size):
        # print('Doing batch %d/%d for applying Bloch sim' % (nb+1, Nexample//batch_size))
        bstart = nb * batch_size      # batch start index
        bstop = bstart + batch_size   # batch stop index
        # print('%d:%d' % (bstart, bstop))
        bT1r = T1r[bstart:bstop]      # batch T1, T2, PD, df arrays
        bT2r = T2r[bstart:bstop]
        bPDr = PDr[bstart:bstop]
        bdfr = dfr[bstart:bstop]
        bb1r = b1r[bstart:bstop]
        # start the parallel computing on GPU
        bloch_sim_irssfp_cuda3[bpg, tpb](batch_size, Nk, bPDr, bT1r, bT2r, bdfr,
                                         bb1r, M0, trr, ter, far, ti, batch_data)
        sim_out[bstart:bstop, :] = batch_data
    return sim_out
def initialize_and_compile(mat_size, mat_type):
    shape = np.array([mat_size, mat_size])
    if mat_type.startswith('uint'):
        shape[1] = math.ceil(mat_size / float(mat_type[4:]))
    mat = cuda.to_device(np.zeros(tuple(shape), dtype=mat_type))

    global threadsperblock
    threadsperblock = (int(np.sqrt(cuda.get_current_device().MAX_THREADS_PER_BLOCK)),) * 2
    blockspergrid = tuple(int(math.ceil(mat_size / threadsperblock[i])) for i in (0, 1))
    is_changed = cuda.device_array((1,), dtype=bool)

    global size, matmul_method
    if mat_type == 'bool':
        matmul_method = matmul_bool[blockspergrid, threadsperblock]
    elif mat_type == 'uint8':
        matmul_method = matmul_uint[blockspergrid, threadsperblock]
        size = 8
    elif mat_type == 'uint32':
        matmul_method = matmul_uint[blockspergrid, threadsperblock]
        size = 32
    else:
        raise ValueError(
            'GPU multiplication of matrices type {} is not supported'.format(mat_type))
    matmul_method(mat, mat, mat, is_changed)
def reset_GPU():
    """ reset and clear memory of the GPU """
    from numba import cuda
    device = cuda.get_current_device()
    device.reset()
def hist_vec_by_r_cu(x, dr, r_bin, r_max, middle=None, gpu=0):
    r"""Summing vector based function to modulus based function.
    $f(r) := \int F(\bm{r})\delta(r-|\bm{r}|)\mathrm{d}\bm{r} /
             \int \delta(r-|\bm{r}|)\mathrm{d}\bm{r}$

    :param x: np.ndarray, input
    :param dr: np.ndarray[ndim=2], x[len_1, len_2, ..., len_n] ~
        (r1: len_1, r2: len_2, ..., r_n: len_n);
        x: (Nx, Ny, Nz) -> r: (3 (xyz), N); currently, only Nx == Ny == Nz is supported.
    :param r_bin: double, bin size of r
    :param r_max: double, max of r
    :param gpu: int, gpu number
    :return: np.ndarray, averaged $F(x, y, ...) -> 1/(4\pi r^2) f(\sqrt{x^2+y^2+...})$
    """
    dim = np.asarray(x.shape, dtype=np.int64)
    ret = np.zeros(int(r_max / r_bin) + 1, dtype=np.float64)
    r_max2 = float((ret.shape[0] * r_bin) ** 2)
    cter = np.zeros(ret.shape, dtype=np.int64)
    x = x.ravel(order='F')
    if middle is None:
        middle = np.zeros(dim.shape[0], dtype=np.float64)
    with cuda.gpus[gpu]:
        device = cuda.get_current_device()
        tpb = device.WARP_SIZE
        bpg = int(x.shape[0] // tpb + 1)
        if np.issubdtype(x.dtype, np.complexfloating):
            x_real = np.ascontiguousarray(x.real)
            x_imag = np.ascontiguousarray(x.imag)
            ret_imag = np.zeros(int(r_max / r_bin) + 1, dtype=np.float64)
            _cu_kernel_complex[bpg, tpb](x_real, x_imag, dim, middle, dr, r_bin,
                                         r_max2, ret, ret_imag, cter)
            ret = ret + ret_imag * 1j
        else:
            _cu_kernel[bpg, tpb](x, dim, middle, dr, r_bin, r_max2, ret, cter)
    cter[cter == 0] = 1
    return ret / cter
def main():
    A = np.ones((20, 50000), dtype=np.float32)
    B = np.ones((3072, 50000), dtype=np.float32)
    C = np.ones((20, 3072, 50000), dtype=np.float32)
    (Ni, Nj, Nk) = C.shape

    my_gpu = cuda.get_current_device()
    thread_ct = 8
    block_ct_x = int(math.ceil(float(Ni) / thread_ct))
    block_ct_y = int(math.ceil(float(Nj) / thread_ct))
    block_ct_z = int(math.ceil(float(Nk) / thread_ct))

    blockdim = thread_ct, thread_ct, thread_ct
    griddim = block_ct_x, block_ct_y, block_ct_z

    print("Threads per block:", blockdim)
    print("Blocks per grid:", griddim)

    start = timer()
    Cg = cuda.to_device(C)
    mult_kernel[griddim, blockdim](A, B, Cg)
    Cg.to_host()
    dt = timer() - start

    print("Computation done in %f s" % (dt))
    print('C[:3,1,1] = ', C[:3, 1, 1])
    print('C[-3:,1,1] = ', C[-3:, 1, 1])
def incoherent_scattering(traj, q, dq, q_vectors=None):
    ret = np.zeros((traj.shape[1], traj.shape[0]))
    n = int(np.round(q / dq))
    q_vecs = q_vectors
    if q_vecs is None:

        @nb.jit(nopython=True)
        def _generate_q_vecs(_n, _dq, _q, _shape):
            ret = []
            for _qq in np.ndindex(_shape):
                _q_norm = 0
                for _qqq in _qq:
                    _q_norm += (_qqq - _n) ** 2
                _q_norm = _q_norm ** 0.5 * _dq
                # keep lattice vectors whose modulus is within 0.15% of the target q
                if abs(_q_norm - _q) / _q < 1.5e-3:
                    ret.append(_qq)
            return ret

        shape = (n * 2,) * 3
        q_vecs = _generate_q_vecs(n, dq, q, shape)
        q_vecs = np.asarray((np.array(q_vecs) - n) * dq, dtype=np.float64)
    print('Start with Q vecs:', q_vecs.shape)
    import time
    s = time.time()
    with cuda.gpus[2]:
        device = cuda.get_current_device()
        tpb = (device.WARP_SIZE,) * 2
        bpg = (math.ceil(traj.shape[0] / tpb[0]),
               math.ceil(traj.shape[0] / tpb[1]))
        _cu_kernel[bpg, tpb](traj, q_vecs, ret)
    print(time.time() - s)
    return ret.T / np.arange(traj.shape[0], 0, -1)[:, None] / q_vecs.shape[0], q_vecs
def __init__(self, shape, dtype, prealloc):
    self.device = cuda.get_current_device()
    self.freelist = deque()
    self.events = {}
    for i in range(prealloc):
        gpumem = cuda.device_array(shape=shape, dtype=dtype)
        self.freelist.append(gpumem)
        self.events[gpumem] = cuda.event(timing=False)
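# Other snippets call mm.get() and mm.free() on this memory manager, but those
# methods are not included here. A hedged sketch of how they could work (pop a
# preallocated buffer from the freelist and use the per-buffer CUDA event to
# make reuse safe across streams); treat this as an illustration, not the
# original implementation:

def get(self, stream=0):
    assert self.freelist
    gpumem = self.freelist.popleft()
    evnt = self.events[gpumem]
    if not evnt.query():
        # the buffer's last use has not completed; make the stream wait on it
        evnt.wait(stream=stream)
    return gpumem

def free(self, gpumem, stream=0):
    # record the point on the stream after which the buffer may be reused
    self.events[gpumem].record(stream=stream)
    self.freelist.append(gpumem)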
def gpu_compute_capability():
    '''
    Query the GPU's properties via Numba to obtain the compute capability of the device.
    '''
    device = cuda.get_current_device()
    compute = device.compute_capability
    return compute
def monte_carlo_pricer(paths, dt, interest, volatility):
    n = paths.shape[0]
    num_streams = 2

    part_width = int(math.ceil(float(n) / num_streams))
    partitions = [(0, part_width)]
    for i in range(1, num_streams):
        begin, end = partitions[i - 1]
        begin, end = end, min(end + (end - begin), n)
        partitions.append((begin, end))
    partlens = [end - begin for begin, end in partitions]

    mm = MM(shape=part_width, dtype=np.double, prealloc=10 * num_streams)

    device = cuda.get_current_device()
    blksz = device.MAX_THREADS_PER_BLOCK
    gridszlist = [int(math.ceil(float(partlen) / blksz))
                  for partlen in partlens]

    strmlist = [cuda.stream() for _ in range(num_streams)]

    prnglist = [PRNG(PRNG.MRG32K3A, stream=strm) for strm in strmlist]

    # Allocate device side array
    d_normlist = [cuda.device_array(partlen, dtype=np.double, stream=strm)
                  for partlen, strm in zip(partlens, strmlist)]

    c0 = interest - 0.5 * volatility ** 2
    c1 = volatility * math.sqrt(dt)

    # Configure the kernel
    # Similar to CUDA-C: cu_monte_carlo_pricer<<<gridsz, blksz, 0, stream>>>
    steplist = [cu_step[gridsz, blksz, strm]
                for gridsz, strm in zip(gridszlist, strmlist)]

    d_lastlist = [cuda.to_device(paths[s:e, 0], to=mm.get(stream=strm))
                  for (s, e), strm in zip(partitions, strmlist)]

    for j in range(1, paths.shape[1]):
        for prng, d_norm in zip(prnglist, d_normlist):
            prng.normal(d_norm, mean=0, sigma=1)

        d_pathslist = [cuda.to_device(paths[s:e, j], stream=strm,
                                      to=mm.get(stream=strm))
                       for (s, e), strm in zip(partitions, strmlist)]

        for step, args in zip(steplist, zip(d_lastlist, d_pathslist, d_normlist)):
            d_last, d_paths, d_norm = args
            step(d_last, d_paths, dt, c0, c1, d_norm)

        for d_paths, strm, (s, e) in zip(d_pathslist, strmlist, partitions):
            d_paths.copy_to_host(paths[s:e, j], stream=strm)
            mm.free(d_last, stream=strm)

        d_lastlist = d_pathslist

    for strm in strmlist:
        strm.synchronize()
def GPUWrapper(data_out, device_id, photons_req_per_device, max_photons_per_device,
               muS, g, source_type, source_param1, source_param2,
               detector_params, max_N, max_distance_from_det,
               target_type, target_mask, target_gridsize,
               z_target, z_bounded, z_range, ret_cols):

    # TODO: These numbers can be optimized based on the device / architecture / number of photons
    threads_per_block = 256
    blocks = 64
    photons_per_thread = int(np.ceil(float(photons_req_per_device) / (threads_per_block * blocks)))
    max_photons_per_thread = int(np.ceil(float(max_photons_per_device) / (threads_per_block * blocks)))

    cuda.select_device(device_id)
    device = cuda.get_current_device()
    stream = cuda.stream()  # use stream to trigger async memory transfer

    # Keeping this piece of code here for now - potentially we need this in the future
    # with compiler_lock:  # lock the compiler
    #     # prepare function for this thread
    #     # the jitted CUDA kernel is loaded into the current context
    #     # TODO: ideally we should call cuda.jit(signature)(propPhotonGPU), where
    #     # signature is the call to the function. So far I couldn't figure out what is the signature of the
    #     # rng_states, closest I got to was: array(Record([('s0', '<u8'), ('s1', '<u8')]), 1d, A)
    #     # But I couldn't get it to work yet.
    #     MC_cuda_kernel = cuda.jit(propPhotonGPU)

    data = np.ndarray(shape=(threads_per_block * blocks, photons_per_thread, 11), dtype=np.float32)
    photon_counters = np.ndarray(shape=(threads_per_block * blocks, 5), dtype=int)
    data_out_device = cuda.device_array_like(data, stream=stream)
    photon_counters_device = cuda.device_array_like(photon_counters, stream=stream)

    # Used to initialize the threads' random states.
    rng_states = create_xoroshiro128p_states(
        threads_per_block * blocks,
        seed=(np.random.randint(sys.maxsize) - 128) + device_id,
        stream=stream)

    # Actual kernel call
    propPhotonGPU[blocks, threads_per_block](
        rng_states, data_out_device, photon_counters_device,
        photons_per_thread, max_photons_per_thread, muS, g,
        source_type, source_param1, source_param2,
        detector_params, max_N, max_distance_from_det,
        target_type, target_mask, target_gridsize,
        z_target, z_bounded, z_range)

    # Copy data back
    data_out_device.copy_to_host(data, stream=stream)
    photon_counters_device.copy_to_host(photon_counters, stream=stream)
    stream.synchronize()

    data = data.reshape(data.shape[0] * data.shape[1], data.shape[2])
    data = data[:, ret_cols]
    data_out[device_id][0] = data

    photon_counters_aggr = np.squeeze(np.sum(photon_counters, axis=0))
    data_out[device_id][1] = photon_counters_aggr
def compute_fid_score(self):
    for img_type in self.image_types:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
        self.fid_value[img_type] = calculate_fid_given_paths(
            [self.gt_path, self.model_path], None, low_profile=False)
        print("FID: ", self.fid_value[img_type])
        # self.fid_value[img_type] = calculate_fid_given_paths(
        #     paths=[self.gt_path, self.model_path], batch_size=self.batch_size,
        #     dims=self.fid_dims, num_workers=self.num_workers,
        #     mod_type='_' + img_type)
        device = cuda.get_current_device()
        device.reset()
def calc_psi(r, t, z, out):
    out[:] = 0
    K = r.shape[0]
    MY, MX = out.shape
    gpu = cuda.get_current_device()
    threadsperblock = gpu.MAX_THREADS_PER_BLOCK
    blockspergrid = m.ceil(K * MY * MX / threadsperblock)
    psi_kernel[blockspergrid, threadsperblock](r, t, z, out)
    return out
def get_number_of_cores_current_device():
    device = cuda.get_current_device()
    sms = getattr(device, 'MULTIPROCESSOR_COUNT')
    cc = device.compute_capability
    cores_per_sm = cc_cores_per_SM_dict[cc]
    total_cores = cores_per_sm * sms
    print("GPU compute capability: ", cc)
    print("GPU total number of SMs: ", sms)
    print("total cores: ", total_cores)
    return total_cores
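# The snippet above assumes a module-level `cc_cores_per_SM_dict` that is not
# shown. A minimal sketch of such a mapping (compute capability -> CUDA cores
# per SM, per NVIDIA's published architecture figures); verify the values for
# your architecture before relying on them.
cc_cores_per_SM_dict = {
    (3, 0): 192, (3, 5): 192, (3, 7): 192,   # Kepler
    (5, 0): 128, (5, 2): 128,                # Maxwell
    (6, 0): 64,  (6, 1): 128,                # Pascal
    (7, 0): 64,  (7, 5): 64,                 # Volta / Turing
    (8, 0): 64,  (8, 6): 128,                # Ampere
}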
def _cumacula(
    fmod,
    deltaratio,
    t,
    theta_star,
    theta_spot,
    theta_inst,
    tstart,
    tend,
    c,
    d,
    Fab,
    TdeltaV=False,
):
    if fmod.dtype == "float32":
        numba_type = float32
    elif fmod.dtype == "float64":
        numba_type = float64

    if str(numba_type) in _kernel_cache:
        kernel = _kernel_cache[str(numba_type)]
    else:
        sig = _numba_cumacula_signature(numba_type)
        if fmod.dtype == "float32":
            kernel = _kernel_cache[str(numba_type)] = cuda.jit(
                sig, fastmath=True)(_numba_cumacula_32)
            print("Registers(32)", kernel._func.get().attrs.regs)
        elif fmod.dtype == "float64":
            kernel = _kernel_cache[str(numba_type)] = cuda.jit(
                sig, fastmath=True)(_numba_cumacula_64)
            print("Registers(64)", kernel._func.get().attrs.regs)

    gpu = cuda.get_current_device()
    numSM = gpu.MULTIPROCESSOR_COUNT
    threadsperblock = (128,)
    blockspergrid = (numSM * 20,)

    kernel[blockspergrid, threadsperblock](
        fmod,
        deltaratio,
        t,
        theta_star,
        theta_spot,
        theta_inst,
        tstart,
        tend,
        c,
        d,
        Fab,
        TdeltaV,
    )
    cuda.synchronize()
def test_compile_ptx_for_current_device(self):
    def add(x, y):
        return x + y

    args = (float32, float32)
    ptx, resty = compile_ptx_for_current_device(add, args, device=True)

    # Check we target the current device's compute capability, or the
    # closest compute capability supported by the current toolkit.
    device_cc = cuda.get_current_device().compute_capability
    cc = cuda.cudadrv.nvvm.find_closest_arch(device_cc)
    target = f'.target sm_{cc[0]}{cc[1]}'
    self.assertIn(target, ptx)
def cu_cell_list_argsort(pos, box, ibox, gpu=0):
    n = pos.shape[0]
    n_cell = np.multiply.reduce(ibox)
    cell_id = np.zeros(n).astype(np.int64)
    with cuda.gpus[gpu]:
        device = cuda.get_current_device()
        tpb = device.WARP_SIZE
        bpg = ceil(n / tpb)
        cu_cell_ind[bpg, tpb](pos, box, ibox, cell_id)
    cell_list = np.argsort(cell_id)  # pyculib radixsort for cuda acceleration.
    cell_id = cell_id[cell_list]
    cell_counts = np.r_[0, np.cumsum(np.bincount(cell_id, minlength=n_cell))]
    return cell_list.astype(np.int64), cell_counts.astype(np.int64)
def Clear():
    from numba import cuda
    device = cuda.get_current_device()
    device.reset()
    # cuda.current_context().trashing.clear()
    s = cuda.current_context().get_memory_info()
    print(s)
    cuda.current_context().deallocations.clear()
    s = cuda.current_context().get_memory_info()
    print(s)
    cuda.select_device(0)
    # do tf stuff
    cuda.close()
def AtF2(z, psi, r, out):
    """
    :param z: K x MY x MX
    :param psi: B x K x MY x MX
    :param r: K x 2
    :param out: B x NY x NX
    :return:
    """
    gpu = cuda.get_current_device()
    threadsperblock = gpu.MAX_THREADS_PER_BLOCK
    blockspergrid = m.ceil(np.prod(z.shape) / threadsperblock)
    AtF2_kernel[blockspergrid, threadsperblock](z, psi, r, out)
    return out
def train_cuda(X, y, conf, iterations=6000):
    gpu = cuda.get_current_device()
    syn0, syn1 = conf
    syn0g = cuda.to_device(syn0)
    syn1g = cuda.to_device(syn1)
    Xg = cuda.to_device(X)
    yg = cuda.to_device(y)
    rows = X.shape[0]
    thread_ct = (gpu.WARP_SIZE, gpu.WARP_SIZE)
    # ndims is a module-level global in the original code
    block_ct = [int(math.ceil(float(x) / thread_ct[0])) for x in [rows, ndims]]
    train_kernel[block_ct, thread_ct](Xg, yg, syn0g, syn1g, iterations)
    syn0g.to_host()
    syn1g.to_host()
    return (syn0, syn1)
def __init__(self, trainPath='proxy'):
    self.__name = trainPath
    self.__model = models.Sequential()
    self.__trainPath = os.path.join(learnDir, 'train', trainPath)
    self.__testPath = os.path.join(learnDir, 'test')
    self.__modelPath = os.path.join(main, 'models')
    self.__modelType = ''
    self.__history = None
    self.__train_gen = ImageDataGenerator()
    self.__val_gen = ImageDataGenerator()
    self.__gpu = cuda.get_current_device()
    self.__batchSize = 64
    self.__seed = 101
    self.__xSize = 224
    self.__ySize = 224
def Qoverlap_real2(r, z, out):
    """
    :param r: K x 2
    :param z: BB x K x MY x MX
    :param out: BB x NY x NX
    :return: out
    """
    BB = out.shape[0]
    K = r.shape[0]
    out[:] = 1
    gpu = cuda.get_current_device()
    threadsperblock = gpu.MAX_THREADS_PER_BLOCK
    blockspergrid = m.ceil(BB * K * np.prod(z.shape) / threadsperblock)
    overlap_kernel_real2[blockspergrid, threadsperblock](r, z, out)
    return out
def calc_psi_denom(r, t, out):
    """
    :param r: K x 2
    :param t: BB x NY x NX
    :param out: BB x MY x MX
    :return:
    """
    out[:] = 0
    K = r.shape[0]
    BB, MY, MX = out.shape
    gpu = cuda.get_current_device()
    threadsperblock = gpu.MAX_THREADS_PER_BLOCK
    blockspergrid = m.ceil(BB * K * MY * MX / threadsperblock)
    psi_denom_kernel[blockspergrid, threadsperblock](r, t, out)
    return out
def cuda_all_euc_dists(coords_arr):
    """
    Pass an array of shape (models, side, dimensions), return all the
    euclidean distances between each coord in each model.
    """
    num_threads = cuda.get_current_device().WARP_SIZE
    models, side, _ = coords_arr.shape
    out_arr = np.zeros((side, side, models))
    # What block dims?
    tpb = (32, 16, 2)
    # Given threads per block, what should blocks per grid be?
    bpg = _grid_dim(out_arr, tpb)
    cuda_all_euc_dists_inner[bpg, tpb](coords_arr, out_arr)
    return out_arr
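# The helper `_grid_dim` used above is not shown. A minimal sketch under the
# assumption that it derives a grid size that covers the array for the given
# threads-per-block tuple (an illustration, not the original helper):
import math

def _grid_dim(arr, tpb):
    # one grid dimension per array axis, rounded up so the grid covers the array
    return tuple(int(math.ceil(n / t)) for n, t in zip(arr.shape, tpb))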
def device_controller(cid):
    cuda.select_device(cid)                 # bind device to thread
    device = cuda.get_current_device()      # get current device

    # print some information about the CUDA card
    prefix = '[%s]' % device
    print(prefix, 'device_controller', cid, '| CC', device.COMPUTE_CAPABILITY)

    max_thread = device.MAX_THREADS_PER_BLOCK

    with compiler_lock:                     # lock the compiler
        # prepare function for this thread
        # the jitted CUDA kernel is loaded into the current context
        cuda_kernel = cuda.jit(signature)(kernel)

    # prepare data
    N = 12345
    data = np.arange(N, dtype=np.int32) * (cid + 1)
    orig = data.copy()

    # determine number of threads and blocks
    if N >= max_thread:
        ngrid = int(ceil(float(N) / max_thread))
        nthread = max_thread
    else:
        ngrid = 1
        nthread = N

    print(prefix, 'grid x thread = %d x %d' % (ngrid, nthread))

    # real CUDA work
    d_data = cuda.to_device(data)                 # transfer to device
    cuda_kernel[ngrid, nthread](d_data, d_data)   # compute inplace
    d_data.copy_to_host(data)                     # transfer to host

    # check result
    if not np.all(data == orig + 1):
        raise ValueError
def monte_carlo_pricer(paths, dt, interest, volatility):
    n = paths.shape[0]

    blksz = cuda.get_current_device().MAX_THREADS_PER_BLOCK
    gridsz = int(math.ceil(float(n) / blksz))

    # Instantiate cuRAND PRNG
    prng = PRNG(PRNG.MRG32K3A)

    # Allocate device side array
    d_normdist = cuda.device_array(n, dtype=np.double)

    c0 = interest - 0.5 * volatility ** 2
    c1 = volatility * math.sqrt(dt)

    # Simulation loop
    d_last = cuda.to_device(paths[:, 0])
    for j in range(1, paths.shape[1]):
        prng.normal(d_normdist, mean=0, sigma=1)
        d_paths = cuda.to_device(paths[:, j])
        step(d_last, dt, c0, c1, d_normdist, out=d_paths)
        d_paths.copy_to_host(paths[:, j])
        d_last = d_paths
from numba import cuda
import numpy as np
import cProfile


@cuda.jit
def increment_by_one(an_array):
    tx = cuda.threadIdx.x
    bidx = cuda.blockIdx.x
    bdim = cuda.blockDim.x
    idx = tx + bidx * bdim
    if idx < an_array.size:
        an_array[idx] += 1


dev = cuda.get_current_device()
print('Cuda device name = {}'.format(dev.name))

a = np.ones(1000000)
thread_per_block = 512
block_per_grid = int(np.ceil(a.size / thread_per_block))
increment_by_one[block_per_grid, thread_per_block](a)

profile = cProfile.Profile()
profile.enable()

d_a = cuda.to_device(a)
for i in range(10000):
    increment_by_one[block_per_grid, thread_per_block](d_a)
    # a += 1
d_a.copy_to_host(a)
def main():
    # device = cuda.get_current_device()
    # maxtpb = device.MAX_THREADS_PER_BLOCK
    # warpsize = device.WARP_SIZE
    maxtpb = 512
    warpsize = 32

    # benchmark loop
    vary_warpsize = []

    baseline = []
    ilpx2 = []
    ilpx4 = []
    ilpx8 = []

    # For OSX 10.8 where the GPU is used for graphic as well,
    # increasing the following to 10 * 2 ** 20 seems to be necessary to
    # produce consistent result.
    approx_data_size = 1.5 * 2 ** 20

    for multiplier in range(1, maxtpb // warpsize + 1):
        blksz = warpsize * multiplier
        gridsz = ceil_to_nearest(float(approx_data_size) / blksz, 8)
        print('kernel config [%d, %d]' % (gridsz, blksz))

        N = blksz * gridsz
        A = np.arange(N, dtype=np.float32)
        B = np.arange(N, dtype=np.float32)

        print('data size %dMB' % (N / 2. ** 20 * A.dtype.itemsize))

        dA = cuda.to_device(A)
        dB = cuda.to_device(B)

        assert float(N) / blksz == gridsz, (float(N) / blksz, gridsz)
        vary_warpsize.append(blksz)

        dC = cuda.device_array_like(A)
        basetime = time_this(vec_add, gridsz, blksz, (dA, dB, dC))
        expected_result = dC.copy_to_host()
        if basetime > 0:
            baseline.append(N / basetime)

        dC = cuda.device_array_like(A)
        x2time = time_this(vec_add_ilp_x2, gridsz // 2, blksz, (dA, dB, dC))
        np.testing.assert_allclose(expected_result, dC.copy_to_host())
        if x2time > 0:
            ilpx2.append(N / x2time)

        dC = cuda.device_array_like(A)
        x4time = time_this(vec_add_ilp_x4, gridsz // 4, blksz, (dA, dB, dC))
        np.testing.assert_allclose(expected_result, dC.copy_to_host())
        if x4time > 0:
            ilpx4.append(N / x4time)

        dC = cuda.device_array_like(A)
        x8time = time_this(vec_add_ilp_x8, gridsz // 8, blksz, (dA, dB, dC))
        np.testing.assert_allclose(expected_result, dC.copy_to_host())
        if x8time > 0:
            ilpx8.append(N / x8time)

    pylab.plot(vary_warpsize[:len(baseline)], baseline, label='baseline')
    pylab.plot(vary_warpsize[:len(ilpx2)], ilpx2, label='ILP2')
    pylab.plot(vary_warpsize[:len(ilpx4)], ilpx4, label='ILP4')
    pylab.plot(vary_warpsize[:len(ilpx8)], ilpx8, label='ILP8')
    pylab.legend(loc=4)
    pylab.title(cuda.get_current_device().name)
    pylab.xlabel('block size')
    pylab.ylabel('float per second')
    pylab.show()