def init_output(self): out_shape = self.get_output_shape() rows = int(np.prod(out_shape[:3])) cols = out_shape[3] #util.log('Layer %s allocating: (%s,%s) ', self.name, rows, cols) self.output = gpuarray.GPUArray((rows, cols), dtype=np.float32) self.output_grad = gpuarray.GPUArray((rows, cols), dtype=np.float32)
def _gpu_init(self, debug): self.dev_n = cuda.to_gpu_async(np.array([self.n]).astype(np.int32), stream=self.stream2) self.neighbors_index = cuda.to_gpu_async(self.num_neighbors, stream=self.stream1) exclusiveScan(self.neighbors_index, stream=self.stream1) self.dev_num_neighbors = cuda.to_gpu_async(self.num_neighbors, stream=self.stream2) self.dev_states = cuda.to_gpu_async(self.states, stream=self.stream2) self.dev_waypoints = cuda.to_gpu_async(self.waypoints, stream=self.stream2) self.dev_neighbors = cuda.to_gpu_async(self.neighbors, stream=self.stream2) self.dev_Gindicator = cuda.GPUArray(self.Vopen.shape, self.Vopen.dtype) self.dev_xindicator = cuda.GPUArray(self.Vopen.shape, self.Vopen.dtype) self.dev_xindicator_zeros = cuda.GPUArray(self.Vopen.shape, self.Vopen.dtype) self.zero_val = np.zeros((), np.int32) self.dev_xindicator_zeros.fill(self.zero_val, stream=self.stream1) # self.stream1.synchronize() self.stream2.synchronize()
def test_exec_raises_on_dtype(): dtype = np.float32 complex_dtype = np.complex64 M = 4096 tol = 1e-3 shape = (16, 16, 16) dim = len(shape) kxyz = utils.gen_nu_pts(M, dim=dim).astype(dtype) c = utils.gen_nonuniform_data(M).astype(complex_dtype) c_gpu = gpuarray.to_gpu(c) # Using c.real gives us wrong dtype here... c_gpu_wrong_dtype = gpuarray.to_gpu(c.real) kxyz_gpu = gpuarray.to_gpu(kxyz) fk_gpu = gpuarray.GPUArray(shape, dtype=complex_dtype) # Here we'll intentionally contruct an incorrect array dtype. fk_gpu_wrong_dtype = gpuarray.GPUArray(shape, dtype=np.complex128) plan = cufinufft(1, shape, eps=tol, dtype=dtype) plan.set_pts(kxyz_gpu[0], kxyz_gpu[1], kxyz_gpu[2]) with pytest.raises(TypeError): plan.execute(c_gpu, fk_gpu_wrong_dtype) with pytest.raises(TypeError): plan.execute(c_gpu_wrong_dtype, fk_gpu)
def fun_load(config, sock_data=5000): send_queue = config['queue_l2t'] recv_queue = config['queue_t2l'] # recv_queue and send_queue are multiprocessing.Queue # recv_queue is only for receiving # send_queue is only for sending # if need to do random crop and mirror flag_randproc = not config['use_data_layer'] flag_batch = config['batch_crop_mirror'] drv.init() dev = drv.Device(int(config['gpu'][-1])) ctx = dev.make_context() sock = zmq.Context().socket(zmq.PAIR) sock.bind('tcp://*:{0}'.format(sock_data)) shape, dtype, h = sock.recv_pyobj() print 'shared_x information received' gpu_data_remote = gpuarray.GPUArray(shape, dtype, gpudata=drv.IPCMemoryHandle(h)) gpu_data = gpuarray.GPUArray(shape, dtype) img_mean = recv_queue.get() print 'img_mean received' # The first time, do the set ups and other stuff # receive information for loading while True: # getting the hkl file name to load hkl_name = recv_queue.get() # print hkl_name data = hkl.load(hkl_name) - img_mean # print 'load ', time.time() - bgn_time if flag_randproc: param_rand = recv_queue.get() data = crop_and_mirror(data, param_rand, flag_batch=flag_batch) gpu_data.set(data) # wait for computation on last minibatch to finish msg = recv_queue.get() assert msg == 'calc_finished' drv.memcpy_peer(gpu_data_remote.ptr, gpu_data.ptr, gpu_data.dtype.itemsize * gpu_data.size, ctx, ctx) ctx.synchronize() send_queue.put('copy_finished')
def fun_load(config, sock_data_2=5001): send_queue = config['queue_l2t'] recv_queue = config['queue_t2l'] # recv_queue and send_queue are multiprocessing.Queue # recv_queue is only for receiving # send_queue is only for sending num_timesteps = config['num_timesteps'] num_seq = config['num_seq'] img_scale_x = config['img_scale_x'] img_scale_y = config['img_scale_y'] drv.init() dev = drv.Device(int(config['gpu'][-1])) ctx_2 = dev.make_context() sock_2 = zmq.Context().socket(zmq.PAIR) sock_2.bind('tcp://*:{0}'.format(sock_data_2)) shape_temporal, dtype_temporal, h_temporal = sock_2.recv_pyobj() print 'shared_x information received', shape_temporal gpu_data_remote_temporal = gpuarray.GPUArray( shape_temporal, dtype_temporal, gpudata=drv.IPCMemoryHandle(h_temporal)) gpu_data_temporal = gpuarray.GPUArray(shape_temporal, dtype_temporal) # print 'img_mean received' # The first time, do the set ups and other stuff # receive information for loading while True: video_name_temporal = recv_queue.get() rand_param = recv_queue.get() if config['modal'] == 'rgb': data_temporal = prepare_data_rgb(video_name_temporal, num_timesteps, num_seq, rand_param, data_shape=(img_scale_x, img_scale_y, 3)) else: data_temporal = prepare_data_flow(video_name_temporal, num_timesteps, num_seq, rand_param, data_shape=(img_scale_x, img_scale_y)) gpu_data_temporal.set(data_temporal) # wait for computation on last minibatch to finish msg = recv_queue.get() assert msg == 'calc_finished' drv.memcpy_peer( gpu_data_remote_temporal.ptr, gpu_data_temporal.ptr, gpu_data_temporal.dtype.itemsize * gpu_data_temporal.size, ctx_2, ctx_2) ctx_2.synchronize() send_queue.put('copy_finished')
def thunk(): x, truth = inputs[0], inputs[1] context = None if hasattr(x[0], 'context'): context = x[0].context z = outputs[0] z_shape = x[0].shape if z[0] is None or z[0].shape != z_shape: z[0] = pygpu.zeros(z_shape, dtype=theano.config.floatX, context=context) x_ptr, _ = get_tens_ptr(x[0]) truth_ptr, _ = get_tens_ptr(truth[0]) z_ptr, z_obj = get_tens_ptr(z[0]) # store as gpuarray best_idx_ptr = gpuarray.GPUArray(shape=(np.prod( truth[0].shape[:2]), ), dtype=np.int32) best_iou_ptr = gpuarray.GPUArray(shape=(np.prod( truth[0].shape[:2]), ), dtype=np.float32) yolo_ptr, _ = get_yolo_info(n_classes, n_anchors, l_obj, l_noobj, anchors) # get best index index_fn(best_idx_ptr, best_iou_ptr, x_ptr, truth_ptr, yolo_ptr, block=(1, 1, 1), grid=(x[0].shape[0], 1, 1)) n_total = np.int32(x[0].shape[0] * n_anchors * np.prod(x[0].shape[-2:])) n_matched = np.int32(gpuarray.sum(best_idx_ptr != -1).get()) grad_fn(z_ptr, best_idx_ptr, best_iou_ptr, x_ptr, truth_ptr, yolo_ptr, n_matched, n_total, block=(n_anchors, 1, 1), grid=(x[0].shape[0], x[0].shape[2], x[0].shape[3])) # free all memory del best_idx_ptr del best_iou_ptr yolo_ptr.free()
def frexp(arg, stream=None): """Return a tuple `(significands, exponents)` such that `arg == significand * 2**exponent`. """ sig = gpuarray.GPUArray(arg.shape, arg.dtype) expt = gpuarray.GPUArray(arg.shape, arg.dtype) func = elementwise.get_frexp_kernel() func.set_block_shape(*arg._block) func.prepared_async_call(arg._grid, stream, arg.gpudata, sig.gpudata, expt.gpudata, arg.mem_size) return sig, expt
def modf(arg, stream=None): """Return a tuple `(fracpart, intpart)` of arrays containing the integer and fractional parts of `arg`. """ intpart = gpuarray.GPUArray(arg.shape, arg.dtype) fracpart = gpuarray.GPUArray(arg.shape, arg.dtype) func = elementwise.get_modf_kernel() func.set_block_shape(*arg._block), func.prepared_async_call(arg._grid, stream, arg.gpudata, intpart.gpudata, fracpart.gpudata, arg.mem_size) return fracpart, intpart
def prepare_server(self): self.g_param_list = self.param_list self.g_param_ga_list = [] self.w_param_ga_list = [] self.w_param_list = [] for param in self.param_list: np_param = param.get_value() w_param = theano.shared(np_param) self.w_param_list.append(w_param) w_param_ga = gpuarray.GPUArray(np_param.shape, np_param.dtype) self.w_param_ga_list.append(w_param_ga) g_param_ga = gpuarray.GPUArray(np_param.shape, np_param.dtype) self.g_param_ga_list.append(g_param_ga)
def test_type2(shape=(16, 16, 16), M=4096, tol=1e-3): kxyz = utils.gen_nu_pts(M) fk = utils.gen_uniform_data(shape) kxyz_gpu = gpuarray.to_gpu(kxyz) fk_gpu = gpuarray.to_gpu(fk) c_gpu = gpuarray.GPUArray(shape=(M,), dtype=np.complex64) plan = cufinufft.plan(2, shape, -1, tol) cufinufft.set_nu_pts(plan, M, kxyz_gpu[0].gpudata, kxyz_gpu[1].gpudata, kxyz_gpu[2].gpudata) cufinufft.execute(plan, c_gpu.gpudata, fk_gpu.gpudata) cufinufft.destroy(plan) c = c_gpu.get() ind = M // 2 c_est = c[ind] c_target = utils.direct_type2(fk, kxyz[:, ind]) type2_rel_err = np.abs(c_target - c_est) / np.abs(c_target) print('Type 2 relative error:', type2_rel_err)
def test_type1(shape=(16, 16, 16), M=4096, tol=1e-3): kxyz = utils.gen_nu_pts(M) c = utils.gen_nonuniform_data(M) kxyz_gpu = gpuarray.to_gpu(kxyz) c_gpu = gpuarray.to_gpu(c) fk_gpu = gpuarray.GPUArray(shape, dtype=np.complex64) plan = cufinufft.plan(1, shape, 1, tol) cufinufft.set_nu_pts(plan, M, kxyz_gpu[0].gpudata, kxyz_gpu[1].gpudata, kxyz_gpu[2].gpudata) cufinufft.execute(plan, c_gpu.gpudata, fk_gpu.gpudata) fk = fk_gpu.get() ind = int(0.1789 * np.prod(shape)) fk_est = fk.ravel()[ind] fk_target = utils.direct_type1(c, kxyz, shape, ind) type1_rel_err = np.abs(fk_target - fk_est) / np.abs(fk_target) print('Type 1 relative error:', type1_rel_err)
def gpu_tensor_gemm(handle, a, b): c = gpuarray.GPUArray((a.shape[0], b.shape[1]), dtype=a.dtype) # c.fill(0) # driver.memset_d16(c.gpudata, 15360, c.mem_size) # print(c.get()) cublas_dot.cublas_gemm(handle, a, b, c) return c
def _test_type1(dtype, shape=(16, 16, 16), M=4096, tol=1e-3): complex_dtype = utils._complex_dtype(dtype) dim = len(shape) k = utils.gen_nu_pts(M, dim=dim).astype(dtype) c = utils.gen_nonuniform_data(M).astype(complex_dtype) k_gpu = gpuarray.to_gpu(k) c_gpu = gpuarray.to_gpu(c) fk_gpu = gpuarray.GPUArray(shape, dtype=complex_dtype) plan = cufinufft(1, shape, eps=tol, dtype=dtype) plan.set_pts(k_gpu[0], k_gpu[1], k_gpu[2]) plan.execute(c_gpu, fk_gpu) fk = fk_gpu.get() ind = int(0.1789 * np.prod(shape)) fk_est = fk.ravel()[ind] fk_target = utils.direct_type1(c, k, shape, ind) type1_rel_err = np.abs(fk_target - fk_est) / np.abs(fk_target) print('Type 1 relative error:', type1_rel_err) assert type1_rel_err < 0.01
def ones(shape: tuple, dtype: np.dtype, order: str = 'C', allocator=drv.mem_alloc): """ Return an array of the given shape and dtype filled with ones. Parameters ---------- shape : tuple Array shape. dtype : data-type Data type for the array. order : {'C', 'F'}, optional Create array using row-major or column-major format. allocator : callable, optional Returns an object that represents the memory allocated for the requested array. Returns ------- out : pycuda.gpuarray.GPUArray Array of ones with the given shape, dtype, and order. """ out = gpuarray.GPUArray(shape, dtype, allocator, order=order) o = np.ones((), dtype) out.fill(o) return out
def _test_type2(dtype, shape=(16, 16, 16), M=4096, tol=1e-3): complex_dtype = utils._complex_dtype(dtype) k = utils.gen_nu_pts(M).astype(dtype) fk = utils.gen_uniform_data(shape).astype(complex_dtype) k_gpu = gpuarray.to_gpu(k) fk_gpu = gpuarray.to_gpu(fk) c_gpu = gpuarray.GPUArray(shape=(M, ), dtype=complex_dtype) plan = cufinufft(2, shape, eps=tol, dtype=dtype) plan.set_pts(k_gpu[0], k_gpu[1], k_gpu[2]) plan.execute(c_gpu, fk_gpu) c = c_gpu.get() ind = M // 2 c_est = c[ind] c_target = utils.direct_type2(fk, k[:, ind]) type2_rel_err = np.abs(c_target - c_est) / np.abs(c_target) print('Type 2 relative error:', type2_rel_err) assert type2_rel_err < 0.01
def zeros(shape, dtype, allocator=drv.mem_alloc): """ Return an array of the given shape and dtype filled with zeros. Parameters ---------- shape : tuple Array shape. dtype : data-type Data type for the array. allocator : callable Returns an object that represents the memory allocated for the requested array. Returns ------- out : pycuda.gpuarray.GPUArray Array of zeros with the given shape and dtype. Notes ----- This function exists to work around the following numpy bug that prevents pycuda.gpuarray.zeros() from working properly with complex types in pycuda 2011.1.2: http://projects.scipy.org/numpy/ticket/1898 """ out = gpuarray.GPUArray(shape, dtype, allocator) out.fill(0) return out
def resize_gpu(src_vol, dst_vol=None, dst_shape=None, scaling=None): if dst_shape is None: assert scaling dst_shape = [np.int(np.round(i * scaling)) for i in src_vol.shape] if dst_vol is None: dst_vol = gpuarray.GPUArray(dst_shape, np.float32) ndarray_to_float_tex(_tex_ref, src_vol) block = (32, 32, 1) grid = (int(divup(dst_vol.shape[2], block[0])), int(divup(dst_vol.shape[1], block[1])), 1) _kernel(dst_vol, np.int32(src_vol.shape[2]), np.int32(src_vol.shape[1]), np.int32(src_vol.shape[0]), np.int32(dst_vol.shape[2]), np.int32(dst_vol.shape[1]), np.int32(dst_vol.shape[0]), grid=grid, block=block) return dst_vol
def _FarnebackUpdateFlow_GaussianBlur_gpu(self, poly_coefficients0, poly_coefficients1, flow_gpu, M, winsize, update_matrices): sigma = self.winsize * 0.3 M_filtered_gpu = gpuarray.GPUArray(M.shape, M.dtype) for i in range(M.shape[0]): farneback3d._filtering.smooth_cuda_gauss(M[i], sigma, winsize, rtn_gpu=M_filtered_gpu[i]) block = (32, 32, 1) grid = (int(divup(flow_gpu.shape[3], block[0])), int(divup(flow_gpu.shape[2], block[1])), 1) self._solve_equations_kernel(M_filtered_gpu, flow_gpu, np.int32(flow_gpu.shape[3]), np.int32(flow_gpu.shape[2]), np.int32(flow_gpu.shape[1]), block=block, grid=grid) if update_matrices: self._FarnebackUpdateMatrices_gpu(poly_coefficients0, poly_coefficients1, flow_gpu, M)
def test_opts(shape=(8, 8, 8), M=32, tol=1e-3): dtype = np.float32 complex_dtype = utils._complex_dtype(dtype) dim = len(shape) k = utils.gen_nu_pts(M, dim=dim).astype(dtype) c = utils.gen_nonuniform_data(M).astype(complex_dtype) k_gpu = gpuarray.to_gpu(k) c_gpu = gpuarray.to_gpu(c) fk_gpu = gpuarray.GPUArray(shape, dtype=complex_dtype) plan = cufinufft(1, shape, eps=tol, dtype=dtype, gpu_sort=False, gpu_maxsubprobsize=10) plan.set_pts(k_gpu[0], k_gpu[1], k_gpu[2]) plan.execute(c_gpu, fk_gpu) fk = fk_gpu.get() ind = int(0.1789 * np.prod(shape)) fk_est = fk.ravel()[ind] fk_target = utils.direct_type1(c, k, shape, ind) type1_rel_err = np.abs(fk_target - fk_est) / np.abs(fk_target) assert type1_rel_err < 0.01
def run(self): self.dev = cuda.Device(self.devID) self.ctx = self.dev.make_context() global code global cdata numThreads = 1024 numBlocks = 30 N = numThreads * numBlocks mod = SourceModule(code % {"NGENERATORS": N}, no_extern_c=True, arch='sm_61') init_func = mod.get_function("initkernel") fill_func = mod.get_function("randfillkernel") init_func(np.int32(time.time()), block=(numThreads, 1, 1), grid=(numBlocks, 1, 1)) split = ceil(self.hptr.size / float(2000000000)) split_data = np.array_split(self.hptr, split) gdata = gpuarray.GPUArray(shape=split_data[0].size, dtype=np.float32) for data in split_data: fill_func(gdata, np.uint64(gdata.size), block=(numThreads, 1, 1), grid=(numBlocks, 1, 1)) gdata.get(data) self.ctx.pop() del self.ctx
def prepare_kernels(files, templates, constvars, blockvars={}): """ Compile and prepare CUDA kernel functions Args: files : list of cuda source file handles templates : list of tuples describing the kernels constvars : dict of readonly variables blockvars : dict of blockvars whose description will be included in the preamble as preprocessor macros Returns: kernels : dict of executable CUDA kernels """ preamble, constvars = prepare_vars(constvars, blockvars) kernels_code = preamble for f in files: kernels_code += f.read().decode("utf-8") mod = compiler.SourceModule(kernels_code) kernels = {} for d in templates: kernels[d[0]] = prepare_kernelfun(mod, *d) for name, val in constvars.items(): const_ptr, size_in_bytes = mod.get_global(name) pycuda.driver.memcpy_htod(const_ptr, val) # WARNING: The gpudata argument in gpuarray.GPUArray usually requires a # pycuda.driver.DeviceAllocation and const_ptr is an int generated from # casting a CUdeviceptr to an int. # However, since DeviceAllocation is a simple wrapper around CUdeviceptr # (that gives a CUdeviceptr when cast to an int), it works like this. constvars[name] = gpuarray.GPUArray(val.shape, val.dtype, gpudata=const_ptr) return kernels
def copy_torch2glumpy(dst: gloo.VertexBuffer, src: torch.cuda.FloatTensor): # torch 2 pycuda src = gp.GPUArray(src.shape, np.float32, gpudata=src.data_ptr()) # copy pycuda 2 glumpy copy_pycuda2glumpy(dst, src) return
def copy_pycuda2RegisteredBuffer(dst: pycuda.gl.RegisteredBuffer, src: gp.GPUArray): # pycuda 2 RegisteredBuffer with cuda_activate(dst) as ptr: dst = gp.GPUArray(src.shape, np.float32, gpudata=ptr) dst[:] = src return
def ones_cuda(shape): """Create GPUArray of ones directly on GPU memory. Parameters ---------- shape : tuple Dimensions of the GPUArray. Returns ------- gpuarray GPUArray of ones. Examples -------- >>> a = ones_cuda((3, 2)) [[ 1., 1.], [ 1., 1.], [ 1., 1.]] >>> type(a) <class 'pycuda.gpuarray.GPUArray'> """ a = cuda_array.GPUArray(shape, dtype=float32, allocator=pycuda.driver.mem_alloc, order='C') a.fill(1.0) return a
def frexp(arg, stream=None): """Return a tuple `(significands, exponents)` such that `arg == significand * 2**exponent`. """ if not arg.flags.forc: raise RuntimeError("only contiguous arrays may " "be used as arguments to this operation") sig = gpuarray.GPUArray(arg.shape, arg.dtype) expt = gpuarray.GPUArray(arg.shape, arg.dtype) func = elementwise.get_frexp_kernel() func.prepared_async_call(arg._grid, arg._block, stream, arg.gpudata, sig.gpudata, expt.gpudata, arg.mem_size) return sig, expt
def squared_loss(y_true, y_pred): """Compute the squared loss for regression. Parameters ---------- y_true : array-like or label indicator matrix Ground truth (correct) values. y_pred : array-like or label indicator matrix Predicted values, as returned by a regression estimator. Returns ------- loss : float The degree to which the samples are correctly predicted. """ tmp_gpu = gpuarray.GPUArray(y_true.shape, y_true.dtype) if y_true.dtype == np.float64: cuSquaredError(y_true.gpudata, y_pred.gpudata, tmp_gpu.gpudata, np.int32(y_true.size), block=(blockSize, 1, 1), grid=(int((y_true.size - 1) / blockSize + 1), 1, 1)) else: cuSquaredErrorf(y_true.gpudata, y_pred.gpudata, tmp_gpu.gpudata, np.int32(y_true.size), block=(blockSize, 1, 1), grid=(int((y_true.size - 1) / blockSize + 1), 1, 1)) mean = float(cumisc.mean(tmp_gpu).get()) return (mean / 2)
def modf(arg, stream=None): """Return a tuple `(fracpart, intpart)` of arrays containing the integer and fractional parts of `arg`. """ if not arg.flags.forc: raise RuntimeError("only contiguous arrays may " "be used as arguments to this operation") intpart = gpuarray.GPUArray(arg.shape, arg.dtype) fracpart = gpuarray.GPUArray(arg.shape, arg.dtype) func = elementwise.get_modf_kernel() func.prepared_async_call(arg._grid, arg._block, stream, arg.gpudata, intpart.gpudata, fracpart.gpudata, arg.mem_size) return fracpart, intpart
def alloc_buf(self, size=None, like=None, wrap_in_array=False): if like is not None: # When calculating the total array size, take into account # any striding. # XXX: why does it even work? buf_size = like.shape[0] * like.strides[0] buf = cuda.mem_alloc(buf_size) self._total_memory_bytes += buf_size if like.base is not None and type( like.base) is not cuda.PagelockedHostAllocation: self.buffers[buf] = like.base else: self.buffers[buf] = like self.to_buf(buf) if wrap_in_array: self.arrays[buf] = cudaarray.GPUArray(like.shape, like.dtype, gpudata=buf) else: self._total_memory_bytes += size buf = cuda.mem_alloc(size) return buf
def get_gpuarray(bh_ary): """Return a PyCUDA GPUArray object that points to the same device memory as `bh_ary`. Parameters ---------- bh_ary : ndarray (Bohrium array) Must be a Bohrium base array Returns ------- out : GPUArray Notes ----- Changing or deallocating `bh_ary` invalidates the returned GPUArray array! """ if get_base(bh_ary) is not bh_ary: raise RuntimeError('`bh_ary` must be a base array and not a view') assert (bh_ary.bhc_mmap_allocated) with contexts.DisableBohrium(): _import_pycuda_module() from pycuda import gpuarray dev_ptr = get_data_pointer(get_base(bh_ary), copy2host=False, allocate=True) return gpuarray.GPUArray(bh_ary.shape, bh_ary.dtype, gpudata=dev_ptr)
def array(self, shape, dtype, strides=None, allocator=None): # In PyCUDA, the default allocator is not None, but a default alloc object kwds = {} if strides is not None: kwds['strides'] = strides if allocator is not None: kwds['allocator'] = allocator return gpuarray.GPUArray(shape, dtype, **kwds)