def create_2D_array(mat): descr = driver.ArrayDescriptor() descr.width = mat.shape[1] descr.height = mat.shape[0] descr.format = driver.dtype_to_array_format(mat.dtype) descr.num_channels = 1 descr.flags = 0 ary = driver.Array(descr) return ary
def gpuArray2DtocudaArray(gpuArray): #import pycuda.autoinit h, w = gpuArray.shape descr2D = cuda.ArrayDescriptor() descr2D.width = w descr2D.height = h descr2D.format = cuda.dtype_to_array_format(gpuArray.dtype) descr2D.num_channels = 1 cudaArray = cuda.Array(descr2D) copy2D = cuda.Memcpy2D() copy2D.set_src_device(gpuArray.ptr) copy2D.set_dst_array(cudaArray) copy2D.src_pitch = gpuArray.strides[0] copy2D.width_in_bytes = copy2D.src_pitch = gpuArray.strides[0] copy2D.src_height = copy2D.height = h copy2D(aligned=True) return cudaArray, copy2D
def resize_gpu(y_gpu, out_shape): in_shape = np.array(y_gpu.shape).astype(np.uint32) dtype = y_gpu.dtype if dtype != np.float32: raise NotImplementedException('Only float at the moment') block_size = (16,16,1) grid_size = (int(np.ceil(float(out_shape[1])/block_size[0])), int(np.ceil(float(out_shape[0])/block_size[1]))) preproc = _generate_preproc(dtype) mod = SourceModule(preproc + resize_code, keep=True) resize_fun_gpu = mod.get_function("resize") resized_gpu = cua.empty(tuple((np.int(out_shape[0]), np.int(out_shape[1]))),y_gpu.dtype) temp_gpu, pitch = cu.mem_alloc_pitch(4 * y_gpu.shape[1], y_gpu.shape[0], 4) copy_object = cu.Memcpy2D() copy_object.set_src_device(y_gpu.gpudata) copy_object.set_dst_device(temp_gpu) copy_object.src_pitch = 4 * y_gpu.shape[1] copy_object.dst_pitch = pitch copy_object.width_in_bytes = 4 * y_gpu.shape[1] copy_object.height = y_gpu.shape[0] copy_object(aligned=False) in_tex = mod.get_texref('in_tex') descr = cu.ArrayDescriptor() descr.width = y_gpu.shape[1] descr.height = y_gpu.shape[0] descr.format = cu.array_format.FLOAT descr.num_channels = 1 #pitch = y_gpu.nbytes / y_gpu.shape[0] in_tex.set_address_2d(temp_gpu, descr, pitch) in_tex.set_filter_mode(cu.filter_mode.LINEAR) in_tex.set_flags(cu.TRSF_NORMALIZED_COORDINATES) resize_fun_gpu(resized_gpu.gpudata, np.uint32(out_shape[0]), np.uint32(out_shape[1]), block=block_size, grid=grid_size) temp_gpu.free() return resized_gpu
def np2DtoCudaArray(npArray, allowSurfaceBind=False): #import pycuda.autoinit h, w = npArray.shape descr2D = cuda.ArrayDescriptor() descr2D.width = w descr2D.height = h descr2D.format = cuda.dtype_to_array_format(npArray.dtype) descr2D.num_channels = 1 if allowSurfaceBind: descr.flags = cuda.array3d_flags.SURFACE_LDST cudaArray = cuda.Array(descr2D) copy2D = cuda.Memcpy2D() copy2D.set_src_host(npArray) copy2D.set_dst_array(cudaArray) copy2D.src_pitch = npArray.strides[0] copy2D.width_in_bytes = copy2D.src_pitch = npArray.strides[0] copy2D.src_height = copy2D.height = h copy2D(aligned=True) return cudaArray, descr2D
def setup_pitched_texture(tex_ref, shape, pitch, alloc): """ Bind 2D texture to memory location given by alloc with pitch size pitch and shape shape. alloc and pitch might come from e.g. alloc,pitch = cuda.mem_alloc_pitch(shape[0] * 4,shape[1],4) # 4 bytes per float32 :param tex_reference: 2D texture reference :param shape: shape of the array to be placed there :param pitch: pitch parameter for CUDA texture binding :param alloc: address :return: """ assert (pitch % 8) == 0 # for float types descr = cuda.ArrayDescriptor() descr.format = cuda.array_format.FLOAT descr.height = shape[0] descr.width = shape[1] descr.num_channels = 1 tex_ref.set_address_2d(alloc, descr, pitch) tex_ref.set_address_mode(0, cuda.address_mode.WRAP) tex_ref.set_address_mode(1, cuda.address_mode.WRAP) tex_ref.set_filter_mode(cuda.filter_mode.POINT) tex_ref.set_flags(cuda.TRSF_NORMALIZED_COORDINATES)
def mkdsc(dim, ch): return argset(cuda.ArrayDescriptor(), height=dim.ah, width=dim.astride, num_channels=ch, format=cuda.array_format.FLOAT)
def resample_sdbe_to_r2dbe_fft_interp(Xs, interp_kind="nearest"): """ Resample SWARM spectrum product in time-domain at R2DBE rate using iFFT and then interpolation in the time-domain. Arguments: ---------- Xs -- MxN numpy array in which the zeroth dimension is increasing snapshot index, and the first dimension is the positive frequency half of the spectrum. interp_kind -- Kind of interpolation. Returns: -------- xs -- The time-domain signal sampled at the R2DBE rate. """ # timestep sizes for SWARM and R2DBE rates dt_s = 1.0 / SWARM_RATE dt_r = 1.0 / R2DBE_RATE # cuFFT plan for complex to real DFT plan = cu_fft.Plan(SWARM_SAMPLES_PER_WINDOW, complex64, float32, Xs.shape[0]) # load complex spectrum to device x_d = gpuarray.to_gpu(Xs) xp_d = gpuarray.empty((Xs.shape[0], Xs.shape[1] + 1), dtype=complex64) # pad nyquist with zeros block = (32, 32, 1) grid = (int(ceil(1. * (Xs.shape[1] + 1) / block[1])), int(ceil(1. * Xs.shape[0] / block[0]))) fill_padded = mod.get_function("fill_padded") fill_padded(int32(Xs.shape[0]),xp_d,int32(Xs.shape[1]+1),x_d,int32(Xs.shape[1]),\ block=block,grid=grid) # allocate memory for time series xf_d = gpuarray.empty((Xs.shape[0], SWARM_SAMPLES_PER_WINDOW), float32) # calculate time series, include scaling cu_fft.ifft(xp_d, xf_d, plan, scale=True) # and interpolate xs_size = int(floor( Xs.shape[0] * SWARM_SAMPLES_PER_WINDOW * dt_s / dt_r)) - 1 TPB = 64 # threads per block nB = int(ceil(1. * xs_size / TPB)) # number of blocks xs_d = gpuarray.empty(xs_size, float32) # decimated time-series if interp_kind == 'nearest': # compile kernel nearest_interp = mod.get_function(interp_kind) # call kernel nearest_interp(xf_d, xs_d, int32(xs_size), float64(dt_r / dt_s), block=(TPB, 1, 1), grid=(nB, 1)) elif interp_kind == 'linear': # compile kernel linear_interp = mod.get_function("copy_texture_kernel") # get texture reference a_texref = mod.get_texref("a_tex") a_texref.set_filter_mode(drv.filter_mode.LINEAR) # linear #a_texref.set_filter_mode(drv.filter_mode.POINT) # nearest-neighbor # move time series to texture reference # following http://lists.tiker.net/pipermail/pycuda/2009-November/001916.html descr = drv.ArrayDescriptor() descr.format = drv.array_format.FLOAT descr.height = Xs.shape[0] descr.width = SWARM_SAMPLES_PER_WINDOW descr.num_channels = 1 a_texref.set_address_2d(xf_d.gpudata, descr, SWARM_SAMPLES_PER_WINDOW * 4) # set up linear interpolation over texture linear_interp(xs_d,int32(xs_size),float64(dt_r/dt_s),int32(SWARM_SAMPLES_PER_WINDOW),\ texrefs=[a_texref],block=(TPB,1,1),grid=(nB,1)) return xs_d.get()
def np3DtoCudaArray(npArray, prec, order = "C", allowSurfaceBind=False): ''' Some parameters like stride are explained in PyCUDA: driver.py test_driver.py gpuarray.py''' # For 1D-2D Cuda Arrays the descriptor is the same just puttin LAYERED flags # if order != "C": raise LogicError("Just implemented for C order") dimension = len(npArray.shape) case = order in ["C","F"] if not case: raise LogicError("order must be either F or C") # if dimension == 1: # w = npArray.shape[0] # h, d = 0,0 if dimension == 2: if order == "C": stride = 0 if order == "F": stride = -1 h, w = npArray.shape d = 1 if allowSurfaceBind: descrArr = cuda.ArrayDescriptor3D() descrArr.width = w descrArr.height = h descrArr.depth = d else: descrArr = cuda.ArrayDescriptor() descrArr.width = w descrArr.height = h # descrArr.depth = d elif dimension == 3: if order == "C": stride = 1 if order == "F": stride = 1 d, h, w = npArray.shape descrArr = cuda.ArrayDescriptor3D() descrArr.width = w descrArr.height = h descrArr.depth = d else: raise LogicError("CUDArray dimesnsion 2 and 3 supported at the moment ... ") if prec == 'float': descrArr.format = cuda.dtype_to_array_format(npArray.dtype) descrArr.num_channels = 1 elif prec == 'cfloat': # Hack for complex 64 = (float 32, float 32) == (re,im) descrArr.format = cuda.array_format.SIGNED_INT32 # Reading data as int2 (hi=re,lo=im) structure descrArr.num_channels = 2 elif prec == 'double': # Hack for doubles descrArr.format = cuda.array_format.SIGNED_INT32 # Reading data as int2 (hi,lo) structure descrArr.num_channels = 2 elif prec == 'cdouble': # Hack for doubles descrArr.format = cuda.array_format.SIGNED_INT32 # Reading data as int4 (re=(hi,lo),im=(hi,lo)) structure descrArr.num_channels = 4 else: descrArr.format = cuda.dtype_to_array_format(npArray.dtype) descrArr.num_channels = 1 if allowSurfaceBind: if dimension==2: descrArr.flags |= cuda.array3d_flags.ARRAY3D_LAYERED descrArr.flags |= cuda.array3d_flags.SURFACE_LDST cudaArray = cuda.Array(descrArr) if allowSurfaceBind or dimension==3 : copy3D = cuda.Memcpy3D() copy3D.set_src_host(npArray) copy3D.set_dst_array(cudaArray) copy3D.width_in_bytes = copy3D.src_pitch = npArray.strides[stride] # if dimension==3: copy3D.width_in_bytes = copy3D.src_pitch = npArray.strides[1] #Jut C order support # if dimension==2: copy3D.width_in_bytes = copy3D.src_pitch = npArray.strides[0] #Jut C order support copy3D.src_height = copy3D.height = h copy3D.depth = d copy3D() return cudaArray, copy3D else: # if dimension == 3: # copy3D = cuda.Memcpy3D() # copy3D.set_src_host(npArray) # copy3D.set_dst_array(cudaArray) # copy3D.width_in_bytes = copy3D.src_pitch = npArray.strides[stride] # # if dimension==3: copy3D.width_in_bytes = copy3D.src_pitch = npArray.strides[1] #Jut C order support # # if dimension==2: copy3D.width_in_bytes = copy3D.src_pitch = npArray.strides[0] #Jut C order support # copy3D.src_height = copy3D.height = h # copy3D.depth = d # copy3D() # return cudaArray, copy3D # if dimension == 2: cudaArray = cuda.Array(descrArr) copy2D = cuda.Memcpy2D() copy2D.set_src_host(npArray) copy2D.set_dst_array(cudaArray) copy2D.width_in_bytes = copy2D.src_pitch = npArray.strides[stride] # copy2D.width_in_bytes = copy2D.src_pitch = npArray.strides[0] #Jut C order support copy2D.src_height = copy2D.height = h copy2D(aligned=True) return cudaArray, copy2D
def update(self, depth, rgb_img=None): # Compute the real world depths. # TODO: Determine the best block size. depth_gpu = gpuarray.to_gpu(np.float32(depth)) width = depth_gpu.shape[1] height = depth_gpu.shape[0] gridx = (width - 1) // 16 + 1 gridy = (height - 1) // 16 + 1 pitch = depth_gpu.strides[0] self.compute_depth(depth_gpu, np.int32(width), np.int32(height), np.intp(pitch), block=(16, 16, 1), grid=(gridx, gridy)) # Prepare the depth array to be accessed as a texture. descr = drv.ArrayDescriptor() descr.width = width descr.height = height descr.format = drv.array_format.FLOAT descr.num_channels = 1 self.depth_texture.set_address_2d(depth_gpu.gpudata, descr, pitch) # Smooth depth. # pitch = self.smooth_depth_gpu.strides[0] # self.compute_smooth_depth(self.smooth_depth_gpu, np.int32(width), np.int32(height), # np.intp(pitch), np.float32(10.0), np.float32(10000.0), # block=(16,16,1), grid=(gridx, gridy)) #self.smooth_depth_texture.set_address_2d(depth_gpu.gpudata, descr, pitch) # Buffer mapping. normals_pitch = 640 * 12 vertex_measure_map, normal_measure_map = map(methodcaller("map"), self.buffers["measure"]) vertices_measure = np.intp(vertex_measure_map.device_ptr_and_size()[0]) normals_measure = np.intp(normal_measure_map.device_ptr_and_size()[0]) vertex_raycast_map, normal_raycast_map = map(methodcaller("map"), self.buffers["raycast"]) vertices_raycast = np.intp(vertex_raycast_map.device_ptr_and_size()[0]) normals_raycast = np.intp(normal_raycast_map.device_ptr_and_size()[0]) # Measure self.measure(vertices_measure, normals_measure, self.mask_gpu, np.int32(width), np.int32(height), np.intp(normals_pitch), block=(16, 16, 1), grid=(gridx, gridy)) # Update the reconstruction. grid2 = int((self.side - 1) // 8 + 1) for i in xrange(0, self.side, 8): self.update_reconstruction(self.F_gpu, self.W_gpu, normals_measure, np.intp(normals_pitch), np.int32(self.side), np.float32(self.units_per_voxel), np.float32(self.mu), np.int32(i), self.T_gk_gpu, block=(8, 8, 8), grid=(grid2, grid2)) # Copy F from gpu to F_array (binded to F_texture). self.F_gpu_to_array_copy() # Raycast. bbox = self.get_bounding_box() point = self.T_gk[:3, 3] mindistance = distance_to_bbox(bbox, point) maxdistance = distance_farthest_to_bbox(bbox, point) self.raycast(vertices_raycast, normals_raycast, np.int32(width), np.int32(height), np.intp(normals_pitch), np.int32(self.side), np.float32(self.units_per_voxel), np.float32(self.mu), self.T_gk_gpu, np.float32(mindistance), np.float32(maxdistance), block=(16, 16, 1), grid=(gridx, gridy)) # Tracking. # __global__ void compute_tracking_matrices(float* AA, float* Ab, float* omega, # float3* vertices_measure, float3* normals_measure, # float3* vertices_raycast, float3* normals_raycast, # int width, int height, size_t A_pitch, # float* mask, float* Tgk, float* Tgk1_k, # float threshold_distance) if self.active_tracking: self.AA_gpu.fill(0) self.Ab_gpu.fill(0) self.compute_tracking_matrices(self.AA_gpu, self.Ab_gpu, self.omega_gpu, vertices_measure, normals_measure, vertices_raycast, normals_raycast, np.int32(width), np.int32(height), np.intp(self.AA_gpu.strides[0]), np.intp(self.Ab_gpu.strides[0]), self.mask_gpu, self.T_gk_gpu, self.Tgk1_k_gpu, np.float32(20.0), block=(16, 16, 1), grid=(gridx, gridy)) cudareduce.add_vectors(self.AA_gpu, 640 * 480, 21) cudareduce.add_vectors(self.Ab_gpu, 640 * 480, 6) drv.memcpy_dtoh(self.AA, self.AA_gpu.gpudata) drv.memcpy_dtoh(self.Ab, self.Ab_gpu.gpudata) # Solve the system. AA = np.zeros((6, 6)) AA[np.triu_indices(6)] = self.AA AA.T[np.triu_indices(6)] = self.AA try: x = np.linalg.solve(AA, self.Ab) Tinc = np.array([[1, x[2], -x[1], x[3]], [-x[2], 1, x[0], x[4]], [x[1], -x[0], 1, x[5]], [0, 0, 0, 1]]) U, D, V = np.linalg.svd(Tinc[:3, :3]) Tinc[:3, :3] = np.dot(U, V) except np.linalg.LinAlgError: Tinc = np.eye(4) self.T_gk = np.float32(np.dot(Tinc, self.T_gk)) self.T_gk_gpu = gpuarray.to_gpu(self.T_gk[:3]) vertex_raycast_map.unmap() normal_raycast_map.unmap() vertex_measure_map.unmap() normal_measure_map.unmap()