def _pitch_allocate(self, array):
    """
    (alloc, pitch) = gpu._pitch_allocate(array)

    Allocates memory space on the GPU (with pitch) to fit the passed array.
    Returns the GPU memory allocation and the pitch (the padded width of each
    row in bytes, which may be larger than the requested row width).
    """
    # array shape
    (height, width) = array.shape
    # size of one element (in bytes)
    size = array.nbytes / array.size
    return drv.mem_alloc_pitch(width * size, height, size)
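# --- Hedged usage sketch (not from the original source) ---
# A minimal, self-contained illustration of the pattern _pitch_allocate relies on:
# mem_alloc_pitch returns (device_allocation, pitch_in_bytes), and because the pitch
# can exceed the requested row width, 2D data must be copied with Memcpy2D rather
# than a flat memcpy. Assumes pycuda.autoinit for context creation.
import numpy as np
import pycuda.autoinit  # noqa: F401  (creates a default CUDA context)
import pycuda.driver as drv

a = np.arange(12, dtype=np.float32).reshape(3, 4)
dev, pitch = drv.mem_alloc_pitch(a.shape[1] * a.itemsize, a.shape[0], a.itemsize)

# host -> device, honouring the device pitch
h2d = drv.Memcpy2D()
h2d.set_src_host(a)
h2d.set_dst_device(dev)
h2d.src_pitch = a.shape[1] * a.itemsize
h2d.dst_pitch = pitch
h2d.width_in_bytes = a.shape[1] * a.itemsize
h2d.height = a.shape[0]
h2d(aligned=True)

# device -> host, back into a fresh array
b = np.empty_like(a)
d2h = drv.Memcpy2D()
d2h.set_src_device(dev)
d2h.set_dst_host(b)
d2h.src_pitch = pitch
d2h.dst_pitch = b.shape[1] * b.itemsize
d2h.width_in_bytes = b.shape[1] * b.itemsize
d2h.height = b.shape[0]
d2h(aligned=True)
assert (a == b).all()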
def _run_simulation(self, parameters, init_values, blocks, threads):
    total_threads = blocks * threads
    experiments = len(parameters)

    mt_data = os.path.join(os.path.split(os.path.realpath(__file__))[0], 'MersenneTwister.dat')

    # initialize Mersenne Twister
    self._initialise_twisters(mt_data, self._completeCode, threads, blocks)

    param = np.zeros((total_threads / self._beta + 1, self._parameterNumber), dtype=np.float32)
    try:
        for i in range(len(parameters)):
            for j in range(self._parameterNumber):
                param[i][j] = parameters[i][j]
    except IndexError:
        pass

    # parameter texture
    ary = sim.create_2D_array(param)
    sim.copy2D_host_to_array(ary, param, self._parameterNumber * 4, total_threads / self._beta + 1)
    self._param_tex.set_array(ary)

    # 2D species arrays (pitched allocation: one row of species per thread)
    d_x, p_x = cuda.mem_alloc_pitch(width=self._speciesNumber * 4, height=total_threads, access_size=4)
    cuda.memcpy_htod(self._pvxp, np.array([p_x], dtype=np.int32))

    # initialize species
    species_input = np.zeros((total_threads, self._speciesNumber), dtype=np.int32)
    try:
        for i in range(len(init_values)):
            for j in range(self._speciesNumber):
                species_input[i][j] = init_values[i][j]
    except IndexError:
        pass
    sim.copy2D_host_to_device(d_x, species_input, self._speciesNumber * 4, p_x,
                              self._speciesNumber * 4, total_threads)

    # output array
    result = np.zeros(total_threads * self._resultNumber * self._speciesNumber, dtype=np.int32)
    d_result = cuda.mem_alloc(result.nbytes)

    # run code
    self._compiledRunMethod(d_x, d_result, block=(threads, 1, 1), grid=(blocks, 1))

    # fetch from GPU memory
    cuda.memcpy_dtoh(result, d_result)
    result = result[0:experiments * self._beta * self._resultNumber * self._speciesNumber]
    result.shape = (experiments, self._beta, self._resultNumber, self._speciesNumber)

    return result
def resize_gpu(y_gpu, out_shape):

    in_shape = np.array(y_gpu.shape).astype(np.uint32)
    dtype = y_gpu.dtype
    if dtype != np.float32:
        raise NotImplementedError('Only float32 is supported at the moment')

    block_size = (16, 16, 1)
    grid_size = (int(np.ceil(float(out_shape[1]) / block_size[0])),
                 int(np.ceil(float(out_shape[0]) / block_size[1])))

    preproc = _generate_preproc(dtype)
    mod = SourceModule(preproc + resize_code, keep=True)

    resize_fun_gpu = mod.get_function("resize")
    resized_gpu = cua.empty((np.int(out_shape[0]), np.int(out_shape[1])), y_gpu.dtype)

    # stage the input into a pitched allocation so it can back a 2D texture
    temp_gpu, pitch = cu.mem_alloc_pitch(4 * y_gpu.shape[1], y_gpu.shape[0], 4)
    copy_object = cu.Memcpy2D()
    copy_object.set_src_device(y_gpu.gpudata)
    copy_object.set_dst_device(temp_gpu)
    copy_object.src_pitch = 4 * y_gpu.shape[1]
    copy_object.dst_pitch = pitch
    copy_object.width_in_bytes = 4 * y_gpu.shape[1]
    copy_object.height = y_gpu.shape[0]
    copy_object(aligned=False)

    in_tex = mod.get_texref('in_tex')
    descr = cu.ArrayDescriptor()
    descr.width = y_gpu.shape[1]
    descr.height = y_gpu.shape[0]
    descr.format = cu.array_format.FLOAT
    descr.num_channels = 1
    #pitch = y_gpu.nbytes / y_gpu.shape[0]
    in_tex.set_address_2d(temp_gpu, descr, pitch)
    in_tex.set_filter_mode(cu.filter_mode.LINEAR)
    in_tex.set_flags(cu.TRSF_NORMALIZED_COORDINATES)

    resize_fun_gpu(resized_gpu.gpudata, np.uint32(out_shape[0]), np.uint32(out_shape[1]),
                   block=block_size, grid=grid_size)
    temp_gpu.free()

    return resized_gpu
def __init__(self,
             left_binary_block,     # 2D array of booleans
             function_definitions,  # array of uint32_t
             right_scalars,         # scalar values on the right
             column_cardinality):
    self._column_cardinality = column_cardinality

    column_count = left_binary_block.shape[1]
    assert column_count < 32
    self._column_count = column_count

    assert function_definitions.dtype == np.uint32
    function_count = function_definitions.shape[0]
    self._function_count = function_count

    # rank-transform the right-hand scalars and push them to the GPU
    ranks = rankdata(right_scalars).astype(np.dtype('f4'))
    gpu_ranks = drv.mem_alloc(ranks.nbytes)
    drv.memcpy_htod(gpu_ranks, ranks)
    self._gpu_ranks = gpu_ranks

    # How many rows?
    row_count = left_binary_block.shape[0]
    self._row_count = row_count

    # Prepare the left block: pack each boolean row into a single uint32
    left_binary_encoded = np.zeros((row_count,), dtype=np.uint32)
    for i in range(column_count):
        left_binary_encoded += left_binary_block[:, i] << i
    gpu_left_binary_encoded = drv.mem_alloc(left_binary_encoded.nbytes)
    drv.memcpy_htod(gpu_left_binary_encoded, left_binary_encoded)
    self._gpu_left_binary_encoded = gpu_left_binary_encoded

    # Function definitions
    gpu_function_definitions = drv.mem_alloc(function_definitions.nbytes)
    drv.memcpy_htod(gpu_function_definitions, function_definitions)
    self._gpu_function_definitions = gpu_function_definitions

    # Space for the results (one pitched row per function)
    # print(row_count, function_count)
    gpu_result_space, gpu_result_pitch = drv.mem_alloc_pitch(row_count, function_count, 4)
    self._gpu_result_space = gpu_result_space
    self._gpu_result_pitch = gpu_result_pitch

    gpu_rho_space = drv.mem_alloc(function_count * 8)
    self._gpu_rho_space = gpu_rho_space
    self._rho_space = np.zeros((function_count,), dtype='f8')
def __init__(self, backend, dtype, ioshape, initval, iopacking, tags):
    super(CUDAMatrixBase, self).__init__(backend, ioshape, iopacking, tags)

    # Data type info
    self.dtype = dtype
    self.itemsize = np.dtype(dtype).itemsize

    # Dimensions
    nrow, ncol = backend.compact_shape(ioshape, iopacking)
    self.nrow = nrow
    self.ncol = ncol

    # Compute the size, in bytes, of the minor dimension
    colsz = self.ncol*self.itemsize

    if 'align' in tags:
        # Allocate a 2D array aligned to the major dimension
        self.data, self.pitch = cuda.mem_alloc_pitch(colsz, nrow, self.itemsize)
        self._nbytes = nrow*self.pitch

        # Ensure that the pitch is a multiple of itemsize
        assert (self.pitch % self.itemsize) == 0
    else:
        # Allocate a standard, tightly packed, array
        self._nbytes = colsz*nrow
        self.data = cuda.mem_alloc(self._nbytes)
        self.pitch = colsz

    self.leaddim = self.pitch / self.itemsize
    self.leadsubdim = self.soa_shape[-1]
    self.traits = (nrow, self.leaddim, self.leadsubdim, self.dtype)

    # Zero the entire matrix (incl. slack)
    assert (self._nbytes % 4) == 0
    cuda.memset_d32(self.data, 0, self._nbytes/4)

    # Process any initial values
    if initval is not None:
        self.set(initval)
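# --- Hedged illustration (not part of the original class) ---
# Shows the relationship the constructor above depends on for aligned allocations:
# the total byte count is nrow * pitch (including slack), and leaddim = pitch / itemsize
# is the element stride between consecutive rows. All names below are placeholders.
import numpy as np
import pycuda.autoinit  # noqa: F401
import pycuda.driver as cuda

nrow, ncol, itemsize = 5, 3, np.dtype(np.float64).itemsize
data, pitch = cuda.mem_alloc_pitch(ncol * itemsize, nrow, itemsize)
assert pitch % itemsize == 0
leaddim = pitch // itemsize
nbytes = nrow * pitch

# zero everything, including the padding at the end of each row
assert nbytes % 4 == 0
cuda.memset_d32(data, 0, nbytes // 4)
print("pitch=%d bytes, leaddim=%d elements" % (pitch, leaddim))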
cuda_device = driver.Device(0)
print("cuda_device=%s" % cuda_device)
cuda_context = cuda_device.make_context(flags=driver.ctx_flags.SCHED_AUTO | driver.ctx_flags.MAP_HOST)
try:
    print("cuda_context=%s" % cuda_context)

    BGRA2NV12 = get_BGRA2NV12()
    print("BGRA2NV12=%s" % BGRA2NV12)
    w = roundup(512, 32)
    h = roundup(512, 32)
    log("w=%s, h=%s", w, h)

    cudaInputBuffer, inputPitch = driver.mem_alloc_pitch(w, h*3/2, 16)
    log("CUDA Input Buffer=%s, pitch=%s", hex(int(cudaInputBuffer)), inputPitch)
    #allocate CUDA NV12 buffer (on device):
    cudaNV12Buffer, NV12Pitch = driver.mem_alloc_pitch(w, h*3/2, 16)
    log("CUDA NV12 Buffer=%s, pitch=%s", hex(int(cudaNV12Buffer)), NV12Pitch)

    #host buffers:
    inputBuffer = driver.pagelocked_zeros(inputPitch*h*3/2, dtype=numpy.byte)
    log("inputBuffer=%s", inputBuffer)
    outputBuffer = driver.pagelocked_zeros(inputPitch*h*3/2, dtype=numpy.byte)
    log("outputBuffer=%s", outputBuffer)

    #populate host buffer with random data:
    buf = inputBuffer.data
    for y in range(h*3/2):
def __init__(self, shape, dtype, gpudata=None, pitch=None):
    """
    Create a PitchArray.

    shape:   shape of the array
    dtype:   dtype of the array
    gpudata: DeviceAllocation object indicating the device memory allocated
    pitch:   if gpudata is specified and pitch is given, gpudata will be
             treated as if it was allocated by cudaMallocPitch with that pitch

    Attributes:
    .shape:    shape of self
    .size:     number of elements of the array
    .mem_size: number of elements of total memory allocated
    .ld:       leading dimension
    .M:        1 if self is a vector, shape[0] otherwise
    .N:        self.size if self is a vector,
               product of shape[1] and shape[2] otherwise
    .gpudata:  DeviceAllocation
    .ndim:     number of dimensions
    .dtype:    dtype of array
    .nbytes:   total memory allocated for the array in bytes

    Note: any 1-dim shape will result in a row vector with new shape (1, shape);
    operations on a PitchArray are elementwise.
    """
    try:
        tmpshape = []
        s = 1
        for dim in shape:
            dim = int(dim)
            assert isinstance(dim, int)
            s *= dim
            tmpshape.append(dim)
        self.shape = tuple(tmpshape)
    except TypeError:
        s = int(shape)
        assert isinstance(s, int)
        if s:
            self.shape = (1, s)
        else:
            self.shape = (0, 0)

    self.ndim = len(self.shape)
    if self.ndim > 3:
        raise ValueError("Only support array of dimension leq 3")
    self.dtype = np.dtype(dtype)
    self.size = s

    if gpudata is None:
        if self.size:
            if _pd(self.shape) == 1 or self.shape[0] == 1:
                self.gpudata = cuda.mem_alloc(self.size * self.dtype.itemsize)
                self.mem_size = self.size
                self.ld = _pd(self.shape)
                self.M = 1
                self.N = self.size
            else:
                self.gpudata, pitch = cuda.mem_alloc_pitch(
                    int(_pd(self.shape) * np.dtype(dtype).itemsize),
                    self.shape[0], np.dtype(dtype).itemsize)
                self.ld = pitch / np.dtype(dtype).itemsize
                self.mem_size = self.ld * self.shape[0]
                self.M = self.shape[0]
                self.N = _pd(self.shape)
        else:
            self.gpudata = None
            self.M = 0
            self.N = 0
            self.ld = 0
            self.mem_size = 0
    else:
        # assume the device memory was also allocated by mem_alloc_pitch
        # as required by the shape
        assert gpudata.__class__ == cuda.DeviceAllocation
        if self.size:
            self.gpudata = gpudata
            if _pd(self.shape) == 1 or self.shape[0] == 1:
                self.mem_size = self.size
                self.ld = _pd(self.shape)
                self.M = 1
                self.N = self.size
            else:
                if pitch is None:
                    pitch = int(np.ceil(
                        float(_pd(self.shape) * np.dtype(dtype).itemsize) / 512) * 512)
                else:
                    assert pitch == int(np.ceil(
                        float(_pd(self.shape) * np.dtype(dtype).itemsize) / 512) * 512)
                self.ld = pitch / np.dtype(dtype).itemsize
                self.mem_size = self.ld * self.shape[0]
                self.M = self.shape[0]
                self.N = _pd(self.shape)
        else:
            self.gpudata = None
            self.M = 0
            self.N = 0
            self.ld = 0
            self.mem_size = 0
            print "warning: shape may not be assigned properly"

    self.nbytes = self.dtype.itemsize * self.mem_size
    self._grid, self._block = splay(self.mem_size, self.M)
'''
#template = string.Template(template)
module = SourceModule(template)
func = module.get_function('convolutionRowGPU')

original = numpy.random.rand(2, 7) * 255
original = numpy.float32(original)
print original

# the tightly-packed mem_alloc_like variant is disabled in favour of mem_alloc_pitch:
'''
destImage_gpu = cuda.mem_alloc_like(original)
sourceImage_gpu = cuda.mem_alloc_like(original)
intermediateImage_gpu = cuda.mem_alloc_like(original)
'''

destImage_gpu, pit = cuda.mem_alloc_pitch(7 * 4, 2, numpy.dtype(numpy.float32).itemsize)
sourceImage_gpu, pit2 = cuda.mem_alloc_pitch(7 * 4, 2, numpy.dtype(numpy.float32).itemsize)
print pit, pit2

#cuda.memcpy_htod(sourceImage_gpu, original)
#cuda.memcpy_htod(destImage_gpu, original)

copy = cuda.Memcpy2D()
copy.set_src_host(original)
copy.set_dst_device(destImage_gpu)
copy.height = 2
copy.width_in_bytes = 7 * 4
copy.src_pitch = 7 * 4
copy.dst_pitch = 128 * 4
copy(aligned=True)
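# --- Hedged follow-up sketch (not from the original script) ---
# The snippet above hard-codes dst_pitch = 128 * 4; it is generally safer to use the
# pitch value actually returned by mem_alloc_pitch ('pit'), and the same Memcpy2D
# mechanism works in reverse to read the pitched buffer back into a host array:
result = numpy.empty_like(original)
back = cuda.Memcpy2D()
back.set_src_device(destImage_gpu)
back.set_dst_host(result)
back.src_pitch = pit          # pitch returned by mem_alloc_pitch above
back.dst_pitch = 7 * 4
back.width_in_bytes = 7 * 4
back.height = 2
back(aligned=True)
print result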
cuda_device = driver.Device(0)
print("cuda_device=%s" % cuda_device)
cuda_context = cuda_device.make_context(flags=driver.ctx_flags.SCHED_AUTO | driver.ctx_flags.MAP_HOST)
try:
    print("cuda_context=%s" % cuda_context)

    BGRA2NV12 = get_CUDA_function(0, "BGRA_to_NV12")
    print("BGRA2NV12=%s" % BGRA2NV12)
    w = roundup(512, 32)
    h = roundup(512, 32)
    log("w=%s, h=%s", w, h)

    cudaInputBuffer, inputPitch = driver.mem_alloc_pitch(w, h * 3 / 2, 16)
    log("CUDA Input Buffer=%s, pitch=%s", hex(int(cudaInputBuffer)), inputPitch)
    #allocate CUDA NV12 buffer (on device):
    cudaNV12Buffer, NV12Pitch = driver.mem_alloc_pitch(w, h * 3 / 2, 16)
    log("CUDA NV12 Buffer=%s, pitch=%s", hex(int(cudaNV12Buffer)), NV12Pitch)

    #host buffers:
    inputBuffer = driver.pagelocked_zeros(inputPitch * h * 3 / 2, dtype=numpy.byte)
    log("inputBuffer=%s", inputBuffer)
    outputBuffer = driver.pagelocked_zeros(inputPitch * h * 3 / 2, dtype=numpy.byte)
    log("outputBuffer=%s", outputBuffer)
def convert_image_rgb(self, image):
    global program
    start = time.time()
    iplanes = image.get_planes()
    w = image.get_width()
    h = image.get_height()
    stride = image.get_rowstride()
    pixels = image.get_pixels()
    debug("convert_image(%s) planes=%s, pixels=%s, size=%s", image, iplanes, type(pixels), len(pixels))
    assert iplanes == ImageWrapper.PACKED, "must use packed format as input"
    assert image.get_pixel_format() == self.src_format, \
        "invalid source format: %s (expected %s)" % (image.get_pixel_format(), self.src_format)
    divs = get_subsampling_divs(self.dst_format)

    #copy packed rgb pixels to GPU:
    upload_start = time.time()
    stream = driver.Stream()
    mem = numpy.frombuffer(pixels, dtype=numpy.byte)
    in_buf = driver.mem_alloc(len(pixels))
    hmem = driver.register_host_memory(mem, driver.mem_host_register_flags.DEVICEMAP)
    pycuda.driver.memcpy_htod_async(in_buf, mem, stream)

    out_bufs = []
    out_strides = []
    out_sizes = []
    for i in range(3):
        x_div, y_div = divs[i]
        out_stride = roundup(self.dst_width/x_div, 4)
        out_height = roundup(self.dst_height/y_div, 2)
        out_buf, out_stride = driver.mem_alloc_pitch(out_stride, out_height, 4)
        out_bufs.append(out_buf)
        out_strides.append(out_stride)
        out_sizes.append((out_stride, out_height))
    #ensure uploading has finished:
    stream.synchronize()
    #we can now unpin the host memory:
    hmem.base.unregister()
    debug("allocation and upload took %.1fms", 1000.0*(time.time() - upload_start))

    kstart = time.time()
    kargs = [in_buf, numpy.int32(stride)]
    for i in range(3):
        kargs.append(out_bufs[i])
        kargs.append(numpy.int32(out_strides[i]))
    blockw, blockh = 16, 16
    #figure out how many pixels we process at a time in each dimension:
    xdiv = max([x[0] for x in divs])
    ydiv = max([x[1] for x in divs])
    gridw = max(1, w/blockw/xdiv)
    if gridw*2*blockw < w:
        gridw += 1
    gridh = max(1, h/blockh/ydiv)
    if gridh*2*blockh < h:
        gridh += 1
    debug("calling %s%s, with grid=%s, block=%s", self.kernel_function_name, tuple(kargs), (gridw, gridh), (blockw, blockh, 1))
    self.kernel_function(*kargs, block=(blockw, blockh, 1), grid=(gridw, gridh))
    #we can now free the GPU source buffer:
    in_buf.free()
    kend = time.time()
    debug("%s took %.1fms", self.kernel_function_name, (kend-kstart)*1000.0)
    self.frames += 1

    #copy output YUV channel data to host memory:
    read_start = time.time()
    pixels = []
    strides = []
    for i in range(3):
        x_div, y_div = divs[i]
        out_size = out_sizes[i]
        #direct full plane async copy keeping current GPU padding:
        plane = driver.aligned_empty(out_size, dtype=numpy.byte)
        driver.memcpy_dtoh_async(plane, out_bufs[i], stream)
        pixels.append(plane.data)
        stride = out_strides[min(len(out_strides)-1, i)]
        strides.append(stride)
    stream.synchronize()
    #the copying has finished, we can now free the YUV GPU memory:
    #(the host memory will be freed by GC when 'pixels' goes out of scope)
    for out_buf in out_bufs:
        out_buf.free()
    self.cuda_context.synchronize()
    read_end = time.time()
    debug("strides=%s", strides)
    debug("read back took %.1fms, total time: %.1f", (read_end-read_start)*1000.0, 1000.0*(time.time()-start))
    return ImageWrapper(0, 0, self.dst_width, self.dst_height, pixels, self.dst_format, 24, strides, planes=ImageWrapper._3_PLANES)
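# --- Hedged aside (not from the original converter) ---
# Illustrates the per-plane allocation pattern used above for a hypothetical 4:2:0
# target: each plane gets its own pitched buffer, and the stride actually returned by
# mem_alloc_pitch (not the requested one) is what must be passed on to the kernel.
import numpy
import pycuda.autoinit  # noqa: F401
import pycuda.driver as driver

dst_width, dst_height = 511, 271          # deliberately odd sizes
divs = ((1, 1), (2, 2), (2, 2))           # Y, U, V subsampling divisors (4:2:0)
for x_div, y_div in divs:
    req_stride = (dst_width // x_div + 3) // 4 * 4    # round row width up to 4 bytes
    height = (dst_height // y_div + 1) // 2 * 2       # round height up to 2 rows
    buf, stride = driver.mem_alloc_pitch(req_stride, height, 4)
    print("requested stride=%d, actual pitch=%d, height=%d" % (req_stride, stride, height))
    buf.free()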