def malloc_gpu_arrays(nx, ny, nz, cex, cey, cez): print 'rank= %d, (%d, %d, %d)' % (rank, nx, ny, nz), total_bytes = nx * ny * nz * 4 * 9 if total_bytes / (1024**3) == 0: print '%d MB' % (total_bytes / (1024**2)) else: print '%1.2f GB' % (float(total_bytes) / (1024**3)) if nz % Dx != 0: print "Error: nz is not multiple of %d" % (Dx) sys.exit() if (nx * ny) % Dy != 0: print "Error: nx*ny is not multiple of %d" % (Dy) sys.exit() f = np.zeros((nx, ny, nz), 'f') ex_gpu = cuda.to_device(f) ey_gpu = cuda.to_device(f) ez_gpu = cuda.to_device(f) hx_gpu = cuda.to_device(f) hy_gpu = cuda.to_device(f) hz_gpu = cuda.to_device(f) descr = cuda.ArrayDescriptor3D() descr.width = nz descr.height = ny descr.depth = nx descr.format = cuda.dtype_to_array_format(f.dtype) descr.num_channels = 1 descr.flags = 0 tcex_gpu = cuda.Array(descr) tcey_gpu = cuda.Array(descr) tcez_gpu = cuda.Array(descr) mcopy = cuda.Memcpy3D() mcopy.width_in_bytes = mcopy.src_pitch = f.strides[1] mcopy.src_height = mcopy.height = ny mcopy.depth = nx mcopy.set_src_host(cex) mcopy.set_dst_array(tcex_gpu) mcopy() mcopy.set_src_host(cey) mcopy.set_dst_array(tcey_gpu) mcopy() mcopy.set_src_host(cez) mcopy.set_dst_array(tcez_gpu) mcopy() eh_fields = [ex_gpu, ey_gpu, ez_gpu, hx_gpu, hy_gpu, hz_gpu] tex_fields = [tcex_gpu, tcey_gpu, tcez_gpu] return eh_fields, tex_fields
def create_3d_texture(a, module, variable, point_sampling=False): a = numpy.asfortranarray(a) w, h, d = a.shape descr = cuda.ArrayDescriptor3D() descr.width = w descr.height = h descr.depth = d descr.format = cuda.dtype_to_array_format(a.dtype) descr.num_channels = 1 descr.flags = 0 ary = cuda.Array(descr) copy = cuda.Memcpy3D() copy.set_src_host(a) copy.set_dst_array(ary) copy.width_in_bytes = copy.src_pitch = a.strides[1] copy.src_height = copy.height = h copy.depth = d copy() out_texref = module.get_texref(variable) out_texref.set_array(ary) if point_sampling: out_texref.set_filter_mode(cuda.filter_mode.POINT) else: out_texref.set_filter_mode(cuda.filter_mode.LINEAR) return out_texref
def to_tex3d(data): """ Source: https://wiki.tiker.net/PyCUDA/Examples/Demo3DSurface """ d, h, w = data.shape descr = cuda.ArrayDescriptor3D() descr.width = w descr.height = h descr.depth = d descr.format = cuda.dtype_to_array_format(data.dtype) descr.num_channels = 1 descr.flags = 0 if isinstance(data, gpuarray.GPUArray): data = data.get() device_array = cuda.Array(descr) copy = cuda.Memcpy3D() copy.set_src_host(data) copy.set_dst_array(device_array) copy.width_in_bytes = copy.src_pitch = data.strides[1] copy.src_height = copy.height = h copy.depth = d copy() return device_array
def gpuArray3DtocudaArray(gpuArray, allowSurfaceBind=False, precision='float'): #import pycuda.autoinit d, h, w = gpuArray.shape descr3D = cuda.ArrayDescriptor3D() descr3D.width = w descr3D.height = h descr3D.depth = d if precision == 'float': descr3D.format = cuda.dtype_to_array_format(gpuArray.dtype) descr3D.num_channels = 1 elif precision == 'double': descr3D.format = cuda.array_format.SIGNED_INT32 descr3D.num_channels = 2 else: print("ERROR: CUDA_ARRAY incompatible precision") sys.exit() descr3D.flags = 0 if allowSurfaceBind: descr3D.flags = cuda.array3d_flags.SURFACE_LDST cudaArray = cuda.Array(descr3D) copy3D = cuda.Memcpy3D() copy3D.set_src_device(gpuArray.ptr) copy3D.set_dst_array(cudaArray) copy3D.width_in_bytes = copy3D.src_pitch = gpuArray.strides[1] copy3D.src_height = copy3D.height = h copy3D.depth = d copy3D() return cudaArray, copy3D
def numpy3d_to_array(np_array, allow_surface_bind=False, layered=True): d, h, w = np_array.shape descr = cuda.ArrayDescriptor3D() descr.width = w descr.height = h descr.depth = d descr.format = cuda.dtype_to_array_format(np_array.dtype) descr.num_channels = 1 descr.flags = 0 if allow_surface_bind: descr.flags = cuda.array3d_flags.SURFACE_LDST if layered: descr.flags = cuda.array3d_flags.ARRAY3D_LAYERED device_array = cuda.Array(descr) copy = cuda.Memcpy3D() copy.set_src_host(np_array) copy.set_dst_array(device_array) copy.width_in_bytes = copy.src_pitch = np_array.strides[1] copy.src_height = copy.height = h copy.depth = d copy() return device_array
def numpy3d_to_array(np_array): '''Copy a 3D (d,h,w) numpy array into a 3D pycuda array that can be used to set a texture. (For some reason, gpuarrays can't be used to do that directly). A transpose happens implicitly; the CUDA array has dim (w,h,d).''' import pycuda.autoinit d, h, w = np_array.shape descr = driver.ArrayDescriptor3D() descr.width = w descr.height = h descr.depth = d if np_array.dtype == np.float64: descr.format = driver.array_format.SIGNED_INT32 descr.num_channels = 2 else: descr.format = driver.dtype_to_array_format(np_array.dtype) descr.num_channels = 1 descr.flags = 0 device_array = driver.Array(descr) copy = driver.Memcpy3D() copy.set_src_host(np_array) copy.set_dst_array(device_array) copy.width_in_bytes = copy.src_pitch = np_array.strides[1] copy.src_height = copy.height = h copy.depth = d copy() return device_array
def numpy3d_to_array(np_array, allow_surface_bind=True): import pycuda.autoinit d, h, w = np_array.shape descr = drv.ArrayDescriptor3D() descr.width = w descr.height = h descr.depth = d descr.format = drv.dtype_to_array_format(np_array.dtype) descr.num_channels = 1 descr.flags = 0 if allow_surface_bind: descr.flags = drv.array3d_flags.SURFACE_LDST device_array = drv.Array(descr) copy = drv.Memcpy3D() copy.set_src_host(np_array) copy.set_dst_array(device_array) copy.width_in_bytes = copy.src_pitch = np_array.strides[1] copy.src_height = copy.height = h copy.depth = d copy() return device_array
def copyTexture(self, data, pixelSize=None, extent=None, isocenter=None): # Convert data to float32 array in Contiguous ordering. self.arrIn = np.array(data, dtype=np.float32, order='C') d, h, w = self.arrIn.shape descr = cuda.ArrayDescriptor3D() descr.width = w descr.height = h descr.depth = d descr.format = cuda.dtype_to_array_format(self.arrIn.dtype) descr.num_channels = 1 # descr.flags = 0 self.gpuTexture = cuda.Array(descr) # Copy array data across. This puts a 3D array in linear memory on the GPU. copy = cuda.Memcpy3D() copy.set_src_host(self.arrIn) copy.set_dst_array(self.gpuTexture) copy.src_pitch = self.arrIn.strides[1] copy.width_in_bytes = self.arrIn.strides[1] copy.height = h copy.depth = d copy.src_height = h copy() self.pixelSize = pixelSize self.isocenter = isocenter # Extent[l,r,b,t,f,b] self.extent = extent # Trigger for setting bottom left corner as 0,0,0. self.zeroExtent = False
def numpy3d_to_array(np_array, allow_surface_bind=True): """Converts 3D numpy array to 3D device array. """ # numpy3d_to_array # taken from pycuda mailing list (striped for C ordering only) d, h, w = np_array.shape descr = cuda.ArrayDescriptor3D() descr.width = w descr.height = h descr.depth = d descr.format = cuda.dtype_to_array_format(np_array.dtype) descr.num_channels = 1 descr.flags = 0 if allow_surface_bind: descr.flags = cuda.array3d_flags.SURFACE_LDST device_array = cuda.Array(descr) copy = cuda.Memcpy3D() copy.set_src_host(np_array) copy.set_dst_array(device_array) copy.width_in_bytes = copy.src_pitch = np_array.strides[1] copy.src_height = copy.height = h copy.depth = d copy() return device_array
def __init__(self): self.d_params = cuda.mem_alloc( self.ntemporal_samples * DevSrc.max_params * 4) self.palette_surf_dsc = util.argset(cuda.ArrayDescriptor3D(), height=self.palette_height, width=self.palette_width, depth=0, format=cuda.array_format.SIGNED_INT32, num_channels=2, flags=cuda.array3d_flags.SURFACE_LDST) self.d_pal_array = cuda.Array(self.palette_surf_dsc)
def create_2D_array(mat): descr = driver.ArrayDescriptor() descr.width = mat.shape[1] descr.height = mat.shape[0] descr.format = driver.dtype_to_array_format(mat.dtype) descr.num_channels = 1 descr.flags = 0 ary = driver.Array(descr) return ary
def test_3d_texture(self): # adapted from code by Nicolas Pinto w = 2 h = 4 d = 8 shape = (w, h, d) a = numpy.asarray(numpy.random.randn(*shape), dtype=numpy.float32, order="F") descr = drv.ArrayDescriptor3D() descr.width = w descr.height = h descr.depth = d descr.format = drv.dtype_to_array_format(a.dtype) descr.num_channels = 1 descr.flags = 0 ary = drv.Array(descr) copy = drv.Memcpy3D() copy.set_src_host(a) copy.set_dst_array(ary) copy.width_in_bytes = copy.src_pitch = a.strides[1] copy.src_height = copy.height = h copy.depth = d copy() mod = SourceModule(""" texture<float, 3, cudaReadModeElementType> mtx_tex; __global__ void copy_texture(float *dest) { int x = threadIdx.x; int y = threadIdx.y; int z = threadIdx.z; int dx = blockDim.x; int dy = blockDim.y; int i = (z*dy + y)*dx + x; dest[i] = tex3D(mtx_tex, x, y, z); //dest[i] = x; } """) copy_texture = mod.get_function("copy_texture") mtx_tex = mod.get_texref("mtx_tex") mtx_tex.set_array(ary) dest = numpy.zeros(shape, dtype=numpy.float32, order="F") copy_texture(drv.Out(dest), block=shape, texrefs=[mtx_tex]) assert la.norm(dest - a) == 0
def alloc_coeff_arrays(s): f = np.zeros((s.nx, s.ny, s.nz), 'f') s.cex = np.ones_like(f) * 0.5 s.cex[:, -1, :] = 0 s.cex[:, :, -1] = 0 s.cey = np.ones_like(f) * 0.5 s.cey[:, :, -1] = 0 s.cey[-1, :, :] = 0 s.cez = np.ones_like(f) * 0.5 s.cez[-1, :, :] = 0 s.cez[:, -1, :] = 0 descr = cuda.ArrayDescriptor3D() descr.width = s.nz descr.height = s.ny descr.depth = s.nx descr.format = cuda.dtype_to_array_format(f.dtype) descr.num_channels = 1 descr.flags = 0 s.tcex_gpu = cuda.Array(descr) s.tcey_gpu = cuda.Array(descr) s.tcez_gpu = cuda.Array(descr) mcpy = cuda.Memcpy3D() mcpy.width_in_bytes = mcpy.src_pitch = f.strides[1] mcpy.src_height = mcpy.height = s.ny mcpy.depth = s.nx mcpy.set_src_host(s.cex) mcpy.set_dst_array(s.tcex_gpu) mcpy() mcpy.set_src_host(s.cey) mcpy.set_dst_array(s.tcey_gpu) mcpy() mcpy.set_src_host(s.cez) mcpy.set_dst_array(s.tcez_gpu) mcpy()
def gpuArray2DtocudaArray(gpuArray): #import pycuda.autoinit h, w = gpuArray.shape descr2D = cuda.ArrayDescriptor() descr2D.width = w descr2D.height = h descr2D.format = cuda.dtype_to_array_format(gpuArray.dtype) descr2D.num_channels = 1 cudaArray = cuda.Array(descr2D) copy2D = cuda.Memcpy2D() copy2D.set_src_device(gpuArray.ptr) copy2D.set_dst_array(cudaArray) copy2D.src_pitch = gpuArray.strides[0] copy2D.width_in_bytes = copy2D.src_pitch = gpuArray.strides[0] copy2D.src_height = copy2D.height = h copy2D(aligned=True) return cudaArray, copy2D
def np2DtoCudaArray(npArray, allowSurfaceBind=False): #import pycuda.autoinit h, w = npArray.shape descr2D = cuda.ArrayDescriptor() descr2D.width = w descr2D.height = h descr2D.format = cuda.dtype_to_array_format(npArray.dtype) descr2D.num_channels = 1 if allowSurfaceBind: descr.flags = cuda.array3d_flags.SURFACE_LDST cudaArray = cuda.Array(descr2D) copy2D = cuda.Memcpy2D() copy2D.set_src_host(npArray) copy2D.set_dst_array(cudaArray) copy2D.src_pitch = npArray.strides[0] copy2D.width_in_bytes = copy2D.src_pitch = npArray.strides[0] copy2D.src_height = copy2D.height = h copy2D(aligned=True) return cudaArray, descr2D
def texCubeToGPU(texCube): descr = cuda.ArrayDescriptor3D() descr.width = texCube.shape[2] descr.height = texCube.shape[1] descr.depth = 6 descr.format = cuda.dtype_to_array_format(texCube.dtype) descr.num_channels = 4 descr.flags = cuda.array3d_flags.CUBEMAP texCubeArray = cuda.Array(descr) copy = cuda.Memcpy3D() copy.set_src_host(texCube) copy.set_dst_array(texCubeArray) copy.width_in_bytes = copy.src_pitch = texCube.strides[1] #d*h*w*c copy.src_height = copy.height = texCube.shape[1] copy.depth = 6 copy() return texCubeArray
def np3DtoCudaArray(npArray, allowSurfaceBind=False): #import pycuda.autoinit d, h, w = npArray.shape descr3D = cuda.ArrayDescriptor3D() descr3D.width = w descr3D.height = h descr3D.depth = d descr3D.format = cuda.dtype_to_array_format(npArray.dtype) descr3D.num_channels = 1 descr3D.flags = 0 if allowSurfaceBind: descr3D.flags = cuda.array3d_flags.SURFACE_LDST cudaArray = cuda.Array(descr3D) copy3D = cuda.Memcpy3D() copy3D.set_src_host(npArray) copy3D.set_dst_array(cudaArray) copy3D.width_in_bytes = copy3D.src_pitch = npArray.strides[1] copy3D.src_height = copy3D.height = h copy3D.depth = d copy3D() return cudaArray, copy3D
def _prepare_F_texture(self): descr = drv.ArrayDescriptor3D() descr.width = self.side descr.height = self.side descr.depth = self.side descr.format = drv.dtype_to_array_format(self.F_gpu.dtype) descr.num_channels = 1 descr.flags = 0 F_array = drv.Array(descr) copy = drv.Memcpy3D() copy.set_src_device(self.F_gpu.gpudata) copy.set_dst_array(F_array) copy.width_in_bytes = copy.src_pitch = self.F_gpu.strides[1] copy.src_height = copy.height = self.side copy.depth = self.side self.F_gpu_to_array_copy = copy self.F_gpu_to_array_copy() self.F_texture.set_array(F_array)
def gpu3D(src): """ """ w, h, d = src.shape descr = cuda.ArrayDescriptor3D() descr.width = w descr.height = h descr.depth = d descr.format = cuda.dtype_to_array_format(src.dtype) descr.num_channels = 1 descr.flags = 0 dst = cuda.Array(descr) copy = cuda.Memcpy3D() copy.set_src_host(src) copy.set_dst_array(dst) copy.width_in_bytes = copy.src_pitch = src.strides[1] copy.src_height = copy.height = h copy.depth = d copy() return dst
hx_gpu = cuda.to_device(f) hy_gpu = cuda.to_device(f) hz_gpu = cuda.to_device(f) cex = set_c(f, 'yz') cey = set_c(f, 'zx') cez = set_c(f, 'xy') descr = cuda.ArrayDescriptor3D() descr.width = nz descr.height = ny descr.depth = nx descr.format = cuda.dtype_to_array_format(f.dtype) descr.num_channels = 1 descr.flags = 0 tcex_gpu = cuda.Array(descr) tcey_gpu = cuda.Array(descr) tcez_gpu = cuda.Array(descr) mcopy = cuda.Memcpy3D() mcopy.width_in_bytes = mcopy.src_pitch = f.strides[1] mcopy.src_height = mcopy.height = ny mcopy.depth = nx arrcopy(mcopy, cex, tcex_gpu) arrcopy(mcopy, cey, tcey_gpu) arrcopy(mcopy, cez, tcez_gpu) # prepare kernels from pycuda.compiler import SourceModule mod = SourceModule( kernels.replace('Dx', str(Dx)).replace('Dy', str(Dy)).replace(
f = np.zeros((nx, ny, nz), 'f') g = np.zeros((nx, ny, nz), 'f') cf = np.random.randn(nx * ny * nz).astype(np.float32).reshape((nx, ny, nz)) f_gpu = cuda.to_device(f) g_gpu = cuda.to_device(f) cf_gpu = cuda.to_device(cf) descr = cuda.ArrayDescriptor3D() descr.width = nz descr.height = ny descr.depth = nx descr.format = cuda.dtype_to_array_format(f.dtype) descr.num_channels = 1 descr.flags = 0 tcf_gpu = cuda.Array(descr) mcopy = cuda.Memcpy3D() mcopy.width_in_bytes = mcopy.src_pitch = f.strides[1] mcopy.src_height = mcopy.height = ny mcopy.depth = nx arrcopy(mcopy, cf, tcf_gpu) # prepare kernels from pycuda.compiler import SourceModule kernels = kernels.replace('Dx', '16').replace('Dy', '16').replace( 'nyz', str(ny * nz)).replace('nx', str(nx)).replace('ny', str(ny)).replace('nz', str(nz)) mod = SourceModule(kernels)
def np3DtoCudaArray(npArray, prec, order = "C", allowSurfaceBind=False): ''' Some parameters like stride are explained in PyCUDA: driver.py test_driver.py gpuarray.py''' # For 1D-2D Cuda Arrays the descriptor is the same just puttin LAYERED flags # if order != "C": raise LogicError("Just implemented for C order") dimension = len(npArray.shape) case = order in ["C","F"] if not case: raise LogicError("order must be either F or C") # if dimension == 1: # w = npArray.shape[0] # h, d = 0,0 if dimension == 2: if order == "C": stride = 0 if order == "F": stride = -1 h, w = npArray.shape d = 1 if allowSurfaceBind: descrArr = cuda.ArrayDescriptor3D() descrArr.width = w descrArr.height = h descrArr.depth = d else: descrArr = cuda.ArrayDescriptor() descrArr.width = w descrArr.height = h # descrArr.depth = d elif dimension == 3: if order == "C": stride = 1 if order == "F": stride = 1 d, h, w = npArray.shape descrArr = cuda.ArrayDescriptor3D() descrArr.width = w descrArr.height = h descrArr.depth = d else: raise LogicError("CUDArray dimesnsion 2 and 3 supported at the moment ... ") if prec == 'float': descrArr.format = cuda.dtype_to_array_format(npArray.dtype) descrArr.num_channels = 1 elif prec == 'cfloat': # Hack for complex 64 = (float 32, float 32) == (re,im) descrArr.format = cuda.array_format.SIGNED_INT32 # Reading data as int2 (hi=re,lo=im) structure descrArr.num_channels = 2 elif prec == 'double': # Hack for doubles descrArr.format = cuda.array_format.SIGNED_INT32 # Reading data as int2 (hi,lo) structure descrArr.num_channels = 2 elif prec == 'cdouble': # Hack for doubles descrArr.format = cuda.array_format.SIGNED_INT32 # Reading data as int4 (re=(hi,lo),im=(hi,lo)) structure descrArr.num_channels = 4 else: descrArr.format = cuda.dtype_to_array_format(npArray.dtype) descrArr.num_channels = 1 if allowSurfaceBind: if dimension==2: descrArr.flags |= cuda.array3d_flags.ARRAY3D_LAYERED descrArr.flags |= cuda.array3d_flags.SURFACE_LDST cudaArray = cuda.Array(descrArr) if allowSurfaceBind or dimension==3 : copy3D = cuda.Memcpy3D() copy3D.set_src_host(npArray) copy3D.set_dst_array(cudaArray) copy3D.width_in_bytes = copy3D.src_pitch = npArray.strides[stride] # if dimension==3: copy3D.width_in_bytes = copy3D.src_pitch = npArray.strides[1] #Jut C order support # if dimension==2: copy3D.width_in_bytes = copy3D.src_pitch = npArray.strides[0] #Jut C order support copy3D.src_height = copy3D.height = h copy3D.depth = d copy3D() return cudaArray, copy3D else: # if dimension == 3: # copy3D = cuda.Memcpy3D() # copy3D.set_src_host(npArray) # copy3D.set_dst_array(cudaArray) # copy3D.width_in_bytes = copy3D.src_pitch = npArray.strides[stride] # # if dimension==3: copy3D.width_in_bytes = copy3D.src_pitch = npArray.strides[1] #Jut C order support # # if dimension==2: copy3D.width_in_bytes = copy3D.src_pitch = npArray.strides[0] #Jut C order support # copy3D.src_height = copy3D.height = h # copy3D.depth = d # copy3D() # return cudaArray, copy3D # if dimension == 2: cudaArray = cuda.Array(descrArr) copy2D = cuda.Memcpy2D() copy2D.set_src_host(npArray) copy2D.set_dst_array(cudaArray) copy2D.width_in_bytes = copy2D.src_pitch = npArray.strides[stride] # copy2D.width_in_bytes = copy2D.src_pitch = npArray.strides[0] #Jut C order support copy2D.src_height = copy2D.height = h copy2D(aligned=True) return cudaArray, copy2D
ey_gpu = cuda.to_device(f) ez_gpu = cuda.to_device(f) hx_gpu = cuda.to_device(f) hy_gpu = cuda.to_device(f) hz_gpu = cuda.to_device(f) # memory allocate with cuda array descr = cuda.ArrayDescriptor3D() descr.width = nx descr.height = ny descr.depth = nz descr.format = cuda.dtype_to_array_format(f.dtype) descr.num_channels = 1 descr.flags = 0 cex_gpu = cuda.Array(descr) cey_gpu = cuda.Array(descr) cez_gpu = cuda.Array(descr) chx_gpu = cuda.Array(descr) chy_gpu = cuda.Array(descr) chz_gpu = cuda.Array(descr) mcopy = cuda.Memcpy3D() mcopy.width_in_bytes = mcopy.src_pitch = f.strides[1] mcopy.src_height = mcopy.height = ny mcopy.depth = nz arrcopy(mcopy, set_c(f, (None, -1, -1)), cex_gpu) arrcopy(mcopy, set_c(f, (-1, None, -1)), cey_gpu) arrcopy(mcopy, set_c(f, (-1, -1, None)), cez_gpu) arrcopy(mcopy, set_c(f, (None, 0, 0)), chx_gpu)
d = 4 shape = (d, h, w) a = np.arange(24).reshape(*shape, order='C').astype('float32') print(a.shape, a.strides) print(a) descr = drv.ArrayDescriptor3D() descr.width = w descr.height = h descr.depth = d descr.format = drv.dtype_to_array_format(a.dtype) descr.num_channels = 1 descr.flags = 0 ary = drv.Array(descr) copy = drv.Memcpy3D() copy.set_src_host(a) copy.set_dst_array(ary) copy.width_in_bytes = copy.src_pitch = a.strides[1] copy.src_height = copy.height = h copy.depth = d copy() mod = SourceModule(""" texture<float, 3, cudaReadModeElementType> mtx_tex; __global__ void copy_texture(float *dest) {