def create_3d_texture(a, module, variable, point_sampling=False): a = numpy.asfortranarray(a) w, h, d = a.shape descr = cuda.ArrayDescriptor3D() descr.width = w descr.height = h descr.depth = d descr.format = cuda.dtype_to_array_format(a.dtype) descr.num_channels = 1 descr.flags = 0 ary = cuda.Array(descr) copy = cuda.Memcpy3D() copy.set_src_host(a) copy.set_dst_array(ary) copy.width_in_bytes = copy.src_pitch = a.strides[1] copy.src_height = copy.height = h copy.depth = d copy() out_texref = module.get_texref(variable) out_texref.set_array(ary) if point_sampling: out_texref.set_filter_mode(cuda.filter_mode.POINT) else: out_texref.set_filter_mode(cuda.filter_mode.LINEAR) return out_texref
def numpy3d_to_array(np_array, allow_surface_bind=True): """Converts 3D numpy array to 3D device array. """ # numpy3d_to_array # taken from pycuda mailing list (striped for C ordering only) d, h, w = np_array.shape descr = cuda.ArrayDescriptor3D() descr.width = w descr.height = h descr.depth = d descr.format = cuda.dtype_to_array_format(np_array.dtype) descr.num_channels = 1 descr.flags = 0 if allow_surface_bind: descr.flags = cuda.array3d_flags.SURFACE_LDST device_array = cuda.Array(descr) copy = cuda.Memcpy3D() copy.set_src_host(np_array) copy.set_dst_array(device_array) copy.width_in_bytes = copy.src_pitch = np_array.strides[1] copy.src_height = copy.height = h copy.depth = d copy() return device_array
def gpuArray3DtocudaArray( gpuArray, allowSurfaceBind=False, precision='float' ): #import pycuda.autoinit d, h, w = gpuArray.shape descr3D = cuda.ArrayDescriptor3D() descr3D.width = w descr3D.height = h descr3D.depth = d if precision == 'float': descr3D.format = cuda.dtype_to_array_format(gpuArray.dtype) descr3D.num_channels = 1 elif precision == 'double': descr3D.format = cuda.array_format.SIGNED_INT32 descr3D.num_channels = 2 else: print "ERROR: CUDA_ARRAY incompatible precision" sys.exit() descr3D.flags = 0 if allowSurfaceBind: descr3D.flags = cuda.array3d_flags.SURFACE_LDST cudaArray = cuda.Array(descr3D) copy3D = cuda.Memcpy3D() copy3D.set_src_device(gpuArray.ptr) copy3D.set_dst_array(cudaArray) copy3D.width_in_bytes = copy3D.src_pitch = gpuArray.strides[1] copy3D.src_height = copy3D.height = h copy3D.depth = d copy3D() return cudaArray, copy3D
def bind_to_texref_ext(self, texref, channels=1, allow_double_hack=False, allow_offset=False): if self.dtype == np.float64 and allow_double_hack: if channels != 1: raise ValueError, "'fake' double precision textures can only have one channel" channels = 2 fmt = drv.array_format.SIGNED_INT32 read_as_int = True else: fmt = drv.dtype_to_array_format(self.dtype) read_as_int = np.integer in self.dtype.type.__mro__ offset = texref.set_address(self.gpudata, self.nbytes, allow_offset=allow_offset) texref.set_format(fmt, channels) if read_as_int: texref.set_flags(texref.get_flags() | drv.TRSF_READ_AS_INTEGER) return offset / self.dtype.itemsize
def numpy3d_to_array(np_array, allow_surface_bind=True): import pycuda.autoinit d, h, w = np_array.shape descr = drv.ArrayDescriptor3D() descr.width = w descr.height = h descr.depth = d descr.format = drv.dtype_to_array_format(np_array.dtype) descr.num_channels = 1 descr.flags = 0 if allow_surface_bind: descr.flags = drv.array3d_flags.SURFACE_LDST device_array = drv.Array(descr) copy = drv.Memcpy3D() copy.set_src_host(np_array) copy.set_dst_array(device_array) copy.width_in_bytes = copy.src_pitch = np_array.strides[1] copy.src_height = copy.height = h copy.depth = d copy() return device_array
def numpy3d_to_array(np_array): '''Copy a 3D (d,h,w) numpy array into a 3D pycuda array that can be used to set a texture. (For some reason, gpuarrays can't be used to do that directly). A transpose happens implicitly; the CUDA array has dim (w,h,d).''' import pycuda.autoinit d, h, w = np_array.shape descr = driver.ArrayDescriptor3D() descr.width = w descr.height = h descr.depth = d if np_array.dtype == np.float64: descr.format = driver.array_format.SIGNED_INT32 descr.num_channels = 2 else: descr.format = driver.dtype_to_array_format(np_array.dtype) descr.num_channels = 1 descr.flags = 0 device_array = driver.Array(descr) copy = driver.Memcpy3D() copy.set_src_host(np_array) copy.set_dst_array(device_array) copy.width_in_bytes = copy.src_pitch = np_array.strides[1] copy.src_height = copy.height = h copy.depth = d copy() return device_array
def bind_to_texref_ext(self, texref, channels=1, allow_double_hack=False, allow_offset=False): if not self.flags.forc: raise RuntimeError("only contiguous arrays may " "be used as arguments to this operation") if self.dtype == np.float64 and allow_double_hack: if channels != 1: raise ValueError("'fake' double precision textures can " "only have one channel") channels = 2 fmt = drv.array_format.SIGNED_INT32 read_as_int = True else: fmt = drv.dtype_to_array_format(self.dtype) read_as_int = np.integer in self.dtype.type.__mro__ offset = texref.set_address(self.gpudata, self.nbytes, allow_offset=allow_offset) texref.set_format(fmt, channels) if read_as_int: texref.set_flags(texref.get_flags() | drv.TRSF_READ_AS_INTEGER) return offset / self.dtype.itemsize
def numpy3d_to_array(np_array, allow_surface_bind=False, layered=True): d, h, w = np_array.shape descr = cuda.ArrayDescriptor3D() descr.width = w descr.height = h descr.depth = d descr.format = cuda.dtype_to_array_format(np_array.dtype) descr.num_channels = 1 descr.flags = 0 if allow_surface_bind: descr.flags = cuda.array3d_flags.SURFACE_LDST if layered: descr.flags = cuda.array3d_flags.ARRAY3D_LAYERED device_array = cuda.Array(descr) copy = cuda.Memcpy3D() copy.set_src_host(np_array) copy.set_dst_array(device_array) copy.width_in_bytes = copy.src_pitch = np_array.strides[1] copy.src_height = copy.height = h copy.depth = d copy() return device_array
def bind_to_texref_ext(self, texref, channels=1, allow_double_hack=False, allow_offset=False): if not self.flags.forc: raise RuntimeError("only contiguous arrays may " "be used as arguments to this operation") if self.dtype == np.float64 and allow_double_hack: if channels != 1: raise ValueError( "'fake' double precision textures can " "only have one channel") channels = 2 fmt = drv.array_format.SIGNED_INT32 read_as_int = True else: fmt = drv.dtype_to_array_format(self.dtype) read_as_int = np.integer in self.dtype.type.__mro__ offset = texref.set_address(self.gpudata, self.nbytes, allow_offset=allow_offset) texref.set_format(fmt, channels) if read_as_int: texref.set_flags(texref.get_flags() | drv.TRSF_READ_AS_INTEGER) return offset/self.dtype.itemsize
def gpuArray3DtocudaArray(gpuArray, allowSurfaceBind=False, precision='float'): #import pycuda.autoinit d, h, w = gpuArray.shape descr3D = cuda.ArrayDescriptor3D() descr3D.width = w descr3D.height = h descr3D.depth = d if precision == 'float': descr3D.format = cuda.dtype_to_array_format(gpuArray.dtype) descr3D.num_channels = 1 elif precision == 'double': descr3D.format = cuda.array_format.SIGNED_INT32 descr3D.num_channels = 2 else: print("ERROR: CUDA_ARRAY incompatible precision") sys.exit() descr3D.flags = 0 if allowSurfaceBind: descr3D.flags = cuda.array3d_flags.SURFACE_LDST cudaArray = cuda.Array(descr3D) copy3D = cuda.Memcpy3D() copy3D.set_src_device(gpuArray.ptr) copy3D.set_dst_array(cudaArray) copy3D.width_in_bytes = copy3D.src_pitch = gpuArray.strides[1] copy3D.src_height = copy3D.height = h copy3D.depth = d copy3D() return cudaArray, copy3D
def to_tex3d(data): """ Source: https://wiki.tiker.net/PyCUDA/Examples/Demo3DSurface """ d, h, w = data.shape descr = cuda.ArrayDescriptor3D() descr.width = w descr.height = h descr.depth = d descr.format = cuda.dtype_to_array_format(data.dtype) descr.num_channels = 1 descr.flags = 0 if isinstance(data, gpuarray.GPUArray): data = data.get() device_array = cuda.Array(descr) copy = cuda.Memcpy3D() copy.set_src_host(data) copy.set_dst_array(device_array) copy.width_in_bytes = copy.src_pitch = data.strides[1] copy.src_height = copy.height = h copy.depth = d copy() return device_array
def copyTexture(self, data, pixelSize=None, extent=None, isocenter=None): # Convert data to float32 array in Contiguous ordering. self.arrIn = np.array(data, dtype=np.float32, order='C') d, h, w = self.arrIn.shape descr = cuda.ArrayDescriptor3D() descr.width = w descr.height = h descr.depth = d descr.format = cuda.dtype_to_array_format(self.arrIn.dtype) descr.num_channels = 1 # descr.flags = 0 self.gpuTexture = cuda.Array(descr) # Copy array data across. This puts a 3D array in linear memory on the GPU. copy = cuda.Memcpy3D() copy.set_src_host(self.arrIn) copy.set_dst_array(self.gpuTexture) copy.src_pitch = self.arrIn.strides[1] copy.width_in_bytes = self.arrIn.strides[1] copy.height = h copy.depth = d copy.src_height = h copy() self.pixelSize = pixelSize self.isocenter = isocenter # Extent[l,r,b,t,f,b] self.extent = extent # Trigger for setting bottom left corner as 0,0,0. self.zeroExtent = False
def np3d_to_device_array(np_array, allow_surface_bind=True): d, h, w = np_array.shape descr = drv.ArrayDescriptor3D() descr.width = w descr.height = h descr.depth = d descr.format = drv.dtype_to_array_format(np_array.dtype) descr.num_channels = 1 descr.flags = 0 if allow_surface_bind: descr.flags = drv.array3d_flags.SURFACE_LDST device_array = drv.Array(descr) copy = drv.Memcpy3D() copy.set_src_host(np_array) copy.set_dst_array(device_array) copy.width_in_bytes = copy.src_pitch = np_array.strides[1] copy.src_height = copy.height = h copy.depth = d copy() return device_array
def create_2D_array(mat): descr = driver.ArrayDescriptor() descr.width = mat.shape[1] descr.height = mat.shape[0] descr.format = driver.dtype_to_array_format(mat.dtype) descr.num_channels = 1 descr.flags = 0 ary = driver.Array(descr) return ary
def test_3d_texture(self): # adapted from code by Nicolas Pinto w = 2 h = 4 d = 8 shape = (w, h, d) a = np.asarray( np.random.randn(*shape), dtype=np.float32, order="F") descr = drv.ArrayDescriptor3D() descr.width = w descr.height = h descr.depth = d descr.format = drv.dtype_to_array_format(a.dtype) descr.num_channels = 1 descr.flags = 0 ary = drv.Array(descr) copy = drv.Memcpy3D() copy.set_src_host(a) copy.set_dst_array(ary) copy.width_in_bytes = copy.src_pitch = a.strides[1] copy.src_height = copy.height = h copy.depth = d copy() mod = SourceModule(""" texture<float, 3, cudaReadModeElementType> mtx_tex; __global__ void copy_texture(float *dest) { int x = threadIdx.x; int y = threadIdx.y; int z = threadIdx.z; int dx = blockDim.x; int dy = blockDim.y; int i = (z*dy + y)*dx + x; dest[i] = tex3D(mtx_tex, x, y, z); //dest[i] = x; } """) copy_texture = mod.get_function("copy_texture") mtx_tex = mod.get_texref("mtx_tex") mtx_tex.set_array(ary) dest = np.zeros(shape, dtype=np.float32, order="F") copy_texture(drv.Out(dest), block=shape, texrefs=[mtx_tex]) assert la.norm(dest-a) == 0
def test_3d_texture(self): # adapted from code by Nicolas Pinto w = 2 h = 4 d = 8 shape = (w, h, d) a = numpy.asarray(numpy.random.randn(*shape), dtype=numpy.float32, order="F") descr = drv.ArrayDescriptor3D() descr.width = w descr.height = h descr.depth = d descr.format = drv.dtype_to_array_format(a.dtype) descr.num_channels = 1 descr.flags = 0 ary = drv.Array(descr) copy = drv.Memcpy3D() copy.set_src_host(a) copy.set_dst_array(ary) copy.width_in_bytes = copy.src_pitch = a.strides[1] copy.src_height = copy.height = h copy.depth = d copy() mod = SourceModule(""" texture<float, 3, cudaReadModeElementType> mtx_tex; __global__ void copy_texture(float *dest) { int x = threadIdx.x; int y = threadIdx.y; int z = threadIdx.z; int dx = blockDim.x; int dy = blockDim.y; int i = (z*dy + y)*dx + x; dest[i] = tex3D(mtx_tex, x, y, z); //dest[i] = x; } """) copy_texture = mod.get_function("copy_texture") mtx_tex = mod.get_texref("mtx_tex") mtx_tex.set_array(ary) dest = numpy.zeros(shape, dtype=numpy.float32, order="F") copy_texture(drv.Out(dest), block=shape, texrefs=[mtx_tex]) assert la.norm(dest - a) == 0
def malloc_gpu_arrays(nx, ny, nz, cex, cey, cez): print "rank= %d, (%d, %d, %d)" % (rank, nx, ny, nz), total_bytes = nx * ny * nz * 4 * 9 if total_bytes / (1024 ** 3) == 0: print "%d MB" % (total_bytes / (1024 ** 2)) else: print "%1.2f GB" % (float(total_bytes) / (1024 ** 3)) if nz % Dx != 0: print "Error: nz is not multiple of %d" % (Dx) sys.exit() if (nx * ny) % Dy != 0: print "Error: nx*ny is not multiple of %d" % (Dy) sys.exit() f = np.zeros((nx, ny, nz), "f") ex_gpu = cuda.to_device(f) ey_gpu = cuda.to_device(f) ez_gpu = cuda.to_device(f) hx_gpu = cuda.to_device(f) hy_gpu = cuda.to_device(f) hz_gpu = cuda.to_device(f) descr = cuda.ArrayDescriptor3D() descr.width = nz descr.height = ny descr.depth = nx descr.format = cuda.dtype_to_array_format(f.dtype) descr.num_channels = 1 descr.flags = 0 tcex_gpu = cuda.Array(descr) tcey_gpu = cuda.Array(descr) tcez_gpu = cuda.Array(descr) mcopy = cuda.Memcpy3D() mcopy.width_in_bytes = mcopy.src_pitch = f.strides[1] mcopy.src_height = mcopy.height = ny mcopy.depth = nx mcopy.set_src_host(cex) mcopy.set_dst_array(tcex_gpu) mcopy() mcopy.set_src_host(cey) mcopy.set_dst_array(tcey_gpu) mcopy() mcopy.set_src_host(cez) mcopy.set_dst_array(tcez_gpu) mcopy() eh_fields = [ex_gpu, ey_gpu, ez_gpu, hx_gpu, hy_gpu, hz_gpu] tex_fields = [tcex_gpu, tcey_gpu, tcez_gpu] return eh_fields, tex_fields
def malloc_gpu_arrays(nx, ny, nz, cex, cey, cez): print 'rank= %d, (%d, %d, %d)' % (rank, nx, ny, nz), total_bytes = nx * ny * nz * 4 * 9 if total_bytes / (1024**3) == 0: print '%d MB' % (total_bytes / (1024**2)) else: print '%1.2f GB' % (float(total_bytes) / (1024**3)) if nz % Dx != 0: print "Error: nz is not multiple of %d" % (Dx) sys.exit() if (nx * ny) % Dy != 0: print "Error: nx*ny is not multiple of %d" % (Dy) sys.exit() f = np.zeros((nx, ny, nz), 'f') ex_gpu = cuda.to_device(f) ey_gpu = cuda.to_device(f) ez_gpu = cuda.to_device(f) hx_gpu = cuda.to_device(f) hy_gpu = cuda.to_device(f) hz_gpu = cuda.to_device(f) descr = cuda.ArrayDescriptor3D() descr.width = nz descr.height = ny descr.depth = nx descr.format = cuda.dtype_to_array_format(f.dtype) descr.num_channels = 1 descr.flags = 0 tcex_gpu = cuda.Array(descr) tcey_gpu = cuda.Array(descr) tcez_gpu = cuda.Array(descr) mcopy = cuda.Memcpy3D() mcopy.width_in_bytes = mcopy.src_pitch = f.strides[1] mcopy.src_height = mcopy.height = ny mcopy.depth = nx mcopy.set_src_host(cex) mcopy.set_dst_array(tcex_gpu) mcopy() mcopy.set_src_host(cey) mcopy.set_dst_array(tcey_gpu) mcopy() mcopy.set_src_host(cez) mcopy.set_dst_array(tcez_gpu) mcopy() eh_fields = [ex_gpu, ey_gpu, ez_gpu, hx_gpu, hy_gpu, hz_gpu] tex_fields = [tcex_gpu, tcey_gpu, tcez_gpu] return eh_fields, tex_fields
def gpuArray2DtocudaArray(gpuArray): #import pycuda.autoinit h, w = gpuArray.shape descr2D = cuda.ArrayDescriptor() descr2D.width = w descr2D.height = h descr2D.format = cuda.dtype_to_array_format(gpuArray.dtype) descr2D.num_channels = 1 cudaArray = cuda.Array(descr2D) copy2D = cuda.Memcpy2D() copy2D.set_src_device(gpuArray.ptr) copy2D.set_dst_array(cudaArray) copy2D.src_pitch = gpuArray.strides[0] copy2D.width_in_bytes = copy2D.src_pitch = gpuArray.strides[0] copy2D.src_height = copy2D.height = h copy2D(aligned=True) return cudaArray, copy2D
def gpuArray2DtocudaArray( gpuArray ): #import pycuda.autoinit h, w = gpuArray.shape descr2D = cuda.ArrayDescriptor() descr2D.width = w descr2D.height = h descr2D.format = cuda.dtype_to_array_format(gpuArray.dtype) descr2D.num_channels = 1 cudaArray = cuda.Array(descr2D) copy2D = cuda.Memcpy2D() copy2D.set_src_device(gpuArray.ptr) copy2D.set_dst_array(cudaArray) copy2D.src_pitch = gpuArray.strides[0] copy2D.width_in_bytes = copy2D.src_pitch = gpuArray.strides[0] copy2D.src_height = copy2D.height = h copy2D(aligned=True) return cudaArray, copy2D
def np2DtoCudaArray( npArray, allowSurfaceBind=False ): #import pycuda.autoinit h, w = npArray.shape descr2D = cuda.ArrayDescriptor() descr2D.width = w descr2D.height = h descr2D.format = cuda.dtype_to_array_format(npArray.dtype) descr2D.num_channels = 1 if allowSurfaceBind: descr.flags = cuda.array3d_flags.SURFACE_LDST cudaArray = cuda.Array(descr2D) copy2D = cuda.Memcpy2D() copy2D.set_src_host(npArray) copy2D.set_dst_array(cudaArray) copy2D.src_pitch = npArray.strides[0] copy2D.width_in_bytes = copy2D.src_pitch = npArray.strides[0] copy2D.src_height = copy2D.height = h copy2D(aligned=True) return cudaArray, descr2D
def np2DtoCudaArray(npArray, allowSurfaceBind=False): #import pycuda.autoinit h, w = npArray.shape descr2D = cuda.ArrayDescriptor() descr2D.width = w descr2D.height = h descr2D.format = cuda.dtype_to_array_format(npArray.dtype) descr2D.num_channels = 1 if allowSurfaceBind: descr.flags = cuda.array3d_flags.SURFACE_LDST cudaArray = cuda.Array(descr2D) copy2D = cuda.Memcpy2D() copy2D.set_src_host(npArray) copy2D.set_dst_array(cudaArray) copy2D.src_pitch = npArray.strides[0] copy2D.width_in_bytes = copy2D.src_pitch = npArray.strides[0] copy2D.src_height = copy2D.height = h copy2D(aligned=True) return cudaArray, descr2D
def texCubeToGPU(texCube): descr = cuda.ArrayDescriptor3D() descr.width = texCube.shape[2] descr.height = texCube.shape[1] descr.depth = 6 descr.format = cuda.dtype_to_array_format(texCube.dtype) descr.num_channels = 4 descr.flags = cuda.array3d_flags.CUBEMAP texCubeArray = cuda.Array(descr) copy = cuda.Memcpy3D() copy.set_src_host(texCube) copy.set_dst_array(texCubeArray) copy.width_in_bytes = copy.src_pitch = texCube.strides[1] #d*h*w*c copy.src_height = copy.height = texCube.shape[1] copy.depth = 6 copy() return texCubeArray
def bind_to_texref_ext(self, texref, channels=1, allow_double_hack=False, allow_offset=False): if self.dtype == numpy.float64 and allow_double_hack: if channels != 1: raise ValueError, "'fake' double precision textures can only have one channel" channels = 2 fmt = drv.array_format.SIGNED_INT32 read_as_int = True else: fmt = drv.dtype_to_array_format(self.dtype) read_as_int = numpy.integer in self.dtype.type.__mro__ offset = texref.set_address(self.gpudata, self.nbytes, allow_offset=allow_offset) texref.set_format(fmt, channels) if read_as_int: texref.set_flags(texref.get_flags() | drv.TRSF_READ_AS_INTEGER) return offset/self.dtype.itemsize
def np3DtoCudaArray( npArray, allowSurfaceBind=False ): #import pycuda.autoinit d, h, w = npArray.shape descr3D = cuda.ArrayDescriptor3D() descr3D.width = w descr3D.height = h descr3D.depth = d descr3D.format = cuda.dtype_to_array_format(npArray.dtype) descr3D.num_channels = 1 descr3D.flags = 0 if allowSurfaceBind: descr3D.flags = cuda.array3d_flags.SURFACE_LDST cudaArray = cuda.Array(descr3D) copy3D = cuda.Memcpy3D() copy3D.set_src_host(npArray) copy3D.set_dst_array(cudaArray) copy3D.width_in_bytes = copy3D.src_pitch = npArray.strides[1] copy3D.src_height = copy3D.height = h copy3D.depth = d copy3D() return cudaArray
def np3DtoCudaArray(npArray, allowSurfaceBind=False): #import pycuda.autoinit d, h, w = npArray.shape descr3D = cuda.ArrayDescriptor3D() descr3D.width = w descr3D.height = h descr3D.depth = d descr3D.format = cuda.dtype_to_array_format(npArray.dtype) descr3D.num_channels = 1 descr3D.flags = 0 if allowSurfaceBind: descr3D.flags = cuda.array3d_flags.SURFACE_LDST cudaArray = cuda.Array(descr3D) copy3D = cuda.Memcpy3D() copy3D.set_src_host(npArray) copy3D.set_dst_array(cudaArray) copy3D.width_in_bytes = copy3D.src_pitch = npArray.strides[1] copy3D.src_height = copy3D.height = h copy3D.depth = d copy3D() return cudaArray, copy3D
def gpu3D(src): """ """ w, h, d = src.shape descr = cuda.ArrayDescriptor3D() descr.width = w descr.height = h descr.depth = d descr.format = cuda.dtype_to_array_format(src.dtype) descr.num_channels = 1 descr.flags = 0 dst = cuda.Array(descr) copy = cuda.Memcpy3D() copy.set_src_host(src) copy.set_dst_array(dst) copy.width_in_bytes = copy.src_pitch = src.strides[1] copy.src_height = copy.height = h copy.depth = d copy() return dst
def _prepare_F_texture(self): descr = drv.ArrayDescriptor3D() descr.width = self.side descr.height = self.side descr.depth = self.side descr.format = drv.dtype_to_array_format(self.F_gpu.dtype) descr.num_channels = 1 descr.flags = 0 F_array = drv.Array(descr) copy = drv.Memcpy3D() copy.set_src_device(self.F_gpu.gpudata) copy.set_dst_array(F_array) copy.width_in_bytes = copy.src_pitch = self.F_gpu.strides[1] copy.src_height = copy.height = self.side copy.depth = self.side self.F_gpu_to_array_copy = copy self.F_gpu_to_array_copy() self.F_texture.set_array(F_array)
def numpy3d_to_array(np_array, order=None): ''' Method for copying a numpy array to a CUDA array If you get a buffer error, run this method on np_array.copy('F') ''' from pycuda.driver import Array, ArrayDescriptor3D, Memcpy3D, dtype_to_array_format if order is None: order = 'C' if np_array.strides[0] > np_array.strides[2] else 'F' if order.upper() == 'C': d, h, w = np_array.shape elif order.upper() == "F": w, h, d = np_array.shape else: raise Exception("order must be either F or C") descr = ArrayDescriptor3D() descr.width = w descr.height = h descr.depth = d descr.format = dtype_to_array_format(np_array.dtype) descr.num_channels = 1 descr.flags = 0 device_array = Array(descr) copy = Memcpy3D() copy.set_src_host(np_array) copy.set_dst_array(device_array) copy.width_in_bytes = copy.src_pitch = np_array.strides[1] copy.src_height = copy.height = h copy.depth = d copy() return device_array
def alloc_coeff_arrays(s): f = np.zeros((s.nx, s.ny, s.nz), 'f') s.cex = np.ones_like(f) * 0.5 s.cex[:, -1, :] = 0 s.cex[:, :, -1] = 0 s.cey = np.ones_like(f) * 0.5 s.cey[:, :, -1] = 0 s.cey[-1, :, :] = 0 s.cez = np.ones_like(f) * 0.5 s.cez[-1, :, :] = 0 s.cez[:, -1, :] = 0 descr = cuda.ArrayDescriptor3D() descr.width = s.nz descr.height = s.ny descr.depth = s.nx descr.format = cuda.dtype_to_array_format(f.dtype) descr.num_channels = 1 descr.flags = 0 s.tcex_gpu = cuda.Array(descr) s.tcey_gpu = cuda.Array(descr) s.tcez_gpu = cuda.Array(descr) mcpy = cuda.Memcpy3D() mcpy.width_in_bytes = mcpy.src_pitch = f.strides[1] mcpy.src_height = mcpy.height = s.ny mcpy.depth = s.nx mcpy.set_src_host(s.cex) mcpy.set_dst_array(s.tcex_gpu) mcpy() mcpy.set_src_host(s.cey) mcpy.set_dst_array(s.tcey_gpu) mcpy() mcpy.set_src_host(s.cez) mcpy.set_dst_array(s.tcez_gpu) mcpy()
def alloc_coeff_arrays(s): f = np.zeros((s.nx, s.ny, s.nz), 'f') s.cex = np.ones_like(f)*0.5 s.cex[:,-1,:] = 0 s.cex[:,:,-1] = 0 s.cey = np.ones_like(f)*0.5 s.cey[:,:,-1] = 0 s.cey[-1,:,:] = 0 s.cez = np.ones_like(f)*0.5 s.cez[-1,:,:] = 0 s.cez[:,-1,:] = 0 descr = cuda.ArrayDescriptor3D() descr.width = s.nz descr.height = s.ny descr.depth = s.nx descr.format = cuda.dtype_to_array_format(f.dtype) descr.num_channels = 1 descr.flags = 0 s.tcex_gpu = cuda.Array(descr) s.tcey_gpu = cuda.Array(descr) s.tcez_gpu = cuda.Array(descr) mcpy = cuda.Memcpy3D() mcpy.width_in_bytes = mcpy.src_pitch = f.strides[1] mcpy.src_height = mcpy.height = s.ny mcpy.depth = s.nx mcpy.set_src_host( s.cex ) mcpy.set_dst_array( s.tcex_gpu ) mcpy() mcpy.set_src_host( s.cey ) mcpy.set_dst_array( s.tcey_gpu ) mcpy() mcpy.set_src_host( s.cez ) mcpy.set_dst_array( s.tcez_gpu ) mcpy()
def get_kernel(self, fdata, ilist_data, for_benchmark): from cgen.cuda import CudaShared, CudaGlobal from pycuda.tools import dtype_to_ctype discr = self.discr given = self.plan.given fplan = self.plan d = discr.dimensions dims = range(d) elgroup, = discr.element_groups float_type = given.float_type f_decl = CudaGlobal(FunctionDeclaration(Value("void", "apply_flux"), [ Pointer(POD(float_type, "debugbuf")), Pointer(POD(numpy.uint8, "gmem_facedata")), ]+[ Pointer(POD(float_type, "gmem_fluxes_on_faces%d" % flux_nr)) for flux_nr in range(len(self.fluxes)) ] )) cmod = Module() cmod.append(Include("pycuda-helpers.hpp")) for dep_expr in self.all_deps: cmod.extend([ Value("texture<%s, 1, cudaReadModeElementType>" % dtype_to_ctype(float_type, with_fp_tex_hack=True), "field%d_tex" % self.dep_to_index[dep_expr]) ]) if fplan.flux_count != len(self.fluxes): from warnings import warn warn("Flux count in flux execution plan different from actual flux count.\n" "You may want to specify the tune_for= kwarg in the Discretization\n" "constructor.") cmod.extend([ Line(), Typedef(POD(float_type, "value_type")), Line(), flux_header_struct(float_type, discr.dimensions), Line(), face_pair_struct(float_type, discr.dimensions), Line(), Define("DIMENSIONS", discr.dimensions), Define("DOFS_PER_FACE", fplan.dofs_per_face), Define("THREADS_PER_FACE", fplan.threads_per_face()), Line(), Define("CONCURRENT_FACES", fplan.parallel_faces), Define("BLOCK_MB_COUNT", fplan.mbs_per_block), Line(), Define("FACEDOF_NR", "threadIdx.x"), Define("BLOCK_FACE", "threadIdx.y"), Line(), Define("FLUX_COUNT", len(self.fluxes)), Line(), Define("THREAD_NUM", "(FACEDOF_NR + BLOCK_FACE*THREADS_PER_FACE)"), Define("THREAD_COUNT", "(THREADS_PER_FACE*CONCURRENT_FACES)"), Define("COALESCING_THREAD_COUNT", "(THREAD_COUNT < 0x10 ? THREAD_COUNT : THREAD_COUNT & ~0xf)"), Line(), Define("DATA_BLOCK_SIZE", fdata.block_bytes), Define("ALIGNED_FACE_DOFS_PER_MB", fplan.aligned_face_dofs_per_microblock()), Define("ALIGNED_FACE_DOFS_PER_BLOCK", "(ALIGNED_FACE_DOFS_PER_MB*BLOCK_MB_COUNT)"), Line(), Define("FOF_BLOCK_BASE", "(blockIdx.x*ALIGNED_FACE_DOFS_PER_BLOCK)"), Line(), ] + ilist_data.code + [ Line(), Value("texture<index_list_entry_t, 1, cudaReadModeElementType>", "tex_index_lists"), Line(), fdata.struct, Line(), CudaShared(Value("flux_data", "data")), ]) if not fplan.direct_store: cmod.extend([ CudaShared( ArrayOf( ArrayOf( POD(float_type, "smem_fluxes_on_faces"), "FLUX_COUNT"), "ALIGNED_FACE_DOFS_PER_MB*BLOCK_MB_COUNT") ), Line(), ]) S = Statement f_body = Block() from hedge.backends.cuda.tools import get_load_code f_body.extend(get_load_code( dest="&data", base="gmem_facedata + blockIdx.x*DATA_BLOCK_SIZE", bytes="sizeof(flux_data)", descr="load face_pair data") +[S("__syncthreads()"), Line() ]) def get_flux_code(flux_writer): flux_code = Block([]) flux_code.extend([ Initializer(Pointer( Value("face_pair", "fpair")), "data.facepairs+fpair_nr"), Initializer( MaybeUnused(POD(numpy.uint32, "a_index")), "fpair->a_base + tex1Dfetch(tex_index_lists, " "fpair->a_ilist_index + FACEDOF_NR)"), Initializer( MaybeUnused(POD(numpy.uint32, "b_index")), "fpair->b_base + tex1Dfetch(tex_index_lists, " "fpair->b_ilist_index + FACEDOF_NR)"), Line(), flux_writer(), Line(), S("fpair_nr += CONCURRENT_FACES") ]) return flux_code flux_computation = Block([ Comment("fluxes for dual-sided (intra-block) interior face pairs"), While("fpair_nr < data.header.same_facepairs_end", get_flux_code(lambda: self.write_interior_flux_code(True)) ), Line(), Comment("work around nvcc assertion failure"), S("fpair_nr+=1"), S("fpair_nr-=1"), Line(), Comment("fluxes for single-sided (inter-block) interior face pairs"), While("fpair_nr < data.header.diff_facepairs_end", get_flux_code(lambda: self.write_interior_flux_code(False)) ), Line(), Comment("fluxes for single-sided boundary face pairs"), While("fpair_nr < data.header.bdry_facepairs_end", get_flux_code( lambda: self.write_boundary_flux_code(for_benchmark)) ), ]) f_body.extend_log_block("compute the fluxes", [ Initializer(POD(numpy.uint32, "fpair_nr"), "BLOCK_FACE"), If("FACEDOF_NR < DOFS_PER_FACE", flux_computation) ]) if not fplan.direct_store: f_body.extend([ Line(), S("__syncthreads()"), Line() ]) f_body.extend_log_block("store fluxes", [ #Assign("debugbuf[blockIdx.x]", "FOF_BLOCK_BASE"), #Assign("debugbuf[0]", "FOF_BLOCK_BASE"), #Assign("debugbuf[0]", "sizeof(face_pair)"), For("unsigned word_nr = THREAD_NUM", "word_nr < ALIGNED_FACE_DOFS_PER_MB*BLOCK_MB_COUNT", "word_nr += COALESCING_THREAD_COUNT", Block([Assign( "gmem_fluxes_on_faces%d[FOF_BLOCK_BASE+word_nr]" % flux_nr, "smem_fluxes_on_faces[%d][word_nr]" % flux_nr) for flux_nr in range(len(self.fluxes))] #+[If("isnan(smem_fluxes_on_faces[%d][word_nr])" % flux_nr, #Block([ #Assign("debugbuf[blockIdx.x]", "word_nr"), #]) #) #for flux_nr in range(len(self.fluxes))] ) ) ]) if False: f_body.extend([ Assign("debugbuf[blockIdx.x*96+32+BLOCK_FACE*32+threadIdx.x]", "fpair_nr"), Assign("debugbuf[blockIdx.x*96+16]", "data.header.same_facepairs_end"), Assign("debugbuf[blockIdx.x*96+17]", "data.header.diff_facepairs_end"), Assign("debugbuf[blockIdx.x*96+18]", "data.header.bdry_facepairs_end"), ] ) # finish off ---------------------------------------------------------- cmod.append(FunctionBody(f_decl, f_body)) if not for_benchmark and "cuda_dump_kernels" in discr.debug: from hedge.tools import open_unique_debug_file open_unique_debug_file("flux_gather", ".cu").write(str(cmod)) #from pycuda.tools import allow_user_edit mod = SourceModule( #allow_user_edit(cmod, "kernel.cu", "the flux kernel"), cmod, keep="cuda_keep_kernels" in discr.debug) expr_to_texture_map = dict( (dep_expr, mod.get_texref( "field%d_tex" % self.dep_to_index[dep_expr])) for dep_expr in self.all_deps) index_list_texref = mod.get_texref("tex_index_lists") index_list_texref.set_address( ilist_data.device_memory, ilist_data.bytes) index_list_texref.set_format( cuda.dtype_to_array_format(ilist_data.type), 1) index_list_texref.set_flags(cuda.TRSF_READ_AS_INTEGER) func = mod.get_function("apply_flux") block = (fplan.threads_per_face(), fplan.parallel_faces, 1) func.prepare( (2+len(self.fluxes))*"P", texrefs=expr_to_texture_map.values() + [index_list_texref]) if "cuda_flux" in discr.debug: print "flux: lmem=%d smem=%d regs=%d" % ( func.local_size_bytes, func.shared_size_bytes, func.num_regs) return block, func, expr_to_texture_map
# memory allocate f = np.zeros((nx, ny, nz), 'f', order='F') ex_gpu = cuda.to_device(f) ey_gpu = cuda.to_device(f) ez_gpu = cuda.to_device(f) hx_gpu = cuda.to_device(f) hy_gpu = cuda.to_device(f) hz_gpu = cuda.to_device(f) # memory allocate with cuda array descr = cuda.ArrayDescriptor3D() descr.width = nx descr.height = ny descr.depth = nz descr.format = cuda.dtype_to_array_format(f.dtype) descr.num_channels = 1 descr.flags = 0 cex_gpu = cuda.Array(descr) cey_gpu = cuda.Array(descr) cez_gpu = cuda.Array(descr) chx_gpu = cuda.Array(descr) chy_gpu = cuda.Array(descr) chz_gpu = cuda.Array(descr) mcopy = cuda.Memcpy3D() mcopy.width_in_bytes = mcopy.src_pitch = f.strides[1] mcopy.src_height = mcopy.height = ny mcopy.depth = nz
def get_kernel(self, fdata, ilist_data, for_benchmark): from cgen.cuda import CudaShared, CudaGlobal from pycuda.tools import dtype_to_ctype discr = self.discr given = self.plan.given fplan = self.plan d = discr.dimensions dims = range(d) elgroup, = discr.element_groups float_type = given.float_type f_decl = CudaGlobal( FunctionDeclaration(Value("void", "apply_flux"), [ Pointer(POD(float_type, "debugbuf")), Pointer(POD(numpy.uint8, "gmem_facedata")), ] + [ Pointer(POD(float_type, "gmem_fluxes_on_faces%d" % flux_nr)) for flux_nr in range(len(self.fluxes)) ])) cmod = Module() cmod.append(Include("pycuda-helpers.hpp")) for dep_expr in self.all_deps: cmod.extend([ Value( "texture<%s, 1, cudaReadModeElementType>" % dtype_to_ctype(float_type, with_fp_tex_hack=True), "field%d_tex" % self.dep_to_index[dep_expr]) ]) if fplan.flux_count != len(self.fluxes): from warnings import warn warn( "Flux count in flux execution plan different from actual flux count.\n" "You may want to specify the tune_for= kwarg in the Discretization\n" "constructor.") cmod.extend([ Line(), Typedef(POD(float_type, "value_type")), Line(), flux_header_struct(float_type, discr.dimensions), Line(), face_pair_struct(float_type, discr.dimensions), Line(), Define("DIMENSIONS", discr.dimensions), Define("DOFS_PER_FACE", fplan.dofs_per_face), Define("THREADS_PER_FACE", fplan.threads_per_face()), Line(), Define("CONCURRENT_FACES", fplan.parallel_faces), Define("BLOCK_MB_COUNT", fplan.mbs_per_block), Line(), Define("FACEDOF_NR", "threadIdx.x"), Define("BLOCK_FACE", "threadIdx.y"), Line(), Define("FLUX_COUNT", len(self.fluxes)), Line(), Define("THREAD_NUM", "(FACEDOF_NR + BLOCK_FACE*THREADS_PER_FACE)"), Define("THREAD_COUNT", "(THREADS_PER_FACE*CONCURRENT_FACES)"), Define( "COALESCING_THREAD_COUNT", "(THREAD_COUNT < 0x10 ? THREAD_COUNT : THREAD_COUNT & ~0xf)"), Line(), Define("DATA_BLOCK_SIZE", fdata.block_bytes), Define("ALIGNED_FACE_DOFS_PER_MB", fplan.aligned_face_dofs_per_microblock()), Define("ALIGNED_FACE_DOFS_PER_BLOCK", "(ALIGNED_FACE_DOFS_PER_MB*BLOCK_MB_COUNT)"), Line(), Define("FOF_BLOCK_BASE", "(blockIdx.x*ALIGNED_FACE_DOFS_PER_BLOCK)"), Line(), ] + ilist_data.code + [ Line(), Value("texture<index_list_entry_t, 1, cudaReadModeElementType>", "tex_index_lists"), Line(), fdata.struct, Line(), CudaShared(Value("flux_data", "data")), ]) if not fplan.direct_store: cmod.extend([ CudaShared( ArrayOf( ArrayOf(POD(float_type, "smem_fluxes_on_faces"), "FLUX_COUNT"), "ALIGNED_FACE_DOFS_PER_MB*BLOCK_MB_COUNT")), Line(), ]) S = Statement f_body = Block() from hedge.backends.cuda.tools import get_load_code f_body.extend( get_load_code(dest="&data", base="gmem_facedata + blockIdx.x*DATA_BLOCK_SIZE", bytes="sizeof(flux_data)", descr="load face_pair data") + [S("__syncthreads()"), Line()]) def get_flux_code(flux_writer): flux_code = Block([]) flux_code.extend([ Initializer(Pointer(Value("face_pair", "fpair")), "data.facepairs+fpair_nr"), Initializer( MaybeUnused(POD(numpy.uint32, "a_index")), "fpair->a_base + tex1Dfetch(tex_index_lists, " "fpair->a_ilist_index + FACEDOF_NR)"), Initializer( MaybeUnused(POD(numpy.uint32, "b_index")), "fpair->b_base + tex1Dfetch(tex_index_lists, " "fpair->b_ilist_index + FACEDOF_NR)"), Line(), flux_writer(), Line(), S("fpair_nr += CONCURRENT_FACES") ]) return flux_code flux_computation = Block([ Comment("fluxes for dual-sided (intra-block) interior face pairs"), While("fpair_nr < data.header.same_facepairs_end", get_flux_code(lambda: self.write_interior_flux_code(True))), Line(), Comment("work around nvcc assertion failure"), S("fpair_nr+=1"), S("fpair_nr-=1"), Line(), Comment( "fluxes for single-sided (inter-block) interior face pairs"), While("fpair_nr < data.header.diff_facepairs_end", get_flux_code(lambda: self.write_interior_flux_code(False))), Line(), Comment("fluxes for single-sided boundary face pairs"), While( "fpair_nr < data.header.bdry_facepairs_end", get_flux_code( lambda: self.write_boundary_flux_code(for_benchmark))), ]) f_body.extend_log_block("compute the fluxes", [ Initializer(POD(numpy.uint32, "fpair_nr"), "BLOCK_FACE"), If("FACEDOF_NR < DOFS_PER_FACE", flux_computation) ]) if not fplan.direct_store: f_body.extend([Line(), S("__syncthreads()"), Line()]) f_body.extend_log_block( "store fluxes", [ #Assign("debugbuf[blockIdx.x]", "FOF_BLOCK_BASE"), #Assign("debugbuf[0]", "FOF_BLOCK_BASE"), #Assign("debugbuf[0]", "sizeof(face_pair)"), For( "unsigned word_nr = THREAD_NUM", "word_nr < ALIGNED_FACE_DOFS_PER_MB*BLOCK_MB_COUNT", "word_nr += COALESCING_THREAD_COUNT", Block([ Assign( "gmem_fluxes_on_faces%d[FOF_BLOCK_BASE+word_nr]" % flux_nr, "smem_fluxes_on_faces[%d][word_nr]" % flux_nr) for flux_nr in range(len(self.fluxes)) ] #+[If("isnan(smem_fluxes_on_faces[%d][word_nr])" % flux_nr, #Block([ #Assign("debugbuf[blockIdx.x]", "word_nr"), #]) #) #for flux_nr in range(len(self.fluxes))] )) ]) if False: f_body.extend([ Assign("debugbuf[blockIdx.x*96+32+BLOCK_FACE*32+threadIdx.x]", "fpair_nr"), Assign("debugbuf[blockIdx.x*96+16]", "data.header.same_facepairs_end"), Assign("debugbuf[blockIdx.x*96+17]", "data.header.diff_facepairs_end"), Assign("debugbuf[blockIdx.x*96+18]", "data.header.bdry_facepairs_end"), ]) # finish off ---------------------------------------------------------- cmod.append(FunctionBody(f_decl, f_body)) if not for_benchmark and "cuda_dump_kernels" in discr.debug: from hedge.tools import open_unique_debug_file open_unique_debug_file("flux_gather", ".cu").write(str(cmod)) #from pycuda.tools import allow_user_edit mod = SourceModule( #allow_user_edit(cmod, "kernel.cu", "the flux kernel"), cmod, keep="cuda_keep_kernels" in discr.debug) expr_to_texture_map = dict( (dep_expr, mod.get_texref("field%d_tex" % self.dep_to_index[dep_expr])) for dep_expr in self.all_deps) index_list_texref = mod.get_texref("tex_index_lists") index_list_texref.set_address(ilist_data.device_memory, ilist_data.bytes) index_list_texref.set_format( cuda.dtype_to_array_format(ilist_data.type), 1) index_list_texref.set_flags(cuda.TRSF_READ_AS_INTEGER) func = mod.get_function("apply_flux") block = (fplan.threads_per_face(), fplan.parallel_faces, 1) func.prepare( (2 + len(self.fluxes)) * "P", texrefs=expr_to_texture_map.values() + [index_list_texref]) if "cuda_flux" in discr.debug: print "flux: lmem=%d smem=%d regs=%d" % ( func.local_size_bytes, func.shared_size_bytes, func.num_regs) return block, func, expr_to_texture_map
import numpy as np w = 2 h = 3 d = 4 shape = (d, h, w) a = np.arange(24).reshape(*shape, order='C').astype('float32') print(a.shape, a.strides) print(a) descr = drv.ArrayDescriptor3D() descr.width = w descr.height = h descr.depth = d descr.format = drv.dtype_to_array_format(a.dtype) descr.num_channels = 1 descr.flags = 0 ary = drv.Array(descr) copy = drv.Memcpy3D() copy.set_src_host(a) copy.set_dst_array(ary) copy.width_in_bytes = copy.src_pitch = a.strides[1] copy.src_height = copy.height = h copy.depth = d copy() mod = SourceModule("""
def np3DtoCudaArray(npArray, prec, order = "C", allowSurfaceBind=False): ''' Some parameters like stride are explained in PyCUDA: driver.py test_driver.py gpuarray.py''' # For 1D-2D Cuda Arrays the descriptor is the same just puttin LAYERED flags # if order != "C": raise LogicError("Just implemented for C order") dimension = len(npArray.shape) case = order in ["C","F"] if not case: raise LogicError("order must be either F or C") # if dimension == 1: # w = npArray.shape[0] # h, d = 0,0 if dimension == 2: if order == "C": stride = 0 if order == "F": stride = -1 h, w = npArray.shape d = 1 if allowSurfaceBind: descrArr = cuda.ArrayDescriptor3D() descrArr.width = w descrArr.height = h descrArr.depth = d else: descrArr = cuda.ArrayDescriptor() descrArr.width = w descrArr.height = h # descrArr.depth = d elif dimension == 3: if order == "C": stride = 1 if order == "F": stride = 1 d, h, w = npArray.shape descrArr = cuda.ArrayDescriptor3D() descrArr.width = w descrArr.height = h descrArr.depth = d else: raise LogicError("CUDArray dimesnsion 2 and 3 supported at the moment ... ") if prec == 'float': descrArr.format = cuda.dtype_to_array_format(npArray.dtype) descrArr.num_channels = 1 elif prec == 'cfloat': # Hack for complex 64 = (float 32, float 32) == (re,im) descrArr.format = cuda.array_format.SIGNED_INT32 # Reading data as int2 (hi=re,lo=im) structure descrArr.num_channels = 2 elif prec == 'double': # Hack for doubles descrArr.format = cuda.array_format.SIGNED_INT32 # Reading data as int2 (hi,lo) structure descrArr.num_channels = 2 elif prec == 'cdouble': # Hack for doubles descrArr.format = cuda.array_format.SIGNED_INT32 # Reading data as int4 (re=(hi,lo),im=(hi,lo)) structure descrArr.num_channels = 4 else: descrArr.format = cuda.dtype_to_array_format(npArray.dtype) descrArr.num_channels = 1 if allowSurfaceBind: if dimension==2: descrArr.flags |= cuda.array3d_flags.ARRAY3D_LAYERED descrArr.flags |= cuda.array3d_flags.SURFACE_LDST cudaArray = cuda.Array(descrArr) if allowSurfaceBind or dimension==3 : copy3D = cuda.Memcpy3D() copy3D.set_src_host(npArray) copy3D.set_dst_array(cudaArray) copy3D.width_in_bytes = copy3D.src_pitch = npArray.strides[stride] # if dimension==3: copy3D.width_in_bytes = copy3D.src_pitch = npArray.strides[1] #Jut C order support # if dimension==2: copy3D.width_in_bytes = copy3D.src_pitch = npArray.strides[0] #Jut C order support copy3D.src_height = copy3D.height = h copy3D.depth = d copy3D() return cudaArray, copy3D else: # if dimension == 3: # copy3D = cuda.Memcpy3D() # copy3D.set_src_host(npArray) # copy3D.set_dst_array(cudaArray) # copy3D.width_in_bytes = copy3D.src_pitch = npArray.strides[stride] # # if dimension==3: copy3D.width_in_bytes = copy3D.src_pitch = npArray.strides[1] #Jut C order support # # if dimension==2: copy3D.width_in_bytes = copy3D.src_pitch = npArray.strides[0] #Jut C order support # copy3D.src_height = copy3D.height = h # copy3D.depth = d # copy3D() # return cudaArray, copy3D # if dimension == 2: cudaArray = cuda.Array(descrArr) copy2D = cuda.Memcpy2D() copy2D.set_src_host(npArray) copy2D.set_dst_array(cudaArray) copy2D.width_in_bytes = copy2D.src_pitch = npArray.strides[stride] # copy2D.width_in_bytes = copy2D.src_pitch = npArray.strides[0] #Jut C order support copy2D.src_height = copy2D.height = h copy2D(aligned=True) return cudaArray, copy2D
# memory allocate f = np.zeros((nx,ny,nz),'f',order='F') ex_gpu = cuda.to_device(f) ey_gpu = cuda.to_device(f) ez_gpu = cuda.to_device(f) hx_gpu = cuda.to_device(f) hy_gpu = cuda.to_device(f) hz_gpu = cuda.to_device(f) descr = cuda.ArrayDescriptor3D() descr.width = nx descr.height = ny descr.depth = nz descr.format = cuda.dtype_to_array_format(f.dtype) descr.num_channels = 1 descr.flags = 0 cex_gpu = cuda.Array(descr) cey_gpu = cuda.Array(descr) cez_gpu = cuda.Array(descr) chx_gpu = cuda.Array(descr) chy_gpu = cuda.Array(descr) chz_gpu = cuda.Array(descr) mcopy = cuda.Memcpy3D() mcopy.width_in_bytes = mcopy.src_pitch = f.strides[1] mcopy.src_height = mcopy.height = ny mcopy.depth = nz