Esempio n. 1
0
def create_3d_texture(a, module, variable, point_sampling=False):
    
    a = numpy.asfortranarray(a)
    w, h, d = a.shape
    
    descr = cuda.ArrayDescriptor3D()
    descr.width = w
    descr.height = h
    descr.depth = d
    descr.format = cuda.dtype_to_array_format(a.dtype)
    descr.num_channels = 1
    descr.flags = 0
    ary = cuda.Array(descr)
   
    copy = cuda.Memcpy3D()
    copy.set_src_host(a)
    copy.set_dst_array(ary)
    copy.width_in_bytes = copy.src_pitch = a.strides[1]
    copy.src_height = copy.height = h
    copy.depth = d
    copy()

    out_texref = module.get_texref(variable)
    out_texref.set_array(ary)
    if point_sampling: 
        out_texref.set_filter_mode(cuda.filter_mode.POINT)
    else: 
        out_texref.set_filter_mode(cuda.filter_mode.LINEAR)
    return out_texref
Esempio n. 2
0
def numpy3d_to_array(np_array, allow_surface_bind=True):
    """Converts 3D numpy array to 3D device array.
    """
    # numpy3d_to_array
    # taken from pycuda mailing list (striped for C ordering only)

    d, h, w = np_array.shape

    descr = cuda.ArrayDescriptor3D()
    descr.width = w
    descr.height = h
    descr.depth = d
    descr.format = cuda.dtype_to_array_format(np_array.dtype)
    descr.num_channels = 1
    descr.flags = 0

    if allow_surface_bind:
        descr.flags = cuda.array3d_flags.SURFACE_LDST

    device_array = cuda.Array(descr)

    copy = cuda.Memcpy3D()
    copy.set_src_host(np_array)
    copy.set_dst_array(device_array)
    copy.width_in_bytes = copy.src_pitch = np_array.strides[1]
    copy.src_height = copy.height = h
    copy.depth = d

    copy()

    return device_array
Esempio n. 3
0
def gpuArray3DtocudaArray( gpuArray, allowSurfaceBind=False, precision='float' ):
  #import pycuda.autoinit
  d, h, w = gpuArray.shape
  descr3D = cuda.ArrayDescriptor3D()
  descr3D.width = w
  descr3D.height = h
  descr3D.depth = d
  if precision == 'float':
    descr3D.format = cuda.dtype_to_array_format(gpuArray.dtype)
    descr3D.num_channels = 1
  elif precision == 'double': 
    descr3D.format = cuda.array_format.SIGNED_INT32
    descr3D.num_channels = 2
  else: 
    print "ERROR:  CUDA_ARRAY incompatible precision"
    sys.exit()
  descr3D.flags = 0
  if allowSurfaceBind:
    descr3D.flags = cuda.array3d_flags.SURFACE_LDST
  cudaArray = cuda.Array(descr3D)
  copy3D = cuda.Memcpy3D()
  copy3D.set_src_device(gpuArray.ptr)
  copy3D.set_dst_array(cudaArray)
  copy3D.width_in_bytes = copy3D.src_pitch = gpuArray.strides[1]
  copy3D.src_height = copy3D.height = h
  copy3D.depth = d
  copy3D()
  return cudaArray, copy3D
Esempio n. 4
0
    def bind_to_texref_ext(self,
                           texref,
                           channels=1,
                           allow_double_hack=False,
                           allow_offset=False):
        if self.dtype == np.float64 and allow_double_hack:
            if channels != 1:
                raise ValueError, "'fake' double precision textures can only have one channel"

            channels = 2
            fmt = drv.array_format.SIGNED_INT32
            read_as_int = True
        else:
            fmt = drv.dtype_to_array_format(self.dtype)
            read_as_int = np.integer in self.dtype.type.__mro__

        offset = texref.set_address(self.gpudata,
                                    self.nbytes,
                                    allow_offset=allow_offset)
        texref.set_format(fmt, channels)

        if read_as_int:
            texref.set_flags(texref.get_flags() | drv.TRSF_READ_AS_INTEGER)

        return offset / self.dtype.itemsize
Esempio n. 5
0
def numpy3d_to_array(np_array, allow_surface_bind=True):

    import pycuda.autoinit

    d, h, w = np_array.shape

    descr = drv.ArrayDescriptor3D()
    descr.width = w
    descr.height = h
    descr.depth = d
    descr.format = drv.dtype_to_array_format(np_array.dtype)
    descr.num_channels = 1
    descr.flags = 0

    if allow_surface_bind:
        descr.flags = drv.array3d_flags.SURFACE_LDST

    device_array = drv.Array(descr)

    copy = drv.Memcpy3D()
    copy.set_src_host(np_array)
    copy.set_dst_array(device_array)
    copy.width_in_bytes = copy.src_pitch = np_array.strides[1]
    copy.src_height = copy.height = h
    copy.depth = d

    copy()

    return device_array
Esempio n. 6
0
def create_3d_texture(a, module, variable, point_sampling=False):
    
    a = numpy.asfortranarray(a)
    w, h, d = a.shape
    
    descr = cuda.ArrayDescriptor3D()
    descr.width = w
    descr.height = h
    descr.depth = d
    descr.format = cuda.dtype_to_array_format(a.dtype)
    descr.num_channels = 1
    descr.flags = 0
    ary = cuda.Array(descr)
   
    copy = cuda.Memcpy3D()
    copy.set_src_host(a)
    copy.set_dst_array(ary)
    copy.width_in_bytes = copy.src_pitch = a.strides[1]
    copy.src_height = copy.height = h
    copy.depth = d
    copy()

    out_texref = module.get_texref(variable)
    out_texref.set_array(ary)
    if point_sampling: 
        out_texref.set_filter_mode(cuda.filter_mode.POINT)
    else: 
        out_texref.set_filter_mode(cuda.filter_mode.LINEAR)
    return out_texref
Esempio n. 7
0
def numpy3d_to_array(np_array):
    '''Copy a 3D (d,h,w) numpy array into a 3D pycuda array that can be used 
    to set a texture.  (For some reason, gpuarrays can't be used to do that 
    directly).  A transpose happens implicitly; the CUDA array has dim (w,h,d).'''
    import pycuda.autoinit
    d, h, w = np_array.shape
    descr = driver.ArrayDescriptor3D()
    descr.width = w
    descr.height = h
    descr.depth = d
    if np_array.dtype == np.float64:
        descr.format = driver.array_format.SIGNED_INT32
        descr.num_channels = 2
    else:
        descr.format = driver.dtype_to_array_format(np_array.dtype)
        descr.num_channels = 1
    descr.flags = 0
    device_array = driver.Array(descr)
    copy = driver.Memcpy3D()
    copy.set_src_host(np_array)
    copy.set_dst_array(device_array)
    copy.width_in_bytes = copy.src_pitch = np_array.strides[1]
    copy.src_height = copy.height = h
    copy.depth = d
    copy()
    return device_array
Esempio n. 8
0
    def bind_to_texref_ext(self,
                           texref,
                           channels=1,
                           allow_double_hack=False,
                           allow_offset=False):
        if not self.flags.forc:
            raise RuntimeError("only contiguous arrays may "
                               "be used as arguments to this operation")

        if self.dtype == np.float64 and allow_double_hack:
            if channels != 1:
                raise ValueError("'fake' double precision textures can "
                                 "only have one channel")

            channels = 2
            fmt = drv.array_format.SIGNED_INT32
            read_as_int = True
        else:
            fmt = drv.dtype_to_array_format(self.dtype)
            read_as_int = np.integer in self.dtype.type.__mro__

        offset = texref.set_address(self.gpudata,
                                    self.nbytes,
                                    allow_offset=allow_offset)
        texref.set_format(fmt, channels)

        if read_as_int:
            texref.set_flags(texref.get_flags() | drv.TRSF_READ_AS_INTEGER)

        return offset / self.dtype.itemsize
Esempio n. 9
0
def numpy3d_to_array(np_array, allow_surface_bind=False, layered=True):
    d, h, w = np_array.shape

    descr = cuda.ArrayDescriptor3D()
    descr.width = w
    descr.height = h
    descr.depth = d
    descr.format = cuda.dtype_to_array_format(np_array.dtype)
    descr.num_channels = 1
    descr.flags = 0

    if allow_surface_bind:
        descr.flags = cuda.array3d_flags.SURFACE_LDST

    if layered:
        descr.flags = cuda.array3d_flags.ARRAY3D_LAYERED

    device_array = cuda.Array(descr)

    copy = cuda.Memcpy3D()
    copy.set_src_host(np_array)
    copy.set_dst_array(device_array)
    copy.width_in_bytes = copy.src_pitch = np_array.strides[1]
    copy.src_height = copy.height = h
    copy.depth = d

    copy()

    return device_array
Esempio n. 10
0
    def bind_to_texref_ext(self, texref, channels=1, allow_double_hack=False,
            allow_offset=False):
        if not self.flags.forc:
            raise RuntimeError("only contiguous arrays may "
                    "be used as arguments to this operation")

        if self.dtype == np.float64 and allow_double_hack:
            if channels != 1:
                raise ValueError(
                        "'fake' double precision textures can "
                        "only have one channel")

            channels = 2
            fmt = drv.array_format.SIGNED_INT32
            read_as_int = True
        else:
            fmt = drv.dtype_to_array_format(self.dtype)
            read_as_int = np.integer in self.dtype.type.__mro__

        offset = texref.set_address(self.gpudata, self.nbytes, allow_offset=allow_offset)
        texref.set_format(fmt, channels)

        if read_as_int:
            texref.set_flags(texref.get_flags() | drv.TRSF_READ_AS_INTEGER)

        return offset/self.dtype.itemsize
Esempio n. 11
0
def gpuArray3DtocudaArray(gpuArray, allowSurfaceBind=False, precision='float'):
    #import pycuda.autoinit
    d, h, w = gpuArray.shape
    descr3D = cuda.ArrayDescriptor3D()
    descr3D.width = w
    descr3D.height = h
    descr3D.depth = d
    if precision == 'float':
        descr3D.format = cuda.dtype_to_array_format(gpuArray.dtype)
        descr3D.num_channels = 1
    elif precision == 'double':
        descr3D.format = cuda.array_format.SIGNED_INT32
        descr3D.num_channels = 2
    else:
        print("ERROR:  CUDA_ARRAY incompatible precision")
        sys.exit()
    descr3D.flags = 0
    if allowSurfaceBind:
        descr3D.flags = cuda.array3d_flags.SURFACE_LDST
    cudaArray = cuda.Array(descr3D)
    copy3D = cuda.Memcpy3D()
    copy3D.set_src_device(gpuArray.ptr)
    copy3D.set_dst_array(cudaArray)
    copy3D.width_in_bytes = copy3D.src_pitch = gpuArray.strides[1]
    copy3D.src_height = copy3D.height = h
    copy3D.depth = d
    copy3D()
    return cudaArray, copy3D
Esempio n. 12
0
def to_tex3d(data):
    """
    Source: https://wiki.tiker.net/PyCUDA/Examples/Demo3DSurface
    """
    d, h, w = data.shape
    descr = cuda.ArrayDescriptor3D()
    descr.width = w
    descr.height = h
    descr.depth = d
    descr.format = cuda.dtype_to_array_format(data.dtype)
    descr.num_channels = 1
    descr.flags = 0

    if isinstance(data, gpuarray.GPUArray):
        data = data.get()

    device_array = cuda.Array(descr)
    copy = cuda.Memcpy3D()
    copy.set_src_host(data)
    copy.set_dst_array(device_array)
    copy.width_in_bytes = copy.src_pitch = data.strides[1]
    copy.src_height = copy.height = h
    copy.depth = d
    copy()

    return device_array
Esempio n. 13
0
    def copyTexture(self, data, pixelSize=None, extent=None, isocenter=None):
        # Convert data to float32 array in Contiguous ordering.
        self.arrIn = np.array(data, dtype=np.float32, order='C')
        d, h, w = self.arrIn.shape

        descr = cuda.ArrayDescriptor3D()
        descr.width = w
        descr.height = h
        descr.depth = d
        descr.format = cuda.dtype_to_array_format(self.arrIn.dtype)
        descr.num_channels = 1
        # descr.flags = 0
        self.gpuTexture = cuda.Array(descr)

        # Copy array data across. This puts a 3D array in linear memory on the GPU.
        copy = cuda.Memcpy3D()
        copy.set_src_host(self.arrIn)
        copy.set_dst_array(self.gpuTexture)
        copy.src_pitch = self.arrIn.strides[1]
        copy.width_in_bytes = self.arrIn.strides[1]
        copy.height = h
        copy.depth = d
        copy.src_height = h
        copy()

        self.pixelSize = pixelSize
        self.isocenter = isocenter
        # Extent[l,r,b,t,f,b]
        self.extent = extent
        # Trigger for setting bottom left corner as 0,0,0.
        self.zeroExtent = False
Esempio n. 14
0
def np3d_to_device_array(np_array, allow_surface_bind=True):
      d, h, w = np_array.shape

      descr = drv.ArrayDescriptor3D()
      descr.width = w
      descr.height = h
      descr.depth = d
      descr.format = drv.dtype_to_array_format(np_array.dtype)
      descr.num_channels = 1
      descr.flags = 0

      if allow_surface_bind:
            descr.flags = drv.array3d_flags.SURFACE_LDST

      device_array = drv.Array(descr)

      copy = drv.Memcpy3D()
      copy.set_src_host(np_array)
      copy.set_dst_array(device_array)
      copy.width_in_bytes = copy.src_pitch = np_array.strides[1]
      copy.src_height = copy.height = h
      copy.depth = d

      copy()

      return device_array
Esempio n. 15
0
def create_2D_array(mat):
    descr = driver.ArrayDescriptor()
    descr.width = mat.shape[1]
    descr.height = mat.shape[0]
    descr.format = driver.dtype_to_array_format(mat.dtype)
    descr.num_channels = 1
    descr.flags = 0
    ary = driver.Array(descr)
    return ary
Esempio n. 16
0
    def test_3d_texture(self):
        # adapted from code by Nicolas Pinto
        w = 2
        h = 4
        d = 8
        shape = (w, h, d)

        a = np.asarray(
                np.random.randn(*shape),
                dtype=np.float32, order="F")

        descr = drv.ArrayDescriptor3D()
        descr.width = w
        descr.height = h
        descr.depth = d
        descr.format = drv.dtype_to_array_format(a.dtype)
        descr.num_channels = 1
        descr.flags = 0

        ary = drv.Array(descr)

        copy = drv.Memcpy3D()
        copy.set_src_host(a)
        copy.set_dst_array(ary)
        copy.width_in_bytes = copy.src_pitch = a.strides[1]
        copy.src_height = copy.height = h
        copy.depth = d

        copy()

        mod = SourceModule("""
        texture<float, 3, cudaReadModeElementType> mtx_tex;

        __global__ void copy_texture(float *dest)
        {
          int x = threadIdx.x;
          int y = threadIdx.y;
          int z = threadIdx.z;
          int dx = blockDim.x;
          int dy = blockDim.y;
          int i = (z*dy + y)*dx + x;
          dest[i] = tex3D(mtx_tex, x, y, z);
          //dest[i] = x;
        }
        """)

        copy_texture = mod.get_function("copy_texture")
        mtx_tex = mod.get_texref("mtx_tex")

        mtx_tex.set_array(ary)

        dest = np.zeros(shape, dtype=np.float32, order="F")
        copy_texture(drv.Out(dest), block=shape, texrefs=[mtx_tex])
        assert la.norm(dest-a) == 0
Esempio n. 17
0
    def test_3d_texture(self):
        # adapted from code by Nicolas Pinto
        w = 2
        h = 4
        d = 8
        shape = (w, h, d)

        a = numpy.asarray(numpy.random.randn(*shape),
                          dtype=numpy.float32,
                          order="F")

        descr = drv.ArrayDescriptor3D()
        descr.width = w
        descr.height = h
        descr.depth = d
        descr.format = drv.dtype_to_array_format(a.dtype)
        descr.num_channels = 1
        descr.flags = 0

        ary = drv.Array(descr)

        copy = drv.Memcpy3D()
        copy.set_src_host(a)
        copy.set_dst_array(ary)
        copy.width_in_bytes = copy.src_pitch = a.strides[1]
        copy.src_height = copy.height = h
        copy.depth = d

        copy()

        mod = SourceModule("""
        texture<float, 3, cudaReadModeElementType> mtx_tex;

        __global__ void copy_texture(float *dest)
        {
          int x = threadIdx.x;
          int y = threadIdx.y;
          int z = threadIdx.z;
          int dx = blockDim.x;
          int dy = blockDim.y;
          int i = (z*dy + y)*dx + x;
          dest[i] = tex3D(mtx_tex, x, y, z);
          //dest[i] = x;
        }
        """)

        copy_texture = mod.get_function("copy_texture")
        mtx_tex = mod.get_texref("mtx_tex")

        mtx_tex.set_array(ary)

        dest = numpy.zeros(shape, dtype=numpy.float32, order="F")
        copy_texture(drv.Out(dest), block=shape, texrefs=[mtx_tex])
        assert la.norm(dest - a) == 0
Esempio n. 18
0
def malloc_gpu_arrays(nx, ny, nz, cex, cey, cez):
    print "rank= %d, (%d, %d, %d)" % (rank, nx, ny, nz),
    total_bytes = nx * ny * nz * 4 * 9
    if total_bytes / (1024 ** 3) == 0:
        print "%d MB" % (total_bytes / (1024 ** 2))
    else:
        print "%1.2f GB" % (float(total_bytes) / (1024 ** 3))

    if nz % Dx != 0:
        print "Error: nz is not multiple of %d" % (Dx)
        sys.exit()
    if (nx * ny) % Dy != 0:
        print "Error: nx*ny is not multiple of %d" % (Dy)
        sys.exit()

    f = np.zeros((nx, ny, nz), "f")
    ex_gpu = cuda.to_device(f)
    ey_gpu = cuda.to_device(f)
    ez_gpu = cuda.to_device(f)
    hx_gpu = cuda.to_device(f)
    hy_gpu = cuda.to_device(f)
    hz_gpu = cuda.to_device(f)

    descr = cuda.ArrayDescriptor3D()
    descr.width = nz
    descr.height = ny
    descr.depth = nx
    descr.format = cuda.dtype_to_array_format(f.dtype)
    descr.num_channels = 1
    descr.flags = 0
    tcex_gpu = cuda.Array(descr)
    tcey_gpu = cuda.Array(descr)
    tcez_gpu = cuda.Array(descr)

    mcopy = cuda.Memcpy3D()
    mcopy.width_in_bytes = mcopy.src_pitch = f.strides[1]
    mcopy.src_height = mcopy.height = ny
    mcopy.depth = nx

    mcopy.set_src_host(cex)
    mcopy.set_dst_array(tcex_gpu)
    mcopy()
    mcopy.set_src_host(cey)
    mcopy.set_dst_array(tcey_gpu)
    mcopy()
    mcopy.set_src_host(cez)
    mcopy.set_dst_array(tcez_gpu)
    mcopy()

    eh_fields = [ex_gpu, ey_gpu, ez_gpu, hx_gpu, hy_gpu, hz_gpu]
    tex_fields = [tcex_gpu, tcey_gpu, tcez_gpu]

    return eh_fields, tex_fields
Esempio n. 19
0
def malloc_gpu_arrays(nx, ny, nz, cex, cey, cez):
    print 'rank= %d, (%d, %d, %d)' % (rank, nx, ny, nz),
    total_bytes = nx * ny * nz * 4 * 9
    if total_bytes / (1024**3) == 0:
        print '%d MB' % (total_bytes / (1024**2))
    else:
        print '%1.2f GB' % (float(total_bytes) / (1024**3))

    if nz % Dx != 0:
        print "Error: nz is not multiple of %d" % (Dx)
        sys.exit()
    if (nx * ny) % Dy != 0:
        print "Error: nx*ny is not multiple of %d" % (Dy)
        sys.exit()

    f = np.zeros((nx, ny, nz), 'f')
    ex_gpu = cuda.to_device(f)
    ey_gpu = cuda.to_device(f)
    ez_gpu = cuda.to_device(f)
    hx_gpu = cuda.to_device(f)
    hy_gpu = cuda.to_device(f)
    hz_gpu = cuda.to_device(f)

    descr = cuda.ArrayDescriptor3D()
    descr.width = nz
    descr.height = ny
    descr.depth = nx
    descr.format = cuda.dtype_to_array_format(f.dtype)
    descr.num_channels = 1
    descr.flags = 0
    tcex_gpu = cuda.Array(descr)
    tcey_gpu = cuda.Array(descr)
    tcez_gpu = cuda.Array(descr)

    mcopy = cuda.Memcpy3D()
    mcopy.width_in_bytes = mcopy.src_pitch = f.strides[1]
    mcopy.src_height = mcopy.height = ny
    mcopy.depth = nx

    mcopy.set_src_host(cex)
    mcopy.set_dst_array(tcex_gpu)
    mcopy()
    mcopy.set_src_host(cey)
    mcopy.set_dst_array(tcey_gpu)
    mcopy()
    mcopy.set_src_host(cez)
    mcopy.set_dst_array(tcez_gpu)
    mcopy()

    eh_fields = [ex_gpu, ey_gpu, ez_gpu, hx_gpu, hy_gpu, hz_gpu]
    tex_fields = [tcex_gpu, tcey_gpu, tcez_gpu]

    return eh_fields, tex_fields
Esempio n. 20
0
def gpuArray2DtocudaArray(gpuArray):
    #import pycuda.autoinit
    h, w = gpuArray.shape
    descr2D = cuda.ArrayDescriptor()
    descr2D.width = w
    descr2D.height = h
    descr2D.format = cuda.dtype_to_array_format(gpuArray.dtype)
    descr2D.num_channels = 1
    cudaArray = cuda.Array(descr2D)
    copy2D = cuda.Memcpy2D()
    copy2D.set_src_device(gpuArray.ptr)
    copy2D.set_dst_array(cudaArray)
    copy2D.src_pitch = gpuArray.strides[0]
    copy2D.width_in_bytes = copy2D.src_pitch = gpuArray.strides[0]
    copy2D.src_height = copy2D.height = h
    copy2D(aligned=True)
    return cudaArray, copy2D
Esempio n. 21
0
def gpuArray2DtocudaArray( gpuArray ):
  #import pycuda.autoinit
  h, w = gpuArray.shape
  descr2D = cuda.ArrayDescriptor()
  descr2D.width = w
  descr2D.height = h
  descr2D.format = cuda.dtype_to_array_format(gpuArray.dtype)
  descr2D.num_channels = 1
  cudaArray = cuda.Array(descr2D)
  copy2D = cuda.Memcpy2D()
  copy2D.set_src_device(gpuArray.ptr)
  copy2D.set_dst_array(cudaArray)
  copy2D.src_pitch = gpuArray.strides[0]
  copy2D.width_in_bytes = copy2D.src_pitch = gpuArray.strides[0]
  copy2D.src_height = copy2D.height = h
  copy2D(aligned=True)
  return cudaArray, copy2D
Esempio n. 22
0
def np2DtoCudaArray( npArray, allowSurfaceBind=False ):
  #import pycuda.autoinit
  h, w = npArray.shape
  descr2D = cuda.ArrayDescriptor()
  descr2D.width = w
  descr2D.height = h
  descr2D.format = cuda.dtype_to_array_format(npArray.dtype)
  descr2D.num_channels = 1
  if allowSurfaceBind:
    descr.flags = cuda.array3d_flags.SURFACE_LDST
  cudaArray = cuda.Array(descr2D)
  copy2D = cuda.Memcpy2D()
  copy2D.set_src_host(npArray)
  copy2D.set_dst_array(cudaArray)
  copy2D.src_pitch = npArray.strides[0]
  copy2D.width_in_bytes = copy2D.src_pitch = npArray.strides[0]
  copy2D.src_height = copy2D.height = h
  copy2D(aligned=True)
  return cudaArray, descr2D
Esempio n. 23
0
def np2DtoCudaArray(npArray, allowSurfaceBind=False):
    #import pycuda.autoinit
    h, w = npArray.shape
    descr2D = cuda.ArrayDescriptor()
    descr2D.width = w
    descr2D.height = h
    descr2D.format = cuda.dtype_to_array_format(npArray.dtype)
    descr2D.num_channels = 1
    if allowSurfaceBind:
        descr.flags = cuda.array3d_flags.SURFACE_LDST
    cudaArray = cuda.Array(descr2D)
    copy2D = cuda.Memcpy2D()
    copy2D.set_src_host(npArray)
    copy2D.set_dst_array(cudaArray)
    copy2D.src_pitch = npArray.strides[0]
    copy2D.width_in_bytes = copy2D.src_pitch = npArray.strides[0]
    copy2D.src_height = copy2D.height = h
    copy2D(aligned=True)
    return cudaArray, descr2D
def texCubeToGPU(texCube):
    descr = cuda.ArrayDescriptor3D()
    descr.width = texCube.shape[2]
    descr.height = texCube.shape[1]
    descr.depth = 6
    descr.format = cuda.dtype_to_array_format(texCube.dtype)
    descr.num_channels = 4
    descr.flags = cuda.array3d_flags.CUBEMAP

    texCubeArray = cuda.Array(descr)
    copy = cuda.Memcpy3D()
    copy.set_src_host(texCube)
    copy.set_dst_array(texCubeArray)
    copy.width_in_bytes = copy.src_pitch = texCube.strides[1]  #d*h*w*c
    copy.src_height = copy.height = texCube.shape[1]
    copy.depth = 6

    copy()

    return texCubeArray
Esempio n. 25
0
    def bind_to_texref_ext(self, texref, channels=1, allow_double_hack=False, 
            allow_offset=False):
        if self.dtype == numpy.float64 and allow_double_hack:
            if channels != 1:
                raise ValueError, "'fake' double precision textures can only have one channel"

            channels = 2
            fmt = drv.array_format.SIGNED_INT32
            read_as_int = True
        else:
            fmt = drv.dtype_to_array_format(self.dtype)
            read_as_int = numpy.integer in self.dtype.type.__mro__

        offset = texref.set_address(self.gpudata, self.nbytes, allow_offset=allow_offset)
        texref.set_format(fmt, channels)

        if read_as_int:
            texref.set_flags(texref.get_flags() | drv.TRSF_READ_AS_INTEGER)

        return offset/self.dtype.itemsize
Esempio n. 26
0
def np3DtoCudaArray( npArray, allowSurfaceBind=False ):
  #import pycuda.autoinit
  d, h, w = npArray.shape
  descr3D = cuda.ArrayDescriptor3D()
  descr3D.width = w
  descr3D.height = h
  descr3D.depth = d
  descr3D.format = cuda.dtype_to_array_format(npArray.dtype)
  descr3D.num_channels = 1
  descr3D.flags = 0
  if allowSurfaceBind:
    descr3D.flags = cuda.array3d_flags.SURFACE_LDST
  cudaArray = cuda.Array(descr3D)
  copy3D = cuda.Memcpy3D()
  copy3D.set_src_host(npArray)
  copy3D.set_dst_array(cudaArray)
  copy3D.width_in_bytes = copy3D.src_pitch = npArray.strides[1]
  copy3D.src_height = copy3D.height = h
  copy3D.depth = d
  copy3D()
  return cudaArray
Esempio n. 27
0
def np3DtoCudaArray(npArray, allowSurfaceBind=False):
    #import pycuda.autoinit
    d, h, w = npArray.shape
    descr3D = cuda.ArrayDescriptor3D()
    descr3D.width = w
    descr3D.height = h
    descr3D.depth = d
    descr3D.format = cuda.dtype_to_array_format(npArray.dtype)
    descr3D.num_channels = 1
    descr3D.flags = 0
    if allowSurfaceBind:
        descr3D.flags = cuda.array3d_flags.SURFACE_LDST
    cudaArray = cuda.Array(descr3D)
    copy3D = cuda.Memcpy3D()
    copy3D.set_src_host(npArray)
    copy3D.set_dst_array(cudaArray)
    copy3D.width_in_bytes = copy3D.src_pitch = npArray.strides[1]
    copy3D.src_height = copy3D.height = h
    copy3D.depth = d
    copy3D()
    return cudaArray, copy3D
Esempio n. 28
0
def gpu3D(src):
    """
    """
    w, h, d = src.shape
    descr = cuda.ArrayDescriptor3D()
    descr.width = w
    descr.height = h
    descr.depth = d
    descr.format = cuda.dtype_to_array_format(src.dtype)
    descr.num_channels = 1
    descr.flags = 0
    dst = cuda.Array(descr)

    copy = cuda.Memcpy3D()
    copy.set_src_host(src)
    copy.set_dst_array(dst)
    copy.width_in_bytes = copy.src_pitch = src.strides[1]
    copy.src_height = copy.height = h
    copy.depth = d
    copy()

    return dst
Esempio n. 29
0
 def _prepare_F_texture(self):
     
     descr = drv.ArrayDescriptor3D()
     descr.width = self.side
     descr.height = self.side
     descr.depth = self.side
     descr.format = drv.dtype_to_array_format(self.F_gpu.dtype)
     descr.num_channels = 1
     descr.flags = 0
     
     F_array = drv.Array(descr)
     
     copy = drv.Memcpy3D()
     copy.set_src_device(self.F_gpu.gpudata)
     copy.set_dst_array(F_array)
     copy.width_in_bytes = copy.src_pitch = self.F_gpu.strides[1]
     copy.src_height = copy.height = self.side
     copy.depth = self.side
     
     self.F_gpu_to_array_copy = copy
     self.F_gpu_to_array_copy()
     self.F_texture.set_array(F_array)
Esempio n. 30
0
    def _prepare_F_texture(self):

        descr = drv.ArrayDescriptor3D()
        descr.width = self.side
        descr.height = self.side
        descr.depth = self.side
        descr.format = drv.dtype_to_array_format(self.F_gpu.dtype)
        descr.num_channels = 1
        descr.flags = 0

        F_array = drv.Array(descr)

        copy = drv.Memcpy3D()
        copy.set_src_device(self.F_gpu.gpudata)
        copy.set_dst_array(F_array)
        copy.width_in_bytes = copy.src_pitch = self.F_gpu.strides[1]
        copy.src_height = copy.height = self.side
        copy.depth = self.side

        self.F_gpu_to_array_copy = copy
        self.F_gpu_to_array_copy()
        self.F_texture.set_array(F_array)
Esempio n. 31
0
def numpy3d_to_array(np_array, order=None):
    '''
    Method for copying a numpy array to a CUDA array

    If you get a buffer error, run this method on np_array.copy('F')
    '''
    from pycuda.driver import Array, ArrayDescriptor3D, Memcpy3D, dtype_to_array_format
    if order is None:
        order = 'C' if np_array.strides[0] > np_array.strides[2] else 'F'

    if order.upper() == 'C':
        d, h, w = np_array.shape
    elif order.upper() == "F":
        w, h, d = np_array.shape
    else:
        raise Exception("order must be either F or C")

    descr = ArrayDescriptor3D()
    descr.width = w
    descr.height = h
    descr.depth = d
    descr.format = dtype_to_array_format(np_array.dtype)
    descr.num_channels = 1
    descr.flags = 0

    device_array = Array(descr)

    copy = Memcpy3D()
    copy.set_src_host(np_array)
    copy.set_dst_array(device_array)
    copy.width_in_bytes = copy.src_pitch = np_array.strides[1]
    copy.src_height = copy.height = h
    copy.depth = d

    copy()

    return device_array
Esempio n. 32
0
    def alloc_coeff_arrays(s):
        f = np.zeros((s.nx, s.ny, s.nz), 'f')
        s.cex = np.ones_like(f) * 0.5
        s.cex[:, -1, :] = 0
        s.cex[:, :, -1] = 0
        s.cey = np.ones_like(f) * 0.5
        s.cey[:, :, -1] = 0
        s.cey[-1, :, :] = 0
        s.cez = np.ones_like(f) * 0.5
        s.cez[-1, :, :] = 0
        s.cez[:, -1, :] = 0

        descr = cuda.ArrayDescriptor3D()
        descr.width = s.nz
        descr.height = s.ny
        descr.depth = s.nx
        descr.format = cuda.dtype_to_array_format(f.dtype)
        descr.num_channels = 1
        descr.flags = 0
        s.tcex_gpu = cuda.Array(descr)
        s.tcey_gpu = cuda.Array(descr)
        s.tcez_gpu = cuda.Array(descr)

        mcpy = cuda.Memcpy3D()
        mcpy.width_in_bytes = mcpy.src_pitch = f.strides[1]
        mcpy.src_height = mcpy.height = s.ny
        mcpy.depth = s.nx
        mcpy.set_src_host(s.cex)
        mcpy.set_dst_array(s.tcex_gpu)
        mcpy()
        mcpy.set_src_host(s.cey)
        mcpy.set_dst_array(s.tcey_gpu)
        mcpy()
        mcpy.set_src_host(s.cez)
        mcpy.set_dst_array(s.tcez_gpu)
        mcpy()
Esempio n. 33
0
	def alloc_coeff_arrays(s):
		f = np.zeros((s.nx, s.ny, s.nz), 'f')
		s.cex = np.ones_like(f)*0.5
		s.cex[:,-1,:] = 0
		s.cex[:,:,-1] = 0
		s.cey = np.ones_like(f)*0.5
		s.cey[:,:,-1] = 0
		s.cey[-1,:,:] = 0
		s.cez = np.ones_like(f)*0.5
		s.cez[-1,:,:] = 0
		s.cez[:,-1,:] = 0

		descr = cuda.ArrayDescriptor3D()
		descr.width = s.nz
		descr.height = s.ny
		descr.depth = s.nx
		descr.format = cuda.dtype_to_array_format(f.dtype)
		descr.num_channels = 1
		descr.flags = 0
		s.tcex_gpu = cuda.Array(descr)
		s.tcey_gpu = cuda.Array(descr)
		s.tcez_gpu = cuda.Array(descr)

		mcpy = cuda.Memcpy3D()
		mcpy.width_in_bytes = mcpy.src_pitch = f.strides[1]
		mcpy.src_height = mcpy.height = s.ny
		mcpy.depth = s.nx
		mcpy.set_src_host( s.cex )
		mcpy.set_dst_array( s.tcex_gpu )
		mcpy()
		mcpy.set_src_host( s.cey )
		mcpy.set_dst_array( s.tcey_gpu )
		mcpy()
		mcpy.set_src_host( s.cez )
		mcpy.set_dst_array( s.tcez_gpu )
		mcpy()
Esempio n. 34
0
    def get_kernel(self, fdata, ilist_data, for_benchmark):
        from cgen.cuda import CudaShared, CudaGlobal
        from pycuda.tools import dtype_to_ctype

        discr = self.discr
        given = self.plan.given
        fplan = self.plan
        d = discr.dimensions
        dims = range(d)

        elgroup, = discr.element_groups

        float_type = given.float_type

        f_decl = CudaGlobal(FunctionDeclaration(Value("void", "apply_flux"),
            [
                Pointer(POD(float_type, "debugbuf")),
                Pointer(POD(numpy.uint8, "gmem_facedata")),
                ]+[
                Pointer(POD(float_type, "gmem_fluxes_on_faces%d" % flux_nr))
                for flux_nr in range(len(self.fluxes))
                ]
            ))

        cmod = Module()
        cmod.append(Include("pycuda-helpers.hpp"))

        for dep_expr in self.all_deps:
            cmod.extend([
                Value("texture<%s, 1, cudaReadModeElementType>"
                    % dtype_to_ctype(float_type, with_fp_tex_hack=True),
                    "field%d_tex" % self.dep_to_index[dep_expr])
                ])

        if fplan.flux_count != len(self.fluxes):
            from warnings import warn
            warn("Flux count in flux execution plan different from actual flux count.\n"
                    "You may want to specify the tune_for= kwarg in the Discretization\n"
                    "constructor.")

        cmod.extend([
            Line(),
            Typedef(POD(float_type, "value_type")),
            Line(),
            flux_header_struct(float_type, discr.dimensions),
            Line(),
            face_pair_struct(float_type, discr.dimensions),
            Line(),
            Define("DIMENSIONS", discr.dimensions),
            Define("DOFS_PER_FACE", fplan.dofs_per_face),
            Define("THREADS_PER_FACE", fplan.threads_per_face()),
            Line(),
            Define("CONCURRENT_FACES", fplan.parallel_faces),
            Define("BLOCK_MB_COUNT", fplan.mbs_per_block),
            Line(),
            Define("FACEDOF_NR", "threadIdx.x"),
            Define("BLOCK_FACE", "threadIdx.y"),
            Line(),
            Define("FLUX_COUNT", len(self.fluxes)),
            Line(),
            Define("THREAD_NUM", "(FACEDOF_NR + BLOCK_FACE*THREADS_PER_FACE)"),
            Define("THREAD_COUNT", "(THREADS_PER_FACE*CONCURRENT_FACES)"),
            Define("COALESCING_THREAD_COUNT",
                "(THREAD_COUNT < 0x10 ? THREAD_COUNT : THREAD_COUNT & ~0xf)"),
            Line(),
            Define("DATA_BLOCK_SIZE", fdata.block_bytes),
            Define("ALIGNED_FACE_DOFS_PER_MB", fplan.aligned_face_dofs_per_microblock()),
            Define("ALIGNED_FACE_DOFS_PER_BLOCK",
                "(ALIGNED_FACE_DOFS_PER_MB*BLOCK_MB_COUNT)"),
            Line(),
            Define("FOF_BLOCK_BASE", "(blockIdx.x*ALIGNED_FACE_DOFS_PER_BLOCK)"),
            Line(),
            ] + ilist_data.code + [
            Line(),
            Value("texture<index_list_entry_t, 1, cudaReadModeElementType>",
                "tex_index_lists"),
            Line(),
            fdata.struct,
            Line(),
            CudaShared(Value("flux_data", "data")),
            ])

        if not fplan.direct_store:
            cmod.extend([
                CudaShared(
                    ArrayOf(
                        ArrayOf(
                            POD(float_type, "smem_fluxes_on_faces"),
                            "FLUX_COUNT"),
                        "ALIGNED_FACE_DOFS_PER_MB*BLOCK_MB_COUNT")
                    ),
                Line(),
                ])

        S = Statement
        f_body = Block()

        from hedge.backends.cuda.tools import get_load_code

        f_body.extend(get_load_code(
            dest="&data",
            base="gmem_facedata + blockIdx.x*DATA_BLOCK_SIZE",
            bytes="sizeof(flux_data)",
            descr="load face_pair data")
            +[S("__syncthreads()"), Line() ])

        def get_flux_code(flux_writer):
            flux_code = Block([])

            flux_code.extend([
                Initializer(Pointer(
                    Value("face_pair", "fpair")),
                    "data.facepairs+fpair_nr"),
                Initializer(
                    MaybeUnused(POD(numpy.uint32, "a_index")),
                    "fpair->a_base + tex1Dfetch(tex_index_lists, "
                    "fpair->a_ilist_index + FACEDOF_NR)"),
                Initializer(
                    MaybeUnused(POD(numpy.uint32, "b_index")),
                    "fpair->b_base + tex1Dfetch(tex_index_lists, "
                    "fpair->b_ilist_index + FACEDOF_NR)"),
                Line(),
                flux_writer(),
                Line(),
                S("fpair_nr += CONCURRENT_FACES")
                ])

            return flux_code

        flux_computation = Block([
            Comment("fluxes for dual-sided (intra-block) interior face pairs"),
            While("fpair_nr < data.header.same_facepairs_end",
                get_flux_code(lambda:
                    self.write_interior_flux_code(True))
                ),
            Line(),
            Comment("work around nvcc assertion failure"),
            S("fpair_nr+=1"),
            S("fpair_nr-=1"),
            Line(),
            Comment("fluxes for single-sided (inter-block) interior face pairs"),
            While("fpair_nr < data.header.diff_facepairs_end",
                get_flux_code(lambda:
                    self.write_interior_flux_code(False))
                ),
            Line(),
            Comment("fluxes for single-sided boundary face pairs"),
            While("fpair_nr < data.header.bdry_facepairs_end",
                get_flux_code(
                    lambda: self.write_boundary_flux_code(for_benchmark))
                ),
            ])

        f_body.extend_log_block("compute the fluxes", [
            Initializer(POD(numpy.uint32, "fpair_nr"), "BLOCK_FACE"),
            If("FACEDOF_NR < DOFS_PER_FACE", flux_computation)
            ])

        if not fplan.direct_store:
            f_body.extend([
                Line(),
                S("__syncthreads()"),
                Line()
                ])

            f_body.extend_log_block("store fluxes", [
                    #Assign("debugbuf[blockIdx.x]", "FOF_BLOCK_BASE"),
                    #Assign("debugbuf[0]", "FOF_BLOCK_BASE"),
                    #Assign("debugbuf[0]", "sizeof(face_pair)"),
                    For("unsigned word_nr = THREAD_NUM",
                        "word_nr < ALIGNED_FACE_DOFS_PER_MB*BLOCK_MB_COUNT",
                        "word_nr += COALESCING_THREAD_COUNT",
                        Block([Assign(
                            "gmem_fluxes_on_faces%d[FOF_BLOCK_BASE+word_nr]" % flux_nr,
                            "smem_fluxes_on_faces[%d][word_nr]" % flux_nr)
                            for flux_nr in range(len(self.fluxes))]
                           #+[If("isnan(smem_fluxes_on_faces[%d][word_nr])" % flux_nr,
                               #Block([
                                   #Assign("debugbuf[blockIdx.x]", "word_nr"),
                                   #])
                               #)
                            #for flux_nr in range(len(self.fluxes))]
                        )
                    )
                    ])
        if False:
            f_body.extend([
                    Assign("debugbuf[blockIdx.x*96+32+BLOCK_FACE*32+threadIdx.x]", "fpair_nr"),
                    Assign("debugbuf[blockIdx.x*96+16]", "data.header.same_facepairs_end"),
                    Assign("debugbuf[blockIdx.x*96+17]", "data.header.diff_facepairs_end"),
                    Assign("debugbuf[blockIdx.x*96+18]", "data.header.bdry_facepairs_end"),
                    ]
                    )

        # finish off ----------------------------------------------------------
        cmod.append(FunctionBody(f_decl, f_body))

        if not for_benchmark and "cuda_dump_kernels" in discr.debug:
            from hedge.tools import open_unique_debug_file
            open_unique_debug_file("flux_gather", ".cu").write(str(cmod))

        #from pycuda.tools import allow_user_edit
        mod = SourceModule(
                #allow_user_edit(cmod, "kernel.cu", "the flux kernel"),
                cmod,
                keep="cuda_keep_kernels" in discr.debug)
        expr_to_texture_map = dict(
                (dep_expr, mod.get_texref(
                    "field%d_tex" % self.dep_to_index[dep_expr]))
                for dep_expr in self.all_deps)

        index_list_texref = mod.get_texref("tex_index_lists")
        index_list_texref.set_address(
                ilist_data.device_memory,
                ilist_data.bytes)
        index_list_texref.set_format(
                cuda.dtype_to_array_format(ilist_data.type), 1)
        index_list_texref.set_flags(cuda.TRSF_READ_AS_INTEGER)

        func = mod.get_function("apply_flux")
        block = (fplan.threads_per_face(), fplan.parallel_faces, 1)
        func.prepare(
                (2+len(self.fluxes))*"P",
                texrefs=expr_to_texture_map.values()
                + [index_list_texref])

        if "cuda_flux" in discr.debug:
            print "flux: lmem=%d smem=%d regs=%d" % (
                    func.local_size_bytes,
                    func.shared_size_bytes,
                    func.num_regs)

        return block, func, expr_to_texture_map
    # memory allocate
    f = np.zeros((nx, ny, nz), 'f', order='F')

    ex_gpu = cuda.to_device(f)
    ey_gpu = cuda.to_device(f)
    ez_gpu = cuda.to_device(f)
    hx_gpu = cuda.to_device(f)
    hy_gpu = cuda.to_device(f)
    hz_gpu = cuda.to_device(f)

    # memory allocate with cuda array
    descr = cuda.ArrayDescriptor3D()
    descr.width = nx
    descr.height = ny
    descr.depth = nz
    descr.format = cuda.dtype_to_array_format(f.dtype)
    descr.num_channels = 1
    descr.flags = 0

    cex_gpu = cuda.Array(descr)
    cey_gpu = cuda.Array(descr)
    cez_gpu = cuda.Array(descr)
    chx_gpu = cuda.Array(descr)
    chy_gpu = cuda.Array(descr)
    chz_gpu = cuda.Array(descr)

    mcopy = cuda.Memcpy3D()
    mcopy.width_in_bytes = mcopy.src_pitch = f.strides[1]
    mcopy.src_height = mcopy.height = ny
    mcopy.depth = nz
Esempio n. 36
0
    def get_kernel(self, fdata, ilist_data, for_benchmark):
        from cgen.cuda import CudaShared, CudaGlobal
        from pycuda.tools import dtype_to_ctype

        discr = self.discr
        given = self.plan.given
        fplan = self.plan
        d = discr.dimensions
        dims = range(d)

        elgroup, = discr.element_groups

        float_type = given.float_type

        f_decl = CudaGlobal(
            FunctionDeclaration(Value("void", "apply_flux"), [
                Pointer(POD(float_type, "debugbuf")),
                Pointer(POD(numpy.uint8, "gmem_facedata")),
            ] + [
                Pointer(POD(float_type, "gmem_fluxes_on_faces%d" % flux_nr))
                for flux_nr in range(len(self.fluxes))
            ]))

        cmod = Module()
        cmod.append(Include("pycuda-helpers.hpp"))

        for dep_expr in self.all_deps:
            cmod.extend([
                Value(
                    "texture<%s, 1, cudaReadModeElementType>" %
                    dtype_to_ctype(float_type, with_fp_tex_hack=True),
                    "field%d_tex" % self.dep_to_index[dep_expr])
            ])

        if fplan.flux_count != len(self.fluxes):
            from warnings import warn
            warn(
                "Flux count in flux execution plan different from actual flux count.\n"
                "You may want to specify the tune_for= kwarg in the Discretization\n"
                "constructor.")

        cmod.extend([
            Line(),
            Typedef(POD(float_type, "value_type")),
            Line(),
            flux_header_struct(float_type, discr.dimensions),
            Line(),
            face_pair_struct(float_type, discr.dimensions),
            Line(),
            Define("DIMENSIONS", discr.dimensions),
            Define("DOFS_PER_FACE", fplan.dofs_per_face),
            Define("THREADS_PER_FACE", fplan.threads_per_face()),
            Line(),
            Define("CONCURRENT_FACES", fplan.parallel_faces),
            Define("BLOCK_MB_COUNT", fplan.mbs_per_block),
            Line(),
            Define("FACEDOF_NR", "threadIdx.x"),
            Define("BLOCK_FACE", "threadIdx.y"),
            Line(),
            Define("FLUX_COUNT", len(self.fluxes)),
            Line(),
            Define("THREAD_NUM", "(FACEDOF_NR + BLOCK_FACE*THREADS_PER_FACE)"),
            Define("THREAD_COUNT", "(THREADS_PER_FACE*CONCURRENT_FACES)"),
            Define(
                "COALESCING_THREAD_COUNT",
                "(THREAD_COUNT < 0x10 ? THREAD_COUNT : THREAD_COUNT & ~0xf)"),
            Line(),
            Define("DATA_BLOCK_SIZE", fdata.block_bytes),
            Define("ALIGNED_FACE_DOFS_PER_MB",
                   fplan.aligned_face_dofs_per_microblock()),
            Define("ALIGNED_FACE_DOFS_PER_BLOCK",
                   "(ALIGNED_FACE_DOFS_PER_MB*BLOCK_MB_COUNT)"),
            Line(),
            Define("FOF_BLOCK_BASE",
                   "(blockIdx.x*ALIGNED_FACE_DOFS_PER_BLOCK)"),
            Line(),
        ] + ilist_data.code + [
            Line(),
            Value("texture<index_list_entry_t, 1, cudaReadModeElementType>",
                  "tex_index_lists"),
            Line(),
            fdata.struct,
            Line(),
            CudaShared(Value("flux_data", "data")),
        ])

        if not fplan.direct_store:
            cmod.extend([
                CudaShared(
                    ArrayOf(
                        ArrayOf(POD(float_type, "smem_fluxes_on_faces"),
                                "FLUX_COUNT"),
                        "ALIGNED_FACE_DOFS_PER_MB*BLOCK_MB_COUNT")),
                Line(),
            ])

        S = Statement
        f_body = Block()

        from hedge.backends.cuda.tools import get_load_code

        f_body.extend(
            get_load_code(dest="&data",
                          base="gmem_facedata + blockIdx.x*DATA_BLOCK_SIZE",
                          bytes="sizeof(flux_data)",
                          descr="load face_pair data") +
            [S("__syncthreads()"), Line()])

        def get_flux_code(flux_writer):
            flux_code = Block([])

            flux_code.extend([
                Initializer(Pointer(Value("face_pair", "fpair")),
                            "data.facepairs+fpair_nr"),
                Initializer(
                    MaybeUnused(POD(numpy.uint32, "a_index")),
                    "fpair->a_base + tex1Dfetch(tex_index_lists, "
                    "fpair->a_ilist_index + FACEDOF_NR)"),
                Initializer(
                    MaybeUnused(POD(numpy.uint32, "b_index")),
                    "fpair->b_base + tex1Dfetch(tex_index_lists, "
                    "fpair->b_ilist_index + FACEDOF_NR)"),
                Line(),
                flux_writer(),
                Line(),
                S("fpair_nr += CONCURRENT_FACES")
            ])

            return flux_code

        flux_computation = Block([
            Comment("fluxes for dual-sided (intra-block) interior face pairs"),
            While("fpair_nr < data.header.same_facepairs_end",
                  get_flux_code(lambda: self.write_interior_flux_code(True))),
            Line(),
            Comment("work around nvcc assertion failure"),
            S("fpair_nr+=1"),
            S("fpair_nr-=1"),
            Line(),
            Comment(
                "fluxes for single-sided (inter-block) interior face pairs"),
            While("fpair_nr < data.header.diff_facepairs_end",
                  get_flux_code(lambda: self.write_interior_flux_code(False))),
            Line(),
            Comment("fluxes for single-sided boundary face pairs"),
            While(
                "fpair_nr < data.header.bdry_facepairs_end",
                get_flux_code(
                    lambda: self.write_boundary_flux_code(for_benchmark))),
        ])

        f_body.extend_log_block("compute the fluxes", [
            Initializer(POD(numpy.uint32, "fpair_nr"), "BLOCK_FACE"),
            If("FACEDOF_NR < DOFS_PER_FACE", flux_computation)
        ])

        if not fplan.direct_store:
            f_body.extend([Line(), S("__syncthreads()"), Line()])

            f_body.extend_log_block(
                "store fluxes",
                [
                    #Assign("debugbuf[blockIdx.x]", "FOF_BLOCK_BASE"),
                    #Assign("debugbuf[0]", "FOF_BLOCK_BASE"),
                    #Assign("debugbuf[0]", "sizeof(face_pair)"),
                    For(
                        "unsigned word_nr = THREAD_NUM",
                        "word_nr < ALIGNED_FACE_DOFS_PER_MB*BLOCK_MB_COUNT",
                        "word_nr += COALESCING_THREAD_COUNT",
                        Block([
                            Assign(
                                "gmem_fluxes_on_faces%d[FOF_BLOCK_BASE+word_nr]"
                                % flux_nr,
                                "smem_fluxes_on_faces[%d][word_nr]" % flux_nr)
                            for flux_nr in range(len(self.fluxes))
                        ]
                              #+[If("isnan(smem_fluxes_on_faces[%d][word_nr])" % flux_nr,
                              #Block([
                              #Assign("debugbuf[blockIdx.x]", "word_nr"),
                              #])
                              #)
                              #for flux_nr in range(len(self.fluxes))]
                              ))
                ])
        if False:
            f_body.extend([
                Assign("debugbuf[blockIdx.x*96+32+BLOCK_FACE*32+threadIdx.x]",
                       "fpair_nr"),
                Assign("debugbuf[blockIdx.x*96+16]",
                       "data.header.same_facepairs_end"),
                Assign("debugbuf[blockIdx.x*96+17]",
                       "data.header.diff_facepairs_end"),
                Assign("debugbuf[blockIdx.x*96+18]",
                       "data.header.bdry_facepairs_end"),
            ])

        # finish off ----------------------------------------------------------
        cmod.append(FunctionBody(f_decl, f_body))

        if not for_benchmark and "cuda_dump_kernels" in discr.debug:
            from hedge.tools import open_unique_debug_file
            open_unique_debug_file("flux_gather", ".cu").write(str(cmod))

        #from pycuda.tools import allow_user_edit
        mod = SourceModule(
            #allow_user_edit(cmod, "kernel.cu", "the flux kernel"),
            cmod,
            keep="cuda_keep_kernels" in discr.debug)
        expr_to_texture_map = dict(
            (dep_expr,
             mod.get_texref("field%d_tex" % self.dep_to_index[dep_expr]))
            for dep_expr in self.all_deps)

        index_list_texref = mod.get_texref("tex_index_lists")
        index_list_texref.set_address(ilist_data.device_memory,
                                      ilist_data.bytes)
        index_list_texref.set_format(
            cuda.dtype_to_array_format(ilist_data.type), 1)
        index_list_texref.set_flags(cuda.TRSF_READ_AS_INTEGER)

        func = mod.get_function("apply_flux")
        block = (fplan.threads_per_face(), fplan.parallel_faces, 1)
        func.prepare(
            (2 + len(self.fluxes)) * "P",
            texrefs=expr_to_texture_map.values() + [index_list_texref])

        if "cuda_flux" in discr.debug:
            print "flux: lmem=%d smem=%d regs=%d" % (
                func.local_size_bytes, func.shared_size_bytes, func.num_regs)

        return block, func, expr_to_texture_map
Esempio n. 37
0
import numpy as np

w = 2
h = 3
d = 4
shape = (d, h, w)

a = np.arange(24).reshape(*shape, order='C').astype('float32')
print(a.shape, a.strides)
print(a)

descr = drv.ArrayDescriptor3D()
descr.width = w
descr.height = h
descr.depth = d
descr.format = drv.dtype_to_array_format(a.dtype)
descr.num_channels = 1
descr.flags = 0

ary = drv.Array(descr)

copy = drv.Memcpy3D()
copy.set_src_host(a)
copy.set_dst_array(ary)
copy.width_in_bytes = copy.src_pitch = a.strides[1]
copy.src_height = copy.height = h
copy.depth = d

copy()

mod = SourceModule("""
Esempio n. 38
0
def np3DtoCudaArray(npArray, prec, order = "C", allowSurfaceBind=False):
  ''' Some parameters like stride are explained in PyCUDA: driver.py test_driver.py gpuarray.py'''
  # For 1D-2D Cuda Arrays the descriptor is the same just puttin LAYERED flags
#   if order != "C": raise LogicError("Just implemented for C order")
  dimension = len(npArray.shape)
  case = order in ["C","F"]
  if not case:
    raise LogicError("order must be either F or C")
#   if dimension == 1:
#       w = npArray.shape[0]
#       h, d = 0,0
  if dimension == 2:
      if order == "C": stride = 0
      if order == "F": stride = -1
      h, w = npArray.shape
      d = 1
      if allowSurfaceBind:
        descrArr = cuda.ArrayDescriptor3D()
        descrArr.width = w
        descrArr.height = h
        descrArr.depth = d
      else:
        descrArr = cuda.ArrayDescriptor()
        descrArr.width = w
        descrArr.height = h
#         descrArr.depth = d
  elif dimension == 3:
      if order == "C": stride = 1
      if order == "F": stride = 1
      d, h, w = npArray.shape
      descrArr = cuda.ArrayDescriptor3D()
      descrArr.width = w
      descrArr.height = h
      descrArr.depth = d
  else:
      raise LogicError("CUDArray dimesnsion 2 and 3 supported at the moment ... ")
  if prec == 'float':
    descrArr.format = cuda.dtype_to_array_format(npArray.dtype)
    descrArr.num_channels = 1
  elif prec == 'cfloat': # Hack for complex 64 = (float 32, float 32) == (re,im)
    descrArr.format = cuda.array_format.SIGNED_INT32 # Reading data as int2 (hi=re,lo=im) structure
    descrArr.num_channels = 2
  elif prec == 'double': # Hack for doubles
    descrArr.format = cuda.array_format.SIGNED_INT32 # Reading data as int2 (hi,lo) structure
    descrArr.num_channels = 2
  elif prec == 'cdouble': # Hack for doubles
    descrArr.format = cuda.array_format.SIGNED_INT32 # Reading data as int4 (re=(hi,lo),im=(hi,lo)) structure
    descrArr.num_channels = 4
  else:
    descrArr.format = cuda.dtype_to_array_format(npArray.dtype)
    descrArr.num_channels = 1

  if allowSurfaceBind:
    if dimension==2:  descrArr.flags |= cuda.array3d_flags.ARRAY3D_LAYERED
    descrArr.flags |= cuda.array3d_flags.SURFACE_LDST

  cudaArray = cuda.Array(descrArr)
  if allowSurfaceBind or dimension==3 :
    copy3D = cuda.Memcpy3D()
    copy3D.set_src_host(npArray)
    copy3D.set_dst_array(cudaArray)
    copy3D.width_in_bytes = copy3D.src_pitch = npArray.strides[stride]
#     if dimension==3: copy3D.width_in_bytes = copy3D.src_pitch = npArray.strides[1] #Jut C order support
#     if dimension==2: copy3D.width_in_bytes = copy3D.src_pitch = npArray.strides[0] #Jut C order support
    copy3D.src_height = copy3D.height = h
    copy3D.depth = d
    copy3D()
    return cudaArray, copy3D
  else:
#     if dimension == 3:
#       copy3D = cuda.Memcpy3D()
#       copy3D.set_src_host(npArray)
#       copy3D.set_dst_array(cudaArray)
#       copy3D.width_in_bytes = copy3D.src_pitch = npArray.strides[stride]
# #       if dimension==3: copy3D.width_in_bytes = copy3D.src_pitch = npArray.strides[1] #Jut C order support
# #       if dimension==2: copy3D.width_in_bytes = copy3D.src_pitch = npArray.strides[0] #Jut C order support
#       copy3D.src_height = copy3D.height = h
#       copy3D.depth = d
#       copy3D()
#       return cudaArray, copy3D
#     if dimension == 2:
      cudaArray = cuda.Array(descrArr)
      copy2D = cuda.Memcpy2D()
      copy2D.set_src_host(npArray)
      copy2D.set_dst_array(cudaArray)
      copy2D.width_in_bytes = copy2D.src_pitch = npArray.strides[stride]
#       copy2D.width_in_bytes = copy2D.src_pitch = npArray.strides[0] #Jut C order support
      copy2D.src_height = copy2D.height = h
      copy2D(aligned=True)
      return cudaArray, copy2D
	# memory allocate
	f = np.zeros((nx,ny,nz),'f',order='F')

	ex_gpu = cuda.to_device(f)
	ey_gpu = cuda.to_device(f)
	ez_gpu = cuda.to_device(f)
	hx_gpu = cuda.to_device(f)
	hy_gpu = cuda.to_device(f)
	hz_gpu = cuda.to_device(f)

	descr = cuda.ArrayDescriptor3D()
	descr.width = nx
	descr.height = ny
	descr.depth = nz
	descr.format = cuda.dtype_to_array_format(f.dtype)
	descr.num_channels = 1
	descr.flags = 0

	cex_gpu = cuda.Array(descr)
	cey_gpu = cuda.Array(descr)
	cez_gpu = cuda.Array(descr)
	chx_gpu = cuda.Array(descr)
	chy_gpu = cuda.Array(descr)
	chz_gpu = cuda.Array(descr)

	mcopy = cuda.Memcpy3D()
	mcopy.width_in_bytes = mcopy.src_pitch = f.strides[1]
	mcopy.src_height = mcopy.height = ny
	mcopy.depth = nz