Beispiel #1
0
def setup_texture_gpuarr(tex_ref, arr):
    _arr = arr.astype(np.float32) if arr.dtype != np.float32 else arr
    tex_ref.set_array(cuda.gpuarray_to_array(_arr, "C"))
    tex_ref.set_address_mode(0, cuda.address_mode.WRAP)
    tex_ref.set_address_mode(1, cuda.address_mode.WRAP)
    tex_ref.set_filter_mode(cuda.filter_mode.POINT)
    tex_ref.set_flags(cuda.TRSF_NORMALIZED_COORDINATES)
 def update_ref(self):
     """Needs to be called after self.img_d has been written directly"""
     self.debug(3, "Updating original image")
     self.array = cuda.gpuarray_to_array(self.devRef, 'C')
     # 'C' order implies tex2D(x,y) will fetch matrix(y,x):
     # this is where x and y are inverted to comlpy with the kernels order
     self.tex.set_array(self.array)
     self._computeGradients()
     self._ready = False
 def updateOrig(self):
   """
   Needs to be called after self.img_d has been written directly (without using setOrig)
   """
   self.debug(2,"Updating original image")
   self.array = cuda.gpuarray_to_array(self.devOrig,"C")
   self.tex.set_array(self.array)
   self.__computeGradients()
   self.__ready = False
Beispiel #4
0
    def test_2d_fp_surfaces(self):
        orden = "C"
        npoints = 32

        for prec in [np.int16,np.float32,np.float64,np.complex64,np.complex128]:
            prec_str = dtype_to_ctype(prec)
            if prec == np.complex64: fpName_str = 'fp_tex_cfloat'
            elif prec == np.complex128: fpName_str = 'fp_tex_cdouble'
            elif prec == np.float64: fpName_str = 'fp_tex_double'
            else: fpName_str = prec_str
            A_cpu = np.zeros([npoints,npoints],order=orden,dtype=prec)
            A_cpu[:] = np.random.rand(npoints,npoints)[:]
            A_gpu = gpuarray.to_gpu(A_cpu) # Array randomized

            myKernRW = '''
            #include <pycuda-helpers.hpp>

            surface<void, cudaSurfaceType2DLayered> mtx_tex;

            __global__ void copy_texture(cuPres *dest, int rw)
            {
              int row   = blockIdx.x*blockDim.x + threadIdx.x;
              int col   = blockIdx.y*blockDim.y + threadIdx.y;
              int layer = 1;
              int tid = row + col*blockDim.x*gridDim.x ;
              if (rw==0){
              cuPres aux = dest[tid];
              fp_surf2DLayeredwrite(aux, mtx_tex, row, col, layer,cudaBoundaryModeClamp);}
              else {
              cuPres aux = 0;
              fp_surf2DLayeredread(&aux, mtx_tex, col, row, layer, cudaBoundaryModeClamp);
              dest[tid] = aux;
              }
            }
            '''
            myKernRW = myKernRW.replace('fpName',fpName_str)
            myKernRW = myKernRW.replace('cuPres',prec_str)
            modW = SourceModule(myKernRW)

            copy_texture = modW.get_function("copy_texture")
            mtx_tex = modW.get_surfref("mtx_tex")
            cuBlock = (8,8,1)
            if cuBlock[0]>npoints:
                cuBlock = (npoints,npoints,1)
            cuGrid   = (npoints//cuBlock[0]+1*(npoints % cuBlock[0] != 0 ),npoints//cuBlock[1]+1*(npoints % cuBlock[1] != 0 ),1)
            copy_texture.prepare('Pi')#,texrefs=[mtx_tex])
            A_gpu2 = gpuarray.zeros_like(A_gpu) # To initialize surface with zeros
            cudaArray = drv.gpuarray_to_array(A_gpu2,orden,allowSurfaceBind=True)
            A_cpu = A_gpu.get() # To remember original array
            mtx_tex.set_array(cudaArray)
            copy_texture.prepared_call(cuGrid,cuBlock,A_gpu.gpudata, np.int32(0)) # Write random array
            copy_texture.prepared_call(cuGrid,cuBlock,A_gpu.gpudata, np.int32(1)) # Read, but transposed
            assert np.sum(np.abs(A_gpu.get()-np.transpose(A_cpu))) == np.array(0,dtype=prec)
            A_gpu.gpudata.free()
Beispiel #5
0
    def test_2d_fp_surfaces(self):
        orden = "C"
        npoints = 32

        for prec in [np.int16,np.float32,np.float64,np.complex64,np.complex128]:
            prec_str = dtype_to_ctype(prec)
            if prec == np.complex64: fpName_str = 'fp_tex_cfloat'
            elif prec == np.complex128: fpName_str = 'fp_tex_cdouble'
            elif prec == np.float64: fpName_str = 'fp_tex_double'
            else: fpName_str = prec_str
            A_cpu = np.zeros([npoints,npoints],order=orden,dtype=prec)
            A_cpu[:] = np.random.rand(npoints,npoints)[:]
            A_gpu = gpuarray.to_gpu(A_cpu) # Array randomized

            myKernRW = '''
            #include <pycuda-helpers.hpp>

            surface<void, cudaSurfaceType2DLayered> mtx_tex;

            __global__ void copy_texture(cuPres *dest, int rw)
            {
              int row   = blockIdx.x*blockDim.x + threadIdx.x;
              int col   = blockIdx.y*blockDim.y + threadIdx.y;
              int layer = 1;
              int tid = row + col*blockDim.x*gridDim.x ;
              if (rw==0){
              cuPres aux = dest[tid];
              fp_surf2DLayeredwrite(aux, mtx_tex, row, col, layer,cudaBoundaryModeClamp);}
              else {
              cuPres aux = 0;
              fp_surf2DLayeredread(&aux, mtx_tex, col, row, layer, cudaBoundaryModeClamp);
              dest[tid] = aux;
              }
            }
            '''
            myKernRW = myKernRW.replace('fpName',fpName_str)
            myKernRW = myKernRW.replace('cuPres',prec_str)
            modW = SourceModule(myKernRW)

            copy_texture = modW.get_function("copy_texture")
            mtx_tex = modW.get_surfref("mtx_tex")
            cuBlock = (8,8,1)
            if cuBlock[0]>npoints:
                cuBlock = (npoints,npoints,1)
            cuGrid   = (npoints//cuBlock[0]+1*(npoints % cuBlock[0] != 0 ),npoints//cuBlock[1]+1*(npoints % cuBlock[1] != 0 ),1)
            copy_texture.prepare('Pi')#,texrefs=[mtx_tex])
            A_gpu2 = gpuarray.zeros_like(A_gpu) # To initialize surface with zeros
            cudaArray = drv.gpuarray_to_array(A_gpu2,orden,allowSurfaceBind=True)
            A_cpu = A_gpu.get() # To remember original array
            mtx_tex.set_array(cudaArray)
            copy_texture.prepared_call(cuGrid,cuBlock,A_gpu.gpudata, np.int32(0)) # Write random array
            copy_texture.prepared_call(cuGrid,cuBlock,A_gpu.gpudata, np.int32(1)) # Read, but transposed
            assert np.sum(np.abs(A_gpu.get()-np.transpose(A_cpu))) == np.array(0,dtype=prec)
            A_gpu.gpudata.free()
 def set_mask(self, mask):
     self.debug(3, "Setting the mask")
     assert mask.shape == (self.h, self.w), \
       "Got a {} mask in a {} routine.".format(mask.shape, (self.h, self.w))
     if not mask.dtype == np.float32:
         self.debug(2, "Converting the mask to float32")
         mask = mask.astype(np.float32)
     if isinstance(mask, np.ndarray):
         self.maskArray = cuda.matrix_to_array(mask, 'C')
     elif isinstance(mask, gpuarray.GPUArray):
         self.maskArray = cuda.gpuarray_to_array(mask, 'C')
     else:
         self.debug(0, "Error! Mask data type not understood")
         raise ValueError
     self.texMask.set_array(self.maskArray)
 def setImage(self,img_d):
   """
   Set the image to compare with the original
   """
   assert img_d.shape == (self.w,self.h),"Got a {} image in a {} correlation routine!".format(img_d.shape,(self.w,self.h))
   if isinstance(img_d,np.ndarray):
     self.debug(3,"Creating texture from numpy array")
     self.array_d = cuda.matrix_to_array(img_d,"C")
   elif isinstance(img_d,gpuarray.GPUArray):
     self.debug(3,"Creating texture from gpuarray")
     self.array_d = cuda.gpuarray_to_array(img_d,"C")
   else:
     print("[Error] Unknown type of data given to CorrelStage.setImage()")
     raise ValueError
   self.tex_d.set_array(self.array_d)
   self.devX.set(np.zeros(self.Nfields,dtype=np.float32))
    def set_image(self, img_d):
        """
    Set the image to compare with the original

    Note that calling this method is not necessary: you can do .compute(image)
    This will automatically call this method first
    """
        assert img_d.shape == (self.h, self.w), \
          "Got a {} image in a {} correlation routine!".format(
            img_d.shape, (self.h, self.w))
        if isinstance(img_d, np.ndarray):
            self.debug(3, "Creating texture from numpy array")
            self.array_d = cuda.matrix_to_array(img_d, "C")
        elif isinstance(img_d, gpuarray.GPUArray):
            self.debug(3, "Creating texture from gpuarray")
            self.array_d = cuda.gpuarray_to_array(img_d, "C")
        else:
            self.debug(0, "Error ! Unknown type of data given to .set_image()")
            raise ValueError
        self.tex_d.set_array(self.array_d)
        self.devX.set(np.zeros(self.fields_count, dtype=np.float32))
Beispiel #9
0
def ndarray_to_float_tex(tex_ref, ndarray, address_mode=cuda.address_mode.BORDER, filter_mode=cuda.filter_mode.LINEAR):
    if isinstance(ndarray, np.ndarray):
        cu_array = cuda.np_to_array(ndarray, 'C')
    elif isinstance(ndarray, gpuarray.GPUArray):
        cu_array = cuda.gpuarray_to_array(ndarray, 'C')
    else:
        raise TypeError(
            'ndarray must be numpy.ndarray or pycuda.gpuarray.GPUArray')

    cuda.TextureReference.set_array(tex_ref, cu_array)

    cuda.TextureReference.set_address_mode(
        tex_ref, 0, address_mode)
    if ndarray.ndim >= 2:
        cuda.TextureReference.set_address_mode(
            tex_ref, 1, address_mode)
    if ndarray.ndim >= 3:
        cuda.TextureReference.set_address_mode(
            tex_ref, 2,  address_mode)
    cuda.TextureReference.set_filter_mode(
        tex_ref, filter_mode)
    tex_ref.set_flags(tex_ref.get_flags(
    ) & ~cuda.TRSF_NORMALIZED_COORDINATES & ~cuda.TRSF_READ_AS_INTEGER)
Beispiel #10
0
def alm2lenmap_onGPU(lib_alm, unlalm, dx_gu, dy_gu, do_not_prefilter=False):
    """
    Lens the input unl_CMB map on the GPU using the pyCUDA interface.
    dx dy displacement in grid units. (f.get_dx_ingridunits() e.g.)
    Can be path to arrays or arrays or memmap.
    Will probably crash for too large maps, with need to split the job.
    Works for 4096 x 4096 at least on my laptop.

    Cost dominated by texture setup. # FIXME : try get rid of texture
    Note that the first call might be substantially slower than subsequent calls, as it caches the fft and ifft plans
    for subsequent usage.
    :param unl_CMB:
    :param func: bicubic or bilinear
    :param normalized_tex: use a modified version of the GPU bicubic spline to account for periodicity of the map
    :return:
    """
    if timed:
        ti = time.time()
    shape = lib_alm.ell_mat.shape
    rshape = (shape[0], shape[1] / 2 + 1)
    assert shape[0] == shape[1], shape
    assert IsPowerOfTwo(shape[0]), shape
    assert load_map(dx_gu).shape == shape, (load_map(dx_gu).shape,
                                            lib_alm.ell_mat.shape)
    assert load_map(dy_gu).shape == shape, (load_map(dy_gu).shape,
                                            lib_alm.ell_mat.shape)

    assert np.all(np.array(shape) % GPU_block[0] == 0), shape
    if shape[0] > 4096:
        print "--- Exercise caution, array shapes larger than 4096 have never been tested so far ---"

    GPU_grid = (shape[0] / GPU_block[0], shape[1] / GPU_block[1], 1)

    # Prefiltering forces the interpolant to pass through the samples and increase accuracy, but dominates the cost.
    rfft2_unlCMB_gpu = gpuarray.to_gpu(
        lib_alm.alm2rfft(unlalm / np.prod(shape)).astype(np.complex64))
    coeffs_gpu = gpuarray.empty(lib_alm.ell_mat.shape, dtype=np.float32)
    plan, plan_inv = get_rfft_plans(shape)

    if not do_not_prefilter:
        # The prefilter makes sure the spline is exact at the nodes.
        # Uncomments this to put coeffs_gpu on pitched memory to allow later for 2D texture binding :
        # alloc,pitch  = cuda.mem_alloc_pitch(shape[0] * 4,shape[1],4) # 4 bytes per float32
        wx = (6. / (2. * np.cos(
            2. * np.pi * Freq(np.arange(shape[0]), shape[0]) / shape[0]) + 4.))
        wx_gpu = gpuarray.to_gpu(wx.astype(np.float32))
        prefilter = CUDA_module.get_function("cf_outer_w")
        prefilter(rfft2_unlCMB_gpu,
                  wx_gpu,
                  np.int32(rshape[1]),
                  np.int32(rshape[0]),
                  block=GPU_block,
                  grid=GPU_grid)
        del wx_gpu

    ifft(rfft2_unlCMB_gpu, coeffs_gpu, plan_inv, False)

    # Binding arrays to textures and getting lensing func.
    if texture_count == 0:
        lens_func = CUDA_module.get_function("bicubiclensKernel_notex")
        tex_refs = []
        dx_gu = gpuarray.to_gpu(load_map(dx_gu).astype(np.float32))
        dy_gu = gpuarray.to_gpu(load_map(dy_gu).astype(np.float32))
    elif texture_count == 1:
        unl_CMB_tex = CUDA_module.get_texref("unl_CMB")
        tex_refs = [unl_CMB_tex]
        unl_CMB_tex.set_array(cuda.gpuarray_to_array(coeffs_gpu, "C"))
        del coeffs_gpu
        dx_gu = gpuarray.to_gpu(load_map(dx_gu).astype(np.float32))
        dy_gu = gpuarray.to_gpu(load_map(dy_gu).astype(np.float32))
        lens_func = CUDA_module.get_function(
            "bicubiclensKernel_normtex_singletex")
    elif texture_count == 3:
        unl_CMB_tex = CUDA_module.get_texref("unl_CMB")
        dx_tex = CUDA_module.get_texref("tex_dx")
        dy_tex = CUDA_module.get_texref("tex_dy")
        tex_refs = ([unl_CMB_tex, dx_tex, dy_tex])
        unl_CMB_tex.set_array(cuda.gpuarray_to_array(coeffs_gpu, "C"))
        del coeffs_gpu
        cuda.matrix_to_texref(load_map(dx_gu).astype(np.float32),
                              dx_tex,
                              order="C")
        cuda.matrix_to_texref(load_map(dy_gu).astype(np.float32),
                              dy_tex,
                              order="C")
        lens_func = CUDA_module.get_function("bicubiclensKernel_normtex")
    else:
        tex_refs = []
        lens_func = 0
        assert 0
    # Wraping, important for periodic boundary conditions.
    # Note that WRAP has not effect for unnormalized texture coordinates.

    for tex_ref in tex_refs:
        tex_ref.set_address_mode(0, cuda.address_mode.WRAP)
        tex_ref.set_address_mode(1, cuda.address_mode.WRAP)
        tex_ref.set_filter_mode(cuda.filter_mode.POINT)
        tex_ref.set_flags(cuda.TRSF_NORMALIZED_COORDINATES)

    if timed: t0 = time.time()

    len_CMB = np.empty(shape, dtype=np.float32)

    if texture_count == 0:
        lens_func(cuda.Out(len_CMB),
                  coeffs_gpu,
                  dx_gu,
                  dy_gu,
                  np.int32(shape[0]),
                  block=GPU_block,
                  grid=GPU_grid,
                  texrefs=tex_refs)
    elif texture_count == 1:
        lens_func(cuda.Out(len_CMB),
                  dx_gu,
                  dy_gu,
                  np.int32(shape[0]),
                  block=GPU_block,
                  grid=GPU_grid,
                  texrefs=tex_refs)
    elif texture_count == 3:
        lens_func(cuda.Out(len_CMB),
                  np.int32(shape[0]),
                  block=GPU_block,
                  grid=GPU_grid,
                  texrefs=tex_refs)

    if timed:
        dt = time.time() - t0
        t_tot = time.time() - ti
        print "     GPU bicubic spline and transfer at %s Mpixel / sec, time %s sec" % (
            np.prod(lib_alm.ell_mat.shape) / 1e6 / dt, dt)
        print " Total ex. time at %s Mpixel / sec, ex. time %s sec." % (
            np.prod(shape) / 1e6 / t_tot, t_tot)
    return len_CMB.astype(np.float64)