Example #1
 def __init__(self,
              volume,
              segmentation,
              voxelsize,
              origin=[0.0, 0.0, 0.0],
              stepsize=0.1,
              mode="linear"):
     #generate kernels
     self.mod = self.generateKernelModuleProjector()
     self.projKernel = self.mod.get_function("projectKernel")
     self.volumesize = volume.shape
     self.volume = np.moveaxis(volume, [0, 1, 2], [2, 1, 0]).copy()
     self.segmentation = np.moveaxis(segmentation.astype(np.float32),
                                     [0, 1, 2], [2, 1, 0]).copy()
     # print("done swap")
     self.volume_gpu = cuda.np_to_array(self.volume, order='C')
     self.texref_volume = self.mod.get_texref("tex_density")
     cuda.bind_array_to_texref(self.volume_gpu, self.texref_volume)
     self.segmentation_gpu = cuda.np_to_array(self.segmentation, order='C')
     self.texref_segmentation = self.mod.get_texref("tex_segmentation")
     cuda.bind_array_to_texref(self.segmentation_gpu,
                               self.texref_segmentation)
     if mode == "linear":
         self.texref_volume.set_filter_mode(cuda.filter_mode.LINEAR)
         self.texref_segmentation.set_filter_mode(cuda.filter_mode.LINEAR)
     self.voxelsize = voxelsize
     self.stepsize = np.float32(stepsize)
     self.origin = origin
     self.initialized = False
     print("initialized projector")
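Example #1 follows the classic PyCUDA texture-reference workflow: copy a contiguous NumPy volume into a CUDA array with cuda.np_to_array, bind that array to a module-level texture reference, and enable hardware linear filtering. Below is a minimal, self-contained sketch of the same pattern using the same legacy texture-reference API as the examples on this page; the kernel and the names tex_density / sample_one are invented for illustration and are not taken from the example above.

# Minimal sketch of the np_to_array -> bind_array_to_texref pattern (names invented).
import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda
from pycuda.compiler import SourceModule

mod = SourceModule("""
texture<float, 3, cudaReadModeElementType> tex_density;

__global__ void sample_one(float *out)
{
    // sample the centre of the first voxel with hardware trilinear filtering
    out[0] = tex3D(tex_density, 0.5f, 0.5f, 0.5f);
}
""")

volume = np.random.rand(8, 8, 8).astype(np.float32)
volume_gpu = cuda.np_to_array(volume, order='C')    # 3D CUDA array on the device
texref = mod.get_texref("tex_density")
cuda.bind_array_to_texref(volume_gpu, texref)
texref.set_filter_mode(cuda.filter_mode.LINEAR)     # trilinear interpolation

out = np.zeros(1, dtype=np.float32)
mod.get_function("sample_one")(cuda.Out(out),
                               block=(1, 1, 1), grid=(1, 1),
                               texrefs=[texref])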
Example #2
    def project(self, volume_context, geometry_context, T_Nx4x4):
        assert self.cpu is not None, 'CPU ray casting is not supported.'

        image_size = self.target_detector.to_cpu().image_size
        pm_Nx3x4 = geometry_context.projection_matrix

        p_Nx12 = utils.constructProjectionParameter(pm_Nx3x4,
                                                    np.array(image_size[:2]),
                                                    T_Nx4x4)

        assert self.target_detector.cpu.image_size[2] == p_Nx12.shape[0], \
            'Unmatched detector channel and pose parameter channel. (Actual: {} != {})'.format(
                self.target_detector.cpu.image_size[2], p_Nx12.shape[0])

        h_p_Nx12 = p_Nx12.astype(np.float32)
        d_p_Nx12 = driver.np_to_array(h_p_Nx12, order='C')
        t_p_Nx12 = KernelManager.Module.get_texture('t_proj_param_Nx12',
                                                    d_p_Nx12)

        grid = (16, 16, 1)
        if Projector.grid is None:
            grid = tuple(
                np.uint32(np.ceil(image_size / Projector.block)).tolist())

        KernelManager.Kernel.invoke(self.target_detector.image.gpudata,
                                    texrefs=[volume_context.volume, t_p_Nx12],
                                    block=Projector.block,
                                    grid=grid)
        # Display debug info
        # print_kernel = KernelManager.Module.get_kernel('print_device_params')
        # print_kernel.invoke(texrefs=[t_p_Nx12])

        return self.target_detector.image
Example #3
    def _run(self, I, shift, rot, ratio, out):
        logger.debug("I.shape={}".format(I.shape))
        # bind input image to texture
        _in_buf = cuda.np_to_array(I, 'C')
        self._shear_texture.set_array(_in_buf)

        # determine grid and block size
        _, nv, nu = out.shape
        block_sz = (32, 32, 1)
        grid_sz = (ceil(float(nu) / block_sz[0]),
                   ceil(float(nv) / block_sz[1]))

        if (self._out_buf is None) or (self._out_buf.shape != out.shape):
            logger.debug("resize buffer to {}".format(out.shape))
            self._out_buf = gpuarray.empty(out.shape,
                                           dtype=np.float32,
                                           order='C')

        # TODO create rotate kernel buffer area

        # execute
        nz, ny, nx = I.shape
        self._shear_kernel.prepared_call(grid_sz, block_sz,
                                         self._out_buf.gpudata,
                                         np.float32(shift), np.uint32(nu),
                                         np.uint32(nv), np.float32(ratio),
                                         np.uint32(nx), np.uint32(ny),
                                         np.float32(nz))
        # TODO add rotate kernel call
        self._out_buf.get(out)

        # unbind texture
        _in_buf.free()
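Example #3 relies on PyCUDA's prepare()/prepared_call() interface: the argument format string is declared once, and later launches skip per-call argument packing. Below is a rough, self-contained sketch of that pattern with a trivial invented kernel; the format characters follow Python's struct module ('P' pointer, 'f' float32, 'I' uint32).

# Sketch of the prepare()/prepared_call() pattern; the "scale" kernel is invented.
import numpy as np
import pycuda.autoinit
import pycuda.gpuarray as gpuarray
from pycuda.compiler import SourceModule

mod = SourceModule("""
__global__ void scale(float *dst, float factor, unsigned int n)
{
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) dst[i] *= factor;
}
""")

scale = mod.get_function("scale")
scale.prepare("PfI")                     # pointer, float32, uint32

n = 1 << 20
data = gpuarray.to_gpu(np.ones(n, dtype=np.float32))
block = (256, 1, 1)
grid = ((n + block[0] - 1) // block[0], 1)

scale.prepared_call(grid, block, data.gpudata, np.float32(2.0), np.uint32(n))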
Example #4
 def setTexture(texref, numpy_array):       
     #Upload data to GPU and bind to texture reference
     texref.set_array(cuda.np_to_array(numpy_array, order="C"))
     
     # Set texture parameters
     texref.set_filter_mode(cuda.filter_mode.LINEAR) #bilinear interpolation
     texref.set_address_mode(0, cuda.address_mode.CLAMP) #no indexing outside domain
     texref.set_address_mode(1, cuda.address_mode.CLAMP)
     texref.set_flags(cuda.TRSF_NORMALIZED_COORDINATES) #Use [0, 1] indexing
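A hedged usage sketch for the setTexture helper above: assuming a module that declares a 2D float texture (the name data_tex and the dummy kernel are invented), the helper uploads a float32 array and configures clamped, normalized, bilinear sampling in one call.

# Hypothetical caller of setTexture; module and texture name are illustrative only.
import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda
from pycuda.compiler import SourceModule

mod = SourceModule("""
texture<float, 2, cudaReadModeElementType> data_tex;
__global__ void dummy() { }
""")

field = np.random.rand(64, 64).astype(np.float32)
setTexture(mod.get_texref("data_tex"), field)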
Example #6
    def test_3d_fp_textures(self):
        orden = "C"
        npoints = 32

        for prec in [
                np.int16, np.float32, np.float64, np.complex64, np.complex128
        ]:
            prec_str = dtype_to_ctype(prec)
            if prec == np.complex64:
                fpName_str = "fp_tex_cfloat"
            elif prec == np.complex128:
                fpName_str = "fp_tex_cdouble"
            elif prec == np.float64:
                fpName_str = "fp_tex_double"
            else:
                fpName_str = prec_str
            A_cpu = np.zeros([npoints, npoints, npoints],
                             order=orden,
                             dtype=prec)
            A_cpu[:] = np.random.rand(npoints, npoints, npoints)[:]
            A_gpu = gpuarray.zeros(A_cpu.shape, dtype=prec, order=orden)

            myKern = """
            #include <pycuda-helpers.hpp>
            texture<fpName, 3, cudaReadModeElementType> mtx_tex;

            __global__ void copy_texture(cuPres *dest)
            {
              int row   = blockIdx.x*blockDim.x + threadIdx.x;
              int col   = blockIdx.y*blockDim.y + threadIdx.y;
              int slice = blockIdx.z*blockDim.z + threadIdx.z;
              dest[row + col*blockDim.x*gridDim.x + slice*blockDim.x*gridDim.x*blockDim.y*gridDim.y] = fp_tex3D(mtx_tex, slice, col, row);
            }
            """
            myKern = myKern.replace("fpName", fpName_str)
            myKern = myKern.replace("cuPres", prec_str)
            mod = SourceModule(myKern)

            copy_texture = mod.get_function("copy_texture")
            mtx_tex = mod.get_texref("mtx_tex")
            cuBlock = (8, 8, 8)
            if cuBlock[0] > npoints:
                cuBlock = (npoints, npoints, npoints)
            cuGrid = (
                npoints // cuBlock[0] + 1 * (npoints % cuBlock[0] != 0),
                npoints // cuBlock[1] + 1 * (npoints % cuBlock[1] != 0),
                npoints // cuBlock[2] + 1 * (npoints % cuBlock[2] != 0),
            )
            copy_texture.prepare("P", texrefs=[mtx_tex])
            cudaArray = drv.np_to_array(A_cpu, orden, allowSurfaceBind=False)
            mtx_tex.set_array(cudaArray)
            copy_texture.prepared_call(cuGrid, cuBlock, A_gpu.gpudata)
            assert np.sum(np.abs(A_gpu.get() -
                                 np.transpose(A_cpu))) == np.array(0,
                                                                   dtype=prec)
            A_gpu.gpudata.free()
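The cuGrid computation in the test above is an integer ceiling division: one block per full block-edge of points, plus one extra block whenever npoints is not a multiple of the block edge. Written as a small helper (purely a reading aid, not part of the test):

# Equivalent ceil-division form of the grid computation above.
def blocks_needed(npoints, block_edge):
    return (npoints + block_edge - 1) // block_edge

cuBlock = (8, 8, 8)
cuGrid = tuple(blocks_needed(32, edge) for edge in cuBlock)   # -> (4, 4, 4)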
Example #7
def tex2DToGPU(tex):
    nChannel = 1 if (len(tex.shape) == 2) else 3

    if (nChannel == 3):
        #Add padding channel
        tex = np.dstack((tex, np.ones((tex.shape[0], tex.shape[1]))))
        tex = np.ascontiguousarray(tex).astype(np.float32)
        texGPUArray = cuda.make_multichannel_2d_array(tex, 'C')
    else:
        texGPUArray = cuda.np_to_array(tex, 'C')

    return texGPUArray
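A hedged usage sketch for tex2DToGPU: for a 3-channel image the helper pads to 4 channels (CUDA 2D arrays support 1, 2, or 4 channels, not 3) and uploads with make_multichannel_2d_array, so the returned array is typically bound to a float4 texture. The module and the name image_tex below are invented.

# Hypothetical caller of tex2DToGPU; texture name and module are illustrative only.
import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda
from pycuda.compiler import SourceModule

mod = SourceModule("""
texture<float4, 2, cudaReadModeElementType> image_tex;
__global__ void dummy() { }
""")

rgb = np.random.rand(128, 128, 3)                # H x W x 3 image
image_gpu = tex2DToGPU(rgb)                      # padded to RGBA and uploaded
texref = mod.get_texref("image_tex")
texref.set_array(image_gpu)
texref.set_filter_mode(cuda.filter_mode.LINEAR)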
Example #8
    def initialize(self):
        """Allocate GPU memory and transfer the volume, segmentations to GPU."""
        if self.initialized:
            raise RuntimeError("Close projector before initializing again.")

        # allocate and transfer volume texture to GPU
        # TODO: this axis-swap is messy and actually may be messing things up. Maybe use a FrameTransform in the Volume class instead?
        volume = self.volume.data
        volume = np.moveaxis(volume, [0, 1, 2], [2, 1, 0]).copy() # TODO: is this axis swap necessary?
        self.volume_gpu = cuda.np_to_array(volume, order='C')
        self.volume_texref = self.mod.get_texref("volume")
        cuda.bind_array_to_texref(self.volume_gpu, self.volume_texref)
        
        # set the (interpolation?) mode
        if self.mode == 'linear':
            self.volume_texref.set_filter_mode(cuda.filter_mode.LINEAR)
        else:
            raise RuntimeError

        # allocate and transfer segmentation texture to GPU
        # TODO: remove axis swap?
        # self.segmentations_gpu = [cuda.np_to_array(seg, order='C') for mat, seg in self.volume.materials.items()]
        self.segmentations_gpu = [cuda.np_to_array(np.moveaxis(seg, [0, 1, 2], [2, 1, 0]).copy(), order='C') for mat, seg in self.volume.materials.items()]
        self.segmentations_texref = [self.mod.get_texref(f"seg_{m}") for m, _ in enumerate(self.volume.materials)]
        for seg, texref in zip(self.segmentations_gpu, self.segmentations_texref):
            cuda.bind_array_to_texref(seg, texref)
            if self.mode == 'linear':
                texref.set_filter_mode(cuda.filter_mode.LINEAR)
            else:
                raise RuntimeError

        # allocate output image array on GPU (4 bytes to a float32)
        self.output_gpu = cuda.mem_alloc(self.output_size * 4)

        # allocate ijk_from_index matrix array on GPU (3x3 array x 4 bytes per float32)
        self.rt_kinv_gpu = cuda.mem_alloc(3 * 3 * 4)
        
        # Mark self as initialized.
        self.initialized = True
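The two mem_alloc calls above only reserve device memory; the buffers are filled later with host-to-device copies. A minimal standalone sketch of that allocate-and-upload step for the 3x3 matrix (the identity value is just a placeholder, not the example's actual matrix):

# Standalone sketch of allocating and filling a 3x3 float32 buffer on the device.
import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda

rt_kinv = np.eye(3, dtype=np.float32)         # placeholder for the ijk_from_index matrix
rt_kinv_gpu = cuda.mem_alloc(rt_kinv.nbytes)  # 3 * 3 * 4 bytes
cuda.memcpy_htod(rt_kinv_gpu, rt_kinv)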
Example #9
    def test_3d_fp_textures(self):
        orden = "C"
        npoints = 32

        for prec in [np.int16, np.float32, np.float64, np.complex64, np.complex128]:
            prec_str = dtype_to_ctype(prec)
            if prec == np.complex64:
                fpName_str = "fp_tex_cfloat"
            elif prec == np.complex128:
                fpName_str = "fp_tex_cdouble"
            elif prec == np.float64:
                fpName_str = "fp_tex_double"
            else:
                fpName_str = prec_str
            A_cpu = np.zeros([npoints, npoints, npoints], order=orden, dtype=prec)
            A_cpu[:] = np.random.rand(npoints, npoints, npoints)[:]
            A_gpu = gpuarray.zeros(A_cpu.shape, dtype=prec, order=orden)

            myKern = """
            #include <pycuda-helpers.hpp>
            texture<fpName, 3, cudaReadModeElementType> mtx_tex;

            __global__ void copy_texture(cuPres *dest)
            {
              int row   = blockIdx.x*blockDim.x + threadIdx.x;
              int col   = blockIdx.y*blockDim.y + threadIdx.y;
              int slice = blockIdx.z*blockDim.z + threadIdx.z;
              dest[row + col*blockDim.x*gridDim.x + slice*blockDim.x*gridDim.x*blockDim.y*gridDim.y] = fp_tex3D(mtx_tex, slice, col, row);
            }
            """
            myKern = myKern.replace("fpName", fpName_str)
            myKern = myKern.replace("cuPres", prec_str)
            mod = SourceModule(myKern)

            copy_texture = mod.get_function("copy_texture")
            mtx_tex = mod.get_texref("mtx_tex")
            cuBlock = (8, 8, 8)
            if cuBlock[0] > npoints:
                cuBlock = (npoints, npoints, npoints)
            cuGrid = (
                npoints // cuBlock[0] + 1 * (npoints % cuBlock[0] != 0),
                npoints // cuBlock[1] + 1 * (npoints % cuBlock[1] != 0),
                npoints // cuBlock[2] + 1 * (npoints % cuBlock[2] != 0),
            )
            copy_texture.prepare("P", texrefs=[mtx_tex])
            cudaArray = drv.np_to_array(A_cpu, orden, allowSurfaceBind=False)
            mtx_tex.set_array(cudaArray)
            copy_texture.prepared_call(cuGrid, cuBlock, A_gpu.gpudata)
            assert np.sum(np.abs(A_gpu.get() - np.transpose(A_cpu))) == np.array(0, dtype=prec)
            A_gpu.gpudata.free()
Example #10
    def test_2d_fp_texturesLayered(self):
        orden = "F"
        npoints = 32

        for prec in [
                np.int16, np.float32, np.float64, np.complex64, np.complex128
        ]:
            prec_str = dtype_to_ctype(prec)
            if prec == np.complex64: fpName_str = 'fp_tex_cfloat'
            elif prec == np.complex128: fpName_str = 'fp_tex_cdouble'
            elif prec == np.float64: fpName_str = 'fp_tex_double'
            else: fpName_str = prec_str
            A_cpu = np.zeros([npoints, npoints], order=orden, dtype=prec)
            A_cpu[:] = np.random.rand(npoints, npoints)[:]
            A_gpu = gpuarray.zeros(A_cpu.shape, dtype=prec, order=orden)

            myKern = '''
            #include <pycuda-helpers.hpp>
            texture<fpName, cudaTextureType2DLayered, cudaReadModeElementType> mtx_tex;

            __global__ void copy_texture(cuPres *dest)
            {
              int row = blockIdx.x*blockDim.x + threadIdx.x;
              int col = blockIdx.y*blockDim.y + threadIdx.y;

              dest[row + col*blockDim.x*gridDim.x] = fp_tex2DLayered(mtx_tex, col, row, 1);
            }
            '''
            myKern = myKern.replace('fpName', fpName_str)
            myKern = myKern.replace('cuPres', prec_str)
            mod = SourceModule(myKern)

            copy_texture = mod.get_function("copy_texture")
            mtx_tex = mod.get_texref("mtx_tex")
            cuBlock = (16, 16, 1)
            if cuBlock[0] > npoints:
                cuBlock = (npoints, npoints, 1)
            cuGrid = (npoints // cuBlock[0] + 1 * (npoints % cuBlock[0] != 0),
                      npoints // cuBlock[1] + 1 * (npoints % cuBlock[1] != 0),
                      1)
            copy_texture.prepare('P', texrefs=[mtx_tex])
            cudaArray = drv.np_to_array(A_cpu, orden, allowSurfaceBind=True)
            mtx_tex.set_array(cudaArray)
            copy_texture.prepared_call(cuGrid, cuBlock, A_gpu.gpudata)
            assert np.sum(np.abs(A_gpu.get() -
                                 np.transpose(A_cpu))) == np.array(0,
                                                                   dtype=prec)
            A_gpu.gpudata.free()
Example #11
    def test_2d_fp_texturesLayered(self):
        orden = "F"
        npoints = 32

        for prec in [np.int16,np.float32,np.float64,np.complex64,np.complex128]:
            prec_str = dtype_to_ctype(prec)
            if prec == np.complex64: fpName_str = 'fp_tex_cfloat'
            elif prec == np.complex128: fpName_str = 'fp_tex_cdouble'
            elif prec == np.float64: fpName_str = 'fp_tex_double'
            else: fpName_str = prec_str
            A_cpu = np.zeros([npoints,npoints],order=orden,dtype=prec)
            A_cpu[:] = np.random.rand(npoints,npoints)[:]
            A_gpu = gpuarray.zeros(A_cpu.shape,dtype=prec,order=orden)

            myKern = '''
            #include <pycuda-helpers.hpp>
            texture<fpName, cudaTextureType2DLayered, cudaReadModeElementType> mtx_tex;

            __global__ void copy_texture(cuPres *dest)
            {
              int row = blockIdx.x*blockDim.x + threadIdx.x;
              int col = blockIdx.y*blockDim.y + threadIdx.y;

              dest[row + col*blockDim.x*gridDim.x] = fp_tex2DLayered(mtx_tex, col, row, 1);
            }
            '''
            myKern = myKern.replace('fpName',fpName_str)
            myKern = myKern.replace('cuPres',prec_str)
            mod = SourceModule(myKern)

            copy_texture = mod.get_function("copy_texture")
            mtx_tex = mod.get_texref("mtx_tex")
            cuBlock = (16,16,1)
            if cuBlock[0]>npoints:
                cuBlock = (npoints,npoints,1)
            cuGrid   = (npoints//cuBlock[0]+1*(npoints % cuBlock[0] != 0 ),npoints//cuBlock[1]+1*(npoints % cuBlock[1] != 0 ),1)
            copy_texture.prepare('P',texrefs=[mtx_tex])
            cudaArray = drv.np_to_array(A_cpu,orden,allowSurfaceBind=True)
            mtx_tex.set_array(cudaArray)
            copy_texture.prepared_call(cuGrid,cuBlock,A_gpu.gpudata)
            assert np.sum(np.abs(A_gpu.get()-np.transpose(A_cpu))) == np.array(0,dtype=prec)
            A_gpu.gpudata.free()
Example #12
def ndarray_to_float_tex(tex_ref, ndarray, address_mode=cuda.address_mode.BORDER, filter_mode=cuda.filter_mode.LINEAR):
    if isinstance(ndarray, np.ndarray):
        cu_array = cuda.np_to_array(ndarray, 'C')
    elif isinstance(ndarray, gpuarray.GPUArray):
        cu_array = cuda.gpuarray_to_array(ndarray, 'C')
    else:
        raise TypeError(
            'ndarray must be numpy.ndarray or pycuda.gpuarray.GPUArray')

    cuda.TextureReference.set_array(tex_ref, cu_array)

    cuda.TextureReference.set_address_mode(
        tex_ref, 0, address_mode)
    if ndarray.ndim >= 2:
        cuda.TextureReference.set_address_mode(
            tex_ref, 1, address_mode)
    if ndarray.ndim >= 3:
        cuda.TextureReference.set_address_mode(
            tex_ref, 2,  address_mode)
    cuda.TextureReference.set_filter_mode(
        tex_ref, filter_mode)
    tex_ref.set_flags(tex_ref.get_flags(
    ) & ~cuda.TRSF_NORMALIZED_COORDINATES & ~cuda.TRSF_READ_AS_INTEGER)
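A usage sketch for ndarray_to_float_tex above: because the helper accepts both NumPy arrays (via np_to_array) and GPUArrays (via gpuarray_to_array), either input below binds the same texture reference. The module and the name field_tex are invented.

# Hypothetical caller of ndarray_to_float_tex; names are illustrative only.
import numpy as np
import pycuda.autoinit
import pycuda.gpuarray as gpuarray
from pycuda.compiler import SourceModule

mod = SourceModule("""
texture<float, 2, cudaReadModeElementType> field_tex;
__global__ void dummy() { }
""")
tex_ref = mod.get_texref("field_tex")

host_field = np.random.rand(32, 32).astype(np.float32)
ndarray_to_float_tex(tex_ref, host_field)                 # from host memory

device_field = gpuarray.to_gpu(host_field)
ndarray_to_float_tex(tex_ref, device_field)               # from an existing GPUArray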
Example #13
    def __init__(self, \
                 gpu_ctx, \
                 eta0, hu0, hv0, H, \
                 nx, ny, \
                 dx, dy, dt, \
                 g, f, r, \
                 angle=np.array([[0]], dtype=np.float32), \
                 t=0.0, \
                 theta=1.3, rk_order=2, \
                 coriolis_beta=0.0, \
                 max_wind_direction_perturbation = 0, \
                 wind_stress=WindStress.WindStress(), \
                 boundary_conditions=Common.BoundaryConditions(), \
                 boundary_conditions_data=Common.BoundaryConditionsData(), \
                 small_scale_perturbation=False, \
                 small_scale_perturbation_amplitude=None, \
                 small_scale_perturbation_interpolation_factor = 1, \
                 model_time_step=None,
                 reportGeostrophicEquilibrium=False, \
                 use_lcg=False, \
                 write_netcdf=False, \
                 comm=None, \
                 netcdf_filename=None, \
                 ignore_ghostcells=False, \
                 courant_number=0.8, \
                 offset_x=0, offset_y=0, \
                 flux_slope_eps = 1.0e-1, \
                 desingularization_eps = 1.0e-1, \
                 depth_cutoff = 1.0e-5, \
                 block_width=32, block_height=8, num_threads_dt=256,
                 block_width_model_error=16, block_height_model_error=16):
        """
        Initialization routine
        eta0: Initial deviation from mean sea level incl ghost cells, (nx+2)*(ny+2) cells
        hu0: Initial momentum along x-axis incl ghost cells, (nx+1)*(ny+2) cells
        hv0: Initial momentum along y-axis incl ghost cells, (nx+2)*(ny+1) cells
        H: Depth from equilibrium defined on cell corners, (nx+5)*(ny+5) corners
        nx: Number of cells along x-axis
        ny: Number of cells along y-axis
        dx: Grid cell spacing along x-axis (20 000 m)
        dy: Grid cell spacing along y-axis (20 000 m)
        dt: Size of each timestep (90 s)
        g: Gravitational acceleration (9.81 m/s^2)
        f: Coriolis parameter (1.2e-4 s^-1), effectively as f = f + beta*y
        r: Bottom friction coefficient (2.4e-3 m/s)
        angle: Angle of rotation from North to y-axis
        t: Start simulation at time t
        theta: MINMOD theta used in the reconstructions of the derivatives in the numerical scheme
        rk_order: Order of Runge Kutta method {1,2*,3}
        coriolis_beta: Coriolis linear factor -> f = f + beta*(y-y_0)
        max_wind_direction_perturbation: Large-scale model error emulation by per-time-step perturbation of wind direction by +/- max_wind_direction_perturbation (degrees)
        wind_stress: Wind stress parameters
        boundary_conditions: Boundary condition object
        small_scale_perturbation: Boolean value for applying a stochastic model error
        small_scale_perturbation_amplitude: Amplitude (q0 coefficient) for model error
        small_scale_perturbation_interpolation_factor: Width factor for correlation in model error
        model_time_step: The size of a data assimilation model step (default same as dt)
        reportGeostrophicEquilibrium: Calculate the Geostrophic Equilibrium variables for each superstep
        use_lcg: Use LCG as the random number generator. Default is False, which means using curand.
        write_netcdf: Write the results after each superstep to a netCDF file
        comm: MPI communicator
        desingularization_eps: Used for desingularizing hu/h
        flux_slope_eps: Used for setting zero flux for symmetric Riemann fan
        depth_cutoff: Used for defining dry cells
        netcdf_filename: Use this filename. (If not defined, a filename will be generated by SimWriter.)
        """
               
        self.logger = logging.getLogger(__name__)

        assert( rk_order < 4 and rk_order > 0 ), "Only 1st, 2nd and 3rd order Runge Kutta supported"

        if (rk_order == 3):
            assert(r == 0.0), "3rd order Runge Kutta supported only without friction"
        
        # Sort out internally represented ghost_cells in the presence of given
        # boundary conditions
        ghost_cells_x = 2
        ghost_cells_y = 2
        
        #Coriolis at "first" cell
        x_zero_reference_cell = ghost_cells_x
        y_zero_reference_cell = ghost_cells_y # In order to pass it to the super constructor
        
        # Boundary conditions
        self.boundary_conditions = boundary_conditions
        if (boundary_conditions.isSponge()):
            nx = nx + boundary_conditions.spongeCells[1] + boundary_conditions.spongeCells[3] - 2*ghost_cells_x
            ny = ny + boundary_conditions.spongeCells[0] + boundary_conditions.spongeCells[2] - 2*ghost_cells_y
            
            x_zero_reference_cell += boundary_conditions.spongeCells[3]
            y_zero_reference_cell += boundary_conditions.spongeCells[2]

        #Compensate f for reference cell (first cell in internal of domain)
        north = np.array([np.sin(angle[0,0]), np.cos(angle[0,0])])
        f = f - coriolis_beta * (x_zero_reference_cell*dx*north[0] + y_zero_reference_cell*dy*north[1])
        
        x_zero_reference_cell = 0
        y_zero_reference_cell = 0
        
        A = None
        self.max_wind_direction_perturbation = max_wind_direction_perturbation
        super(CDKLM16, self).__init__(gpu_ctx, \
                                      nx, ny, \
                                      ghost_cells_x, \
                                      ghost_cells_y, \
                                      dx, dy, dt, \
                                      g, f, r, A, \
                                      t, \
                                      theta, rk_order, \
                                      coriolis_beta, \
                                      y_zero_reference_cell, \
                                      wind_stress, \
                                      write_netcdf, \
                                      ignore_ghostcells, \
                                      offset_x, offset_y, \
                                      comm, \
                                      block_width, block_height)
        
        # Index range for interior domain (north, east, south, west)
        # so that interior domain of eta is
        # eta[self.interior_domain_indices[2]:self.interior_domain_indices[0], \
        #     self.interior_domain_indices[3]:self.interior_domain_indices[1] ]
        self.interior_domain_indices = np.array([-2,-2,2,2])
        self._set_interior_domain_from_sponge_cells()
        
        defines={'block_width': block_width, 'block_height': block_height,
                   'KPSIMULATOR_DESING_EPS': str(desingularization_eps)+'f',
                   'KPSIMULATOR_FLUX_SLOPE_EPS': str(flux_slope_eps)+'f',
                   'KPSIMULATOR_DEPTH_CUTOFF': str(depth_cutoff)+'f'}
        
        #Get kernels
        self.kernel = gpu_ctx.get_kernel("CDKLM16_kernel.cu", 
                defines=defines, 
                compile_args={                          # default, fast_math, optimal
                    'options' : ["--ftz=true",          # false,   true,      true
                                 "--prec-div=false",    # true,    false,     false,
                                 "--prec-sqrt=false",   # true,    false,     false
                                 "--fmad=false"]        # true,    true,      false
                    
                    #'options': ["--use_fast_math"]
                    #'options': ["--generate-line-info"], 
                    #nvcc_options=["--maxrregcount=39"],
                    #'arch': "compute_50", 
                    #'code': "sm_50"
                },
                jit_compile_args={
                    #jit_options=[(cuda.jit_option.MAX_REGISTERS, 39)]
                }
                )
        
        # Get CUDA functions and define data types for prepared_{async_}call()
        self.cdklm_swe_2D = self.kernel.get_function("cdklm_swe_2D")
        self.cdklm_swe_2D.prepare("iiffffffffiiPiPiPiPiPiPiPiPiffi")
        self.update_wind_stress(self.kernel, self.cdklm_swe_2D)
        
        # CUDA functions for finding max time step size:
        self.num_threads_dt = num_threads_dt
        self.num_blocks_dt  = np.int32(self.global_size[0]*self.global_size[1])
        self.update_dt_kernels = gpu_ctx.get_kernel("max_dt.cu",
                defines={'block_width': block_width, 
                         'block_height': block_height,
                         'NUM_THREADS': self.num_threads_dt})
        self.per_block_max_dt_kernel = self.update_dt_kernels.get_function("per_block_max_dt")
        self.per_block_max_dt_kernel.prepare("iifffPiPiPiPifPi")
        self.max_dt_reduction_kernel = self.update_dt_kernels.get_function("max_dt_reduction")
        self.max_dt_reduction_kernel.prepare("iPP")
        
            
        # Bathymetry
        self.bathymetry = Common.Bathymetry(gpu_ctx, self.gpu_stream, nx, ny, ghost_cells_x, ghost_cells_y, H, boundary_conditions)
                
        # Adjust eta for possible dry states
        Hm = self.downloadBathymetry()[1]
        eta0 = np.maximum(eta0, -Hm)
        
        # Create data by uploading to device
        self.gpu_data = Common.SWEDataArakawaA(self.gpu_stream, nx, ny, ghost_cells_x, ghost_cells_y, eta0, hu0, hv0)

        # Allocate memory for calculating maximum timestep
        host_dt = np.zeros((self.global_size[1], self.global_size[0]), dtype=np.float32)
        self.device_dt = Common.CUDAArray2D(self.gpu_stream, self.global_size[0], self.global_size[1],
                                            0, 0, host_dt)
        host_max_dt_buffer = np.zeros((1,1), dtype=np.float32)
        self.max_dt_buffer = Common.CUDAArray2D(self.gpu_stream, 1, 1, 0, 0, host_max_dt_buffer)
        self.courant_number = courant_number
        
        ## Allocating memory for geostrophical equilibrium variables
        self.reportGeostrophicEquilibrium = np.int32(reportGeostrophicEquilibrium)
        self.geoEq_uxpvy = None
        self.geoEq_Kx = None
        self.geoEq_Ly = None
        if self.reportGeostrophicEquilibrium:
            dummy_zero_array = np.zeros((ny+2*ghost_cells_y, nx+2*ghost_cells_x), dtype=np.float32, order='C') 
            self.geoEq_uxpvy = Common.CUDAArray2D(self.gpu_stream, nx, ny, ghost_cells_x, ghost_cells_y, dummy_zero_array)
            self.geoEq_Kx = Common.CUDAArray2D(self.gpu_stream, nx, ny, ghost_cells_x, ghost_cells_y, dummy_zero_array)
            self.geoEq_Ly = Common.CUDAArray2D(self.gpu_stream, nx, ny, ghost_cells_x, ghost_cells_y, dummy_zero_array)

        self.constant_equilibrium_depth = np.max(H)
        
        self.bc_kernel = Common.BoundaryConditionsArakawaA(gpu_ctx, \
                                                           self.nx, \
                                                           self.ny, \
                                                           ghost_cells_x, \
                                                           ghost_cells_y, \
                                                           self.boundary_conditions, \
                                                           boundary_conditions_data, \
        )

        # Small scale perturbation:
        self.small_scale_perturbation = small_scale_perturbation
        self.small_scale_model_error = None
        self.small_scale_perturbation_interpolation_factor = small_scale_perturbation_interpolation_factor
        if small_scale_perturbation:
            if small_scale_perturbation_amplitude is None:
                self.small_scale_model_error = OceanStateNoise.OceanStateNoise.fromsim(self,
                                                                                       interpolation_factor=small_scale_perturbation_interpolation_factor,
                                                                                       use_lcg=use_lcg,
                                                                                       block_width=block_width_model_error, 
                                                                                       block_height=block_height_model_error)
            else:
                self.small_scale_model_error = OceanStateNoise.OceanStateNoise.fromsim(self, 
                                                                                       soar_q0=small_scale_perturbation_amplitude,
                                                                                       interpolation_factor=small_scale_perturbation_interpolation_factor,
                                                                                       use_lcg=use_lcg,
                                                                                       block_width=block_width_model_error, 
                                                                                       block_height=block_height_model_error)
        
        
        # Data assimilation model step size
        self.model_time_step = model_time_step
        if model_time_step is None:
            self.model_time_step = self.dt
        self.total_time_steps = 0
        
        
        if self.write_netcdf:
            self.sim_writer = SimWriter.SimNetCDFWriter(self, filename=netcdf_filename, ignore_ghostcells=self.ignore_ghostcells, \
                                    offset_x=self.offset_x, offset_y=self.offset_y)
                                    
                                    
        #Upload data to GPU and bind to texture reference
        self.angle_texref = self.kernel.get_texref("angle_tex")
        self.angle_texref.set_array(cuda.np_to_array(np.ascontiguousarray(angle, dtype=np.float32), order="C"))
                    
        # Set texture parameters
        self.angle_texref.set_filter_mode(cuda.filter_mode.LINEAR) #bilinear interpolation
        self.angle_texref.set_address_mode(0, cuda.address_mode.CLAMP) #no indexing outside domain
        self.angle_texref.set_address_mode(1, cuda.address_mode.CLAMP)
        self.angle_texref.set_flags(cuda.TRSF_NORMALIZED_COORDINATES) #Use [0, 1] indexing
Example #14
    def __init__(self, \
                 gpu_ctx, \
                 eta0, hu0, hv0, H, \
                 nx, ny, \
                 dx, dy, dt, \
                 g, f, r, \
                 subsample_f=10, \
                 angle=np.array([[0]], dtype=np.float32), \
                 subsample_angle=10, \
                 latitude=None, \
                 t=0.0, \
                 theta=1.3, rk_order=2, \
                 coriolis_beta=0.0, \
                 max_wind_direction_perturbation = 0, \
                 wind_stress=WindStress.WindStress(), \
                 boundary_conditions=Common.BoundaryConditions(), \
                 boundary_conditions_data=Common.BoundaryConditionsData(), \
                 small_scale_perturbation=False, \
                 small_scale_perturbation_amplitude=None, \
                 small_scale_perturbation_interpolation_factor = 1, \
                 model_time_step=None,
                 reportGeostrophicEquilibrium=False, \
                 use_lcg=False, \
                 write_netcdf=False, \
                 comm=None, \
                 local_particle_id=0, \
                 super_dir_name=None, \
                 netcdf_filename=None, \
                 ignore_ghostcells=False, \
                 courant_number=0.8, \
                 offset_x=0, offset_y=0, \
                 flux_slope_eps = 1.0e-1, \
                 desingularization_eps = 1.0e-1, \
                 depth_cutoff = 1.0e-5, \
                 block_width=12, block_height=32, num_threads_dt=256,
                 block_width_model_error=16, block_height_model_error=16):
        """
        Initialization routine
        eta0: Initial deviation from mean sea level incl ghost cells, (nx+2)*(ny+2) cells
        hu0: Initial momentum along x-axis incl ghost cells, (nx+1)*(ny+2) cells
        hv0: Initial momentum along y-axis incl ghost cells, (nx+2)*(ny+1) cells
        H: Depth from equilibrium defined on cell corners, (nx+5)*(ny+5) corners
        nx: Number of cells along x-axis
        ny: Number of cells along y-axis
        dx: Grid cell spacing along x-axis (20 000 m)
        dy: Grid cell spacing along y-axis (20 000 m)
        dt: Size of each timestep (90 s)
        g: Gravitational acceleration (9.81 m/s^2)
        f: Coriolis parameter (1.2e-4 s^-1), effectively as f = f + beta*y
        r: Bottom friction coefficient (2.4e-3 m/s)
        subsample_f: Subsample the coriolis f when creating texture by factor
        angle: Angle of rotation from North to y-axis as a texture (cuda.Array) or numpy array (in radians)
        subsample_angle: Subsample the angles given as input when creating texture by factor
        latitude: Specify latitude. This will override any f and beta plane already set (in radians)
        t: Start simulation at time t
        theta: MINMOD theta used in the reconstructions of the derivatives in the numerical scheme
        rk_order: Order of Runge Kutta method {1,2*,3}
        coriolis_beta: Coriolis linear factor -> f = f + beta*(y-y_0)
        max_wind_direction_perturbation: Large-scale model error emulation by per-time-step perturbation of wind direction by +/- max_wind_direction_perturbation (degrees)
        wind_stress: Wind stress parameters
        boundary_conditions: Boundary condition object
        small_scale_perturbation: Boolean value for applying a stochastic model error
        small_scale_perturbation_amplitude: Amplitude (q0 coefficient) for model error
        small_scale_perturbation_interpolation_factor: Width factor for correlation in model error
        model_time_step: The size of a data assimilation model step (default same as dt)
        reportGeostrophicEquilibrium: Calculate the Geostrophic Equilibrium variables for each superstep
        use_lcg: Use LCG as the random number generator. Default is False, which means using curand.
        write_netcdf: Write the results after each superstep to a netCDF file
        comm: MPI communicator
        local_particle_id: Local (for each MPI process) particle id
        desingularization_eps: Used for desingularizing hu/h
        flux_slope_eps: Used for setting zero flux for symmetric Riemann fan
        depth_cutoff: Used for defining dry cells
        super_dir_name: Directory to write netcdf files to
        netcdf_filename: Use this filename. (If not defined, a filename will be generated by SimWriter.)
        """

        self.logger = logging.getLogger(__name__)

        assert (rk_order < 4 and rk_order > 0
                ), "Only 1st, 2nd and 3rd order Runge Kutta supported"

        if (rk_order == 3):
            assert (r == 0.0
                    ), "3rd order Runge Kutta supported only without friction"

        # Sort out internally represented ghost_cells in the presence of given
        # boundary conditions
        ghost_cells_x = 2
        ghost_cells_y = 2

        #Coriolis at "first" cell
        x_zero_reference_cell = ghost_cells_x
        y_zero_reference_cell = ghost_cells_y  # In order to pass it to the super constructor

        # Boundary conditions
        self.boundary_conditions = boundary_conditions

        #Compensate f for reference cell (first cell in internal of domain)
        north = np.array([np.sin(angle[0, 0]), np.cos(angle[0, 0])])
        f = f - coriolis_beta * (x_zero_reference_cell * dx * north[0] +
                                 y_zero_reference_cell * dy * north[1])

        x_zero_reference_cell = 0
        y_zero_reference_cell = 0

        A = None
        self.max_wind_direction_perturbation = max_wind_direction_perturbation
        super(CDKLM16, self).__init__(gpu_ctx, \
                                      nx, ny, \
                                      ghost_cells_x, \
                                      ghost_cells_y, \
                                      dx, dy, dt, \
                                      g, f, r, A, \
                                      t, \
                                      theta, rk_order, \
                                      coriolis_beta, \
                                      y_zero_reference_cell, \
                                      wind_stress, \
                                      write_netcdf, \
                                      ignore_ghostcells, \
                                      offset_x, offset_y, \
                                      comm, \
                                      block_width, block_height,
                                      local_particle_id=local_particle_id)

        # Index range for interior domain (north, east, south, west)
        # so that interior domain of eta is
        # eta[self.interior_domain_indices[2]:self.interior_domain_indices[0], \
        #     self.interior_domain_indices[3]:self.interior_domain_indices[1] ]
        self.interior_domain_indices = np.array([-2, -2, 2, 2])

        defines = {
            'block_width': block_width,
            'block_height': block_height,
            'KPSIMULATOR_DESING_EPS': "{:.12f}f".format(desingularization_eps),
            'KPSIMULATOR_FLUX_SLOPE_EPS': "{:.12f}f".format(flux_slope_eps),
            'KPSIMULATOR_DEPTH_CUTOFF': "{:.12f}f".format(depth_cutoff),
            'THETA': "{:.12f}f".format(self.theta),
            'RK_ORDER': int(self.rk_order),
            'NX': int(self.nx),
            'NY': int(self.ny),
            'DX': "{:.12f}f".format(self.dx),
            'DY': "{:.12f}f".format(self.dy),
            'GRAV': "{:.12f}f".format(self.g),
            'FRIC': "{:.12f}f".format(self.r)
        }

        #Get kernels
        self.kernel = gpu_ctx.get_kernel(
            "CDKLM16_kernel.cu",
            defines=defines,
            compile_args={  # default, fast_math, optimal
                'options': [
                    "--ftz=true",  # false,   true,      true
                    "--prec-div=false",  # true,    false,     false,
                    "--prec-sqrt=false",  # true,    false,     false
                    "--fmad=false"
                ]  # true,    true,      false

                #'options': ["--use_fast_math"]
                #'options': ["--generate-line-info"],
                #nvcc_options=["--maxrregcount=39"],
                #'arch': "compute_50",
                #'code': "sm_50"
            },
            jit_compile_args={
                #jit_options=[(cuda.jit_option.MAX_REGISTERS, 39)]
            })

        # Get CUDA functions and define data types for prepared_{async_}call()
        self.cdklm_swe_2D = self.kernel.get_function("cdklm_swe_2D")
        self.cdklm_swe_2D.prepare("fiPiPiPiPiPiPiPiPiffi")
        self.update_wind_stress(self.kernel, self.cdklm_swe_2D)

        # CUDA functions for finding max time step size:
        self.num_threads_dt = num_threads_dt
        self.num_blocks_dt = np.int32(self.global_size[0] *
                                      self.global_size[1])
        self.update_dt_kernels = gpu_ctx.get_kernel("max_dt.cu",
                                                    defines={
                                                        'block_width':
                                                        block_width,
                                                        'block_height':
                                                        block_height,
                                                        'NUM_THREADS':
                                                        self.num_threads_dt
                                                    })
        self.per_block_max_dt_kernel = self.update_dt_kernels.get_function(
            "per_block_max_dt")
        self.per_block_max_dt_kernel.prepare("iifffPiPiPiPifPi")
        self.max_dt_reduction_kernel = self.update_dt_kernels.get_function(
            "max_dt_reduction")
        self.max_dt_reduction_kernel.prepare("iPP")

        # Bathymetry
        self.bathymetry = Common.Bathymetry(gpu_ctx, self.gpu_stream, nx, ny,
                                            ghost_cells_x, ghost_cells_y, H,
                                            boundary_conditions)

        # Adjust eta for possible dry states
        Hm = self.downloadBathymetry()[1]
        eta0 = np.maximum(eta0, -Hm)

        # Create data by uploading to device
        self.gpu_data = Common.SWEDataArakawaA(self.gpu_stream, nx, ny,
                                               ghost_cells_x, ghost_cells_y,
                                               eta0, hu0, hv0)

        # Allocate memory for calculating maximum timestep
        host_dt = np.zeros((self.global_size[1], self.global_size[0]),
                           dtype=np.float32)
        self.device_dt = Common.CUDAArray2D(self.gpu_stream,
                                            self.global_size[0],
                                            self.global_size[1], 0, 0, host_dt)
        host_max_dt_buffer = np.zeros((1, 1), dtype=np.float32)
        self.max_dt_buffer = Common.CUDAArray2D(self.gpu_stream, 1, 1, 0, 0,
                                                host_max_dt_buffer)
        self.courant_number = courant_number

        ## Allocating memory for geostrophical equilibrium variables
        self.reportGeostrophicEquilibrium = np.int32(
            reportGeostrophicEquilibrium)
        self.geoEq_uxpvy = None
        self.geoEq_Kx = None
        self.geoEq_Ly = None
        if self.reportGeostrophicEquilibrium:
            dummy_zero_array = np.zeros(
                (ny + 2 * ghost_cells_y, nx + 2 * ghost_cells_x),
                dtype=np.float32,
                order='C')
            self.geoEq_uxpvy = Common.CUDAArray2D(self.gpu_stream, nx, ny,
                                                  ghost_cells_x, ghost_cells_y,
                                                  dummy_zero_array)
            self.geoEq_Kx = Common.CUDAArray2D(self.gpu_stream, nx, ny,
                                               ghost_cells_x, ghost_cells_y,
                                               dummy_zero_array)
            self.geoEq_Ly = Common.CUDAArray2D(self.gpu_stream, nx, ny,
                                               ghost_cells_x, ghost_cells_y,
                                               dummy_zero_array)

        self.constant_equilibrium_depth = np.max(H)

        self.bc_kernel = Common.BoundaryConditionsArakawaA(gpu_ctx, \
                                                           self.nx, \
                                                           self.ny, \
                                                           ghost_cells_x, \
                                                           ghost_cells_y, \
                                                           self.boundary_conditions, \
                                                           boundary_conditions_data, \
        )

        def subsample_texture(data, factor):
            ny, nx = data.shape
            dx, dy = 1 / nx, 1 / ny
            I = interp2d(np.linspace(0.5 * dx, 1 - 0.5 * dx, nx),
                         np.linspace(0.5 * dy, 1 - 0.5 * dy, ny),
                         data,
                         kind='linear')

            new_nx, new_ny = max(2, nx // factor), max(2, ny // factor)
            new_dx, new_dy = 1 / new_nx, 1 / new_ny
            x_new = np.linspace(0.5 * new_dx, 1 - 0.5 * new_dx, new_nx)
            y_new = np.linspace(0.5 * new_dy, 1 - 0.5 * new_dy, new_ny)
            return I(x_new, y_new)

        # Texture for angle
        self.angle_texref = self.kernel.get_texref("angle_tex")
        if isinstance(angle, cuda.Array):
            # angle is already a texture, so we just set the texture reference
            self.angle_texref.set_array(angle)
        else:
            #Upload data to GPU and bind to texture reference
            if (subsample_angle and angle.size >= eta0.size):
                self.logger.info("Subsampling angle texture by factor " +
                                 str(subsample_angle))
                self.logger.warning(
                    "This will give inaccurate angle along the border!")
                angle = subsample_texture(angle, subsample_angle)

            self.angle_texref.set_array(
                cuda.np_to_array(np.ascontiguousarray(angle, dtype=np.float32),
                                 order="C"))

        # Set texture parameters
        self.angle_texref.set_filter_mode(
            cuda.filter_mode.LINEAR)  #bilinear interpolation
        self.angle_texref.set_address_mode(
            0, cuda.address_mode.CLAMP)  #no indexing outside domain
        self.angle_texref.set_address_mode(1, cuda.address_mode.CLAMP)
        self.angle_texref.set_flags(
            cuda.TRSF_NORMALIZED_COORDINATES)  #Use [0, 1] indexing

        # Texture for coriolis f
        self.coriolis_texref = self.kernel.get_texref("coriolis_f_tex")

        # Create the CPU coriolis
        if (latitude is not None):
            if (self.f != 0.0):
                raise RuntimeError(
                    "Cannot specify both latitude and f. Make your mind up.")
            coriolis_f, _ = OceanographicUtilities.calcCoriolisParams(latitude)
            coriolis_f = coriolis_f.astype(np.float32)
        else:
            if (self.coriolis_beta != 0.0):
                if (angle.size != 1):
                    raise RuntimeError(
                        "non-constant angle cannot be combined with beta plane model (makes no sense)"
                    )
                #Generate coordinates for all cells, including ghost cells from center to center
                # [-3/2dx, nx+3/2dx] for ghost_cells_x == 2
                x = np.linspace((-self.ghost_cells_x + 0.5) * self.dx,
                                (self.nx + self.ghost_cells_x - 0.5) * self.dx,
                                self.nx + 2 * self.ghost_cells_x)
                y = np.linspace((-self.ghost_cells_y + 0.5) * self.dy,
                                (self.ny + self.ghost_cells_y - 0.5) * self.dy,
                                self.ny + 2 * self.ghost_cells_x)
                self.logger.info(
                    "Using latitude to create Coriolis f texture ({:f}x{:f} cells)"
                    .format(x.size, y.size))
                x, y = np.meshgrid(x, y)
                n = x * np.sin(angle[0, 0]) + y * np.cos(
                    angle[0, 0])  #North vector
                coriolis_f = self.f + self.coriolis_beta * n
            else:
                if (self.f.size == 1):
                    coriolis_f = np.array([[self.f]], dtype=np.float32)
                elif (self.f.shape == eta0.shape):
                    coriolis_f = np.array(self.f, dtype=np.float32)
                else:
                    raise RuntimeError(
                        "The shape of f should match up with eta or be scalar."
                    )

        if (subsample_f and coriolis_f.size >= eta0.size):
            self.logger.info("Subsampling coriolis texture by factor " +
                             str(subsample_f))
            self.logger.warning(
                "This will give inaccurate coriolis along the border!")
            coriolis_f = subsample_texture(coriolis_f, subsample_f)

        #Upload data to GPU and bind to texture reference
        self.coriolis_texref.set_array(
            cuda.np_to_array(np.ascontiguousarray(coriolis_f,
                                                  dtype=np.float32),
                             order="C"))

        # Set texture parameters
        self.coriolis_texref.set_filter_mode(
            cuda.filter_mode.LINEAR)  #bilinear interpolation
        self.coriolis_texref.set_address_mode(
            0, cuda.address_mode.CLAMP)  #no indexing outside domain
        self.coriolis_texref.set_address_mode(1, cuda.address_mode.CLAMP)
        self.coriolis_texref.set_flags(
            cuda.TRSF_NORMALIZED_COORDINATES)  #Use [0, 1] indexing

        # Small scale perturbation:
        self.small_scale_perturbation = small_scale_perturbation
        self.small_scale_model_error = None
        self.small_scale_perturbation_interpolation_factor = small_scale_perturbation_interpolation_factor
        if small_scale_perturbation:
            self.small_scale_model_error = OceanStateNoise.OceanStateNoise.fromsim(
                self,
                soar_q0=small_scale_perturbation_amplitude,
                interpolation_factor=
                small_scale_perturbation_interpolation_factor,
                use_lcg=use_lcg,
                block_width=block_width_model_error,
                block_height=block_height_model_error)

        # Data assimilation model step size
        self.model_time_step = model_time_step
        self.total_time_steps = 0
        if model_time_step is None:
            self.model_time_step = self.dt

        if self.write_netcdf:
            self.sim_writer = SimWriter.SimNetCDFWriter(self, super_dir_name=super_dir_name, filename=netcdf_filename, \
                                            ignore_ghostcells=self.ignore_ghostcells, offset_x=self.offset_x, offset_y=self.offset_y)

        # Update timestep if dt is given as zero
        if self.dt <= 0:
            self.updateDt()
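The subsample_texture helper in Example #14 downsamples the angle and Coriolis fields by linear interpolation between cell centres on a normalized [0, 1] grid. It relies on scipy.interpolate.interp2d, which has been deprecated and removed in recent SciPy releases; the sketch below is an assumed, roughly equivalent reformulation with RegularGridInterpolator, not the authors' code.

# Assumed alternative to subsample_texture using RegularGridInterpolator
# (interp2d is no longer available in recent SciPy releases).
import numpy as np
from scipy.interpolate import RegularGridInterpolator

def subsample_texture_rgi(data, factor):
    ny, nx = data.shape
    dx, dy = 1 / nx, 1 / ny
    x = np.linspace(0.5 * dx, 1 - 0.5 * dx, nx)
    y = np.linspace(0.5 * dy, 1 - 0.5 * dy, ny)
    interp = RegularGridInterpolator((y, x), data, method='linear')

    new_nx, new_ny = max(2, nx // factor), max(2, ny // factor)
    new_dx, new_dy = 1 / new_nx, 1 / new_ny
    x_new = np.linspace(0.5 * new_dx, 1 - 0.5 * new_dx, new_nx)
    y_new = np.linspace(0.5 * new_dy, 1 - 0.5 * new_dy, new_ny)
    yy, xx = np.meshgrid(y_new, x_new, indexing='ij')
    return interp(np.stack([yy, xx], axis=-1))   # shape (new_ny, new_nx)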
Example #15
    def __init__(self,
                 gpu_ctx,
                 gpu_stream,
                 nx,
                 ny,
                 dx,
                 dy,
                 boundaryConditions,
                 staggered,
                 soar_q0=None,
                 soar_L=None,
                 interpolation_factor=1,
                 use_lcg=False,
                 angle=np.array([[0]], dtype=np.float32),
                 coriolis_f=np.array([[0]], dtype=np.float32),
                 block_width=16,
                 block_height=16):
        """
        Initiates a class that generates small scale geostrophically balanced perturbations of
        the ocean state.
        (nx, ny): number of internal grid cells in the domain
        (dx, dy): size of each grid cell
        soar_q0: amplitude parameter for the perturbation, default: dx*1e-5
        soar_L: length scale of the perturbation covariance, default: 0.75*dx*interpolation_factor
        interpolation_factor: indicates that the perturbation of eta should be generated on a coarse mesh, 
            and then interpolated down to the computational mesh. The coarse mesh will then have
            (nx/interpolation_factor, ny/interpolation_factor) grid cells.
        use_lcg: LCG is a linear algorithm for generating a series of pseudo-random numbers
        angle: Angle of rotation from North to y-axis as a texture (cuda.Array) or numpy array
        (block_width, block_height): The size of each GPU block
        """

        self.use_lcg = use_lcg

        # Set numpy random state
        self.random_state = np.random.RandomState()

        # Make sure that all variables initialized within ifs are defined
        self.random_numbers = None
        self.rng = None
        self.seed = None
        self.host_seed = None

        self.gpu_ctx = gpu_ctx
        self.gpu_stream = gpu_stream

        self.nx = np.int32(nx)
        self.ny = np.int32(ny)
        self.dx = np.float32(dx)
        self.dy = np.float32(dy)
        self.staggered = np.int(0)
        if staggered:
            self.staggered = np.int(1)

        # The cutoff parameter is hard-coded.
        # The size of the cutoff determines the computational radius in the
        # SOAR function. Hence, the size of the local memory in the OpenCL
        # kernels has to be hard-coded.
        self.cutoff = np.int32(config.soar_cutoff)

        # Check that the interpolation factor plays well with the grid size:
        assert (interpolation_factor > 0 and interpolation_factor % 2
                == 1), 'interpolation_factor must be a positive odd integer'

        assert (nx % interpolation_factor == 0
                ), 'nx must be divisible by the interpolation factor'
        assert (ny % interpolation_factor == 0
                ), 'ny must be divisible by the interpolation factor'
        self.interpolation_factor = np.int32(interpolation_factor)

        # The size of the coarse grid
        self.coarse_nx = np.int32(nx / self.interpolation_factor)
        self.coarse_ny = np.int32(ny / self.interpolation_factor)
        self.coarse_dx = np.float32(dx * self.interpolation_factor)
        self.coarse_dy = np.float32(dy * self.interpolation_factor)

        self.periodicNorthSouth = np.int32(
            boundaryConditions.isPeriodicNorthSouth())
        self.periodicEastWest = np.int32(
            boundaryConditions.isPeriodicEastWest())

        # Size of random field and seed
        # The SOAR function is a stencil which requires cutoff number of grid cells,
        # and the interpolation operator requires further 2 ghost cell values in each direction.
        # The random field must therefore be created with 2 + cutoff number of ghost cells.
        self.rand_ghost_cells_x = np.int32(2 + self.cutoff)
        self.rand_ghost_cells_y = np.int32(2 + self.cutoff)
        if self.periodicEastWest:
            self.rand_ghost_cells_x = np.int32(0)
        if self.periodicNorthSouth:
            self.rand_ghost_cells_y = np.int32(0)
        self.rand_nx = np.int32(self.coarse_nx + 2 * self.rand_ghost_cells_x)
        self.rand_ny = np.int32(self.coarse_ny + 2 * self.rand_ghost_cells_y)

        # Since normally distributed numbers are generated in pairs, we need to store half the number
        # of seed values compared to the number of random numbers.
        self.seed_ny = np.int32(self.rand_ny)
        self.seed_nx = np.int32(np.ceil(self.rand_nx / 2))

        # Generate seed:
        self.floatMax = 2147483648.0
        if self.use_lcg:
            self.host_seed = self.random_state.rand(
                self.seed_ny, self.seed_nx) * self.floatMax
            self.host_seed = self.host_seed.astype(np.uint64, order='C')

        if not self.use_lcg:
            self.rng = XORWOWRandomNumberGenerator()
        else:
            self.seed = Common.CUDAArray2D(gpu_stream,
                                           self.seed_nx,
                                           self.seed_ny,
                                           0,
                                           0,
                                           self.host_seed,
                                           double_precision=True,
                                           integers=True)

        # Constants for the SOAR function:
        self.soar_q0 = np.float32(self.dx / 100000)
        if soar_q0 is not None:
            self.soar_q0 = np.float32(soar_q0)

        self.soar_L = np.float32(0.75 * self.coarse_dx)
        if soar_L is not None:
            self.soar_L = np.float32(soar_L)

        # Allocate memory for random numbers (xi)
        self.random_numbers_host = np.zeros((self.rand_ny, self.rand_nx),
                                            dtype=np.float32,
                                            order='C')
        self.random_numbers = Common.CUDAArray2D(self.gpu_stream, self.rand_nx,
                                                 self.rand_ny, 0, 0,
                                                 self.random_numbers_host)

        # Allocate a second buffer for random numbers (nu)
        self.perpendicular_random_numbers_host = np.zeros(
            (self.rand_ny, self.rand_nx), dtype=np.float32, order='C')
        self.perpendicular_random_numbers = Common.CUDAArray2D(
            self.gpu_stream, self.rand_nx, self.rand_ny, 0, 0,
            self.perpendicular_random_numbers_host)

        # Allocate memory for coarse buffer if needed
        # Two ghost cells in each direction needed for bicubic interpolation
        self.coarse_buffer_host = np.zeros(
            (self.coarse_ny + 4, self.coarse_nx + 4),
            dtype=np.float32,
            order='C')
        self.coarse_buffer = Common.CUDAArray2D(self.gpu_stream,
                                                self.coarse_nx, self.coarse_ny,
                                                2, 2, self.coarse_buffer_host)

        # Allocate extra memory needed for reduction kernels.
        # Currently: A single GPU buffer with 3x1 elements: [xi^T * xi, nu^T * nu, xi^T * nu]
        self.reduction_buffer = None
        reduction_buffer_host = np.zeros((1, 3), dtype=np.float32)
        self.reduction_buffer = Common.CUDAArray2D(self.gpu_stream, 3, 1, 0, 0,
                                                   reduction_buffer_host)
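        # These three inner products are, e.g., what a Gram-Schmidt step needs in order to
        # make nu orthogonal to xi (presumably the role of the makePerpendicular kernel below):
        #     nu_perp = nu - (xi^T nu / xi^T xi) * xi,
        # optionally rescaled using nu^T nu so that the variance of the field is preserved.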

        # Generate kernels
        self.kernels = gpu_ctx.get_kernel("ocean_noise.cu", \
                                          defines={'block_width': block_width, 'block_height': block_height},
                                          compile_args={
                                              'options': ["--use_fast_math",
                                                          "--maxrregcount=32"]
                                          })

        self.reduction_kernels = self.gpu_ctx.get_kernel("reductions.cu", \
                                                         defines={})

        # Get CUDA functions and define data types for prepared_{async_}call()
        # Generate kernels
        self.squareSumKernel = self.reduction_kernels.get_function("squareSum")
        self.squareSumKernel.prepare("iiPP")

        self.squareSumDoubleKernel = self.reduction_kernels.get_function(
            "squareSumDouble")
        self.squareSumDoubleKernel.prepare("iiPPP")

        self.makePerpendicularKernel = self.kernels.get_function(
            "makePerpendicular")
        self.makePerpendicularKernel.prepare("iiPiPiP")

        self.uniformDistributionKernel = self.kernels.get_function(
            "uniformDistribution")
        self.uniformDistributionKernel.prepare("iiiPiPi")

        self.normalDistributionKernel = None
        if self.use_lcg:
            self.normalDistributionKernel = self.kernels.get_function(
                "normalDistribution")
            self.normalDistributionKernel.prepare("iiiPiPi")

        self.soarKernel = self.kernels.get_function("SOAR")
        self.soarKernel.prepare("iifffffiiPiPii")

        self.geostrophicBalanceKernel = self.kernels.get_function(
            "geostrophicBalance")
        self.geostrophicBalanceKernel.prepare("iiffiiffffPiPiPiPiPif")

        self.bicubicInterpolationKernel = self.kernels.get_function(
            "bicubicInterpolation")
        self.bicubicInterpolationKernel.prepare(
            "iiiiffiiiiffiiffffPiPiPiPiPif")

        #Compute kernel launch parameters
        self.local_size = (block_width, block_height, 1)

        self.local_size_reductions = (128, 1, 1)
        self.global_size_reductions = (1, 1)

        # Launch one thread for each seed, which in turn generates two i.i.d. N(0,1) numbers
        self.global_size_random_numbers = ( \
                       int(np.ceil(self.seed_nx / float(self.local_size[0]))), \
                       int(np.ceil(self.seed_ny / float(self.local_size[1]))) \
                     )

        # Launch one thread for each random number (in order to create perpendicular random numbers)
        self.global_size_perpendicular = ( \
                      int(np.ceil(self.rand_nx / float(self.local_size[0]))), \
                      int(np.ceil(self.rand_ny / float(self.local_size[1]))) \
                     )

        # Launch one thread per SOAR-correlated result - need to write to two ghost
        # cells in order to do bicubic interpolation based on the result
        self.global_size_SOAR = ( \
                     int(np.ceil( (self.coarse_nx+4)/float(self.local_size[0]))), \
                     int(np.ceil( (self.coarse_ny+4)/float(self.local_size[1]))) \
                    )

        # One thread per resulting perturbed grid cell
        self.global_size_geo_balance = ( \
                    int(np.ceil( (self.nx)/float(self.local_size[0]))), \
                    int(np.ceil( (self.ny)/float(self.local_size[1]))) \
                   )
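        # Illustrative example: with block_width = block_height = 16 and the sizes from the
        # nx = ny = 60 example above (seed grid 14 x 28, random field 28 x 28, coarse grid 20 x 20):
        #   global_size_random_numbers = (1, 2)
        #   global_size_perpendicular  = (2, 2)
        #   global_size_SOAR           = (2, 2)   (covers the (20+4) x (20+4) coarse cells)
        #   global_size_geo_balance    = (4, 4)   (one thread per fine-grid cell, 60 x 60)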

        # Texture for coriolis field
        self.coriolis_texref = self.kernels.get_texref("coriolis_f_tex")
        if isinstance(coriolis_f, cuda.Array):
            # coriolis_f is already a texture, so we just set the reference
            self.coriolis_texref.set_array(coriolis_f)
        else:
            #Upload data to GPU and bind to texture reference
            self.coriolis_texref.set_array(
                cuda.np_to_array(np.ascontiguousarray(coriolis_f,
                                                      dtype=np.float32),
                                 order="C"))

        # Set texture parameters
        self.coriolis_texref.set_filter_mode(
            cuda.filter_mode.LINEAR)  #bilinear interpolation
        self.coriolis_texref.set_address_mode(
            0, cuda.address_mode.CLAMP)  #no indexing outside domain
        self.coriolis_texref.set_address_mode(1, cuda.address_mode.CLAMP)
        self.coriolis_texref.set_flags(
            cuda.TRSF_NORMALIZED_COORDINATES)  #Use [0, 1] indexing
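        # With normalized coordinates, a kernel would typically sample the centre of cell
        # (i, j) of an nx x ny field as tex2D(coriolis_f_tex, (i + 0.5f)/nx, (j + 0.5f)/ny),
        # and the CLAMP address mode makes out-of-range lookups return the boundary value.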
        # FIXME! Allow different versions of coriolis, similar to CDKLM

        # Texture for angle towards north
        self.angle_texref = self.kernels.get_texref("angle_tex")
        if isinstance(angle, cuda.Array):
            # angle is already a texture, so we just set the reference
            self.angle_texref.set_array(angle)
        else:
            #Upload data to GPU and bind to texture reference
            self.angle_texref.set_array(
                cuda.np_to_array(np.ascontiguousarray(angle, dtype=np.float32),
                                 order="C"))

        # Set texture parameters
        self.angle_texref.set_filter_mode(
            cuda.filter_mode.LINEAR)  #bilinear interpolation
        self.angle_texref.set_address_mode(
            0, cuda.address_mode.CLAMP)  #no indexing outside domain
        self.angle_texref.set_address_mode(1, cuda.address_mode.CLAMP)
        self.angle_texref.set_flags(
            cuda.TRSF_NORMALIZED_COORDINATES)  #Use [0, 1] indexing
Example #16
0
mod = SourceModule(source_file %
                   {"NUMBER_OF_GENERATORS": output_width * output_height},
                   no_extern_c=True,
                   include_dirs=[os.getcwd()])
init_rand_num_generators = mod.get_function("initRandNumGenerators")
render = mod.get_function("render")

#Settings for the volume density texture.
texture = mod.get_texref("gVolumeTexture")
texture.set_address_mode(0, cuda.address_mode.BORDER)
texture.set_address_mode(1, cuda.address_mode.BORDER)
texture.set_address_mode(2, cuda.address_mode.BORDER)
texture.set_filter_mode(cuda.filter_mode.LINEAR)
texture.set_format(cuda.array_format.FLOAT, 1)
texture.set_flags(cuda.TRSF_NORMALIZED_COORDINATES)
texture.set_array(cuda.np_to_array(volume_data, order="F"))

output = np.zeros((output_height, output_width, 3)).astype(np.float32)

init_rand_num_generators(block=(block_size * block_size, 1, 1),
                         grid=((output_width * output_height) //
                               (block_size * block_size) + 1, 1, 1))

#GUI
window = tk.Tk()
canvas = tk.Canvas(window, width=output_width, height=output_height)
canvas.pack()
image_on_canvas = canvas.create_image(0, 0, anchor=tk.NW)

for sample_no in range(1, number_of_samples + 1):
    render(cuda.InOut(output),
Example #17
0
    def __init__(self, vol, img, ray_step_mm=None, render_op='drr'):
        source = """
        #include "cuda_math.h"

        //------------------------------ DATA STRUCTURES -------------------------------	
        struct sRenderParams
        {
            // 2D detector data
            uint2 sizes2D;
            float2 steps2D;
            
            // 3D image data
            uint3 sizes3D; float1 __padding1;
            float3 steps3D; float1 __padding2;
            float3 boxmin; float1 __padding3;
            float3 boxmax; float1 __padding4;
            
            // Source position
            float3 ray_org; float1 __padding5;
        
            // Step along rays
            float1 ray_step, __padding6;
            
            // Transformation from 2D image to 2D plane in WCS 
            float T2D[16];
        };
            
        //-------------------------------- DEVICE CODE ---------------------------------	
        // Device variables
        extern "C" {
        texture<float, cudaTextureType3D, cudaReadModeElementType> d_tex;
        }
            
        // Intersect ray with a 3D volume:
        // see https://wiki.aalto.fi/download/attachments/40023967/ 
        // gpgpu.pdf?version=1&modificationDate=1265652539000
        __device__
        int intersectBox(
            float3 ray_org, float3 raydir, 
            sRenderParams *d_params, 
            float *tnear, float *tfar )
        {							    
            // Compute intersection of ray with all six bbox planes
            float3 invR = make_float3(1.0f) / raydir;
            float3 tbot = invR * (d_params->boxmin - ray_org);
            float3 ttop = invR * (d_params->boxmax - ray_org);	
        
            // Re-order intersections to find smallest and largest on each axis
            float3 tmin = fminf(ttop, tbot);
            float3 tmax = fmaxf(ttop, tbot);
        
            // Find the largest tmin and the smallest tmax
            float largest_tmin = fmaxf(fmaxf(tmin.x, tmin.y), fmaxf(tmin.x, tmin.z));
            float smallest_tmax = fminf(fminf(tmax.x, tmax.y), fminf(tmax.x, tmax.z));
        
            *tnear = largest_tmin;
            *tfar = smallest_tmax;
        
            return smallest_tmax > largest_tmin;	
        }
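        
        // Worked example (illustrative): for a unit box with boxmin = (0,0,0),
        // boxmax = (1,1,1), ray_org = (-1, 0.5, 0.5) and raydir = (1, 0, 0),
        // the slab test gives tnear = 1 and tfar = 2, i.e. the ray enters the box
        // after one unit of travel along the ray and leaves after two.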
        
        // Define DRR operator
        struct drr_operator
        {        
            static __inline__ __host__ __device__ 
            void compute ( 
                float &in, float &acc )
            {        
                acc += in;
            }
        };
        
        // Define MIP operator
        struct mip_operator
        {        
            static __inline__ __host__ __device__ 
            void compute ( 
                float &in, float &acc )
            {        	
                if(in > acc)
                    acc = in;
            }
        };
        
        // Define MINIP operator
        struct minip_operator
        {        
            static __inline__ __host__ __device__ 
            void compute ( 
                float &in, float &acc )
            {        	
                if(in < acc)
                    acc = in;
            }
        };
        
        // Homogeneous transformation: 
        // multiplication of a point w homog. transf. matrix
        /*static __inline__ __host__ __device__ 
        float3 hom_trans(float*& Tx, float3& pos)
        {
            float xw = Tx[0]*pos.x + Tx[4]*pos.y +  Tx[8]*pos.z + Tx[12];
            float yw = Tx[1]*pos.x + Tx[5]*pos.y +  Tx[9]*pos.z + Tx[13];
            float zw = Tx[2]*pos.x + Tx[6]*pos.y + Tx[10]*pos.z + Tx[14];
            
            return make_float3( xw, yw, zw );
        }*/
        
        static __inline__ __host__ __device__ 
        float3 hom_trans(float*& Tx, float3& pos)
        {
            float xw = Tx[0]*pos.x + Tx[1]*pos.y +  Tx[2]*pos.z + Tx[3];
            float yw = Tx[4]*pos.x + Tx[5]*pos.y +  Tx[6]*pos.z + Tx[7];
            float zw = Tx[8]*pos.x + Tx[9]*pos.y + Tx[10]*pos.z + Tx[11];
            
            return make_float3( xw, yw, zw );
        }
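        
        // Note: this variant assumes a row-major 4x4 matrix layout (Tx[row*4 + col]);
        // the commented-out version above is the column-major equivalent.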
        
        // Rendering kernel: 
        // traverses the volume and performs linear interpolation
        extern "C" {
        __global__ 
        void render_kernel( 
            float* d_image, 
            float* d_Tx, float* d_TxT2D, 
            sRenderParams *d_params )	
        { 
            // Resolve 2D image index
            float x = blockIdx.x*blockDim.x + threadIdx.x;
            float y = blockIdx.y*blockDim.y + threadIdx.y;
            
            if ( (uint(x) >= d_params->sizes2D.x) || 
                    (uint(y) >= d_params->sizes2D.y) ) 
                return;		
            
            float3 ray_org, pos2D;
            
            // Transform source position to volume space
            ray_org = hom_trans( d_Tx, d_params->ray_org );
            
            // Create a point in 2D detector space
            pos2D = make_float3( x*d_params->steps2D.x, y*d_params->steps2D.y, 0.0f );
            
            // Inline homogeneous transformation to volume space
            // ie., (x,y) pixel in 3D volume coordinate system
            pos2D = hom_trans( d_TxT2D, pos2D );
                
            // Find eye ray in world space that points from the X-ray source 
            // to the current pixel on the detector plane:
            // - ray origin is in the X-ray source (xs,ys,zs)
            // - unit vector points to the point in detector plane (xw-xs,yw-ys,zw-zs)		
            float3 ray_dir = normalize( pos2D - ray_org ); 
                    
            // Find intersection with 3D volume
            float tnear, tfar;
            if ( ! intersectBox(ray_org, ray_dir, d_params, &tnear, &tfar) )
                return;
            
            // March along ray from front to back		
            float dt = d_params->ray_step.x;
                    
            float3 pos = make_float3(
                (ray_org.x + ray_dir.x*tnear) / d_params->steps3D.x, 
                (ray_org.y + ray_dir.y*tnear) / d_params->steps3D.y, 
                (ray_org.z + ray_dir.z*tnear) / d_params->steps3D.z);
        
            float3 step = make_float3(
                ray_dir.x * dt / d_params->steps3D.x, 
                ray_dir.y * dt / d_params->steps3D.y, 
                ray_dir.z * dt / d_params->steps3D.z);
                    
            #ifdef RENDER_MINIP
            float acc = 1e+7;
            #else
            float acc = 0;
            #endif
            for( ; tnear<=tfar; tnear+=dt )
            {		
                // resample the volume
                float sample = tex3D( d_tex, pos.x+0.5f, pos.y+0.5f, pos.z+0.5f );
                
                #ifdef RENDER_MAXIP
                mip_operator::compute( sample, acc );
                #elif RENDER_MINIP
                minip_operator::compute( sample, acc );
                #elif RENDER_DRR
                drr_operator::compute( sample, acc );
                #endif   
        
                // update position
                pos += step;
            }
        
            // Write to the output buffer
            uint idx = uint(x) + uint(y) * d_params->sizes2D.x;
            d_image[idx] = acc;	
        }
        }
        
        // Rendering kernel: 
        // traverses the volume and performs linear interpolation
        // for selected points in the 2d image
        extern "C" {
        __global__ 
        void render_kernel_idx( 
            float* d_image, uint* d_idx, uint max_idx,
            float* d_Tx, float* d_TxT2D, 
            sRenderParams *d_params )	
        { 
            // Resolve 1D index
            uint idx = blockIdx.x*blockDim.x + threadIdx.x;
                
            if ( idx > max_idx )				
                return;
                
            uint idx_t = d_idx[ idx ];
            
            // Resolve 2D image index
            uint y = idx_t / d_params->sizes2D.x;
            uint x = idx_t - y * d_params->sizes2D.x;
            
            //if ( (uint(x) >= d_params->sizes2D.x) || 
            //		(uint(y) >= d_params->sizes2D.y) ) 
            //	return;		
            
            float3 ray_org, pos2D;
            
            // Transform source position to volume space
            ray_org = hom_trans( d_Tx, d_params->ray_org );
            
            // Create a point in 2D detector space
            pos2D = make_float3(float(x)*d_params->steps2D.x, 
                float(y)*d_params->steps2D.y, 0.0f);
            
            // Inline homogeneous transformation to volume space
            // ie., (x,y) pixel in 3D volume coordinate system
            pos2D = hom_trans( d_TxT2D, pos2D );
                
            // Find eye ray in world space that points from the X-ray source 
            // to the current pixel on the detector plane:
            // - ray origin is in the X-ray source (xs,ys,zs)
            // - unit vector points to the point in detector plane (xw-xs,yw-ys,zw-zs)		
            float3 ray_dir = normalize( pos2D - ray_org ); 
                    
            // Find intersection with 3D volume
            float tnear, tfar;
            if ( ! intersectBox(ray_org, ray_dir, d_params, &tnear, &tfar) )
                return;
            
            // March along ray from front to back		
            float dt = d_params->ray_step.x;
                    
            float3 pos = make_float3(
                (ray_org.x + ray_dir.x*tnear)/d_params->steps3D.x, 
                (ray_org.y + ray_dir.y*tnear)/d_params->steps3D.y, 
                (ray_org.z + ray_dir.z*tnear)/d_params->steps3D.z);
        
            float3 step = make_float3(
                ray_dir.x*dt/d_params->steps3D.x, 
                ray_dir.y*dt/d_params->steps3D.y, 
                ray_dir.z*dt/d_params->steps3D.z);
                    
            #ifdef RENDER_MINIP
            float acc = 1e+7;
            #else
            float acc = 0;
            #endif
            for( ; tnear<=tfar; tnear+=dt )
            {		
                // resample the volume
                float sample = tex3D(d_tex, pos.x+0.5f, pos.y+0.5f, pos.z+0.5f);
                
                #ifdef RENDER_MAXIP
                mip_operator::compute( sample, acc );
                #elif RENDER_MINIP
                minip_operator::compute( sample, acc );
                #elif RENDER_DRR
                drr_operator::compute( sample, acc );
                #endif                              
        
                // update position
                pos += step;
            }
        
            // Write to the output buffer
            d_image[idx] = acc;
        }
        }        
        """
        if render_op not in self.VALID_RENDER_OPERATION:
            raise ValueError(
                'Rendering operation "{}" is not valid.'.format(render_op))
        cmodule = pycuda.compiler.SourceModule(
            source,
            options=['-DRENDER_{}'.format(render_op.upper())],
            include_dirs=[os.path.join(os.getcwd(), 'include')],
            no_extern_c=True)

        self._texture = cmodule.get_texref('d_tex')
        self._renderer = cmodule.get_function('render_kernel')
        self._renderer_idx = cmodule.get_function('render_kernel_idx')

        if ray_step_mm is None:
            ray_step_mm = float(np.linalg.norm(vol['spac']) / 2.0)

        self._params_d = cuda.mem_alloc(RenderParams.mem_size)
        self.params = RenderParams(
            RenderParams.Attributes(
                sizes2d=img['img'].shape[::-1],
                steps2d=img['spac'],
                sizes3d=vol['img'].shape[::-1],
                steps3d=vol['spac'],
                boxmin=np.array((0, 0, 0), dtype='float32').flatten(),
                boxmax=(np.array(vol['img'].shape[::-1]) - 1.0) *
                np.array(vol['spac']).astype('float32').flatten(),
                ray_org=img['SPos'].flatten(),
                ray_step=ray_step_mm,
                trans_2d=np.array(img['TPos'], dtype='float32').flatten()),
            self._params_d)

        # Copy array to texture memory
        self._texture.set_array(
            cuda.np_to_array(vol['img'].astype('float32'), order='C'))
        # We could set the next if we wanted to address the image
        # in normalized coordinates ( 0 <= coordinate < 1.)
        # self._texture.set_flags(cuda.TRSF_READ_AS_INTEGER)
        # self._texture.set_flags(cuda.TRSF_NORMALIZED_COORDINATES)
        # Perform linear interpolation
        self._texture.set_filter_mode(cuda.filter_mode.LINEAR)
        self._texture.set_address_mode(0, cuda.address_mode.CLAMP)
        self._texture.set_address_mode(1, cuda.address_mode.CLAMP)
        self._texture.set_address_mode(2, cuda.address_mode.CLAMP)

        # check max threads per block for current GPU
        max_threads_per_block = tools.DeviceData().max_threads

        if self.BLOCK_SIZE_1D is None:
            self.BLOCK_SIZE_1D = max_threads_per_block
        elif self.BLOCK_SIZE_1D > max_threads_per_block:
            raise ValueError(
                'Parameter BLOCK_SIZE_1D={} exceeds the maximum number of threads per block '
                '(the current GPU allows at most {} threads per block).'.format(
                    self.BLOCK_SIZE_1D, max_threads_per_block))

        if self.BLOCK_SIZE_2D is None:
            self.BLOCK_SIZE_2D = 2
            while (self.BLOCK_SIZE_2D * 2)**2 <= max_threads_per_block:
                self.BLOCK_SIZE_2D *= 2
        elif self.BLOCK_SIZE_2D**2 > max_threads_per_block:
            raise ValueError(
                'Parameter BLOCK_SIZE_2D={} (squared={}) exceeds the maximum number of threads '
                'per block (the current GPU allows at most {} threads per block).'.format(
                    self.BLOCK_SIZE_2D, self.BLOCK_SIZE_2D**2,
                    max_threads_per_block))

        # threads per block
        nx, ny = self.params.values.sizes2d
        self._blocksize_2d = (
            nx if nx < self.BLOCK_SIZE_2D else self.BLOCK_SIZE_2D,
            ny if ny < self.BLOCK_SIZE_2D else self.BLOCK_SIZE_2D, 1)
        # blocks per grid
        self._gridsize_2d = (int(np.ceil(float(nx) / self._blocksize_2d[0])),
                             int(np.ceil(float(ny) / self._blocksize_2d[1])), 1)
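
# Minimal sketch (hypothetical helper, not part of the class above): the same 2D
# launch-configuration logic as in __init__, written as a standalone function.
def _launch_config_2d(nx, ny, max_threads_per_block):
    # Largest power-of-two block edge whose square still fits in one block.
    block_edge = 2
    while (block_edge * 2)**2 <= max_threads_per_block:
        block_edge *= 2
    block = (min(nx, block_edge), min(ny, block_edge), 1)
    # Enough blocks to cover every pixel of the nx x ny detector.
    grid = ((nx + block[0] - 1) // block[0], (ny + block[1] - 1) // block[1], 1)
    return block, grid

# Example: _launch_config_2d(620, 480, 1024) -> ((32, 32, 1), (20, 15, 1))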
Example #18
0
 def _upload_ref_vol(self, data):
     """Upload the reference volume into texture memory."""
     assert data.dtype == np.float32, "np.float32 is required"
     ref_vol = cuda.np_to_array(data, 'C')
     self._texture.set_array(ref_vol)
     return ref_vol
Example #19
0
def find_bubbles(I, scale=1., fil='kspace'):
    """Brute-force method: find bubbles for the deltax box I."""
    zeta = 40.
    Z = 12.
    RMAX = 30.
    RMIN = 1.
    mm = mmin(Z)
    smin = sig0(m2R(mm))
    deltac = Deltac(Z)
    fgrowth = deltac/1.686
    #fgrowth = pb.fgrowth(Z, cosmo['omega_M_0'], unnormed=True)
    kernel_source = open("find_bubbles.cu").read()
    kernel_code = kernel_source % {
        'DELTAC': deltac,
        'RMIN': RMIN,
        'SMIN': smin,
        'ZETA': zeta
    }
    main_module = nvcc.SourceModule(kernel_code)
    if fil == 'rspace':
        kernel = main_module.get_function("real_tophat_kernel")
    elif fil == 'kspace':
        kernel = main_module.get_function("k_tophat_kernel")
    image_texture = main_module.get_texref("img")

    # Get contiguous image + shape.
    height, width, depth = I.shape
    I = np.float32(I.copy()*fgrowth)

    # Get block/grid size.
    block_size = (8, 8, 8)
    grid_size = (width//block_size[0] + 1,
                 height//block_size[1] + 1,
                 depth//block_size[2] + 1)

    # Initialize variables.
    ionized = np.zeros([height, width, depth])
    ionized = np.float32(ionized)
    width = np.int32(width)

    # Transfer labels asynchronously and bind the input box to the texture.
    ionized_d = gpuarray.to_gpu_async(ionized)
    I_cu = cu.np_to_array(I, order='C')
    cu.bind_array_to_texref(I_cu, image_texture)

    R = RMAX
    while R > RMIN:
        print(R)
        Rpix = np.float32(R/scale)
        S0 = np.float32(sig0(R))
        start = cu.Event()
        end = cu.Event()
        start.record()
        kernel(ionized_d, width, Rpix, S0, block=block_size, grid=grid_size)
        end.record()
        end.synchronize()
        R *= (1./1.5)

    ionized = ionized_d.get()
    return ionized
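
# Illustrative note (not part of the original function): the loop above shrinks the
# smoothing radius geometrically by a factor of 1.5 per iteration, so the number of
# radii visited between RMAX and RMIN is roughly log(RMAX/RMIN)/log(1.5).
n_radii = int(np.floor(np.log(30.0 / 1.0) / np.log(1.5))) + 1  # = 9 for RMAX=30, RMIN=1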
Example #20
0
def watershed(I, mask=None):
    kernel_source = open("Dwatershed.cu").read()
    main_module = nvcc.SourceModule(kernel_source)
    descent_kernel = main_module.get_function("descent_kernel")
    stabilize_kernel = main_module.get_function("stabilize_kernel")
    image_texture = main_module.get_texref("img")
    plateau_kernel = main_module.get_function("plateau_kernel")
    minima_kernel = main_module.get_function("minima_kernel")
    flood_kernel = main_module.get_function("flood_kernel")
    increment_kernel = main_module.get_function("increment_kernel")

    # Get contiguous image + shape.
    height, width, depth = I.shape
    I = np.float32(I.copy())
    if mask is None:
        mask = np.ones(I.shape)
    mask = np.int32(mask)

    # Get block/grid size for steps 1-3.
    block_size = (8, 8, 8)
    grid_size = (width // (block_size[0] - 2) + 1,
                 height // (block_size[1] - 2) + 1,
                 depth // (block_size[2] - 2) + 1)

    # # Get block/grid size for step 4.
    # block_size2 = (10,10,10)
    # grid_size2  = (width/(block_size2[0]-2)+1,
    #               height/(block_size2[0]-2)+1,
    #               depth/(block_size2[0]-2)+1)

    # Initialize variables.
    labeled = np.zeros([height, width, depth])
    labeled = np.float64(labeled)
    width = np.int32(width)
    height = np.int32(height)
    depth = np.int32(depth)
    count = np.int32([0])

    # Transfer labels asynchronously.
    labeled_d = gpu.to_gpu_async(labeled)
    counters_d = gpu.to_gpu_async(count)
    # mask_d = cu.np_to_array( mask, order='C' )
    # cu.bind_array_to_texref(mask_d, mask_texture)
    # Bind CUDA textures.
    #I_cu = cu.matrix_to_array(I, order='C')
    I_cu = cu.np_to_array(I, order='C')
    cu.bind_array_to_texref(I_cu, image_texture)

    # Step 1.
    descent_kernel(labeled_d,
                   width,
                   height,
                   depth,
                   block=block_size,
                   grid=grid_size)
    start_time = cu.Event()
    end_time = cu.Event()
    start_time.record()

    counters_d = gpu.to_gpu(np.int32([0]))
    #counters_d = gpu.to_gpu_async(np.int32([0]))
    old, new = -1, -2
    it = 0
    while old != new:
        it += 1
        old = new
        plateau_kernel(labeled_d,
                       counters_d,
                       width,
                       height,
                       depth,
                       block=block_size,
                       grid=grid_size)
        new = counters_d.get()[0]
    print('plateau kernel', it - 2)

    # Step 2.
    increment_kernel(labeled_d,
                     width,
                     height,
                     depth,
                     block=block_size,
                     grid=grid_size)

    counters_d = gpu.to_gpu(np.int32([0]))
    old, new = -1, -2
    it = 0

    while old != new:
        it += 1
        old = new
        minima_kernel(labeled_d,
                      counters_d,
                      width,
                      height,
                      depth,
                      block=block_size,
                      grid=grid_size)
        new = counters_d.get()[0]
    print('minima kernel', it - 2)

    # Step 3.
    # counters_d = gpu.to_gpu(np.int32([0]))
    # old, new = -1, -2; it = 0
    # while old != new:
    #   it +=1
    #   old = new
    #   plateau_kernel(labeled_d, counters_d, width,
    #   height, depth, block=block_size, grid=grid_size)
    #   new = counters_d.get()[0]
    # print 'plateau kernel', it-2

    # Step 4
    counters_d = gpu.to_gpu(np.int32([0]))
    old, new = -1, -2
    it = 0
    while old != new:
        it += 1
        old = new
        flood_kernel(labeled_d,
                     counters_d,
                     width,
                     height,
                     depth,
                     block=block_size,
                     grid=grid_size)
        new = counters_d.get()[0]
    print('flood kernel', it - 2)

    labels = labeled_d.get()
    labels = labels * mask

    # End GPU timers.
    end_time.record()
    end_time.synchronize()
    gpu_time = start_time.time_till(end_time) * 1e-3

    # print str(gpu_time)
    #cu.DeviceAllocation.free(counters_d)
    del counters_d

    return labels
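
# Minimal sketch (hypothetical helper, not part of the original code): the three
# convergence loops above all follow the same "re-run the kernel until the counter
# stops changing" pattern, which could be factored out roughly like this:
def _iterate_until_stable(kernel, labeled_d, width, height, depth, block, grid):
    counters_d = gpu.to_gpu(np.int32([0]))
    old, new = -1, -2
    while old != new:
        old = new
        kernel(labeled_d, counters_d, width, height, depth, block=block, grid=grid)
        new = counters_d.get()[0]
    return new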