def __init__(self, volume, segmentation, voxelsize, origin=[0.0, 0.0, 0.0], stepsize=0.1, mode="linear"): #generate kernels self.mod = self.generateKernelModuleProjector() self.projKernel = self.mod.get_function("projectKernel") self.volumesize = volume.shape self.volume = np.moveaxis(volume, [0, 1, 2], [2, 1, 0]).copy() self.segmentation = np.moveaxis(segmentation.astype(np.float32), [0, 1, 2], [2, 1, 0]).copy() # print("done swap") self.volume_gpu = cuda.np_to_array(self.volume, order='C') self.texref_volume = self.mod.get_texref("tex_density") cuda.bind_array_to_texref(self.volume_gpu, self.texref_volume) self.segmentation_gpu = cuda.np_to_array(self.segmentation, order='C') self.texref_segmentation = self.mod.get_texref("tex_segmentation") cuda.bind_array_to_texref(self.segmentation_gpu, self.texref_segmentation) if mode == "linear": self.texref_volume.set_filter_mode(cuda.filter_mode.LINEAR) self.texref_segmentation.set_filter_mode(cuda.filter_mode.LINEAR) self.voxelsize = voxelsize self.stepsize = np.float32(stepsize) self.origin = origin self.initialized = False print("initialized projector")
def project(self, volume_context, geometry_context, T_Nx4x4): assert self.cpu is not None, 'CPU ray casting is not supported.' image_size = self.target_detector.to_cpu().image_size pm_Nx3x4 = geometry_context.projection_matrix p_Nx12 = utils.constructProjectionParameter(pm_Nx3x4, np.array(image_size[:2]), T_Nx4x4) assert self.target_detector.cpu.image_size[2] == p_Nx12.shape[ 0], 'Unmatched detector channel and pose parameter channel.(Actual: {} != {})'.format( self.target_detector.cpu.image_size[2], p_Nx12.shape[0]) h_p_Nx12 = p_Nx12.astype(np.float32) d_p_Nx12 = driver.np_to_array(h_p_Nx12, order='C') t_p_Nx12 = KernelManager.Module.get_texture('t_proj_param_Nx12', d_p_Nx12) grid = (16, 16, 1) if Projector.grid is None: grid = tuple( np.uint32(np.ceil(image_size / Projector.block)).tolist()) KernelManager.Kernel.invoke(self.target_detector.image.gpudata, texrefs=[volume_context.volume, t_p_Nx12], block=Projector.block, grid=grid) # Display debug info # print_kernel = KernelManager.Module.get_kernel('print_device_params') # print_kernel.invoke(texrefs=[t_p_Nx12]) return self.target_detector.image
def _run(self, I, shift, rot, ratio, out): logger.debug("I.shape={}".format(I.shape)) # bind input image to texture _in_buf = cuda.np_to_array(I, 'C') self._shear_texture.set_array(_in_buf) # determine grid and block size _, nv, nu = out.shape block_sz = (32, 32, 1) grid_sz = (ceil(float(nu) / block_sz[0]), ceil(float(nv) / block_sz[1])) if (self._out_buf is None) or (self._out_buf.shape != out.shape): logger.debug("resize buffer to {}".format(out.shape)) self._out_buf = gpuarray.empty(out.shape, dtype=np.float32, order='C') # TODO create rotate kernel buffer area # execute nz, ny, nx = I.shape self._shear_kernel.prepared_call(grid_sz, block_sz, self._out_buf.gpudata, np.float32(shift), np.uint32(nu), np.uint32(nv), np.float32(ratio), np.uint32(nx), np.uint32(ny), np.float32(nz)) # TODO add rotate kernel call self._out_buf.get(out) # unbind texture _in_buf.free()
def setTexture(texref, numpy_array): #Upload data to GPU and bind to texture reference texref.set_array(cuda.np_to_array(numpy_array, order="C")) # Set texture parameters texref.set_filter_mode(cuda.filter_mode.LINEAR) #bilinear interpolation texref.set_address_mode(0, cuda.address_mode.CLAMP) #no indexing outside domain texref.set_address_mode(1, cuda.address_mode.CLAMP) texref.set_flags(cuda.TRSF_NORMALIZED_COORDINATES) #Use [0, 1] indexing
def test_3d_fp_textures(self): orden = "C" npoints = 32 for prec in [ np.int16, np.float32, np.float64, np.complex64, np.complex128 ]: prec_str = dtype_to_ctype(prec) if prec == np.complex64: fpName_str = "fp_tex_cfloat" elif prec == np.complex128: fpName_str = "fp_tex_cdouble" elif prec == np.float64: fpName_str = "fp_tex_double" else: fpName_str = prec_str A_cpu = np.zeros([npoints, npoints, npoints], order=orden, dtype=prec) A_cpu[:] = np.random.rand(npoints, npoints, npoints)[:] A_gpu = gpuarray.zeros(A_cpu.shape, dtype=prec, order=orden) myKern = """ #include <pycuda-helpers.hpp> texture<fpName, 3, cudaReadModeElementType> mtx_tex; __global__ void copy_texture(cuPres *dest) { int row = blockIdx.x*blockDim.x + threadIdx.x; int col = blockIdx.y*blockDim.y + threadIdx.y; int slice = blockIdx.z*blockDim.z + threadIdx.z; dest[row + col*blockDim.x*gridDim.x + slice*blockDim.x*gridDim.x*blockDim.y*gridDim.y] = fp_tex3D(mtx_tex, slice, col, row); } """ myKern = myKern.replace("fpName", fpName_str) myKern = myKern.replace("cuPres", prec_str) mod = SourceModule(myKern) copy_texture = mod.get_function("copy_texture") mtx_tex = mod.get_texref("mtx_tex") cuBlock = (8, 8, 8) if cuBlock[0] > npoints: cuBlock = (npoints, npoints, npoints) cuGrid = ( npoints // cuBlock[0] + 1 * (npoints % cuBlock[0] != 0), npoints // cuBlock[1] + 1 * (npoints % cuBlock[1] != 0), npoints // cuBlock[2] + 1 * (npoints % cuBlock[1] != 0), ) copy_texture.prepare("P", texrefs=[mtx_tex]) cudaArray = drv.np_to_array(A_cpu, orden, allowSurfaceBind=False) mtx_tex.set_array(cudaArray) copy_texture.prepared_call(cuGrid, cuBlock, A_gpu.gpudata) assert np.sum(np.abs(A_gpu.get() - np.transpose(A_cpu))) == np.array(0, dtype=prec) A_gpu.gpudata.free()
def tex2DToGPU(tex): nChannal = 1 if (len(tex.shape) == 2) else 3 if (nChannal == 3): #Add padding channal tex = np.dstack((tex, np.ones((tex.shape[0], tex.shape[1])))) tex = np.ascontiguousarray(tex).astype(np.float32) texGPUArray = cuda.make_multichannel_2d_array(tex, 'C') else: texGPUArray = cuda.np_to_array(tex, 'C') return texGPUArray
def initialize(self): """Allocate GPU memory and transfer the volume, segmentations to GPU.""" if self.initialized: raise RuntimeError("Close projector before initializing again.") # allocate and transfer volume texture to GPU # TODO: this axis-swap is messy and actually may be messing things up. Maybe use a FrameTransform in the Volume class instead? volume = self.volume.data volume = np.moveaxis(volume, [0, 1, 2], [2, 1, 0]).copy() # TODO: is this axis swap necessary? self.volume_gpu = cuda.np_to_array(volume, order='C') self.volume_texref = self.mod.get_texref("volume") cuda.bind_array_to_texref(self.volume_gpu, self.volume_texref) # set the (interpolation?) mode if self.mode == 'linear': self.volume_texref.set_filter_mode(cuda.filter_mode.LINEAR) else: raise RuntimeError # allocate and transfer segmentation texture to GPU # TODO: remove axis swap? # self.segmentations_gpu = [cuda.np_to_array(seg, order='C') for mat, seg in self.volume.materials.items()] self.segmentations_gpu = [cuda.np_to_array(np.moveaxis(seg, [0, 1, 2], [2, 1, 0]).copy(), order='C') for mat, seg in self.volume.materials.items()] self.segmentations_texref = [self.mod.get_texref(f"seg_{m}") for m, _ in enumerate(self.volume.materials)] for seg, texref in zip(self.segmentations_gpu, self.segmentations_texref): cuda.bind_array_to_texref(seg, texref) if self.mode == 'linear': texref.set_filter_mode(cuda.filter_mode.LINEAR) else: raise RuntimeError # allocate output image array on GPU (4 bytes to a float32) self.output_gpu = cuda.mem_alloc(self.output_size * 4) # allocate ijk_from_index matrix array on GPU (3x3 array x 4 bytes per float32) self.rt_kinv_gpu = cuda.mem_alloc(3 * 3 * 4) # Mark self as initialized. self.initialized = True
def test_3d_fp_textures(self): orden = "C" npoints = 32 for prec in [np.int16, np.float32, np.float64, np.complex64, np.complex128]: prec_str = dtype_to_ctype(prec) if prec == np.complex64: fpName_str = "fp_tex_cfloat" elif prec == np.complex128: fpName_str = "fp_tex_cdouble" elif prec == np.float64: fpName_str = "fp_tex_double" else: fpName_str = prec_str A_cpu = np.zeros([npoints, npoints, npoints], order=orden, dtype=prec) A_cpu[:] = np.random.rand(npoints, npoints, npoints)[:] A_gpu = gpuarray.zeros(A_cpu.shape, dtype=prec, order=orden) myKern = """ #include <pycuda-helpers.hpp> texture<fpName, 3, cudaReadModeElementType> mtx_tex; __global__ void copy_texture(cuPres *dest) { int row = blockIdx.x*blockDim.x + threadIdx.x; int col = blockIdx.y*blockDim.y + threadIdx.y; int slice = blockIdx.z*blockDim.z + threadIdx.z; dest[row + col*blockDim.x*gridDim.x + slice*blockDim.x*gridDim.x*blockDim.y*gridDim.y] = fp_tex3D(mtx_tex, slice, col, row); } """ myKern = myKern.replace("fpName", fpName_str) myKern = myKern.replace("cuPres", prec_str) mod = SourceModule(myKern) copy_texture = mod.get_function("copy_texture") mtx_tex = mod.get_texref("mtx_tex") cuBlock = (8, 8, 8) if cuBlock[0] > npoints: cuBlock = (npoints, npoints, npoints) cuGrid = ( npoints // cuBlock[0] + 1 * (npoints % cuBlock[0] != 0), npoints // cuBlock[1] + 1 * (npoints % cuBlock[1] != 0), npoints // cuBlock[2] + 1 * (npoints % cuBlock[1] != 0), ) copy_texture.prepare("P", texrefs=[mtx_tex]) cudaArray = drv.np_to_array(A_cpu, orden, allowSurfaceBind=False) mtx_tex.set_array(cudaArray) copy_texture.prepared_call(cuGrid, cuBlock, A_gpu.gpudata) assert np.sum(np.abs(A_gpu.get() - np.transpose(A_cpu))) == np.array(0, dtype=prec) A_gpu.gpudata.free()
def test_2d_fp_texturesLayered(self): orden = "F" npoints = 32 for prec in [ np.int16, np.float32, np.float64, np.complex64, np.complex128 ]: prec_str = dtype_to_ctype(prec) if prec == np.complex64: fpName_str = 'fp_tex_cfloat' elif prec == np.complex128: fpName_str = 'fp_tex_cdouble' elif prec == np.float64: fpName_str = 'fp_tex_double' else: fpName_str = prec_str A_cpu = np.zeros([npoints, npoints], order=orden, dtype=prec) A_cpu[:] = np.random.rand(npoints, npoints)[:] A_gpu = gpuarray.zeros(A_cpu.shape, dtype=prec, order=orden) myKern = ''' #include <pycuda-helpers.hpp> texture<fpName, cudaTextureType2DLayered, cudaReadModeElementType> mtx_tex; __global__ void copy_texture(cuPres *dest) { int row = blockIdx.x*blockDim.x + threadIdx.x; int col = blockIdx.y*blockDim.y + threadIdx.y; dest[row + col*blockDim.x*gridDim.x] = fp_tex2DLayered(mtx_tex, col, row, 1); } ''' myKern = myKern.replace('fpName', fpName_str) myKern = myKern.replace('cuPres', prec_str) mod = SourceModule(myKern) copy_texture = mod.get_function("copy_texture") mtx_tex = mod.get_texref("mtx_tex") cuBlock = (16, 16, 1) if cuBlock[0] > npoints: cuBlock = (npoints, npoints, 1) cuGrid = (npoints // cuBlock[0] + 1 * (npoints % cuBlock[0] != 0), npoints // cuBlock[1] + 1 * (npoints % cuBlock[1] != 0), 1) copy_texture.prepare('P', texrefs=[mtx_tex]) cudaArray = drv.np_to_array(A_cpu, orden, allowSurfaceBind=True) mtx_tex.set_array(cudaArray) copy_texture.prepared_call(cuGrid, cuBlock, A_gpu.gpudata) assert np.sum(np.abs(A_gpu.get() - np.transpose(A_cpu))) == np.array(0, dtype=prec) A_gpu.gpudata.free()
def test_2d_fp_texturesLayered(self): orden = "F" npoints = 32 for prec in [np.int16,np.float32,np.float64,np.complex64,np.complex128]: prec_str = dtype_to_ctype(prec) if prec == np.complex64: fpName_str = 'fp_tex_cfloat' elif prec == np.complex128: fpName_str = 'fp_tex_cdouble' elif prec == np.float64: fpName_str = 'fp_tex_double' else: fpName_str = prec_str A_cpu = np.zeros([npoints,npoints],order=orden,dtype=prec) A_cpu[:] = np.random.rand(npoints,npoints)[:] A_gpu = gpuarray.zeros(A_cpu.shape,dtype=prec,order=orden) myKern = ''' #include <pycuda-helpers.hpp> texture<fpName, cudaTextureType2DLayered, cudaReadModeElementType> mtx_tex; __global__ void copy_texture(cuPres *dest) { int row = blockIdx.x*blockDim.x + threadIdx.x; int col = blockIdx.y*blockDim.y + threadIdx.y; dest[row + col*blockDim.x*gridDim.x] = fp_tex2DLayered(mtx_tex, col, row, 1); } ''' myKern = myKern.replace('fpName',fpName_str) myKern = myKern.replace('cuPres',prec_str) mod = SourceModule(myKern) copy_texture = mod.get_function("copy_texture") mtx_tex = mod.get_texref("mtx_tex") cuBlock = (16,16,1) if cuBlock[0]>npoints: cuBlock = (npoints,npoints,1) cuGrid = (npoints//cuBlock[0]+1*(npoints % cuBlock[0] != 0 ),npoints//cuBlock[1]+1*(npoints % cuBlock[1] != 0 ),1) copy_texture.prepare('P',texrefs=[mtx_tex]) cudaArray = drv.np_to_array(A_cpu,orden,allowSurfaceBind=True) mtx_tex.set_array(cudaArray) copy_texture.prepared_call(cuGrid,cuBlock,A_gpu.gpudata) assert np.sum(np.abs(A_gpu.get()-np.transpose(A_cpu))) == np.array(0,dtype=prec) A_gpu.gpudata.free()
def ndarray_to_float_tex(tex_ref, ndarray, address_mode=cuda.address_mode.BORDER, filter_mode=cuda.filter_mode.LINEAR): if isinstance(ndarray, np.ndarray): cu_array = cuda.np_to_array(ndarray, 'C') elif isinstance(ndarray, gpuarray.GPUArray): cu_array = cuda.gpuarray_to_array(ndarray, 'C') else: raise TypeError( 'ndarray must be numpy.ndarray or pycuda.gpuarray.GPUArray') cuda.TextureReference.set_array(tex_ref, cu_array) cuda.TextureReference.set_address_mode( tex_ref, 0, address_mode) if ndarray.ndim >= 2: cuda.TextureReference.set_address_mode( tex_ref, 1, address_mode) if ndarray.ndim >= 3: cuda.TextureReference.set_address_mode( tex_ref, 2, address_mode) cuda.TextureReference.set_filter_mode( tex_ref, filter_mode) tex_ref.set_flags(tex_ref.get_flags( ) & ~cuda.TRSF_NORMALIZED_COORDINATES & ~cuda.TRSF_READ_AS_INTEGER)
def __init__(self, \ gpu_ctx, \ eta0, hu0, hv0, H, \ nx, ny, \ dx, dy, dt, \ g, f, r, \ angle=np.array([[0]], dtype=np.float32), \ t=0.0, \ theta=1.3, rk_order=2, \ coriolis_beta=0.0, \ max_wind_direction_perturbation = 0, \ wind_stress=WindStress.WindStress(), \ boundary_conditions=Common.BoundaryConditions(), \ boundary_conditions_data=Common.BoundaryConditionsData(), \ small_scale_perturbation=False, \ small_scale_perturbation_amplitude=None, \ small_scale_perturbation_interpolation_factor = 1, \ model_time_step=None, reportGeostrophicEquilibrium=False, \ use_lcg=False, \ write_netcdf=False, \ comm=None, \ netcdf_filename=None, \ ignore_ghostcells=False, \ courant_number=0.8, \ offset_x=0, offset_y=0, \ flux_slope_eps = 1.0e-1, \ desingularization_eps = 1.0e-1, \ depth_cutoff = 1.0e-5, \ block_width=32, block_height=8, num_threads_dt=256, block_width_model_error=16, block_height_model_error=16): """ Initialization routine eta0: Initial deviation from mean sea level incl ghost cells, (nx+2)*(ny+2) cells hu0: Initial momentum along x-axis incl ghost cells, (nx+1)*(ny+2) cells hv0: Initial momentum along y-axis incl ghost cells, (nx+2)*(ny+1) cells H: Depth from equilibrium defined on cell corners, (nx+5)*(ny+5) corners nx: Number of cells along x-axis ny: Number of cells along y-axis dx: Grid cell spacing along x-axis (20 000 m) dy: Grid cell spacing along y-axis (20 000 m) dt: Size of each timestep (90 s) g: Gravitational accelleration (9.81 m/s^2) f: Coriolis parameter (1.2e-4 s^1), effectively as f = f + beta*y r: Bottom friction coefficient (2.4e-3 m/s) angle: Angle of rotation from North to y-axis t: Start simulation at time t theta: MINMOD theta used the reconstructions of the derivatives in the numerical scheme rk_order: Order of Runge Kutta method {1,2*,3} coriolis_beta: Coriolis linear factor -> f = f + beta*(y-y_0) max_wind_direction_perturbation: Large-scale model error emulation by per-time-step perturbation of wind direction by +/- max_wind_direction_perturbation (degrees) wind_stress: Wind stress parameters boundary_conditions: Boundary condition object small_scale_perturbation: Boolean value for applying a stochastic model error small_scale_perturbation_amplitude: Amplitude (q0 coefficient) for model error small_scale_perturbation_interpolation_factor: Width factor for correlation in model error model_time_step: The size of a data assimilation model step (default same as dt) reportGeostrophicEquilibrium: Calculate the Geostrophic Equilibrium variables for each superstep use_lcg: Use LCG as the random number generator. Default is False, which means using curand. write_netcdf: Write the results after each superstep to a netCDF file comm: MPI communicator desingularization_eps: Used for desingularizing hu/h flux_slope_eps: Used for setting zero flux for symmetric Riemann fan depth_cutoff: Used for defining dry cells netcdf_filename: Use this filename. (If not defined, a filename will be generated by SimWriter.) """ self.logger = logging.getLogger(__name__) assert( rk_order < 4 or rk_order > 0 ), "Only 1st, 2nd and 3rd order Runge Kutta supported" if (rk_order == 3): assert(r == 0.0), "3rd order Runge Kutta supported only without friction" # Sort out internally represented ghost_cells in the presence of given # boundary conditions ghost_cells_x = 2 ghost_cells_y = 2 #Coriolis at "first" cell x_zero_reference_cell = ghost_cells_x y_zero_reference_cell = ghost_cells_y # In order to pass it to the super constructor # Boundary conditions self.boundary_conditions = boundary_conditions if (boundary_conditions.isSponge()): nx = nx + boundary_conditions.spongeCells[1] + boundary_conditions.spongeCells[3] - 2*ghost_cells_x ny = ny + boundary_conditions.spongeCells[0] + boundary_conditions.spongeCells[2] - 2*ghost_cells_y x_zero_reference_cell += boundary_conditions.spongeCells[3] y_zero_reference_cell += boundary_conditions.spongeCells[2] #Compensate f for reference cell (first cell in internal of domain) north = np.array([np.sin(angle[0,0]), np.cos(angle[0,0])]) f = f - coriolis_beta * (x_zero_reference_cell*dx*north[0] + y_zero_reference_cell*dy*north[1]) x_zero_reference_cell = 0 y_zero_reference_cell = 0 A = None self.max_wind_direction_perturbation = max_wind_direction_perturbation super(CDKLM16, self).__init__(gpu_ctx, \ nx, ny, \ ghost_cells_x, \ ghost_cells_y, \ dx, dy, dt, \ g, f, r, A, \ t, \ theta, rk_order, \ coriolis_beta, \ y_zero_reference_cell, \ wind_stress, \ write_netcdf, \ ignore_ghostcells, \ offset_x, offset_y, \ comm, \ block_width, block_height) # Index range for interior domain (north, east, south, west) # so that interior domain of eta is # eta[self.interior_domain_indices[2]:self.interior_domain_indices[0], \ # self.interior_domain_indices[3]:self.interior_domain_indices[1] ] self.interior_domain_indices = np.array([-2,-2,2,2]) self._set_interior_domain_from_sponge_cells() defines={'block_width': block_width, 'block_height': block_height, 'KPSIMULATOR_DESING_EPS': str(desingularization_eps)+'f', 'KPSIMULATOR_FLUX_SLOPE_EPS': str(flux_slope_eps)+'f', 'KPSIMULATOR_DEPTH_CUTOFF': str(depth_cutoff)+'f'} #Get kernels self.kernel = gpu_ctx.get_kernel("CDKLM16_kernel.cu", defines=defines, compile_args={ # default, fast_math, optimal 'options' : ["--ftz=true", # false, true, true "--prec-div=false", # true, false, false, "--prec-sqrt=false", # true, false, false "--fmad=false"] # true, true, false #'options': ["--use_fast_math"] #'options': ["--generate-line-info"], #nvcc_options=["--maxrregcount=39"], #'arch': "compute_50", #'code': "sm_50" }, jit_compile_args={ #jit_options=[(cuda.jit_option.MAX_REGISTERS, 39)] } ) # Get CUDA functions and define data types for prepared_{async_}call() self.cdklm_swe_2D = self.kernel.get_function("cdklm_swe_2D") self.cdklm_swe_2D.prepare("iiffffffffiiPiPiPiPiPiPiPiPiffi") self.update_wind_stress(self.kernel, self.cdklm_swe_2D) # CUDA functions for finding max time step size: self.num_threads_dt = num_threads_dt self.num_blocks_dt = np.int32(self.global_size[0]*self.global_size[1]) self.update_dt_kernels = gpu_ctx.get_kernel("max_dt.cu", defines={'block_width': block_width, 'block_height': block_height, 'NUM_THREADS': self.num_threads_dt}) self.per_block_max_dt_kernel = self.update_dt_kernels.get_function("per_block_max_dt") self.per_block_max_dt_kernel.prepare("iifffPiPiPiPifPi") self.max_dt_reduction_kernel = self.update_dt_kernels.get_function("max_dt_reduction") self.max_dt_reduction_kernel.prepare("iPP") # Bathymetry self.bathymetry = Common.Bathymetry(gpu_ctx, self.gpu_stream, nx, ny, ghost_cells_x, ghost_cells_y, H, boundary_conditions) # Adjust eta for possible dry states Hm = self.downloadBathymetry()[1] eta0 = np.maximum(eta0, -Hm) # Create data by uploading to device self.gpu_data = Common.SWEDataArakawaA(self.gpu_stream, nx, ny, ghost_cells_x, ghost_cells_y, eta0, hu0, hv0) # Allocate memory for calculating maximum timestep host_dt = np.zeros((self.global_size[1], self.global_size[0]), dtype=np.float32) self.device_dt = Common.CUDAArray2D(self.gpu_stream, self.global_size[0], self.global_size[1], 0, 0, host_dt) host_max_dt_buffer = np.zeros((1,1), dtype=np.float32) self.max_dt_buffer = Common.CUDAArray2D(self.gpu_stream, 1, 1, 0, 0, host_max_dt_buffer) self.courant_number = courant_number ## Allocating memory for geostrophical equilibrium variables self.reportGeostrophicEquilibrium = np.int32(reportGeostrophicEquilibrium) self.geoEq_uxpvy = None self.geoEq_Kx = None self.geoEq_Ly = None if self.reportGeostrophicEquilibrium: dummy_zero_array = np.zeros((ny+2*ghost_cells_y, nx+2*ghost_cells_x), dtype=np.float32, order='C') self.geoEq_uxpvy = Common.CUDAArray2D(self.gpu_stream, nx, ny, ghost_cells_x, ghost_cells_y, dummy_zero_array) self.geoEq_Kx = Common.CUDAArray2D(self.gpu_stream, nx, ny, ghost_cells_x, ghost_cells_y, dummy_zero_array) self.geoEq_Ly = Common.CUDAArray2D(self.gpu_stream, nx, ny, ghost_cells_x, ghost_cells_y, dummy_zero_array) self.constant_equilibrium_depth = np.max(H) self.bc_kernel = Common.BoundaryConditionsArakawaA(gpu_ctx, \ self.nx, \ self.ny, \ ghost_cells_x, \ ghost_cells_y, \ self.boundary_conditions, \ boundary_conditions_data, \ ) # Small scale perturbation: self.small_scale_perturbation = small_scale_perturbation self.small_scale_model_error = None self.small_scale_perturbation_interpolation_factor = small_scale_perturbation_interpolation_factor if small_scale_perturbation: if small_scale_perturbation_amplitude is None: self.small_scale_model_error = OceanStateNoise.OceanStateNoise.fromsim(self, interpolation_factor=small_scale_perturbation_interpolation_factor, use_lcg=use_lcg, block_width=block_width_model_error, block_height=block_height_model_error) else: self.small_scale_model_error = OceanStateNoise.OceanStateNoise.fromsim(self, soar_q0=small_scale_perturbation_amplitude, interpolation_factor=small_scale_perturbation_interpolation_factor, use_lcg=use_lcg, block_width=block_width_model_error, block_height=block_height_model_error) # Data assimilation model step size self.model_time_step = model_time_step if model_time_step is None: self.model_time_step = self.dt self.total_time_steps = 0 if self.write_netcdf: self.sim_writer = SimWriter.SimNetCDFWriter(self, filename=netcdf_filename, ignore_ghostcells=self.ignore_ghostcells, \ offset_x=self.offset_x, offset_y=self.offset_y) #Upload data to GPU and bind to texture reference self.angle_texref = self.kernel.get_texref("angle_tex") self.angle_texref.set_array(cuda.np_to_array(np.ascontiguousarray(angle, dtype=np.float32), order="C")) # Set texture parameters self.angle_texref.set_filter_mode(cuda.filter_mode.LINEAR) #bilinear interpolation self.angle_texref.set_address_mode(0, cuda.address_mode.CLAMP) #no indexing outside domain self.angle_texref.set_address_mode(1, cuda.address_mode.CLAMP) self.angle_texref.set_flags(cuda.TRSF_NORMALIZED_COORDINATES) #Use [0, 1] indexing
def __init__(self, \ gpu_ctx, \ eta0, hu0, hv0, H, \ nx, ny, \ dx, dy, dt, \ g, f, r, \ subsample_f=10, \ angle=np.array([[0]], dtype=np.float32), \ subsample_angle=10, \ latitude=None, \ t=0.0, \ theta=1.3, rk_order=2, \ coriolis_beta=0.0, \ max_wind_direction_perturbation = 0, \ wind_stress=WindStress.WindStress(), \ boundary_conditions=Common.BoundaryConditions(), \ boundary_conditions_data=Common.BoundaryConditionsData(), \ small_scale_perturbation=False, \ small_scale_perturbation_amplitude=None, \ small_scale_perturbation_interpolation_factor = 1, \ model_time_step=None, reportGeostrophicEquilibrium=False, \ use_lcg=False, \ write_netcdf=False, \ comm=None, \ local_particle_id=0, \ super_dir_name=None, \ netcdf_filename=None, \ ignore_ghostcells=False, \ courant_number=0.8, \ offset_x=0, offset_y=0, \ flux_slope_eps = 1.0e-1, \ desingularization_eps = 1.0e-1, \ depth_cutoff = 1.0e-5, \ block_width=12, block_height=32, num_threads_dt=256, block_width_model_error=16, block_height_model_error=16): """ Initialization routine eta0: Initial deviation from mean sea level incl ghost cells, (nx+2)*(ny+2) cells hu0: Initial momentum along x-axis incl ghost cells, (nx+1)*(ny+2) cells hv0: Initial momentum along y-axis incl ghost cells, (nx+2)*(ny+1) cells H: Depth from equilibrium defined on cell corners, (nx+5)*(ny+5) corners nx: Number of cells along x-axis ny: Number of cells along y-axis dx: Grid cell spacing along x-axis (20 000 m) dy: Grid cell spacing along y-axis (20 000 m) dt: Size of each timestep (90 s) g: Gravitational accelleration (9.81 m/s^2) f: Coriolis parameter (1.2e-4 s^1), effectively as f = f + beta*y r: Bottom friction coefficient (2.4e-3 m/s) subsample_f: Subsample the coriolis f when creating texture by factor angle: Angle of rotation from North to y-axis as a texture (cuda.Array) or numpy array (in radians) subsample_angle: Subsample the angles given as input when creating texture by factor latitude: Specify latitude. This will override any f and beta plane already set (in radians) t: Start simulation at time t theta: MINMOD theta used the reconstructions of the derivatives in the numerical scheme rk_order: Order of Runge Kutta method {1,2*,3} coriolis_beta: Coriolis linear factor -> f = f + beta*(y-y_0) max_wind_direction_perturbation: Large-scale model error emulation by per-time-step perturbation of wind direction by +/- max_wind_direction_perturbation (degrees) wind_stress: Wind stress parameters boundary_conditions: Boundary condition object small_scale_perturbation: Boolean value for applying a stochastic model error small_scale_perturbation_amplitude: Amplitude (q0 coefficient) for model error small_scale_perturbation_interpolation_factor: Width factor for correlation in model error model_time_step: The size of a data assimilation model step (default same as dt) reportGeostrophicEquilibrium: Calculate the Geostrophic Equilibrium variables for each superstep use_lcg: Use LCG as the random number generator. Default is False, which means using curand. write_netcdf: Write the results after each superstep to a netCDF file comm: MPI communicator local_particle_id: Local (for each MPI process) particle id desingularization_eps: Used for desingularizing hu/h flux_slope_eps: Used for setting zero flux for symmetric Riemann fan depth_cutoff: Used for defining dry cells super_dir_name: Directory to write netcdf files to netcdf_filename: Use this filename. (If not defined, a filename will be generated by SimWriter.) """ self.logger = logging.getLogger(__name__) assert (rk_order < 4 or rk_order > 0 ), "Only 1st, 2nd and 3rd order Runge Kutta supported" if (rk_order == 3): assert (r == 0.0 ), "3rd order Runge Kutta supported only without friction" # Sort out internally represented ghost_cells in the presence of given # boundary conditions ghost_cells_x = 2 ghost_cells_y = 2 #Coriolis at "first" cell x_zero_reference_cell = ghost_cells_x y_zero_reference_cell = ghost_cells_y # In order to pass it to the super constructor # Boundary conditions self.boundary_conditions = boundary_conditions #Compensate f for reference cell (first cell in internal of domain) north = np.array([np.sin(angle[0, 0]), np.cos(angle[0, 0])]) f = f - coriolis_beta * (x_zero_reference_cell * dx * north[0] + y_zero_reference_cell * dy * north[1]) x_zero_reference_cell = 0 y_zero_reference_cell = 0 A = None self.max_wind_direction_perturbation = max_wind_direction_perturbation super(CDKLM16, self).__init__(gpu_ctx, \ nx, ny, \ ghost_cells_x, \ ghost_cells_y, \ dx, dy, dt, \ g, f, r, A, \ t, \ theta, rk_order, \ coriolis_beta, \ y_zero_reference_cell, \ wind_stress, \ write_netcdf, \ ignore_ghostcells, \ offset_x, offset_y, \ comm, \ block_width, block_height, local_particle_id=local_particle_id) # Index range for interior domain (north, east, south, west) # so that interior domain of eta is # eta[self.interior_domain_indices[2]:self.interior_domain_indices[0], \ # self.interior_domain_indices[3]:self.interior_domain_indices[1] ] self.interior_domain_indices = np.array([-2, -2, 2, 2]) defines = { 'block_width': block_width, 'block_height': block_height, 'KPSIMULATOR_DESING_EPS': "{:.12f}f".format(desingularization_eps), 'KPSIMULATOR_FLUX_SLOPE_EPS': "{:.12f}f".format(flux_slope_eps), 'KPSIMULATOR_DEPTH_CUTOFF': "{:.12f}f".format(depth_cutoff), 'THETA': "{:.12f}f".format(self.theta), 'RK_ORDER': int(self.rk_order), 'NX': int(self.nx), 'NY': int(self.ny), 'DX': "{:.12f}f".format(self.dx), 'DY': "{:.12f}f".format(self.dy), 'GRAV': "{:.12f}f".format(self.g), 'FRIC': "{:.12f}f".format(self.r) } #Get kernels self.kernel = gpu_ctx.get_kernel( "CDKLM16_kernel.cu", defines=defines, compile_args={ # default, fast_math, optimal 'options': [ "--ftz=true", # false, true, true "--prec-div=false", # true, false, false, "--prec-sqrt=false", # true, false, false "--fmad=false" ] # true, true, false #'options': ["--use_fast_math"] #'options': ["--generate-line-info"], #nvcc_options=["--maxrregcount=39"], #'arch': "compute_50", #'code': "sm_50" }, jit_compile_args={ #jit_options=[(cuda.jit_option.MAX_REGISTERS, 39)] }) # Get CUDA functions and define data types for prepared_{async_}call() self.cdklm_swe_2D = self.kernel.get_function("cdklm_swe_2D") self.cdklm_swe_2D.prepare("fiPiPiPiPiPiPiPiPiffi") self.update_wind_stress(self.kernel, self.cdklm_swe_2D) # CUDA functions for finding max time step size: self.num_threads_dt = num_threads_dt self.num_blocks_dt = np.int32(self.global_size[0] * self.global_size[1]) self.update_dt_kernels = gpu_ctx.get_kernel("max_dt.cu", defines={ 'block_width': block_width, 'block_height': block_height, 'NUM_THREADS': self.num_threads_dt }) self.per_block_max_dt_kernel = self.update_dt_kernels.get_function( "per_block_max_dt") self.per_block_max_dt_kernel.prepare("iifffPiPiPiPifPi") self.max_dt_reduction_kernel = self.update_dt_kernels.get_function( "max_dt_reduction") self.max_dt_reduction_kernel.prepare("iPP") # Bathymetry self.bathymetry = Common.Bathymetry(gpu_ctx, self.gpu_stream, nx, ny, ghost_cells_x, ghost_cells_y, H, boundary_conditions) # Adjust eta for possible dry states Hm = self.downloadBathymetry()[1] eta0 = np.maximum(eta0, -Hm) # Create data by uploading to device self.gpu_data = Common.SWEDataArakawaA(self.gpu_stream, nx, ny, ghost_cells_x, ghost_cells_y, eta0, hu0, hv0) # Allocate memory for calculating maximum timestep host_dt = np.zeros((self.global_size[1], self.global_size[0]), dtype=np.float32) self.device_dt = Common.CUDAArray2D(self.gpu_stream, self.global_size[0], self.global_size[1], 0, 0, host_dt) host_max_dt_buffer = np.zeros((1, 1), dtype=np.float32) self.max_dt_buffer = Common.CUDAArray2D(self.gpu_stream, 1, 1, 0, 0, host_max_dt_buffer) self.courant_number = courant_number ## Allocating memory for geostrophical equilibrium variables self.reportGeostrophicEquilibrium = np.int32( reportGeostrophicEquilibrium) self.geoEq_uxpvy = None self.geoEq_Kx = None self.geoEq_Ly = None if self.reportGeostrophicEquilibrium: dummy_zero_array = np.zeros( (ny + 2 * ghost_cells_y, nx + 2 * ghost_cells_x), dtype=np.float32, order='C') self.geoEq_uxpvy = Common.CUDAArray2D(self.gpu_stream, nx, ny, ghost_cells_x, ghost_cells_y, dummy_zero_array) self.geoEq_Kx = Common.CUDAArray2D(self.gpu_stream, nx, ny, ghost_cells_x, ghost_cells_y, dummy_zero_array) self.geoEq_Ly = Common.CUDAArray2D(self.gpu_stream, nx, ny, ghost_cells_x, ghost_cells_y, dummy_zero_array) self.constant_equilibrium_depth = np.max(H) self.bc_kernel = Common.BoundaryConditionsArakawaA(gpu_ctx, \ self.nx, \ self.ny, \ ghost_cells_x, \ ghost_cells_y, \ self.boundary_conditions, \ boundary_conditions_data, \ ) def subsample_texture(data, factor): ny, nx = data.shape dx, dy = 1 / nx, 1 / ny I = interp2d(np.linspace(0.5 * dx, 1 - 0.5 * dx, nx), np.linspace(0.5 * dy, 1 - 0.5 * dy, ny), data, kind='linear') new_nx, new_ny = max(2, nx // factor), max(2, ny // factor) new_dx, new_dy = 1 / new_nx, 1 / new_ny x_new = np.linspace(0.5 * new_dx, 1 - 0.5 * new_dx, new_nx) y_new = np.linspace(0.5 * new_dy, 1 - 0.5 * new_dy, new_ny) return I(x_new, y_new) # Texture for angle self.angle_texref = self.kernel.get_texref("angle_tex") if isinstance(angle, cuda.Array): # angle is already a texture, so we just set the texture reference self.angle_texref.set_array(angle) else: #Upload data to GPU and bind to texture reference if (subsample_angle and angle.size >= eta0.size): self.logger.info("Subsampling angle texture by factor " + str(subsample_angle)) self.logger.warning( "This will give inaccurate angle along the border!") angle = subsample_texture(angle, subsample_angle) self.angle_texref.set_array( cuda.np_to_array(np.ascontiguousarray(angle, dtype=np.float32), order="C")) # Set texture parameters self.angle_texref.set_filter_mode( cuda.filter_mode.LINEAR) #bilinear interpolation self.angle_texref.set_address_mode( 0, cuda.address_mode.CLAMP) #no indexing outside domain self.angle_texref.set_address_mode(1, cuda.address_mode.CLAMP) self.angle_texref.set_flags( cuda.TRSF_NORMALIZED_COORDINATES) #Use [0, 1] indexing # Texture for coriolis f self.coriolis_texref = self.kernel.get_texref("coriolis_f_tex") # Create the CPU coriolis if (latitude is not None): if (self.f != 0.0): raise RuntimeError( "Cannot specify both latitude and f. Make your mind up.") coriolis_f, _ = OceanographicUtilities.calcCoriolisParams(latitude) coriolis_f = coriolis_f.astype(np.float32) else: if (self.coriolis_beta != 0.0): if (angle.size != 1): raise RuntimeError( "non-constant angle cannot be combined with beta plane model (makes no sense)" ) #Generate coordinates for all cells, including ghost cells from center to center # [-3/2dx, nx+3/2dx] for ghost_cells_x == 2 x = np.linspace((-self.ghost_cells_x + 0.5) * self.dx, (self.nx + self.ghost_cells_x - 0.5) * self.dx, self.nx + 2 * self.ghost_cells_x) y = np.linspace((-self.ghost_cells_y + 0.5) * self.dy, (self.ny + self.ghost_cells_y - 0.5) * self.dy, self.ny + 2 * self.ghost_cells_x) self.logger.info( "Using latitude to create Coriolis f texture ({:f}x{:f} cells)" .format(x.size, y.size)) x, y = np.meshgrid(x, y) n = x * np.sin(angle[0, 0]) + y * np.cos( angle[0, 0]) #North vector coriolis_f = self.f + self.coriolis_beta * n else: if (self.f.size == 1): coriolis_f = np.array([[self.f]], dtype=np.float32) elif (self.f.shape == eta0.shape): coriolis_f = np.array(self.f, dtype=np.float32) else: raise RuntimeError( "The shape of f should match up with eta or be scalar." ) if (subsample_f and coriolis_f.size >= eta0.size): self.logger.info("Subsampling coriolis texture by factor " + str(subsample_f)) self.logger.warning( "This will give inaccurate coriolis along the border!") coriolis_f = subsample_texture(coriolis_f, subsample_f) #Upload data to GPU and bind to texture reference self.coriolis_texref.set_array( cuda.np_to_array(np.ascontiguousarray(coriolis_f, dtype=np.float32), order="C")) # Set texture parameters self.coriolis_texref.set_filter_mode( cuda.filter_mode.LINEAR) #bilinear interpolation self.coriolis_texref.set_address_mode( 0, cuda.address_mode.CLAMP) #no indexing outside domain self.coriolis_texref.set_address_mode(1, cuda.address_mode.CLAMP) self.coriolis_texref.set_flags( cuda.TRSF_NORMALIZED_COORDINATES) #Use [0, 1] indexing # Small scale perturbation: self.small_scale_perturbation = small_scale_perturbation self.small_scale_model_error = None self.small_scale_perturbation_interpolation_factor = small_scale_perturbation_interpolation_factor if small_scale_perturbation: self.small_scale_model_error = OceanStateNoise.OceanStateNoise.fromsim( self, soar_q0=small_scale_perturbation_amplitude, interpolation_factor= small_scale_perturbation_interpolation_factor, use_lcg=use_lcg, block_width=block_width_model_error, block_height=block_height_model_error) # Data assimilation model step size self.model_time_step = model_time_step self.total_time_steps = 0 if model_time_step is None: self.model_time_step = self.dt if self.write_netcdf: self.sim_writer = SimWriter.SimNetCDFWriter(self, super_dir_name=super_dir_name, filename=netcdf_filename, \ ignore_ghostcells=self.ignore_ghostcells, offset_x=self.offset_x, offset_y=self.offset_y) # Update timestep if dt is given as zero if self.dt <= 0: self.updateDt()
def __init__(self, gpu_ctx, gpu_stream, nx, ny, dx, dy, boundaryConditions, staggered, soar_q0=None, soar_L=None, interpolation_factor=1, use_lcg=False, angle=np.array([[0]], dtype=np.float32), coriolis_f=np.array([[0]], dtype=np.float32), block_width=16, block_height=16): """ Initiates a class that generates small scale geostrophically balanced perturbations of the ocean state. (nx, ny): number of internal grid cells in the domain (dx, dy): size of each grid cell soar_q0: amplitude parameter for the perturbation, default: dx*1e-5 soar_L: length scale of the perturbation covariance, default: 0.74*dx*interpolation_factor interpolation_factor: indicates that the perturbation of eta should be generated on a coarse mesh, and then interpolated down to the computational mesh. The coarse mesh will then have (nx/interpolation_factor, ny/interpolation_factor) grid cells. use_lcg: LCG is a linear algorithm for generating a serie of pseudo-random numbers angle: Angle of rotation from North to y-axis as a texture (cuda.Array) or numpy array (block_width, block_height): The size of each GPU block """ self.use_lcg = use_lcg # Set numpy random state self.random_state = np.random.RandomState() # Make sure that all variables initialized within ifs are defined self.random_numbers = None self.rng = None self.seed = None self.host_seed = None self.gpu_ctx = gpu_ctx self.gpu_stream = gpu_stream self.nx = np.int32(nx) self.ny = np.int32(ny) self.dx = np.float32(dx) self.dy = np.float32(dy) self.staggered = np.int(0) if staggered: self.staggered = np.int(1) # The cutoff parameter is hard-coded. # The size of the cutoff determines the computational radius in the # SOAR function. Hence, the size of the local memory in the OpenCL # kernels has to be hard-coded. self.cutoff = np.int32(config.soar_cutoff) # Check that the interpolation factor plays well with the grid size: assert (interpolation_factor > 0 and interpolation_factor % 2 == 1), 'interpolation_factor must be a positive odd integer' assert (nx % interpolation_factor == 0 ), 'nx must be divisible by the interpolation factor' assert (ny % interpolation_factor == 0 ), 'ny must be divisible by the interpolation factor' self.interpolation_factor = np.int32(interpolation_factor) # The size of the coarse grid self.coarse_nx = np.int32(nx / self.interpolation_factor) self.coarse_ny = np.int32(ny / self.interpolation_factor) self.coarse_dx = np.float32(dx * self.interpolation_factor) self.coarse_dy = np.float32(dy * self.interpolation_factor) self.periodicNorthSouth = np.int32( boundaryConditions.isPeriodicNorthSouth()) self.periodicEastWest = np.int32( boundaryConditions.isPeriodicEastWest()) # Size of random field and seed # The SOAR function is a stencil which requires cutoff number of grid cells, # and the interpolation operator requires further 2 ghost cell values in each direction. # The random field must therefore be created with 2 + cutoff number of ghost cells. self.rand_ghost_cells_x = np.int32(2 + self.cutoff) self.rand_ghost_cells_y = np.int32(2 + self.cutoff) if self.periodicEastWest: self.rand_ghost_cells_x = np.int32(0) if self.periodicNorthSouth: self.rand_ghost_cells_y = np.int32(0) self.rand_nx = np.int32(self.coarse_nx + 2 * self.rand_ghost_cells_x) self.rand_ny = np.int32(self.coarse_ny + 2 * self.rand_ghost_cells_y) # Since normal distributed numbers are generated in pairs, we need to store half the number of # of seed values compared to the number of random numbers. self.seed_ny = np.int32(self.rand_ny) self.seed_nx = np.int32(np.ceil(self.rand_nx / 2)) # Generate seed: self.floatMax = 2147483648.0 if self.use_lcg: self.host_seed = self.random_state.rand( self.seed_ny, self.seed_nx) * self.floatMax self.host_seed = self.host_seed.astype(np.uint64, order='C') if not self.use_lcg: self.rng = XORWOWRandomNumberGenerator() else: self.seed = Common.CUDAArray2D(gpu_stream, self.seed_nx, self.seed_ny, 0, 0, self.host_seed, double_precision=True, integers=True) # Constants for the SOAR function: self.soar_q0 = np.float32(self.dx / 100000) if soar_q0 is not None: self.soar_q0 = np.float32(soar_q0) self.soar_L = np.float32(0.75 * self.coarse_dx) if soar_L is not None: self.soar_L = np.float32(soar_L) # Allocate memory for random numbers (xi) self.random_numbers_host = np.zeros((self.rand_ny, self.rand_nx), dtype=np.float32, order='C') self.random_numbers = Common.CUDAArray2D(self.gpu_stream, self.rand_nx, self.rand_ny, 0, 0, self.random_numbers_host) # Allocate a second buffer for random numbers (nu) self.perpendicular_random_numbers_host = np.zeros( (self.rand_ny, self.rand_nx), dtype=np.float32, order='C') self.perpendicular_random_numbers = Common.CUDAArray2D( self.gpu_stream, self.rand_nx, self.rand_ny, 0, 0, self.random_numbers_host) # Allocate memory for coarse buffer if needed # Two ghost cells in each direction needed for bicubic interpolation self.coarse_buffer_host = np.zeros( (self.coarse_ny + 4, self.coarse_nx + 4), dtype=np.float32, order='C') self.coarse_buffer = Common.CUDAArray2D(self.gpu_stream, self.coarse_nx, self.coarse_ny, 2, 2, self.coarse_buffer_host) # Allocate extra memory needed for reduction kernels. # Currently: A single GPU buffer with 3x1 elements: [xi^T * xi, nu^T * nu, xi^T * nu] self.reduction_buffer = None reduction_buffer_host = np.zeros((1, 3), dtype=np.float32) self.reduction_buffer = Common.CUDAArray2D(self.gpu_stream, 3, 1, 0, 0, reduction_buffer_host) # Generate kernels self.kernels = gpu_ctx.get_kernel("ocean_noise.cu", \ defines={'block_width': block_width, 'block_height': block_height}, compile_args={ 'options': ["--use_fast_math", "--maxrregcount=32"] }) self.reduction_kernels = self.gpu_ctx.get_kernel("reductions.cu", \ defines={}) # Get CUDA functions and define data types for prepared_{async_}call() # Generate kernels self.squareSumKernel = self.reduction_kernels.get_function("squareSum") self.squareSumKernel.prepare("iiPP") self.squareSumDoubleKernel = self.reduction_kernels.get_function( "squareSumDouble") self.squareSumDoubleKernel.prepare("iiPPP") self.makePerpendicularKernel = self.kernels.get_function( "makePerpendicular") self.makePerpendicularKernel.prepare("iiPiPiP") self.uniformDistributionKernel = self.kernels.get_function( "uniformDistribution") self.uniformDistributionKernel.prepare("iiiPiPi") self.normalDistributionKernel = None if self.use_lcg: self.normalDistributionKernel = self.kernels.get_function( "normalDistribution") self.normalDistributionKernel.prepare("iiiPiPi") self.soarKernel = self.kernels.get_function("SOAR") self.soarKernel.prepare("iifffffiiPiPii") self.geostrophicBalanceKernel = self.kernels.get_function( "geostrophicBalance") self.geostrophicBalanceKernel.prepare("iiffiiffffPiPiPiPiPif") self.bicubicInterpolationKernel = self.kernels.get_function( "bicubicInterpolation") self.bicubicInterpolationKernel.prepare( "iiiiffiiiiffiiffffPiPiPiPiPif") #Compute kernel launch parameters self.local_size = (block_width, block_height, 1) self.local_size_reductions = (128, 1, 1) self.global_size_reductions = (1, 1) # Launch one thread for each seed, which in turns generates two iid N(0,1) self.global_size_random_numbers = ( \ int(np.ceil(self.seed_nx / float(self.local_size[0]))), \ int(np.ceil(self.seed_ny / float(self.local_size[1]))) \ ) # Launch on thread for each random number (in order to create perpendicular random numbers) self.global_size_perpendicular = ( \ int(np.ceil(self.rand_nx / float(self.local_size[0]))), \ int(np.ceil(self.rand_ny / float(self.local_size[1]))) \ ) # Launch one thread per SOAR-correlated result - need to write to two ghost # cells in order to do bicubic interpolation based on the result self.global_size_SOAR = ( \ int(np.ceil( (self.coarse_nx+4)/float(self.local_size[0]))), \ int(np.ceil( (self.coarse_ny+4)/float(self.local_size[1]))) \ ) # One thread per resulting perturbed grid cell self.global_size_geo_balance = ( \ int(np.ceil( (self.nx)/float(self.local_size[0]))), \ int(np.ceil( (self.ny)/float(self.local_size[1]))) \ ) # Texture for coriolis field self.coriolis_texref = self.kernels.get_texref("coriolis_f_tex") if isinstance(coriolis_f, cuda.Array): # coriolis_f is already a texture, so we just set the reference self.coriolis_texref.set_array(coriolis_f) else: #Upload data to GPU and bind to texture reference self.coriolis_texref.set_array( cuda.np_to_array(np.ascontiguousarray(coriolis_f, dtype=np.float32), order="C")) # Set texture parameters self.coriolis_texref.set_filter_mode( cuda.filter_mode.LINEAR) #bilinear interpolation self.coriolis_texref.set_address_mode( 0, cuda.address_mode.CLAMP) #no indexing outside domain self.coriolis_texref.set_address_mode(1, cuda.address_mode.CLAMP) self.coriolis_texref.set_flags( cuda.TRSF_NORMALIZED_COORDINATES) #Use [0, 1] indexing # FIXME! Allow different versions of coriolis, similar to CDKLM # Texture for angle towards north self.angle_texref = self.kernels.get_texref("angle_tex") if isinstance(angle, cuda.Array): # angle is already a texture, so we just set the reference self.angle_texref.set_array(angle) else: #Upload data to GPU and bind to texture reference self.angle_texref.set_array( cuda.np_to_array(np.ascontiguousarray(angle, dtype=np.float32), order="C")) # Set texture parameters self.angle_texref.set_filter_mode( cuda.filter_mode.LINEAR) #bilinear interpolation self.angle_texref.set_address_mode( 0, cuda.address_mode.CLAMP) #no indexing outside domain self.angle_texref.set_address_mode(1, cuda.address_mode.CLAMP) self.angle_texref.set_flags( cuda.TRSF_NORMALIZED_COORDINATES) #Use [0, 1] indexing
mod = SourceModule(source_file % {"NUMBER_OF_GENERATORS": output_width * output_height}, no_extern_c=True, include_dirs=[os.getcwd()]) init_rand_num_generators = mod.get_function("initRandNumGenerators") render = mod.get_function("render") #Settings for the volume density texture. texture = mod.get_texref("gVolumeTexture") texture.set_address_mode(0, cuda.address_mode.BORDER) texture.set_address_mode(1, cuda.address_mode.BORDER) texture.set_address_mode(2, cuda.address_mode.BORDER) texture.set_filter_mode(cuda.filter_mode.LINEAR) texture.set_format(cuda.array_format.FLOAT, 1) texture.set_flags(cuda.TRSF_NORMALIZED_COORDINATES) texture.set_array(cuda.np_to_array(volume_data, order="F")) output = np.zeros((output_height, output_width, 3)).astype(np.float32) init_rand_num_generators(block=(block_size * block_size, 1, 1), grid=((output_width * output_height) // (block_size * block_size) + 1, 1, 1)) #GUI window = tk.Tk() canvas = tk.Canvas(window, width=output_width, height=output_height) canvas.pack() image_on_canvas = canvas.create_image(0, 0, anchor=tk.NW) for sample_no in range(1, number_of_samples + 1): render(cuda.InOut(output),
def __init__(self, vol, img, ray_step_mm=None, render_op='drr'): source = """ #include "cuda_math.h" //------------------------------ DATA STRUCTURES ------------------------------- struct sRenderParams { // 2D detector data uint2 sizes2D; float2 steps2D; // 3D image data uint3 sizes3D; float1 __padding1; float3 steps3D; float1 __padding2; float3 boxmin; float1 __padding3; float3 boxmax; float1 __padding4; // Source position float3 ray_org; float1 __padding5; // Step along rays float1 ray_step, __padding6; // Transformation from 2D image to 2D plane in WCS float T2D[16]; }; //-------------------------------- DEVICE CODE --------------------------------- // Device variables extern "C" { texture<float, cudaTextureType3D, cudaReadModeElementType> d_tex; } // Intersect ray with a 3D volume: // see https://wiki.aalto.fi/download/attachments/40023967/ // gpgpu.pdf?version=1&modificationDate=1265652539000 __device__ int intersectBox( float3 ray_org, float3 raydir, sRenderParams *d_params, float *tnear, float *tfar ) { // Compute intersection of ray with all six bbox planes float3 invR = make_float3(1.0f) / raydir; float3 tbot = invR * (d_params->boxmin - ray_org); float3 ttop = invR * (d_params->boxmax - ray_org); // Re-order intersections to find smallest and largest on each axis float3 tmin = fminf(ttop, tbot); float3 tmax = fmaxf(ttop, tbot); // Find the largest tmin and the smallest tmax float largest_tmin = fmaxf(fmaxf(tmin.x, tmin.y), fmaxf(tmin.x, tmin.z)); float smallest_tmax = fminf(fminf(tmax.x, tmax.y), fminf(tmax.x, tmax.z)); *tnear = largest_tmin; *tfar = smallest_tmax; return smallest_tmax > largest_tmin; } // Define DRR operator struct drr_operator { static __inline__ __host__ __device__ void compute ( float &in, float &acc ) { acc += in; } }; // Define MIP operator struct mip_operator { static __inline__ __host__ __device__ void compute ( float &in, float &acc ) { if(in > acc) acc = in; } }; // Define MINIP operator struct minip_operator { static __inline__ __host__ __device__ void compute ( float &in, float &acc ) { if(in < acc) acc = in; } }; // Homogeneous transformation: // multiplication of a point w homog. transf. matrix /*static __inline__ __host__ __device__ float3 hom_trans(float*& Tx, float3& pos) { float xw = Tx[0]*pos.x + Tx[4]*pos.y + Tx[8]*pos.z + Tx[12]; float yw = Tx[1]*pos.x + Tx[5]*pos.y + Tx[9]*pos.z + Tx[13]; float zw = Tx[2]*pos.x + Tx[6]*pos.y + Tx[10]*pos.z + Tx[14]; return make_float3( xw, yw, zw ); }*/ static __inline__ __host__ __device__ float3 hom_trans(float*& Tx, float3& pos) { float xw = Tx[0]*pos.x + Tx[1]*pos.y + Tx[2]*pos.z + Tx[3]; float yw = Tx[4]*pos.x + Tx[5]*pos.y + Tx[6]*pos.z + Tx[7]; float zw = Tx[8]*pos.x + Tx[9]*pos.y + Tx[10]*pos.z + Tx[11]; return make_float3( xw, yw, zw ); } // Rendering kernel: // traverses the volume and performs linear interpolation extern "C" { __global__ void render_kernel( float* d_image, float* d_Tx, float* d_TxT2D, sRenderParams *d_params ) { // Resolve 2D image index float x = blockIdx.x*blockDim.x + threadIdx.x; float y = blockIdx.y*blockDim.y + threadIdx.y; if ( (uint(x) >= d_params->sizes2D.x) || (uint(y) >= d_params->sizes2D.y) ) return; float3 ray_org, pos2D; // Transform source position to volume space ray_org = hom_trans( d_Tx, d_params->ray_org ); // Create a point in 2D detector space pos2D = make_float3( x*d_params->steps2D.x, y*d_params->steps2D.y, 0.0f ); // Inline homogeneous transformation to volume space // ie., (x,y) pixel in 3D volume coordinate system pos2D = hom_trans( d_TxT2D, pos2D ); // Find eye ray in world space that points from the X-ray source // to the current pixel on the detector plane: // - ray origin is in the X-ray source (xs,ys,zs) // - unit vector points to the point in detector plane (xw-xs,yw-ys,zw-zs) float3 ray_dir = normalize( pos2D - ray_org ); // Find intersection with 3D volume float tnear, tfar; if ( ! intersectBox(ray_org, ray_dir, d_params, &tnear, &tfar) ) return; // March along ray from front to back float dt = d_params->ray_step.x; float3 pos = make_float3( (ray_org.x + ray_dir.x*tnear) / d_params->steps3D.x, (ray_org.y + ray_dir.y*tnear) / d_params->steps3D.y, (ray_org.z + ray_dir.z*tnear) / d_params->steps3D.z); float3 step = make_float3( ray_dir.x * dt / d_params->steps3D.x, ray_dir.y * dt / d_params->steps3D.y, ray_dir.z * dt / d_params->steps3D.z); #ifdef RENDER_MINIP float acc = 1e+7; #else float acc = 0; #endif for( ; tnear<=tfar; tnear+=dt ) { // resample the volume float sample = tex3D( d_tex, pos.x+0.5f, pos.y+0.5f, pos.z+0.5f ); #ifdef RENDER_MAXIP mip_operator::compute( sample, acc ); #elif RENDER_MINIP minip_operator::compute( sample, acc ); #elif RENDER_DRR drr_operator::compute( sample, acc ); #endif // update position pos += step; } // Write to the output buffer uint idx = uint(x) + uint(y) * d_params->sizes2D.x; d_image[idx] = acc; } } // Rendering kernel: // traverses the volume and performs linear interpolation // for selected points in the 2d image extern "C" { __global__ void render_kernel_idx( float* d_image, uint* d_idx, uint max_idx, float* d_Tx, float* d_TxT2D, sRenderParams *d_params ) { // Resolve 1D index uint idx = blockIdx.x*blockDim.x + threadIdx.x; if ( idx > max_idx ) return; uint idx_t = d_idx[ idx ]; // Resolve 2D image index uint y = idx_t / d_params->sizes2D.x; uint x = idx_t - y * d_params->sizes2D.x; //if ( (uint(x) >= d_params->sizes2D.x) || // (uint(y) >= d_params->sizes2D.y) ) // return; float3 ray_org, pos2D; // Transform souce position to volume space ray_org = hom_trans( d_Tx, d_params->ray_org ); // Create a point in 2D detector space pos2D = make_float3(float(x)*d_params->steps2D.x, float(y)*d_params->steps2D.y, 0.0f); // Inline homogeneous transformation to volume space // ie., (x,y) pixel in 3D volume coordinate system pos2D = hom_trans( d_TxT2D, pos2D ); // Find eye ray in world space that points from the X-ray source // to the current pixel on the detector plane: // - ray origin is in the X-ray source (xs,ys,zs) // - unit vector points to the point in detector plane (xw-xs,yw-ys,zw-zs) float3 ray_dir = normalize( pos2D - ray_org ); // Find intersection with 3D volume float tnear, tfar; if ( ! intersectBox(ray_org, ray_dir, d_params, &tnear, &tfar) ) return; // March along ray from front to back float dt = d_params->ray_step.x; float3 pos = make_float3( (ray_org.x + ray_dir.x*tnear)/d_params->steps3D.x, (ray_org.y + ray_dir.y*tnear)/d_params->steps3D.y, (ray_org.z + ray_dir.z*tnear)/d_params->steps3D.z); float3 step = make_float3( ray_dir.x*dt/d_params->steps3D.x, ray_dir.y*dt/d_params->steps3D.y, ray_dir.z*dt/d_params->steps3D.z); #ifdef RENDER_MINIP float acc = 1e+7; #else float acc = 0; #endif for( ; tnear<=tfar; tnear+=dt ) { // resample the volume float sample = tex3D(d_tex, pos.x+0.5f, pos.y+0.5f, pos.z+0.5f); #ifdef RENDER_MAXIP mip_operator::compute( sample, acc ); #elif RENDER_MINIP minip_operator::compute( sample, acc ); #elif RENDER_DRR drr_operator::compute( sample, acc ); #endif // update position pos += step; } // Write to the output buffer d_image[idx] = acc; } } """ if render_op not in self.VALID_RENDER_OPERATION: raise ValueError( 'Rendering operation "{}" is not valid.'.format(render_op)) cmodule = pycuda.compiler.SourceModule( source, options=['-DRENDER_{}'.format(render_op.upper())], include_dirs=[ "C:\\Users\\Ana\\Documents\\ROBOTSKI VID SEMINAR\\include" ], no_extern_c=True) # include_dirs=[os.path.join(os.getcwd(), 'include')], self._texture = cmodule.get_texref('d_tex') self._renderer = cmodule.get_function('render_kernel') self._renderer_idx = cmodule.get_function('render_kernel_idx') if ray_step_mm is None: ray_step_mm = float(np.linalg.norm(vol['spac']) / 2.0) self._params_d = cuda.mem_alloc(RenderParams.mem_size) self.params = RenderParams( RenderParams.Attributes( sizes2d=img['img'].shape[::-1], steps2d=img['spac'], sizes3d=vol['img'].shape[::-1], steps3d=vol['spac'], boxmin=np.array((0, 0, 0), dtype='float32').flatten(), boxmax=(np.array(vol['img'].shape[::-1]) - 1.0) * np.array(vol['spac']).astype('float32').flatten(), ray_org=img['SPos'].flatten(), ray_step=ray_step_mm, trans_2d=np.array(img['TPos'], dtype='float32').flatten()), self._params_d) # Copy array to texture memory self._texture.set_array( cuda.np_to_array(vol['img'].astype('float32'), order='C')) # We could set the next if we wanted to address the image # in normalized coordinates ( 0 <= coordinate < 1.) # self._texture.set_flags(cuda.TRSF_READ_AS_INTEGER) # self._texture.set_flags(cuda.TRSF_NORMALIZED_COORDINATES) # Perform linear interpolation self._texture.set_filter_mode(cuda.filter_mode.LINEAR) self._texture.set_address_mode(0, cuda.address_mode.CLAMP) self._texture.set_address_mode(1, cuda.address_mode.CLAMP) self._texture.set_address_mode(2, cuda.address_mode.CLAMP) # check max threads per block for current GPU max_threads_per_block = tools.DeviceData().max_threads if self.BLOCK_SIZE_1D is None: self.BLOCK_SIZE_1D = max_threads_per_block elif self.BLOCK_SIZE_1D > max_threads_per_block: raise ValueError( 'Parameter BLOCK_SIZE_1D={} exceeds maximal pool of threads per block ' '(current GPU has maximum of {} threads per block).'.format( self.BLOCK_SIZE_1D, max_threads_per_block)) if self.BLOCK_SIZE_2D is None: self.BLOCK_SIZE_2D = 2 while self.BLOCK_SIZE_2D**2 < max_threads_per_block: self.BLOCK_SIZE_2D *= 2 elif self.BLOCK_SIZE_2D**2 > max_threads_per_block: raise ValueError( 'Parameter BLOCK_SIZE_2D={} (squared=) exceeds maximal pool of threads per block ' '(current GPU has maximum of {} threads per block).'.format( self.BLOCK_SIZE_2D, self.BLOCK_SIZE_2D**2, max_threads_per_block)) # threads per block nx, ny = self.params.values.sizes2d self._blocksize_2d = ( nx if nx < self.BLOCK_SIZE_2D else self.BLOCK_SIZE_2D, ny if ny < self.BLOCK_SIZE_2D else self.BLOCK_SIZE_2D, 1) # blocks per grid self._gridsize_2d = (int(nx / self._blocksize_2d[0]), int(ny / self._blocksize_2d[1]), 1)
def _upload_ref_vol(self, data): """Upload the reference volume into texture memory.""" assert data.dtype == np.float32, "np.float32 is required" ref_vol = cuda.np_to_array(data, 'C') self._texture.set_array(ref_vol) return ref_vol
def find_bubbles(I, scale=1., fil='kspace'): """brute force method""" zeta = 40. Z = 12. RMAX = 30. RMIN = 1. mm = mmin(Z) smin = sig0(m2R(mm)) deltac = Deltac(Z) fgrowth = deltac/1.686 #fgrowth = pb.fgrowth(Z, cosmo['omega_M_0'], unnormed=True) """find bubbbles for deltax box I""" kernel_source = open("find_bubbles.cu").read() kernel_code = kernel_source % { 'DELTAC': deltac, 'RMIN': RMIN, 'SMIN': smin, 'ZETA': zeta } main_module = nvcc.SourceModule(kernel_code) if fil == 'rspace': kernel = main_module.get_function("real_tophat_kernel") elif fil == 'kspace': kernel = main_module.get_function("k_tophat_kernel") image_texture = main_module.get_texref("img") # Get contiguous image + shape. height, width, depth = I.shape I = np.float32(I.copy()*fgrowth) # Get block/grid size for steps 1-3. block_size = (8,8,8) grid_size = (width/(block_size[0])+1, height/(block_size[0])+1, depth/(block_size[0])+1) # Initialize variables. ionized = np.zeros([height,width,depth]) ionized = np.float32(ionized) width = np.int32(width) # Transfer labels asynchronously. ionized_d = gpuarray.to_gpu_async(ionized) I_cu = cu.np_to_array(I, order='C') cu.bind_array_to_texref(I_cu, image_texture) R = RMAX while R > RMIN: print R Rpix = np.float32(R/scale) S0 = np.float32(sig0(R)) start = cu.Event() end = cu.Event() start.record() kernel(ionized_d, width, Rpix, S0, block=block_size, grid=HII_grid_size) end.record() end.synchronize() R *= (1./1.5) ionized = ionized_d.get() return ionized
def watershed(I, mask=None): kernel_source = open("Dwatershed.cu").read() main_module = nvcc.SourceModule(kernel_source) descent_kernel = main_module.get_function("descent_kernel") stabilize_kernel = main_module.get_function("stabilize_kernel") image_texture = main_module.get_texref("img") plateau_kernel = main_module.get_function("plateau_kernel") minima_kernel = main_module.get_function("minima_kernel") flood_kernel = main_module.get_function("flood_kernel") increment_kernel = main_module.get_function("increment_kernel") # Get contiguous image + shape. height, width, depth = I.shape I = np.float32(I.copy()) if mask is None: mask = np.ones(I.shape) mask = np.int32(mask) # Get block/grid size for steps 1-3. block_size = (8, 8, 8) grid_size = (width / (block_size[0] - 2) + 1, height / (block_size[0] - 2) + 1, depth / (block_size[0] - 2) + 1) # # Get block/grid size for step 4. # block_size2 = (10,10,10) # grid_size2 = (width/(block_size2[0]-2)+1, # height/(block_size2[0]-2)+1, # depth/(block_size2[0]-2)+1) # Initialize variables. labeled = np.zeros([height, width, depth]) labeled = np.float64(labeled) width = np.int32(width) height = np.int32(height) depth = np.int32(depth) count = np.int32([0]) # Transfer labels asynchronously. labeled_d = gpu.to_gpu_async(labeled) counters_d = gpu.to_gpu_async(count) # mask_d = cu.np_to_array( mask, order='C' ) # cu.bind_array_to_texref(mask_d, mask_texture) # Bind CUDA textures. #I_cu = cu.matrix_to_array(I, order='C') I_cu = cu.np_to_array(I, order='C') cu.bind_array_to_texref(I_cu, image_texture) # Step 1. descent_kernel(labeled_d, width, height, depth, block=block_size, grid=grid_size) start_time = cu.Event() end_time = cu.Event() start_time.record() counters_d = gpu.to_gpu(np.int32([0])) #counters_d = gpu.to_gpu_async(np.int32([0])) old, new = -1, -2 it = 0 while old != new: it += 1 old = new plateau_kernel(labeled_d, counters_d, width, height, depth, block=block_size, grid=grid_size) new = counters_d.get()[0] print 'plateau kernel', it - 2 # Step 2. increment_kernel(labeled_d, width, height, depth, block=block_size, grid=grid_size) counters_d = gpu.to_gpu(np.int32([0])) old, new = -1, -2 it = 0 while old != new: it += 1 old = new minima_kernel(labeled_d, counters_d, width, height, depth, block=block_size, grid=grid_size) new = counters_d.get()[0] print 'minima kernel', it - 2 # Step 3. # counters_d = gpu.to_gpu(np.int32([0])) # old, new = -1, -2; it = 0 # while old != new: # it +=1 # old = new # plateau_kernel(labeled_d, counters_d, width, # height, depth, block=block_size, grid=grid_size) # new = counters_d.get()[0] # print 'plateau kernel', it-2 # Step 4 counters_d = gpu.to_gpu(np.int32([0])) old, new = -1, -2 it = 0 while old != new: it += 1 old = new flood_kernel(labeled_d, counters_d, width, height, depth, block=block_size, grid=grid_size) new = counters_d.get()[0] print 'flood kernel', it - 2 labels = labeled_d.get() labels = labels * mask # End GPU timers. end_time.record() end_time.synchronize() gpu_time = start_time.\ time_till(end_time) * 1e-3 # print str(gpu_time) #cu.DeviceAllocation.free(counters_d) del counters_d return labels