def calculate_sizes(self, tl_args): hw_constrained_threads_per_block = cuda_tools.DeviceData().max_threads # T <= floor(MAX_shared / (13M + 8N)) from cuTauLeaping paper eq (5) # threads_per_block = math.floor( # max_shared_mem / (13 * tl_args.M + 8 * tl_args.N)) # HOWEVER, for my implementation: # type size var number # curandStateMRG32k3a 80 rstate 1 # uint 32 x N # float 32 c P # float 32 a M # u char 8 Xeta M # int 32 K M # int 32 x_prime N # T <= floor(Max_shared / (9M + 8N + 4P + 10) (bytes) max_shared_mem = cuda_tools.DeviceData().shared_memory shared_mem_constrained_threads_per_block = math.floor( max_shared_mem / (9 * tl_args.M + 8 * tl_args.N + 4 * len(tl_args.c)) + 10) max_threads_per_block = min(hw_constrained_threads_per_block, shared_mem_constrained_threads_per_block) warp_size = cuda_tools.DeviceData().warp_size # optimal T is a multiple of warp size max_warps_per_block = math.floor(max_threads_per_block / warp_size) max_optimal_threads_per_block = max_warps_per_block * warp_size if (max_optimal_threads_per_block >= 256) and (tl_args.U >= 2560): block_size = 256 elif max_optimal_threads_per_block >= 128 and (tl_args.U >= 1280): block_size = 128 elif max_optimal_threads_per_block >= 64 and (tl_args.U >= 640): block_size = 64 elif max_optimal_threads_per_block >= 32: block_size = 32 else: block_size = max_optimal_threads_per_block if tl_args.U <= 2: block_size = tl_args.U grid_size = int(math.ceil(float(tl_args.U) / float(block_size))) tl_args.U = int(grid_size * block_size) return grid_size, block_size
def setVariables(self): # compile the update functions for H and W as elementwise Matrix-Mult. # is not in skcuda H_size = self.rank * self.m W_size = self.n * self.rank max_threads = tools.DeviceData().max_threads self.block_H = int(np.min([H_size, max_threads])) self.block_W = int(np.min([W_size, max_threads])) self.grid_H = np.int(np.ceil(H_size/np.float32(self.block_H))) self.grid_W = np.int(np.ceil(W_size/np.float32(self.block_W))) mod_H = compiler.SourceModule(update_kernel_code % H_size) mod_W = compiler.SourceModule(update_kernel_code % W_size) self.update_H = mod_H.get_function("ew_md") self.update_W = mod_W.get_function("ew_md") # allocate the matrices on the GPU self.H_gpu = gpuarray.to_gpu(self.H) self.W_gpu = gpuarray.to_gpu(self.W) self.X_gpu = gpuarray.to_gpu(self.X) self.WTW_gpu = gpuarray.empty((self.rank, self.rank), np.float32) self.WTWH_gpu = gpuarray.empty(self.H.shape, np.float32) self.WTX_gpu = gpuarray.empty(self.H.shape, np.float32) self.XHT_gpu = gpuarray.empty(self.W.shape, np.float32) self.WH_gpu = gpuarray.empty(self.X.shape, np.float32) self.WHHT_gpu = gpuarray.empty(self.W.shape, np.float32)
def setVariables(self): n, m, r = self.n, self.m, self.rank # compile the matrix separations and G update functions for CUDA G_size = m * r FTF_size = r**2 max_threads = tools.DeviceData().max_threads self.block_G = int(np.min([G_size, max_threads])) self.grid_G = np.int(np.ceil(G_size/np.float32(self.block_G))) self.block_FTF = int(np.min([FTF_size, max_threads])) self.grid_FTF = np.int(np.ceil(FTF_size/np.float32(self.block_FTF))) mod_msepXTF = compiler.SourceModule(matrix_separation_code % G_size) mod_msepFTF = compiler.SourceModule(matrix_separation_code % FTF_size) mod_Gupdate = compiler.SourceModule(G_update_code % G_size) self.matrix_separationXTF = \ mod_msepXTF.get_function("matrix_separation") self.matrix_separationFTF = \ mod_msepFTF.get_function("matrix_separation") self.G_ew_update = mod_Gupdate.get_function("G_ew_update") # allocate the matrices on the GPU self.G_gpu = gpuarray.to_gpu(self.G) self.F_gpu = gpuarray.empty((n,r), np.float32) self.X_gpu = gpuarray.to_gpu(self.X) self.GTG_gpu = gpuarray.empty((r,r), np.float32) self.GTGinv_gpu = gpuarray.empty((r,r), np.float32) self.XG_gpu = gpuarray.empty((n,r), np.float32) self.XTF_gpu = gpuarray.empty((m,r), np.float32) self.FTF_gpu = gpuarray.empty((r,r), np.float32) self.XTFpos_gpu = gpuarray.empty((m,r), np.float32) self.XTFneg_gpu = gpuarray.empty((m,r), np.float32) self.FTFpos_gpu = gpuarray.empty((r,r), np.float32) self.FTFneg_gpu = gpuarray.empty((r,r), np.float32) self.GFTFneg_gpu = gpuarray.empty((m,r), np.float32) self.GFTFpos_gpu = gpuarray.empty((m,r), np.float32)
def get_kernel_function_info(a, W1=0, W2=1, W3=1): """Show kernel information Including 1. max #threads per block, 2. active warps per MP, 3. thread block per MP, 4. usage of shared memory, 5. const memory , 6. local memory 7. registers 8. hardware occupancy 9. limitation of the hardware occupancy """ import pycuda.tools as tl import pycuda.driver as dri dev = dri.Device(0) td = tl.DeviceData() if not W1: W1 = a.max_threads_per_block to = tl.OccupancyRecord(td, W1 * W2 * W3, a.shared_size_bytes, a.num_regs) print "***************************************" print " Function Info " print " -> max threads per block: %d / %d / %d" % \ (a.max_threads_per_block, dev.max_threads_per_block, dev.max_threads_per_multiprocessor) print " -> shared mem : %d / %d" % (a.shared_size_bytes, td.shared_memory) print " -> const mem : %d" % a.const_size_bytes print " -> local mem : %d" % a.local_size_bytes print " -> register : %d / %d" % (a.num_regs, td.registers) print " -> thread block per MP %d / %d" % \ (to.tb_per_mp, td.thread_blocks_per_mp) print " -> warps per MP %d / %d" % (to.warps_per_mp, td.warps_per_mp) print " -> occupancy %f" % to.occupancy print " -> limitation %s" % to.limited_by print " Block size : %dx%dx%d" % (W1, W2, W3) print "***************************************"
def run(self): # obtain a CUDA context driver.init() if self._card < 0: self._context = tools.make_default_context() else: self._context = driver.Device(self._card).make_context() if self._info: print "cuda-sim: running on device ", self._card, self._context.get_device().name(), \ self._context.get_device().pci_bus_id() # hack for SDE code self._device = 0 # compile code self._completeCode, self._compiledRunMethod = self._compile( self._stepCode) blocks, threads = self._get_optimal_gpu_param() if self._info: print "cuda-sim: threads/blocks:", threads, blocks # make multiples of initValues incase beta > 1 init_new = np.zeros( (len(self._initValues) * self._beta, self._speciesNumber)) for i in range(len(self._initValues)): for j in range(self._beta): for k in range(self._speciesNumber): init_new[i * self._beta + j][k] = self._initValues[i][k] self._initValues = copy.deepcopy(init_new) if self._info: print "cuda-sim: kernel mem local / shared / registers : ", self._compiledRunMethod.local_size_bytes, \ self._compiledRunMethod.shared_size_bytes, self._compiledRunMethod.num_regs occ = tools.OccupancyRecord( tools.DeviceData(), threads=threads, shared_mem=self._compiledRunMethod.shared_size_bytes, registers=self._compiledRunMethod.num_regs) print "cuda-sim: threadblocks per mp / limit / occupancy :", occ.tb_per_mp, occ.limited_by, occ.occupancy if self._timing: start = time.time() # number of device calls runs = int(math.ceil(blocks / float(self._MAXBLOCKSPERDEVICE))) for i in range(runs): # for last device call calculate number of remaining threads to run if i == runs - 1: runblocks = int(blocks % self._MAXBLOCKSPERDEVICE) if runblocks == 0: runblocks = self._MAXBLOCKSPERDEVICE else: runblocks = int(self._MAXBLOCKSPERDEVICE) if self._info: print "cuda-sim: Run", runblocks, "blocks." min_index = self._MAXBLOCKSPERDEVICE * i * threads max_index = min_index + threads * runblocks run_parameters = self._parameters[min_index / self._beta:max_index / self._beta] run_init_values = self._initValues[min_index:max_index] # first run store return Value if i == 0: self._returnValue = self._run_simulation( run_parameters, run_init_values, runblocks, threads) else: self._returnValue = np.append( self._returnValue, self._run_simulation(run_parameters, run_init_values, runblocks, threads), axis=0) self.output_cpu.put([self._card, self._returnValue]) self.output_cpu.close() # if self._timing: # print "cuda-sim: GPU blocks / threads / running time:", threads, blocks, round((time.time()-start),4), "s" if self._info: print "" # return the context self._context.pop() del self._context return self._returnValue
""" Pycuda modules """ import pycuda.driver as drv if auto_init_context: import pycuda.autoinit ctx = pycuda.autoinit.context else: drv.init() dev = drv.Device(0) ctx = dev.make_context(drv.ctx_flags.SCHED_AUTO | drv.ctx_flags.MAP_HOST) from pycuda.compiler import SourceModule import pycuda.tools as tl td = tl.DeviceData() cuda = drv """ ANUGA modules """ from anuga_cuda import kernel_path as kp from anuga_cuda import * #domain1 = domain_create() #domain2 = rearrange_domain(domain1) #domain2 = domain_create() domain1 = generate_merimbula_domain() domain2 = generate_merimbula_domain() sort_domain(domain2) N = domain2.number_of_elements
def run(self, parameters, initValues, timing=True, info=False): #check parameters and initValues for compability with pre-defined parameterNumber and spieciesNumber if (len(parameters[0]) != self._parameterNumber): print "Error: Number of parameters specified (" + str( self. _parameterNumber) + ") and given in parameter array (" + str( len(parameters[0])) + ") differ from each other!" exit() elif (len(initValues[0]) != self._speciesNumber): print "Error: Number of species specified (" + str( self._speciesNumber) + ") and given in species array (" + str( len(initValues[0])) + ") differ from each other!" exit() elif (len(parameters) != len(initValues)): print "Error: Number of sets of parameters (" + str( len(parameters)) + ") and species (" + str( len(initValues)) + ") do not match!" exit() if (self._compiledRunMethod == None and self._runtimeCompile): #compile to determine blocks and threads self._completeCode, self._compiledRunMethod = self._compileAtRuntime( self._stepCode, parameters) blocks, threads = self._getOptimalGPUParam(parameters) if info == True: print "cuda-sim: threads/blocks:", threads, blocks # real runtime compile #self._seedValue = seed #np.random.seed(self._seedValue) # make multiples of initValues initNew = np.zeros((len(initValues) * self._beta, self._speciesNumber)) for i in range(len(initValues)): for j in range(self._beta): for k in range(self._speciesNumber): initNew[i * self._beta + j][k] = initValues[i][k] initValues = initNew if info == True: print "cuda-sim: kernel mem local / shared / registers : ", self._compiledRunMethod.local_size_bytes, self._compiledRunMethod.shared_size_bytes, self._compiledRunMethod.num_regs occ = tools.OccupancyRecord( tools.DeviceData(), threads=threads, shared_mem=self._compiledRunMethod.shared_size_bytes, registers=self._compiledRunMethod.num_regs) print "cuda-sim: threadblocks per mp / limit / occupancy :", occ.tb_per_mp, occ.limited_by, occ.occupancy if timing: start = time.time() # number of device calls runs = int(math.ceil(blocks / float(self._MAXBLOCKSPERDEVICE))) for i in range(runs): # for last device call calculate number of remaining threads to run if (i == runs - 1): runblocks = int(blocks % self._MAXBLOCKSPERDEVICE) if (runblocks == 0): runblocks = self._MAXBLOCKSPERDEVICE else: runblocks = int(self._MAXBLOCKSPERDEVICE) if info == True: print "cuda-sim: Run", runblocks, "blocks." minIndex = self._MAXBLOCKSPERDEVICE * i * threads maxIndex = minIndex + threads * runblocks runParameters = parameters[minIndex / self._beta:maxIndex / self._beta] runInitValues = initValues[minIndex:maxIndex] #first run store return Value if (i == 0): returnValue = self._runSimulation(runParameters, runInitValues, runblocks, threads) else: returnValue = np.append(returnValue, self._runSimulation( runParameters, runInitValues, runblocks, threads), axis=0) if timing: print "cuda-sim: GPU blocks / threads / running time:", threads, blocks, round( (time.time() - start), 4), "s" if info: print "" return returnValue
def __init__(self, vol, img, ray_step_mm=None, render_op='drr'): source = """ #include "cuda_math.h" //------------------------------ DATA STRUCTURES ------------------------------- struct sRenderParams { // 2D detector data uint2 sizes2D; float2 steps2D; // 3D image data uint3 sizes3D; float1 __padding1; float3 steps3D; float1 __padding2; float3 boxmin; float1 __padding3; float3 boxmax; float1 __padding4; // Source position float3 ray_org; float1 __padding5; // Step along rays float1 ray_step, __padding6; // Transformation from 2D image to 2D plane in WCS float T2D[16]; }; //-------------------------------- DEVICE CODE --------------------------------- // Device variables extern "C" { texture<float, cudaTextureType3D, cudaReadModeElementType> d_tex; } // Intersect ray with a 3D volume: // see https://wiki.aalto.fi/download/attachments/40023967/ // gpgpu.pdf?version=1&modificationDate=1265652539000 __device__ int intersectBox( float3 ray_org, float3 raydir, sRenderParams *d_params, float *tnear, float *tfar ) { // Compute intersection of ray with all six bbox planes float3 invR = make_float3(1.0f) / raydir; float3 tbot = invR * (d_params->boxmin - ray_org); float3 ttop = invR * (d_params->boxmax - ray_org); // Re-order intersections to find smallest and largest on each axis float3 tmin = fminf(ttop, tbot); float3 tmax = fmaxf(ttop, tbot); // Find the largest tmin and the smallest tmax float largest_tmin = fmaxf(fmaxf(tmin.x, tmin.y), fmaxf(tmin.x, tmin.z)); float smallest_tmax = fminf(fminf(tmax.x, tmax.y), fminf(tmax.x, tmax.z)); *tnear = largest_tmin; *tfar = smallest_tmax; return smallest_tmax > largest_tmin; } // Define DRR operator struct drr_operator { static __inline__ __host__ __device__ void compute ( float &in, float &acc ) { acc += in; } }; // Define MIP operator struct mip_operator { static __inline__ __host__ __device__ void compute ( float &in, float &acc ) { if(in > acc) acc = in; } }; // Define MINIP operator struct minip_operator { static __inline__ __host__ __device__ void compute ( float &in, float &acc ) { if(in < acc) acc = in; } }; // Homogeneous transformation: // multiplication of a point w homog. transf. matrix /*static __inline__ __host__ __device__ float3 hom_trans(float*& Tx, float3& pos) { float xw = Tx[0]*pos.x + Tx[4]*pos.y + Tx[8]*pos.z + Tx[12]; float yw = Tx[1]*pos.x + Tx[5]*pos.y + Tx[9]*pos.z + Tx[13]; float zw = Tx[2]*pos.x + Tx[6]*pos.y + Tx[10]*pos.z + Tx[14]; return make_float3( xw, yw, zw ); }*/ static __inline__ __host__ __device__ float3 hom_trans(float*& Tx, float3& pos) { float xw = Tx[0]*pos.x + Tx[1]*pos.y + Tx[2]*pos.z + Tx[3]; float yw = Tx[4]*pos.x + Tx[5]*pos.y + Tx[6]*pos.z + Tx[7]; float zw = Tx[8]*pos.x + Tx[9]*pos.y + Tx[10]*pos.z + Tx[11]; return make_float3( xw, yw, zw ); } // Rendering kernel: // traverses the volume and performs linear interpolation extern "C" { __global__ void render_kernel( float* d_image, float* d_Tx, float* d_TxT2D, sRenderParams *d_params ) { // Resolve 2D image index float x = blockIdx.x*blockDim.x + threadIdx.x; float y = blockIdx.y*blockDim.y + threadIdx.y; if ( (uint(x) >= d_params->sizes2D.x) || (uint(y) >= d_params->sizes2D.y) ) return; float3 ray_org, pos2D; // Transform source position to volume space ray_org = hom_trans( d_Tx, d_params->ray_org ); // Create a point in 2D detector space pos2D = make_float3( x*d_params->steps2D.x, y*d_params->steps2D.y, 0.0f ); // Inline homogeneous transformation to volume space // ie., (x,y) pixel in 3D volume coordinate system pos2D = hom_trans( d_TxT2D, pos2D ); // Find eye ray in world space that points from the X-ray source // to the current pixel on the detector plane: // - ray origin is in the X-ray source (xs,ys,zs) // - unit vector points to the point in detector plane (xw-xs,yw-ys,zw-zs) float3 ray_dir = normalize( pos2D - ray_org ); // Find intersection with 3D volume float tnear, tfar; if ( ! intersectBox(ray_org, ray_dir, d_params, &tnear, &tfar) ) return; // March along ray from front to back float dt = d_params->ray_step.x; float3 pos = make_float3( (ray_org.x + ray_dir.x*tnear) / d_params->steps3D.x, (ray_org.y + ray_dir.y*tnear) / d_params->steps3D.y, (ray_org.z + ray_dir.z*tnear) / d_params->steps3D.z); float3 step = make_float3( ray_dir.x * dt / d_params->steps3D.x, ray_dir.y * dt / d_params->steps3D.y, ray_dir.z * dt / d_params->steps3D.z); #ifdef RENDER_MINIP float acc = 1e+7; #else float acc = 0; #endif for( ; tnear<=tfar; tnear+=dt ) { // resample the volume float sample = tex3D( d_tex, pos.x+0.5f, pos.y+0.5f, pos.z+0.5f ); #ifdef RENDER_MAXIP mip_operator::compute( sample, acc ); #elif RENDER_MINIP minip_operator::compute( sample, acc ); #elif RENDER_DRR drr_operator::compute( sample, acc ); #endif // update position pos += step; } // Write to the output buffer uint idx = uint(x) + uint(y) * d_params->sizes2D.x; d_image[idx] = acc; } } // Rendering kernel: // traverses the volume and performs linear interpolation // for selected points in the 2d image extern "C" { __global__ void render_kernel_idx( float* d_image, uint* d_idx, uint max_idx, float* d_Tx, float* d_TxT2D, sRenderParams *d_params ) { // Resolve 1D index uint idx = blockIdx.x*blockDim.x + threadIdx.x; if ( idx > max_idx ) return; uint idx_t = d_idx[ idx ]; // Resolve 2D image index uint y = idx_t / d_params->sizes2D.x; uint x = idx_t - y * d_params->sizes2D.x; //if ( (uint(x) >= d_params->sizes2D.x) || // (uint(y) >= d_params->sizes2D.y) ) // return; float3 ray_org, pos2D; // Transform souce position to volume space ray_org = hom_trans( d_Tx, d_params->ray_org ); // Create a point in 2D detector space pos2D = make_float3(float(x)*d_params->steps2D.x, float(y)*d_params->steps2D.y, 0.0f); // Inline homogeneous transformation to volume space // ie., (x,y) pixel in 3D volume coordinate system pos2D = hom_trans( d_TxT2D, pos2D ); // Find eye ray in world space that points from the X-ray source // to the current pixel on the detector plane: // - ray origin is in the X-ray source (xs,ys,zs) // - unit vector points to the point in detector plane (xw-xs,yw-ys,zw-zs) float3 ray_dir = normalize( pos2D - ray_org ); // Find intersection with 3D volume float tnear, tfar; if ( ! intersectBox(ray_org, ray_dir, d_params, &tnear, &tfar) ) return; // March along ray from front to back float dt = d_params->ray_step.x; float3 pos = make_float3( (ray_org.x + ray_dir.x*tnear)/d_params->steps3D.x, (ray_org.y + ray_dir.y*tnear)/d_params->steps3D.y, (ray_org.z + ray_dir.z*tnear)/d_params->steps3D.z); float3 step = make_float3( ray_dir.x*dt/d_params->steps3D.x, ray_dir.y*dt/d_params->steps3D.y, ray_dir.z*dt/d_params->steps3D.z); #ifdef RENDER_MINIP float acc = 1e+7; #else float acc = 0; #endif for( ; tnear<=tfar; tnear+=dt ) { // resample the volume float sample = tex3D(d_tex, pos.x+0.5f, pos.y+0.5f, pos.z+0.5f); #ifdef RENDER_MAXIP mip_operator::compute( sample, acc ); #elif RENDER_MINIP minip_operator::compute( sample, acc ); #elif RENDER_DRR drr_operator::compute( sample, acc ); #endif // update position pos += step; } // Write to the output buffer d_image[idx] = acc; } } """ if render_op not in self.VALID_RENDER_OPERATION: raise ValueError( 'Rendering operation "{}" is not valid.'.format(render_op)) cmodule = pycuda.compiler.SourceModule( source, options=['-DRENDER_{}'.format(render_op.upper())], include_dirs=[ "C:\\Users\\Ana\\Documents\\ROBOTSKI VID SEMINAR\\include" ], no_extern_c=True) # include_dirs=[os.path.join(os.getcwd(), 'include')], self._texture = cmodule.get_texref('d_tex') self._renderer = cmodule.get_function('render_kernel') self._renderer_idx = cmodule.get_function('render_kernel_idx') if ray_step_mm is None: ray_step_mm = float(np.linalg.norm(vol['spac']) / 2.0) self._params_d = cuda.mem_alloc(RenderParams.mem_size) self.params = RenderParams( RenderParams.Attributes( sizes2d=img['img'].shape[::-1], steps2d=img['spac'], sizes3d=vol['img'].shape[::-1], steps3d=vol['spac'], boxmin=np.array((0, 0, 0), dtype='float32').flatten(), boxmax=(np.array(vol['img'].shape[::-1]) - 1.0) * np.array(vol['spac']).astype('float32').flatten(), ray_org=img['SPos'].flatten(), ray_step=ray_step_mm, trans_2d=np.array(img['TPos'], dtype='float32').flatten()), self._params_d) # Copy array to texture memory self._texture.set_array( cuda.np_to_array(vol['img'].astype('float32'), order='C')) # We could set the next if we wanted to address the image # in normalized coordinates ( 0 <= coordinate < 1.) # self._texture.set_flags(cuda.TRSF_READ_AS_INTEGER) # self._texture.set_flags(cuda.TRSF_NORMALIZED_COORDINATES) # Perform linear interpolation self._texture.set_filter_mode(cuda.filter_mode.LINEAR) self._texture.set_address_mode(0, cuda.address_mode.CLAMP) self._texture.set_address_mode(1, cuda.address_mode.CLAMP) self._texture.set_address_mode(2, cuda.address_mode.CLAMP) # check max threads per block for current GPU max_threads_per_block = tools.DeviceData().max_threads if self.BLOCK_SIZE_1D is None: self.BLOCK_SIZE_1D = max_threads_per_block elif self.BLOCK_SIZE_1D > max_threads_per_block: raise ValueError( 'Parameter BLOCK_SIZE_1D={} exceeds maximal pool of threads per block ' '(current GPU has maximum of {} threads per block).'.format( self.BLOCK_SIZE_1D, max_threads_per_block)) if self.BLOCK_SIZE_2D is None: self.BLOCK_SIZE_2D = 2 while self.BLOCK_SIZE_2D**2 < max_threads_per_block: self.BLOCK_SIZE_2D *= 2 elif self.BLOCK_SIZE_2D**2 > max_threads_per_block: raise ValueError( 'Parameter BLOCK_SIZE_2D={} (squared=) exceeds maximal pool of threads per block ' '(current GPU has maximum of {} threads per block).'.format( self.BLOCK_SIZE_2D, self.BLOCK_SIZE_2D**2, max_threads_per_block)) # threads per block nx, ny = self.params.values.sizes2d self._blocksize_2d = ( nx if nx < self.BLOCK_SIZE_2D else self.BLOCK_SIZE_2D, ny if ny < self.BLOCK_SIZE_2D else self.BLOCK_SIZE_2D, 1) # blocks per grid self._gridsize_2d = (int(nx / self._blocksize_2d[0]), int(ny / self._blocksize_2d[1]), 1)