def diffuse_pycuda(u):
    nx, ny = np.int32(u.shape)
    alpha = np.float32(0.645)
    dx = np.float32(3.5 / (nx - 1))
    dy = np.float32(3.5 / (ny - 1))
    dt = np.float32(1e-05)
    time = np.float32(0.4)
    nt = np.int32(np.ceil(time / dt))
    # print nt

    u[0, :] = 200
    u[:, 0] = 200

    u = u.astype(np.float32)
    u_prev = u.copy()

    u_d = cuda.mem_alloc(u.size * u.dtype.itemsize)
    u_prev_d = cuda.mem_alloc(u_prev.size * u_prev.dtype.itemsize)
    cuda.memcpy_htod(u_d, u)
    cuda.memcpy_htod(u_prev_d, u_prev)

    BLOCKSIZE = 16
    gridSize = (int(np.ceil(nx / BLOCKSIZE)), int(np.ceil(ny / BLOCKSIZE)), 1)
    blockSize = (BLOCKSIZE, BLOCKSIZE, 1)

    for t in range(nt + 1):
        copy_array(u_d, u_prev_d, nx, np.int32(BLOCKSIZE),
                   block=blockSize, grid=gridSize)
        update(u_d, u_prev_d, nx, dx, dt, alpha, np.int32(BLOCKSIZE),
               block=blockSize, grid=gridSize)

    cuda.memcpy_dtoh(u, u_d)

    return u
def __compute_sub_gaussian_gpu(self, sub_partitions):
    if sub_partitions < 1:
        raise Exception("You can't have less than 1 partition")
    elif sub_partitions > self.pts.shape[0]:
        raise Exception("sub partitions need to be smaller than pts size")

    # Delta Partitions: number of points per partition (integer division)
    d_part = self.pts.shape[0] // sub_partitions

    # Does the correct partitioning
    alloc_size = self.pts.shape[0] // sub_partitions * 2 * self.pts.itemsize
    self.pts_gpu = cuda.mem_alloc(alloc_size)

    self.pts[:, 0] = (self.pts[:, 0] - self.axis[0]) / (self.axis[1] - self.axis[0])
    self.pts[:, 1] = (self.pts[:, 1] - self.axis[2]) / (self.axis[3] - self.axis[2])

    for partition in range(sub_partitions):
        sub_pts = self.pts[partition * d_part:(partition + 1) * d_part, :]
        self.__compute_guassian_on_pts(sub_pts)
    self.pts_gpu.free()

    # Sees if there is a remainder of points to work with
    if self.pts.shape[0] % sub_partitions:
        alloc_size = (self.pts.shape[0] % sub_partitions) * (2 * self.pts.itemsize)
        self.pts_gpu = cuda.mem_alloc(alloc_size)
        self.__compute_guassian_on_pts(self.pts[sub_partitions * d_part:, :])
        self.pts_gpu.free()
def main():
    (h, w), d = (826, 1169), 3  # img1.size, len(img1_arr[0][0])
    if LINEAR:
        thread_x, thread_y, thread_z = 128, 1, 1
        block_x, block_y = (w * h * d) / thread_x, 1
        if (w * h * d) % thread_x:
            block_x += 1
    else:
        thread_x, thread_y, thread_z = 16, 8, d
        block_x, block_y = h / thread_x, w / thread_y
        if h % thread_x:
            block_x += 1
        if w % thread_y:
            block_y += 1
    # print (h,w,d), (thread_x,thread_y,thread_z), (block_x,block_y)

    image_data_size = 2896782 * 4
    a_gpu = cuda.mem_alloc(image_data_size)
    b_gpu = cuda.mem_alloc(image_data_size)
    c_gpu = cuda.mem_alloc(image_data_size)

    image_path_pairs = []
    for i in xrange(50):
        page_num = i + 1
        path1, path2 = 'form1.%d.png' % page_num, 'form2.%d.png' % page_num
        image_path_pairs.append((path1, path2))

    do_work(image_path_pairs, a_gpu, b_gpu, c_gpu,
            (thread_x, thread_y, thread_z), (block_x, block_y))
def poisson_parallel(source_im, dest_im, b_size, g_size, RGB, neighbors, interior_buffer, n):
    # create Cheetah template and fill in variables for the Poisson kernel
    template = Template(poisson_blending_source)
    template.BLOCK_DIM_X = b_size[0]
    template.BLOCK_DIM_Y = b_size[1]
    template.WIDTH = dest_im.shape[1]
    template.HEIGHT = dest_im.shape[0]
    template.RGB = RGB
    template.NEIGHBORS = neighbors

    # compile the CUDA kernel
    poisson_blending_kernel = cuda_compile(template, "poisson_blending_kernel")

    # allocate memory on the GPU
    out_image = np.array(dest_im, dtype=np.uint8)
    d_source = cu.mem_alloc(source_im.nbytes)
    d_destination = cu.mem_alloc(dest_im.nbytes)
    d_buffer = cu.mem_alloc(interior_buffer.nbytes)
    cu.memcpy_htod(d_source, source_im)
    cu.memcpy_htod(d_destination, dest_im)
    cu.memcpy_htod(d_buffer, interior_buffer)

    # call the CUDA Poisson blending kernel n times
    for i in range(n):
        poisson_blending_kernel(d_source, d_destination, d_buffer,
                                block=b_size, grid=g_size)

    # retrieve the final output image and return it
    cu.memcpy_dtoh(out_image, d_destination)
    return out_image
def calc_psd(self, bitloads, xtalk):
    # Number of expected permutations
    Ncombinations = self.K

    # Check if this is getting hairy and assign grid/block dimensions
    (warpcount, warpperblock, threadCount, blockCount) = self._workload_calc(Ncombinations)

    # How many individual lk's
    memdim = blockCount * threadCount

    threadshare_grid = (blockCount, 1)
    threadshare_block = (threadCount, 1, 1)

    # Memory (We get away with the NCombinations because calpsd checks against it)
    d_a = cuda.mem_alloc(np.zeros((Ncombinations * self.N * self.N)).astype(self.type).nbytes)
    d_p = cuda.mem_alloc(np.zeros((Ncombinations * self.N)).astype(self.type).nbytes)
    d_bitload = cuda.mem_alloc(np.zeros((self.K * self.N)).astype(np.int32).nbytes)
    d_XTG = cuda.mem_alloc(np.zeros((self.K * self.N * self.N)).astype(self.type).nbytes)
    h_p = np.zeros((self.K, self.N)).astype(self.type)

    cuda.memcpy_htod(d_bitload, util.mat2arr(bitloads).astype(np.int32))
    cuda.memcpy_htod(d_XTG, xtalk.astype(self.type))

    # Go solve
    # __global__ void calc_psd(FPT *A, FPT *P, FPT *d_XTG, int *current_b, int N){
    self.k_calcpsd(d_a, d_p, d_XTG, d_bitload, np.int32(Ncombinations),
                   block=threadshare_block, grid=threadshare_grid)
    cuda.Context.synchronize()
    cuda.memcpy_dtoh(h_p, d_p)

    d_a.free()
    d_bitload.free()
    d_XTG.free()
    d_p.free()

    return h_p.astype(np.float64)
def cuda_crossOver(sola, solb):
    """ """
    sol_len = len(sola)

    a_gpu = cuda.mem_alloc(sola.nbytes)
    b_gpu = cuda.mem_alloc(solb.nbytes)

    cuda.memcpy_htod(a_gpu, sola)
    cuda.memcpy_htod(b_gpu, solb)

    func = mod.get_function("crossOver")
    func(a_gpu, b_gpu, block=(sol_len, 1, 1))

    a_new = numpy.empty_like(sola)
    b_new = numpy.empty_like(solb)

    cuda.memcpy_dtoh(a_new, a_gpu)
    cuda.memcpy_dtoh(b_new, b_gpu)

    if debug == True:
        print "a:", sola
        print "b:", solb
        print "new a:", a_new
        print "new b:", b_new

    return a_new, b_new
def get_spharms_l_eq_2(theta, phi, selected_Modes_gpu, rslt_gpu):
    # `selected_modes` is not defined in this function; it presumably comes
    # from enclosing/module scope in the original source.
    modelist = np.array(sorted([mode[1] for mode in selected_modes])).astype(np.int32)
    modelist_gpu = cuda.mem_alloc(modelist.nbytes)

    nsamps = np.array(len(theta), ndmin=1).astype(np.int32)
    nmodeslen = np.array(len(modelist), ndmin=1).astype(np.int32)
    nsamps_gpu = cuda.mem_alloc(nsamps.nbytes)
    nmodes_gpu = cuda.mem_alloc(nmodeslen.nbytes)
    cuda.memcpy_htod(nsamps_gpu, nsamps)
    cuda.memcpy_htod(nmodes_gpu, nmodeslen)
    # cuda.memcpy_htod(theta_gpu, theta)
    # cuda.memcpy_htod(phi_gpu, phi)
    cuda.memcpy_htod(modelist_gpu, modelist)

    # Get and compile the cuda function
    sph = mod.get_function("compute_sph_harmonics_l_eq_2")
    result_gpu = cuda.mem_alloc(theta.nbytes * len(modelist) * 2)

    blk = (1024, 1, 1)
    grd = (1, 1, 1)
    sph(theta, phi, modelist_gpu, nmodes_gpu, nsamps_gpu, rslt_gpu, block=blk, grid=grd)
    # cuda.memcpy_dtoh(result, result_gpu)
    # print(result[0:9])
    # print(len(result))
    return
def alloc(self, dim, stream=None):
    """
    Ensure that this object's framebuffers are large enough to handle the
    given dimensions, allocating new ones if not.

    If ``stream`` is not None and a reallocation is necessary, the stream
    will be synchronized before the old buffers are deallocated.
    """
    nbins = dim.ah * dim.astride
    if self.nbins >= nbins:
        return
    if self.nbins is not None:
        self.free()
    try:
        self.d_front = cuda.mem_alloc(16 * nbins)
        self.d_back = cuda.mem_alloc(16 * nbins)
        self.d_side = cuda.mem_alloc(16 * nbins)
        self.nbins = nbins
    except cuda.MemoryError, e:
        # If a frame that's too large sneaks by the task distributor, we
        # don't want to kill the server, but we also don't want to leave
        # it stuck without any free memory to complete the next alloc.
        # TODO: measure free mem and only take tasks that fit (but that
        # should be done elsewhere)
        self.free(stream)
        raise e
def prepare_device_arrays(self):
    self.maxLayers = self.grid_prop.GetMaxLayers()
    nczbins_fine = len(self.czcen_fine)
    numLayers = np.zeros(nczbins_fine, dtype=np.int32)
    densityInLayer = np.zeros((nczbins_fine * self.maxLayers), dtype=self.FTYPE)
    distanceInLayer = np.zeros((nczbins_fine * self.maxLayers), dtype=self.FTYPE)

    self.grid_prop.GetNumberOfLayers(numLayers)
    self.grid_prop.GetDensityInLayer(densityInLayer)
    self.grid_prop.GetDistanceInLayer(distanceInLayer)

    # Copy all these earth info arrays to device:
    self.d_numLayers = cuda.mem_alloc(numLayers.nbytes)
    self.d_densityInLayer = cuda.mem_alloc(densityInLayer.nbytes)
    self.d_distanceInLayer = cuda.mem_alloc(distanceInLayer.nbytes)

    cuda.memcpy_htod(self.d_numLayers, numLayers)
    cuda.memcpy_htod(self.d_densityInLayer, densityInLayer)
    cuda.memcpy_htod(self.d_distanceInLayer, distanceInLayer)

    self.d_ecen_fine = cuda.mem_alloc(self.ecen_fine.nbytes)
    self.d_czcen_fine = cuda.mem_alloc(self.czcen_fine.nbytes)
    cuda.memcpy_htod(self.d_ecen_fine, self.ecen_fine)
    cuda.memcpy_htod(self.d_czcen_fine, self.czcen_fine)

    return
def calc_blob_blob_forces_pycuda(r_vectors, *args, **kwargs):
    # Determine number of threads and blocks for the GPU
    number_of_blobs = np.int32(len(r_vectors))
    threads_per_block, num_blocks = set_number_of_threads_and_blocks(number_of_blobs)

    # Get parameters from arguments
    L = kwargs.get('periodic_length')
    eps = kwargs.get('repulsion_strength')
    b = kwargs.get('debye_length')
    blob_radius = kwargs.get('blob_radius')

    # Reshape arrays
    x = np.reshape(r_vectors, number_of_blobs * 3)
    f = np.empty_like(x)

    # Allocate GPU memory
    x_gpu = cuda.mem_alloc(x.nbytes)
    f_gpu = cuda.mem_alloc(f.nbytes)

    # Copy data to the GPU (host to device)
    cuda.memcpy_htod(x_gpu, x)

    # Get blob-blob force function
    force = mod.get_function("calc_blob_blob_force")

    # Compute mobility force product
    force(x_gpu, f_gpu, np.float64(eps), np.float64(b), np.float64(blob_radius),
          np.float64(L[0]), np.float64(L[1]), np.float64(L[2]), number_of_blobs,
          block=(threads_per_block, 1, 1), grid=(num_blocks, 1))

    # Copy data from GPU to CPU (device to host)
    cuda.memcpy_dtoh(f, f_gpu)

    return np.reshape(f, (number_of_blobs, 3))
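# The blob routines above (and blobs_potential further below) rely on a helper
# named set_number_of_threads_and_blocks() that is defined elsewhere in their
# code base. A minimal sketch of what such a helper could look like — an
# assumption for illustration, not the original implementation:
def set_number_of_threads_and_blocks(number_of_blobs):
    # Cap the block size and derive enough blocks to cover all blobs.
    threads_per_block = 512
    num_blocks = int(number_of_blobs) // threads_per_block + 1
    return threads_per_block, num_blocks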
def __init__(self, max_size, offsets=None):
    """
    Create a sorter. The sorter will hold on to internal resources for as
    long as it is alive, including an 'offsets' array of size 4*max_size.
    To share this cost, you may pass in an array of at least this size to
    __init__ (to, for instance, share across different bit-widths in a
    multi-pass sort).
    """
    self.init_mod()
    self.max_size = max_size
    assert max_size % self.group_size == 0
    max_grids = max_size / self.group_size

    if offsets is None:
        self.doffsets = cuda.mem_alloc(self.max_size * 4)
    else:
        self.doffsets = offsets
    self.dpfxs = cuda.mem_alloc(max_grids * self.radix_size * 4)
    self.dlocals = cuda.mem_alloc(max_grids * self.radix_size * 4)

    # There are probably better ways to choose how many condensation
    # groups to launch. TODO: maybe pick one if I care
    self.ncond = 32
    self.dcond = cuda.mem_alloc(self.radix_size * self.ncond * 4)
    self.dglobal = cuda.mem_alloc(self.radix_size * 4)
def __init__(self, init_data, n_generators): self.ctx = curr_gpu.make_context() self.module = pycuda.compiler.SourceModule(kernels_cuda_src, no_extern_c=True) (free, total) = cuda.mem_get_info() print(("Global memory occupancy:%f%% free" % (free * 100 / total))) print(("Global free memory :%i Mo free" % (free / 10 ** 6))) ################################################################################################################ self.width_mat = np.int32(init_data.shape[0]) # self.gpu_init_data = ga.to_gpu(init_data) self.gpu_init_data = cuda.mem_alloc(init_data.nbytes) cuda.memcpy_htod(self.gpu_init_data, init_data) self.cpu_new_data = np.zeros_like(init_data, dtype=np.float32) print("size new data = ", self.cpu_new_data.nbytes / 10 ** 6) (free, total) = cuda.mem_get_info() print(("Global memory occupancy:%f%% free" % (free * 100 / total))) print(("Global free memory :%i Mo free" % (free / 10 ** 6))) self.gpu_new_data = cuda.mem_alloc(self.cpu_new_data.nbytes) cuda.memcpy_htod(self.gpu_new_data, self.cpu_new_data) # self.gpu_new_data = ga.to_gpu(self.cpu_new_data) self.cpu_vect_sum = np.zeros((self.width_mat,), dtype=np.float32) self.gpu_vect_sum = cuda.mem_alloc(self.cpu_vect_sum.nbytes) cuda.memcpy_htod(self.gpu_vect_sum, self.cpu_vect_sum) # self.gpu_vect_sum = ga.to_gpu(self.cpu_vect_sum) ################################################################################################################ self.init_rng = self.module.get_function("init_rng") self.gen_rand_mat = self.module.get_function("gen_rand_mat") self.sum_along_axis = self.module.get_function("sum_along_axis") self.norm_along_axis = self.module.get_function("norm_along_axis") self.init_vect_sum = self.module.get_function("init_vect_sum") self.copy_mat = self.module.get_function("copy_mat") ################################################################################################################ self.n_generators = n_generators seed = 1 self.rng_states = cuda.mem_alloc( n_generators * characterize.sizeof("curandStateXORWOW", "#include <curand_kernel.h>") ) self.init_rng( np.int32(n_generators), self.rng_states, np.uint64(seed), np.uint64(0), block=(64, 1, 1), grid=(n_generators // 64 + 1, 1), ) (free, total) = cuda.mem_get_info() size_block_x = 32 size_block_y = 32 n_blocks_x = int(self.width_mat) // (size_block_x) + 1 n_blocks_y = int(self.width_mat) // (size_block_y) + 1 self.grid = (n_blocks_x, n_blocks_y, 1) self.block = (size_block_x, size_block_y, 1)
def confirmInitialization(featuresForSOM,somMatrix): #allocate memory for the somcuda on the device somMatrixPtr = pycuda.mem_alloc(somMatrix.nbytes) somBytesPerRow = np.int32(somMatrix.strides[0]) somNumberOfRows = np.int32(somMatrix.shape[0]) somNumberOfColumns = np.int32(somMatrix.shape[1]) pycuda.memcpy_htod(somMatrixPtr,somMatrix) #allocate space for bmu index bmu = np.zeros(somMatrixRows).astype(np.float32) bmuPtr = pycuda.mem_alloc(bmu.nbytes) pycuda.memcpy_htod(bmuPtr,bmu) bmuIndex = np.zeros(somMatrixRows).astype(np.int32) bmuIndexPtr = pycuda.mem_alloc(bmuIndex.nbytes) pycuda.memcpy_htod(bmuIndexPtr,bmuIndex) intraDayOffset = features.columns.get_loc('Ret_121') dayOffset = features.columns.get_loc('Ret_PlusOne') objVal = 0.0; objSampSize=0.0 r = [[[0.0 for k in range(0,3)] for i in range(somMatrixColumns)] for j in range (somMatrixRows)] nodeHitMatrix = np.array(r).astype(np.float32) hitCountDict = defaultdict(list) samples = [x for x in range (0, somMatrixRows*somMatrixColumns)] if len(samples) >= len(featuresForSOM): samples = [x for x in range (0, len(featuresForSOM))] for i in samples: feats = featuresForSOM.loc[i].as_matrix().astype(np.float32) featuresPtr = pycuda.mem_alloc(feats.nbytes) pycuda.memcpy_htod(featuresPtr,feats) #find the BMU computeBMU(somMatrixPtr, bmuPtr, bmuIndexPtr, featuresPtr, np.int32(len(featuresForSOM.columns)), somBytesPerRow, somNumberOfRows, somNumberOfColumns, np.float32(MAX_FEAT), np.float32(INF),np.int32(metric),block=(blk,1,1),grid=(somNumberOfRows,1)) pycuda.memcpy_dtoh(bmu,bmuPtr) pycuda.memcpy_dtoh(bmuIndex,bmuIndexPtr) block = np.argmin(bmu) thread = bmuIndex[block] val = hitCountDict[(block,thread)] if val == None or len(val) == 0: hitCountDict[(block,thread)] = [1,i] else: hitCountDict[(block,thread)][0] += 1 val = np.int32(hitCountDict[(block,thread)])[0] if val == 1: val = 0x0000ff00 elif val <= 10: val = 0x000000ff elif val <= 100: val = 0x00ff0000 else: val = 0x00ffffff bval = (val & 0x000000ff) gval = ((val & 0x0000ff00) >> 8) rval = ((val & 0x00ff0000) >> 16) nodeHitMatrix[block][thread] = [rval/255.0,gval/255.0,bval/255.0] fig20 = plt.figure(20,figsize=(6*3.13,4*3.13)) fig20.suptitle('Train Node Hit Counts. Black: 0 Green: 1 Blue: <=10 Red: <=100 White >100', fontsize=20) ax = plt.subplot(111) somplot = plt.imshow(nodeHitMatrix,interpolation="none") plt.show() plt.pause(0.1)
def set_refsmiles(self,refsmilesmat,refcountsmat,reflengths,refmags=None): #{{{ """Sets the reference SMILES set to use Lingo matrix *refsmilesmat*, count matrix *refcountsmat*, and length vector *reflengths*. If *refmags* is provided, it will be used as the magnitude vector; else, the magnitude vector will be computed (on the GPU) from the count matrix. Because of hardware limitations, the reference matrices (*refsmilesmat* and *refcountsmat*) must have no more than 32,768 rows (molecules) and 65,536 columns (Lingos). Larger computations must be performed in tiles. """ # Set up lingo and count matrices on device #{{{ if self.usePycudaArray: # Set up using PyCUDA CUDAArray support self.gpu.rsmiles = cuda.matrix_to_array(refsmilesmat,order='C') self.gpu.rcounts = cuda.matrix_to_array(refcountsmat,order='C') self.gpu.tex2lr.set_array(self.gpu.rsmiles) self.gpu.tex2cr.set_array(self.gpu.rcounts) else: # Manually handle setup temprlmat = self._padded_array(refsmilesmat) if temprlmat.shape[1] > 65536 or temprlmat.shape[0] > 32768: raise ValueError("Error: reference matrix is not allowed to have more than 64K columns (LINGOs) or 32K rows (molecules) (both padded to multiple of 16). Dimensions = (%d,%d)."%temprlmat.shape) self.gpu.rsmiles = cuda.mem_alloc(temprlmat.nbytes) cuda.memcpy_htod_async(self.gpu.rsmiles,temprlmat,stream=self.gpu.stream) temprcmat = self._padded_array(refcountsmat) self.gpu.rcounts = cuda.mem_alloc(temprcmat.nbytes) cuda.memcpy_htod_async(self.gpu.rcounts,temprcmat,stream=self.gpu.stream) descriptor = cuda.ArrayDescriptor() descriptor.width = temprcmat.shape[1] descriptor.height = temprcmat.shape[0] descriptor.format = cuda.array_format.UNSIGNED_INT32 descriptor.num_channels = 1 self.gpu.tex2lr.set_address_2d(self.gpu.rsmiles,descriptor,temprlmat.strides[0]) self.gpu.tex2cr.set_address_2d(self.gpu.rcounts,descriptor,temprcmat.strides[0]) self.gpu.stream.synchronize() del temprlmat del temprcmat #}}} self.rlengths = reflengths self.rshape = refsmilesmat.shape self.nref = refsmilesmat.shape[0] # Copy reference lengths to GPU self.gpu.rl_gpu = cuda.to_device(reflengths) # Allocate buffers for query set magnitudes self.gpu.rmag_gpu = cuda.mem_alloc(reflengths.nbytes) if refmags is not None: cuda.memcpy_htod(self.gpu.rmag_gpu,refmags) else: # Calculate query set magnitudes on GPU magthreads = 256 self.gpu.refMagKernel(self.gpu.rmag_gpu,self.gpu.rl_gpu,numpy.int32(self.nref),block=(magthreads,1,1),grid=(30,1),shared=magthreads*4,texrefs=[self.gpu.tex2cr]) return
def computeAvgDistancetoBMU(currentIter,iterationDistance, features, nodeHitMatrix, somMatrixPtr, somMatrix, featureStatsMatrix, featuresPtr, featureCount, somBytesPerRow, somNumberOfRows, somNumberOfColumns): adjustNodes = {} sampSize = 0 cumDistance = 0.0 nodeHitMatrix.fill(0) hitCountDict.clear() if len(featuresForSOM) < 100: sampSize = len(featuresForSOM) elif currentIter < len(featuresForSOM): sampSize = int(currentIter) if sampSize == 0: sampSize = min(somNumberOfRows*somNumberOfColumns,len(featuresForSOM)) else: sampSize = len(featuresForSOM) samples = [x for x in range (0,sampSize)] #allocate space for bmu bmu = np.zeros(somMatrixRows).astype(np.float32) bmuPtr = pycuda.mem_alloc(bmu.nbytes) pycuda.memcpy_htod(bmuPtr,bmu) #allocate space for bmu index bmuIndex = np.zeros(somMatrixRows).astype(np.int32) bmuIndexPtr = pycuda.mem_alloc(bmuIndex.nbytes) pycuda.memcpy_htod(bmuIndexPtr,bmuIndex) for i in samples: feats = featuresForSOM.loc[i].as_matrix().astype(np.float32) featuresPtr = pycuda.mem_alloc(feats.nbytes) pycuda.memcpy_htod(featuresPtr,feats) #find the BMU computeBMU(somMatrixPtr, bmuPtr, bmuIndexPtr, featuresPtr, np.int32(featureCount), somBytesPerRow, somNumberOfRows, somNumberOfColumns, np.float32(MAX_FEAT), np.float32(INF),np.int32(metric),block=(blk,1,1),grid=(somNumberOfRows,1)) pycuda.memcpy_dtoh(bmu,bmuPtr) pycuda.memcpy_dtoh(bmuIndex,bmuIndexPtr) cumDistance += np.min(bmu) block = np.argmin(bmu) thread = bmuIndex[block] adjustNodes[i]=[block,thread] val = hitCountDict[(block,thread)] if val == None or len(val) == 0: hitCountDict[(block,thread)] = [1,i] else: hitCountDict[(block,thread)][0] += 1 val = np.int32(hitCountDict[(block,thread)])[0] if val == 1: val = 0x0000ff00 elif val <= 10: val = 0x000000ff elif val <= 100: val = 0x00ff0000 else: val = 0x00ffffff bval = (val & 0x000000ff) gval = ((val & 0x0000ff00) >> 8) rval = ((val & 0x00ff0000) >> 16) nodeHitMatrix[block][thread] = [rval/255.0,gval/255.0,bval/255.0] iterationDistance.append(cumDistance/sampSize) iterationCount.append(currentIter) return cumDistance/sampSize
def gfx_init( self ) : try : print 'compiling' self.prog = sh.compile_program_vfg( 'shad/balls' ) print 'compiled' self.loc_mmv = sh.get_loc(self.prog,'modelview' ) self.loc_mp = sh.get_loc(self.prog,'projection') self.l_color = sh.get_loc(self.prog,'color' ) self.l_size = sh.get_loc(self.prog,'ballsize' ) except ValueError as ve : print "Shader compilation failed: " + str(ve) sys.exit(0) # glUseProgram( self.prog ) # glUniform1i( pointsid , 0 ); # glUseProgram( 0 ) # # cuda init # self.grid = (int(self.BOX),int(self.BOX)) self.block = (1,1,int(self.BOX)) print 'CUDA: block %s , grid %s' % (str(self.block),str(self.grid)) # print cuda_driver.device_attribute.MAX_THREADS_PER_BLOCK # print cuda_driver.device_attribute.MAX_BLOCK_DIM_X # print cuda_driver.device_attribute.MAX_BLOCK_DIM_Y # print cuda_driver.device_attribute.MAX_BLOCK_DIM_Z floatbytes = np.dtype(np.float32).itemsize self.gpos = glGenBuffers(1) glBindBuffer( GL_ARRAY_BUFFER , self.gpos ) glBufferData( GL_ARRAY_BUFFER , self.pos.nbytes, self.pos, GL_STREAM_DRAW ) glBindBuffer( GL_ARRAY_BUFFER , 0 ) self.df1 = cuda_driver.mem_alloc( self.f.nbytes ) self.df2 = cuda_driver.mem_alloc( self.f.nbytes ) cuda_driver.memcpy_htod( self.df1 , self.f ) cuda_driver.memset_d32( self.df2 , 0 , self.NUM*self.Q ) mod = cuda_driver.module_from_file( 'lbm_kernel.cubin' ) self.collision = mod.get_function("collision_step") self.collision.prepare( "Piii" ) self.streaming = mod.get_function("streaming_step") self.streaming.prepare( "PPiii" ) self.colors = mod.get_function("colors") self.colors.prepare( "PPiii" )
def __init__(self, filename):
    Pattern.__init__(self, filename)
    # Parenthesize the mask: without it, `!=` binds tighter than `&`.
    if (self.n & 1023) != 0:
        raise ValueError('Number of patterns must be a multiple of 1024.')
    self.patterns_gpu = cuda.mem_alloc(self.patterns.nbytes)
    cuda.memcpy_htod(self.patterns_gpu, self.patterns)
    self.input_gpu = cuda.mem_alloc(4 * ((40 * 8) + 16))
    self.result_gpu = gpuarray.empty((40, self.n), dtype=numpy.float32,
                                     allocator=cuda.mem_alloc)
def _initME(self): """Initializes the MotionEnergy CUDA functions.""" logging.debug('initME') # register all device functions for easy access # imported from motion_energy_device.py self.dev_conv1 = mod.get_function("dev_conv1") self.dev_convn = mod.get_function("dev_convn") self.dev_accumDiffStims = mod.get_function("dev_accumDiffStims") self.dev_filt2dir = mod.get_function("dev_filt2dir") self.dev_edges = mod.get_function("dev_edges") self.dev_fullRect2 = mod.get_function("dev_fullRect2") self.dev_mean3 = mod.get_function("dev_mean3") self.dev_normalize = mod.get_function("dev_normalize") self.dev_split_gray = mod.get_function("dev_split_gray") self.dev_split_RGB = mod.get_function("dev_split_RGB") self.dev_sub = mod.get_function("dev_sub") self.dev_ave = mod.get_function("dev_ave") self.dev_sum = mod.get_function("dev_sum") self.dev_scaleHalfRect = mod.get_function("dev_scaleHalfRect") self.dev_scale = mod.get_function("dev_scale") self.dev_split_gray = mod.get_function("dev_split_gray") self.dev_split_RGB = mod.get_function("dev_split_RGB") self.dev_memcpy_dtod = mod.get_function("dev_memcpy_dtod") # for quick access: the size in bytes of nrX*nrY floats self.szXY = self.sizeofFloat * self.nrX * self.nrY # V1 filter responses self.d_resp = cuda.mem_alloc(self.szXY*self.nrFilters*self.nrScales) # V1 complex cell responses self.d_respV1c = cuda.mem_alloc(self.szXY*self.nrDirs) # stim frame self.d_stim = cuda.mem_alloc(self.szXY*self.nrC) # stim frame buffer (last nrT frames) self.d_stimBuf = cuda.mem_alloc(self.szXY*self.nrT) # I'm not sure if this memset works as expected... for now, memcpy an # array of zeros # cuda.memset_d32(self.d_stimBuf, 0, self.nrX*self.nrY*self.nrT) tmp = np.zeros(self.nrX*self.nrY*self.nrT).astype(np.float32) cuda.memcpy_htod(self.d_stimBuf, tmp) self.d_diffV1GausBufT = cuda.mem_alloc(self.szXY*self.v1GaussFiltSize) self.d_scalingStimBuf = cuda.mem_alloc(self.szXY*self.nrT) self.d_v1GausBuf = cuda.mem_alloc(self.szXY*self.v1GaussFiltSize) self.d_diffV1GausBuf = cuda.mem_alloc(self.szXY*self.v1GaussFiltSize) self.d_pop = cuda.mem_alloc(self.szXY*self.nrScales) self.d_scalingFilt = mod.get_global("d_scalingFilt")[0] self.d_v1GaussFilt = mod.get_global("d_v1GaussFilt")[0] self.d_complexV1Filt = mod.get_global("d_complexV1Filt")[0] self.d_normV1filt = mod.get_global("d_normV1filt")[0] self.d_diff1filt = mod.get_global("d_diff1filt")[0] self.d_diff2filt = mod.get_global("d_diff2filt")[0] self.d_diff3filt = mod.get_global("d_diff3filt")[0]
def stepN(self, positions, velocities, n):
    x_gpu = cuda.mem_alloc(positions.nbytes)
    v_gpu = cuda.mem_alloc(velocities.nbytes)
    cuda.memcpy_htod(x_gpu, positions)
    cuda.memcpy_htod(v_gpu, velocities)

    import numpy as np
    self.cuBoris(x_gpu, v_gpu, np.int32(n),
                 block=(1024, 1, 1),
                 grid=(self.numParts // 1024 + 1, 1))

    cuda.memcpy_dtoh(positions, x_gpu)
    cuda.memcpy_dtoh(velocities, v_gpu)
def CudaRPN(inPath, outPath, mycode, mydata, **kw): """CudaRPN implements the interface to the CUDA run environment. """ verbose = kw.get('verbose', False) BLOCK_SIZE = 1024 # Kernel grid and block size STACK_SIZE = 64 # OFFSETS = 64 # unary_operator_names = {'plus': '+', 'minus': '-'} function = Function( start=len(hardcase), bss=64, handcode=kw.get('handcode')) with Timing('Total execution time'): with Timing('Get and convert image data to gpu ready'): im = Image.open(inPath) px = array(im).astype(float32) function.assemble(mycode, mydata, verbose=True) function.disassemble(verbose=True) cx = array(function.final).astype(int32) dx = array(function.data).astype(float32) with Timing('Allocate mem to gpu'): d_px = mem_alloc(px.nbytes) memcpy_htod(d_px, px) d_cx = mem_alloc(cx.nbytes) memcpy_htod(d_cx, cx) d_dx = mem_alloc(dx.nbytes) memcpy_htod(d_dx, dx) with Timing('Kernel execution time'): block = (BLOCK_SIZE, 1, 1) checkSize = int32(im.size[0]*im.size[1]) grid = (int(im.size[0] * im.size[1] / BLOCK_SIZE) + 1, 1, 1) kernel = INCLUDE + HEAD + function.body + convolve + TAIL sourceCode = kernel % { 'pixelwidth': 3, 'stacksize': STACK_SIZE, 'case': function.case} with open("RPN_sourceCode.c", "w") as target: print>>target, sourceCode module = SourceModule(sourceCode) func = module.get_function("RPN") func(d_px, d_cx, d_dx, checkSize, block=block, grid=grid) with Timing('Get data from gpu and convert'): RPNPx = empty_like(px) memcpy_dtoh(RPNPx, d_px) RPNPx = uint8(RPNPx) with Timing('Save image time'): pil_im = Image.fromarray(RPNPx, mode="RGB") pil_im.save(outPath) # Output final statistics if verbose: print '%40s: %s%s' % ('Target image', outPath, im.size) print Timing.text
def add(slice_a, slice_b):
    slice_c = np.empty_like(slice_a)

    a_gpu = cuda.mem_alloc(slice_a.nbytes)
    cuda.memcpy_htod(a_gpu, slice_a)
    b_gpu = cuda.mem_alloc(slice_b.nbytes)
    cuda.memcpy_htod(b_gpu, slice_b)
    c_gpu = cuda.mem_alloc(slice_c.nbytes)

    start = time.time()
    func(a_gpu, b_gpu, c_gpu, block=(BLOCK_SIZE, BLOCK_SIZE, 1))
    end = time.time()

    cuda.memcpy_dtoh(slice_c, c_gpu)
    return (slice_c, end - start)
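# The add() snippet above assumes a module-level BLOCK_SIZE and a compiled
# kernel handle `func`. A minimal, hypothetical setup consistent with that
# call site (a single block doing an element-wise addition) might look like:
import pycuda.autoinit
import pycuda.driver as cuda
from pycuda.compiler import SourceModule

BLOCK_SIZE = 16

mod = SourceModule("""
__global__ void add_slices(const float *a, const float *b, float *c)
{
    // one thread per element of a BLOCK_SIZE x BLOCK_SIZE slice
    int idx = threadIdx.y * blockDim.x + threadIdx.x;
    c[idx] = a[idx] + b[idx];
}
""")
func = mod.get_function("add_slices")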
def advect(self, b, d, d_0, u, v, dt): size = self.size - 2 dt_0 = dt * (size) bX = size bY = size gX = 1 gY = 1 u_gpu = cuda.mem_alloc(u[1:-1, 1:-1].nbytes) cuda.memcpy_htod(u_gpu, u[1:-1, 1:-1].reshape(size**2)) #u_gpu = cuda.In(u[1:-1, 1:-1].reshape(size**2)) if self.debug and not np.array_equal(u, np.zeros((self.size, self.size))): print ">>> U" print u print "<<< U" v_gpu = cuda.mem_alloc(v[1:-1, 1:-1].nbytes) cuda.memcpy_htod(v_gpu, v[1:-1, 1:-1].reshape(size**2)) #v_gpu = cuda.In(v[1:-1, 1:-1].reshape(size**2)) d_0_gpu = cuda.mem_alloc(d_0.nbytes) cuda.memcpy_htod(d_0_gpu, d_0.reshape(self.size**2)) #d_0_gpu = cuda.In(d_0.reshape(self.size**2)) #d_gpu = cuda.mem_alloc(d[1:-1, 1:-1].nbytes) #cuda.memcpy_htod(d_gpu, d[1:-1, 1:-1].reshape(size**2)) d_gpu = cuda.Out(d[1:-1, 1:-1].reshape(size**2)) if self.debug: print ">>> Entry >>>" print d[1:-1, 1:-1] print "<<< Kernel Launch <<<" self.func_easy(u_gpu, v_gpu, d_gpu, d_0_gpu, block=(bX, bY, 1), grid=(gX, gY)) #d_result = np.empty_like(d[1:-1, 1:-1]).astype(np.float32) #cuda.memcpy_dtod(d_result, d_gpu, d_result.nbytes) if self.debug: print ">>> Result >>>" print d[1:-1, 1:-1] print "<<< End Result <<<" #d[1:-1, 1:-1] = d_result self.set_boundary(b, d)
def blobs_potential(r_vectors, *args, **kwargs):
    '''
    This function computes the energy of the blobs.
    '''
    # Determine number of threads and blocks for the GPU
    number_of_blobs = np.int32(len(r_vectors))
    threads_per_block, num_blocks = set_number_of_threads_and_blocks(number_of_blobs)

    # Get parameters from arguments
    periodic_length = kwargs.get('periodic_length')
    debye_length_wall = kwargs.get('debye_length_wall')
    eps_wall = kwargs.get('repulsion_strength_wall')
    debye_length = kwargs.get('debye_length')
    eps = kwargs.get('repulsion_strength')
    weight = kwargs.get('weight')
    blob_radius = kwargs.get('blob_radius')

    # Reshape arrays
    x = np.reshape(r_vectors, number_of_blobs * 3)

    # Allocate CPU memory
    U = np.empty(number_of_blobs)

    # Allocate GPU memory
    utype = np.float64(1.)
    x_gpu = cuda.mem_alloc(x.nbytes)
    u_gpu = cuda.mem_alloc(U.nbytes)

    # Copy data to the GPU (host to device)
    cuda.memcpy_htod(x_gpu, x)

    # Get pair interaction function
    potential_from_position_blobs = mod.get_function("potential_from_position_blobs")

    # Compute pair interactions
    potential_from_position_blobs(x_gpu, u_gpu, number_of_blobs,
                                  np.float64(periodic_length[0]), np.float64(periodic_length[1]),
                                  np.float64(debye_length_wall), np.float64(eps_wall),
                                  np.float64(debye_length), np.float64(eps),
                                  np.float64(weight), np.float64(blob_radius),
                                  block=(threads_per_block, 1, 1), grid=(num_blocks, 1))

    # Copy data from GPU to CPU (device to host)
    cuda.memcpy_dtoh(U, u_gpu)

    return np.sum(U)
def prepare(self, P): n = len(P.state_(self.eqs._diffeq_names_nonzero[0])) var_len = len(dict.fromkeys(self.eqs._diffeq_names))+1 # +1 needed to store t for index,varname in enumerate(self.eqs._diffeq_names): self.index_to_varname.append(varname) self.varname_to_index[varname]= index if varname in self.eqs._diffeq_names_nonzero : self.index_nonzero.append(index) self.S_in = cuda.pagelocked_zeros((n,var_len),numpy.float64) self.S_out = cuda.pagelocked_zeros((n,var_len),numpy.float64) nbytes = n * var_len * numpy.dtype(numpy.float64).itemsize self.S_in_gpu = cuda.mem_alloc(nbytes) self.S_out_gpu = cuda.mem_alloc(nbytes) Z = zeros((n,var_len)) self.A_gpu = cuda.mem_alloc(nbytes) cuda.memcpy_htod(self.A_gpu, Z) self.B_gpu = cuda.mem_alloc(nbytes) cuda.memcpy_htod(self.B_gpu, Z) self.S_temp_gpu = cuda.mem_alloc(nbytes) modFun={} self.applyFun = {} for x in self.index_nonzero: s = self.eqs._function_C_String[self.index_to_varname[x]] args_fun =[] for i in xrange(var_len): args_fun.append("S_temp["+str(i)+" + blockIdx.x * var_len]") modFun[x] = SourceModule(""" __device__ double f"""+ s +""" __global__ void applyFun(double *A,double *B,double *S_in,double *S_temp, int x, int var_len) { int idx = x + blockIdx.x * var_len; S_temp[idx] = 0; B[idx] = f("""+",".join(args_fun)+"""); S_temp[idx] = 1; A[idx] = f("""+",".join(args_fun)+""") - B[idx]; B[idx] /= A[idx]; S_temp[idx] = S_in[idx]; } """) self.applyFun[x] = modFun[x].get_function("applyFun") self.applyFun[x].prepare(['P','P','P','P','i','i'],block=(1,1,1)) self.calc_dict = {} self.already_calc = {}
def processor(frame):
    """Applies the frame_filter 2D array to each channel of the image"""
    # allocate memory and transfer from host to device
    d_frame_in, d_frame_out = cu.mem_alloc(frame.nbytes), cu.mem_alloc(frame.nbytes)
    # , cu.mem_alloc(offset.nbytes), cu.mem_alloc(F.nbytes)
    cu.memcpy_htod(d_frame_in, frame)
    cu.memcpy_htod(d_frame_out, frame)

    filter_kernel(d_frame_in, d_frame_out, block=b_size, grid=g_size)

    # transfer from device to host
    cu.memcpy_dtoh(frame, d_frame_out)
    return frame
def execute(positions, num_particles, num_frames): #Get host positions: cpuPos = numpy.array(positions, dtype=numpy.float32) #Allocate position space on device: devPos = cuda.mem_alloc(cpuPos.nbytes) #Copy positions: cuda.memcpy_htod(devPos, cpuPos) #Allocate device velocities: devVels = cuda.mem_alloc(2 * num_particles * numpy.float32().nbytes) cuda.memset_d32(devVels, 0, 2 * num_particles) # #Copy velocities: # cuda.memcpy_htod(devVels, cpuVels) #Allocate and initialize device in bounds to false: #inBounds = numpy.zeros(num_particles, dtype=bool) devInBounds = cuda.mem_alloc(num_particles * numpy.bool8().nbytes) cuda.memset_d8(devInBounds, True, num_particles) # inB = numpy.zeros(num_particles, dtype=numpy.bool) # cuda.memcpy_dtoh(inB, devInBounds) # print inB # cuda.memcpy_htod(devInBounds, inBounds) # numBlocks = 1#(num_particles // 512) + 1; grid_dim = ((num_particles // NUM_THREADS) + 1, 1) print grid_dim runframe = module.get_function("runframe") frames = [None] * num_frames for i in range(num_frames): runframe(devPos, devVels, devInBounds, numpy.int32(num_particles), grid=grid_dim, block=(NUM_THREADS, 1, 1)) #Get the positions from device: cuda.memcpy_dtoh(cpuPos, devPos) frames[i] = cpuPos.copy() #frames[i] = copy(cpuPos) #write_frame(out, cpuPos, num_particles) #Simulation destination file: # out = open(OUTPUT_FILE, 'w') # write_header(out, num_particles) # for frame in frames: # write_frame(out, frame, num_particles) #clean up... #out.close() devPos.free() devVels.free() devInBounds.free()
def _gpuAlloc(self):
    # Get GPU information
    self.freeMem = cuda.mem_get_info()[0] * .5 * .8  # limit memory use to 80% of available
    self.maxPossRows = np.int(np.floor(self.freeMem / (4 * self.totalCols)))  # multiply by 4 as that is size of float

    # set max rows to smaller number to save memory usage
    if self.totalRows < self.maxPossRows:
        print "reducing max rows to reduce memory use on GPU"
        self.maxPossRows = self.totalRows

    # create pagelocked buffers and GPU arrays
    self.to_gpu_buffer = cuda.pagelocked_empty((self.maxPossRows, self.totalCols), np.float32)
    self.from_gpu_buffer = cuda.pagelocked_empty((self.maxPossRows, self.totalCols), np.float32)
    self.data_gpu = cuda.mem_alloc(self.to_gpu_buffer.nbytes)
    self.result_gpu = cuda.mem_alloc(self.from_gpu_buffer.nbytes)
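# The pagelocked buffers created in _gpuAlloc() above are typically paired
# with asynchronous copies on a stream. A hypothetical usage sketch follows;
# `stream`, `rows`, `host_chunk`, and the kernel launch are assumptions for
# illustration, not part of the original class:
stream = cuda.Stream()
self.to_gpu_buffer[:rows, :] = host_chunk                       # stage data in pinned memory
cuda.memcpy_htod_async(self.data_gpu, self.to_gpu_buffer, stream)
# ... launch the processing kernel on `stream` here ...
cuda.memcpy_dtoh_async(self.from_gpu_buffer, self.result_gpu, stream)
stream.synchronize()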
def test_multi_context(self):
    if drv.get_version() < (2, 0, 0):
        return
    if drv.get_version() >= (2, 2, 0):
        if drv.Context.get_device().compute_mode == drv.compute_mode.EXCLUSIVE:
            return

    mem_a = drv.mem_alloc(50)
    ctx2 = drv.Context.get_device().make_context()
    mem_b = drv.mem_alloc(60)

    del mem_a
    del mem_b
    ctx2.detach()
def test(cls, count, correctness=False): keys = np.uint32(np.random.randint(0, 1<<cls.radix_bits, size=count)) dkeys = cuda.to_device(keys) dout_a = cuda.mem_alloc(count * 4) dout_b = cuda.mem_alloc(count * 4) sorter = cls(count) stream = cuda.Stream() def test_stub(shift, trials=10, rounds=1): # Run once so that evt_a doesn't include initialization time sorter.multisort(dout_a, dout_b, dkeys, count, shift, rounds, stream=stream) evt_a = cuda.Event().record(stream) for i in range(trials): buf = sorter.multisort(dout_a, dout_b, dkeys, count, shift, rounds, stream=stream) evt_b = cuda.Event().record(stream) evt_b.synchronize() dur = evt_b.time_since(evt_a) / (rounds * trials) print '%6.1f,\t%4.0f,\t%4.0f' % (dur, count / (dur * 1000), count * sorter.radix_bits / (dur * 32 * 1000)) if shift == 0 and correctness: print '\nTesting correctness' out = cuda.from_device(buf, (count,), np.uint32) sort = np.sort(keys) if np.all(out == sort): print 'Correct' else: nz = np.nonzero(out != sort)[0] print sorted(set(nz >> 13)) for i in nz: print i, out[i-1:i+2], sort[i-1:i+2] assert False, 'Oh no' for b in range(cls.radix_bits - 3): print '%2d (%2d sig bits),\t' % (cls.radix_bits, cls.radix_bits - b), test_stub(b) if not correctness: for r in range(2,3): keys[:] = np.uint32( np.random.randint(0, 1<<(cls.radix_bits*r), count)) cuda.memcpy_htod(dkeys, keys) print '%2d x %d,\t\t\t' % (cls.radix_bits, r), test_stub(0, rounds=r) print
def set_round_drill( self , size ) : sx , sy = self.get_scale() nx , ny = int(size / sx + .5) , int(size / sy + .5) self.drillflat = False print 'Setting round drill:' print size print sx , sy print nx , ny self.hdrill = np.zeros( (nx,ny) , np.float32 ) size /= 2.0 for x in range(nx) : for y in range(ny) : fx = (x-int(nx/2+.5)) * sx fy = (y-int(ny/2+.5)) * sy ts = size*size - fx*fx - fy*fy self.hdrill[x,y] = -m.sqrt( ts ) + size if ts > 0 else size*2 self.drillrad = size print self.hdrill print self.drillrad self.cdrill = cuda_driver.mem_alloc( self.hdrill.nbytes ) cuda_driver.memcpy_htod( self.cdrill , self.hdrill ) self.grid = map( int , ( m.ceil(nx/22.0) , m.ceil(ny/22.0) ) ) self.block = ( min(nx,22) , min(ny,22) , 1 ) print self.grid print self.block
d_plotting_information['gpu_s1pf_lb_acc'] = pycuda.gpuarray.to_gpu(a_s1pf_lb_acc)
d_plotting_information['gpu_s1pf_mean_acc'] = pycuda.gpuarray.to_gpu(a_s1pf_mean_acc)
d_plotting_information['gpu_s1pf_ub_acc'] = pycuda.gpuarray.to_gpu(a_s1pf_ub_acc)

# get random seeds setup
local_gpu_setup_kernel = pycuda.compiler.SourceModule(
    cuda_full_observables_production.cuda_full_observables_production_code,
    no_extern_c=True).get_function('setup_kernel')

local_rng_states = drv.mem_alloc(
    np.int32(num_blocks * block_dim) * pycuda.characterize.sizeof(
        'curandStateXORWOW', '#include <curand_kernel.h>'))
local_gpu_setup_kernel(np.int32(int(num_blocks * block_dim)), local_rng_states,
                       np.uint64(0), np.uint64(0),
                       grid=(int(num_blocks), 1), block=(int(block_dim), 1, 1))

# get observables function
gpu_observables_func = SourceModule(
    cuda_full_observables_production.cuda_full_observables_production_code,
    no_extern_c=True).get_function(
        'gpu_full_observables_production_with_log_hist_no_fv')

gpu_observables_func_arrays = SourceModule(
    cuda_full_observables_production.cuda_full_observables_production_code,
X, Y = np.meshgrid(x, y)
img = X * Y
img = np.asarray(img, float)

plt.figure(1)
plt.imshow(img)
plt.colorbar()
plt.title('Input image')
print(img.dtype)

"""Moving the data to the device and allocating space for the result."""

# --- Move the image from host to device
d_img = cuda.mem_alloc(img.nbytes)
cuda.memcpy_htod(d_img, img)
d_img2 = cuda.mem_alloc(img.nbytes)

"""Operating the 2D fftshift."""

fftshift2D(d_img, d_img2, np.int32(M), np.int32(N), block=blockDim, grid=gridDim)

img2 = np.empty_like(img)
cuda.memcpy_dtoh(img2, d_img2)

plt.figure(2)
plt.imshow(img2)
plt.colorbar()
plt.title('Output image')
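# The call above assumes a compiled fftshift2D function plus blockDim/gridDim
# tuples defined earlier in that script. A sketch of a matching kernel for
# float64 images with even M and N — an assumption for illustration, not the
# original source:
from pycuda.compiler import SourceModule

mod = SourceModule("""
__global__ void fftshift2D(const double *in, double *out, int M, int N)
{
    int j = blockIdx.x * blockDim.x + threadIdx.x;   // column index
    int i = blockIdx.y * blockDim.y + threadIdx.y;   // row index
    if (i < M && j < N) {
        int i2 = (i + M / 2) % M;                    // swap quadrants
        int j2 = (j + N / 2) % N;
        out[i2 * N + j2] = in[i * N + j];
    }
}
""")
fftshift2D = mod.get_function("fftshift2D")
blockDim = (32, 32, 1)
gridDim = ((N + 31) // 32, (M + 31) // 32)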
def cudaArrayMalloc(state, sourceMod):
    global stateCUDA, cudaStep
    stateCUDA = cuda.mem_alloc(state.nbytes)
    cuda.memcpy_htod(stateCUDA, state)  # copy state to GPU
    cudaStep = sourceMod.get_function("stepTestInterface")
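# Hypothetical usage of the globals set up above; the launch configuration and
# the argument list of "stepTestInterface" are assumptions, not taken from the
# original source.
cudaArrayMalloc(state, sourceMod)
cudaStep(stateCUDA, block=(256, 1, 1), grid=(state.size // 256 + 1, 1))
cuda.memcpy_dtoh(state, stateCUDA)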
# prepare engine
with open(cfg.weight, 'rb') as f, trt.Runtime(trt.Logger(trt.Logger.WARNING)) as runtime:
    engine = runtime.deserialize_cuda_engine(f.read())

inputs, outputs, bindings = [], [], []
stream = cuda.Stream()
for binding in engine:
    size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
    dtype = trt.nptype(engine.get_binding_dtype(binding))
    # Allocate host and device buffers
    host_mem = cuda.pagelocked_empty(size, dtype)
    device_mem = cuda.mem_alloc(host_mem.nbytes)
    # Append the device buffer to device bindings.
    bindings.append(int(device_mem))
    if engine.binding_is_input(binding):
        inputs.append(HostDeviceMem(host_mem, device_mem))
    else:
        outputs.append(HostDeviceMem(host_mem, device_mem))

# ------------------------------------------------------------------------------------------------------------
# Since the inference procedure also runs on the GPU, any other CUDA-related operation should be excluded,
# e.g. CUDA operations in PyTorch, or some unexpected error may occur.
# ------------------------------------------------------------------------------------------------------------
# detect images
def _runSimulation(self, parameters, initValues, blocks, threads, in_atol=1e-12, in_rtol=1e-6): totalThreads = threads * blocks experiments = len(parameters) neqn = self._speciesNumber # compile timer = time.time() ## print "Init Common..", init_common_Kernel = self._completeCode.get_function("init_common") init_common_Kernel(block=(threads, 1, 1), grid=(blocks, 1)) ## print "finished in", round(time.time()-timer,4), "s" start_time = time.time() # output array ret_xt = np.zeros( [totalThreads, 1, self._resultNumber, self._speciesNumber]) # calculate sizes of work spaces isize = 20 + self._speciesNumber rsize = 22 + self._speciesNumber * max(16, self._speciesNumber + 9) # local variables t = np.zeros([totalThreads], dtype=np.float64) jt = np.zeros([totalThreads], dtype=np.int32) neq = np.zeros([totalThreads], dtype=np.int32) itol = np.zeros([totalThreads], dtype=np.int32) iopt = np.zeros([totalThreads], dtype=np.int32) rtol = np.zeros([totalThreads], dtype=np.float64) iout = np.zeros([totalThreads], dtype=np.int32) tout = np.zeros([totalThreads], dtype=np.float64) itask = np.zeros([totalThreads], dtype=np.int32) istate = np.zeros([totalThreads], dtype=np.int32) atol = np.zeros([totalThreads], dtype=np.float64) liw = np.zeros([totalThreads], dtype=np.int32) lrw = np.zeros([totalThreads], dtype=np.int32) iwork = np.zeros([isize * totalThreads], dtype=np.int32) rwork = np.zeros([rsize * totalThreads], dtype=np.float64) y = np.zeros([self._speciesNumber * totalThreads], dtype=np.float64) for i in range(totalThreads): neq[i] = neqn #t[i] = self._timepoints[0] t[i] = 0 itol[i] = 1 itask[i] = 1 istate[i] = 1 iopt[i] = 0 jt[i] = 2 atol[i] = in_atol rtol[i] = in_rtol liw[i] = isize lrw[i] = rsize try: # initial conditions for j in range(self._speciesNumber): # loop over species y[i * self._speciesNumber + j] = initValues[i][j] ret_xt[i, 0, 0, j] = initValues[i][j] except IndexError: pass # allocate on device d_t = driver.mem_alloc(t.size * t.dtype.itemsize) d_jt = driver.mem_alloc(jt.size * jt.dtype.itemsize) d_neq = driver.mem_alloc(neq.size * neq.dtype.itemsize) d_liw = driver.mem_alloc(liw.size * liw.dtype.itemsize) d_lrw = driver.mem_alloc(lrw.size * lrw.dtype.itemsize) d_itol = driver.mem_alloc(itol.size * itol.dtype.itemsize) d_iopt = driver.mem_alloc(iopt.size * iopt.dtype.itemsize) d_rtol = driver.mem_alloc(rtol.size * rtol.dtype.itemsize) d_iout = driver.mem_alloc(iout.size * iout.dtype.itemsize) d_tout = driver.mem_alloc(tout.size * tout.dtype.itemsize) d_itask = driver.mem_alloc(itask.size * itask.dtype.itemsize) d_istate = driver.mem_alloc(istate.size * istate.dtype.itemsize) d_y = driver.mem_alloc(y.size * y.dtype.itemsize) d_atol = driver.mem_alloc(atol.size * atol.dtype.itemsize) d_iwork = driver.mem_alloc(iwork.size * iwork.dtype.itemsize) d_rwork = driver.mem_alloc(rwork.size * rwork.dtype.itemsize) # copy to device driver.memcpy_htod(d_t, t) driver.memcpy_htod(d_jt, jt) driver.memcpy_htod(d_neq, neq) driver.memcpy_htod(d_liw, liw) driver.memcpy_htod(d_lrw, lrw) driver.memcpy_htod(d_itol, itol) driver.memcpy_htod(d_iopt, iopt) driver.memcpy_htod(d_rtol, rtol) driver.memcpy_htod(d_iout, iout) driver.memcpy_htod(d_tout, tout) driver.memcpy_htod(d_itask, itask) driver.memcpy_htod(d_istate, istate) driver.memcpy_htod(d_y, y) driver.memcpy_htod(d_atol, atol) driver.memcpy_htod(d_iwork, iwork) driver.memcpy_htod(d_rwork, rwork) param = np.zeros((totalThreads, self._parameterNumber), dtype=np.float32) try: for i in range(len(parameters)): for j in range(self._parameterNumber): param[i][j] 
= parameters[i][j] except IndexError: pass # parameter texture ary = sim.create_2D_array(param) sim.copy2D_host_to_array(ary, param, self._parameterNumber * 4, totalThreads) self._param_tex.set_array(ary) if self._dt <= 0: start_time = time.time() #for i in range(1,self._resultNumber): for i in range(0, self._resultNumber): for j in range(totalThreads): tout[j] = self._timepoints[i] driver.memcpy_htod(d_tout, tout) self._compiledRunMethod(d_neq, d_y, d_t, d_tout, d_itol, d_rtol, d_atol, d_itask, d_istate, d_iopt, d_rwork, d_lrw, d_iwork, d_liw, d_jt, block=(threads, 1, 1), grid=(blocks, 1)) driver.memcpy_dtoh(t, d_t) driver.memcpy_dtoh(y, d_y) driver.memcpy_dtoh(istate, d_istate) for j in range(totalThreads): for k in range(self._speciesNumber): ret_xt[j, 0, i, k] = y[j * self._speciesNumber + k] # end of loop over time points else: tt = self._timepoints[0] start_time = time.time() #for i in range(1,self._resultNumber): for i in range(0, self._resultNumber): while 1: next_time = min(tt + self._dt, self._timepoints[i]) for j in range(totalThreads): tout[j] = next_time driver.memcpy_htod(d_tout, tout) self._compiledRunMethod(d_neq, d_y, d_t, d_tout, d_itol, d_rtol, d_atol, d_itask, d_istate, d_iopt, d_rwork, d_lrw, d_iwork, d_liw, d_jt, block=(threads, 1, 1), grid=(blocks, 1)) driver.memcpy_dtoh(t, d_t) driver.memcpy_dtoh(y, d_y) driver.memcpy_dtoh(istate, d_istate) if np.abs(next_time - self._timepoints[i]) < 1e-5: tt = next_time break tt = next_time for j in range(totalThreads): for k in range(self._speciesNumber): ret_xt[j, 0, i, k] = y[j * self._speciesNumber + k] # end of loop over time points return ret_xt[0:experiments]
def Decrypt(): #Initialize Timers if cfg.DEBUG_IMAGES: misc_timer = np.zeros(6) else: misc_timer = np.zeros(5) perf_timer = np.zeros(5) overall_time = perf_counter() # Read input image misc_timer[0] = overall_time img = cv2.imread(cfg.ENC_OUT, 1) if img is None: print("File does not exist!") raise SystemExit(0) dim = img.shape misc_timer[1] = perf_counter() # Read log file with open(cfg.LOG, "r") as f: width = int(f.readline()) height = int(f.readline()) rounds = int(f.readline()) #fracID = int(f.readline()) misc_timer[1] = perf_counter() - misc_timer[1] # Flatten image to vector and send to GPU imgArr = np.asarray(img).reshape(-1) gpuimgIn = cuda.mem_alloc(imgArr.nbytes) gpuimgOut = cuda.mem_alloc(imgArr.nbytes) cuda.memcpy_htod(gpuimgIn, imgArr) misc_timer[0] = perf_counter() - misc_timer[0] - misc_timer[1] # Warm-Up GPU for accurate benchmarking if cfg.DEBUG_TIMER: funcTemp = cf.mod.get_function("WarmUp") funcTemp(grid=(1,1,1), block=(1,1,1)) # Inverse Permutation: Intra-row/column rotation perf_timer[0] = perf_counter() U = cf.genRelocVec(dim[0],dim[1],cfg.P1LOG, ENC=False) # Col-rotation | len(U)=n, values from 0->m V = cf.genRelocVec(dim[1],dim[0],cfg.P2LOG, ENC=False) # Row-rotation | len(V)=m, values from 0->n perf_timer[0] = perf_counter() - perf_timer[0] misc_timer[2] = perf_counter() gpuU = cuda.mem_alloc(U.nbytes) gpuV = cuda.mem_alloc(V.nbytes) cuda.memcpy_htod(gpuU, U) cuda.memcpy_htod(gpuV, V) func = cf.mod.get_function("Dec_GenCatMap") misc_timer[2] = perf_counter() - misc_timer[2] perf_timer[1] = perf_counter() for i in range(cfg.PERM_ROUNDS): func(gpuimgIn, gpuimgOut, gpuU, gpuV, grid=(dim[0],dim[1],1), block=(3,1,1)) gpuimgIn, gpuimgOut = gpuimgOut, gpuimgIn perf_timer[1] = perf_counter() - perf_timer[1] if cfg.DEBUG_IMAGES: misc_timer[5] += cf.interImageWrite(gpuimgIn, "OUT_1", len(imgArr), dim) # Inverse Fractal XOR Phase temp_timer = perf_counter() fractal, misc_timer[3] = cf.getFractal(dim[0]) fracArr = np.asarray(fractal).reshape(-1) gpuFrac = cuda.mem_alloc(fracArr.nbytes) cuda.memcpy_htod(gpuFrac, fracArr) func = cf.mod.get_function("FracXOR") misc_timer[3] = perf_counter() - temp_timer perf_timer[2] = perf_counter() func(gpuimgIn, gpuimgOut, gpuFrac, grid=(dim[0]*dim[1],1,1), block=(3,1,1)) perf_timer[2] = perf_counter() - perf_timer[2] gpuimgIn, gpuimgOut = gpuimgOut, gpuimgIn if cfg.DEBUG_IMAGES: misc_timer[5] += cf.interImageWrite(gpuimgIn, "OUT_2", len(imgArr), dim) # Ar Phase: Cat-map Iterations misc_timer[4] = perf_counter() imgShuffle = np.arange(start=0, stop=len(imgArr)/3, dtype=np.uint32) gpuShuffIn = cuda.mem_alloc(imgShuffle.nbytes) gpuShuffOut = cuda.mem_alloc(imgShuffle.nbytes) cuda.memcpy_htod(gpuShuffIn, imgShuffle) func = cf.mod.get_function("ArMapTable") misc_timer[4] = perf_counter() - misc_timer[4] # Recalculate mapping to generate lookup table perf_timer[3] = perf_counter() for i in range(rounds): func(gpuShuffIn, gpuShuffOut, grid=(dim[0],dim[1],1), block=(1,1,1)) gpuShuffIn, gpuShuffOut = gpuShuffOut, gpuShuffIn perf_timer[3] = perf_counter() - perf_timer[3] # Apply mapping gpuShuffle = gpuShuffIn func = cf.mod.get_function("ArMapTabletoImg") perf_timer[4] = perf_counter() func(gpuimgIn, gpuimgOut, gpuShuffle, grid=(dim[0]*dim[1],1,1), block=(3,1,1)) perf_timer[4] = perf_counter() - perf_timer[4] if cfg.DEBUG_IMAGES: misc_timer[5] += cf.interImageWrite(gpuimgOut, "OUT_3", len(imgArr), dim) # Transfer vector back to host and reshape into original dimensions if needed temp_timer = perf_counter() cuda.memcpy_dtoh(imgArr, gpuimgOut) img = 
(np.reshape(imgArr,dim)).astype(np.uint8) if height!=width: img = cv2.resize(img,(height,width),interpolation=cv2.INTER_CUBIC) dim = img.shape cv2.imwrite(cfg.DEC_OUT, img) misc_timer[0] += perf_counter() - temp_timer # Print timing statistics if cfg.DEBUG_TIMER: overall_time = perf_counter() - overall_time perf = np.sum(perf_timer) misc = np.sum(misc_timer) print("\nTarget: {} ({}x{})".format(cfg.ENC_IN, dim[1], dim[0])) print("\nPERF. OPS: \t{0:9.7f}s ({1:5.2f}%)".format(perf, perf/overall_time*100)) print("Shuffle Gen: \t{0:9.7f}s ({1:5.2f}%)".format(perf_timer[0], perf_timer[0]/overall_time*100)) print("Perm. Kernel: \t{0:9.7f}s ({1:5.2f}%)".format(perf_timer[1], perf_timer[1]/overall_time*100)) print("XOR Kernel: \t{0:9.7f}s ({1:5.2f}%)".format(perf_timer[2], perf_timer[2]/overall_time*100)) print("LUT Kernel:\t{0:9.7f}s ({1:5.2f}%)".format(perf_timer[3], perf_timer[3]/overall_time*100)) print("Mapping Kernel:\t{0:9.7f}s ({1:5.2f}%)".format(perf_timer[4], perf_timer[4]/overall_time*100)) print("\nMISC. OPS: \t{0:9.7f}s ({1:5.2f}%)".format(misc, misc/overall_time*100)) print("I/O:\t\t{0:9.7f}s ({1:5.2f}%)".format(misc_timer[0], misc_timer[0]/overall_time*100)) print("Log Read:\t{0:9.7f}s ({1:5.2f}%)".format(misc_timer[1], misc_timer[1]/overall_time*100)) print("Permute Misc:\t{0:9.7f}s ({1:5.2f}%)".format(misc_timer[2], misc_timer[2]/overall_time*100)) print("FracXOR Misc:\t{0:9.7f}s ({1:5.2f}%)".format(misc_timer[3], misc_timer[3]/overall_time*100)) print("LUT Misc:\t{0:9.7f}s ({1:5.2f}%)".format(misc_timer[4], misc_timer[4]/overall_time*100)) if cfg.DEBUG_IMAGES: print("Debug Images:\t{0:9.7f}s ({1:5.2f}%)".format(misc_timer[5], misc_timer[5]/overall_time*100)) print("\nNET TIME:\t{0:7.5f}s\n".format(overall_time))
def __call__(self, input_im, row_kernel, col_kernel, result=None, input_shape=None, row_shape=None, col_shape=None, **kwargs): self.ctx.push() use_cached_buffers = kwargs.get("use_cached_buffers", True) if input_im.__class__ == numpy.ndarray and input_im.dtype != numpy.float32: raise KernelMustUseFloat32Exception (input_dev, input_shape, input_type) = self.transfer_to_device(input_im) (row_dev, row_shape, row_type) = self.transfer_to_device(row_kernel) (col_dev, col_shape, col_type) = self.transfer_to_device(col_kernel) if input_shape is None: raise UnknownArrayShapeException if row_shape is None: raise UnknownArrayShapeException if col_shape is None: raise UnknownArrayShapeException row_tile_width = 128 col_tile_width = 16 col_tile_height = 48 col_hstride = 8 assert numpy.mod(row_shape[0], 2) == 1, "Kernels must be of odd width" row_kernel_radius = row_shape[0] / 2 coallescing_quantum = 16 row_kernel_radius_aligned = (row_kernel_radius / coallescing_quantum) * coallescing_quantum if row_kernel_radius_aligned == 0: row_kernel_radius_aligned = coallescing_quantum assert numpy.mod(col_shape[0], 2) == 1, "Kernels must be of odd width" col_kernel_radius = col_shape[0] / 2 #build_args = (im_shape, row_kernel_radius, row_kernel_radius_aligned, row_tile_width, col_kernel_radius, col_tile_width, col_tile_height, col_hstride build_args = (input_type, input_shape, row_kernel_radius, row_kernel_radius_aligned, row_tile_width, col_kernel_radius, col_tile_width, col_tile_height, col_hstride) if build_args in self.cached_programs: prg = self.cached_programs[build_args] else: prg = self.build_program(*build_args) row_local_size = (row_kernel_radius_aligned + row_tile_width + row_kernel_radius, 1, 1) row_group_size = (int_div_up(input_shape[1], row_tile_width), input_shape[0]) row_global_size = (row_local_size[0] * row_group_size[0], row_local_size[1] * row_group_size[1]) col_local_size = (col_tile_width, col_hstride, 1) col_group_size = (int_div_up(input_shape[1], col_tile_width), int_div_up(input_shape[0], col_tile_height)) col_global_size = (col_local_size[0] * col_group_size[0], col_local_size[1] * col_group_size[1]) #print col_local_size #print col_group_size #print col_global_size # a device buffer for the intermediate result intermediate_dev = None if (use_cached_buffers) and ((input_shape, input_type) in self.cached_intermediate_buffers): intermediate_dev = self.cached_intermediate_buffers[(input_shape, input_type)] else: dummy = numpy.array([1], dtype=input_type) intermediate_dev = cuda.mem_alloc(input_shape[0] * input_shape[1] * dummy.itemsize) self.cached_intermediate_buffers[(input_shape, input_type)] = intermediate_dev # a device buffer for the result, if not already supplied result_dev = None if result is None or result.__class__ == numpy.ndarray: # need to make or repurpose a device buffer if (use_cached_buffers) and ((input_shape, input_type) in self.cached_result_buffers): result_dev = self.cached_result_buffers[(input_shape, input_type)] #print "Here" #print(result_dev) else: dummy = numpy.array([1], dtype=input_type) result_dev = cuda.mem_alloc(input_shape[0] * input_shape[1] * dummy.itemsize) self.cached_result_buffers[(input_shape, input_type)] = result_dev self.cached_shapes[result_dev] = input_shape self.cached_types[result_dev] = input_type else: # assume that result is a device buffer already (possibly not a safe assumption) result_dev = result #t = Timer() try: f = prg.get_function("separable_convolution_row") f(intermediate_dev, input_dev, row_dev, grid=[int(e) for e in 
row_group_size], block=[int(e) for e in row_local_size]) self.ctx.synchronize() except Exception as e: print(input_shape) print(intermediate_dev) print(input_dev) print(row_dev) print(row_global_size) print(row_local_size) raise e try: f = prg.get_function("separable_convolution_col") f(result_dev, intermediate_dev, col_dev, grid=[int(e) for e in col_group_size], block=[int(e) for e in col_local_size]) self.ctx.synchronize() except Exception as e: print(input_shape) print(result_dev) print(intermediate_dev) print(row_dev) print(row_shape) raise e #print("Elapsed: %f" % t.elapsed) if kwargs.get("readback_from_device", False): if result is None: result = self.transfer_from_device(result_dev, shape=input_shape) else: self.transfer_from_device(result_dev, result) else: result = result_dev self.ctx.pop() return result
def allocate_GPU_mem(self):
    self.pyramid_d = cuda.mem_alloc(self.pyramid.nbytes)
    self.pyrlevel_d = cuda.mem_alloc(self.pyrlevelCones.nbytes)
    self.fov_d = cuda.mem_alloc(self.pyrlevelCones.nbytes)
def __init__(self, path, workspace): # parameters self.path = path # config from path try: yaml_path = self.path + "/cfg.yaml" print("Opening config file %s" % yaml_path) self.CFG = yaml.load(open(yaml_path, 'r')) except Exception as e: print(e) print("Error opening cfg.yaml file from trained model.") quit() # get the data parserModule = imp.load_source("parserModule", booger.TRAIN_PATH + '/tasks/classification/dataset/' + self.CFG["dataset"]["name"] + '/parser.py') self.parser = parserModule.Parser(img_prop=self.CFG["dataset"]["img_prop"], img_means=self.CFG["dataset"]["img_means"], img_stds=self.CFG["dataset"]["img_stds"], classes=self.CFG["dataset"]["labels"], train=False) # some useful data self.data_h, self.data_w, self.data_d = self.parser.get_img_size() self.means, self.stds = self.parser.get_means_stds() self.means = np.array(self.means, dtype=np.float32) self.stds = np.array(self.stds, dtype=np.float32) self.nclasses = self.parser.get_n_classes() # try to deserialize the engine first self.engine = None self.engine_serialized_path = path + "/model.trt" try: with open(self.engine_serialized_path, "rb") as f: self.runtime = trt.Runtime(TRT_LOGGER) self.engine = self.runtime.deserialize_cuda_engine(f.read()) except Exception as e: print("Could not deserialize engine. Generate instead. Error: ", e) self.engine = None # architecture definition from onnx if no engine is there # get weights? if self.engine is None: try: # basic stuff for onnx parser self.model_path = path + "/model.onnx" self.builder = trt.Builder(TRT_LOGGER) self.network = self.builder.create_network() self.onnxparser = trt.OnnxParser(self.network, TRT_LOGGER) self.model = open(self.model_path, 'rb') self.onnxparser.parse(self.model.read()) print("Successfully ONNX weights from ", self.model_path) except Exception as e: print("Couldn't load ONNX network. Error: ", e) quit() print("Wait while tensorRT profiles the network and build engine") # trt parameters try: self.builder.max_batch_size = 1 self.builder.max_workspace_size = workspace self.builder.fp16_mode = self.builder.platform_has_fast_fp16 print("Platform has fp16 mode: ", self.builder.platform_has_fast_fp16) print("Calling build_cuda_engine") self.engine = self.builder.build_cuda_engine(self.network) assert(self.engine is not None) except Exception as e: print("Failed creating engine for TensorRT. Error: ", e) quit() print("Done generating tensorRT engine.") # serialize for later print("Serializing tensorRT engine for later (for example in the C++ interface)") try: self.serialized_engine = self.engine.serialize() with open(self.engine_serialized_path, "wb") as f: f.write(self.serialized_engine) except Exception as e: print("Couln't serialize engine. Not critical, so I continue. Error: ", e) else: print("Successfully opened engine from inference directory.") print("WARNING: IF YOU WANT TO PROFILE FOR THIS COMPUTER DELETE model.trt FROM THAT DIRECTORY") # create execution context self.context = self.engine.create_execution_context() # Determine dimensions and create CUDA memory buffers # to hold host inputs/outputs. self.d_input_size = self.data_h * self.data_w * self.data_d * 4 self.d_output_size = self.nclasses * 4 # Allocate device memory for inputs and outputs. self.d_input = cuda.mem_alloc(self.d_input_size) self.d_output = cuda.mem_alloc(self.d_output_size) # Create a stream in which to copy inputs/outputs and run inference. self.stream = cuda.Stream()
# h_input = cuda.pagelocked_empty(engine.get_binding_shape(0).volume(), dtype=np.float32) # h_output = cuda.pagelocked_empty(engine.get_binding_shape(1).volume(), dtype=np.float32) # d_input = cuda.mem_alloc(h_input.nbytes) # d_output = cuda.mem_alloc(h_output.nbytes) with builder.build_cuda_engine(network) as engine: output = np.empty(10, dtype=np.float32) # Allocate device memory d_input = cuda.mem_alloc(1 * img.nbytes) d_output = cuda.mem_alloc(1 * output.nbytes) bindings = [int(d_input), int(d_output)] stream = cuda.Stream() with engine.create_execution_context() as context: cuda.memcpy_htod_async(d_input, img, stream) context.execute_async(bindings=bindings, stream_handle=stream.handle) cuda.memcpy_dtoh_async(output, d_output, stream) stream.synchronize() print("true label: ", label)
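# A minimal sketch (not from the snippet above) of wrapping the same async pattern in a
# reusable helper; `engine`, `img_batch`, `output_size`, and the single-input/single-output
# binding layout are illustrative assumptions.
import numpy as np
import pycuda.driver as cuda
import pycuda.autoinit  # creates a CUDA context

def infer_once(engine, img_batch, output_size):
    """Copy one batch to the GPU, run the engine asynchronously, copy the result back."""
    img_batch = np.ascontiguousarray(img_batch, dtype=np.float32)
    output = np.empty(output_size, dtype=np.float32)
    d_input = cuda.mem_alloc(img_batch.nbytes)
    d_output = cuda.mem_alloc(output.nbytes)
    stream = cuda.Stream()
    with engine.create_execution_context() as context:
        cuda.memcpy_htod_async(d_input, img_batch, stream)
        context.execute_async(bindings=[int(d_input), int(d_output)],
                              stream_handle=stream.handle)
        cuda.memcpy_dtoh_async(output, d_output, stream)
        stream.synchronize()
    return output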
def _run_simulation(self, parameters, init_values, blocks, threads): total_threads = blocks * threads experiments = len(parameters) # simulation specific parameters param = np.zeros( (total_threads / self._beta + 1, self._parameterNumber), dtype=np.float32) try: for i in range(experiments): for j in range(self._parameterNumber): param[i][j] = parameters[i][j] except IndexError: pass if not self._putIntoShared: # parameter texture ary = sim.create_2D_array(param) sim.copy2D_host_to_array(ary, param, self._parameterNumber * 4, total_threads / self._beta + 1) self._param_tex.set_array(ary) shared_memory_parameters = 0 else: # parameter shared Mem shared_memory_parameters = self._parameterNumber * ( threads / self._beta + 2) * 4 shared_memory_per_block_for_rng = threads / self._warp_size * self._state_words * 4 shared_tot = shared_memory_per_block_for_rng + shared_memory_parameters if self._putIntoShared: parameters_input = np.zeros(self._parameterNumber * total_threads / self._beta, dtype=np.float32) species_input = np.zeros(self._speciesNumber * total_threads, dtype=np.float32) result = np.zeros(self._speciesNumber * total_threads * self._resultNumber, dtype=np.float32) # non coalesced try: for i in range(len(init_values)): for j in range(self._speciesNumber): species_input[i * self._speciesNumber + j] = init_values[i][j] except IndexError: pass if self._putIntoShared: try: for i in range(experiments): for j in range(self._parameterNumber): parameters_input[i * self._parameterNumber + j] = parameters[i][j] except IndexError: pass # set seeds using python rng seeds = np.zeros(total_threads / self._warp_size * self._state_words, dtype=np.uint32) for i in range(len(seeds)): seeds[i] = np.uint32(4294967296 * np.random.uniform(0, 1)) # seeds[i] = np.random.random_integers(0,4294967295) species_gpu = driver.mem_alloc(species_input.nbytes) if self._putIntoShared: parameters_gpu = driver.mem_alloc(parameters_input.nbytes) seeds_gpu = driver.mem_alloc(seeds.nbytes) result_gpu = driver.mem_alloc(result.nbytes) driver.memcpy_htod(species_gpu, species_input) if self._putIntoShared: driver.memcpy_htod(parameters_gpu, parameters_input) driver.memcpy_htod(seeds_gpu, seeds) driver.memcpy_htod(result_gpu, result) # run code if self._putIntoShared: self._compiledRunMethod(species_gpu, parameters_gpu, seeds_gpu, result_gpu, block=(threads, 1, 1), grid=(blocks, 1), shared=shared_tot) else: self._compiledRunMethod(species_gpu, seeds_gpu, result_gpu, block=(threads, 1, 1), grid=(blocks, 1), shared=shared_tot) # fetch from GPU memory driver.memcpy_dtoh(result, result_gpu) # reshape result result = result[0:experiments * self._beta * self._resultNumber * self._speciesNumber] result.shape = (experiments, self._beta, self._resultNumber, self._speciesNumber) return result
def get_ptr(array): ptr = cuda.mem_alloc(MatrixStruct.mem_size) mat = MatrixStruct(array, ptr) return ptr, mat
def calculation(in_queue, out_queue): device_num, params = in_queue.get() chunk_size = params['chunk_size'] chunks_num = params['chunks_num'] particles = params['particles'] state = params['state'] representation = params['representation'] quantities = params['quantities'] decoherence = params['decoherence'] if decoherence is not None: decoherence_steps = decoherence['steps'] decoherence_coeff = decoherence['coeff'] else: decoherence_steps = 0 decoherence_coeff = 1 binning = params['binning'] if binning is not None: s = set() for names, _, _ in binning: s.update(names) quantities = sorted(list(s)) c_dtype = numpy.complex128 c_ctype = 'double2' s_dtype = numpy.float64 s_ctype = 'double' Fs = [] cuda.init() device = cuda.Device(device_num) ctx = device.make_context() free, total = cuda.mem_get_info() max_chunk_size = float(total) / len(quantities) / numpy.dtype( c_dtype).itemsize / 1.1 max_chunk_size = 10**int(numpy.log(max_chunk_size) / numpy.log(10)) #print free, total, max_chunk_size if max_chunk_size > chunk_size: subchunk_size = chunk_size subchunks_num = 1 else: assert chunk_size % max_chunk_size == 0 subchunk_size = max_chunk_size subchunks_num = chunk_size / subchunk_size buffers = [] for quantity in sorted(quantities): buffers.append(GPUArray(subchunk_size, c_dtype)) stream = cuda.Stream() # compile code try: source = TEMPLATE.render(c_ctype=c_ctype, s_ctype=s_ctype, particles=particles, state=state, representation=representation, quantities=quantities, decoherence_coeff=decoherence_coeff) except: print exceptions.text_error_template().render() raise try: module = SourceModule(source, no_extern_c=True) except: for i, l in enumerate(source.split("\n")): print i + 1, ":", l raise kernel_initialize = module.get_function("initialize") kernel_calculate = module.get_function("calculate") kernel_decoherence = module.get_function("decoherence") # prepare call parameters gen_block_size = min(kernel_initialize.max_threads_per_block, kernel_calculate.max_threads_per_block) gen_grid_size = device.get_attribute( cuda.device_attribute.MULTIPROCESSOR_COUNT) gen_block = (gen_block_size, 1, 1) gen_grid = (gen_grid_size, 1, 1) num_gen = gen_block_size * gen_grid_size assert num_gen <= 20000 # prepare RNG states #seeds = to_gpu(numpy.ones(size, dtype=numpy.uint32)) seeds = to_gpu( numpy.random.randint(0, 2**32 - 1, size=num_gen).astype(numpy.uint32)) state_type_size = sizeof("curandStateXORWOW", "#include <curand_kernel.h>") states = cuda.mem_alloc(num_gen * state_type_size) #prev_stack_size = cuda.Context.get_limit(cuda.limit.STACK_SIZE) #cuda.Context.set_limit(cuda.limit.STACK_SIZE, 1<<14) # 16k kernel_initialize(states, seeds.gpudata, block=gen_block, grid=gen_grid, stream=stream) #cuda.Context.set_limit(cuda.limit.STACK_SIZE, prev_stack_size) # run calculation args = [states] + [buf.gpudata for buf in buffers] + [numpy.int32(subchunk_size)] if binning is None: results = { quantity: numpy.zeros( (decoherence_steps + 1, chunks_num * subchunks_num), c_dtype) for quantity in quantities } for i in xrange(chunks_num * subchunks_num): kernel_calculate(*args, block=gen_block, grid=gen_grid, stream=stream) for k in xrange(decoherence_steps + 1): if k > 0: kernel_decoherence(*args, block=gen_block, grid=gen_grid, stream=stream) for j, quantity in enumerate(sorted(quantities)): F = (gpuarray.sum(buffers[j], stream=stream) / buffers[j].size).get() results[quantity][k, i] = F for quantity in sorted(quantities): results[quantity] = results[quantity].reshape( decoherence_steps + 1, chunks_num, 
subchunks_num).mean(2).real.tolist() out_queue.put(results) else: bin_accums = [ numpy.zeros(tuple([binnum] * len(vals)), numpy.int64) for vals, binnum, _ in binning ] bin_edges = [None] * len(binning) for i in xrange(chunks_num * subchunks_num): bin_edges = [] kernel_calculate(*args, block=gen_block, grid=gen_grid, stream=stream) results = { quantity: buffers[j].get().real for j, quantity in enumerate(sorted(quantities)) } for binparam, bin_accum in zip(binning, bin_accums): qnames, binnum, ranges = binparam sample_lines = [results[quantity] for quantity in qnames] sample = numpy.concatenate( [arr.reshape(subchunk_size, 1) for arr in sample_lines], axis=1) hist, edges = numpy.histogramdd(sample, binnum, ranges) bin_accum += hist bin_edges.append(numpy.array(edges)) results = [[acc.tolist(), edges.tolist()] for acc, edges in zip(bin_accums, bin_edges)] out_queue.put(results) #ctx.pop() ctx.detach()
def run_pnpoly(context, cc): #read kernel into string with open('pnpoly.cu', 'r') as f: kernel_string = f.read() #compile the kernels module = SourceModule(kernel_string, arch='compute_' + cc, code='sm_' + cc, cache_dir=False, no_extern_c=True) pnpoly_kernel = module.get_function("cn_pnpoly") #set the number of points and the number of vertices size = numpy.int32(2e7) vertices = 600 #allocate page-locked device-mapped host memory points = allocate(2 * size, numpy.float32) bitmap = allocate(size, numpy.int32) vertices = allocate(2 * vertices, numpy.float32) # HINT: need to reference constant memory # d_bitmap = numpy.intp(bitmap.base.get_device_pointer()) d_points = numpy.intp(points.base.get_device_pointer()) #generate/read input data numpy.copyto(points, numpy.random.randn(2 * size).astype(numpy.float32)) numpy.copyto(vertices, numpy.fromfile("vertices.dat", dtype=numpy.float32)) #allocate gpu device memory for storing the vertices d_vertices = drv.mem_alloc(vertices.nbytes) #copy from host memory to GPU device memory drv.memcpy_htod(d_vertices, vertices) # HINT: need to also copy memory to constant array # #kernel arguments gpu_args = [d_bitmap, d_points, d_vertices, size] #setup thread block sizes threads = (256, 1, 1) grid = (int(numpy.ceil(size / float(threads[0]))), 1) #create events for time measurement start = drv.Event() end = drv.Event() #warm up the device a bit before measurement context.synchronize() for i in range(5): pnpoly_kernel(*gpu_args, block=threads, grid=grid) context.synchronize() #run the kernel and measure time using events start.record() pnpoly_kernel(*gpu_args, block=threads, grid=grid) end.record() context.synchronize() print("cn_pnpoly took", end.time_since(start), "ms.") #compute the reference answer using the reference kernel reference = allocate(size, numpy.int32) d_reference = numpy.intp(reference.base.get_device_pointer()) reference_kernel = module.get_function("cn_pnpoly_reference_kernel") ref_args = [d_reference, d_points, d_vertices, size] context.synchronize() start.record() reference_kernel(*ref_args, block=threads, grid=grid) end.record() context.synchronize() print("reference kernel took", end.time_since(start), "ms.") #check if the result is the same test = numpy.sum(numpy.absolute(bitmap - reference)) == 0 if test != True: print("answer:") print(bitmap) print("reference:") print(reference) else: print("ok!")
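# The snippet above relies on an allocate() helper that is not shown. A plausible sketch,
# assuming PyCUDA's page-locked, device-mapped (zero-copy) host allocations, which is what
# .base.get_device_pointer() requires:
import pycuda.driver as drv

def allocate(n, dtype):
    """Host array of n elements that the GPU can address directly (no explicit memcpy)."""
    return drv.pagelocked_empty(int(n), dtype,
                                mem_flags=drv.host_alloc_flags.DEVICEMAP)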
def get_cov(A, blocks=None, threads=None): rows, cols = A.shape rows = int(rows) cols = int(cols) # Assign block and thread size if blocks and threads and blocks <= 1024 and threads <= 1024: blockCount = blocks threadCount = threads else: # Number of threads per block if rows >= 1024: threadCount = 1024 else: threadCount = rows # Number of blocks per grid if cols >= 1024: blockCount = 1024 else: blockCount = cols # Host Memory means = np.zeros(cols) means = means.astype(np.float32) covariances = np.zeros(cols * cols) covariances = covariances.astype(np.float32) # Allocate on device d_A = cuda.mem_alloc(A.size * A.dtype.itemsize) d_means = cuda.mem_alloc(means.size * means.dtype.itemsize) d_covariances = cuda.mem_alloc(covariances.size * covariances.dtype.itemsize) # Copy from host to device cuda.memcpy_htod(d_A, A) cuda.memcpy_htod(d_means, means) cuda.memcpy_htod(d_covariances, covariances) # # Number of threads per block # if rows >= 1024: # threadCount = 1024 # else: # threadCount = rows # # Number of blocks per grid # blockCount = cols # Start GPU time start = cuda.Event() end = cuda.Event() start.record() # Run Kernel func( np.int32(cols), np.int32(rows), d_A, d_means, d_covariances, block=(threadCount, 1, 1), grid=(blockCount, 1), shared=threadCount * A.dtype.itemsize, ) # End GPU time end.record() end.synchronize() ms = start.time_till(end) # Copy result to host cuda.memcpy_dtoh(covariances, d_covariances) # Return Covariance Matrix return np.resize(covariances, (cols, cols)), ms
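# A usage sketch for get_cov() above; the shapes and values are illustrative, not from the
# original. Rows map to threads (samples), columns map to blocks (variables).
A = np.random.rand(2048, 64).astype(np.float32)
cov_matrix, elapsed_ms = get_cov(A)
print(cov_matrix.shape, elapsed_ms)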
numpy.getbuffer() needed due to lack of new-style buffer interface for scalar numpy arrays as of numpy version 1.9.1 see: https://github.com/inducer/pycuda/pull/60 """ cuda.memcpy_htod(int(struct_arr_ptr), numpy.getbuffer(numpy.int32(array.size))) cuda.memcpy_htod( int(struct_arr_ptr) + 8, numpy.getbuffer(numpy.uintp(int(self.data)))) def __str__(self): return str(cuda.from_device(self.data, self.shape, self.dtype)) struct_arr = cuda.mem_alloc(2 * DoubleOpStruct.mem_size) do2_ptr = int(struct_arr) + DoubleOpStruct.mem_size array1 = DoubleOpStruct(numpy.array([1, 2, 3], dtype=numpy.float32), struct_arr) array2 = DoubleOpStruct(numpy.array([0, 4], dtype=numpy.float32), do2_ptr) print("original arrays") print(array1) print(array2) mod = SourceModule(""" struct DoubleOperation { int datalen, __padding; // so 64-bit ptrs can be aligned float *ptr; };
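# numpy.getbuffer() is gone on Python 3 / newer numpy. A sketch of an equivalent way to fill
# the same {int datalen; float *ptr;} layout, using one-element arrays so memcpy_htod gets a
# proper buffer (the helper name is illustrative, not part of the original):
import numpy
import pycuda.driver as cuda

def fill_struct(struct_ptr, data_ptr, length):
    """Write datalen and the device pointer into a DoubleOperation struct on the GPU."""
    cuda.memcpy_htod(int(struct_ptr), numpy.array([length], dtype=numpy.int32))
    cuda.memcpy_htod(int(struct_ptr) + 8, numpy.array([int(data_ptr)], dtype=numpy.uintp))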
#print "#### Data length:", len(data) #dataLength = len(data) dataLength = dataTimeSize pool = genome.Pool(db, 'sell', 'AUDUSD', endDate) data = numpy.array(data).astype(numpy.float32) printFreeMemory() print "Data size ", data.nbytes/1024, " KB" printFreeMemory() data_gpu = cuda.mem_alloc(data.nbytes) cuda.memcpy_htod(data_gpu, data) trees = [] for x in range(poolSize): trees.append( genome.randomTree(treeLength) ) ### Main Loop generations = 0 dataDim = math.floor(dataLength/64.0) evalArray = None lastTrees = None winCount = None lossCount = None
# pycuda tutorial from: https://documen.tician.de/pycuda/tutorial.html import pycuda.driver as cuda import pycuda.autoinit from pycuda.compiler import SourceModule import numpy a = numpy.random.randn(4,4) b = numpy.random.randn(4, 4, 2) a = a.astype(numpy.float32) b = b.astype(numpy.float32) a_gpu = cuda.mem_alloc(a.nbytes) b_gpu = cuda.mem_alloc(b.nbytes) cuda.memcpy_htod(a_gpu, a) cuda.memcpy_htod(b_gpu, b) mod = SourceModule(""" __global__ void doublify(float *a, float ***b) { int idx = threadIdx.x + threadIdx.y * blockDim.x; a[idx] *= 2; b[threadIdx.x][threadIdx.y][0] = (float) (threadIdx.x + 100.0 * threadIdx.y); } """) func = mod.get_function("doublify")
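# Passing the flat buffer b_gpu to a float*** parameter will not index correctly, because the
# device memory holds values rather than pointer tables. A sketch of the flat-index alternative
# (reusing a_gpu and b_gpu from the snippet above; the kernel name is illustrative):
mod_flat = SourceModule("""
__global__ void doublify_flat(float *a, float *b, int d2, int d3)
{
    int x = threadIdx.x;
    int y = threadIdx.y;
    int idx = x + y * blockDim.x;
    a[idx] *= 2;
    // b has shape (4, d2, d3) stored contiguously in C order
    b[(x * d2 + y) * d3 + 0] = (float)(x + 100.0f * y);
}
""")
func_flat = mod_flat.get_function("doublify_flat")
func_flat(a_gpu, b_gpu, numpy.int32(4), numpy.int32(2), block=(4, 4, 1))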
file_data.close() start = time.time() for i in range(0,numPoints): for j in range(0, numDims+1): dataT[j][i] = data[i][j] ### ## allocate memory on device ### X_gpu = cuda.mem_alloc(data.nbytes) X_t_gpu = cuda.mem_alloc(dataT.nbytes) weights_gpu = cuda.mem_alloc(weights.nbytes) old_weights_gpu = cuda.mem_alloc(weights.nbytes) distances_gpu = cuda.mem_alloc(weights.nbytes) labels_gpu = cuda.mem_alloc(labels.nbytes) error_gpu = cuda.mem_alloc(labels.nbytes) prob_gpu = cuda.mem_alloc(labels.nbytes) ### ## transfer data to gpu ### cuda.memcpy_htod(X_gpu, data) cuda.memcpy_htod(X_t_gpu, dataT) cuda.memcpy_htod(weights_gpu, weights)
a1 = np.zeros(1, dtype=np.float64) b1 = np.zeros(10, dtype=np.float64) c1 = np.zeros(100, dtype=np.float64) print a1 print b1 print c1 a1_addr = drv.to_device(a1) b1_addr = drv.to_device(b1) c1_addr = drv.to_device(c1) #print int(a1_addr) #print sys.getsizeof(int(b1_addr)) twod_gpu = drv.mem_alloc(3 * 8) address = np.array([int(a1_addr), int(b1_addr), int(c1_addr)]).astype(np.uint64) #print address drv.memcpy_htod(twod_gpu, address) #diag_kernel(drv.InOut(a1), drv.InOut(b1), drv.InOut(c1), block=(32,1,1)) diag_kernel(twod_gpu, block=(32, 1, 1)) drv.memcpy_dtoh(a1, a1_addr) print a1 drv.memcpy_dtoh(b1, b1_addr) print b1 drv.memcpy_dtoh(c1, c1_addr) print c1
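# For reference, a kernel that consumes this pointer table would take a double** and index
# through it; this is a sketch (the real diag_kernel signature is not shown above):
from pycuda.compiler import SourceModule

mod_ptrs = SourceModule("""
__global__ void touch_all(double **arrays)
{
    // arrays[0] -> a1 (1 element), arrays[1] -> b1 (10), arrays[2] -> c1 (100)
    int tid = threadIdx.x;
    if (tid < 1)   arrays[0][tid] += 1.0;
    if (tid < 10)  arrays[1][tid] += 2.0;
    if (tid < 100) arrays[2][tid] += 3.0;
}
""")
touch_all = mod_ptrs.get_function("touch_all")
touch_all(twod_gpu, block=(128, 1, 1))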
def __init__(self, stream, cache_file=""): trt.IInt8MinMaxCalibrator.__init__(self) self.stream = stream self.d_input = cuda.mem_alloc(self.stream.calibration_data.nbytes) self.cache_file = cache_file stream.reset()
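# The calibrator above still needs the rest of the IInt8 interface. A typical sketch, assuming
# the calibration stream exposes batch_size and next_batch() (as the ImageBatchStream helpers in
# the TensorRT samples do) and that os is imported at module level:
def get_batch_size(self):
    return self.stream.batch_size

def get_batch(self, names):
    batch = self.stream.next_batch()
    if not batch.size:
        return None
    cuda.memcpy_htod(self.d_input, batch)
    return [int(self.d_input)]

def read_calibration_cache(self):
    if os.path.exists(self.cache_file):
        with open(self.cache_file, "rb") as f:
            return f.read()
    return None

def write_calibration_cache(self, cache):
    with open(self.cache_file, "wb") as f:
        f.write(cache)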
def get_distance(self, msa): if not isinstance(msa, MultipleSeqAlignment): raise TypeError("Must provide a MultipleSeqAlignment object.") i = 0 for record in msa: record.index = i i += 1 names = [record.id for record in msa] indices = [record.index for record in msa] dm = DistanceMatrix(names) pair_combinations = list(itertools.combinations( msa, 2)) # in order to combine take from here. combinations = len(pair_combinations) seqLength = len(pair_combinations[0][0]) # host arrays host_combinations = [] for pair in range(combinations): couple = [ "%s" % (pair_combinations[pair][0].seq), "%s" % (pair_combinations[pair][1].seq) ] host_combinations.extend(couple) host_names = self.scoring_matrix.names attributes = len(host_names) hst_scoring_matrix = [] for name in host_names: sequence = self.scoring_matrix[name] hst_scoring_matrix.extend(sequence) host_scoring_matrix = np.array(hst_scoring_matrix) host_scoring_matrix = host_scoring_matrix.astype(np.float64) host_d_matrix = np.zeros((combinations, ), dtype=float) host_d_matrix = host_d_matrix.astype(np.float64) host_names = np.asarray(host_names) host_combinations = np.asarray(host_combinations) ###GPU code start = cuda.Event() end = cuda.Event() # get the optimum block size based on dataset size if (combinations < 128): BLOCKSIZE = 128 elif (combinations < 256): BLOCKSIZE = 256 elif (combinations < 512): BLOCKSIZE = 512 else: BLOCKSIZE = 1024 # Allocate GPU device memory device_scoring_matrix = cuda.mem_alloc(host_scoring_matrix.nbytes) device_names = cuda.mem_alloc(sys.getsizeof(host_names)) device_combinations = cuda.mem_alloc(sys.getsizeof(host_combinations)) device_d_matrix = cuda.mem_alloc(host_d_matrix.nbytes) # Memcopy from host to device cuda.memcpy_htod(device_combinations, host_combinations) cuda.memcpy_htod(device_names, host_names) cuda.memcpy_htod(device_scoring_matrix, host_scoring_matrix) mod = SourceModule(""" #include <stdio.h> #include <string.h> #include <stdlib.h> __global__ void DeviceDM(char device_combinations[] , char device_names[], int n, int N, const int seqLength, double *device_scoring_matrix, double *device_d_matrix) { const int tid = threadIdx.y + blockIdx.y* blockDim.y; if (tid >= N) return; int start1= (tid*2)*(seqLength); int start2= (tid*2+1)*(seqLength); char skip_letters[] = {'-', '*'}; int score = 0; int max_score = 0; if(device_scoring_matrix){ double max_score1 = 0.0; double max_score2 = 0.0; for(int i=0; i < seqLength; i++){ char l1 = device_combinations[start1+i]; char l2 = device_combinations[start2+i]; int l1rank = 0; int l2rank = 0; if(!(l1==skip_letters[0] || l1==skip_letters[1] || l2==skip_letters[0] || l2==skip_letters[1])){ for(int i=0; i< n; i++){ if(l1==device_names[i]){ l1rank=i; } if(l2==device_names[i]){ l2rank=i; } if(l1rank!=0 && l2rank!=0){ break; } } max_score1 = max_score1 + device_scoring_matrix[l1rank* n + l1rank]; max_score2 = max_score2 + device_scoring_matrix[l2rank* n + l2rank]; score += device_scoring_matrix[l1rank*n + l2rank]; } } if(max_score1>=max_score2){ max_score= max_score1; }else{ max_score= max_score2; } }else{ for(int i=0; i < seqLength; i++){ char l1 = device_combinations[start1+i]; char l2 = device_combinations[start2+i]; if(!(l1==skip_letters[0] || l1==skip_letters[1] || l2==skip_letters[0] || l2==skip_letters[1])){ if(l1==l2){ score= score + 1; } } } max_score = seqLength; } if(max_score == 0){ device_d_matrix[tid]=1; }else{ device_d_matrix[tid]=1 - (score * 1.0 / max_score); } } """) # --- Define a reference to the __global__ function and call it
DeviceDM = mod.get_function("DeviceDM") blockDim = (1, BLOCKSIZE, 1) gridDim = (1, combinations // BLOCKSIZE + 1, 1) start.record() DeviceDM(device_combinations, device_names, np.int32(attributes), np.int32(combinations), np.int32(seqLength), device_scoring_matrix, device_d_matrix, block=blockDim, grid=gridDim) end.record() end.synchronize() secs = start.time_till(end) * 1e-3 #print("Processing time = %fs" % (secs)) distance_matrix_list = np.empty_like(host_d_matrix) cuda.memcpy_dtoh(distance_matrix_list, device_d_matrix) device_d_matrix.free() device_combinations.free() device_names.free() device_scoring_matrix.free() final_distance_matrix = distance_matrix_list.tolist() for pair in range(combinations): dm[pair_combinations[pair][0].id, pair_combinations[pair][1].id] = final_distance_matrix[pair] return dm
def __init__(self, input_layers, stream): trt.infer.EntropyCalibrator.__init__(self) self.input_layers = input_layers self.stream = stream self.d_input = cuda.mem_alloc(self.stream.calibration_data.nbytes) stream.reset()
out[idx] = 0; } else{ out[idx] = in[gidx]; } } } } } } """) b = np.random.randn(25, 37, 12).astype(np.float32) out = np.random.randn(25, 37, 12, 60, 5, 5).astype(np.float32) b_gpu = cuda.mem_alloc(b.nbytes) out_gpu = cuda.mem_alloc(out.nbytes) cuda.memcpy_htod(b_gpu, b) start = time.time() func = mod.get_function("UnrollBKernel") func(b_gpu, out_gpu, grid=(37, 12, 25), block=(1, 1, 1)) end = time.time() a_doubled = np.empty_like(out) cuda.memcpy_dtoh(a_doubled, out_gpu) x = 0 # for i in a_doubled[0,0,0]: # print(i) # print(x)
# # the statements above create double-precision data by default; the kernel works in single precision, so convert it. h_list_a = h_list_a.astype(np.float32) h_list_b = h_list_b.astype(np.float32) h_list_out = np.empty_like(h_list_a) # # pass this data from host to device # step 1: allocate buffers on the device first d_list_a = cuda_driver.mem_alloc(h_list_a.nbytes) d_list_b = cuda_driver.mem_alloc(h_list_b.nbytes) d_list_out = cuda_driver.mem_alloc(h_list_out.nbytes) # # step 2: copy the input data to the allocated device buffers cuda_driver.memcpy_htod(d_list_a, h_list_a) cuda_driver.memcpy_htod(d_list_b, h_list_b) # # # write the cuda kernel and compile it
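# One way the remaining steps could look (a sketch; the original kernel is not shown): compile
# an element-wise add, launch it over the flattened lists, then copy the result back.
from pycuda.compiler import SourceModule
import numpy as np

mod = SourceModule("""
__global__ void add_lists(float *a, float *b, float *out, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) out[i] = a[i] + b[i];
}
""")
add_lists = mod.get_function("add_lists")

n = h_list_a.size
threads = 256
blocks = (n + threads - 1) // threads
add_lists(d_list_a, d_list_b, d_list_out, np.int32(n),
          block=(threads, 1, 1), grid=(blocks, 1))

# step 3: copy the result back to the host and check it
cuda_driver.memcpy_dtoh(h_list_out, d_list_out)
assert np.allclose(h_list_out, h_list_a + h_list_b)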
binding_idx_offset = selected_profile * num_binding_per_profile # Specify input shapes. These must be within the min/max bounds of the active profile # Note that input shapes can be specified on a per-inference basis, but in this case, we only have a single shape. input_shape = (args.batch_size, max_seq_length) input_nbytes = trt.volume(input_shape) * trt.int32.itemsize for binding in range(3): context.set_binding_shape(binding_idx_offset + binding, input_shape) assert context.all_binding_shapes_specified # Create a stream in which to copy inputs/outputs and run inference. stream = cuda.Stream() # Allocate device memory for inputs. d_inputs = [cuda.mem_alloc(input_nbytes) for binding in range(3)] # Allocate output buffer by querying the size from the context. This may be different for different input shapes. h_output = cuda.pagelocked_empty(tuple( context.get_binding_shape(binding_idx_offset + 3)), dtype=np.float32) d_output = cuda.mem_alloc(h_output.nbytes) def inference(features, tokens): global h_output _NetworkOutput = collections.namedtuple( # pylint: disable=invalid-name "NetworkOutput", ["start_logits", "end_logits", "feature_index"]) networkOutputs = []
{ __global__ void init_rng(int nthreads, curandState *s, unsigned long long seed, unsigned long long offset) { int id = blockIdx.x*blockDim.x + threadIdx.x; if (id >= nthreads) return; curand_init(seed+id, id, offset, &s[id]); } } // extern "C" """ rng_states_gpu = cuda.mem_alloc( NUM_RUNS * 32 * NUM_RUNS_PER_BLOCK * characterize.sizeof('curandStateXORWOW', '#include <curand_kernel.h>')) module = SourceModule(init_rng_src, no_extern_c=True) init_rng = module.get_function('init_rng') init_rng(np.int32(NUM_RUNS * 32 * NUM_RUNS_PER_BLOCK), rng_states_gpu, np.uint64(time.time()), np.uint64(0), block=(32, NUM_RUNS_PER_BLOCK, 1), grid=(NUM_RUNS, 1)) is_simulation = 0 if NUM_RUNS == 1: is_simulation = 1 defines = "#define NUM_ROUTES " + str(NUM_ROUTES) + "\n" +\ "#define NUM_STOPS " + str(NUM_STOPS) + "\n" +\
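# Once init_rng has seeded the states, a consumer kernel (a sketch, not part of the original)
# can load its curandState by global thread id, draw from it, and write the state back:
sample_src = """
#include <curand_kernel.h>
extern "C" {
__global__ void draw_uniform(int nthreads, curandState *s, float *out)
{
    int id = blockIdx.x * blockDim.x + threadIdx.x;
    if (id >= nthreads) return;
    curandState local = s[id];          // work on a register copy
    out[id] = curand_uniform(&local);   // uniform draw in (0, 1]
    s[id] = local;                      // persist the advanced state
}
} // extern "C"
"""
sample_mod = SourceModule(sample_src, no_extern_c=True)
draw_uniform = sample_mod.get_function('draw_uniform')
n_states = NUM_RUNS * 32 * NUM_RUNS_PER_BLOCK
uniform_gpu = cuda.mem_alloc(n_states * 4)
draw_uniform(np.int32(n_states), rng_states_gpu, uniform_gpu,
             block=(256, 1, 1), grid=((n_states + 255) // 256, 1))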
def convert_image_rgb(self, image): global program start = time.time() iplanes = image.get_planes() w = image.get_width() h = image.get_height() stride = image.get_rowstride() pixels = image.get_pixels() debug("convert_image(%s) planes=%s, pixels=%s, size=%s", image, iplanes, type(pixels), len(pixels)) assert iplanes==ImageWrapper.PACKED, "must use packed format as input" assert image.get_pixel_format()==self.src_format, "invalid source format: %s (expected %s)" % (image.get_pixel_format(), self.src_format) divs = get_subsampling_divs(self.dst_format) #copy packed rgb pixels to GPU: upload_start = time.time() stream = driver.Stream() mem = numpy.frombuffer(pixels, dtype=numpy.byte) in_buf = driver.mem_alloc(len(pixels)) hmem = driver.register_host_memory(mem, driver.mem_host_register_flags.DEVICEMAP) pycuda.driver.memcpy_htod_async(in_buf, mem, stream) out_bufs = [] out_strides = [] out_sizes = [] for i in range(3): x_div, y_div = divs[i] out_stride = roundup(self.dst_width/x_div, 4) out_height = roundup(self.dst_height/y_div, 2) out_buf, out_stride = driver.mem_alloc_pitch(out_stride, out_height, 4) out_bufs.append(out_buf) out_strides.append(out_stride) out_sizes.append((out_stride, out_height)) #ensure uploading has finished: stream.synchronize() #we can now unpin the host memory: hmem.base.unregister() debug("allocation and upload took %.1fms", 1000.0*(time.time() - upload_start)) kstart = time.time() kargs = [in_buf, numpy.int32(stride)] for i in range(3): kargs.append(out_bufs[i]) kargs.append(numpy.int32(out_strides[i])) blockw, blockh = 16, 16 #figure out how many pixels we process at a time in each dimension: xdiv = max([x[0] for x in divs]) ydiv = max([x[1] for x in divs]) gridw = max(1, w/blockw/xdiv) if gridw*2*blockw<w: gridw += 1 gridh = max(1, h/blockh/ydiv) if gridh*2*blockh<h: gridh += 1 debug("calling %s%s, with grid=%s, block=%s", self.kernel_function_name, tuple(kargs), (gridw, gridh), (blockw, blockh, 1)) self.kernel_function(*kargs, block=(blockw,blockh,1), grid=(gridw, gridh)) #we can now free the GPU source buffer: in_buf.free() kend = time.time() debug("%s took %.1fms", self.kernel_function_name, (kend-kstart)*1000.0) self.frames += 1 #copy output YUV channel data to host memory: read_start = time.time() pixels = [] strides = [] for i in range(3): x_div, y_div = divs[i] out_size = out_sizes[i] #direct full plane async copy keeping current GPU padding: plane = driver.aligned_empty(out_size, dtype=numpy.byte) driver.memcpy_dtoh_async(plane, out_bufs[i], stream) pixels.append(plane.data) stride = out_strides[min(len(out_strides)-1, i)] strides.append(stride) stream.synchronize() #the copying has finished, we can now free the YUV GPU memory: #(the host memory will be freed by GC when 'pixels' goes out of scope) for out_buf in out_bufs: out_buf.free() self.cuda_context.synchronize() read_end = time.time() debug("strides=%s", strides) debug("read back took %.1fms, total time: %.1f", (read_end-read_start)*1000.0, 1000.0*(time.time()-start)) return ImageWrapper(0, 0, self.dst_width, self.dst_height, pixels, self.dst_format, 24, strides, planes=ImageWrapper._3_PLANES)
float fa = a*M_PI/8.0; // go around traction circle and try different moves float v1 = max(min(v + AxMAX*dt*cos(fa), (float)VMAX), (float)VMIN); float k1 = max(min(sin(fa)*AyMAX/(v*v), STEER_LIMIT_K), -STEER_LIMIT_K); float t1 = theta + k1*v*dt; float c = vlookup(Vprev, x+v1*cos(t1)*dt, y+v1*sin(t1)*dt, t1, v1, dt); bestcost = min(bestcost, c); } V[di] = pathcost + penalty + bestcost; } """ % (STEER_LIMIT_K, NANGLES, NSPEEDS, GRID_RES, xsize, ysize, VMIN, VMAX, AxMAX, AyMAX, TIMESTEP, homex, homey)) valueiter = mod.get_function("valueiter") V0_gpu = cuda.mem_alloc(NSPEEDS * NANGLES * xsize * ysize * 4) V = np.zeros((NSPEEDS, NANGLES, ysize, xsize), np.float32) + 1000. cuda.memcpy_htod(V0_gpu, V) ye_in = gpuarray.to_gpu(ye) tk_in = gpuarray.to_gpu(tk) tang_in = gpuarray.to_gpu(tang) del ye del tk del tang s = trange(110) v0 = np.sum(V, dtype=np.float64) for j in s: for i in range(20): valueiter(V0_gpu,
z[n] = x[n] + y[n]; } } """.replace('real', real_cpp)) add = mod.get_function("add") EPSILON = 1e-15 NUM_REPEATS = 10 a = 1.23 b = 2.34 c = 3.57 N = 100000000 h_x = numpy.full((N, 1), a, dtype=real_py) h_y = numpy.full((N, 1), b, dtype=real_py) h_z = numpy.zeros_like(h_x, dtype=real_py) d_x = drv.mem_alloc(h_x.nbytes) d_y = drv.mem_alloc(h_y.nbytes) d_z = drv.mem_alloc(h_z.nbytes) drv.memcpy_htod(d_x, h_x) drv.memcpy_htod(d_y, h_y) t_sum = 0 t2_sum = 0 for repeat in range(NUM_REPEATS + 1): start = drv.Event() stop = drv.Event() start.record() add(d_x, d_y, d_z, numpy.int32(N),
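# A standalone sketch of the event-timing pattern the loop above uses (the helper and argument
# names are illustrative, not the original's): record events around the launch, synchronize on
# the stop event, and discard the first warm-up iteration.
import pycuda.driver as drv

def time_kernel(kernel, args, block, grid, repeats=10):
    """Return the average kernel time in milliseconds over `repeats` timed launches."""
    total = 0.0
    for i in range(repeats + 1):
        start, stop = drv.Event(), drv.Event()
        start.record()
        kernel(*args, block=block, grid=grid)
        stop.record()
        stop.synchronize()
        if i > 0:  # skip the warm-up launch
            total += start.time_till(stop)
    return total / repeats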