def _debug_print(self):
    """Copy the device buffer ``self.df1`` into ``self.f`` and print it.

    Side effects: overwrites the contents of ``self.f`` and changes NumPy's
    global print options (precision 3, threshold 10000, line width 200,
    ``suppress=True``); the options are not restored afterwards.
    """
    cuda_driver.memcpy_dtoh(self.f, self.df1)
    np.set_printoptions(precision=3, threshold=10000, linewidth=200,
                        suppress=True)
    # Single-argument print() behaves identically on Python 2 and Python 3.
    print('#' * 80)
    print(self.f)
def calc_blob_blob_forces_pycuda(r_vectors, *args, **kwargs):
    """Run the ``calc_blob_blob_force`` CUDA kernel on a set of blob positions.

    Args:
        r_vectors: blob positions; reshaped to a flat array of
            ``3 * len(r_vectors)`` entries before upload.
        *args: accepted but not used.
        **kwargs: read with ``kwargs.get``:
            ``periodic_length`` (indexed at 0, 1 and 2),
            ``repulsion_strength``, ``debye_length`` and ``blob_radius``.

    Returns:
        Array of shape ``(len(r_vectors), 3)`` holding what the kernel wrote
        to its output buffer.
    """
    n_blobs = np.int32(len(r_vectors))
    threads_per_block, num_blocks = set_number_of_threads_and_blocks(n_blobs)

    # Physical parameters; a missing key yields None here.
    box = kwargs.get('periodic_length')
    strength = kwargs.get('repulsion_strength')
    screening_length = kwargs.get('debye_length')
    radius = kwargs.get('blob_radius')

    # Flat host arrays: input positions and an uninitialised output of the
    # same size and dtype.
    positions = np.reshape(r_vectors, n_blobs * 3)
    forces = np.empty_like(positions)

    # Device buffers; only the positions need uploading.
    positions_gpu = cuda.mem_alloc(positions.nbytes)
    forces_gpu = cuda.mem_alloc(forces.nbytes)
    cuda.memcpy_htod(positions_gpu, positions)

    kernel = mod.get_function("calc_blob_blob_force")
    kernel(positions_gpu,
           forces_gpu,
           np.float64(strength),
           np.float64(screening_length),
           np.float64(radius),
           np.float64(box[0]),
           np.float64(box[1]),
           np.float64(box[2]),
           n_blobs,
           block=(threads_per_block, 1, 1),
           grid=(num_blocks, 1))

    # Bring the result back and expose it as one row per blob.
    cuda.memcpy_dtoh(forces, forces_gpu)
    return np.reshape(forces, (n_blobs, 3))
def loop(iterations):
    """Run the per-step kernel sequence ``iterations`` times.

    Works entirely on module-level state: host arrays ``T`` and ``F``, their
    device counterparts, and the kernels ``prop``, ``density``, ``eq`` and
    ``bounceback``. After the call ``F`` holds the state downloaded at the
    end of the last step.
    """
    launch = dict(block=(blockDimX, blockDimY, 1), grid=(gridDimX, gridDimY))

    step = 0
    while step < iterations:
        # Work on a temporary copy of F so propagation does not overwrite
        # values it still has to read.
        T[:] = F
        cuda.memcpy_htod(T_gpu, T)

        # Propagate.
        prop(F_gpu, T_gpu, **launch)

        # Calculate density and get bounceback from obstacle nodes.
        density(F_gpu, BOUND_gpu, BOUNCEBACK_gpu, DENSITY_gpu, UX_gpu, UY_gpu,
                **launch)

        # Calculate equilibrium.
        eq(F_gpu, FEQ_gpu, DENSITY_gpu, UX_gpu, UY_gpu,
           U_SQU_gpu, U_C2_gpu, U_C4_gpu, U_C6_gpu, U_C8_gpu,
           **launch)

        # Transfer bounceback to obstacle nodes.
        bounceback(F_gpu, BOUNCEBACK_gpu, BOUND_gpu, **launch)

        # Download F so the next pass can copy it into T.
        cuda.memcpy_dtoh(F, F_gpu)
        step += 1
def convolution_cuda(sourceImage, filterx, filtery):
    """Perform a separable convolution of ``sourceImage`` using CUDA.

    ``filterx`` is uploaded to ``d_Kernel_rows`` and ``filtery`` to
    ``d_Kernel_columns``. ``convolutionRowGPU`` then writes an intermediate
    image from the source, and ``convolutionColumnGPU`` writes the final
    image from the intermediate one.

    Args:
        sourceImage: 2-D ``float32`` image (row-major storage).
        filterx: 1-D filter of length ``KERNEL_W``.
        filtery: 1-D filter of length ``KERNEL_W``.

    Returns:
        A new array with the shape and dtype of ``sourceImage`` containing the
        data downloaded from the destination device buffer.

    Raises:
        AssertionError: if the image is not ``float32`` or a filter does not
            have shape ``(KERNEL_W,)``.
    """
    # Validate before doing any work (the copy below can be large).
    assert sourceImage.dtype == 'float32', 'source image must be float32'
    assert filterx.shape == filtery.shape == (KERNEL_W, ), 'Kernel is compiled for a different kernel size! Try changing KERNEL_W'

    destImage = sourceImage.copy()
    (imageHeight, imageWidth) = sourceImage.shape
    filterx = numpy.float32(filterx)
    filtery = numpy.float32(filtery)

    # NOTE(review): DATA_W is rounded up to a multiple of 16, but the device
    # buffers below are sized like the unpadded image. Confirm the kernels
    # never index past imageWidth when the width is not a multiple of 16.
    DATA_W = iAlignUp(imageWidth, 16)
    DATA_H = imageHeight

    # Prepare device arrays.
    destImage_gpu = cuda.mem_alloc_like(destImage)
    sourceImage_gpu = cuda.mem_alloc_like(sourceImage)
    intermediateImage_gpu = cuda.mem_alloc_like(sourceImage)
    cuda.memcpy_htod(sourceImage_gpu, sourceImage)
    # The filters go into constant memory via symbols defined in the kernel.
    cuda.memcpy_htod(d_Kernel_rows, filterx)
    cuda.memcpy_htod(d_Kernel_columns, filtery)

    # Launch geometry for the convolution in each direction.
    blockGridRows = (iDivUp(DATA_W, ROW_TILE_W), DATA_H)
    blockGridColumns = (iDivUp(DATA_W, COLUMN_TILE_W),
                        iDivUp(DATA_H, COLUMN_TILE_H))
    threadBlockRows = (KERNEL_RADIUS_ALIGNED + ROW_TILE_W + KERNEL_RADIUS, 1, 1)
    threadBlockColumns = (COLUMN_TILE_W, 8, 1)

    # Scalar kernel arguments must be passed as fixed-width integers.
    DATA_H = numpy.int32(DATA_H)
    DATA_W = numpy.int32(DATA_W)

    convolutionRowGPU(intermediateImage_gpu, sourceImage_gpu, DATA_W, DATA_H,
                      grid=[int(e) for e in blockGridRows],
                      block=[int(e) for e in threadBlockRows])
    convolutionColumnGPU(destImage_gpu, intermediateImage_gpu, DATA_W, DATA_H,
                         numpy.int32(COLUMN_TILE_W * threadBlockColumns[1]),
                         numpy.int32(DATA_W * threadBlockColumns[1]),
                         grid=[int(e) for e in blockGridColumns],
                         block=[int(e) for e in threadBlockColumns])

    # Pull the data back from the GPU.
    cuda.memcpy_dtoh(destImage, destImage_gpu)
    return destImage
def fromSourceFile():
    """Compile ``simple.cu`` and run its ``doublify`` kernel on a 4x4 array.

    Prints the array downloaded from the GPU, followed by its element-wise
    ratio to twice the input.

    Raises:
        IOError/OSError: if ``simple.cu`` cannot be opened in the current
            working directory.
    """
    import numpy as np
    import pycuda.driver as cuda
    import pycuda.autoinit  # imported for its side effect (CUDA set-up)
    from pycuda.compiler import SourceModule

    # Reproducible random input.
    np.random.seed(1)
    a = np.random.randn(4, 4)
    a = a.astype(np.float32)

    # Read the kernel source; the file is closed as soon as it has been read.
    with open('simple.cu') as source_file:
        kernel_source = source_file.read()
    mod = SourceModule(kernel_source)
    func = mod.get_function("doublify")

    # Allocate device memory and upload the input.
    a_gpu = cuda.mem_alloc(a.nbytes)
    cuda.memcpy_htod(a_gpu, a)

    # One block of 4x4 threads.
    func(a_gpu, block=(4, 4, 1))

    # Collect and show the results.
    a_doubled = np.empty_like(a)
    cuda.memcpy_dtoh(a_doubled, a_gpu)
    print(a_doubled)
    print(a_doubled / (a * 2))
def test_pycuda(self):
    """Test the pycuda installation with a small example.

    Uploads a random 4x4 ``float32`` array, doubles it in place with a
    16-thread kernel and downloads it again. Any exception raised on the way
    (import, compilation, launch, copy) fails the test and is included in the
    failure message; the downloaded values are then compared with ``2 * a``.
    """
    try:
        import pycuda.driver as cuda
        import pycuda.autoinit
        from pycuda.compiler import SourceModule
        import numpy as np

        a = np.random.randn(4, 4)
        print(a)
        a = a.astype(np.float32)
        a_gpu = cuda.mem_alloc(a.nbytes)
        cuda.memcpy_htod(a_gpu, a)
        mod = SourceModule(
            """
            __global__ void doublify(float *a)
            {
                int idx = threadIdx.x + threadIdx.y*4;
                a[idx] *= 2;
            }
            """
        )
        func = mod.get_function("doublify")
        func(a_gpu, block=(4, 4, 1))
        a_doubled = np.empty_like(a)
        cuda.memcpy_dtoh(a_doubled, a_gpu)
    except Exception as exc:
        # Keep the original message but surface the underlying cause.
        self.fail('Still not working: %r' % (exc,))
    else:
        # Multiplying a float32 by 2 is exact, so equality is the right check.
        self.assertTrue(np.array_equal(a_doubled, 2 * a))
def test_constant_memory(self):
    """Check that data written to a ``__constant__`` array reaches a kernel.

    A 32-element ``float32`` array is copied to the module global
    ``const_array``; the kernel copies it into a global-memory buffer, which
    is downloaded and compared with the host data.
    """
    # contributed by Andrew Wagner
    module = SourceModule("""
    __constant__ float const_array[32];

    __global__ void copy_constant_into_global(float* global_result_array)
    {
        global_result_array[threadIdx.x] = const_array[threadIdx.x];
    }
    """)

    copy_constant_into_global = module.get_function("copy_constant_into_global")
    # get_global returns (device pointer, size); only the pointer is needed.
    const_array, _ = module.get_global('const_array')

    host_array = np.random.randint(0, 255, (32,)).astype(np.float32)

    global_result_array = drv.mem_alloc_like(host_array)
    drv.memcpy_htod(const_array, host_array)

    copy_constant_into_global(
        global_result_array,
        grid=(1, 1), block=(32, 1, 1))

    host_result_array = np.zeros_like(host_array)
    drv.memcpy_dtoh(host_result_array, global_result_array)

    # .all must be *called*: the bare bound method is always truthy, which
    # made the original assertion impossible to fail.
    assert (host_result_array == host_array).all()
def poisson_parallel(source_im, dest_im, b_size, g_size, RGB, neighbors, interior_buffer, n):
    """Launch the Poisson blending kernel ``n`` times and return the result.

    Args:
        source_im: source image array, uploaded as the kernel's first buffer.
        dest_im: destination image array; its shape supplies WIDTH/HEIGHT.
        b_size: CUDA block size; entries 0 and 1 also go into the template.
        g_size: CUDA grid size.
        RGB: value substituted for ``RGB`` in the kernel template.
        neighbors: value substituted for ``NEIGHBORS`` in the kernel template.
        interior_buffer: array uploaded as the kernel's third buffer.
        n: number of kernel launches.

    Returns:
        ``uint8`` array shaped like ``dest_im``, filled from the destination
        device buffer after the last launch.
    """
    # Fill in the Cheetah template for the Poisson kernel.
    kernel_template = Template(poisson_blending_source)
    settings = (
        ('BLOCK_DIM_X', b_size[0]),
        ('BLOCK_DIM_Y', b_size[1]),
        ('WIDTH', dest_im.shape[1]),
        ('HEIGHT', dest_im.shape[0]),
        ('RGB', RGB),
        ('NEIGHBORS', neighbors),
    )
    for attr, value in settings:
        setattr(kernel_template, attr, value)

    # Compile the CUDA kernel.
    blend = cuda_compile(kernel_template, "poisson_blending_kernel")

    # Host output buffer plus the three device buffers.
    blended = np.array(dest_im, dtype=np.uint8)
    d_source = cu.mem_alloc(source_im.nbytes)
    d_destination = cu.mem_alloc(dest_im.nbytes)
    d_buffer = cu.mem_alloc(interior_buffer.nbytes)
    cu.memcpy_htod(d_source, source_im)
    cu.memcpy_htod(d_destination, dest_im)
    cu.memcpy_htod(d_buffer, interior_buffer)

    # Run the blending kernel n times over the same device buffers.
    for _ in range(n):
        blend(d_source, d_destination, d_buffer, block=b_size, grid=g_size)

    # Retrieve the final output image.
    cu.memcpy_dtoh(blended, d_destination)
    return blended
def runTest(self):
    """Check that an ``IncidentDirect`` source writes the expected values.

    ``self.args`` is unpacked as ``(nx, ny, nz, str_f, pt0, pt1, is_array)``.
    After one ``update_e``/``update_h`` pass, the device buffer named
    ``str_f`` must equal ``value * tfunc(1)`` inside the region spanned by
    ``pt0``/``pt1``. The context held by ``fields`` is popped even when the
    test fails.
    """
    nx, ny, nz, str_f, pt0, pt1, is_array = self.args
    slice_xyz = common.slices_two_points(pt0, pt1)

    # Random source: an array covering the region, or a single scalar.
    if is_array:
        shape = common.shape_two_points(pt0, pt1)
        value = np.random.rand(*shape).astype(np.float32)
    else:
        value = np.random.ranf()

    fields = Fields(0, nx, ny, nz, '', 'single')
    try:
        tfunc = lambda tstep: np.sin(0.03 * tstep)
        # Not used directly below; the binding keeps the object alive for the
        # duration of the test.
        incident = IncidentDirect(fields, str_f, pt0, pt1, tfunc, value)

        # Expected host-side result after one time step.
        eh = np.zeros(fields.ns_pitch, dtype=fields.dtype)
        eh[slice_xyz] = fields.dtype(value) * fields.dtype(tfunc(1))

        fields.update_e()
        fields.update_h()

        # Download the device buffer and compare the source region.
        copy_eh_buf = fields.get_buf(str_f)
        copy_eh = np.zeros_like(eh)
        cuda.memcpy_dtoh(copy_eh, copy_eh_buf)

        original = eh[slice_xyz]
        copy = copy_eh[slice_xyz]
        norm = np.linalg.norm(original - copy)
        self.assertEqual(norm, 0, '%s, %g' % (self.args, norm))
    finally:
        # Previously skipped whenever an assertion or CUDA call raised,
        # leaving the context pushed for the following tests.
        fields.context_pop()
def fromGPU(self, shared_mem, buff_dtype=np.float32):
    """Download ``self.gpu_data`` into a view of ``shared_mem``.

    Args:
        shared_mem: object whose ``get_obj()`` exposes a buffer; the data is
            written into that buffer, not into a copy.
        buff_dtype: element type used to interpret the buffer.

    Returns:
        A ``(self.buffer_nnets, self.buffer_nsamples)`` array view over the
        start of the shared buffer, filled with the device data.
    """
    flat = np.frombuffer(shared_mem.get_obj(), dtype=buff_dtype)
    n_rows = self.buffer_nnets
    n_cols = self.buffer_nsamples
    # Only the leading n_rows * n_cols elements of the buffer are used.
    window = flat[:n_rows * n_cols].reshape((n_rows, n_cols))
    cuda.memcpy_dtoh(window, self.gpu_data)
    return window
def calcV1complex(self, stim, speed):
    """Compute V1 complex cell responses of a frame.

    Runs the processing stages in order on the device and downloads
    ``self.d_respV1c``.

    Args:
        stim: stimulus frame handed to ``_loadInput``.
        speed: value forwarded to ``_calcV1direction``.

    Returns:
        Flat ``float32`` array of ``nrX * nrY * nrDirs`` responses.
    """
    self._loadInput(stim)          # allocate stim on device
    self._calcV1linear()           # convolve with separate V1 filters
    self._calcV1rect()             # rectify -> V1 simple cell firing rate
    self._calcV1blur()             # spatial pooling -> V1 complex
    self._calcV1normalize()        # divisive normalization
    self._calcV1direction(speed)   # steer filters in specified directions

    # Get data from device.
    n_responses = self.nrX * self.nrY * self.nrDirs
    responses = np.zeros(n_responses, dtype=np.float32)
    cuda.memcpy_dtoh(responses, self.d_respV1c)
    return responses
def scenario_inplace_padded_C2R(batch, tic, toc):
    """Time a batched, in-place, padded C2R cuFFT and check it against NumPy.

    Args:
        batch: number of transforms in the batch.
        tic: event recorded immediately before the transform.
        toc: event recorded (and synchronized) immediately after it.

    Prints whether the GPU result matches ``irfft`` on the host, and the
    elapsed time reported by ``tic.time_till(toc)``.

    NOTE(review): both ``BENG_CHANNELS`` and ``BENG_CHANNELS_`` are used and
    are assumed to be two distinct module-level constants - confirm.
    """
    n = array([2 * BENG_CHANNELS_], int32)
    # Input row length padded up to the next multiple of 16 complex samples.
    inembed = array([16 * (BENG_CHANNELS // 16 + 1)], int32)
    # In-place C2R: each output row spans twice as many real samples.
    onembed = array([2 * inembed[0]], int32)
    plan = cufft.cufftPlanMany(1, n.ctypes.data,
                               inembed.ctypes.data, 1, inembed[0],
                               onembed.ctypes.data, 1, onembed[0],
                               cufft.CUFFT_C2R, batch)

    data_shape = (batch, inembed[0])
    cpu_data = standard_normal(data_shape) + 1j * standard_normal(data_shape)
    cpu_data = cpu_data.astype(complex64)
    gpu_data = cuda.mem_alloc(8 * batch * inembed[0])  # 8 bytes per complex64
    cuda.memcpy_htod(gpu_data, cpu_data)

    tic.record()
    # Same pointer for input and output: the transform runs in place.
    cufft.cufftExecC2R(plan, int(gpu_data), int(gpu_data))
    toc.record()
    toc.synchronize()

    cpu_result = np.empty(batch * onembed[0], dtype=np.float32)
    cuda.memcpy_dtoh(cpu_result, gpu_data)
    # Drop the padding and apply the 1/N factor that NumPy's irfft includes.
    cpu_result = cpu_result.reshape((batch, onembed[0]))[:, :2 * BENG_CHANNELS_] / (2 * BENG_CHANNELS_)
    result = irfft(cpu_data[:, :BENG_CHANNELS], axis=-1)

    elapsed_ms = tic.time_till(toc)
    # Formatted strings reproduce the Python 2 print-statement output while
    # remaining valid on Python 3.
    print('Batched in-place scenario')
    print('test passed: %s' % (np.allclose(cpu_result, result),))
    print('GPU time: %s  ms =  %s  x real (both SB)'
          % (elapsed_ms, elapsed_ms / (batch * 0.5 * 13.128e-3)))
def calculate(self, data, f_high, f_bins):
    """Run ``culsp_kernel`` (compiled from ``self._kernelStr``) over ``data``.

    Args:
        data: array uploaded to the GPU; the result has the same shape/dtype.
        f_high: accepted but not used by this implementation.
        f_bins: accepted but not used by this implementation.

    Returns:
        New array like ``data`` holding the device buffer contents after the
        kernel has run.
    """
    import pycuda.driver as driver
    import pycuda.compiler as compiler
    import pycuda.autoinit

    log = logging.getLogger("astroplpython.function.signal")
    log.debug("CULSP.calculate() called")
    log.debug("Orig Data:" + str(data))
    log.debug(" TODO: Calculate blocksize")

    log.debug("set up GPU, allocate memory for working")
    device_buf = driver.mem_alloc(data.size * data.dtype.itemsize)

    log.debug("push data into GPU memory")
    driver.memcpy_htod(device_buf, data)

    log.debug("compile and run the culsp_kernel on data in the GPU")
    module = compiler.SourceModule(self._kernelStr)
    kernel = module.get_function("culsp_kernel")
    kernel(device_buf, block=(4, 4, 1))

    log.debug("pull data from GPU back into main memory")
    output = np.empty_like(data)
    driver.memcpy_dtoh(output, device_buf)

    log.debug("return result")
    return output
def cuda_crossOver(sola, solb):
    """Apply the ``crossOver`` CUDA kernel to two solutions.

    Both arrays are uploaded, the kernel is launched with one thread per
    element of ``sola`` in a single block, and both device buffers are
    downloaded again.

    Args:
        sola: first solution array.
        solb: second solution array.

    Returns:
        Tuple ``(a_new, b_new)`` of new arrays shaped like the inputs.
    """
    sol_len = len(sola)

    a_gpu = cuda.mem_alloc(sola.nbytes)
    b_gpu = cuda.mem_alloc(solb.nbytes)
    cuda.memcpy_htod(a_gpu, sola)
    cuda.memcpy_htod(b_gpu, solb)

    func = mod.get_function("crossOver")
    func(a_gpu, b_gpu, block=(sol_len, 1, 1))

    a_new = numpy.empty_like(sola)
    b_new = numpy.empty_like(solb)
    cuda.memcpy_dtoh(a_new, a_gpu)
    cuda.memcpy_dtoh(b_new, b_gpu)

    if debug:
        # The original printed the undefined names `a` and `b`, which raised
        # NameError as soon as debugging was switched on.
        print("a: %s" % (sola,))
        print("b: %s" % (solb,))
        print("new a: %s" % (a_new,))
        print("new b: %s" % (b_new,))

    return a_new, b_new
def calc_bandwidth_d2h(s):
    """Measure device-to-host copy bandwidth for ``s``.

    Times one ``memcpy_dtoh`` from ``s.dev_a`` into ``s.a`` with wall-clock
    timestamps.

    Returns:
        ``s.nbytes`` divided by the elapsed seconds and by the module-level
        ``gbytes`` constant.

    Raises:
        ZeroDivisionError: if the two timestamps are identical (copy faster
            than the clock resolution).
    """
    started = datetime.now()
    cuda.memcpy_dtoh(s.a, s.dev_a)
    # total_seconds() covers every timedelta field; the hand-written
    # seconds + microseconds sum silently ignored `days`.
    elapsed = (datetime.now() - started).total_seconds()
    return s.nbytes / elapsed / gbytes
def interior_buffer(source_im, dest_im, b_size, g_size, RGB, neighbors):
    """Run the mask kernel on ``source_im`` and return the resulting buffer.

    Args:
        source_im: array uploaded to the GPU and processed in place there.
        dest_im: array whose shape supplies WIDTH/HEIGHT and the shape of the
            returned buffer.
        b_size: CUDA block size; entries 0 and 1 also go into the template.
        g_size: CUDA grid size.
        RGB: value substituted for ``RGB`` in the kernel template.
        neighbors: value substituted for ``NEIGHBORS`` in the kernel template.

    Returns:
        ``uint8`` array shaped like ``dest_im`` filled from the device buffer
        after the kernel has run.
    """
    # Fill in the Cheetah template for the mask kernel.
    kernel_template = Template(mask_source)
    settings = (
        ('BLOCK_DIM_X', b_size[0]),
        ('BLOCK_DIM_Y', b_size[1]),
        ('WIDTH', dest_im.shape[1]),
        ('HEIGHT', dest_im.shape[0]),
        ('RGB', RGB),
        ('NEIGHBORS', neighbors),
    )
    for attr, value in settings:
        setattr(kernel_template, attr, value)

    # Compile the CUDA kernel.
    find_interior = cuda_compile(kernel_template, "mask_kernel")

    # Upload the source image; the kernel filters out interior points of the
    # mask directly in this buffer.
    d_source = cu.mem_alloc(source_im.nbytes)
    cu.memcpy_htod(d_source, source_im)
    find_interior(d_source, block=b_size, grid=g_size)

    # Download the interior point buffer.
    interior = np.array(dest_im, dtype=np.uint8)
    cu.memcpy_dtoh(interior, d_source)
    return interior
def test_prepared_invocation(self):
    """Exercise ``prepare``/``prepared_call``, with and without an offset.

    First the whole 4x4 array is doubled. Then the kernel is launched with 15
    threads on a pointer advanced by one element, so every element except
    the first has been multiplied by four in total.
    """
    host = np.random.randn(4, 4).astype(np.float32)
    device = drv.mem_alloc(host.nbytes)
    drv.memcpy_htod(device, host)

    module = SourceModule("""
    __global__ void doublify(float *a)
    {
        int idx = threadIdx.x + threadIdx.y*blockDim.x;
        a[idx] *= 2;
    }
    """)
    doublify = module.get_function("doublify")

    # Whole array, one 4x4 block.
    doublify.prepare("P")
    doublify.prepared_call((1, 1), (4, 4, 1), device, shared_size=20)
    doubled = np.empty_like(host)
    drv.memcpy_dtoh(doubled, device)
    print(host)
    print(doubled)
    assert la.norm(doubled - 2 * host) == 0

    # Now with offsets: start one element into the buffer.
    doublify.prepare("P")
    quadrupled = np.empty_like(host)
    shifted_ptr = int(device) + host.dtype.itemsize
    doublify.prepared_call((1, 1), (15, 1, 1), shifted_ptr)
    drv.memcpy_dtoh(quadrupled, device)
    assert la.norm(quadrupled[1:] - 4 * host[1:]) == 0
def calc_psd(self, bitloads, xtalk):
    """Run the ``k_calcpsd`` kernel for ``self.K`` bit-load combinations.

    Args:
        bitloads: bit loads; flattened with ``util.mat2arr`` and uploaded as
            ``int32``.
        xtalk: crosstalk gains; uploaded as ``self.type``.

    Returns:
        ``float64`` array of shape ``(self.K, self.N)`` downloaded from the
        kernel's P buffer.
    """
    # Number of expected permutations.
    n_combinations = self.K

    # Launch geometry; only the thread and block counts are needed here.
    (_, _, thread_count, block_count) = self._workload_calc(n_combinations)
    threadshare_grid = (block_count, 1)
    threadshare_block = (thread_count, 1, 1)

    allocations = []

    def _alloc(n_items, dtype):
        # Size the buffer from the element count instead of materialising a
        # throw-away host array just to read its .nbytes.
        buf = cuda.mem_alloc(int(n_items) * np.dtype(dtype).itemsize)
        allocations.append(buf)
        return buf

    try:
        # Memory (NCombinations is enough because the kernel checks against it).
        d_a = _alloc(n_combinations * self.N * self.N, self.type)
        d_p = _alloc(n_combinations * self.N, self.type)
        d_bitload = _alloc(self.K * self.N, np.int32)
        d_XTG = _alloc(self.K * self.N * self.N, self.type)
        h_p = np.zeros((self.K, self.N)).astype(self.type)

        cuda.memcpy_htod(d_bitload, util.mat2arr(bitloads).astype(np.int32))
        cuda.memcpy_htod(d_XTG, xtalk.astype(self.type))

        # Kernel signature, as noted alongside the original call:
        # __global__ void calc_psd(FPT *A, FPT *P, FPT *d_XTG, int *current_b, int N)
        self.k_calcpsd(d_a, d_p, d_XTG, d_bitload, np.int32(n_combinations),
                       block=threadshare_block, grid=threadshare_grid)
        cuda.Context.synchronize()
        cuda.memcpy_dtoh(h_p, d_p)
    finally:
        # Release whatever was allocated, also when a copy or launch failed.
        for buf in allocations:
            buf.free()

    return h_p.astype(np.float64)
def test_compare_order():
    """Compare C (row-major) and F (column-major) array order on the GPU.

    The same 12 values are viewed as a 3x4 array in C and in Fortran order;
    each view is uploaded and run through the ``compare_order`` kernel, and
    the resulting flat buffer is printed. The function only prints; it makes
    no assertions.
    """
    compare_order = mod_cu.get_function('compare_order')

    nx, ny = 3, 4
    f_1d = np.arange(nx * ny, dtype='f8')
    f_2d_C = f_1d.reshape((nx, ny), order='C')
    f_2d_F = f_1d.reshape((nx, ny), order='F')

    # Formatted strings reproduce the Python 2 print-statement output (no
    # separator after a label ending in a newline) and also run on Python 3.
    print('')
    print('f_1d_C\n\n%s' % (f_1d,))
    print('f_2d_C\n%s' % (f_2d_C,))
    print('f_2d_F\n%s' % (f_2d_F,))

    print('')
    print('after cuda')
    ret_f_1d = np.zeros_like(f_1d)
    f_1d_gpu = cuda.mem_alloc_like(f_1d)

    launch = dict(block=(nx * ny, 1, 1), grid=(1, 1))

    f_2d_C_gpu = cuda.to_device(f_2d_C)
    compare_order(f_2d_C_gpu, f_1d_gpu, **launch)
    cuda.memcpy_dtoh(ret_f_1d, f_1d_gpu)
    print('f_1d from f_2d_C\n%s' % (ret_f_1d,))

    f_2d_F_gpu = cuda.to_device(f_2d_F)
    compare_order(f_2d_F_gpu, f_1d_gpu, **launch)
    cuda.memcpy_dtoh(ret_f_1d, f_1d_gpu)
    print('f_1d from f_2d_F\n%s' % (ret_f_1d,))
def GPU():
    """Load the image named by ``sys.argv[1]`` and run ``process_pixel`` on it.

    Prints the image's format, size and mode and its pixel data, uploads the
    pixel array, launches the kernel with a single 4x4 block and downloads the
    buffer into ``newpixels``. Nothing is returned or written out.

    NOTE(review): the kernel declares three pointer parameters (r, g, b) but
    the launch passes a single buffer, and ``newpixels`` is never used after
    the download - confirm the intended calling convention before relying on
    this function.
    """
    im = Image.open(sys.argv[1])
    # Formatted so the output matches the former Python 2 print statement.
    print('%s :  %s %s %s \n' % (sys.argv[1], im.format, im.size, im.mode))

    pixels = np.array(im.getdata())
    print(pixels)

    # Re-read the image in its native array layout for the upload.
    pixels = np.array(im)
    gpu = cuda.mem_alloc(pixels.nbytes)
    cuda.memcpy_htod(gpu, pixels)

    kernel = SourceModule("""
    #define MAX_PIXEL_VALUE 255
    #define THRESHOLD 50

    __global__ void process_pixel(int *r, int *g, int *b)
    {
        int id = blockDim.x * blockIdx.x + threadIdx.x;
        if ((r[id] > THRESHOLD) && (g[id] > THRESHOLD) && (b[id] > THRESHOLD)) {
            r[id] = MAX_PIXEL_VALUE;
            g[id] = MAX_PIXEL_VALUE;
            b[id] = MAX_PIXEL_VALUE;
        }
    }
    """)
    func = kernel.get_function("process_pixel")
    func(gpu, block=(4, 4, 1))

    newpixels = np.zeros_like(pixels)
    cuda.memcpy_dtoh(newpixels, gpu)
def diffuse_pycuda(u):
    """Time-step ``u`` on the GPU with the ``copy_array``/``update`` kernels.

    Args:
        u: 2-D array. Its first row and first column are set to 200 *in
            place* before a ``float32`` working copy is made.

    Returns:
        ``float32`` array shaped like ``u`` downloaded after the last step.
    """
    nx, ny = np.int32(u.shape)
    alpha = np.float32(0.645)
    dx = np.float32(3.5 / (nx - 1))
    dt = np.float32(1e-05)
    total_time = np.float32(0.4)
    nt = np.int32(np.ceil(total_time / dt))

    # Fixed boundary values on the first row and the first column.
    u[0, :] = 200
    u[:, 0] = 200

    u = u.astype(np.float32)
    u_prev = u.copy()
    u_d = cuda.mem_alloc(u.size * u.dtype.itemsize)
    u_prev_d = cuda.mem_alloc(u_prev.size * u_prev.dtype.itemsize)
    cuda.memcpy_htod(u_d, u)
    cuda.memcpy_htod(u_prev_d, u_prev)

    BLOCKSIZE = 16
    # Integer ceiling division. np.ceil(nx / BLOCKSIZE) floors first under
    # Python 2 integer division, leaving the last partial tile uncovered.
    blocks_per_side = (int(nx) + BLOCKSIZE - 1) // BLOCKSIZE
    # NOTE(review): both grid dimensions are derived from nx and the kernels
    # only receive nx, so a square domain appears to be assumed - confirm.
    gridSize = (blocks_per_side, blocks_per_side, 1)
    blockSize = (BLOCKSIZE, BLOCKSIZE, 1)

    for t in range(nt + 1):
        copy_array(u_d, u_prev_d, nx, np.int32(BLOCKSIZE),
                   block=blockSize, grid=gridSize)
        update(u_d, u_prev_d, nx, dx, dt, alpha, np.int32(BLOCKSIZE),
               block=blockSize, grid=gridSize)

    cuda.memcpy_dtoh(u, u_d)
    return u
def main(context, stream, plan1, N1, N2, g_buf1, g_buf2):
    """Run a batched FFT/IFFT through ``plan1`` on gpuarrays and raw buffers.

    Args:
        context: accepted but not used in this function.
        stream: accepted but not used in this function.
        plan1: plan object providing ``execute(re, im, batch=..., inverse=...)``.
        N1: number of FFTs applied (batch size).
        N2: length of each input signal before zero padding.
        g_buf1: raw device buffer receiving the real parts.
        g_buf2: raw device buffer receiving the (zero) imaginary parts.

    When the module-level ``Plot`` flag is true the results are plotted and
    the program exits via ``SystemExit``.
    """
    import sys

    # int() because the helper may return a non-integer numeric type, which
    # np.zeros and reshape reject.
    padded_len = int(nearest_2power(N2))
    x = np.linspace(0, 2 * np.pi, N2)
    zero_pad = np.zeros(padded_len - N2)

    # Row k holds sin(2*k*x) for k = 1..N1 (a single row when N1 <= 1).
    # Rows are collected in lists and stacked once, not re-concatenated on
    # every iteration.
    harmonics = [1] + list(range(2, N1 + 1))
    signal_rows = []
    padded_rows = []
    for k in harmonics:
        row = np.sin(2 * k * x)
        signal_rows.append(row.reshape(1, N2))
        padded_rows.append(np.concatenate((row, zero_pad)).reshape(1, padded_len))
    ys = np.vstack(signal_rows)
    y = np.vstack(padded_rows)

    # One signal per column from here on.
    y = y.transpose()
    yim = np.zeros(y.shape)
    y = np.array(y, np.float64)

    # Host-side reference transforms; not used further in this function.
    yw = y.transpose()
    yimw = yim.transpose()
    aw = np.fft.fft(ys, padded_len, 1)
    bw = np.real(np.fft.ifft(aw, padded_len, 1))
    aw0 = np.fft.fft(y, padded_len, 0)
    bw0 = np.real(np.fft.ifft(aw0, padded_len, 0))

    # Variant 1: gpuarray inputs.
    gpu_testmat = gpuarray.to_gpu(y)
    gpu_testmatim = gpuarray.to_gpu(yim)
    plan1.execute(gpu_testmat, gpu_testmatim, batch=N1)
    gfft = gpu_testmat.get()  # fft result
    plan1.execute(gpu_testmat, gpu_testmatim, inverse=True, batch=N1)
    gifft = np.real(gpu_testmat.get())  # ifft result

    # Variant 2: caller-supplied raw device buffers.
    cuda.memcpy_htod(g_buf1, y)
    # A double with all bits zero is 0.0. memset_d32 writes 32-bit words, so
    # twice as many words as there are doubles are needed.
    cuda.memset_d32(g_buf2, 0, yim.size * 2)
    plan1.execute(g_buf1, g_buf2, batch=N1)
    grfft = np.empty_like(y)
    cuda.memcpy_dtoh(grfft, g_buf1)  # fft result
    plan1.execute(g_buf1, g_buf2, inverse=True, batch=N1)
    grifft = np.empty_like(y)
    cuda.memcpy_dtoh(grifft, g_buf1)  # ifft result

    if Plot:
        # NaN is rejected as a threshold by current NumPy; sys.maxsize is the
        # documented way to disable summarisation.
        np.set_printoptions(threshold=sys.maxsize)
        # Plot cuda fft results.
        f, axarr = plt.subplots(5, sharex=False)
        axarr[0].plot(y)
        axarr[1].plot(gfft)
        axarr[2].plot(gifft)
        axarr[3].plot(grfft)
        axarr[4].plot(grifft)
        plt.show()
        # NOTE(review): the original indentation was lost; the exit is assumed
        # to belong to the plotting branch - confirm.
        raise SystemExit
def save_data(data, data_package):
    """Attach ``data`` to ``data_package`` as a device pointer and register it.

    A ``numpy.ndarray`` is uploaded with ``to_device_with_time``; a
    ``cuda.DeviceAllocation`` is stored as is. The package's id triple is
    appended to ``gpu_list`` and the package is stored in ``data_list``.
    When ``log_type`` is ``'image'`` or ``'all'`` and the package describes
    2-D device data, the data is also written to ``./result/<u><ss><sp>.png``.

    Args:
        data: host array or device allocation to save.
        data_package: package object updated in place.

    Raises:
        AssertionError: if ``data`` has an unsupported type or no device
            pointer ends up set (not raised under ``python -O``).
        ValueError: if an image is requested for a channel layout other than
            ``[1]``, ``[3]`` or ``[4]``.
    """
    dp = data_package
    data_name = dp.data_name
    u, ss, sp = dp.get_id()

    # If data is a numpy array, copy it to the GPU and keep only the devptr.
    if isinstance(data, numpy.ndarray):
        dp.memory_type = 'devptr'
        dp.data_dtype = 'cuda_memory'
        dp.devptr = to_device_with_time(data, data_name, u)
    elif isinstance(data, cuda.DeviceAllocation):
        dp.memory_type = 'devptr'
        dp.data_dtype = 'cuda_memory'
        dp.devptr = data
    else:
        # The former `type(data) == 'cuda_memory'` branch compared a type
        # with a string and could never match, so it has been dropped.
        assert False, 'unsupported data type: %r' % (type(data),)

    assert dp.devptr is not None, 'no device pointer set'

    target = (u, ss, sp)
    if target not in gpu_list:
        gpu_list.append(target)

    # Save image.
    if log_type in ['image', 'all']:
        if len(dp.data_shape) == 2 and dp.memory_type == 'devptr':
            host_image = numpy.empty(dp.data_memory_shape,
                                     dtype=dp.data_contents_memory_dtype)
            cuda.memcpy_dtoh(host_image, dp.devptr)
            ctx.synchronize()

            pixels = host_image.astype(numpy.uint8)
            chan = dp.data_contents_memory_shape
            if chan == [1]:
                img = Image.fromarray(pixels, 'L')
            elif chan == [3]:
                img = Image.fromarray(pixels, 'RGB')
            elif chan == [4]:
                img = Image.fromarray(pixels, 'RGBA')
            else:
                # Previously surfaced as an UnboundLocalError at img.save().
                raise ValueError('cannot save image with channel layout %r'
                                 % (chan,))

            # Create the output directory without spawning a shell. Another
            # process may create it at the same moment, so an existing
            # directory is not an error.
            try:
                os.makedirs('result')
            except OSError:
                if not os.path.isdir('result'):
                    raise
            img.save('./result/%s%s%s.png' % (u, ss, sp), format='png')

    # Register the package: data_list[unique_id][ss][sp] = dp.
    unique_id = dp.get_unique_id()
    data_list.setdefault(unique_id, {}).setdefault(ss, {})[sp] = dp
def dtoh(self, nam):
    """Copy the device buffer registered under ``nam`` back to its host array.

    Best effort: if PyCUDA cannot be imported or initialised, the call does
    nothing and returns ``None``.

    Args:
        nam: argument name; looked up in ``self.nam_args`` to find the
            matching entries of ``self.args`` and ``self.cu_args``.

    Raises:
        ValueError: if ``nam`` is not in ``self.nam_args``.
    """
    try:
        import pycuda.driver as cuda
        import pycuda.autoinit
    except Exception:
        # Deliberately broad: initialisation can fail with non-import errors
        # (for example when no device is present). Unlike a bare `except:`,
        # this no longer swallows KeyboardInterrupt or SystemExit.
        return
    var_id = self.nam_args.index(nam)
    cuda.memcpy_dtoh(self.args[var_id], self.cu_args[var_id])
def get_from_gpu(self):
    """Refresh ``self.C`` and ``self.num`` from the device block ``self._cptr``.

    The device block is ``self.nbytes()`` bytes long: the leading
    ``self.C.nbytes`` bytes are reinterpreted with the dtype and shape of the
    current ``self.C``, and the remainder becomes a 1-D array with the dtype
    of the current ``self.num``. Does nothing when ``self._cptr`` is ``None``.
    """
    if self._cptr is None:
        return

    # A byte buffer whose size does not depend on the Python version. The
    # former array of ' ' strings uses 1 byte per element on Python 2 but
    # 4 bytes per element on Python 3.
    raw = np.empty(self.nbytes(), dtype=np.uint8)
    cuda.memcpy_dtoh(raw, self._cptr)

    c_template = self.C
    num_template = self.num
    c_nbytes = c_template.nbytes

    # reshape() returns the reshaped array. The original used resize(), which
    # works in place and returns None, so self.C was overwritten with None.
    # copy() detaches the results from the scratch buffer.
    self.C = raw[:c_nbytes].view(c_template.dtype).reshape(c_template.shape).copy()
    self.num = raw[c_nbytes:].view(num_template.dtype).copy()
def confirmInitialization(featuresForSOM, somMatrix):
    """Plot how often each SOM node is the best-matching unit (BMU).

    For up to ``somMatrixRows * somMatrixColumns`` rows of ``featuresForSOM``
    the ``computeBMU`` kernel is run, the winning ``(block, thread)`` node is
    counted, and the node is coloured by its hit count:
    green = 1, blue <= 10, red <= 100, white > 100 (untouched nodes stay
    black). The result is shown with matplotlib.

    Args:
        featuresForSOM: DataFrame of feature rows, addressed with ``.loc[i]``
            for ``i = 0, 1, ...``.
        somMatrix: 2-D array holding the SOM, uploaded to the device.
    """
    # Upload the SOM and describe its layout to the kernel.
    somMatrixPtr = pycuda.mem_alloc(somMatrix.nbytes)
    somBytesPerRow = np.int32(somMatrix.strides[0])
    somNumberOfRows = np.int32(somMatrix.shape[0])
    somNumberOfColumns = np.int32(somMatrix.shape[1])
    pycuda.memcpy_htod(somMatrixPtr, somMatrix)

    # Per-row best distance and the index achieving it.
    bmu = np.zeros(somMatrixRows).astype(np.float32)
    bmuPtr = pycuda.mem_alloc(bmu.nbytes)
    pycuda.memcpy_htod(bmuPtr, bmu)
    bmuIndex = np.zeros(somMatrixRows).astype(np.int32)
    bmuIndexPtr = pycuda.mem_alloc(bmuIndex.nbytes)
    pycuda.memcpy_htod(bmuIndexPtr, bmuIndex)

    # Not used below; the lookups are kept because get_loc raises KeyError
    # when one of these columns is missing from the module-level `features`.
    intraDayOffset = features.columns.get_loc('Ret_121')
    dayOffset = features.columns.get_loc('Ret_PlusOne')

    # RGB image of the node grid, initially black.
    nodeHitMatrix = np.zeros((somMatrixRows, somMatrixColumns, 3),
                             dtype=np.float32)
    hitCountDict = defaultdict(list)

    # At most one sample per SOM node, and never more than are available.
    sample_count = min(somMatrixRows * somMatrixColumns, len(featuresForSOM))
    for i in range(sample_count):
        # .values replaces Series.as_matrix(), which was removed from pandas.
        feats = featuresForSOM.loc[i].values.astype(np.float32)
        featuresPtr = pycuda.mem_alloc(feats.nbytes)
        pycuda.memcpy_htod(featuresPtr, feats)

        # Find the BMU.
        computeBMU(somMatrixPtr, bmuPtr, bmuIndexPtr, featuresPtr,
                   np.int32(len(featuresForSOM.columns)),
                   somBytesPerRow, somNumberOfRows, somNumberOfColumns,
                   np.float32(MAX_FEAT), np.float32(INF), np.int32(metric),
                   block=(blk, 1, 1), grid=(somNumberOfRows, 1))
        pycuda.memcpy_dtoh(bmu, bmuPtr)
        pycuda.memcpy_dtoh(bmuIndex, bmuIndexPtr)
        block = np.argmin(bmu)
        thread = bmuIndex[block]

        # Each entry is [hit count, index of the first sample that hit it].
        node = (block, thread)
        hits = hitCountDict[node]
        if not hits:
            hitCountDict[node] = [1, i]
        else:
            hits[0] += 1
        count = hitCountDict[node][0]

        if count == 1:
            rgb = (0.0, 1.0, 0.0)      # green
        elif count <= 10:
            rgb = (0.0, 0.0, 1.0)      # blue
        elif count <= 100:
            rgb = (1.0, 0.0, 0.0)      # red
        else:
            rgb = (1.0, 1.0, 1.0)      # white
        nodeHitMatrix[block][thread] = rgb

    fig20 = plt.figure(20, figsize=(6 * 3.13, 4 * 3.13))
    fig20.suptitle('Train Node Hit Counts. Black: 0 Green: 1 Blue: <=10 Red: <=100 White >100', fontsize=20)
    plt.subplot(111)
    plt.imshow(nodeHitMatrix, interpolation="none")
    plt.show()
    plt.pause(0.1)
def GenerateFractal(dimensions, position, zoom, iterations, block=(20, 20, 1), report=False, silent=False):
    """Render a fractal with the ``genChunk`` CUDA kernel.

    Args:
        dimensions: ``(width, height)`` of the result array.
        position: centre position; scaled by ``zoom`` and shifted by half the
            result size before being passed to the kernel.
        zoom: zoom factor (passed as ``float32``).
        iterations: iteration limit (passed as ``int32``).
        block: tuple used as the CUDA *grid*; its first two entries also
            determine the per-chunk size.
        report: if true, poll the page-locked progress counter and draw a
            progress bar until it reaches ``width * height``.
        silent: if true, suppress the informational prints.

    Returns:
        ``int32`` array of shape ``dimensions``; the centre element is
        overwritten with ``iterations + 1`` as a marker.
    """
    import sys

    # Floor division is spelled out so Python 2 and 3 agree.
    chunkSize = numpy.array([dimensions[0] // block[0], dimensions[1] // block[1]],
                            dtype=numpy.int32)
    zoom = numpy.float32(zoom)
    iterations = numpy.int32(iterations)
    blockDim = numpy.array([block[0], block[1]], dtype=numpy.int32)
    result = numpy.zeros(dimensions, dtype=numpy.int32)

    # Center position.
    position = Vector(position[0] * zoom, position[1] * zoom)
    position = position - (Vector(result.shape[0], result.shape[1]) / 2)
    position = numpy.array([int(position.x), int(position.y)]).astype(numpy.float32)

    # Progress reporting: a page-locked counter the device can write to.
    ppc = cuda.pagelocked_zeros((1, 1), numpy.int32,
                                mem_flags=cuda.host_alloc_flags.DEVICEMAP)
    ppc[0, 0] = 0
    ppc_ptr = numpy.intp(ppc.base.get_device_pointer())

    # Copy parameters over to device.
    chunkS = In(chunkSize)
    posit = In(position)
    blockD = In(blockDim)
    zoo = In(zoom)
    iters = In(iterations)
    res = In(result)

    if not silent:
        print("Calling CUDA function. Starting timer. progress starting at: " + str(ppc[0, 0]))
    start_time = time.time()

    genChunk(chunkS, posit, blockD, zoo, iters, res, ppc_ptr,
             block=(1, 1, 1), grid=block)

    if report:
        total = dimensions[0] * dimensions[1]
        print("Reporting up to " + str(total) + ", " + str(ppc[0, 0]))
        while ppc[0, 0] < total:
            # One read per pass keeps bar and number consistent. Converting
            # to a Python int avoids int32 overflow in `done * 100`.
            done = int(ppc[0, 0])
            pct = done * 100 // total
            hashes = "#" * pct
            dashes = "-" * (100 - pct)
            # locale.format was removed from Python; format_string is its
            # replacement. Written without a newline so the bar redraws.
            sys.stdout.write("\r[" + hashes + dashes + "] "
                             + locale.format_string("%i", done, grouping=True)
                             + "/"
                             + locale.format_string("%i", total, grouping=True)
                             + " ")
            sys.stdout.flush()
            time.sleep(0.00001)

    cuda.Context.synchronize()
    if not silent:
        print("Done. " + str(ppc[0, 0]))

    # Copy result back from device.
    # NOTE(review): `res` is an In(...) argument wrapper, not a raw device
    # pointer - confirm this download really returns the kernel's output.
    cuda.memcpy_dtoh(result, res)

    if not silent:
        end_time = time.time()
        elapsed_time = end_time - start_time
        print("Done with call. Took " + str(elapsed_time) + " seconds. Here's the repr'd arary:\n")
        print(result)

    # Mark center of image; indices must be integers.
    result[result.shape[0] // 2, result.shape[1] // 2] = iterations + 1
    return result
def __gini(self, n_samples, indices_offset, si_gpu_in):
    """Run the split-search kernel chain and return the best split found.

    Five prepared kernels are launched in sequence (``scan_total_2d``,
    ``scan_reduce``, ``comput_total_2d``, ``reduce_2d``, ``find_min_kernel``),
    after which ``self.min_imp_info`` is filled from ``self.impurity_left``.

    Args:
        n_samples: number of samples considered.
        indices_offset: byte offset added to ``si_gpu_in.ptr``.
        si_gpu_in: device array of sample indices.

    Returns:
        ``(min_left, min_right, row, col)`` where the first two are entries 0
        and 1 of ``self.min_imp_info``, ``col`` is entry 2 as an int, and
        ``row`` is ``self.features_array`` indexed by entry 3.
    """
    n_block, n_range = self.__get_block_size(n_samples)

    # Launch shapes and pointers shared by several of the calls below.
    per_feature_blocks = (self.max_features, n_block)
    per_feature_single = (self.max_features, 1)
    compute_threads = (self.COMPUTE_THREADS_PER_BLOCK, 1, 1)
    warp_threads = (32, 1, 1)
    indices_ptr = si_gpu_in.ptr + indices_offset

    self.scan_total_2d.prepared_call(
        per_feature_blocks, compute_threads,
        indices_ptr,
        self.labels_gpu.ptr,
        self.label_total_2d.ptr,
        self.features_array_gpu.ptr,
        n_range,
        n_samples)

    self.scan_reduce.prepared_call(
        per_feature_single, warp_threads,
        self.label_total_2d.ptr,
        n_block)

    self.comput_total_2d.prepared_call(
        per_feature_blocks, compute_threads,
        indices_ptr,
        self.samples_gpu.ptr,
        self.labels_gpu.ptr,
        self.impurity_2d.ptr,
        self.label_total_2d.ptr,
        self.min_split_2d.ptr,
        self.features_array_gpu.ptr,
        n_range,
        n_samples)

    self.reduce_2d.prepared_call(
        per_feature_single, warp_threads,
        self.impurity_2d.ptr,
        self.impurity_left.ptr,
        self.impurity_right.ptr,
        self.min_split_2d.ptr,
        self.min_split.ptr,
        n_block)

    self.find_min_kernel.prepared_call(
        (1, 1), warp_threads,
        self.impurity_left.ptr,
        self.impurity_right.ptr,
        self.min_split.ptr,
        self.max_features)

    # Download the summary written by the last kernel.
    cuda.memcpy_dtoh(self.min_imp_info, self.impurity_left.ptr)
    info = self.min_imp_info
    min_left, min_right = info[0], info[1]
    col = int(info[2])
    # Entry 3 is a position in features_array, not the feature id itself.
    row = self.features_array[int(info[3])]
    return min_left, min_right, row, col
def get(self):
    """Download ``self.data`` and return it trimmed to the I/O shape.

    Returns:
        Array of shape ``self.datashape`` with its last axis cut down to
        ``self.ioshape[-1]`` entries.
    """
    host = np.empty(self.datashape, dtype=self.dtype)
    cuda.memcpy_dtoh(host, self.data)
    # Slice the last axis down to the expected I/O extent.
    io_extent = self.ioshape[-1]
    return host[..., :io_extent]
def wait(self):
    """Gather every source buffer into the target buffer and download it.

    ``kernel_copy`` is launched once per entry of ``self.source_bufs`` with
    that entry's index as the shift, writing into ``self.target_buf``; the
    target buffer is then copied into ``self.host_array``.
    """
    nx, ny, nz_pitch = self.mainf.ns_pitch
    launch = dict(grid=self.mainf.gs, block=self.mainf.bs)

    for shift, src in enumerate(self.source_bufs):
        self.kernel_copy(nx, ny, nz_pitch, np.int32(shift),
                         self.target_buf, src,
                         **launch)

    # NOTE(review): the original indentation was lost; the download is assumed
    # to happen once, after all copies - confirm.
    cuda.memcpy_dtoh(self.host_array, self.target_buf)
def chambolle_pock_TVl1_CUDA(image, clambda, tau, sigma, iters=100):
    r""" 2D ROF CUDA solver using Chambolle-Pock Method

    Parameters
    ----------
    image : numpy array
        The noisy image we are processing
    clambda : float
        The non-negative weight in the optimization problem
    tau : float
        Parameter of the proximal operator
    sigma : float
        Step parameter passed to the ``primal`` kernel
    iters : int
        Number of iterations allowed

    Returns
    -------
    tuple
        ``(result, 0)`` where ``result`` is the ``float32`` image downloaded
        from the device after the ``solution`` kernel has run.
    """
    print("2D Primal-Dual TV-l1 CUDA solver using Chambolle-Pock method")
    start_time = timeit.default_timer()

    (h, w) = image.shape
    nc = 1

    def _load_module(stage):
        # Compile one kernel file. The `with` block closes the file, which
        # the original open(...).read() calls never did.
        path = '../bilevel_imaging_toolbox/cuda/tvl1/tvl1_%s.cu' % stage
        with open(path, 'r') as source_file:
            return SourceModule(source_file.read())

    # Load Modules
    init_module = _load_module('init')
    primal_module = _load_module('primal')
    dual_module = _load_module('dual')
    extrapolate_module = _load_module('extrapolate')
    solution_module = _load_module('solution')

    # Memory Allocation: six float32 buffers of image size.
    h_img = image.astype(np.float32)
    nbyted = h_img.nbytes
    d_imgInOut = drv.mem_alloc(nbyted)
    d_x = drv.mem_alloc(nbyted)
    d_xbar = drv.mem_alloc(nbyted)
    d_xcur = drv.mem_alloc(nbyted)
    d_y1 = drv.mem_alloc(nbyted)
    d_y2 = drv.mem_alloc(nbyted)

    # Copy host memory
    drv.memcpy_htod(d_imgInOut, h_img)

    # Launch configuration: integer ceiling division gives the minimal
    # covering grid on both Python 2 and Python 3.
    block = (16, 16, 1)
    grid = ((w + block[0] - 1) // block[0],
            (h + block[1] - 1) // block[1])

    # Function definition
    init_func = init_module.get_function('init')
    primal_func = primal_module.get_function('primal')
    dual_func = dual_module.get_function('dual')
    extrapolate_func = extrapolate_module.get_function('extrapolate')
    solution_func = solution_module.get_function('solution')

    # Scalars must be passed to the kernels as fixed-width NumPy types.
    w = np.int32(w)
    h = np.int32(h)
    nc = np.int32(nc)
    sigma = np.float32(sigma)
    tau = np.float32(tau)
    clambda = np.float32(clambda)

    # Initialization
    init_func(d_xbar, d_xcur, d_x, d_y1, d_y2, d_imgInOut, w, h, nc,
              block=block, grid=grid)

    for i in range(iters):
        primal_func(d_y1, d_y2, d_xbar, sigma, w, h, nc,
                    block=block, grid=grid)
        dual_func(d_x, d_xcur, d_y1, d_y2, d_imgInOut, tau, clambda, w, h, nc,
                  block=block, grid=grid)
        extrapolate_func(d_xbar, d_xcur, d_x, np.float32(0.5), w, h, nc,
                         block=block, grid=grid)

    solution_func(d_imgInOut, d_x, w, h, nc, block=block, grid=grid)
    drv.memcpy_dtoh(h_img, d_imgInOut)

    print(
        "Finished Chambolle-Pock TV-l1 CUDA denoising in %d iterations and %f sec"
        % (iters, timeit.default_timer() - start_time))
    return (h_img, 0)
def compute(self, col_idx, prefix_sums, cpm_resolution):
    """Iterate the CPM community kernels and count the classified hits.

    Args:
        col_idx: column-index array, prepared with ``ready_input``.
        prefix_sums: prefix-sum array, prepared with ``ready_input``.
        cpm_resolution: resolution parameter, passed on as ``float32``.

    Returns:
        Sum over the ``hit_class`` array produced by ``classify_hits``.
    """
    # Check input.
    cpm_resolution = np.float32(cpm_resolution)
    d_col_idx = ready_input(col_idx)
    d_prefix_sums = ready_input(prefix_sums)

    n_nodes = self.N
    launch = dict(block=self.threads, grid=self.grid, stream=None, shared=0)

    # Community assignment per node (every node starts in its own community).
    community_idx = np.arange(n_nodes).astype(np.int32)
    d_community_idx = allocate_and_copy(community_idx)
    d_tmp_community_idx = allocate_and_copy(community_idx)

    # Community sizes.
    community_sizes = np.ones(n_nodes, dtype=np.int32)
    d_community_sizes = allocate_and_copy(community_sizes)
    d_tmp_community_sizes = allocate_and_copy(community_sizes)

    # Inter-connecting edges per community.
    community_inter = np.zeros(n_nodes, dtype=np.int32)
    d_community_inter = allocate_and_copy(community_inter)
    d_tmp_community_inter = allocate_and_copy(community_inter)

    # Partial CPM per community.
    part_cpm = np.zeros(n_nodes, dtype=np.float32)
    d_part_cpm = allocate_and_copy(part_cpm)

    # Configuration.
    iterations_limit = 15   # limit of cpm iterations
    cpm_thresh = 0.02       # relative improvement below which iteration stops
    completed = 0           # accepted iterations so far
    previous_score = 0      # cpm score of the last accepted iteration

    # CPM execution.
    while True:
        # Calculate best community assignment.
        self.move_nodes(n_nodes, d_col_idx, d_prefix_sums,
                        d_community_idx, d_community_sizes,
                        d_tmp_community_idx, d_tmp_community_sizes,
                        cpm_resolution,
                        **launch)

        # Memory reset: fresh zeroed device buffers for this pass.
        d_tmp_community_inter = allocate_and_copy(
            np.zeros(n_nodes, dtype=np.int32))
        d_part_cpm = allocate_and_copy(np.zeros(n_nodes, dtype=np.float32))

        # Calculate interconnecting edges per community.
        self.community_internal_edges(n_nodes, d_col_idx, d_prefix_sums,
                                      d_tmp_community_idx,
                                      d_tmp_community_inter,
                                      **launch)

        # Calculate partial cpm per community.
        self.calculate_part_cpm(n_nodes, d_tmp_community_inter,
                                d_tmp_community_sizes, d_part_cpm,
                                cpm_resolution,
                                **launch)

        # Calculate overall cpm score.
        drv.memcpy_dtoh(part_cpm, d_part_cpm)
        score = sum(part_cpm)

        # Relative improvement; the first pass always counts as improvement.
        if previous_score != 0:
            improvement = abs((score - previous_score) / previous_score)
        else:
            improvement = 1

        # Terminate if improvement below threshold or iteration limit reached.
        if improvement <= cpm_thresh or completed > iterations_limit:
            break

        # Prepare next iteration: accept the temporary results of this pass.
        completed += 1
        previous_score = score
        n_bytes = community_idx.nbytes
        drv.memcpy_dtod(d_community_idx, d_tmp_community_idx, n_bytes)
        drv.memcpy_dtod(d_community_sizes, d_tmp_community_sizes, n_bytes)
        drv.memcpy_dtod(d_community_inter, d_tmp_community_inter, n_bytes)

    # Classify communities.
    community_class = np.zeros(n_nodes, dtype=np.int32)
    d_community_class = allocate_and_copy(community_class)
    self.classify_communities(n_nodes, d_community_inter, d_community_sizes,
                              d_community_class,
                              **launch)

    # Classify hits.
    hit_class = np.zeros(n_nodes, dtype=np.int32)
    d_hit_class = allocate_and_copy(hit_class)
    self.classify_hits(n_nodes, d_community_idx, d_community_class,
                       d_hit_class,
                       **launch)

    # Get classified hits.
    drv.memcpy_dtoh(hit_class, d_hit_class)
    return sum(hit_class)
# Demo script: initialise the CUDA driver, print basic information about
# device 0, create a context on it, then round-trip a small array
# host -> device -> host.
import atexit
print("pycuda module version : ",pycuda.VERSION)
# print(pycuda.VERSION_TEXT)
drv.init()
print("CUDA toolkit driver version : ",drv.get_version())
print("cuda gpu in this system : ",drv.Device.count())
dev = drv.Device(0)
print("GPU name : ",dev.name())
# Prints only the *type* of the attribute mapping, not its contents.
# NOTE(review): the backslash inside the literal below looks like a line
# continuation whose newline was lost -- confirm the intended text.
print("If you want to check device attributes,\ check this dictionary : ",type(dev.get_attributes()))
ctx = dev.make_context()
print(ctx.get_device())
# ctx.pop()
# Pop the context at interpreter exit instead of popping it right away.
atexit.register(ctx.pop,)
# mem_get_info() is queried once per list element; each value is divided by
# 1024 twice before being printed with the 'MB' label.
print("global memory (free,total) : ",\
      [drv.mem_get_info()[i]/1024/1024 for i in range(2)], 'MB')
a = np.arange(10)
# print(a)
# Allocate a device buffer the size of `a` and upload `a` into it.
a_gpu = drv.mem_alloc(a.nbytes)
drv.memcpy_htod(a_gpu,a)
# Receive buffer with the same shape/dtype as `a`; np.empty_like does not
# initialise it, so the "before" print shows arbitrary values.
a_rcv = np.empty_like(a)
print("a_rcv before: ",a_rcv)
drv.memcpy_dtoh(a_rcv,a_gpu)
print("a_rcv after: ",a_rcv)
# ctx.push()
# ctx.detach()
# Pair of CUDA events used to bracket each timed phase (only rank 1 records).
start, stop = cuda.Event(), cuda.Event()
# One array of length tmax per timed phase; entry tn-1 holds the value
# measured in time step tn.
exec_time = {'update_h':np.zeros(tmax), 'mpi_recv_h':np.zeros(tmax), 'memcpy_htod_h':np.zeros(tmax), 'mpi_send_h':np.zeros(tmax), 'memcpy_dtoh_h':np.zeros(tmax), 'update_e':np.zeros(tmax), 'mpi_recv_e':np.zeros(tmax), 'memcpy_htod_e':np.zeros(tmax), 'mpi_send_e':np.zeros(tmax), 'memcpy_dtoh_e':np.zeros(tmax), 'src_e':np.zeros(tmax)}

# main loop
# Host staging buffers of shape (ny, nz), single precision, used for the
# device <-> MPI exchange.
ey_tmp = np.zeros((ny,nz),'f')
ez_tmp = np.zeros_like(ey_tmp)
hy_tmp = np.zeros_like(ey_tmp)
hz_tmp = np.zeros_like(ey_tmp)
for tn in xrange(1, tmax+1):
    if rank == 1:
        start.record()
    # Launch update_h once per entry of bpg_list; the second argument is
    # i*MBy as int32 (presumably a per-launch block offset -- TODO confirm).
    for i, bpg in enumerate(bpg_list):
        update_h.prepared_call(bpg, np.int32(i*MBy), *eh_args)

    if rank == 0:
        # Copy ny*nz float32 values from the device, starting
        # (nx-1)*ny*nz elements past the buffer base (presumably the last
        # x-plane -- confirm against the array layout), then send them to
        # rank 1 with tags 20 and 21.
        cuda.memcpy_dtoh(hy_tmp, int(hy_gpu)+(nx-1)*ny*nz*np.nbytes['float32'])
        cuda.memcpy_dtoh(hz_tmp, int(hz_gpu)+(nx-1)*ny*nz*np.nbytes['float32'])
        comm.Send(hy_tmp, 1, 20)
        comm.Send(hz_tmp, 1, 21)
    elif rank == 1:
        # Close the update_h timing started at the top of the step.
        stop.record()
        stop.synchronize()
        exec_time['update_h'][tn-1] = stop.time_since(start)

        # Time the matching receives from rank 0 (tags 20 and 21).
        start.record()
        comm.Recv(hy_tmp, 0, 20)
        comm.Recv(hz_tmp, 0, 21)
        stop.record()
        stop.synchronize()
        exec_time['mpi_recv_h'][tn-1] = stop.time_since(start)

        # NOTE(review): presumably opens the next timed phase; the matching
        # stop.record() is not visible here.
        start.record()
#funBC(ftemp_g, feq_g, fin_g,block=(blockDimX,blockDimY,1), grid=(gridDimX,gridDimY)) # # stop.record() # stop.synchronize() #cudaDeviceSynchronize(); # time_gpu = start.time_till(stop) if( (It%Pinterval == 0) & (SaveVTK | SavePlot)) : # print('time spent in gpu (in millisecs) at this iteration is ',time_gpu) # print('So, Gpu bandwidth ( in GBPS) is ', 35*8*0.000001/(time_gpu)) # u_past = u.copy() #cuda.memcpy_dtoh(fin,fin_g) cuda.memcpy_dtoh(rho,rho_g) cuda.memcpy_dtoh(u,u_g) rho = rho.transpose() u[0,:,:] = u[0,:,:].transpose() u[1,:,:] = u[1,:,:].transpose() print ('current iteration :', It) #print (np.mean(u[0,:,0])/uLB) Usquare = u[0,:,:]**2 + u[1,:,:]**2 Usquare = Usquare/(uLB**2) BCoffset = int(xsize/40) # replacing all boundaries with nan to get location of vortices Usquare[0:BCoffset,:] = nan ; Usquare[:,0:BCoffset] = nan Usquare[xsize_max-BCoffset:xsize,:] = nan;Usquare[:,ysize_max-BCoffset:ysize] = nan Loc1 = np.unravel_index(np.nanargmin(Usquare),Usquare.shape)
def _walk_on_current_gpu(raw, slices, allLabels, indices, nbrw, sorw, name, foundAxis):
    """Run the walk kernel for every label on the current GPU.

    For each axis flagged in ``foundAxis`` the labelled slices, their plane
    indices and a per-slice ``Beta`` array (sum of ``_calc_var`` over all
    labels) are uploaded once.  Then, label by label, the accumulator on the
    device is reset with the fill kernel, the walk kernel is launched once
    per flagged axis, and the accumulator is copied back and added to that
    label's entry of the result.

    Args:
        raw: 3-D image volume; uint8 input is shifted by 128 and stored as
            int8, anything else is converted to float32.
        slices: per-axis arrays of labelled slices (indexed by axis 0..2).
        allLabels: iterable of label values to process.
        indices: per-axis plane indices matching ``slices``.
        nbrw, sorw: passed to the kernel as int32 scalars.
        name: prefix used in the progress output.
        foundAxis: per-axis flags selecting which axes carry labelled slices.

    Returns:
        float32 array of shape ``(len(allLabels),) + raw.shape``.
    """
    total_labels = len(allLabels)
    walkmap = np.zeros((total_labels, ) + raw.shape, dtype=np.float32)

    # choose the kernel matching the volume's dtype and convert the volume
    if raw.dtype == 'uint8':
        kernel = _build_kernel_int8()
        raw = (raw - 128).astype('int8')
    else:
        kernel = _build_kernel_float32()
        raw = raw.astype(np.float32)
    fill_gpu = _build_kernel_fill()

    # volume extents, also passed to the kernels as int32
    depth, height, width = raw.shape
    xsh_gpu = np.int32(width)
    ysh_gpu = np.int32(height)
    zsh_gpu = np.int32(depth)

    # launch configuration covering the whole volume (used by the fill kernel)
    block = (32, 32, 1)
    grid2 = (int((width // 32) + 1), int((height // 32) + 1), int(depth))

    # per-axis bookkeeping, filled only for the flagged axes
    slice_counts = [None] * 3
    slice_heights = [None] * 3
    slice_widths = [None] * 3
    indices_gpu = [None] * 3
    beta_gpu = [None] * 3
    slices_gpu = [None] * 3

    print(indices)

    for k, found in enumerate(foundAxis):
        if not found:
            continue
        indices_tmp = np.array(indices[k], dtype=np.int32)
        slices_tmp = slices[k].astype(np.int32)
        slice_counts[k], slice_heights[k], slice_widths[k] = slices_tmp.shape
        indices_gpu[k] = gpuarray.to_gpu(indices_tmp)
        slices_gpu[k] = gpuarray.to_gpu(slices_tmp)

        # leading full slices select the plane along axis k:
        # k == 0 -> raw[p], k == 1 -> raw[:, p], k == 2 -> raw[:, :, p]
        leading = (slice(None), ) * k
        Beta = np.zeros(slices_tmp.shape, dtype=np.float32)
        for m in range(slice_counts[k]):
            for n in allLabels:
                A = _calc_label_walking_area(slices_tmp[m], n)
                raw_tmp = raw[leading + (indices_tmp[m], )]
                Beta[m] += _calc_var(raw_tmp.astype(float), A)
        beta_gpu[k] = gpuarray.to_gpu(Beta)

    sorw = np.int32(sorw)
    nbrw = np.int32(nbrw)

    # volume on the device plus a host/device accumulator pair
    raw_gpu = gpuarray.to_gpu(raw)
    a = np.empty(raw.shape, dtype=np.float32)
    a_gpu = cuda.mem_alloc(a.nbytes)

    for label_counter, segment in enumerate(allLabels):
        print('%s:' % (name) + ' ' + str(label_counter + 1) + '/' + str(total_labels))
        # reset the accumulator before walking this label
        fill_gpu(a_gpu, xsh_gpu, ysh_gpu, block=block, grid=grid2)
        segment_gpu = np.int32(segment)
        for k, found in enumerate(foundAxis):
            if not found:
                continue
            axis_gpu = np.int32(k)
            # one z-layer of the grid per labelled slice on this axis
            grid = (int((slice_widths[k] // 32) + 1),
                    int((slice_heights[k] // 32) + 1),
                    int(slice_counts[k]))
            kernel(axis_gpu, segment_gpu, raw_gpu, slices_gpu[k], a_gpu,
                   xsh_gpu, ysh_gpu, zsh_gpu, indices_gpu[k], sorw,
                   beta_gpu[k], nbrw, block=block, grid=grid)
        cuda.memcpy_dtoh(a, a_gpu)
        walkmap[label_counter] += a
    return walkmap
def process(self, img, **kwargs):
    """Segment an image by intensity clustering on the GPU.

    The image is masked with ``median_otsu``, min-max normalised, has its
    high-contrast edges suppressed, is eroded and blurred, and is then
    clustered by repeatedly launching the ``relabel`` /
    ``calculateClusters`` / ``findCenters`` kernels (presumably a k-means
    style iteration -- confirm against the kernel source).

    Args:
        img: 2-D image array, or a ``Dataset`` whose ``pixel_array`` is used.
        **kwargs:
            all_segments (default True): return all labels instead of a mask.
            max_it (default 100): number of clustering iterations; positive int.
            n_clusters (default 2): number of clusters; positive int.
            numpass (default 5), median_radius (default 10): forwarded to
                ``median_otsu``.
            high_intensity_threshold (default 0.1): cut-off applied to the
                image-minus-blur difference to build the edge mask.
            blur_radius (default 5): kernel size of the final blur.

    Returns:
        If ``all_segments`` is true, the int32 label array reshaped to
        ``(h, w)``.  Otherwise a uint8 mask of shape ``(h, w)`` that is 1
        where the label equals the index of the largest cluster center.

    Raises:
        ValueError: if ``max_it`` or ``n_clusters`` is not a positive int.
    """
    # Read and validate the options first so that bad arguments fail before
    # any image processing or GPU work is started.
    all_segments = kwargs.get('all_segments', True)
    max_it = kwargs.get('max_it', 100)
    if not isinstance(max_it, int) or max_it <= 0:
        raise ValueError('Number of iterations should be a positive integer')
    n_clusters = kwargs.get('n_clusters', 2)
    if not isinstance(n_clusters, int) or n_clusters <= 0:
        raise ValueError('Number of clusters should be a positive integer')
    numpass = kwargs.get('numpass', 5)
    median_radius = kwargs.get('median_radius', 10)
    high_intensity_threshold = kwargs.get('high_intensity_threshold', 0.1)
    blur_radius = kwargs.get('blur_radius', 5)

    if isinstance(img, Dataset):
        img = img.pixel_array
    w = img.shape[1]
    h = img.shape[0]
    n = w * h

    img, _ = median_otsu(img, numpass=numpass, median_radius=median_radius)

    # Min-max normalise to [0, 1].  A constant image has zero intensity
    # range; map it to all zeros instead of dividing by zero (which would
    # fill the image with NaN).
    lowest = np.min(img)
    intensity_range = np.max(img) - lowest
    if intensity_range == 0:
        img = np.zeros(img.shape, dtype=np.float64)
    else:
        img = (img - lowest) / intensity_range

    # Edge mask: pixels that exceed their 15x15 local mean by more than the
    # threshold, dilated by one 3x3 pass, are removed from the image.
    blurred = cv.blur(img, (15, 15))
    edges = np.clip(img - blurred, 0.0, 1.0)
    edges[edges > high_intensity_threshold] = 1.0
    edges[edges <= high_intensity_threshold] = 0.0
    edges = cv.dilate(edges, np.ones((3, 3)), iterations=1)
    img = np.clip(img - edges, 0.0, 1.0)
    img = cv.erode(img, np.ones((3, 3)), iterations=1)
    img = cv.blur(img, (blur_radius, blur_radius))
    # src = (img - np.min(img)) / (np.max(img) - np.min(img))
    src = img.astype(np.float32)
    src = src.reshape((-1))
    centers = np.random.rand(n_clusters).astype(np.float32)

    # Image
    src_gpu = cuda.mem_alloc(src.nbytes)
    cuda.memcpy_htod(src_gpu, src)
    # Cluster centers
    centers_gpu = cuda.mem_alloc(centers.nbytes)
    cuda.memcpy_htod(centers_gpu, centers)
    # Labels: allocate as int32 directly rather than casting an
    # uninitialised float buffer to int.
    labels = np.zeros(src.shape, dtype=np.int32)
    labels_gpu = cuda.mem_alloc(labels.nbytes)
    cuda.memcpy_htod(labels_gpu, labels)

    module = SourceModule(Template(SRC).render(N=n))
    relabel = module.get_function('relabel')
    calculate_clusters = module.get_function('calculateClusters')
    find_centers = module.get_function('findCenters')

    # number of BLOCKDIM-wide blocks needed to cover all n pixels
    num_blocks = (n + BLOCKDIM - 1) // BLOCKDIM

    for _ in range(max_it):
        relabel(src_gpu, centers_gpu, np.int32(n), np.int32(n_clusters),
                labels_gpu, block=(BLOCKDIM, 1, 1), grid=(num_blocks, 1))
        for c in range(n_clusters):
            calculate_clusters(src_gpu, labels_gpu, np.int32(n), np.int32(c),
                               block=(BLOCKDIM, 1, 1), grid=(num_blocks, 1),
                               shared=8 * BLOCKDIM)
            # NOTE(review): findCenters runs as a single block with one
            # thread per calculateClusters block; for large images this may
            # exceed the device's per-block thread limit -- confirm.
            find_centers(np.int32(n), np.int32(c), centers_gpu,
                         block=(num_blocks, 1, 1), grid=((1, 1)),
                         shared=8 * (n + BLOCKDIM - 1) // BLOCKDIM)

    cuda.memcpy_dtoh(labels, labels_gpu)
    cuda.memcpy_dtoh(centers, centers_gpu)
    labels = labels.reshape((-1))

    if not all_segments:
        # binary mask of the cluster with the largest center value
        c_index = np.argmax(centers)
        flat = np.full(n, 0, dtype=np.uint8)
        flat[labels == c_index] = 1
        mask = flat.reshape((h, w))
        return mask
    else:
        return labels.reshape((h, w))
def pre_filter( self, array, return_gpu=True ):
    """
    res = spline.pre_filter( array, return_gpu = True )

    Pre-filter a data array to prepare for spline interpolation.
    The gain is applied and the rows are filtered on the GPU; the pass
    along the other direction is currently disabled (see below).

    Returns an allocation object for the array in gpu memory if
    return_gpu = true, otherwise the pre filtered array.
    Returns False if the CUDA kernels could not be prepared.
    """

    # make sure we are ready to execute
    if not self._prepare_cuda():
        return False

    import time
    now = time.time()

    # make sure the data array is little endian type
    array = self.force_le( array )

    # array size
    ( ncols, nrows ) = array.shape

    # allocate memory for array and copy to gpu.
    # (No second buffer is allocated for the transposed array: the only
    # code that used it is the disabled block below, so allocating it
    # merely doubled the device memory footprint.)
    array_gpu = drv.mem_alloc( array.nbytes )
    drv.memcpy_htod( array_gpu, array )

    # first apply gain.  Might as well apply it with GPU since
    # the array is already there
    blocks = int( np.ceil( float( array.size )/self.nthreads ) )
    self.cuda_apply_gain( array_gpu, np.int64( array.size ), grid=(blocks,1), block=(self.nthreads,1,1) )

    # now filter.  Rows are filtered individually.  Therefore
    # Each GPU thread will filter one row
    blocks = int( np.ceil( float( nrows )/self.nthreads ) )
    self.cuda_pre_filter_rows( array_gpu, np.int32( nrows ), np.int32( ncols ), grid=(blocks,1), block=(self.nthreads,1,1) )

    ## Disabled: filtering in the other direction.  Re-enabling this block
    ## requires allocating t_array_gpu = drv.mem_alloc( array.nbytes ) first.
    ## now we need to transpose the array to filter in the other direction
    #blocks = int( np.ceil( float( array.size )/self.nthreads ) )
    #self.cuda_transpose( array_gpu, t_array_gpu, np.int32( nrows ), np.int32( ncols ), np.int32( array.size ), grid=(blocks,1), block=(self.nthreads,1,1) )
    ## apply gain again.  Must be applied at both steps
    #blocks = int( np.ceil( float( array.size )/self.nthreads ) )
    #self.cuda_apply_gain( t_array_gpu, grid=(blocks,1), block=(self.nthreads,1,1) )
    ## and run pre_filter on the transposed array
    #blocks = int( np.ceil( float( ncols )/self.nthreads ) )
    #self.cuda_pre_filter_rows( t_array_gpu, np.int32( ncols ), np.int32( nrows ), grid=(blocks,1), block=(self.nthreads,1,1) )
    ## finally transpose back
    #blocks = int( np.ceil( float( array.size )/self.nthreads ) )
    #self.cuda_transpose( t_array_gpu, array_gpu, np.int32( ncols ), np.int32( nrows ), np.int32( array.size ), grid=(blocks,1), block=(self.nthreads,1,1) )

    # return gpu array
    if return_gpu:
        return array_gpu

    # otherwise fetch the array back out of GPU memory
    output = np.empty_like( array )
    drv.memcpy_dtoh( output, array_gpu )
    array_gpu.free()

    # elapsed wall-clock seconds; written as a call so it parses on both
    # Python 2 and Python 3
    print( time.time()-now )

    # and return
    return output