def execute(self):
    # Zero the scratch buffer that the update-output kernels
    # accumulate into using atomics.
    drv.memset_d32(self.kernels[0][5], 0, int(np.prod(self.U.shape)))
    for kernel in self.kernels:
        kernel[0].prepared_async_call(*kernel[1:])
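# A sketch of the assumed layout of each self.kernels entry (not spelled
# out in the original). Since prepared_async_call takes
# (grid, block, stream, *args), each entry is presumably
#   kernel = (func, grid, block, stream, arg0, scratch_ptr, ...)
# so self.kernels[0][5] above is the scratch-buffer argument of the
# first kernel.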
def time_inference(engine, batch_size):
    assert engine.get_nb_bindings() == 2
    input_index = engine.get_binding_index(INPUT_LAYERS[0])
    output_index = engine.get_binding_index(OUTPUT_LAYERS[0])

    input_dim = engine.get_binding_dimensions(input_index).to_DimsCHW()
    output_dim = engine.get_binding_dimensions(output_index).to_DimsCHW()

    # Sizes in bytes (4 bytes per float32 element)
    insize = batch_size * input_dim.C() * input_dim.H() * input_dim.W() * 4
    outsize = batch_size * output_dim.C() * output_dim.H() * output_dim.W() * 4

    d_input = cuda.mem_alloc(insize)
    d_output = cuda.mem_alloc(outsize)
    bindings = [int(d_input), int(d_output)]

    context = engine.create_execution_context()
    context.set_profiler(G_PROFILER)

    # memset_d32 counts 32-bit words, not bytes, hence insize // 4
    cuda.memset_d32(d_input, 0, insize // 4)

    for i in range(TIMING_INTERATIONS):
        context.execute(batch_size, bindings)

    context.destroy()
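# A sketch (not part of the original sample) of the same loop timed with
# CUDA events instead of a host-side profiler; `engine`, `batch_size`,
# `bindings`, and TIMING_INTERATIONS are assumed from the code above.
def time_inference_with_events(engine, batch_size, bindings):
    context = engine.create_execution_context()
    start, end = cuda.Event(), cuda.Event()
    start.record()
    for _ in range(TIMING_INTERATIONS):
        context.execute(batch_size, bindings)
    end.record()
    end.synchronize()  # wait for the GPU to finish before reading the timer
    print("%.3f ms per batch" % (start.time_till(end) / TIMING_INTERATIONS))
    context.destroy()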
def main(context, stream, plan1, N1, N2, g_buf1, g_buf2):
    # N1 = number of FFTs applied, N2 = dimension of each FFT
    x = np.linspace(0, 2 * np.pi, N2)
    y = np.sin(2 * x)
    ys = y.reshape(1, N2)
    y = np.concatenate((y, np.zeros(nearest_2power(N2) - N2)))
    y = y.reshape(1, nearest_2power(N2))
    for i in xrange(N1 - 1):  # append N1-1 more sines
        yi = np.sin(2 * (i + 2) * x)
        yis = yi.reshape(1, N2)
        ys = np.concatenate((ys, yis), 0)
        yi = np.concatenate((yi, np.zeros(nearest_2power(N2) - N2)))
        yi = yi.reshape(1, nearest_2power(N2))
        y = np.concatenate((y, yi), 0)
    y = y.transpose()
    yim = np.zeros(y.shape)
    y = np.array(y, np.float64)
    yw = y.transpose()
    yimw = yim.transpose()

    # reference results on the CPU
    aw = np.fft.fft(ys, int(nearest_2power(N2)), 1)
    bw = np.real(np.fft.ifft(aw, int(nearest_2power(N2)), 1))
    aw0 = np.fft.fft(y, int(nearest_2power(N2)), 0)
    bw0 = np.real(np.fft.ifft(aw0, int(nearest_2power(N2)), 0))

    # GPU FFT via gpuarray
    gpu_testmat = gpuarray.to_gpu(y)
    gpu_testmatim = gpuarray.to_gpu(yim)
    plan1.execute(gpu_testmat, gpu_testmatim, batch=N1)
    gfft = gpu_testmat.get()  # get fft result
    plan1.execute(gpu_testmat, gpu_testmatim, inverse=True, batch=N1)
    gifft = np.real(gpu_testmat.get())  # get ifft result

    # GPU FFT via raw device buffers
    cuda.memcpy_htod(g_buf1, y)
    # memset_d32 only writes 32-bit words, so a float64 buffer needs
    # twice as many words as elements for all bits to be zeroed.
    cuda.memset_d32(g_buf2, 0, yim.size * 2)
    plan1.execute(g_buf1, g_buf2, batch=N1)
    grfft = np.empty_like(y)
    cuda.memcpy_dtoh(grfft, g_buf1)  # fft result
    plan1.execute(g_buf1, g_buf2, inverse=True, batch=N1)
    grifft = np.empty_like(y)
    cuda.memcpy_dtoh(grifft, g_buf1)  # ifft result

    if Plot:
        np.set_printoptions(threshold=np.nan)
        # plot cuda fft results
        f, axarr = plt.subplots(5, sharex=False)
        axarr[0].plot(y)
        axarr[1].plot(gfft)
        axarr[2].plot(gifft)
        axarr[3].plot(grfft)
        axarr[4].plot(grifft)
        plt.show()
    raise SystemExit
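# `nearest_2power` is used above but not defined in this snippet; a
# minimal sketch of what it presumably computes (the smallest power of
# two >= n):
def nearest_2power(n):
    return 1 << int(np.ceil(np.log2(n)))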
def pycuda_zeros(arr, shape):
    if arr is None or arr.shape != shape:
        arr = gpuarray.zeros(shape, dtype=np.float32)
    else:
        if not isinstance(arr, gpuarray.GPUArray):
            arr = to_gpuarray(arr)
        pycu.memset_d32(arr.gpudata, 0, arr.size)
    return arr
def _malloc_impl(self, nbytes):
    import pycuda.driver as cuda

    # Allocate
    data = cuda.mem_alloc(nbytes)

    # Zero (memset_d32 counts 32-bit words, hence nbytes // 4)
    cuda.memset_d32(data, 0, nbytes // 4)

    return data
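# A sketch (an assumption, not part of the original): if nbytes is not a
# multiple of 4, the integer division above leaves up to 3 trailing bytes
# unzeroed; memset_d8 covers any byte count exactly.
def _malloc_zeroed(nbytes):
    import pycuda.driver as cuda
    data = cuda.mem_alloc(nbytes)
    cuda.memset_d8(data, 0, nbytes)  # one memset unit per byte
    return data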
def gfx_init(self):
    try:
        print 'compiling'
        self.prog = sh.compile_program_vfg('shad/balls')
        print 'compiled'

        self.loc_mmv = sh.get_loc(self.prog, 'modelview')
        self.loc_mp = sh.get_loc(self.prog, 'projection')
        self.l_color = sh.get_loc(self.prog, 'color')
        self.l_size = sh.get_loc(self.prog, 'ballsize')
    except ValueError as ve:
        print "Shader compilation failed: " + str(ve)
        sys.exit(0)

    # glUseProgram( self.prog )
    # glUniform1i( pointsid , 0 )
    # glUseProgram( 0 )

    #
    # cuda init
    #
    self.grid = (int(self.BOX), int(self.BOX))
    self.block = (1, 1, int(self.BOX))

    print 'CUDA: block %s , grid %s' % (str(self.block), str(self.grid))

    # print cuda_driver.device_attribute.MAX_THREADS_PER_BLOCK
    # print cuda_driver.device_attribute.MAX_BLOCK_DIM_X
    # print cuda_driver.device_attribute.MAX_BLOCK_DIM_Y
    # print cuda_driver.device_attribute.MAX_BLOCK_DIM_Z

    floatbytes = np.dtype(np.float32).itemsize

    self.gpos = glGenBuffers(1)
    glBindBuffer(GL_ARRAY_BUFFER, self.gpos)
    glBufferData(GL_ARRAY_BUFFER, self.pos.nbytes, self.pos, GL_STREAM_DRAW)
    glBindBuffer(GL_ARRAY_BUFFER, 0)

    self.df1 = cuda_driver.mem_alloc(self.f.nbytes)
    self.df2 = cuda_driver.mem_alloc(self.f.nbytes)
    cuda_driver.memcpy_htod(self.df1, self.f)
    # NUM * Q float32 words, i.e. the whole distribution buffer
    cuda_driver.memset_d32(self.df2, 0, self.NUM * self.Q)

    mod = cuda_driver.module_from_file('lbm_kernel.cubin')

    self.collision = mod.get_function("collision_step")
    self.collision.prepare("Piii")

    self.streaming = mod.get_function("streaming_step")
    self.streaming.prepare("PPiii")

    self.colors = mod.get_function("colors")
    self.colors.prepare("PPiii")
def execute(positions, num_particles, num_frames):
    # Get host positions:
    cpuPos = numpy.array(positions, dtype=numpy.float32)

    # Allocate position space on device and copy positions over:
    devPos = cuda.mem_alloc(cpuPos.nbytes)
    cuda.memcpy_htod(devPos, cpuPos)

    # Allocate device velocities, zeroed (2 float32 components per particle):
    devVels = cuda.mem_alloc(2 * num_particles * numpy.float32().nbytes)
    cuda.memset_d32(devVels, 0, 2 * num_particles)

    # Allocate device in-bounds flags, initialized to True (one byte each):
    devInBounds = cuda.mem_alloc(num_particles * numpy.bool8().nbytes)
    cuda.memset_d8(devInBounds, True, num_particles)

    grid_dim = ((num_particles // NUM_THREADS) + 1, 1)
    print grid_dim
    runframe = module.get_function("runframe")

    frames = [None] * num_frames
    for i in range(num_frames):
        runframe(devPos, devVels, devInBounds,
                 numpy.int32(num_particles),
                 grid=grid_dim, block=(NUM_THREADS, 1, 1))
        # Get the positions back from the device:
        cuda.memcpy_dtoh(cpuPos, devPos)
        frames[i] = cpuPos.copy()

    # Simulation destination file:
    # out = open(OUTPUT_FILE, 'w')
    # write_header(out, num_particles)
    # for frame in frames:
    #     write_frame(out, frame, num_particles)
    # out.close()

    # clean up...
    devPos.free()
    devVels.free()
    devInBounds.free()

    # hand the captured frames back to the caller
    return frames
def leapfrogStationary(d_x, d_t, v, xmin, xmax, alpha):
    # --- Allocate device memory space for solution (float32, 4 bytes each)
    d_u = cuda.mem_alloc((N + 1) * (M + 1) * 4)
    d_u1 = cuda.mem_alloc((N + 1) * 4)
    d_u2 = cuda.mem_alloc((N + 1) * 4)
    d_u3 = cuda.mem_alloc((N + 1) * 4)

    # --- Set memory to zero (counts are in 32-bit words)
    cuda.memset_d32(d_u, 0x00, (N + 1) * (M + 1))
    cuda.memset_d32(d_u1, 0x00, N + 1)
    cuda.memset_d32(d_u2, 0x00, N + 1)
    cuda.memset_d32(d_u3, 0x00, N + 1)

    blockDim = (BLOCKSIZE, 1, 1)
    gridDim = (int(iDivUp(N + 1, BLOCKSIZE)), 1, 1)

    # --- Step 0
    setStep0(d_u1, d_u, d_t, d_x, np.float32(v), np.float32(xmin),
             np.float32(xmax), np.int32(N), block=blockDim, grid=gridDim)

    # --- Step 1
    setStep1(d_u1, d_u2, d_u, d_t, d_x, np.float32(v), np.float32(xmin),
             np.float32(xmax), np.float32(alpha), np.float32(dt),
             np.int32(N), block=blockDim, grid=gridDim)

    for l in range(1, M - 1):
        updateShared(d_u1, d_u2, d_u3, d_u, d_t, d_x, np.float32(v),
                     np.float32(xmin), np.float32(xmax), np.float32(alpha),
                     np.int32(l), np.int32(N), block=blockDim, grid=gridDim)
        # --- Roll the time levels: u1 <- u2, u2 <- u3
        cuda.memcpy_dtod(d_u1, d_u2, (N + 1) * 4)
        cuda.memcpy_dtod(d_u2, d_u3, (N + 1) * 4)

    return d_u
def render():
    global invViewMatrix_h, c_invViewMatrix
    global gl_PBO, cuda_PBO
    global width_GL, height_GL, density, brightness, transferOffset, transferScale
    global block2D_GL, grid2D_GL
    global tex, transferTex
    global testData_d

    cuda.memcpy_htod(c_invViewMatrix, invViewMatrix_h)
    for i in range(nTextures):
        if i == 0:
            tex.set_array(plotData_dArray)
        if i == 1:
            tex.set_array(plotData_dArray_1)
        # map PBO to get CUDA device pointer
        cuda_PBO_map = cuda_PBO[i].map()
        cuda_PBO_ptr, cuda_PBO_size = cuda_PBO_map.device_ptr_and_size()
        # clear the PBO: one 32-bit RGBA pixel per element
        cuda.memset_d32(cuda_PBO_ptr, 0, width_GL * height_GL)
        renderKernel(np.intp(cuda_PBO_ptr),
                     np.int32(width_GL), np.int32(height_GL),
                     density, brightness, transferOffset, transferScale,
                     grid=grid2D_GL, block=block2D_GL,
                     texrefs=[tex, transferTex])
        cuda_PBO_map.unmap()
def _assign(self, value):
    if isinstance(value, (int, float)):
        # if we have a contiguous array, then use the speedy driver memset
        if self.is_contiguous:
            value = self.dtype.type(value)
            if self.dtype.itemsize == 1:
                drv.memset_d8(self.gpudata,
                              unpack_from('B', value)[0], self.size)
            elif self.dtype.itemsize == 2:
                drv.memset_d16(self.gpudata,
                               unpack_from('H', value)[0], self.size)
            else:
                drv.memset_d32(self.gpudata,
                               unpack_from('I', value)[0], self.size)
        # otherwise use our copy kernel
        else:
            OpTreeNode.build("assign", self, value)
    elif isinstance(value, GPUTensor):
        # TODO: add an is_binary_compat like function
        if self.is_contiguous and value.is_contiguous and self.dtype == value.dtype:
            drv.memcpy_dtod(self.gpudata, value.gpudata, self.nbytes)
        else:
            OpTreeNode.build("assign", self, value)
    # collapse and execute an op tree as a kernel
    elif isinstance(value, OpTreeNode):
        OpTreeNode.build("assign", self, value)
    # assign to numpy array (same as set())
    elif isinstance(value, np.ndarray):
        self.set(value)
    else:
        raise TypeError("Invalid type for assignment: %s" % type(value))

    return self
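# The unpack_from calls above reinterpret the scalar's raw bytes as an
# unsigned integer of matching width, because the driver's memset_d*
# functions take a bit pattern rather than a typed value. A standalone
# illustration (a sketch, not part of the class):
import numpy as np
from struct import unpack_from
bits = unpack_from('I', np.float32(1.0).tobytes())[0]
assert bits == 0x3f800000  # IEEE-754 bit pattern of 1.0f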
def __init__(self, kernel_set="fgemm_int64_wide32", locks=1024,
             calc_partials=True, bench=False):
    m = re.search(r'wide(\d+)', kernel_set)
    if m:
        self.width = int(m.group(1))
    else:
        raise ValueError("Invalid kernel_set")

    self.locks = locks
    self.module = drv.module_from_file("kernels/" + kernel_set + ".cubin")
    self.mode = 0 if calc_partials else 4

    self.fgemm = dict()
    for op in ("nt", "nn", "tn"):
        mod = self.module.get_function(kernel_set + "_" + op)
        mod.prepare("PPPIIIIIIHH")
        self.fgemm[op] = mod

    fprop_conv = self.module.get_function("fprop_conv_float32_K64N64T64")
    fprop_conv.prepare("PPPIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII")
    self.fgemm["fprop_conv"] = fprop_conv

    bprop_conv = self.module.get_function("bprop_conv_float32_CRST64N64T64")
    bprop_conv.prepare("PPPPIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII")
    self.fgemm["bprop_conv"] = bprop_conv

    update_conv = self.module.get_function("update_conv_float32_CRST64K64T64")
    update_conv.prepare("PPPPIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII")
    self.fgemm["update_conv"] = update_conv

    # one 32-bit lock word per lock
    self.gpulock = drv.mem_alloc(locks * 4)
    drv.memset_d32(self.gpulock, 0, locks)

    self.bench = bench
    if bench:
        self.start = drv.Event()
        self.end = drv.Event()
def leapfrog(d_x, d_t, v, alpha):
    # --- Allocate device memory space for solution (float32, 4 bytes each)
    d_u = cuda.mem_alloc((N + 1) * (M + 1) * 4)
    # --- Set memory to zero (count in 32-bit words)
    cuda.memset_d32(d_u, 0x00, (N + 1) * (M + 1))

    blockDim = (BLOCKSIZE, 1, 1)
    gridDim = (int(iDivUp(N + 1, BLOCKSIZE)), 1, 1)

    # --- Initial condition
    initialConditionKernel(d_u, d_t, d_x, np.float32(v), np.int32(N),
                           block=blockDim, grid=gridDim)

    # --- First step (Matsuno)
    matsunoFirstStep(d_u, d_t, d_x, np.float32(v), np.float32(alpha),
                     np.int32(N), block=blockDim, grid=gridDim)

    Q = (1. - alpha) / (1. + alpha)
    for l in range(1, M):
        updateKernel(d_u, d_t, d_x, np.float32(v), np.float32(alpha),
                     np.float32(Q), np.int32(l), np.int32(N),
                     block=blockDim, grid=gridDim)

    return d_u
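# `iDivUp` is used by the solver snippets above but not defined here; a
# minimal sketch of the usual ceiling-division helper it names:
def iDivUp(a, b):
    return (a + b - 1) // b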
def __init__(self, backend, dtype, ioshape, initval, iopacking, tags):
    super(CUDAMatrixBase, self).__init__(backend, ioshape, iopacking, tags)

    # Data type info
    self.dtype = dtype
    self.itemsize = np.dtype(dtype).itemsize

    # Dimensions
    nrow, ncol = backend.compact_shape(ioshape, iopacking)
    self.nrow = nrow
    self.ncol = ncol

    # Compute the size, in bytes, of the minor dimension
    colsz = self.ncol * self.itemsize

    if 'align' in tags:
        # Allocate a 2D array aligned to the major dimension
        self.data, self.pitch = cuda.mem_alloc_pitch(colsz, nrow,
                                                     self.itemsize)
        self._nbytes = nrow * self.pitch

        # Ensure that the pitch is a multiple of itemsize
        assert (self.pitch % self.itemsize) == 0
    else:
        # Allocate a standard, tightly packed, array
        self._nbytes = colsz * nrow
        self.data = cuda.mem_alloc(self._nbytes)
        self.pitch = colsz

    self.leaddim = self.pitch // self.itemsize
    self.leadsubdim = self.soa_shape[-1]
    self.traits = (nrow, self.leaddim, self.leadsubdim, self.dtype)

    # Zero the entire matrix (incl. slack)
    assert (self._nbytes % 4) == 0
    cuda.memset_d32(self.data, 0, self._nbytes // 4)

    # Process any initial values
    if initval is not None:
        self.set(initval)
def cu_lpf(stimulus, dt, freq):
    """CUDA implementation of a low-pass filter.

    stimulus : ndarray
        The input to be filtered.
    dt : float
        The sampling interval of the input.
    freq : float
        The cut-off frequency of the low-pass filter.
    """
    num = len(stimulus)
    num_fft = int(num / 2 + 1)
    idtype = stimulus.dtype
    odtype = np.complex128 if idtype == np.float64 else np.complex64

    if not isinstance(stimulus, gpuarray.GPUArray):
        d_stimulus = gpuarray.to_gpu(stimulus)
    else:
        d_stimulus = stimulus

    plan = Plan(stimulus.shape, idtype, odtype)
    d_fstimulus = gpuarray.empty(num_fft, odtype)
    fft(d_stimulus, d_fstimulus, plan)

    # Zero every frequency bin above the cut-off. memset_d32 counts
    # 32-bit words, so scale by the number of words per complex element.
    df = 1.0 / dt / num
    idx = int(freq // df)
    unit = int(d_fstimulus.dtype.itemsize / 4)
    offset = int(d_fstimulus.gpudata) + d_fstimulus.dtype.itemsize * idx
    cuda.memset_d32(offset, 0, unit * (num_fft - idx))

    plan = Plan(stimulus.shape, odtype, idtype)
    d_lpf_stimulus = gpuarray.empty(num, idtype)
    ifft(d_fstimulus, d_lpf_stimulus, plan, False)

    return d_lpf_stimulus.get()
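# A minimal usage sketch (an assumption: the Plan/fft/ifft names above
# come from scikit-cuda and a CUDA context is already active):
if __name__ == '__main__':
    dt = 1e-4
    t = np.arange(0, 1, dt)
    # 10 Hz signal plus a 2 kHz component that the filter should remove
    noisy = np.sin(2 * np.pi * 10 * t) + 0.3 * np.sin(2 * np.pi * 2000 * t)
    smooth = cu_lpf(noisy, dt, freq=100.0)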
"""Animation (skip if GPU).""" anim = animation.ArtistAnimation(fig, imSequence, interval = 50, blit = True) # anim.save('waveEquation2D.mp4') rc('animation', html = 'jshtml') anim """Allocate solution on device.""" # --- Allocate device memory space for solution d_u = cuda.mem_alloc(Nx * Ny * 8) d_uold = cuda.mem_alloc(Nx * Ny * 8) d_unew = cuda.mem_alloc(Nx * Ny * 8) # --- Set memory to zero cuda.memset_d32(d_u, 0x00, Nx * Ny) cuda.memset_d32(d_uold, 0x00, Nx * Ny) cuda.memset_d32(d_unew, 0x00, Nx * Ny) """Transfering the initial condition from host to device.""" cuda.memcpy_htod(d_uold, u_old) cuda.memcpy_htod(d_u, u) """Solution at the subsequent steps.""" fig = plt.figure() blockDim = (BLOCKSIZEX, BLOCKSIZEY, 1) gridDim = (int(iDivUp(Nx, BLOCKSIZEX)), int(iDivUp(Ny, BLOCKSIZEY)), 1)
def basic_add_performance_2():
    """Measures memory latency for certain operations."""
    base_src = Template("""
.entry $FNAME ( .param .u32 out )
{
    .reg .u32 base, off, clka, clkb, clkoa, clkob, clks, tmp, iter;
    .reg .pred p;

    mov.u32         iter,   $RUNS;
    mov.u32         clks,   0;
    mov.u32         tmp,    0;
    ld.const.u32    base,   [scratch];
    $MULT
    mov.u32         lcg_state,  scratch;

warmup:
    mov.u32         clka,   %clock;
    $OPER
    sub.u32         iter,   iter,   1;
    setp.ne.u32     p,      iter,   0;
@p  bra.uni         warmup;

    mov.u32         clkoa,  %clock;
    mov.u32         iter,   $RUNS;

loop:
    //call.uni (tmp), lcg_rounds, (100);
    $LCGROUNDS
    mov.u32         clka,   %clock;
    $OPER
    xor.b32         clka,   clka,   tmp;
    mov.u32         clkb,   %clock;
    xor.b32         clka,   clka,   tmp;
    sub.u32         clka,   clkb,   clka;
    add.u32         clks,   clks,   clka;
    sub.u32         iter,   iter,   1;
    setp.ne.u32     p,      iter,   0;
@p  bra.uni         loop;

    mov.u32         clkob,  %clock;
    sub.u32         clkoa,  clkob,  clkoa;
    mov.u32         iter,   $RUNS;

cooldown:
    $OPER
    sub.u32         iter,   iter,   1;
    setp.ne.u32     p,      iter,   0;
@p  bra.uni         cooldown;

    ld.param.u32    base,   [out];
    call.uni        (off),  get_gtid, ();
    shr.u32         off,    off,    5;
    mad24.lo.u32    base,   off,    8,  base;
    call.uni        (tmp),  lcg_rounds, (1);
    st.volatile.global.b32  [base], tmp;
    st.volatile.global.b32  [base], clks;
    add.u32         base,   base,   4;
    st.global.b32   [base], clkoa;
}
""")

    addrtypes = {
        'single': {'label': "all conflicts", 'ADDRTYPE': "single",
                   'MULT': "mov.u32 off, %smid;" +
                           "mad24.lo.u32 base, off, 128, base;"},
        'uncoa': {'label': "uncoalesced", 'ADDRTYPE': "uncoa",
                  'MULT': "call.uni (off), get_gtid, ();" +
                          "mad24.lo.u32 base, off, 128, base;"},
        'coa': {'label': "coalesced", 'ADDRTYPE': "coa",
                'MULT': "call.uni (off), get_gtid, ();" +
                        "mad24.lo.u32 base, off, 4, base;"},
    }
    # Evil, I know, DRY and all
    addrtypesorder = ['single', 'uncoa', 'coa']

    opertypes = {
        'atomic': "atom.global.add.u32 tmp, [base], tmp;",
        'red': "red.global.add.u32 [base], clks;",
        'store': "st.global.u32 [base], clks;",
        'load': "ld.global.u32 tmp, [base];",
        'load_store': """
            ld.global.u32 tmp, [base];
            add.u32 tmp, tmp, clks;
            st.global.u32 [base], tmp;
            """,
    }
    opertypesorder = ['load', 'store', 'load_store', 'red', 'atomic']

    lcgtext = "mad.lo.u32 lcg_state, lcg_state, 1664525, 1013904223;\n" * 50

    order = []
    for va in addrtypesorder:
        for k in sorted(opertypes.keys()):
            order.append((va, k))

    runs = 512
    rounds = 4

    mod = stdlib + "\n.const .u32 scratch;"
    for (addr, oper) in order:
        c = dict(addrtypes[addr])
        c['otype'] = oper
        c['OPER'] = opertypes[oper]
        c['RUNS'] = runs
        c['FNAME'] = "%s_%s" % (addr, oper)
        c['LCGROUNDS'] = lcgtext
        mod += base_src.substitute(c)

    for i in enumerate(mod.split('\n')):
        print "%3d %s" % i
    disassemble(mod)
    mod = cuda.module_from_buffer(mod)

    figs = []
    barwidth = 0.3

    scratch = cuda.mem_alloc(1024 * 16 * 30 * 128)
    scratchptr = mod.get_global('scratch')
    # Store the scratch buffer's address into the constant by writing a
    # single 32-bit word with memset_d32.
    cuda.memset_d32(scratchptr[0], int(scratch), 1)

    def plot(title, names, vals, errs):
        N = len(vals[0])
        bw = 2 * .9 / len(names)
        fig = plt.figure()
        ax = fig.add_subplot(111, title=title)
        ax.set_ylabel('Clocks')
        ax.set_xlabel('Warps/SM')
        ax.set_xticks(range(N))
        ax.set_xticklabels([1 << i for i in range(N)])
        for idx, (name, val, err) in enumerate(zip(names, vals, errs)):
            ax.bar([i + bw * (idx // 2) - .45 for i in range(N)], val, bw,
                   yerr=err, color=colors[idx], label=name, zorder=-idx)
        ax.axis(ymin=0)
        ax.legend(loc=0)
        return fig

    for addr in addrtypesorder:
        addrlbl = addrtypes[addr]['label']
        print "Access pattern:", addrlbl
        interms, interes, totalms, totales = [], [], [], []
        for operidx, oper in enumerate(opertypesorder):
            interm, intere, totalm, totale = [], [], [], []
            for dim in ((1, 1), (2, 1), (4, 1), (8, 1), (8, 2), (8, 4)):
                vals = numpy.zeros((dim[0] * dim[1] * 30, 2))
                fn = mod.get_function('%s_%s' % (addr, oper))
                for round in range(rounds + 1):
                    a = numpy.zeros_like(vals).astype(numpy.int32)
                    fn(cuda.InOut(a), block=(32 * dim[0], 1, 1),
                       grid=(30 * dim[1], 1))
                    if round != 0:
                        vals += a
                    time.sleep(.005)
                means = scipy.mean(vals, axis=0) / (runs * rounds)
                stds = scipy.std(vals, axis=0) / (runs * rounds)
                # this is just gross
                interm.append(means[0])
                totalm.append(means[1])
                intere.append(stds[0])
                totale.append(stds[1])
                print "%16s: %1.7f±%1.6f" % (oper, means[0], stds[0])
                print "%16s: %1.7f±%1.6f" % (oper + ' total', means[1], stds[1])
            interms.append(interm)
            interes.append(intere)
            interms.append(totalm)
            interes.append(totale)
        names = []
        for i in opertypesorder:
            names.append(i)
            names.append(i + ' total')
        fig1 = plot('Compute memory latency, %s access pattern' % addrlbl,
                    names, interms, interes)
        figs.append((addr, fig1))
    return figs
def kmeans(objects, numClusters, threshold):
    """ objects: numCoords x numObjs """
    event = cuda.Event()

    """ Step 0: cast to float, copy to device """
    objects = objects.astype(np.float32)
    objects_gpu = cuda.mem_alloc(objects.nbytes)
    cuda.memcpy_htod(objects_gpu, objects)
    numCoords, numObjs = objects.shape

    """ Step 1: load the CUDA module """
    src = open("cuda_kmeans.cu").read()
    mod = SourceModule(src, include_dirs=[os.getcwd()])
    find_nearest_cluster = mod.get_function("find_nearest_cluster")
    compute_delta = mod.get_function("compute_delta")
    reduce_clusterSize = mod.get_function("reduce_clusterSize")
    reduce_centroids = mod.get_function("reduce_centroids")
    update_centroids_clusterSize = mod.get_function(
        "update_centroids_clusterSize")

    """ Step 2: define some constants """
    # For find_nearest_cluster
    threadsPer_FNC_Block = 128
    num_FNC_Blocks = int(math.ceil(float(numObjs) / threadsPer_FNC_Block))
    # SDSize = shared memory size
    FNC_SDSize = threadsPer_FNC_Block * 2 + numClusters * numCoords * 4

    # For compute_delta
    threadsPer_CD_Block = 128 if num_FNC_Blocks > 128 else nextPowerOfTwo(
        num_FNC_Blocks)
    num_CD_Blocks = int(math.ceil(float(num_FNC_Blocks) / threadsPer_CD_Block))
    CD_SDSize = threadsPer_CD_Block * 4

    """ Step 3: init centroids using first K elements, define variables """
    centroids = init_centroids(objects, numClusters)
    centroids_gpu = cuda.mem_alloc(centroids.nbytes)
    cuda.memcpy_htod(centroids_gpu, centroids)

    # interm means intermediate membership
    _, interm_gpu = getHostDevicePair((num_FNC_Blocks, ), np.int32, 0)
    # initialize membership to -1
    membership, membership_gpu = getHostDevicePair((numObjs, ), np.int32, -1)
    reduceInterm, reduceInterm_gpu = getHostDevicePair((num_CD_Blocks, ),
                                                       np.int32, 0)
    clusterSize, clusterSize_gpu = getHostDevicePair((numClusters, ),
                                                     np.int32, 0)
    # seg means segregated
    segClusterSize, segClusterSize_gpu = getHostDevicePair(
        (num_FNC_Blocks, numClusters), np.int32, 0)
    _, segCentroids_gpu = getHostDevicePair(
        (num_FNC_Blocks, numCoords, numClusters), np.int32, 0)

    for loop in range(500):
        find_nearest_cluster(np.int32(numCoords), np.int32(numObjs),
                             np.int32(numClusters), objects_gpu,
                             centroids_gpu, membership_gpu, interm_gpu,
                             block=(threadsPer_FNC_Block, 1, 1),
                             grid=(num_FNC_Blocks, 1), shared=FNC_SDSize)
        event.synchronize()

        # validating centroids (debug):
        # cuda.memcpy_dtoh(membership, membership_gpu)
        # cent_valid = np.zeros_like(centroids)
        # clusterSize_valid = np.zeros_like(clusterSize)
        # for i in range(numObjs):
        #     clusterSize_valid[membership[i]] += 1
        #     cent_valid[:, membership[i]] += objects[:, i]
        # cent_valid = cent_valid / clusterSize_valid
        # print("\nvalid"); print(cent_valid)

        compute_delta(interm_gpu, reduceInterm_gpu, np.int32(num_FNC_Blocks),
                      block=(threadsPer_CD_Block, 1, 1),
                      grid=(num_CD_Blocks, 1), shared=CD_SDSize)
        event.synchronize()

        cuda.memcpy_dtoh(reduceInterm, reduceInterm_gpu)
        event.synchronize()
        delta = reduceInterm.sum()

        # reset the cluster-size and centroid accumulators to 0
        # (memset_d32 counts 32-bit words, which matches the int32/float32
        # element counts below)
        cuda.memset_d32(clusterSize_gpu, 0, numClusters)
        cuda.memset_d32(segClusterSize_gpu, 0, num_FNC_Blocks * numClusters)
        cuda.memset_d32(centroids_gpu, 0, numCoords * numClusters)
        cuda.memset_d32(segCentroids_gpu, 0,
                        num_FNC_Blocks * numCoords * numClusters)
        event.synchronize()

        update_centroids_clusterSize(objects_gpu, membership_gpu,
                                     segCentroids_gpu, segClusterSize_gpu,
                                     np.int32(numCoords), np.int32(numObjs),
                                     np.int32(numClusters),
                                     block=(threadsPer_FNC_Block, 1, 1),
                                     grid=(num_FNC_Blocks, 1))
        event.synchronize()

        reduce_clusterSize(segClusterSize_gpu, clusterSize_gpu,
                           np.int32(num_FNC_Blocks), np.int32(numClusters),
                           block=(numClusters, 1, 1))
        event.synchronize()

        reduce_centroids(segCentroids_gpu, centroids_gpu, clusterSize_gpu,
                         np.int32(num_FNC_Blocks), np.int32(numClusters),
                         np.int32(numCoords),
                         block=(numClusters, 1, 1), grid=(numCoords, 1))
        event.synchronize()

        # cuda.memcpy_dtoh(centroids, centroids_gpu)
        # print("computed centroids"); print(centroids)

        delta /= float(numObjs)
        if delta <= threshold:
            break
        loop += 1

    cuda.memcpy_dtoh(centroids, centroids_gpu)
    print("Looped for", loop, "iterations")
    return centroids
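# `getHostDevicePair` and `nextPowerOfTwo` are assumed above but not
# defined in this snippet; plausible minimal versions:
def nextPowerOfTwo(n):
    p = 1
    while p < n:
        p <<= 1
    return p

def getHostDevicePair(shape, dtype, fill):
    # returns a filled host array and a device copy of it
    host = np.full(shape, fill, dtype=dtype)
    dev = cuda.mem_alloc(host.nbytes)
    cuda.memcpy_htod(dev, host)
    return host, dev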
def __init__(self, default_dtype=np.float32, stochastic_round=False,
             deterministic=None, device_id=0, bench=False, scratch_size=0,
             hist_bins=64, hist_offset=-48, compat_mode=None,
             enable_winograd=True,
             cache_dir=os.path.join(os.path.expanduser('~'), 'nervana/cache')):
    if default_dtype not in [np.float16, np.float32]:
        raise ValueError('Default data type for nervanagpu '
                         'backend must be float16 or 32')

    if default_dtype is np.float32:
        if stochastic_round:
            if stochastic_round is True:
                raise ValueError('Default rounding bit width is not '
                                 'supported for fp32. Please specify '
                                 'number of bits to round to.')
            logger.warn('Using 32 bit floating point and setting stochastic '
                        'rounding to %d bits' % stochastic_round)

    # context
    drv.init()
    self.device_type = 1
    self.device_id = device_id if device_id is not None else 0
    self.ctx = drv.Device(device_id).make_context()

    # super class init
    super(NervanaGPU, self).__init__(default_dtype,
                                     compat_mode=compat_mode,
                                     deterministic=deterministic)

    # log
    logger.info("Initialized NervanaGPU")

    # stochastic_round
    assert stochastic_round is False, \
        "Are you sure about using SR globally in the backend?"
    if stochastic_round:
        if stochastic_round is True:
            stochastic_round = 10
    else:
        stochastic_round = 0

    # attributes
    self.scratch_size = scratch_size
    self.scratch_offset = 0
    self.round_mode = stochastic_round
    self.bench = bench
    self.stream = None
    self.buf = {}
    self.buf_active = {}
    self.warmup = False

    # store histograms for batched memcpy
    self.hist_bins = hist_bins
    self.hist_offset = hist_offset
    self.hist_map = dict()
    self.hist_idx = 0
    self.hist_max = 4 * 4096
    self.hist_base = drv.mem_alloc(self.hist_bins * self.hist_max * 4)
    drv.memset_d32(self.hist_base, 0, self.hist_bins * self.hist_max)

    self.compute_capability = (4, 0)
    self.use_cudac_kernels = True

    self.enable_winograd = enable_winograd
    self.cache_dir = cache_dir
    if not os.path.isdir(self.cache_dir):
        os.makedirs(self.cache_dir)
def __init__(self, jitfunc1, jitfunc2, fd1_d, fd2_d, model, dx, source_dt,
             sources, pad_width, pml_width=None):
    super(PycudaPropagator, self).__init__(model.astype(np.float32),
                                           np.float32(dx),
                                           np.float32(source_dt),
                                           sources, np.int32(pad_width),
                                           pml_width=pml_width)
    self.jitfunc1 = jitfunc1
    self.jitfunc2 = jitfunc2

    # allocate and copy model to GPU
    self.model.padded_property_gpu = {}
    self.model.padded_property_gpu['vp2dt2'] = \
        drv.mem_alloc(self.model.padded_property['vp2dt2'].nbytes)
    drv.memcpy_htod(self.model.padded_property_gpu['vp2dt2'],
                    self.model.padded_property['vp2dt2'])

    # allocate and initialize wavefields (float32, so the element count
    # equals the number of 32-bit words for memset_d32)
    self.wavefield.current_gpu = \
        drv.mem_alloc(self.wavefield.current.nbytes)
    drv.memset_d32(self.wavefield.current_gpu, 0,
                   self.wavefield.current.size)
    self.wavefield.previous_gpu = \
        drv.mem_alloc(self.wavefield.previous.nbytes)
    drv.memset_d32(self.wavefield.previous_gpu, 0,
                   self.wavefield.previous.size)

    # allocate and initialize PML arrays
    self.pml.sigma_gpu = []
    for dim in range(self.geometry.ndim):
        self.pml.phi[dim].current_gpu = \
            drv.mem_alloc(self.pml.phi[dim].current.nbytes)
        drv.memset_d32(self.pml.phi[dim].current_gpu, 0,
                       self.pml.phi[dim].current.size)
        self.pml.phi[dim].previous_gpu = \
            drv.mem_alloc(self.pml.phi[dim].previous.nbytes)
        drv.memset_d32(self.pml.phi[dim].previous_gpu, 0,
                       self.pml.phi[dim].previous.size)
        self.pml.sigma_gpu.append(drv.mem_alloc(self.pml.sigma[dim].nbytes))
        drv.memcpy_htod(self.pml.sigma_gpu[dim], self.pml.sigma[dim])

    # allocate and copy sources arrays
    self.sources.amplitude_gpu = \
        drv.mem_alloc(self.sources.amplitude.nbytes)
    drv.memcpy_htod(self.sources.amplitude_gpu, self.sources.amplitude)
    self.sources.padded_locations_gpu = \
        drv.mem_alloc(self.sources.padded_locations.nbytes)
    drv.memcpy_htod(self.sources.padded_locations_gpu,
                    self.sources.padded_locations)

    # create and copy finite difference coeffs to constant memory
    self.fd1_d = fd1_d
    fd1 = np.array([8/12, -1/12], np.float32) / dx
    drv.memcpy_htod(self.fd1_d, fd1)
    self.fd2_d = fd2_d
    if self.geometry.ndim == 1:
        fd2 = np.array([-5/2, 4/3, -1/12], np.float32) / dx**2
    elif self.geometry.ndim == 2:
        fd2 = np.array([-10/2, 4/3, -1/12], np.float32) / dx**2
    drv.memcpy_htod(self.fd2_d, fd2)

    # set block and grid dimensions
    threadsperblockx = 32
    blockspergridx = ((self.geometry.propagation_shape_padded[-1]
                       + (threadsperblockx - 1)) // threadsperblockx)
    if self.geometry.ndim == 1:
        threadsperblockz = 1
        blockspergridz = self.sources.num_shots
    elif self.geometry.ndim == 2:
        threadsperblockz = 32
        blockspergridz = ((self.geometry.propagation_shape_padded[-2]
                           + (threadsperblockz - 1)) // threadsperblockz) \
                         * self.sources.num_shots
    self.griddim = int(blockspergridx), int(blockspergridz)
    self.blockdim = int(threadsperblockx), int(threadsperblockz), 1
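# The fd2 coefficients above are the standard 4th-order central
# second-derivative stencil (-5/2, 4/3, -1/12), applied symmetrically.
# A quick numpy check (a sketch, using f(x) = x**2 whose second
# derivative is exactly 2):
import numpy as np
dx = 0.1
x = np.arange(-2, 3) * dx
f = x**2
c = np.array([-5/2, 4/3, -1/12]) / dx**2
d2f = c[0] * f[2] + c[1] * (f[1] + f[3]) + c[2] * (f[0] + f[4])
assert abs(d2f - 2.0) < 1e-6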
def ssf_cuda(q, r, block_size=128, copy=True):
    import pycuda.driver as cuda
    import pycuda.gpuarray as ga
    from time import time
    from numpy import prod, float32, int32

    nq, dim = q.shape
    npart = r.shape[0]

    global timer_copy, timer_memory, timer_zero, timer_exp, timer_sum

    # CUDA execution dimensions
    block = (block_size, 1, 1)
    grid = (60, 1)

    # access module functions, textures and constants
    if 'compute_ssf' not in globals():
        global compute_ssf, finalise_ssf, tex_q, dim_ptr, npart_ptr, nq_ptr
        compute_ssf = ssf_module.get_function('compute_ssf')
        finalise_ssf = ssf_module.get_function('finalise_ssf')
        tex_q = ssf_module.get_texref('tex_q')
        dim_ptr = ssf_module.get_global('dim')[0]
        npart_ptr = ssf_module.get_global('npart')[0]
        nq_ptr = ssf_module.get_global('nq')[0]

    # set device constants (each one a single 32-bit word)
    t1 = time()
    cuda.memset_d32(dim_ptr, dim, 1)
    cuda.memset_d32(npart_ptr, npart, 1)
    cuda.memset_d32(nq_ptr, nq, 1)
    t2 = time()
    timer_copy += t2 - t1

    # copy particle positions to device
    # (x0, x1, x2, ..., xN, y0, y1, y2, ..., yN, z0, z1, z2, ..., zN)
    if copy:
        global gpu_r
        t1 = time()
        gpu_r = ga.to_gpu(r.T.flatten().astype(float32))
        t2 = time()
        timer_copy += t2 - t1

    # allocate space for results
    t1 = time()
    gpu_sin = ga.empty(int(nq * prod(grid)), float32)
    gpu_cos = ga.empty(int(nq * prod(grid)), float32)
    gpu_ssf = ga.empty(int(prod(grid)), float32)
    t2 = time()
    timer_memory += t2 - t1

    # copy group of wavevectors with (almost) equal magnitude
    t1 = time()
    gpu_q = ga.to_gpu(q.flatten().astype(float32))
    gpu_q.bind_to_texref_ext(tex_q)
    t2 = time()
    timer_copy += t2 - t1

    # compute exp(iq·r) for each particle
    t1 = time()
    compute_ssf(gpu_sin, gpu_cos, gpu_r, block=block, grid=grid,
                texrefs=[tex_q])
    t2 = time()

    # compute sum(sin(q·r))^2 + sum(cos(q·r))^2 per wavevector
    # and sum over wavevectors
    finalise_ssf(gpu_sin, gpu_cos, gpu_ssf, int32(prod(grid)),
                 block=block, grid=grid)
    result = sum(gpu_ssf.get())
    t3 = time()
    timer_exp += t2 - t1
    timer_sum += t3 - t2

    # normalize result with #wavevectors and #particles
    return result / (nq * npart)
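# Above, memset_d32 doubles as a way to write single int32 constants
# (dim, npart, nq) into module globals. An equivalent, arguably clearer
# alternative (a sketch, not from the original; assumes the names from
# ssf_cuda's body) is an explicit host-to-device copy:
import numpy
cuda.memcpy_htod(dim_ptr, numpy.array(dim, dtype=numpy.int32))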
def go_sort(count, stream=None):
    grids = count // 8192
    keys = np.fromstring(np.random.bytes(count * 2), dtype=np.uint16)
    #keys = np.arange(count, dtype=np.uint16)
    #np.random.shuffle(keys)
    mkeys = np.reshape(keys, (grids, 8192))
    vals = np.arange(count, dtype=np.uint32)
    dkeys = cuda.to_device(keys)
    dvals = cuda.to_device(vals)
    print 'Done seeding'

    dpfxs = cuda.mem_alloc(grids * 256 * 4)
    doffsets = cuda.mem_alloc(count * 2)
    launch('prefix_scan_8_0', doffsets, dpfxs, dkeys,
           block=(512, 1, 1), grid=(grids, 1), stream=stream, l1=1)

    dsplit = cuda.mem_alloc(grids * 256 * 4)
    launch('better_split', dsplit, dpfxs,
           block=(32, 1, 1), grid=(grids // 32, 1), stream=stream)

    # This stage will be rejiggered along with the split
    launch('prefix_sum', dpfxs, np.int32(grids * 256),
           block=(256, 1, 1), grid=(1, 1), stream=stream, l1=1)
    launch('convert_offsets', doffsets, dsplit, dkeys, i32(0),
           block=(1024, 1, 1), grid=(grids, 1), stream=stream)

    if not stream:
        offsets = cuda.from_device(doffsets, (grids, 8192), np.uint16)
        split = cuda.from_device(dsplit, (grids, 256), np.uint32)
        pfxs = cuda.from_device(dpfxs, (grids, 256), np.uint32)
        tkeys = py_radix_sort_maybe(mkeys, offsets, pfxs, split, 0)
        #print frle(tkeys & 0xff)

    d_skeys = cuda.mem_alloc(count * 2)
    d_svals = cuda.mem_alloc(count * 4)
    if not stream:
        # d_skeys holds uint16 keys (count/2 words); d_svals holds uint32s
        cuda.memset_d32(d_skeys, 0, count // 2)
        cuda.memset_d32(d_svals, 0xffffffff, count)

    launch('radix_sort_maybe', d_skeys, d_svals,
           dkeys, dvals, doffsets, dpfxs, dsplit, i32(0),
           block=(1024, 1, 1), grid=(grids, 1), stream=stream, l1=1)

    if not stream:
        skeys = cuda.from_device_like(d_skeys, keys)
        svals = cuda.from_device_like(d_svals, vals)

        # Test integrity of sort (keys and values kept together):
        #   skeys[i] = keys[svals[i]] for all i
        print 'Integrity: ',
        if np.all(svals < len(keys)) and np.all(skeys == keys[svals]):
            print 'pass'
        else:
            print 'FAIL'

    dkeys, d_skeys = d_skeys, dkeys
    dvals, d_svals = d_svals, dvals

    if not stream:
        cuda.memset_d32(d_skeys, 0, count // 2)
        cuda.memset_d32(d_svals, 0xffffffff, count)

    launch('prefix_scan_8_8', doffsets, dpfxs, dkeys,
           block=(512, 1, 1), grid=(grids, 1), stream=stream, l1=1)
    launch('better_split', dsplit, dpfxs,
           block=(32, 1, 1), grid=(grids // 32, 1), stream=stream)
    launch('prefix_sum', dpfxs, np.int32(grids * 256),
           block=(256, 1, 1), grid=(1, 1), stream=stream, l1=1)

    if not stream:
        pre_offsets = cuda.from_device(doffsets, (grids, 8192), np.uint16)

    launch('convert_offsets', doffsets, dsplit, dkeys, i32(8),
           block=(1024, 1, 1), grid=(grids, 1), stream=stream)

    if not stream:
        offsets = cuda.from_device(doffsets, (grids, 8192), np.uint16)
        split = cuda.from_device(dsplit, (grids, 256), np.uint32)
        pfxs = cuda.from_device(dpfxs, (grids, 256), np.uint32)
        tkeys = np.reshape(tkeys, (grids, 8192))
        new_offs = py_convert_offsets(pre_offsets, split, tkeys, 8)
        print np.nonzero(new_offs != offsets)
        fkeys = py_radix_sort_maybe(tkeys, new_offs, pfxs, split, 8)
        #print frle(fkeys)

    launch('radix_sort_maybe', d_skeys, d_svals,
           dkeys, dvals, doffsets, dpfxs, dsplit, i32(8),
           block=(1024, 1, 1), grid=(grids, 1), stream=stream, l1=1)

    if not stream:
        #print cuda.from_device(doffsets, (4, 8192), np.uint16)
        #print cuda.from_device(dkeys, (4, 8192), np.uint16)
        #print cuda.from_device(d_skeys, (4, 8192), np.uint16)
        skeys = cuda.from_device_like(d_skeys, keys)
        svals = cuda.from_device_like(d_svals, vals)

        print 'Integrity: ',
        if np.all(svals < len(keys)) and np.all(skeys == keys[svals]):
            print 'pass'
        else:
            print 'FAIL'

        sorted_keys = np.sort(keys)
        # Test that ordering is correct. (Note that we don't need 100%
        # correctness, so this test should be made "soft".)
        print 'Order: ', 'pass' if np.all(skeys == sorted_keys) else 'FAIL'