def test_normal_hadamard_squares_to_one(self):
    ptm = np.array(
        [[0.5, np.sqrt(0.5), 0, 0.5],
         [np.sqrt(0.5), 0, 0, -np.sqrt(0.5)],
         [0, 0, -1, 0],
         [0.5, -np.sqrt(0.5), 0, 0.5]], np.float64)
    ptm_gpu = drv.to_device(ptm)

    dm = np.random.random((512, 512))
    dm_gpu = drv.to_device(dm)

    single_qubit_ptm(dm_gpu, ptm_gpu, np.int32(2), np.int32(9),
                     block=(512, 1, 1), grid=(512, 1, 1),
                     shared=8 * (16 + 512))
    dm2 = drv.from_device_like(dm_gpu, dm)
    assert not np.allclose(dm, dm2)

    single_qubit_ptm(dm_gpu, ptm_gpu, np.int32(2), np.int32(9),
                     block=(512, 1, 1), grid=(512, 1, 1),
                     shared=8 * (16 + 512))
    dm2 = drv.from_device_like(dm_gpu, dm)
    assert np.allclose(dm2, dm)
def test_single_ptm_as_two_ptm_hadamard_squares(self):
    single_hadamard = np.array([[1, 0, 0, 0],
                                [0, 0, 1, 0],
                                [0, 1, 0, 0],
                                [0, 0, 0, 1]], np.float64)
    single_ptm_gpu = drv.to_device(single_hadamard)

    dm = np.random.random((512, 512))
    dm_gpu = drv.to_device(dm)

    single_qubit_ptm(dm_gpu, single_ptm_gpu, np.int32(2), np.int32(9),
                     block=(512, 1, 1), grid=(512, 1, 1),
                     shared=8 * (16 + 512))
    dm2 = drv.from_device_like(dm_gpu, dm)
    assert not np.allclose(dm, dm2)

    double_hadamard = np.kron(single_hadamard, single_hadamard)
    double_ptm_gpu = drv.to_device(double_hadamard)

    two_qubit_ptm(dm_gpu, double_ptm_gpu, np.int32(2), np.int32(0), np.int32(9),
                  block=(512, 1, 1), grid=(512, 1, 1),
                  shared=8 * (256 + 512))
    dm2 = drv.from_device_like(dm_gpu, dm)
    assert not np.allclose(dm2, dm)

    single_qubit_ptm(dm_gpu, single_ptm_gpu, np.int32(0), np.int32(9),
                     block=(512, 1, 1), grid=(512, 1, 1),
                     shared=8 * (16 + 512))
    dm2 = drv.from_device_like(dm_gpu, dm)
    assert np.allclose(dm2, dm)
def nlargest(self, n):
    """Returns the per-individual threshold above which there are n outputs.

    @param n: number of outputs which should be above the threshold
    @type n: int

    @return: list of thresholds, in order of individuals, which delimit the
        top n output values
    """
    log.debug("enter nlargest with n=%d", n)

    # Find one more output so that we can use strictly-less-than when counting
    # and underestimate lift rather than overestimating it.
    n = n + 1

    passSizes = []
    while n > 0:
        nextSize = min(self.maxHeapFloats, n)
        passSizes.append(nextSize)
        n -= nextSize

    log.debug("pass sizes: %r", passSizes)

    thresholdsMat = np.ones(shape=(self.popSize,), dtype=np.float32) * np.inf
    self.thresholds = driver.to_device(thresholdsMat)

    uintBytes = np.dtype(np.uint32).itemsize
    thresholdCounts = np.zeros(shape=(self.popSize,), dtype=np.uint32)
    self.thresholdCounts = driver.to_device(thresholdCounts)

    for passSize in passSizes:
        log.debug("begin pass size %d", passSize)
        self.nlargestKernel.prepared_call(self.nlargestGridDim,
                                          self.outputs,
                                          self.trainSet.size,
                                          self.popSize,
                                          passSize,
                                          self.thresholds,
                                          self.thresholdCounts)
        driver.Context.synchronize()

        if log.isEnabledFor(logging.DEBUG):
            thresholdsMat = driver.from_device_like(self.thresholds, thresholdsMat)
            log.debug("thresholds: %s", str(thresholdsMat))

            thresholdCounts = driver.from_device_like(self.thresholdCounts, thresholdCounts)
            log.debug("thresholdCounts: %s", str(thresholdCounts))

    self.thresholdsMat = driver.from_device_like(self.thresholds, thresholdsMat)
    return self.thresholdsMat
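# A minimal CPU-side sketch of the quantity nlargest() above is expected to
# compute, assuming `outputs` is a (popSize, numSamples) float array; the names
# here are hypothetical and only illustrate the strictly-less-than counting
# described in the docstring.
import numpy as np

def nlargest_reference(outputs, n):
    # The (n+1)-th largest value per individual: at most n outputs are
    # strictly above this threshold (fewer in the presence of ties).
    return np.partition(outputs, -(n + 1), axis=1)[:, -(n + 1)]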
def test_reshuffle_invertible(self):
    dm = random_dm10()
    dm_gpu = drv.to_device(dm)

    for i in range(no_qubits):
        bit_to_pauli_basis(dm_gpu, np.int32(1 << i), np.int32(no_qubits),
                           block=block, grid=grid)

    dmreal = np.zeros(2**(2 * no_qubits))
    dmreal_gpu = drv.to_device(dmreal)

    pauli_reshuffle(dm_gpu, dmreal_gpu, np.int32(no_qubits), np.int32(0),
                    block=block, grid=grid)

    dm_gpu2 = drv.mem_alloc(dm.nbytes)
    drv.memset_d8(dm_gpu2, 0, dm.nbytes)

    pauli_reshuffle(dm_gpu2, dmreal_gpu, np.int32(no_qubits), np.int32(1),
                    block=block, grid=grid)

    for i in range(no_qubits):
        bit_to_pauli_basis(dm_gpu2, np.int32(1 << i), np.int32(no_qubits),
                           block=block, grid=grid)

    dm2 = drv.from_device_like(dm_gpu2, dm)
    assert np.allclose(dm, dm2)
def synchronize(d_tlead1, d_tlead2, d_tlead3, length):
    # Number of points to use to synchronize
    chunk = ecg.sampling_rate * 2
    template = numpy.zeros(chunk).astype(numpy.int32)

    tlead1 = cuda.from_device_like(d_tlead1, template)
    tlead2 = cuda.from_device_like(d_tlead2, template)
    tlead3 = cuda.from_device_like(d_tlead3, template)

    start1 = numpy.argmax(tlead1)
    start2 = numpy.argmax(tlead2)
    start3 = numpy.argmax(tlead3)

    minstart = min(start1, start2, start3)

    offset1 = start1 - minstart
    offset2 = start2 - minstart
    offset3 = start3 - minstart

    new_length = length - minstart

    return (offset1, offset2, offset3, new_length)
def device_to_local(self):
    """ Copies device memory to local host memory. """
    self.local_array = drv.from_device_like(self.device_ptr, self.local_array)
def synchronize(d_tlead1, d_tlead2, d_tlead3, length, sampling_rate):
    # Number of points to use to synchronize
    chunk = sampling_rate * 2
    template = numpy.zeros(chunk).astype(numpy.int32)

    tlead1 = cuda.from_device_like(d_tlead1, template)
    tlead2 = cuda.from_device_like(d_tlead2, template)
    tlead3 = cuda.from_device_like(d_tlead3, template)

    start1 = numpy.argmax(tlead1)
    start2 = numpy.argmax(tlead2)
    start3 = numpy.argmax(tlead3)

    minstart = min(start1, start2, start3)
    maxstart = max(start1, start2, start3)

    offset1 = start1 - minstart
    offset2 = start2 - minstart
    offset3 = start3 - minstart

    new_length = length - (maxstart - minstart)

    return (offset1, offset2, offset3, new_length)
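# A minimal sketch of how the returned offsets might be applied, assuming each
# lead is a host array of the original length; slicing by offset and new_length
# aligns the three leads on their common peak. The names below are hypothetical.
def align_leads(lead1, lead2, lead3, offsets_and_length):
    offset1, offset2, offset3, new_length = offsets_and_length
    aligned1 = lead1[offset1:offset1 + new_length]
    aligned2 = lead2[offset2:offset2 + new_length]
    aligned3 = lead3[offset3:offset3 + new_length]
    return aligned1, aligned2, aligned3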
def examine_subsets_cuda(task, A, N, K, threads_per_block):
    # Unpack the task tuple
    subset_start, subset_end = task

    # Create CUDA context
    cuda.init()
    ctx = make_default_context()

    KFAC = math.factorial(K)

    # Keep track of the stride
    stride = subset_end - subset_start

    # Copy A to the GPU
    device_A = cuda.to_device(A.astype(np.int16))

    # Create results array
    results = np.zeros(stride, dtype=np.int32)

    # Copy results array
    device_results = cuda.to_device(results)

    # Number of CUDA blocks (integer division, rounding up)
    cuda_blocks = (stride + threads_per_block - 1) // threads_per_block

    # Compile CUDA kernel
    mod = SourceModule(
        open(
            "/Users/rsearles/Documents/Repositories/cisc849-16s/project/src/Spike_Neural_Nets/subsets.cu",
            "r").read())
    kernel = mod.get_function("examine_subsets")

    # Run kernel: threads_per_block threads per block, cuda_blocks blocks
    kernel(np.int32(N), np.int32(K), np.int64(subset_start),
           np.int64(subset_end), np.int32(KFAC), device_A, device_results,
           block=(threads_per_block, 1, 1), grid=(cuda_blocks, 1))

    # Copy results back
    results = cuda.from_device_like(device_results, results)

    # Free GPU memory
    device_results.free()
    device_A.free()

    # Pop CUDA context (driver yells otherwise)
    ctx.pop()

    # Return the counts
    counts = Counter(results)
    return counts
def test_large(self):
    n = 1024
    x = np.random.random(n)
    x_gpu = drv.to_device(x)

    trace(x_gpu, np.int32(-1), block=(n, 1, 1), grid=(1, 1, 1), shared=8 * n)

    x2 = drv.from_device_like(x_gpu, x)
    assert np.allclose(x2[0], np.sum(x))
def test_diag_preserve(self):
    dm = random_dm10()
    dm_gpu = drv.to_device(dm)

    for i in range(no_qubits):
        bit_to_pauli_basis(dm_gpu, np.int32(1 << i), np.int32(no_qubits),
                           block=block, grid=grid)

    dm2 = drv.from_device_like(dm_gpu, dm)
    assert np.allclose(np.diag(dm), np.diag(dm2))
def test_identity(self):
    ptm = np.eye(4)
    ptm_gpu = drv.to_device(ptm)

    dm = np.random.random((512, 512))
    dm_gpu = drv.to_device(dm)

    single_qubit_ptm(dm_gpu, ptm_gpu, np.int32(2), np.int32(9),
                     block=(512, 1, 1), grid=(512, 1, 1),
                     shared=8 * (16 + 512))

    dm2 = drv.from_device_like(dm_gpu, dm)
    assert np.allclose(dm2, dm)
def test_identity_big(self):
    ptm = np.eye(16, dtype=np.float64)
    ptm_gpu = drv.to_device(ptm)

    dm = np.random.random((512, 512))
    dm_gpu = drv.to_device(dm)

    two_qubit_ptm(dm_gpu, ptm_gpu, np.int32(6), np.int32(2), np.int32(9),
                  block=(512, 1, 1), grid=(512, 1, 1),
                  shared=8 * (256 + 512))

    dm2 = drv.from_device_like(dm_gpu, dm)
    assert np.allclose(dm2, dm)
def test_identity_very_small(self):
    ptm = np.eye(16, dtype=np.float64)
    ptm_gpu = drv.to_device(ptm)

    dm = np.random.random((16, 16))
    dm_gpu = drv.to_device(dm)

    two_qubit_ptm(dm_gpu, ptm_gpu, np.int32(1), np.int32(0), np.int32(3),
                  block=(16, 1, 1), grid=(16, 1, 1),
                  shared=8 * (256 + 16))

    dm2 = drv.from_device_like(dm_gpu, dm)
    assert np.allclose(dm2, dm)
def test_all_real_or_imag(self):
    dm = random_dm10()
    dm_gpu = drv.to_device(dm)

    for i in range(no_qubits):
        bit_to_pauli_basis(dm_gpu, np.int32(1 << i), np.int32(no_qubits),
                           block=block, grid=grid)

    dm2 = drv.from_device_like(dm_gpu, dm)

    where_real = dm2.real == dm2
    where_imag = 1j * dm2.imag == dm2
    where_all = where_real + where_imag
    assert np.all(where_all)
def test_sum_bit_high(self):
    n = 32
    x = np.random.random(n)
    # x = np.arange(n).astype(np.float64)
    x_gpu = drv.to_device(x)

    trace(x_gpu, np.int32(4),
          block=(n, 1, 1), grid=(1, 1, 1), shared=8 * 128)

    x2 = drv.from_device_like(x_gpu, x)

    print(x)
    print(x2)

    assert np.allclose(x2[1], np.sum(x[:16]))
    assert np.allclose(x2[0], np.sum(x[16:]))
def test_stupid(self):
    n = 8
    x = np.random.random(2**(2 * n))
    x_gpu = drv.to_device(x)

    block = (128, 1, 1)
    grid = (2**(2 * n) // 128, 1, 1)

    swap(x_gpu, np.int32(4), np.int32(5), np.int32(n), grid=grid, block=block)

    x2 = drv.from_device_like(x_gpu, x)

    assert np.allclose(np.sum(x2), np.sum(x))
    assert not np.allclose(x, x2)
from cgen.cuda import CudaGlobal

mod = Module([
    FunctionBody(
        CudaGlobal(
            FunctionDeclaration(
                Value("void", "add"),
                arg_decls=[Pointer(POD(dtype, name))
                           for name in ["tgt", "op1", "op2"]])),
        Block([
            Initializer(
                POD(numpy.int32, "idx"),
                "threadIdx.x + %d*blockIdx.x" % (block_size * thread_strides)),
        ] + [
            Assign(
                "tgt[idx+%d]" % (o * block_size),
                "op1[idx+%d] + op2[idx+%d]" % (o * block_size, o * block_size))
            for o in range(thread_strides)
        ]))
])

mod = SourceModule(mod)

func = mod.get_function("add")
func(c_gpu, a_gpu, b_gpu,
     block=(block_size, 1, 1), grid=(macroblock_count, 1))

c = cuda.from_device_like(c_gpu, a)

assert la.norm(c - (a + b)) == 0
def _to_host(self, dev_array, host_array):
    return cuda.from_device_like(dev_array, host_array)
import pycuda.driver as cuda
import pycuda.autoinit  # optional: initialization, context creation, and cleanup can also be performed manually
from pycuda.compiler import SourceModule

import numpy

# Transfer data onto the device from a numpy array on the host.
a = numpy.random.randn(4, 4)  # generate a random array

# Conversion is required because a consists of double-precision numbers,
# but most nVidia devices only support single precision.
a = a.astype(numpy.float32)

# Allocate memory and transfer the data to the GPU.
a_gpu = cuda.to_device(a)

# Kernel that doubles each entry in a_gpu.
mod = SourceModule("""
    __global__ void doublify(float *a)
    {
        int idx = threadIdx.x + threadIdx.y*4;
        a[idx] *= 2;
    }
    """)

func = mod.get_function("doublify")
func(a_gpu, block=(4, 4, 1))

# Fetch the data back from the GPU and display it
# (allocation and copy combined in a single call).
a_doubled = cuda.from_device_like(devptr=a_gpu, other_ary=a)

print(a_doubled)
print(a)
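# For comparison, a minimal sketch of the same round trip done with an
# explicitly preallocated host array and memcpy_dtoh instead of
# from_device_like.
import numpy
import pycuda.autoinit
import pycuda.driver as cuda

a = numpy.random.randn(4, 4).astype(numpy.float32)
a_gpu = cuda.to_device(a)

a_back = numpy.empty_like(a)
cuda.memcpy_dtoh(a_back, a_gpu)  # device-to-host copy into the preallocated array

assert numpy.allclose(a_back, a)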
def test_memory(self):
    z = np.random.randn(400).astype(np.float32)
    new_z = drv.from_device_like(drv.to_device(z), z)
    assert la.norm(new_z - z) == 0
cuda.memcpy_htod(entropia_gpu, entropia_cpu.astype(np.float32))

phc0_cpu = array([0])
phc0_gpu = cuda.to_device(phc0_cpu.astype(np.int32))
phc1_cpu = array([0])
phc1_gpu = cuda.to_device(phc1_cpu.astype(np.int32))
pivot_cpu = array([0])
pivot_gpu = cuda.to_device(pivot_cpu.astype(np.int32))

phase_adj(phc0_gpu, phc1_gpu, pivot_gpu,
          block=(360, 1, 1),  # phase 0
          grid=(360, size))   # phase 1, pivot

entropia_cpu = cuda.from_device_like(entropia_gpu, entropia_cpu.astype(np.float32))
phc0_cpu = cuda.from_device_like(phc0_gpu, phc0_cpu.astype(np.int32))
phc1_cpu = cuda.from_device_like(phc1_gpu, phc1_cpu.astype(np.int32))
pivot_cpu = cuda.from_device_like(pivot_gpu, pivot_cpu.astype(np.int32))

gpu_time = timer() - start
gpu_h1 = ACMEentropy((phc0_cpu, phc1_cpu), FFT, pivot_cpu)[0]


def adjust(s, phc0, phc1, ref_ph1):
    L = len(s)
    s0 = s * exp(1j * (pi / 180) *
                 (phc0 + phc1 * (-(array(range(1, L + 1)) - ref_ph1) / float(L))))
    return s0
def go_sort(count, stream=None):
    grids = count / 8192

    keys = np.fromstring(np.random.bytes(count * 2), dtype=np.uint16)
    #keys = np.arange(count, dtype=np.uint16)
    #np.random.shuffle(keys)
    mkeys = np.reshape(keys, (grids, 8192))
    vals = np.arange(count, dtype=np.uint32)
    dkeys = cuda.to_device(keys)
    dvals = cuda.to_device(vals)
    print 'Done seeding'

    dpfxs = cuda.mem_alloc(grids * 256 * 4)
    doffsets = cuda.mem_alloc(count * 2)
    launch('prefix_scan_8_0', doffsets, dpfxs, dkeys,
           block=(512, 1, 1), grid=(grids, 1), stream=stream, l1=1)

    dsplit = cuda.mem_alloc(grids * 256 * 4)
    launch('better_split', dsplit, dpfxs,
           block=(32, 1, 1), grid=(grids / 32, 1), stream=stream)

    # This stage will be rejiggered along with the split
    launch('prefix_sum', dpfxs, np.int32(grids * 256),
           block=(256, 1, 1), grid=(1, 1), stream=stream, l1=1)

    launch('convert_offsets', doffsets, dsplit, dkeys, i32(0),
           block=(1024, 1, 1), grid=(grids, 1), stream=stream)

    if not stream:
        offsets = cuda.from_device(doffsets, (grids, 8192), np.uint16)
        split = cuda.from_device(dsplit, (grids, 256), np.uint32)
        pfxs = cuda.from_device(dpfxs, (grids, 256), np.uint32)
        tkeys = py_radix_sort_maybe(mkeys, offsets, pfxs, split, 0)
        #print frle(tkeys & 0xff)

    d_skeys = cuda.mem_alloc(count * 2)
    d_svals = cuda.mem_alloc(count * 4)
    if not stream:
        cuda.memset_d32(d_skeys, 0, count / 2)
        cuda.memset_d32(d_svals, 0xffffffff, count)

    launch('radix_sort_maybe', d_skeys, d_svals,
           dkeys, dvals, doffsets, dpfxs, dsplit, i32(0),
           block=(1024, 1, 1), grid=(grids, 1), stream=stream, l1=1)

    if not stream:
        skeys = cuda.from_device_like(d_skeys, keys)
        svals = cuda.from_device_like(d_svals, vals)

        # Test integrity of sort (keys and values kept together):
        #   skeys[i] = keys[svals[i]] for all i
        print 'Integrity: ',
        if np.all(svals < len(keys)) and np.all(skeys == keys[svals]):
            print 'pass'
        else:
            print 'FAIL'

    dkeys, d_skeys = d_skeys, dkeys
    dvals, d_svals = d_svals, dvals

    if not stream:
        cuda.memset_d32(d_skeys, 0, count / 2)
        cuda.memset_d32(d_svals, 0xffffffff, count)

    launch('prefix_scan_8_8', doffsets, dpfxs, dkeys,
           block=(512, 1, 1), grid=(grids, 1), stream=stream, l1=1)

    launch('better_split', dsplit, dpfxs,
           block=(32, 1, 1), grid=(grids / 32, 1), stream=stream)

    launch('prefix_sum', dpfxs, np.int32(grids * 256),
           block=(256, 1, 1), grid=(1, 1), stream=stream, l1=1)

    if not stream:
        pre_offsets = cuda.from_device(doffsets, (grids, 8192), np.uint16)

    launch('convert_offsets', doffsets, dsplit, dkeys, i32(8),
           block=(1024, 1, 1), grid=(grids, 1), stream=stream)

    if not stream:
        offsets = cuda.from_device(doffsets, (grids, 8192), np.uint16)
        split = cuda.from_device(dsplit, (grids, 256), np.uint32)
        pfxs = cuda.from_device(dpfxs, (grids, 256), np.uint32)
        tkeys = np.reshape(tkeys, (grids, 8192))
        new_offs = py_convert_offsets(pre_offsets, split, tkeys, 8)
        print np.nonzero(new_offs != offsets)
        fkeys = py_radix_sort_maybe(tkeys, new_offs, pfxs, split, 8)
        #print frle(fkeys)

    launch('radix_sort_maybe', d_skeys, d_svals,
           dkeys, dvals, doffsets, dpfxs, dsplit, i32(8),
           block=(1024, 1, 1), grid=(grids, 1), stream=stream, l1=1)

    if not stream:
        #print cuda.from_device(doffsets, (4, 8192), np.uint16)
        #print cuda.from_device(dkeys, (4, 8192), np.uint16)
        #print cuda.from_device(d_skeys, (4, 8192), np.uint16)
        skeys = cuda.from_device_like(d_skeys, keys)
        svals = cuda.from_device_like(d_svals, vals)

        print 'Integrity: ',
        if np.all(svals < len(keys)) and np.all(skeys == keys[svals]):
            print 'pass'
        else:
            print 'FAIL'

        sorted_keys = np.sort(keys)
        # Test that ordering is correct. (Note that we don't need 100%
        # correctness, so this test should be made "soft".)
        print 'Order: ', 'pass' if np.all(skeys == sorted_keys) else 'FAIL'
def sync_from_device(self):
    object_array = cuda.from_device_like(self.d_object_array, self.object_array)
    self.object_list = [self.object_class.from_array([int(ptr) for ptr in obj])
                        for obj in object_array]
u0 = np.full(n * n, 0., dtype=np.float64)
u = np.full(n * n, 0., dtype=np.float64)
nn = np.full(1, n, dtype=np.int64)
th2 = np.full(1, dt / h / h, dtype=np.float64)

st = time()

u0_gpu = cuda.to_device(u0)
u_gpu = cuda.to_device(u)
n_gpu = cuda.to_device(nn)
th2_gpu = cuda.to_device(th2)

func = mod.get_function("NextStpGPU")
for i in range(0, int(nstp / 2)):
    func(n_gpu, th2_gpu, u0_gpu, u_gpu,
         block=(blockdim[0], blockdim[1], 1),
         grid=(griddim[0], griddim[1], 1))
    func(n_gpu, th2_gpu, u_gpu, u0_gpu,
         block=(blockdim[0], blockdim[1], 1),
         grid=(griddim[0], griddim[1], 1))

u0 = cuda.from_device_like(u0_gpu, u0)
print('time on GPU = ', time() - st)