Example #1
    def test_normal_hadamard_squares_to_one(self):

        ptm = np.array(
            [[0.5, np.sqrt(0.5), 0, 0.5], [np.sqrt(0.5), 0, 0, -np.sqrt(0.5)],
             [0, 0, -1, 0], [0.5, -np.sqrt(0.5), 0, 0.5]], np.float64)

        ptm_gpu = drv.to_device(ptm)

        dm = np.random.random((512, 512))

        dm_gpu = drv.to_device(dm)

        single_qubit_ptm(dm_gpu,
                         ptm_gpu,
                         np.int32(2),
                         np.int32(9),
                         block=(512, 1, 1),
                         grid=(512, 1, 1),
                         shared=8 * (16 + 512))

        dm2 = drv.from_device_like(dm_gpu, dm)
        assert not np.allclose(dm, dm2)

        single_qubit_ptm(dm_gpu,
                         ptm_gpu,
                         np.int32(2),
                         np.int32(9),
                         block=(512, 1, 1),
                         grid=(512, 1, 1),
                         shared=8 * (16 + 512))

        dm2 = drv.from_device_like(dm_gpu, dm)

        assert np.allclose(dm2, dm)
Example #2
    def test_single_ptm_as_two_ptm_hadamard_squares(self):
        single_hadamard = np.array(
            [[1, 0, 0, 0], [0, 0, 1, 0], [0, 1, 0, 0], [0, 0, 0, 1]],
            np.float64)

        single_ptm_gpu = drv.to_device(single_hadamard)

        dm = np.random.random((512, 512))

        dm_gpu = drv.to_device(dm)

        single_qubit_ptm(dm_gpu, single_ptm_gpu, np.int32(2), np.int32(
            9), block=(512, 1, 1), grid=(512, 1, 1), shared=8 * (16 + 512))

        dm2 = drv.from_device_like(dm_gpu, dm)
        assert not np.allclose(dm, dm2)

        double_hadamard = np.kron(single_hadamard, single_hadamard)
        double_ptm_gpu = drv.to_device(double_hadamard)

        two_qubit_ptm(dm_gpu, double_ptm_gpu, np.int32(2), np.int32(0), np.int32(
            9), block=(512, 1, 1), grid=(512, 1, 1), shared=8 * (256 + 512))

        dm2 = drv.from_device_like(dm_gpu, dm)

        assert not np.allclose(dm2, dm)

        single_qubit_ptm(dm_gpu, single_ptm_gpu, np.int32(0), np.int32(
            9), block=(512, 1, 1), grid=(512, 1, 1), shared=8 * (16 + 512))

        dm2 = drv.from_device_like(dm_gpu, dm)

        assert np.allclose(dm2, dm)
Example #3
  def nlargest(self, n):
    """Returns the per-individual threshold above which there are n outputs.
    
    @param n: number of outputs which should be above the threshold
    @type n: int

    @return: list of thresholds, in order of individuals, which delimit the
             top n output values
    """
    log.debug("enter nlargest with n=%d", n)

    # Find one more output so that we can use strictly-less-than when counting
    # and underestimate lift rather than overestimating it.
    n = n + 1

    passSizes = []
    while n > 0:
      nextSize = min(self.maxHeapFloats, n)
      passSizes.append(nextSize)
      n -= nextSize

    log.debug("pass sizes: %r", passSizes)
    
    thresholdsMat = np.ones(shape=(self.popSize,),
                            dtype=np.float32) * np.inf
    self.thresholds = driver.to_device(thresholdsMat)

    uintBytes = np.dtype(np.uint32).itemsize
    thresholdCounts = np.zeros(shape=(self.popSize,),
                               dtype=np.uint32)
    self.thresholdCounts = driver.to_device(thresholdCounts)

    for passSize in passSizes:
      log.debug("begin pass size %d", passSize)
      self.nlargestKernel.prepared_call(self.nlargestGridDim,
                                        self.outputs,
                                        self.trainSet.size,
                                        self.popSize,
                                        passSize,
                                        self.thresholds,
                                        self.thresholdCounts)

      driver.Context.synchronize()

      if log.isEnabledFor(logging.DEBUG):
        thresholdsMat = driver.from_device_like(self.thresholds, thresholdsMat)
        log.debug("thresholds: %s", str(thresholdsMat))
        
        thresholdCounts = driver.from_device_like(self.thresholdCounts, thresholdCounts)
        log.debug("thresholdCounts: %s", str(thresholdCounts))

    self.thresholdsMat = driver.from_device_like(self.thresholds, thresholdsMat)
    return self.thresholdsMat
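
For reference, a minimal NumPy sketch of the quantity this routine computes, under the assumption that the outputs buffer holds a (popSize, trainSet.size) float32 matrix (the kernel operates on the flat device buffer, so the helper below is hypothetical and only illustrates the math): each individual's threshold is its (n+1)-th largest output, so that the number of outputs strictly greater than the threshold is at most n, matching the n = n + 1 underestimate above.

import numpy as np

def nlargest_reference(outputs, n):
  """Host-side sketch: per-individual (n+1)-th largest output value."""
  # np.partition places the element with sorted index -(n + 1) in position for
  # each row, i.e. the (n + 1)-th largest output of that individual.
  return np.partition(outputs, -(n + 1), axis=1)[:, -(n + 1)]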
Example #4
    def test_reshuffle_invertible(self):
        dm = random_dm10()

        dm_gpu = drv.to_device(dm)

        for i in range(no_qubits):
            bit_to_pauli_basis(dm_gpu, np.int32(1 << i), np.int32(no_qubits),
                               block=block, grid=grid)

        dmreal = np.zeros(2**(2 * no_qubits))
        dmreal_gpu = drv.to_device(dmreal)

        pauli_reshuffle(dm_gpu, dmreal_gpu, np.int32(no_qubits), np.int32(0),
                        block=block, grid=grid)

        dm_gpu2 = drv.mem_alloc(dm.nbytes)
        drv.memset_d8(dm_gpu2, 0, dm.nbytes)

        pauli_reshuffle(dm_gpu2, dmreal_gpu, np.int32(no_qubits), np.int32(1),
                        block=block, grid=grid)

        for i in range(no_qubits):
            bit_to_pauli_basis(dm_gpu2, np.int32(1 << i), np.int32(no_qubits),
                               block=block, grid=grid)

        dm2 = drv.from_device_like(dm_gpu2, dm)

        assert np.allclose(dm, dm2)
Example #5
def synchronize(d_tlead1, d_tlead2, d_tlead3, length):
    # Number of points to use to synchronize
    chunk = ecg.sampling_rate * 2
    template = numpy.zeros(chunk).astype(numpy.int32)
    tlead1 = cuda.from_device_like(d_tlead1, template)
    tlead2 = cuda.from_device_like(d_tlead2, template)
    tlead3 = cuda.from_device_like(d_tlead3, template)
    start1 = numpy.argmax(tlead1)
    start2 = numpy.argmax(tlead2)
    start3 = numpy.argmax(tlead3)
    minstart = min(start1, start2, start3)
    offset1 = start1 - minstart
    offset2 = start2 - minstart
    offset3 = start3 - minstart
    new_length = length - minstart
    return (offset1, offset2, offset3, new_length)
Example #6
    def device_to_local(self):
        """
        Copies device memory to local host memory.
        """

        self.local_array = drv.from_device_like(self.device_ptr,
                                                self.local_array)
Example #7
def synchronize(d_tlead1, d_tlead2, d_tlead3, length, sampling_rate):
    # Number of points to use to synchronize
    chunk = sampling_rate * 2
    template = numpy.zeros(chunk).astype(numpy.int32)
    tlead1 = cuda.from_device_like(d_tlead1, template)
    tlead2 = cuda.from_device_like(d_tlead2, template)
    tlead3 = cuda.from_device_like(d_tlead3, template)
    start1 = numpy.argmax(tlead1)
    start2 = numpy.argmax(tlead2)
    start3 = numpy.argmax(tlead3)
    minstart = min(start1, start2, start3)
    maxstart = max(start1, start2, start3)
    offset1 = start1 - minstart
    offset2 = start2 - minstart
    offset3 = start3 - minstart
    new_length = length - (maxstart - minstart)
    return (offset1, offset2, offset3, new_length)
Example #8
def examine_subsets_cuda(task, A, N, K, threads_per_block):
    # Unpack the task tuple
    subset_start, subset_end = task

    # Create CUDA context
    cuda.init()
    ctx = make_default_context()

    KFAC = math.factorial(K)

    # Keep track of the stride
    stride = subset_end - subset_start

    # Copy A to the GPU
    device_A = cuda.to_device(A.astype(np.int16))

    # Create results array
    results = np.zeros(stride, dtype=np.int32)

    # Copy results array
    device_results = cuda.to_device(results)

    # Number of CUDA blocks
    cuda_blocks = (stride + threads_per_block - 1) // threads_per_block

    # Compile CUDA kernel
    mod = SourceModule(
        open(
            "/Users/rsearles/Documents/Repositories/cisc849-16s/project/src/Spike_Neural_Nets/subsets.cu",
            "r").read())
    kernel = mod.get_function("examine_subsets")

    # Run kernel
    kernel(np.int32(N),
           np.int32(K),
           np.int64(subset_start),
           np.int64(subset_end),
           np.int32(KFAC),
           device_A,
           device_results,
           block=(threads_per_block, 1, 1),
           grid=(cuda_blocks, 1))

    # Copy results back
    results = cuda.from_device_like(device_results, results)

    # Free GPU memory
    device_results.free()
    device_A.free()

    # Pop CUDA context (driver yells otherwise)
    ctx.pop()

    # Return the counts
    counts = Counter(results)
    return counts
Example #9
    def test_large(self):
        n = 1024
        x = np.random.random(n)

        x_gpu = drv.to_device(x)

        trace(x_gpu, np.int32(-1), block=(n, 1, 1),
              grid=(1, 1, 1), shared=8 * n)

        x2 = drv.from_device_like(x_gpu, x)

        assert np.allclose(x2[0], np.sum(x))
Example #10
    def test_diag_preserve(self):
        dm = random_dm10()

        dm_gpu = drv.to_device(dm)

        for i in range(no_qubits):
            bit_to_pauli_basis(dm_gpu, np.int32(1 << i), np.int32(no_qubits),
                               block=block, grid=grid)

        dm2 = drv.from_device_like(dm_gpu, dm)

        assert np.allclose(np.diag(dm), np.diag(dm2))
Example #11
    def test_identity(self):
        ptm = np.eye(4)

        ptm_gpu = drv.to_device(ptm)

        dm = np.random.random((512, 512))

        dm_gpu = drv.to_device(dm)

        single_qubit_ptm(dm_gpu, ptm_gpu, np.int32(2), np.int32(
            9), block=(512, 1, 1), grid=(512, 1, 1), shared=8 * (16 + 512))

        dm2 = drv.from_device_like(dm_gpu, dm)

        assert np.allclose(dm2, dm)
Example #12
    def test_identity_big(self):
        ptm = np.eye(16, dtype=np.float64)

        ptm_gpu = drv.to_device(ptm)

        dm = np.random.random((512, 512))

        dm_gpu = drv.to_device(dm)

        two_qubit_ptm(dm_gpu, ptm_gpu, np.int32(6), np.int32(2), np.int32(
            9), block=(512, 1, 1), grid=(512, 1, 1), shared=8 * (256 + 512))

        dm2 = drv.from_device_like(dm_gpu, dm)

        assert np.allclose(dm2, dm)
Example #13
    def test_identity_very_small(self):
        ptm = np.eye(16, dtype=np.float64)

        ptm_gpu = drv.to_device(ptm)

        dm = np.random.random((16, 16))

        dm_gpu = drv.to_device(dm)

        two_qubit_ptm(dm_gpu, ptm_gpu, np.int32(1), np.int32(0), np.int32(
            3), block=(16, 1, 1), grid=(16, 1, 1), shared=8 * (256 + 16))

        dm2 = drv.from_device_like(dm_gpu, dm)

        assert np.allclose(dm2, dm)
Example #14
    def test_all_real_or_imag(self):
        dm = random_dm10()

        dm_gpu = drv.to_device(dm)

        for i in range(no_qubits):
            bit_to_pauli_basis(dm_gpu, np.int32(1 << i), np.int32(no_qubits),
                               block=block, grid=grid)

        dm2 = drv.from_device_like(dm_gpu, dm)

        where_real = dm2.real == dm2
        where_imag = 1j * dm2.imag == dm2
        where_all = where_real + where_imag

        assert np.all(where_all)
Example #15
    def test_sum_bit_high(self):

        n = 32
        x = np.random.random(n)
        # x = np.arange(n).astype(np.float64)

        x_gpu = drv.to_device(x)

        trace(x_gpu, np.int32(4), block=(n, 1, 1), grid=(1, 1, 1),
              shared=8 * 128)

        x2 = drv.from_device_like(x_gpu, x)

        print(x)
        print(x2)

        assert np.allclose(x2[1], np.sum(x[:16]))
        assert np.allclose(x2[0], np.sum(x[16:]))
Example #16
    def test_stupid(self):

        n = 8

        x = np.random.random(2**(2 * n))

        x_gpu = drv.to_device(x)

        block = (128, 1, 1)
        grid = (2**(2 * n) // 128, 1, 1)

        swap(x_gpu,
             np.int32(4),
             np.int32(5),
             np.int32(n),
             grid=grid,
             block=block)

        x2 = drv.from_device_like(x_gpu, x)

        assert np.allclose(np.sum(x2), np.sum(x))

        assert not np.allclose(x, x2)
Example #17
from cgen import (Assign, Block, FunctionBody, FunctionDeclaration,
                  Initializer, Module, POD, Pointer, Value)
from cgen.cuda import CudaGlobal

mod = Module([
    FunctionBody(
        CudaGlobal(
            FunctionDeclaration(Value("void", "add"),
                                arg_decls=[
                                    Pointer(POD(dtype, name))
                                    for name in ["tgt", "op1", "op2"]
                                ])),
        Block([
            Initializer(
                POD(numpy.int32, "idx"), "threadIdx.x + %d*blockIdx.x" %
                (block_size * thread_strides)),
        ] + [
            Assign(
                "tgt[idx+%d]" % (o * block_size), "op1[idx+%d] + op2[idx+%d]" %
                (o * block_size, o * block_size))
            for o in range(thread_strides)
        ]))
])

mod = SourceModule(mod)

func = mod.get_function("add")
func(c_gpu, a_gpu, b_gpu, block=(block_size, 1, 1), grid=(macroblock_count, 1))

c = cuda.from_device_like(c_gpu, a)

assert la.norm(c - (a + b)) == 0
Example #18
    def _to_host(self, dev_array, host_array):
        return cuda.from_device_like(dev_array, host_array)
Example #19
from pycuda.compiler import SourceModule
import pycuda.driver as cuda
import pycuda.autoinit  # optional: initialization, context creation, and cleanup can also be done manually

import numpy  # data is transferred to the device from numpy arrays on the host

a = numpy.random.randn(4, 4)  # generate a random array
# conversion is required because a contains double-precision numbers,
# but many NVIDIA devices only support single precision
a = a.astype(numpy.float32)

# allocate memory & transfer data to gpu
a_gpu = cuda.to_device(a)

# write code to double each entry in a_gpu
mod = SourceModule("""
    __global__ void doublify(float *a)
    {
        int idx = threadIdx.x + threadIdx.y*4;
        a[idx] *= 2;
    }
""")

func = mod.get_function("doublify")
func(a_gpu, block=(4, 4, 1))

# fetch the data back from the GPU and display it;
# from_device_like allocates a host array shaped like `a` and copies the data back in one call
a_doubled = cuda.from_device_like(devptr=a_gpu, other_ary=a)
print(a_doubled)
print(a)
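
For comparison, to_device and from_device_like are convenience wrappers around the explicit allocate-and-copy calls; the following sketch spells out the same round trip with mem_alloc, memcpy_htod, and memcpy_dtoh, reusing a and func from the example above:

a_gpu = cuda.mem_alloc(a.nbytes)    # allocate device memory the size of a
cuda.memcpy_htod(a_gpu, a)          # copy host -> device
func(a_gpu, block=(4, 4, 1))        # run the doublify kernel on the fresh copy
a_doubled = numpy.empty_like(a)     # host array with the same shape and dtype as a
cuda.memcpy_dtoh(a_doubled, a_gpu)  # copy device -> host
print(a_doubled)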
Example #21
    def test_memory(self):
        z = np.random.randn(400).astype(np.float32)
        new_z = drv.from_device_like(drv.to_device(z), z)
        assert la.norm(new_z - z) == 0
Example #23
cuda.memcpy_htod(entropia_gpu, entropia_cpu.astype(np.float32))
phc0_cpu = array([0])
phc0_gpu = cuda.to_device(phc0_cpu.astype(np.int32))
phc1_cpu = array([0])
phc1_gpu = cuda.to_device(phc1_cpu.astype(np.int32))
pivot_cpu = array([0])
pivot_gpu = cuda.to_device(pivot_cpu.astype(np.int32))

phase_adj(
    phc0_gpu,
    phc1_gpu,
    pivot_gpu,
    block=(360, 1, 1),  # phase 0
    grid=(360, size))  # phase 1, pivot

entropia_cpu = cuda.from_device_like(entropia_gpu,
                                     entropia_cpu.astype(np.float32))
phc0_cpu = cuda.from_device_like(phc0_gpu, phc0_cpu.astype(np.int32))
phc1_cpu = cuda.from_device_like(phc1_gpu, phc1_cpu.astype(np.int32))
pivot_cpu = cuda.from_device_like(pivot_gpu, pivot_cpu.astype(np.int32))
gpu_time = timer() - start
gpu_h1 = ACMEentropy((phc0_cpu, phc1_cpu), FFT, pivot_cpu)[0]


def adjust(s, phc0, phc1, ref_ph1):
    L = len(s)
    s0 = s * exp(1j * (pi / 180) *
                 (phc0 + phc1 *
                  (-(array(range(1, L + 1)) - ref_ph1) / float(L))))
    return s0

Example #24
def go_sort(count, stream=None):
    grids = count / 8192

    keys = np.fromstring(np.random.bytes(count*2), dtype=np.uint16)
    #keys = np.arange(count, dtype=np.uint16)
    #np.random.shuffle(keys)
    mkeys = np.reshape(keys, (grids, 8192))
    vals = np.arange(count, dtype=np.uint32)
    dkeys = cuda.to_device(keys)
    dvals = cuda.to_device(vals)
    print 'Done seeding'

    dpfxs = cuda.mem_alloc(grids * 256 * 4)
    doffsets = cuda.mem_alloc(count * 2)
    launch('prefix_scan_8_0', doffsets, dpfxs, dkeys,
            block=(512, 1, 1), grid=(grids, 1), stream=stream, l1=1)

    dsplit = cuda.mem_alloc(grids * 256 * 4)
    launch('better_split', dsplit, dpfxs,
            block=(32, 1, 1), grid=(grids / 32, 1), stream=stream)

    # This stage will be rejiggered along with the split
    launch('prefix_sum', dpfxs, np.int32(grids * 256),
            block=(256, 1, 1), grid=(1, 1), stream=stream, l1=1)

    launch('convert_offsets', doffsets, dsplit, dkeys, i32(0),
            block=(1024, 1, 1), grid=(grids, 1), stream=stream)
    if not stream:
        offsets = cuda.from_device(doffsets, (grids, 8192), np.uint16)
        split = cuda.from_device(dsplit, (grids, 256), np.uint32)
        pfxs = cuda.from_device(dpfxs, (grids, 256), np.uint32)
        tkeys = py_radix_sort_maybe(mkeys, offsets, pfxs, split, 0)
        #print frle(tkeys & 0xff)

    d_skeys = cuda.mem_alloc(count * 2)
    d_svals = cuda.mem_alloc(count * 4)
    if not stream:
        cuda.memset_d32(d_skeys, 0, count/2)
        cuda.memset_d32(d_svals, 0xffffffff, count)
    launch('radix_sort_maybe', d_skeys, d_svals,
            dkeys, dvals, doffsets, dpfxs, dsplit, i32(0),
            block=(1024, 1, 1), grid=(grids, 1), stream=stream, l1=1)

    if not stream:
        skeys = cuda.from_device_like(d_skeys, keys)
        svals = cuda.from_device_like(d_svals, vals)

        # Test integrity of sort (keys and values kept together):
        #   skeys[i] = keys[svals[i]] for all i
        print 'Integrity: ',
        if np.all(svals < len(keys)) and np.all(skeys == keys[svals]):
            print 'pass'
        else:
            print 'FAIL'

    dkeys, d_skeys = d_skeys, dkeys
    dvals, d_svals = d_svals, dvals

    if not stream:
        cuda.memset_d32(d_skeys, 0, count/2)
        cuda.memset_d32(d_svals, 0xffffffff, count)

    launch('prefix_scan_8_8', doffsets, dpfxs, dkeys,
            block=(512, 1, 1), grid=(grids, 1), stream=stream, l1=1)
    launch('better_split', dsplit, dpfxs,
            block=(32, 1, 1), grid=(grids / 32, 1), stream=stream)
    launch('prefix_sum', dpfxs, np.int32(grids * 256),
            block=(256, 1, 1), grid=(1, 1), stream=stream, l1=1)
    if not stream:
        pre_offsets = cuda.from_device(doffsets, (grids, 8192), np.uint16)
    launch('convert_offsets', doffsets, dsplit, dkeys, i32(8),
            block=(1024, 1, 1), grid=(grids, 1), stream=stream)
    if not stream:
        offsets = cuda.from_device(doffsets, (grids, 8192), np.uint16)
        split = cuda.from_device(dsplit, (grids, 256), np.uint32)
        pfxs = cuda.from_device(dpfxs, (grids, 256), np.uint32)
        tkeys = np.reshape(tkeys, (grids, 8192))

        new_offs = py_convert_offsets(pre_offsets, split, tkeys, 8)
        print np.nonzero(new_offs != offsets)
        fkeys = py_radix_sort_maybe(tkeys, new_offs, pfxs, split, 8)
        #print frle(fkeys)

    launch('radix_sort_maybe', d_skeys, d_svals,
            dkeys, dvals, doffsets, dpfxs, dsplit, i32(8),
            block=(1024, 1, 1), grid=(grids, 1), stream=stream, l1=1)

    if not stream:
        #print cuda.from_device(doffsets, (4, 8192), np.uint16)
        #print cuda.from_device(dkeys, (4, 8192), np.uint16)
        #print cuda.from_device(d_skeys, (4, 8192), np.uint16)

        skeys = cuda.from_device_like(d_skeys, keys)
        svals = cuda.from_device_like(d_svals, vals)

        print 'Integrity: ',
        if np.all(svals < len(keys)) and np.all(skeys == keys[svals]):
            print 'pass'
        else:
            print 'FAIL'

        sorted_keys = np.sort(keys)
        # Test that ordering is correct. (Note that we don't need 100%
        # correctness, so this test should be made "soft".)
        print 'Order: ', 'pass' if np.all(skeys == sorted_keys) else 'FAIL'
Example #26
    def sync_from_device(self):
        object_array = cuda.from_device_like(self.d_object_array,
                                             self.object_array)
        self.object_list = [self.object_class.from_array([int(ptr) for ptr in obj])
                            for obj in object_array]
Example #27
u0 = np.full(n * n, 0., dtype=np.float64)
u = np.full(n * n, 0., dtype=np.float64)
nn = np.full(1, n, dtype=np.int64)
th2 = np.full(1, dt / h / h, dtype=np.float64)

st = time()

u0_gpu = cuda.to_device(u0)
u_gpu = cuda.to_device(u)
n_gpu = cuda.to_device(nn)
th2_gpu = cuda.to_device(th2)

func = mod.get_function("NextStpGPU")
for i in range(0, int(nstp / 2)):
    func(n_gpu,
         th2_gpu,
         u0_gpu,
         u_gpu,
         block=(blockdim[0], blockdim[1], 1),
         grid=(griddim[0], griddim[1], 1))
    func(n_gpu,
         th2_gpu,
         u_gpu,
         u0_gpu,
         block=(blockdim[0], blockdim[1], 1),
         grid=(griddim[0], griddim[1], 1))

u0 = cuda.from_device_like(u0_gpu, u0)

print('time on GPU = ', time() - st)