Example #1
# Imports added for completeness.  get_cuda_function, MultiArrayPointers and
# log2ceil are project-local helpers and are assumed to be in scope; an
# initialized CUDA context (e.g. via pycuda.autoinit) is also assumed.
from collections import OrderedDict

import numpy as np
import pycuda.driver as cuda


def shared_storage_multiarray(threads_per_block, block_count):
    test = get_cuda_function('shared_storage.cu',
                             'test_shared_storage_multiarray')

    total_thread_count = threads_per_block * block_count
    data = OrderedDict([
        ('a', np.empty(total_thread_count, dtype=np.int32)),
        ('b', np.empty(total_thread_count, dtype=np.uint16)),
        ('c', np.empty(total_thread_count, dtype=np.float32)),
    ])
    # list() keeps the call compatible with Python 3 dict views.
    data_struct = MultiArrayPointers(list(data.values()), copy_to=False)
    thread_contexts = np.empty((total_thread_count, 2), dtype=np.uint32)

    # Store capacity as array so kernel can pass back final occupancy count.
    capacity = np.array([total_thread_count], dtype=np.uint32)

    block = (threads_per_block, 1, 1)
    grid = (block_count, 1, 1)

    print('threads_per_block: %d' % threads_per_block)
    print('block_count: %d' % block_count)

    test(cuda.InOut(capacity), cuda.Out(thread_contexts),
         data_struct.struct_ptr, block=block, grid=grid)
    return capacity[0], data_struct, thread_contexts
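
A minimal usage sketch for the function above; the launch configuration (128
threads per block, 2 blocks) is illustrative, and a CUDA-capable device plus
the shared_storage.cu source consumed by get_cuda_function are assumed:

occupancy, data_struct, contexts = shared_storage_multiarray(128, 2)
print('final occupancy: %d of %d' % (occupancy, 128 * 2))
print(contexts.shape)  # one two-element context record per launched thread
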
Example #2
def scatter_gather(in_data, scatter_lists, scatter_list_order=None, dtype=None,
        thread_count=None, block_count=None):
    if dtype is None:
        dtype = in_data.dtype
    test = get_cuda_function('scatter_gather.cu', 'k_scatter', dtype)
    data = np.array(in_data, dtype=dtype)
    data_count = np.int32(len(data))
    k = np.int32(len(scatter_lists[0]))
    scatter_count = np.int32(len(scatter_lists))
    scatter_lists = np.concatenate(scatter_lists).astype(np.int32)
    gathered_data = np.empty_like(scatter_lists, dtype=dtype)

    default_thread_count = 1 << log2ceil(test.get_attribute(
            cuda.function_attribute.MAX_THREADS_PER_BLOCK))

    if thread_count is None:
        thread_count = int(min(scatter_count, default_thread_count))
    if block_count is None:
        # float() guards against integer truncation under Python 2.
        block_count = int(np.ceil(scatter_count / float(thread_count)))

    block = (thread_count, 1, 1)
    grid = (block_count, 1, 1)

    # The number of scatter lists processed by each thread block (except the
    # first, which also handles the remainder when scatter_count does not
    # divide evenly by block_count).
    common_scatter_count = scatter_count // block_count

    # When scatter_count does not divide evenly by block_count, this is the
    # number of extra scatter lists the first thread block must process.
    odd_scatter_count = scatter_count % block_count

    shared = (int((common_scatter_count + odd_scatter_count) * k)
              * np.dtype(dtype).itemsize)
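
    # Worked example with illustrative numbers (not from the original code):
    # for scatter_count == 10 and block_count == 4, common_scatter_count == 2
    # and odd_scatter_count == 2, so the busiest (first) block handles
    # 2 + 2 == 4 lists and the launch reserves 4 * k * itemsize bytes of
    # dynamic shared memory per block.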

    print('thread_count: %d' % thread_count)
    print('block_count: %d' % block_count)
    print('shared mem/block: %d' % shared)

    if scatter_list_order is None:
        # Default to the identity ordering over the scatter lists; note that
        # scatter_lists has already been flattened above, so the number of
        # lists is scatter_count rather than len(scatter_lists).
        scatter_list_order = np.arange(scatter_count, dtype=np.uint32)

    test(k, data_count, cuda.InOut(data), scatter_count, cuda.In(scatter_lists),
            cuda.In(scatter_list_order), cuda.Out(gathered_data),
            block=block, grid=grid, shared=shared)

    return gathered_data
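
A hedged usage sketch for scatter_gather: the input values and the two
length-4 index lists below are invented for illustration, and the exact
layout of the gathered output depends on the k_scatter kernel in
scatter_gather.cu.

in_data = np.arange(16, dtype=np.float32)
scatter_lists = [[3, 1, 4, 1], [5, 9, 2, 6]]  # scatter_count = 2, k = 4
gathered = scatter_gather(in_data, scatter_lists)
print(gathered)  # gathered values; layout depends on the kernel
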
Example #3
def shared_storage(threads_per_block=1024, block_count=1):
    dtype = np.dtype('int32')
    test = get_cuda_function('shared_storage.cu', 'test_shared_storage', dtype)
    total_thread_count = threads_per_block * block_count
    data = np.empty(total_thread_count, dtype=dtype)
    thread_contexts = np.empty((total_thread_count, 2), dtype=np.uint32)

    # Store capacity as array so kernel can pass back final occupancy count.
    capacity = np.array([total_thread_count], dtype=np.uint32)

    block = (threads_per_block, 1, 1)
    grid = (block_count, 1, 1)

    print('threads_per_block: %d' % threads_per_block)
    print('block_count: %d' % block_count)

    test(cuda.InOut(capacity), cuda.Out(thread_contexts), cuda.Out(data),
            block=block, grid=grid)
    return capacity[0], data, thread_contexts
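
As with the multi-array variant above, a short usage sketch with an
illustrative configuration (a CUDA-capable device and the shared_storage.cu
source are assumed):

occupancy, data, contexts = shared_storage(threads_per_block=256, block_count=4)
print('final occupancy: %d' % occupancy)
print(data.shape, contexts.shape)  # (1024,) and (1024, 2)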