def test_multi_queue_out_of_queues(mock_4_device_context): context = mock_4_device_context queue0 = Queue(context.devices[0]) queue0_2 = Queue(context.devices[0]) queue1 = Queue(context.devices[1]) mqueue = MultiQueue([queue0, queue1]) assert set( mqueue.queues.keys()) == {context.devices[0], context.devices[1]} assert mqueue.devices == context.devices[[0, 1]]
def test_extract_dependencies(mock_context): queue = Queue(mock_context.device) virtual_alloc = TrivialManager(mock_context.device).allocator() vbuf = virtual_alloc(mock_context.device, 100) varr = Array.empty(mock_context.device, 100, numpy.int32, allocator=virtual_alloc) assert extract_dependencies(vbuf) == {vbuf._buffer_adapter._id} assert extract_dependencies(varr) == {varr.data._buffer_adapter._id} assert extract_dependencies( [vbuf, varr]) == {vbuf._buffer_adapter._id, varr.data._buffer_adapter._id} class DependencyHolder: __virtual_allocations__ = [vbuf, varr] assert extract_dependencies(DependencyHolder()) == { vbuf._buffer_adapter._id, varr.data._buffer_adapter._id } # An object not having any dependencies assert extract_dependencies(123) == set()
def test_empty(mock_or_real_context): context, _mocked = mock_or_real_context queue = Queue(context.device) arr_dev = Array.empty(context.device, 100, numpy.int32) arr = arr_dev.get(queue) assert arr.shape == (100, ) assert arr.dtype == numpy.int32
def check_func(context, func_module, reference_func, out_dtype, in_dtypes, atol=1e-8, rtol=1e-5, is_mocked=False): N = 256 full_src = get_func_kernel(func_module, out_dtype, in_dtypes) # Can't test anything else if we don't have a real context if is_mocked: return program = Program(context.device, full_src) test = program.kernel.test queue = Queue(context.device) arrays = [get_test_array(N, dt, no_zeros=True, high=8) for dt in in_dtypes] arrays_dev = [Array.from_host(queue, array) for array in arrays] dest_dev = Array.empty(context.device, N, out_dtype) test(queue, N, None, dest_dev, *arrays_dev) assert numpy.allclose(dest_dev.get(queue), reference_func(*arrays).astype(out_dtype), atol=atol, rtol=rtol)
def test_compile_static(mock_or_real_context): context, mocked = mock_or_real_context if mocked: kernel = MockKernel('multiply', [None, None, None], max_total_local_sizes={0: 1024}) src = MockDefTemplate(kernels=[kernel]) else: src = SRC a = numpy.arange(11).astype(numpy.int32) b = numpy.arange(15).astype(numpy.int32) ref = numpy.outer(a, b) queue = Queue(context.device) a_dev = Array.from_host(queue, a) b_dev = Array.from_host(queue, b) res_dev = Array.empty(context.device, (11, 15), numpy.int32) multiply = StaticKernel(context.device, src, 'multiply', (11, 15)) multiply(queue, res_dev, a_dev, b_dev) res = res_dev.get(queue) if not mocked: assert (res == ref).all()
def test_pow_zero_base(context, out_code, in_codes): """ Specific tests for 0^0 and 0^x. """ N = 256 out_dtype, in_dtypes = generate_dtypes(out_code, in_codes) func_module = functions.pow(in_dtypes[0], exponent_dtype=in_dtypes[1], out_dtype=out_dtype) full_src = get_func_kernel(func_module, out_dtype, in_dtypes) program = Program(context.device, full_src) test = program.kernel.test queue = Queue(context.device) bases = Array.from_host(queue, numpy.zeros(N, in_dtypes[0])) # zero exponents exponents = Array.from_host(queue, numpy.zeros(N, in_dtypes[1])) dest_dev = Array.empty(context.device, N, out_dtype) test(queue, N, None, dest_dev, bases, exponents) assert numpy.allclose(dest_dev.get(queue), numpy.ones(N, in_dtypes[0])) # non-zero exponents exponents = Array.from_host(queue, numpy.ones(N, in_dtypes[1])) dest_dev = Array.empty(context.device, N, out_dtype) test(queue, N, None, dest_dev, bases, exponents) assert numpy.allclose(dest_dev.get(queue), numpy.zeros(N, in_dtypes[0]))
def test_set_from_wrong_type(mock_context): context = mock_context queue = Queue(context.device) arr = Array.empty(context.device, (10, 20), numpy.int32) with pytest.raises( TypeError, match="Cannot set from an object of type <class 'int'>"): arr.set(queue, 1)
def test_mismatched_devices(mock_4_device_context): context = mock_4_device_context buf = Buffer.allocate(context.devices[0], 100) queue = Queue(context.devices[1]) arr = numpy.ones(100, numpy.uint8) with pytest.raises(ValueError, match="Mismatched devices: queue on device"): buf.get(queue, arr) with pytest.raises(ValueError, match="Mismatched devices: queue on device"): buf.set(queue, arr)
def test_sizes(context, vstest): """ Test that virtual sizes are correct. """ ref = ReferenceIds(vstest.global_size, vstest.local_size) vs = VirtualSizes(max_total_local_size=vstest.max_total_local_size, max_local_sizes=vstest.max_local_sizes, max_num_groups=vstest.max_num_groups, local_size_multiple=2, virtual_global_size=vstest.global_size, virtual_local_size=vstest.local_size) vdims = len(vstest.global_size) program = Program(context.device, """ KERNEL void get_sizes(GLOBAL_MEM int *sizes) { if (${static.global_id}(0) > 0) return; for (int i = 0; i < ${vdims}; i++) { sizes[i] = ${static.local_size}(i); sizes[i + ${vdims}] = ${static.num_groups}(i); sizes[i + ${vdims * 2}] = ${static.global_size}(i); } sizes[${vdims * 3}] = ${static.global_flat_size}(); } """, render_globals=dict(vdims=vdims, static=vs.vsize_modules)) get_sizes = program.kernel.get_sizes queue = Queue(context.device) sizes = Array.empty(context.device, vdims * 3 + 1, numpy.int32) get_sizes(queue, vs.real_global_size, vs.real_local_size, sizes) sizes = sizes.get(queue) local_sizes = sizes[0:vdims] grid_sizes = sizes[vdims:vdims * 2] global_sizes = sizes[vdims * 2:vdims * 3] flat_size = sizes[vdims * 3] global_sizes_ref = numpy.array(vstest.global_size) assert (global_sizes == global_sizes_ref).all() assert flat_size == prod(vstest.global_size) if vstest.local_size is not None: grid_sizes_ref = numpy.array(vstest.grid_size) assert (grid_sizes == grid_sizes_ref).all() local_sizes_ref = numpy.array(vstest.local_size) assert (local_sizes == local_sizes_ref).all()
def test_set_checks_shape(mock_context): context = mock_context queue = Queue(context.device) arr = Array.empty(context.device, (10, 20), numpy.int32) with pytest.raises( ValueError, match="Shape mismatch: expected \\(10, 20\\), got \\(10, 30\\)"): arr.set(queue, numpy.zeros((10, 30), numpy.int32)) with pytest.raises(ValueError, match="Dtype mismatch: expected int32, got int64"): arr.set(queue, numpy.zeros((10, 20), numpy.int64))
def test_compile(mock_or_real_context, no_prelude): context, mocked = mock_or_real_context if mocked: src = MockDefTemplate( kernels=[MockKernel('multiply', [None, None, None, numpy.int32])]) else: if no_prelude: src = SRC_CUDA if context.api.id == cuda_api_id() else SRC_OPENCL else: src = SRC_GENERIC program = Program(context.device, src, no_prelude=no_prelude) if mocked and no_prelude: assert program.sources[context.device].prelude.strip() == "" length = 64 a = numpy.arange(length).astype(numpy.int32) b = numpy.arange(length).astype(numpy.int32) + 1 c = numpy.int32(3) ref = a * b + c queue = Queue(context.device) a_dev = Array.from_host(queue, a) b_dev = Array.from_host(queue, b) res_dev = Array.empty(context.device, length, numpy.int32) # Check that passing both Arrays and Buffers is supported # Pass one of the buffers as a subregion, too. a_dev_view = a_dev.data.get_sub_region(0, a_dev.data.size) program.kernel.multiply(queue, length, None, res_dev, a_dev_view, b_dev.data, c) res = res_dev.get(queue) if not mocked: assert (res == ref).all() # Explicit local_size res2_dev = Array.from_host(queue, a) # Array.empty(queue, length, numpy.int32) program.kernel.multiply(queue, length, length // 2, res2_dev, a_dev, b_dev, c) res2 = res2_dev.get(queue) if not mocked: assert (res2 == ref).all()
def test_ids(context, vstest): """ Test that virtual IDs are correct for each thread. """ ref = ReferenceIds(vstest.global_size, vstest.local_size) vs = VirtualSizes(max_total_local_size=vstest.max_total_local_size, max_local_sizes=vstest.max_local_sizes, max_num_groups=vstest.max_num_groups, local_size_multiple=2, virtual_global_size=vstest.global_size, virtual_local_size=vstest.local_size) program = Program(context.device, """ KERNEL void get_ids( GLOBAL_MEM int *local_ids, GLOBAL_MEM int *group_ids, GLOBAL_MEM int *global_ids, int vdim) { ${static.begin}; const VSIZE_T i = ${static.global_flat_id}(); local_ids[i] = ${static.local_id}(vdim); group_ids[i] = ${static.group_id}(vdim); global_ids[i] = ${static.global_id}(vdim); } """, render_globals=dict(static=vs.vsize_modules)) get_ids = program.kernel.get_ids queue = Queue(context.device) local_ids = Array.empty(context.device, ref.global_size, numpy.int32) group_ids = Array.empty(context.device, ref.global_size, numpy.int32) global_ids = Array.empty(context.device, ref.global_size, numpy.int32) for vdim in range(len(vstest.global_size)): get_ids(queue, vs.real_global_size, vs.real_local_size, local_ids, group_ids, global_ids, numpy.int32(vdim)) assert (global_ids.get(queue) == ref.predict_global_ids(vdim)).all() if vstest.local_size is not None: assert (local_ids.get(queue) == ref.predict_local_ids(vdim)).all() assert (group_ids.get(queue) == ref.predict_group_ids(vdim)).all()
def test_custom_allocator(mock_context): context = mock_context queue = Queue(context.device) allocated = [] def allocator(device, size): allocated.append(size) return Buffer.allocate(device, size) arr_dev = Array.empty(context.device, 100, numpy.int32, allocator=allocator) arr = arr_dev.get(queue) assert arr.shape == (100, ) assert arr.dtype == numpy.int32 assert allocated == [arr.size * arr.dtype.itemsize]
def test_set_from_non_contiguous(mock_or_real_context): context, _mocked = mock_or_real_context queue = Queue(context.device) arr = Array.empty(context.device, (10, 20), numpy.int32) arr2 = Array.empty(context.device, (20, 20), numpy.int32) with pytest.raises( ValueError, match="Setting from a non-contiguous device array is not supported" ): arr.set(queue, arr2[::2, :]) # Can set from a non-contiguous numpy array though arr.set(queue, numpy.ones((20, 20), numpy.int32)[::2, :]) assert (arr.get(queue) == 1).all()
def test_custom_buffer(mock_context): context = mock_context queue = Queue(context.device) arr = numpy.arange(100).astype(numpy.int32) metadata = ArrayMetadata.from_arraylike(arr) data = Buffer.allocate(context.device, 100) with pytest.raises( ValueError, match="Provided data buffer is not big enough to hold the array"): Array(metadata, data) bigger_data = Buffer.allocate(context.device, arr.size * arr.dtype.itemsize) bigger_data.set(queue, arr) arr_dev = Array(metadata, bigger_data) res = arr_dev.get(queue) assert (res == arr).all()
def test_wrong_context(mock_backend): mock_backend.add_devices(['Device0']) src = MockDefTemplate(kernels=[MockKernel('multiply', [None])]) api = API.from_api_id(mock_backend.api_id) context = Context.from_devices(api.platforms[0].devices[0]) context2 = Context.from_devices(api.platforms[0].devices[0]) res_dev = Array.empty(context.device, 16, numpy.int32) program = Program(context.device, src) queue = Queue(context2.device) with pytest.raises( ValueError, match= "The provided queue must belong to the same context this program uses" ): program.kernel.multiply(queue, 8, None, res_dev)
def test_contract(context, valloc_cls, pack): dtype = numpy.int32 program = Program(context.device, """ KERNEL void fill(GLOBAL_MEM ${ctype} *dest, ${ctype} val) { const SIZE_T i = get_global_id(0); dest[i] = val; } """, render_globals=dict(ctype=dtypes.ctype(dtype))) fill = program.kernel.fill queue = Queue(context.device) virtual_alloc = valloc_cls(context.device) buffers_metadata, arrays = allocate_test_set( virtual_alloc, # Bump size to make sure buffer alignment doesn't hide any out-of-bounds access lambda allocator, size: Array.empty( context.device, size * 100, dtype, allocator=allocator)) dependencies = {id_: deps for id_, _, deps in buffers_metadata} if pack: virtual_alloc.pack(queue) # Clear all arrays for name in sorted(arrays.keys()): fill(queue, arrays[name].shape, None, arrays[name], dtype(0)) for i, name in enumerate(sorted(arrays.keys())): val = dtype(i + 1) fill(queue, arrays[name].shape, None, arrays[name], val) # According to the virtual allocator contract, the allocated buffer # will not intersect with the buffers from the specified dependencies. # So we're filling the buffer and checking that the dependencies did not change. for dep in dependencies[name]: assert (arrays[dep].get(queue) != val).all()
def test_statistics(mock_context, valloc_cls): context = mock_context queue = Queue(context.device) virtual_alloc = valloc_cls(context.device) buffers_metadata, buffers = allocate_test_set( virtual_alloc, lambda allocator, size: allocator(context.device, size)) stats = virtual_alloc.statistics() check_statistics(buffers_metadata, stats) virtual_alloc.pack(queue) stats = virtual_alloc.statistics() check_statistics(buffers_metadata, stats) s = str(stats) assert str(stats.real_size_total) in s assert str(stats.real_num) in s assert str(stats.virtual_size_total) in s assert str(stats.virtual_num) in s
def test_contract_mocked(mock_backend_pycuda, mock_context_pycuda, valloc_cls, pack): # Using PyCUDA backend here because it tracks the allocations. context = mock_context_pycuda queue = Queue(context.device) virtual_alloc = valloc_cls(context.device) buffers_metadata, buffers = allocate_test_set( virtual_alloc, lambda allocator, size: allocator(context.device, size)) for name, size, deps in buffers_metadata: # Virtual buffer size should be exactly as requested assert buffers[name].size == size # The real buffer behind the virtual buffer may be larger # (note that _size is only present in mocked DeviceAllocation) assert buffers[name].kernel_arg._size >= size if pack: virtual_alloc.pack(queue) # Clear all buffers for name, _, _ in buffers_metadata: mock_fill(buffers[name], -1) for i, metadata in enumerate(buffers_metadata): name, size, deps = metadata mock_fill(buffers[name], i) # According to the virtual allocator contract, the allocated buffer # will not intersect with the buffers from the specified dependencies. # So we're filling the buffer and checking that the dependencies did not change. for dep in deps: assert (mock_get(buffers[dep]) != i).all() # Check that after deleting virtual buffers all the real buffers are freed as well del buffers assert mock_backend_pycuda.allocation_count() == 0
def test_single_device(device_idx, full_len, benchmark=False): pwr = 50 a = numpy.arange(full_len).astype(numpy.uint64) context = Context.from_devices([api.platforms[0].devices[device_idx]]) queue = Queue(context.device) program = Program(context.device, src) a_dev = Array.from_host(queue, a) queue.synchronize() t1 = time.time() program.kernel.sum(queue, full_len, None, a_dev, numpy.int32(pwr)) queue.synchronize() t2 = time.time() print(f"Single device time (device {device_idx}):", t2 - t1) a_res = a_dev.get(queue) if not benchmark: a_ref = calc_ref(a, pwr) assert (a_ref == a_res).all()
def test_allocate_and_copy(mock_or_real_context): context, _mocked = mock_or_real_context length = 100 dtype = numpy.dtype('int32') size = length * dtype.itemsize arr = numpy.arange(length).astype(dtype) buf = Buffer.allocate(context.device, size) assert buf.size == size assert buf.offset == 0 # Just covering the existence of the attribute. # Hard to actually check it without running a kernel assert buf.kernel_arg is not None queue = Queue(context.device) buf.set(queue, arr) # Read the whole buffer res = numpy.empty_like(arr) buf.get(queue, res) queue.synchronize() assert (res == arr).all() # Read a subregion buf_region = buf.get_sub_region(25 * dtype.itemsize, 50 * dtype.itemsize) arr_region = arr[25:25+50] res_region = numpy.empty_like(arr_region) buf_region.get(queue, res_region) queue.synchronize() assert (res_region == arr_region).all() # Write a subregion arr_region = (numpy.ones(50) * 100).astype(dtype) arr[25:25+50] = arr_region buf_region.set(queue, arr_region) buf.get(queue, res) queue.synchronize() assert (res == arr).all() # Subregion of subregion if context.api.id == cuda_api_id(): # In OpenCL that leads to segfault, but with CUDA we just emulate that with pointers. arr_region2 = (numpy.ones(20) * 200).astype(dtype) arr[25+20:25+40] = arr_region2 buf_region2 = buf_region.get_sub_region(20 * dtype.itemsize, 20 * dtype.itemsize) buf_region2.set(queue, arr_region2) buf.get(queue, res) queue.synchronize() assert (res == arr).all() # Device-to-device copy buf2 = Buffer.allocate(context.device, size * 2) buf2.set(queue, numpy.ones(length * 2, dtype)) buf2_view = buf2.get_sub_region(50 * dtype.itemsize, 100 * dtype.itemsize) buf2_view.set(queue, buf) res2 = numpy.empty(length * 2, dtype) buf2.get(queue, res2) queue.synchronize() assert (res2[50:150] == arr).all() assert (res2[:50] == 1).all() assert (res2[150:] == 1).all() # Device-to-device copy (no_async) buf2 = Buffer.allocate(context.device, size * 2) buf2.set(queue, numpy.ones(length * 2, dtype)) buf2_view = buf2.get_sub_region(50 * dtype.itemsize, 100 * dtype.itemsize) buf2_view.set(queue, buf, no_async=True) res2 = numpy.empty(length * 2, dtype) buf2.get(queue, res2) queue.synchronize() assert (res2[50:150] == arr).all() assert (res2[:50] == 1).all() assert (res2[150:] == 1).all()
def _test_constant_memory(context, mocked, is_static): cm1 = numpy.arange(16).astype(numpy.int32) cm2 = numpy.arange(16).astype(numpy.int32) * 2 + 1 cm3 = numpy.arange(16).astype(numpy.int32) * 3 + 2 if mocked: kernel = MockKernel('copy_from_cm', [None] if context.api.id == cuda_api_id() else [None, None, None, None], max_total_local_sizes={0: 1024}) src = MockDefTemplate(constant_mem={ 'cm1': cm1.size * cm1.dtype.itemsize, 'cm2': cm2.size * cm2.dtype.itemsize, 'cm3': cm3.size * cm3.dtype.itemsize }, kernels=[kernel]) else: src = SRC_CONSTANT_MEM_STATIC if is_static else SRC_CONSTANT_MEM queue = Queue(context.device) cm1_dev = Array.from_host(queue, cm1) cm2_dev = Array.from_host(queue, cm2) cm3_dev = Array.from_host(queue, cm3) res_dev = Array.empty(context.device, 16, numpy.int32) if context.api.id == cuda_api_id(): # Use different forms of constant array representation constant_arrays = dict( cm1=cm1, # as an array(-like) object cm2=(cm2.shape, cm2.dtype), # as a tuple of shape and dtype cm3=cm3_dev) # as a device array if is_static: copy_from_cm = StaticKernel(context.device, src, 'copy_from_cm', global_size=16, constant_arrays=constant_arrays) copy_from_cm.set_constant_array( queue, 'cm1', cm1_dev) # setting from a device array copy_from_cm.set_constant_array(queue, 'cm2', cm2) # setting from a host array copy_from_cm.set_constant_array( queue, 'cm3', cm3_dev.data) # setting from a host buffer else: program = Program(context.device, src, constant_arrays=constant_arrays) program.set_constant_array(queue, 'cm1', cm1_dev) # setting from a device array program.set_constant_array(queue, 'cm2', cm2) # setting from a host array program.set_constant_array( queue, 'cm3', cm3_dev.data) # setting from a host buffer copy_from_cm = lambda queue, *args: program.kernel.copy_from_cm( queue, 16, None, *args) copy_from_cm(queue, res_dev) else: if is_static: copy_from_cm = StaticKernel(context.device, src, 'copy_from_cm', global_size=16) else: program = Program(context.device, src) copy_from_cm = lambda queue, *args: program.kernel.copy_from_cm( queue, 16, None, *args) copy_from_cm(queue, res_dev, cm1_dev, cm2_dev, cm3_dev) res = res_dev.get(queue) if not mocked: assert (res == cm1 + cm2 + cm3).all()
def test_from_host(mock_or_real_context): context, _mocked = mock_or_real_context queue = Queue(context.device) arr = numpy.arange(100) arr_dev = Array.from_host(queue, arr) assert (arr_dev.get(queue) == arr).all()
def test_single_device(mock_or_real_context): context, _mocked = mock_or_real_context _check_array_operations(Queue(context.device), Array)
def test_set_constant_array_errors(mock_4_device_context): context = mock_4_device_context api = API.from_api_id(mock_4_device_context.api.id) other_context = Context.from_criteria(api) other_queue = Queue(other_context.devices[0]) # Contexts don't know about each other and can't interact with stack in a consistent manner. # So we deactivate the other context if we're on CUDA API. if api.id == cuda_api_id(): other_context.deactivate() cm1 = numpy.arange(16).astype(numpy.int32) src = MockDefTemplate(kernels=[ MockKernel('kernel', [], max_total_local_sizes={ 0: 1024, 1: 1024, 2: 1024, 3: 1024 }) ], constant_mem={'cm1': cm1.size * cm1.dtype.itemsize}) queue = Queue(context.devices[0]) if context.api.id == cuda_api_id(): program = Program(context.devices, src, constant_arrays=dict(cm1=cm1)) with pytest.raises( ValueError, match= "The provided queue must belong to the same context as this program uses" ): program.set_constant_array(other_queue, 'cm1', cm1) with pytest.raises(TypeError, match="Unsupported array type"): program.set_constant_array(queue, 'cm1', [1]) with pytest.raises(ValueError, match="Incorrect size of the constant buffer;"): program.set_constant_array(queue, 'cm1', cm1[:8]) with pytest.raises(TypeError, match="Unknown constant array metadata type"): program = Program(context.devices[[0, 1, 2]], src, constant_arrays=dict(cm1=1)) program = Program(context.devices[[0, 1, 2]], src, constant_arrays=dict(cm1=cm1)) queue3 = Queue(context.devices[3]) with pytest.raises( ValueError, match= "The program was not compiled for the device this queue uses"): program.set_constant_array(queue3, 'cm1', cm1) else: with pytest.raises( ValueError, match= "Compile-time constant arrays are only supported by CUDA API"): program = Program(context.devices, src, constant_arrays=dict(cm1=cm1)) program = Program(context.devices, src) with pytest.raises( ValueError, match="Constant arrays are only supported for CUDA API"): program.set_constant_array(queue, 'cm1', cm1) with pytest.raises( ValueError, match= "Compile-time constant arrays are only supported by CUDA API"): sk = StaticKernel(context.devices, src, 'kernel', 1024, constant_arrays=dict(cm1=cm1)) sk = StaticKernel(context.devices, src, 'kernel', 1024) with pytest.raises( ValueError, match="Constant arrays are only supported for CUDA API"): sk.set_constant_array(queue, 'cm1', cm1)
def test_queue_on_multi_device_context(mock_or_real_multi_device_context): context, _mocked = mock_or_real_multi_device_context queue = Queue(context.devices[1]) assert queue.device == context.devices[1]
def test_queue(mock_or_real_context): context, _mocked = mock_or_real_context queue = Queue(context.device) assert queue.device == context.devices[0] queue.synchronize()
def test_set_from_wrong_type(mock_context): buf = Buffer.allocate(mock_context.device, 100) queue = Queue(mock_context.device) with pytest.raises(TypeError, match="Cannot set from an object of type <class 'int'>"): buf.set(queue, 1)