def test_builtin_globals(mock_backend_pycuda): mock_backend_pycuda.add_devices([ PyCUDADeviceInfo(max_threads_per_block=1024), PyCUDADeviceInfo(max_threads_per_block=512) ]) source_template = DefTemplate.from_string( 'mock_source', [], """ KERNEL void test() { int max_total_local_size = ${device_params.max_total_local_size}; } """) api = API.from_api_id(mock_backend_pycuda.api_id) context = Context.from_devices( [api.platforms[0].devices[0], api.platforms[0].devices[1]]) src = MockDefTemplate(kernels=[MockKernel('test', [None])], source_template=source_template) program = Program(context.devices, src) assert 'max_total_local_size = 1024' in program.sources[ context.devices[0]].source assert 'max_total_local_size = 512' in program.sources[ context.devices[1]].source
def test_max_total_local_sizes(mock_backend): mock_backend.add_devices( ["Device1", "Device2 - tag", "Device3 - tag", "Device4"]) api = API.from_api_id(mock_backend.api_id) context = Context.from_criteria(api, devices_num=2, device_include_masks=["tag"]) # Providing max_total_local_sizes for all possible devices to make sure # only the ones corresponding to the context will get picked up kernel = MockKernel('test', max_total_local_sizes={ 0: 64, 1: 1024, 2: 512, 3: 128 }) src = MockDefTemplate(kernels=[kernel]) program = Program(context.devices, src) # The indices here correspond to the devices in the context, not in the platform assert program.kernel.test.max_total_local_sizes == { context.devices[0]: 1024, context.devices[1]: 512 }
def check_func(context, func_module, reference_func, out_dtype, in_dtypes, atol=1e-8, rtol=1e-5, is_mocked=False): N = 256 full_src = get_func_kernel(func_module, out_dtype, in_dtypes) # Can't test anything else if we don't have a real context if is_mocked: return program = Program(context.device, full_src) test = program.kernel.test queue = Queue(context.device) arrays = [get_test_array(N, dt, no_zeros=True, high=8) for dt in in_dtypes] arrays_dev = [Array.from_host(queue, array) for array in arrays] dest_dev = Array.empty(context.device, N, out_dtype) test(queue, N, None, dest_dev, *arrays_dev) assert numpy.allclose(dest_dev.get(queue), reference_func(*arrays).astype(out_dtype), atol=atol, rtol=rtol)
def test_pow_zero_base(context, out_code, in_codes): """ Specific tests for 0^0 and 0^x. """ N = 256 out_dtype, in_dtypes = generate_dtypes(out_code, in_codes) func_module = functions.pow(in_dtypes[0], exponent_dtype=in_dtypes[1], out_dtype=out_dtype) full_src = get_func_kernel(func_module, out_dtype, in_dtypes) program = Program(context.device, full_src) test = program.kernel.test queue = Queue(context.device) bases = Array.from_host(queue, numpy.zeros(N, in_dtypes[0])) # zero exponents exponents = Array.from_host(queue, numpy.zeros(N, in_dtypes[1])) dest_dev = Array.empty(context.device, N, out_dtype) test(queue, N, None, dest_dev, bases, exponents) assert numpy.allclose(dest_dev.get(queue), numpy.ones(N, in_dtypes[0])) # non-zero exponents exponents = Array.from_host(queue, numpy.ones(N, in_dtypes[1])) dest_dev = Array.empty(context.device, N, out_dtype) test(queue, N, None, dest_dev, bases, exponents) assert numpy.allclose(dest_dev.get(queue), numpy.zeros(N, in_dtypes[0]))
def test_multi_device(device_idxs, full_len, benchmark=False): pwr = 50 a = numpy.arange(full_len).astype(numpy.uint64) context = Context.from_devices( [api.platforms[0].devices[device_idx] for device_idx in device_idxs]) mqueue = MultiQueue.on_devices(context.devices) program = Program(context.devices, src) a_dev = MultiArray.from_host(mqueue, a) mqueue.synchronize() t1 = time.time() program.kernel.sum(mqueue, a_dev.shapes, None, a_dev, numpy.int32(pwr)) mqueue.synchronize() t2 = time.time() print(f"Multidevice time (devices {device_idxs}):", t2 - t1) a_res = a_dev.get(mqueue) if not benchmark: a_ref = calc_ref(a, pwr) assert (a_ref == a_res).all()
def test_cannot_override_builtin_globals(mock_context): with pytest.raises( ValueError, match="'device_params' is a reserved global name and cannot be used" ): Program(mock_context.device, MockDefTemplate(kernels=[MockKernel('test', [None])]), render_globals=dict(device_params=None))
def test_wrong_device_idxs(mock_4_device_context): src = MockDefTemplate(kernels=[MockKernel('multiply', [None])]) context = mock_4_device_context program = Program(context.devices[[0, 1]], src) mqueue = MultiQueue.on_devices(context.devices[[2, 1]]) res_dev = MultiArray.empty(context.devices[[2, 1]], 16, numpy.int32) # Using all the queue's devices (1, 2) with pytest.raises(ValueError, match="Requested execution on devices"): program.kernel.multiply(mqueue, 8, None, res_dev)
def test_sizes(context, vstest): """ Test that virtual sizes are correct. """ ref = ReferenceIds(vstest.global_size, vstest.local_size) vs = VirtualSizes(max_total_local_size=vstest.max_total_local_size, max_local_sizes=vstest.max_local_sizes, max_num_groups=vstest.max_num_groups, local_size_multiple=2, virtual_global_size=vstest.global_size, virtual_local_size=vstest.local_size) vdims = len(vstest.global_size) program = Program(context.device, """ KERNEL void get_sizes(GLOBAL_MEM int *sizes) { if (${static.global_id}(0) > 0) return; for (int i = 0; i < ${vdims}; i++) { sizes[i] = ${static.local_size}(i); sizes[i + ${vdims}] = ${static.num_groups}(i); sizes[i + ${vdims * 2}] = ${static.global_size}(i); } sizes[${vdims * 3}] = ${static.global_flat_size}(); } """, render_globals=dict(vdims=vdims, static=vs.vsize_modules)) get_sizes = program.kernel.get_sizes queue = Queue(context.device) sizes = Array.empty(context.device, vdims * 3 + 1, numpy.int32) get_sizes(queue, vs.real_global_size, vs.real_local_size, sizes) sizes = sizes.get(queue) local_sizes = sizes[0:vdims] grid_sizes = sizes[vdims:vdims * 2] global_sizes = sizes[vdims * 2:vdims * 3] flat_size = sizes[vdims * 3] global_sizes_ref = numpy.array(vstest.global_size) assert (global_sizes == global_sizes_ref).all() assert flat_size == prod(vstest.global_size) if vstest.local_size is not None: grid_sizes_ref = numpy.array(vstest.grid_size) assert (grid_sizes == grid_sizes_ref).all() local_sizes_ref = numpy.array(vstest.local_size) assert (local_sizes == local_sizes_ref).all()
def test_compile(mock_or_real_context, no_prelude): context, mocked = mock_or_real_context if mocked: src = MockDefTemplate( kernels=[MockKernel('multiply', [None, None, None, numpy.int32])]) else: if no_prelude: src = SRC_CUDA if context.api.id == cuda_api_id() else SRC_OPENCL else: src = SRC_GENERIC program = Program(context.device, src, no_prelude=no_prelude) if mocked and no_prelude: assert program.sources[context.device].prelude.strip() == "" length = 64 a = numpy.arange(length).astype(numpy.int32) b = numpy.arange(length).astype(numpy.int32) + 1 c = numpy.int32(3) ref = a * b + c queue = Queue(context.device) a_dev = Array.from_host(queue, a) b_dev = Array.from_host(queue, b) res_dev = Array.empty(context.device, length, numpy.int32) # Check that passing both Arrays and Buffers is supported # Pass one of the buffers as a subregion, too. a_dev_view = a_dev.data.get_sub_region(0, a_dev.data.size) program.kernel.multiply(queue, length, None, res_dev, a_dev_view, b_dev.data, c) res = res_dev.get(queue) if not mocked: assert (res == ref).all() # Explicit local_size res2_dev = Array.from_host(queue, a) # Array.empty(queue, length, numpy.int32) program.kernel.multiply(queue, length, length // 2, res2_dev, a_dev, b_dev, c) res2 = res2_dev.get(queue) if not mocked: assert (res2 == ref).all()
def test_mismatched_devices(mock_4_device_context): context = mock_4_device_context src = MockDefTemplate( kernels=[MockKernel('multiply', [None, None, None, numpy.int32])]) program = Program(context.devices, src) with pytest.raises( ValueError, match="Mismatched device sets for global and local sizes"): program.kernel.multiply.prepare( { context.devices[0]: 10, context.devices[2]: 20 }, { context.devices[0]: None, context.devices[1]: None })
def test_ids(context, vstest): """ Test that virtual IDs are correct for each thread. """ ref = ReferenceIds(vstest.global_size, vstest.local_size) vs = VirtualSizes(max_total_local_size=vstest.max_total_local_size, max_local_sizes=vstest.max_local_sizes, max_num_groups=vstest.max_num_groups, local_size_multiple=2, virtual_global_size=vstest.global_size, virtual_local_size=vstest.local_size) program = Program(context.device, """ KERNEL void get_ids( GLOBAL_MEM int *local_ids, GLOBAL_MEM int *group_ids, GLOBAL_MEM int *global_ids, int vdim) { ${static.begin}; const VSIZE_T i = ${static.global_flat_id}(); local_ids[i] = ${static.local_id}(vdim); group_ids[i] = ${static.group_id}(vdim); global_ids[i] = ${static.global_id}(vdim); } """, render_globals=dict(static=vs.vsize_modules)) get_ids = program.kernel.get_ids queue = Queue(context.device) local_ids = Array.empty(context.device, ref.global_size, numpy.int32) group_ids = Array.empty(context.device, ref.global_size, numpy.int32) global_ids = Array.empty(context.device, ref.global_size, numpy.int32) for vdim in range(len(vstest.global_size)): get_ids(queue, vs.real_global_size, vs.real_local_size, local_ids, group_ids, global_ids, numpy.int32(vdim)) assert (global_ids.get(queue) == ref.predict_global_ids(vdim)).all() if vstest.local_size is not None: assert (local_ids.get(queue) == ref.predict_local_ids(vdim)).all() assert (group_ids.get(queue) == ref.predict_group_ids(vdim)).all()
def test_wrong_context(mock_backend): mock_backend.add_devices(['Device0']) src = MockDefTemplate(kernels=[MockKernel('multiply', [None])]) api = API.from_api_id(mock_backend.api_id) context = Context.from_devices(api.platforms[0].devices[0]) context2 = Context.from_devices(api.platforms[0].devices[0]) res_dev = Array.empty(context.device, 16, numpy.int32) program = Program(context.device, src) queue = Queue(context2.device) with pytest.raises( ValueError, match= "The provided queue must belong to the same context this program uses" ): program.kernel.multiply(queue, 8, None, res_dev)
def test_contract(context, valloc_cls, pack): dtype = numpy.int32 program = Program(context.device, """ KERNEL void fill(GLOBAL_MEM ${ctype} *dest, ${ctype} val) { const SIZE_T i = get_global_id(0); dest[i] = val; } """, render_globals=dict(ctype=dtypes.ctype(dtype))) fill = program.kernel.fill queue = Queue(context.device) virtual_alloc = valloc_cls(context.device) buffers_metadata, arrays = allocate_test_set( virtual_alloc, # Bump size to make sure buffer alignment doesn't hide any out-of-bounds access lambda allocator, size: Array.empty( context.device, size * 100, dtype, allocator=allocator)) dependencies = {id_: deps for id_, _, deps in buffers_metadata} if pack: virtual_alloc.pack(queue) # Clear all arrays for name in sorted(arrays.keys()): fill(queue, arrays[name].shape, None, arrays[name], dtype(0)) for i, name in enumerate(sorted(arrays.keys())): val = dtype(i + 1) fill(queue, arrays[name].shape, None, arrays[name], val) # According to the virtual allocator contract, the allocated buffer # will not intersect with the buffers from the specified dependencies. # So we're filling the buffer and checking that the dependencies did not change. for dep in dependencies[name]: assert (arrays[dep].get(queue) != val).all()
def test_compile_multi_device(mock_or_real_multi_device_context): context, mocked = mock_or_real_multi_device_context devices = context.devices[[1, 0]] if mocked: src = MockDefTemplate( kernels=[MockKernel('multiply', [None, None, None, numpy.int32])]) else: src = SRC_GENERIC length = 64 program = Program(devices, src) a = numpy.arange(length).astype(numpy.int32) b = numpy.arange(length).astype(numpy.int32) + 1 c = numpy.int32(3) ref = a * b + c mqueue = MultiQueue.on_devices(devices) a_dev = MultiArray.from_host(mqueue, a) b_dev = MultiArray.from_host(mqueue, b) res_dev = MultiArray.empty(devices, length, numpy.int32) program.kernel.multiply(mqueue, a_dev.shapes, None, res_dev, a_dev, b_dev, c) res = res_dev.get(mqueue) if not mocked: assert (res == ref).all() # Test argument unpacking from dictionaries res_dev = MultiArray.empty(devices, length, numpy.int32) program.kernel.multiply(mqueue, a_dev.shapes, {device: None for device in devices}, res_dev, a_dev.subarrays, b_dev, c) res = res_dev.get(mqueue) if not mocked: assert (res == ref).all()
def test_compilation_error(mock_or_real_context, capsys): context, mocked = mock_or_real_context if mocked: src = MockDefTemplate(should_fail=True) else: src = SRC_COMPILE_ERROR with pytest.raises(CompilationError): Program(context.device, src) captured = capsys.readouterr() assert "Failed to compile on device" in captured.out # check that the full source is shown (including the prelude) assert "#define GRUNNUR_" in captured.out if mocked: assert "<<< mock source >>>" in captured.out else: assert "KERNEL void compile_error(GLOBAL_MEM int *dest)" in captured.out
def test_single_device(device_idx, full_len, benchmark=False): pwr = 50 a = numpy.arange(full_len).astype(numpy.uint64) context = Context.from_devices([api.platforms[0].devices[device_idx]]) queue = Queue(context.device) program = Program(context.device, src) a_dev = Array.from_host(queue, a) queue.synchronize() t1 = time.time() program.kernel.sum(queue, full_len, None, a_dev, numpy.int32(pwr)) queue.synchronize() t2 = time.time() print(f"Single device time (device {device_idx}):", t2 - t1) a_res = a_dev.get(queue) if not benchmark: a_ref = calc_ref(a, pwr) assert (a_ref == a_res).all()
def test_keep(mock_or_real_context, capsys): context, mocked = mock_or_real_context if mocked: src = MockDefTemplate( kernels=[MockKernel('multiply', [None, None, None, numpy.int32])]) else: src = SRC_GENERIC program = Program(context.device, src, keep=True) captured = capsys.readouterr() path = re.match(r'\*\*\* compiler output in (.*)', captured.out).group(1) assert os.path.isdir(path) if context.api.id == opencl_api_id(): srcfile = os.path.join(path, 'kernel.cl') elif context.api.id == cuda_api_id(): srcfile = os.path.join(path, 'kernel.cu') with open(srcfile) as f: source = f.read() assert str(src) in source
def _test_constant_memory(context, mocked, is_static): cm1 = numpy.arange(16).astype(numpy.int32) cm2 = numpy.arange(16).astype(numpy.int32) * 2 + 1 cm3 = numpy.arange(16).astype(numpy.int32) * 3 + 2 if mocked: kernel = MockKernel('copy_from_cm', [None] if context.api.id == cuda_api_id() else [None, None, None, None], max_total_local_sizes={0: 1024}) src = MockDefTemplate(constant_mem={ 'cm1': cm1.size * cm1.dtype.itemsize, 'cm2': cm2.size * cm2.dtype.itemsize, 'cm3': cm3.size * cm3.dtype.itemsize }, kernels=[kernel]) else: src = SRC_CONSTANT_MEM_STATIC if is_static else SRC_CONSTANT_MEM queue = Queue(context.device) cm1_dev = Array.from_host(queue, cm1) cm2_dev = Array.from_host(queue, cm2) cm3_dev = Array.from_host(queue, cm3) res_dev = Array.empty(context.device, 16, numpy.int32) if context.api.id == cuda_api_id(): # Use different forms of constant array representation constant_arrays = dict( cm1=cm1, # as an array(-like) object cm2=(cm2.shape, cm2.dtype), # as a tuple of shape and dtype cm3=cm3_dev) # as a device array if is_static: copy_from_cm = StaticKernel(context.device, src, 'copy_from_cm', global_size=16, constant_arrays=constant_arrays) copy_from_cm.set_constant_array( queue, 'cm1', cm1_dev) # setting from a device array copy_from_cm.set_constant_array(queue, 'cm2', cm2) # setting from a host array copy_from_cm.set_constant_array( queue, 'cm3', cm3_dev.data) # setting from a host buffer else: program = Program(context.device, src, constant_arrays=constant_arrays) program.set_constant_array(queue, 'cm1', cm1_dev) # setting from a device array program.set_constant_array(queue, 'cm2', cm2) # setting from a host array program.set_constant_array( queue, 'cm3', cm3_dev.data) # setting from a host buffer copy_from_cm = lambda queue, *args: program.kernel.copy_from_cm( queue, 16, None, *args) copy_from_cm(queue, res_dev) else: if is_static: copy_from_cm = StaticKernel(context.device, src, 'copy_from_cm', global_size=16) else: program = Program(context.device, src) copy_from_cm = lambda queue, *args: program.kernel.copy_from_cm( queue, 16, None, *args) copy_from_cm(queue, res_dev, cm1_dev, cm2_dev, cm3_dev) res = res_dev.get(queue) if not mocked: assert (res == cm1 + cm2 + cm3).all()
def test_set_constant_array_errors(mock_4_device_context): context = mock_4_device_context api = API.from_api_id(mock_4_device_context.api.id) other_context = Context.from_criteria(api) other_queue = Queue(other_context.devices[0]) # Contexts don't know about each other and can't interact with stack in a consistent manner. # So we deactivate the other context if we're on CUDA API. if api.id == cuda_api_id(): other_context.deactivate() cm1 = numpy.arange(16).astype(numpy.int32) src = MockDefTemplate(kernels=[ MockKernel('kernel', [], max_total_local_sizes={ 0: 1024, 1: 1024, 2: 1024, 3: 1024 }) ], constant_mem={'cm1': cm1.size * cm1.dtype.itemsize}) queue = Queue(context.devices[0]) if context.api.id == cuda_api_id(): program = Program(context.devices, src, constant_arrays=dict(cm1=cm1)) with pytest.raises( ValueError, match= "The provided queue must belong to the same context as this program uses" ): program.set_constant_array(other_queue, 'cm1', cm1) with pytest.raises(TypeError, match="Unsupported array type"): program.set_constant_array(queue, 'cm1', [1]) with pytest.raises(ValueError, match="Incorrect size of the constant buffer;"): program.set_constant_array(queue, 'cm1', cm1[:8]) with pytest.raises(TypeError, match="Unknown constant array metadata type"): program = Program(context.devices[[0, 1, 2]], src, constant_arrays=dict(cm1=1)) program = Program(context.devices[[0, 1, 2]], src, constant_arrays=dict(cm1=cm1)) queue3 = Queue(context.devices[3]) with pytest.raises( ValueError, match= "The program was not compiled for the device this queue uses"): program.set_constant_array(queue3, 'cm1', cm1) else: with pytest.raises( ValueError, match= "Compile-time constant arrays are only supported by CUDA API"): program = Program(context.devices, src, constant_arrays=dict(cm1=cm1)) program = Program(context.devices, src) with pytest.raises( ValueError, match="Constant arrays are only supported for CUDA API"): program.set_constant_array(queue, 'cm1', cm1) with pytest.raises( ValueError, match= "Compile-time constant arrays are only supported by CUDA API"): sk = StaticKernel(context.devices, src, 'kernel', 1024, constant_arrays=dict(cm1=cm1)) sk = StaticKernel(context.devices, src, 'kernel', 1024) with pytest.raises( ValueError, match="Constant arrays are only supported for CUDA API"): sk.set_constant_array(queue, 'cm1', cm1)