def test_atomic_fp_native(filter_str, return_list_of_op, fdtype, addrspace):
    LLVM_SPIRV_ROOT = os.environ.get("NUMBA_DPPY_LLVM_SPIRV_ROOT")
    if not LLVM_SPIRV_ROOT:
        pytest.skip(
            "Please set the environment variable NUMBA_DPPY_LLVM_SPIRV_ROOT "
            "to run this test"
        )

    if atomic_skip_test(filter_str):
        pytest.skip()

    a = np.array([0], fdtype)

    op_type, expected = return_list_of_op

    if addrspace == "global":
        op = getattr(dppy.atomic, op_type)

        def f(a):
            op(a, 0, 1)

    elif addrspace == "local":
        f = get_func_local(op_type, fdtype)

    kernel = dppy.kernel(f)

    NATIVE_FP_ATOMICS_old_val = config.NATIVE_FP_ATOMICS
    config.NATIVE_FP_ATOMICS = 1

    LLVM_SPIRV_ROOT_old_val = config.LLVM_SPIRV_ROOT
    config.LLVM_SPIRV_ROOT = LLVM_SPIRV_ROOT

    with dppy.offload_to_sycl_device(filter_str) as sycl_queue:
        kern = kernel[global_size, dppy.DEFAULT_LOCAL_SIZE].specialize(
            kernel._get_argtypes(a), sycl_queue
        )
        # Only the OpenCL CPU device is expected to lack the native FP
        # atomic instruction in the generated assembly.
        if filter_str != "opencl:cpu:0":
            assert "__spirv_AtomicFAddEXT" in kern.assembly
        else:
            assert "__spirv_AtomicFAddEXT" not in kern.assembly

    config.NATIVE_FP_ATOMICS = 0

    # To bypass caching
    kernel = dppy.kernel(f)

    with dppy.offload_to_sycl_device(filter_str) as sycl_queue:
        kern = kernel[global_size, dppy.DEFAULT_LOCAL_SIZE].specialize(
            kernel._get_argtypes(a), sycl_queue
        )
        assert "__spirv_AtomicFAddEXT" not in kern.assembly

    config.NATIVE_FP_ATOMICS = NATIVE_FP_ATOMICS_old_val
    config.LLVM_SPIRV_ROOT = LLVM_SPIRV_ROOT_old_val
def kernel_result_pair(request):
    op = getattr(dppy.atomic, request.param[0])

    def f(a):
        op(a, 0, 1)

    return dppy.kernel(f), request.param[1]
def test_caching_kernel_using_same_context(filter_str):
    """Test kernel caching for the scenario where different SYCL queues that
    share a SYCL context are used to submit a kernel.

    Args:
        filter_str: SYCL filter selector string
    """
    global_size = 10
    N = global_size

    def data_parallel_sum(a, b, c):
        i = dppy.get_global_id(0)
        c[i] = a[i] + b[i]

    a = np.array(np.random.random(N), dtype=np.float32)
    b = np.array(np.random.random(N), dtype=np.float32)
    c = np.ones_like(a)

    # Set the global queue to the default device so that the cached_kernel
    # gets created for that device
    dpctl.set_global_queue(filter_str)
    func = dppy.kernel(data_parallel_sum)
    default_queue = dpctl.get_current_queue()
    cached_kernel = func[global_size, dppy.DEFAULT_LOCAL_SIZE].specialize(
        func._get_argtypes(a, b, c), default_queue
    )
    for i in range(0, 10):
        # Each iteration creates a fresh queue that shares the same context
        with dpctl.device_context(filter_str) as gpu_queue:
            _kernel = func[global_size, dppy.DEFAULT_LOCAL_SIZE].specialize(
                func._get_argtypes(a, b, c), gpu_queue
            )
            assert _kernel == cached_kernel
def test_caching_kernel_using_same_queue(filter_str):
    """Test kernel caching when the same queue is used to submit a kernel
    multiple times.

    Args:
        filter_str: SYCL filter selector string
    """
    global_size = 10
    N = global_size

    def data_parallel_sum(a, b, c):
        i = dppy.get_global_id(0)
        c[i] = a[i] + b[i]

    a = np.array(np.random.random(N), dtype=np.float32)
    b = np.array(np.random.random(N), dtype=np.float32)
    c = np.ones_like(a)

    with dpctl.device_context(filter_str) as gpu_queue:
        func = dppy.kernel(data_parallel_sum)
        cached_kernel = func[global_size, dppy.DEFAULT_LOCAL_SIZE].specialize(
            func._get_argtypes(a, b, c), gpu_queue
        )

        for i in range(10):
            _kernel = func[global_size, dppy.DEFAULT_LOCAL_SIZE].specialize(
                func._get_argtypes(a, b, c), gpu_queue
            )
            assert _kernel == cached_kernel
def test_atomic_fp_native(
    filter_str,
    NATIVE_FP_ATOMICS,
    expected_native_atomic_for_device,
    function_generator,
    operator_name,
    expected_spirv_function,
    dtype,
):
    function = function_generator(operator_name, dtype)
    kernel = dppy.kernel(function)
    argtypes = kernel._get_argtypes(np.array([0], dtype))

    with override_config("NATIVE_FP_ATOMICS", NATIVE_FP_ATOMICS):
        with dpctl.device_context(filter_str) as sycl_queue:
            specialized_kernel = kernel[
                global_size, dppy.DEFAULT_LOCAL_SIZE
            ].specialize(argtypes, sycl_queue)

            is_native_atomic = (
                expected_spirv_function in specialized_kernel.assembly
            )
            assert is_native_atomic == expected_native_atomic_for_device(
                filter_str
            )
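# The ``override_config`` context manager used above is assumed to behave
# like ``numba.tests.support.override_config``: it temporarily sets an
# attribute on the config module and restores the previous value on exit,
# replacing the manual save/restore pattern seen in the earlier version of
# this test. A minimal sketch under that assumption (the helper name below
# is hypothetical, to avoid clashing with the real import):
from contextlib import contextmanager


@contextmanager
def _override_config_sketch(config_module, name, value):
    # Save the current value, install the override, and restore the old
    # value on exit even if the body raises.
    old_value = getattr(config_module, name)
    setattr(config_module, name, value)
    try:
        yield
    finally:
        setattr(config_module, name, old_value)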
def test_kernel_arg_types(filter_str, input_arrays):
    kernel = dppy.kernel(mul_kernel)
    a, actual, c = input_arrays
    expected = a * c
    device = dpctl.SyclDevice(filter_str)
    with dpctl.device_context(device):
        kernel[global_size, local_size](a, actual, c)
    np.testing.assert_allclose(actual, expected, rtol=1e-5, atol=0)
def test_return(filter_str, sig):
    a = np.array(np.random.random(122), np.int32)

    with pytest.raises(TypeError):
        kernel = dppy.kernel(sig)(f)
        device = dpctl.SyclDevice(filter_str)
        with dpctl.device_context(device):
            kernel[a.size, dppy.DEFAULT_LOCAL_SIZE](a)
def test_kernel_atomic_local(filter_str, input_arrays, return_list_of_op):
    a, dtype = input_arrays
    op_type, expected = return_list_of_op
    f = get_func_local(op_type, dtype)
    kernel = dppy.kernel(f)
    device = dpctl.SyclDevice(filter_str)
    with dpctl.device_context(device):
        kernel[global_size, global_size](a)
    assert a[0] == expected
def test_bool_type(filter_str):
    kernel = dppy.kernel(check_bool_kernel)
    a = np.array([2], np.int64)

    device = dpctl.SyclDevice(filter_str)
    with dpctl.device_context(device):
        kernel[a.size, dppy.DEFAULT_LOCAL_SIZE](a, True)
        assert a[0] == 111
        kernel[a.size, dppy.DEFAULT_LOCAL_SIZE](a, False)
        assert a[0] == 222
def test_kernel_arg_types(filter_str, input_arrays):
    if skip_test(filter_str):
        pytest.skip()

    kernel = dppy.kernel(mul_kernel)
    a, actual, c = input_arrays
    expected = a * c
    device = dpctl.SyclDevice(filter_str)
    with dppy.offload_to_sycl_device(device):
        kernel[global_size, local_size](a, actual, c)
    np.testing.assert_allclose(actual, expected, rtol=1e-5, atol=0)
def test_kernel_atomic_local(filter_str, input_arrays, return_list_of_op):
    if atomic_skip_test(filter_str):
        pytest.skip()

    a, dtype = input_arrays
    op_type, expected = return_list_of_op
    f = get_func_local(op_type, dtype)
    kernel = dppy.kernel(f)
    device = dpctl.SyclDevice(filter_str)
    with dppy.offload_to_sycl_device(device):
        kernel[global_size, global_size](a)
    assert a[0] == expected
def test_return(offload_device, sig):
    if skip_test(offload_device):
        pytest.skip()

    a = np.array(np.random.random(122), np.int32)

    with pytest.raises(TypeError):
        kernel = dppy.kernel(sig)(f)
        device = dpctl.SyclDevice(offload_device)
        with dppy.offload_to_sycl_device(device):
            kernel[a.size, dppy.DEFAULT_LOCAL_SIZE](a)
def get_kernel_multi_dim(op_type, size):
    op = getattr(dppy.atomic, op_type)
    if size == 1:
        idx = 0
    else:
        # Build a ``size``-length tuple of zeros as the multi-dim index
        idx = (0,)
        for i in range(size - 1):
            idx += (0,)

    def f(a):
        op(a, idx, 1)

    return dppy.kernel(f)
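# A more compact construction of the same zero index, as a sketch (the
# helper name is hypothetical): a 1-D atomic takes a scalar index, while
# higher dimensions take a tuple of zeros.
def _zero_index(size):
    return 0 if size == 1 else (0,) * size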
def test_bool_type(filter_str):
    if skip_test(filter_str):
        pytest.skip()

    kernel = dppy.kernel(check_bool_kernel)
    a = np.array([2], np.int64)

    device = dpctl.SyclDevice(filter_str)
    with dppy.offload_to_sycl_device(device):
        kernel[a.size, dppy.DEFAULT_LOCAL_SIZE](a, True)
        assert a[0] == 111
        kernel[a.size, dppy.DEFAULT_LOCAL_SIZE](a, False)
        assert a[0] == 222
def kernel(request):
    return dppy.kernel(access_types=request.param)(sum_kernel)
def _compile_kernel(self, fnobj, sig):
    return dppy.kernel(sig)(fnobj)