def test_trigonometric_fn(filter_str, trig_op, input_arrays): if skip_test(filter_str): pytest.skip() # FIXME: Why does archcosh fail on Gen12 discrete graphics card? if trig_op == "arccosh" and is_gen12(filter_str): pytest.skip() a, b = input_arrays trig_fn = getattr(np, trig_op) actual = np.empty(shape=a.shape, dtype=a.dtype) expected = np.empty(shape=a.shape, dtype=a.dtype) if trig_op == "arctan2": @njit def f(a, b): return trig_fn(a, b) device = dpctl.SyclDevice(filter_str) with dppy.offload_to_sycl_device(device), assert_auto_offloading(): actual = f(a, b) expected = trig_fn(a, b) else: @njit def f(a): return trig_fn(a) device = dpctl.SyclDevice(filter_str) with dppy.offload_to_sycl_device(device), assert_auto_offloading(): actual = f(a) expected = trig_fn(a) np.testing.assert_allclose(actual, expected, rtol=1e-5, atol=0)
def test_atomic_fp_native(filter_str, return_list_of_op, fdtype, addrspace): LLVM_SPIRV_ROOT = os.environ.get("NUMBA_DPPY_LLVM_SPIRV_ROOT") if LLVM_SPIRV_ROOT == "" or LLVM_SPIRV_ROOT is None: pytest.skip( "Please set envar NUMBA_DPPY_LLVM_SPIRV_ROOT to run this test" ) if atomic_skip_test(filter_str): pytest.skip() a = np.array([0], fdtype) op_type, expected = return_list_of_op if addrspace == "global": op = getattr(dppy.atomic, op_type) def f(a): op(a, 0, 1) elif addrspace == "local": f = get_func_local(op_type, fdtype) kernel = dppy.kernel(f) NATIVE_FP_ATOMICS_old_val = config.NATIVE_FP_ATOMICS config.NATIVE_FP_ATOMICS = 1 LLVM_SPIRV_ROOT_old_val = config.LLVM_SPIRV_ROOT config.LLVM_SPIRV_ROOT = LLVM_SPIRV_ROOT with dppy.offload_to_sycl_device(filter_str) as sycl_queue: kern = kernel[global_size, dppy.DEFAULT_LOCAL_SIZE].specialize( kernel._get_argtypes(a), sycl_queue ) if filter_str != "opencl:cpu:0": assert "__spirv_AtomicFAddEXT" in kern.assembly else: assert "__spirv_AtomicFAddEXT" not in kern.assembly config.NATIVE_FP_ATOMICS = 0 # To bypass caching kernel = dppy.kernel(f) with dppy.offload_to_sycl_device(filter_str) as sycl_queue: kern = kernel[global_size, dppy.DEFAULT_LOCAL_SIZE].specialize( kernel._get_argtypes(a), sycl_queue ) assert "__spirv_AtomicFAddEXT" not in kern.assembly config.NATIVE_FP_ATOMICS = NATIVE_FP_ATOMICS_old_val config.LLVM_SPIRV_ROOT = LLVM_SPIRV_ROOT_old_val
def test_multiple_prange(self): @njit def f(a, b): # dimensions must be provided as scalar m, n = a.shape for i in prange(m): val = 10 for j in prange(n): b[i, j] = a[i, j] * val for i in prange(m): for j in prange(n): a[i, j] = a[i, j] * 10 m = 8 n = 8 a = np.ones((m, n)) b = np.ones((m, n)) device = dpctl.SyclDevice("opencl:gpu") with assert_auto_offloading( parfor_offloaded=2), dppy.offload_to_sycl_device(device): f(a, b) self.assertTrue(np.all(b == 10)) self.assertTrue(np.all(a == 10))
def test_local_memory(filter_str): if skip_test(filter_str): pytest.skip() blocksize = 10 if skip_if_win(): pytest.skip() @dppy.kernel("void(float32[::1])") def reverse_array(A): lm = dppy.local.array(shape=10, dtype=np.float32) i = dppy.get_global_id(0) # preload lm[i] = A[i] # barrier local or global will both work as we only have one work group dppy.barrier(dppy.CLK_LOCAL_MEM_FENCE) # local mem fence # write A[i] += lm[blocksize - 1 - i] arr = np.arange(blocksize).astype(np.float32) orig = arr.copy() with dppy.offload_to_sycl_device(filter_str): reverse_array[blocksize, blocksize](arr) expected = orig[::-1] + orig np.testing.assert_allclose(expected, arr)
def test_proper_lowering(filter_str): if skip_test(filter_str): pytest.skip() # We perform eager compilation at the site of # @dppy.kernel. This takes the default dpctl # queue which is level_zero backed. Level_zero # is not yet supported on Windows platform and # hence we skip these tests if the platform is # Windows regardless of which backend filter_str # specifies. if skip_if_win(): pytest.skip() # This will trigger eager compilation @dppy.kernel("void(float32[::1])") def twice(A): i = dppy.get_global_id(0) d = A[i] dppy.barrier(dppy.CLK_LOCAL_MEM_FENCE) # local mem fence A[i] = d * 2 N = 256 arr = np.random.random(N).astype(np.float32) orig = arr.copy() with dppy.offload_to_sycl_device(filter_str): twice[N, N // 2](arr) # The computation is correct? np.testing.assert_allclose(orig * 2, arr)
def test_dppy_fallback_false(self): @numba.jit def fill_value(i): return i def inner_call_fallback(): x = 10 a = np.empty(shape=x, dtype=np.float32) for i in numba.prange(x): a[i] = fill_value(i) return a try: config.DEBUG = 1 config.FALLBACK_ON_CPU = 0 with warnings.catch_warnings(record=True) as w: device = dpctl.SyclDevice("opencl:gpu") with numba_dppy.offload_to_sycl_device(device): dppy = numba.njit(parallel=True)(inner_call_fallback) dppy_fallback_false = dppy() finally: ref_result = inner_call_fallback() config.FALLBACK_ON_CPU = 1 config.DEBUG = 0 not np.testing.assert_array_equal(dppy_fallback_false, ref_result) self.assertNotIn("Failed to offload parfor", str(w[-1].message))
def test_vectorize(filter_str, shape, dtypes, input_type): if _helper.platform_not_supported(filter_str): pytest.skip() if _helper.skip_test(filter_str): pytest.skip() def vector_add(a, b): return a + b dtype, sig_dtype = dtypes sig = [sig_dtype(sig_dtype, sig_dtype)] size, shape = shape if input_type == "array": A = np.arange(size, dtype=dtype).reshape(shape) B = np.arange(size, dtype=dtype).reshape(shape) elif input_type == "scalar": A = dtype(1.2) B = dtype(2.3) with dppy.offload_to_sycl_device(filter_str): f = vectorize(sig, target="dppy")(vector_add) expected = f(A, B) actual = vector_add(A, B) max_abs_err = np.sum(expected) - np.sum(actual) assert max_abs_err < 1e-5
def test_device_array_args_gpu(self): c = np.ones_like(a) with dppy.offload_to_sycl_device("opencl:gpu"): data_parallel_sum[global_size, dppy.DEFAULT_LOCAL_SIZE](a, b, c) self.assertTrue(np.all(c == d))
def main(): size = 9 scale = 3.0 # Use the environment variable SYCL_DEVICE_FILTER to change the default device. # See https://github.com/intel/llvm/blob/sycl/sycl/doc/EnvironmentVariables.md#sycl_device_filter. device = dpctl.select_default_device() print("Using device ...") device.print_device_info() with dppy.offload_to_sycl_device(device): result = rand() # Random values in a given shape (3, 2) print(result) result = random_sample(size) # Array of shape (9,) with random floats in the # half-open interval [0.0, 1.0) print(result) result = random_exponential(scale, size) # Array of shape (9,) with samples from an exponential distribution print(result) result = random_normal(0.0, 0.1, size) # Array of shape (9,) with samples from a normal distribution print(result) print("Done...")
def main(): parser = argparse.ArgumentParser() parser.add_argument( "--api", required=False, default="numba", choices=["numba", "numba-dppy"], help="Start the version of functions using numba or numba-dppy API", ) args = parser.parse_args() print("Using API:", args.api) global_size = 10 N = global_size a = np.arange(N, dtype=np.float32) b = np.arange(N, dtype=np.float32) c = np.empty_like(a) if args.api == "numba-dppy": device = dpctl.select_default_device() with dppy.offload_to_sycl_device(device): dppy_kernel[global_size, dppy.DEFAULT_LOCAL_SIZE](a, b, c) else: numba_func_driver(a, b, c) print("Done...")
def test_eig(filter_str, eig_input, capfd): if skip_test(filter_str): pytest.skip() a = eig_input fn = get_fn("linalg.eig", 1) f = njit(fn) device = dpctl.SyclDevice(filter_str) with dppy.offload_to_sycl_device(device), dpnp_debug(): actual_val, actual_vec = f(a) captured = capfd.readouterr() assert "dpnp implementation" in captured.out expected_val, expected_vec = fn(a) # sort val/vec by abs value vvsort(actual_val, actual_vec) vvsort(expected_val, expected_vec) # NP change sign of vectors for i in range(expected_vec.shape[1]): if expected_vec[0, i] * actual_vec[0, i] < 0: expected_vec[:, i] = -expected_vec[:, i] assert np.allclose(actual_val, expected_val) assert np.allclose(actual_vec, expected_vec)
def test_caching_kernel_using_same_context(filter_str): """Test kernel caching for the scenario where different SYCL queues that share a SYCL context are used to submit a kernel. Args: filter_str: SYCL filter selector string """ if skip_test(filter_str): pytest.skip() global_size = 10 N = global_size def data_parallel_sum(a, b, c): i = dppy.get_global_id(0) c[i] = a[i] + b[i] a = np.array(np.random.random(N), dtype=np.float32) b = np.array(np.random.random(N), dtype=np.float32) c = np.ones_like(a) # Set the global queue to the default device so that the cached_kernel gets # created for that device dpctl.set_global_queue(filter_str) func = dppy.kernel(data_parallel_sum) default_queue = dpctl.get_current_queue() cached_kernel = func[global_size, dppy.DEFAULT_LOCAL_SIZE].specialize( func._get_argtypes(a, b, c), default_queue) for i in range(0, 10): # Each iteration create a fresh queue that will share the same context with dppy.offload_to_sycl_device(filter_str) as gpu_queue: _kernel = func[global_size, dppy.DEFAULT_LOCAL_SIZE].specialize( func._get_argtypes(a, b, c), gpu_queue) assert _kernel == cached_kernel
def test_caching_kernel_using_same_queue(filter_str): """Test kernel caching when the same queue is used to submit a kernel multiple times. Args: filter_str: SYCL filter selector string """ if skip_test(filter_str): pytest.skip() global_size = 10 N = global_size def data_parallel_sum(a, b, c): i = dppy.get_global_id(0) c[i] = a[i] + b[i] a = np.array(np.random.random(N), dtype=np.float32) b = np.array(np.random.random(N), dtype=np.float32) c = np.ones_like(a) with dppy.offload_to_sycl_device(filter_str) as gpu_queue: func = dppy.kernel(data_parallel_sum) cached_kernel = func[global_size, dppy.DEFAULT_LOCAL_SIZE].specialize( func._get_argtypes(a, b, c), gpu_queue) for i in range(10): _kernel = func[global_size, dppy.DEFAULT_LOCAL_SIZE].specialize( func._get_argtypes(a, b, c), gpu_queue) assert _kernel == cached_kernel
def main(): global_size = 10 N = global_size print("N", N) a = np.array(np.random.random(N), dtype=np.float32) b = np.array(np.random.random(N), dtype=np.float32) c = np.ones_like(a) # Use the environment variable SYCL_DEVICE_FILTER to change the default device. # See https://github.com/intel/llvm/blob/sycl/sycl/doc/EnvironmentVariables.md#sycl_device_filter. device = dpctl.select_default_device() print("Using device ...") device.print_device_info() with dppy.offload_to_sycl_device(device): da = dpt.usm_ndarray(a.shape, dtype=a.dtype, buffer="shared") da.usm_data.copy_from_host(a.reshape((-1)).view("|u1")) db = dpt.usm_ndarray(b.shape, dtype=b.dtype, buffer="shared") db.usm_data.copy_from_host(b.reshape((-1)).view("|u1")) dc = dpt.usm_ndarray(c.shape, dtype=c.dtype, buffer="shared") driver(da, db, dc, global_size) print("Done...")
def test_all(dtype, shape, filter_str): if skip_test(filter_str): pytest.skip() size = 1 for i in range(len(shape)): size *= shape[i] for i in range(2**size): t = i a = np.empty(size, dtype=dtype) for j in range(size): a[j] = 0 if t % 2 == 0 else j + 1 t = t >> 1 a = a.reshape(shape) def fn(a): return np.all(a) f = njit(fn) with dppy.offload_to_sycl_device(filter_str), dpnp_debug(): actual = f(a) expected = fn(a) np.testing.assert_allclose(actual, expected, rtol=1e-3, atol=0)
def no_arg_barrier_support(): """ This example demonstrates the usage of numba_dppy's ``barrier`` intrinsic function. The ``barrier`` function is usable only inside a ``kernel`` and is equivalent to OpenCL's ``barrier`` function. """ @dppy.kernel def twice(A): i = dppy.get_global_id(0) d = A[i] # no argument defaults to global mem fence dppy.barrier() A[i] = d * 2 N = 10 arr = np.arange(N).astype(np.float32) print(arr) # Use the environment variable SYCL_DEVICE_FILTER to change the default device. # See https://github.com/intel/llvm/blob/sycl/sycl/doc/EnvironmentVariables.md#sycl_device_filter. device = dpctl.select_default_device() print("Using device ...") device.print_device_info() with dppy.offload_to_sycl_device(device): twice[N, dppy.DEFAULT_LOCAL_SIZE](arr) # the output should be `arr * 2, i.e. [0, 2, 4, 6, ...]` print(arr)
def test_consuming_array_from_dpnp(offload_device, dtype): if not ensure_dpnp(): pytest.skip("No DPNP") import dpnp if skip_test(offload_device): pytest.skip("No device for " + offload_device) if ("opencl" not in dpctl.get_current_queue().sycl_device.filter_string and "opencl" in offload_device): pytest.skip("Bug in DPNP. See: IntelPython/dpnp#723") @dppy.kernel def data_parallel_sum(a, b, c): """ Vector addition using the ``kernel`` decorator. """ i = dppy.get_global_id(0) c[i] = a[i] + b[i] global_size = 1021 with dppy.offload_to_sycl_device(offload_device): a = dppy.asarray(dpnp.arange(global_size, dtype=dtype)) b = dppy.asarray(dpnp.arange(global_size, dtype=dtype)) c = dppy.asarray(dpnp.ones_like(a)) data_parallel_sum[global_size, dppy.DEFAULT_LOCAL_SIZE](a, b, c)
def local_memory(): """ This example demonstrates the usage of numba-dppy's `local.array` intrinsic function. The function is used to create a static array allocated on the devices local address space. """ blocksize = 10 @dppy.kernel def reverse_array(A): lm = dppy.local.array(shape=10, dtype=float32) i = dppy.get_global_id(0) # preload lm[i] = A[i] # barrier local or global will both work as we only have one work group dppy.barrier(dppy.CLK_LOCAL_MEM_FENCE) # local mem fence # write A[i] += lm[blocksize - 1 - i] arr = np.arange(blocksize).astype(np.float32) print(arr) # Use the environment variable SYCL_DEVICE_FILTER to change the default device. # See https://github.com/intel/llvm/blob/sycl/sycl/doc/EnvironmentVariables.md#sycl_device_filter. device = dpctl.select_default_device() print("Using device ...") device.print_device_info() with dppy.offload_to_sycl_device(device): reverse_array[blocksize, dppy.DEFAULT_LOCAL_SIZE](arr) # the output should be `orig[::-1] + orig, i.e. [9, 9, 9, ...]`` print(arr)
def test_njit(filter_str): if _helper.platform_not_supported(filter_str): pytest.skip() if _helper.skip_test(filter_str): pytest.skip() @vectorize(nopython=True) def axy(a, x, y): return a * x + y def f(a0, a1): return np.cos(axy(a0, np.sin(a1) - 1.0, 1.0)) A = np.random.random(10) B = np.random.random(10) device = dpctl.SyclDevice(filter_str) with dppy.offload_to_sycl_device(device), assert_auto_offloading(): f_njit = njit(f) expected = f_njit(A, B) actual = f(A, B) max_abs_err = expected.sum() - actual.sum() assert max_abs_err < 1e-5
def test_dppy_func_ndarray(self): @dppy.func def g(a): return a + 1 @dppy.kernel def f(a, b): i = dppy.get_global_id(0) b[i] = g(a[i]) @dppy.kernel def h(a, b): i = dppy.get_global_id(0) b[i] = g(a[i]) + 1 a = np.ones(self.N) b = np.ones(self.N) device = dpctl.SyclDevice("opencl:gpu") with dppy.offload_to_sycl_device(device): f[self.N, dppy.DEFAULT_LOCAL_SIZE](a, b) self.assertTrue(np.all(b == 2)) h[self.N, dppy.DEFAULT_LOCAL_SIZE](a, b) self.assertTrue(np.all(b == 3))
def private_memory(): """ This example demonstrates the usage of numba-dppy's `private.array` intrinsic function. The function is used to create a static array allocated on the devices private address space. """ @numba_dppy.kernel def private_memory_kernel(A): memory = numba_dppy.private.array(shape=1, dtype=np.float32) i = numba_dppy.get_global_id(0) # preload memory[0] = i numba_dppy.barrier(numba_dppy.CLK_LOCAL_MEM_FENCE) # local mem fence # memory will not hold correct deterministic result if it is not # private to each thread. A[i] = memory[0] * 2 N = 4 arr = np.zeros(N).astype(np.float32) orig = np.arange(N).astype(np.float32) # Use the environment variable SYCL_DEVICE_FILTER to change the default device. # See https://github.com/intel/llvm/blob/sycl/sycl/doc/EnvironmentVariables.md#sycl_device_filter. device = dpctl.select_default_device() print("Using device ...") device.print_device_info() with numba_dppy.offload_to_sycl_device(device): private_memory_kernel[N, N](arr) np.testing.assert_allclose(orig * 2, arr) # the output should be `orig[i] * 2, i.e. [0, 2, 4, ..]`` print(arr)
def sum_reduce(A): global_size = len(A) work_group_size = 64 nb_work_groups = global_size // work_group_size if (global_size % work_group_size) != 0: nb_work_groups += 1 partial_sums = np.zeros(nb_work_groups).astype(A.dtype) # Use the environment variable SYCL_DEVICE_FILTER to change the default device. # See https://github.com/intel/llvm/blob/sycl/sycl/doc/EnvironmentVariables.md#sycl_device_filter. device = dpctl.select_default_device() print("Using device ...") device.print_device_info() with dppy.offload_to_sycl_device(device): inp_buf = dpctl_mem.MemoryUSMShared(A.size * A.dtype.itemsize) inp_ndarray = np.ndarray(A.shape, buffer=inp_buf, dtype=A.dtype) np.copyto(inp_ndarray, A) partial_sums_buf = dpctl_mem.MemoryUSMShared( partial_sums.size * partial_sums.dtype.itemsize ) partial_sums_ndarray = np.ndarray( partial_sums.shape, buffer=partial_sums_buf, dtype=partial_sums.dtype, ) np.copyto(partial_sums_ndarray, partial_sums) result = sum_recursive_reduction( global_size, work_group_size, inp_ndarray, partial_sums_ndarray ) return result
def test_dppy_fallback_inner_call(self): @numba.jit def fill_value(i): return i def inner_call_fallback(): x = 10 a = np.empty(shape=x, dtype=np.float32) for i in numba.prange(x): a[i] = fill_value(i) return a device = dpctl.SyclDevice("opencl:gpu") with warnings.catch_warnings( record=True ) as w, numba_dppy.offload_to_sycl_device(device): dppy = numba.njit(inner_call_fallback) dppy_result = dppy() ref_result = inner_call_fallback() np.testing.assert_array_equal(dppy_result, ref_result) self.assertIn("Failed to offload parfor ", str(w[-1].message))
def main(): global_size = 64 local_size = 32 N = global_size * local_size print("N", N) a = np.ones(N, dtype=np.float32) b = np.ones(N, dtype=np.float32) print("a:", a, hex(a.ctypes.data)) print("b:", b, hex(b.ctypes.data)) # Use the environment variable SYCL_DEVICE_FILTER to change # the default device. See # https://github.com/intel/llvm/blob/sycl/sycl/doc/EnvironmentVariables.md#sycl_device_filter. device = dpctl.select_default_device() print("Using device ...") device.print_device_info() with dppy.offload_to_sycl_device(device): c = f1(a, b) print("RESULT c:", c, hex(c.ctypes.data)) for i in range(N): if c[i] != 2.0: print("First index not equal to 2.0 was", i) break print("Done...")
def test_with_dppy_context_cpu(self): @njit def nested_func(a, b): np.sin(a, b) @njit def func(b): a = np.ones((64), dtype=np.float64) nested_func(a, b) config.DEBUG = 1 expected = np.ones((64), dtype=np.float64) got_cpu = np.ones((64), dtype=np.float64) with captured_stdout() as got_cpu_message: device = dpctl.SyclDevice("opencl:cpu") with dppy.offload_to_sycl_device(device): func(got_cpu) config.DEBUG = 0 func(expected) np.testing.assert_array_equal(expected, got_cpu) self.assertTrue( "Parfor offloaded to opencl:cpu" in got_cpu_message.getvalue())
def main(): blockdim = 512, 1 griddim = int(math.ceil(float(OPT_N) / blockdim[0])), 1 # Use the environment variable SYCL_DEVICE_FILTER to change the default device. # See https://github.com/intel/llvm/blob/sycl/sycl/doc/EnvironmentVariables.md#sycl_device_filter. device = dpctl.select_default_device() print("Using device ...") device.print_device_info() with dppy.offload_to_sycl_device(device): for i in range(iterations): black_scholes_dppy[blockdim, griddim]( callResult, putResult, stockPrice, optionStrike, optionYears, RISKFREE, VOLATILITY, ) print("callResult : \n", callResult) print("putResult : \n", putResult) print("Done...")
def main(): # Use the environment variable SYCL_DEVICE_FILTER to change the default device. # See https://github.com/intel/llvm/blob/sycl/sycl/doc/EnvironmentVariables.md#sycl_device_filter. device = dpctl.select_default_device() print("Using device ...") device.print_device_info() with dppy.offload_to_sycl_device(device): c = f1(a, b) print("c:", c, hex(c.ctypes.data)) for i in range(N): for j in range(N): for k in range(N): for l in range(N): # noqa for m in range(N): if c[i, j, k, l, m] != 2.0: print( "First index not equal to 2.0 was", i, j, k, l, m, ) break print("Done...")
def test_dpctl_api(filter_str): if skip_test(filter_str): pytest.skip() device = dpctl.SyclDevice(filter_str) with dppy.offload_to_sycl_device(device): dpctl.lsplatform() dpctl.get_current_queue() dpctl.get_num_activated_queues() dpctl.is_in_device_context()
def test_kernel_arg_accessor(filter_str, input_arrays, kernel): if skip_test(filter_str): pytest.skip() a, b, actual = input_arrays expected = a + b device = dpctl.SyclDevice(filter_str) with dppy.offload_to_sycl_device(device): call_kernel(global_size, local_size, a, b, actual, kernel) np.testing.assert_allclose(actual, expected, rtol=1e-5, atol=0)
def test_kernel_atomic_simple(filter_str, input_arrays, kernel_result_pair): if atomic_skip_test(filter_str): pytest.skip() a, dtype = input_arrays kernel, expected = kernel_result_pair device = dpctl.SyclDevice(filter_str) with dppy.offload_to_sycl_device(device): kernel[global_size, dppy.DEFAULT_LOCAL_SIZE](a) assert a[0] == expected