def main(): parser = argparse.ArgumentParser(description="Black-Scholes") parser.add_argument("--options", dest="options", type=int, default=10000000) args = parser.parse_args() options = args.options if dpctl.has_gpu_queues(): print("\nScheduling on OpenCL GPU\n") with dpctl.device_context("opencl:gpu") as gpu_queue: run(10) else: print("\nSkip scheduling on OpenCL GPU\n") # if dpctl.has_gpu_queues(dpctl.backend_type.level_zero): # print("\nScheduling on Level Zero GPU\n") # with dpctl.device_context("level0:gpu") as gpu_queue: # run(10) # else: # print("\nSkip scheduling on Level Zero GPU\n") if dpctl.has_cpu_queues(): print("\nScheduling on OpenCL CPU\n") with dpctl.device_context("opencl:cpu") as cpu_queue: run(10) else: print("\nSkip scheduling on OpenCL CPU\n")
def test_trigonometric_fn(filter_str, trig_op, input_arrays): # FIXME: Why does archcosh fail on Gen12 discrete graphics card? if trig_op == "arccosh" and is_gen12(filter_str): pytest.skip() a, b = input_arrays trig_fn = getattr(np, trig_op) actual = np.empty(shape=a.shape, dtype=a.dtype) expected = np.empty(shape=a.shape, dtype=a.dtype) if trig_op == "arctan2": @njit def f(a, b): return trig_fn(a, b) device = dpctl.SyclDevice(filter_str) with dpctl.device_context(device), assert_auto_offloading(): actual = f(a, b) expected = trig_fn(a, b) else: @njit def f(a): return trig_fn(a) device = dpctl.SyclDevice(filter_str) with dpctl.device_context(device), assert_auto_offloading(): actual = f(a) expected = trig_fn(a) np.testing.assert_allclose(actual, expected, rtol=1e-5, atol=0)
def test_get_current_device_type_inside_nested_device_ctxt(): assert dpctl.get_current_device_type() is not None with dpctl.device_context("opencl:cpu:0"): assert dpctl.get_current_device_type() == dpctl.device_type.cpu with dpctl.device_context("opencl:gpu:0"): assert dpctl.get_current_device_type() == dpctl.device_type.gpu assert dpctl.get_current_device_type() == dpctl.device_type.cpu assert dpctl.get_current_device_type() is not None
def test_get_current_device_type_inside_nested_device_ctxt(self): self.assertNotEqual(dpctl.get_current_device_type(), None) with dpctl.device_context("opencl:cpu:0"): self.assertEqual(dpctl.get_current_device_type(), dpctl.device_type.cpu) with dpctl.device_context("opencl:gpu:0"): self.assertEqual(dpctl.get_current_device_type(), dpctl.device_type.gpu) self.assertEqual(dpctl.get_current_device_type(), dpctl.device_type.cpu) self.assertNotEqual(dpctl.get_current_device_type(), None)
def test_caching_kernel_using_same_queue(filter_str): """Test kernel caching when the same queue is used to submit a kernel multiple times. Args: filter_str: SYCL filter selector string """ global_size = 10 N = global_size def data_parallel_sum(a, b, c): i = dppy.get_global_id(0) c[i] = a[i] + b[i] a = np.array(np.random.random(N), dtype=np.float32) b = np.array(np.random.random(N), dtype=np.float32) c = np.ones_like(a) with dpctl.device_context(filter_str) as gpu_queue: func = dppy.kernel(data_parallel_sum) cached_kernel = func[global_size, dppy.DEFAULT_LOCAL_SIZE].specialize( func._get_argtypes(a, b, c), gpu_queue) for i in range(10): _kernel = func[global_size, dppy.DEFAULT_LOCAL_SIZE].specialize( func._get_argtypes(a, b, c), gpu_queue) assert _kernel == cached_kernel
def test_all(dtype, shape, filter_str): size = 1 for i in range(len(shape)): size *= shape[i] for i in range(2**size): t = i a = np.empty(size, dtype=dtype) for j in range(size): a[j] = 0 if t % 2 == 0 else j + 1 t = t >> 1 a = a.reshape(shape) def fn(a): return np.all(a) f = njit(fn) with dpctl.device_context(filter_str), dpnp_debug(): actual = f(a) expected = fn(a) np.testing.assert_allclose(actual, expected, rtol=1e-3, atol=0)
def test_dppy_fallback_false(self): @numba.jit def fill_value(i): return i def inner_call_fallback(): x = 10 a = np.empty(shape=x, dtype=np.float32) for i in numba.prange(x): a[i] = fill_value(i) return a try: config.DEBUG = 1 config.FALLBACK_ON_CPU = 0 with warnings.catch_warnings(record=True) as w: device = dpctl.SyclDevice("opencl:gpu") with dpctl.device_context(device): dppy = numba.njit(parallel=True)(inner_call_fallback) dppy_fallback_false = dppy() finally: ref_result = inner_call_fallback() config.FALLBACK_ON_CPU = 1 config.DEBUG = 0 not np.testing.assert_array_equal(dppy_fallback_false, ref_result) assert "Failed to offload parfor" not in str(w[-1].message)
def run_kmeans( arrayP, arrayPclusters, arrayC, arrayCsum, arrayCnumpoint, NUMBER_OF_POINTS, NUMBER_OF_CENTROIDS, ): with dpctl.device_context(base_kmeans_gpu_graph.get_device_selector()): for i in range(REPEAT): # for i1 in range(NUMBER_OF_CENTROIDS): # arrayC[i1, 0] = arrayP[i1, 0] # arrayC[i1, 1] = arrayP[i1, 1] arrayC, arrayCsum, arrayCnumpoint = kmeans( arrayP, arrayPclusters, arrayC, arrayCsum, arrayCnumpoint, NUMBER_OF_POINTS, NUMBER_OF_CENTROIDS, )
def test_multiple_prange(self): @njit def f(a, b): # dimensions must be provided as scalar m, n = a.shape for i in prange(m): val = 10 for j in prange(n): b[i, j] = a[i, j] * val for i in prange(m): for j in prange(n): a[i, j] = a[i, j] * 10 m = 8 n = 8 a = np.ones((m, n)) b = np.ones((m, n)) device = dpctl.SyclDevice("opencl:gpu") with assert_auto_offloading( parfor_offloaded=2), dpctl.device_context(device): f(a, b) assert np.all(b == 10) assert np.all(a == 10)
def main(): # Use the environment variable SYCL_DEVICE_FILTER to change the default device. # See https://github.com/intel/llvm/blob/sycl/sycl/doc/EnvironmentVariables.md#sycl_device_filter. device = dpctl.select_default_device() print("Using device ...") device.print_device_info() with dpctl.device_context(device): c = f1(a, b) print("c:", c, hex(c.ctypes.data)) for i in range(N): for j in range(N): for k in range(N): for l in range(N): # noqa for m in range(N): if c[i, j, k, l, m] != 2.0: print( "First index not equal to 2.0 was", i, j, k, l, m, ) break print("Done...")
def local_memory(): """ This example demonstrates the usage of numba-dppy's `local.array` intrinsic function. The function is used to create a static array allocated on the devices local address space. """ blocksize = 10 @dppy.kernel def reverse_array(A): lm = dppy.local.array(shape=10, dtype=float32) i = dppy.get_global_id(0) # preload lm[i] = A[i] # barrier local or global will both work as we only have one work group dppy.barrier(dppy.CLK_LOCAL_MEM_FENCE) # local mem fence # write A[i] += lm[blocksize - 1 - i] arr = np.arange(blocksize).astype(np.float32) print(arr) # Use the environment variable SYCL_DEVICE_FILTER to change the default device. # See https://github.com/intel/llvm/blob/sycl/sycl/doc/EnvironmentVariables.md#sycl_device_filter. device = dpctl.select_default_device() print("Using device ...") device.print_device_info() with dpctl.device_context(device): reverse_array[blocksize, dppy.DEFAULT_LOCAL_SIZE](arr) # the output should be `orig[::-1] + orig, i.e. [9, 9, 9, ...]`` print(arr)
def no_arg_barrier_support(): """ This example demonstrates the usage of numba_dppy's ``barrier`` intrinsic function. The ``barrier`` function is usable only inside a ``kernel`` and is equivalent to OpenCL's ``barrier`` function. """ @dppy.kernel def twice(A): i = dppy.get_global_id(0) d = A[i] # no argument defaults to global mem fence dppy.barrier() A[i] = d * 2 N = 10 arr = np.arange(N).astype(np.float32) print(arr) # Use the environment variable SYCL_DEVICE_FILTER to change the default device. # See https://github.com/intel/llvm/blob/sycl/sycl/doc/EnvironmentVariables.md#sycl_device_filter. device = dpctl.select_default_device() print("Using device ...") device.print_device_info() with dpctl.device_context(device): twice[N, dppy.DEFAULT_LOCAL_SIZE](arr) # the output should be `arr * 2, i.e. [0, 2, 4, 6, ...]` print(arr)
def main(): size = 9 scale = 3.0 # Use the environment variable SYCL_DEVICE_FILTER to change the default device. # See https://github.com/intel/llvm/blob/sycl/sycl/doc/EnvironmentVariables.md#sycl_device_filter. device = dpctl.select_default_device() print("Using device ...") device.print_device_info() with dpctl.device_context(device): result = rand() # Random values in a given shape (3, 2) print(result) result = random_sample(size) # Array of shape (9,) with random floats in the # half-open interval [0.0, 1.0) print(result) result = random_exponential(scale, size) # Array of shape (9,) with samples from an exponential distribution print(result) result = random_normal(0.0, 0.1, size) # Array of shape (9,) with samples from a normal distribution print(result) print("Done...")
def gen_data_usm(nopt, dims): X, Y, D = gen_data(nopt, dims) with dpctl.device_context(get_device_selector()) as gpu_queue: X_usm = dpt.usm_ndarray( X.shape, dtype=X.dtype, buffer="device", buffer_ctor_kwargs={"queue": gpu_queue}, ) Y_usm = dpt.usm_ndarray( Y.shape, dtype=Y.dtype, buffer="device", buffer_ctor_kwargs={"queue": gpu_queue}, ) D_usm = dpt.usm_ndarray( D.shape, dtype=D.dtype, buffer="device", buffer_ctor_kwargs={"queue": gpu_queue}, ) X_usm.usm_data.copy_from_host(X.reshape((-1)).view("u1")) Y_usm.usm_data.copy_from_host(Y.reshape((-1)).view("u1")) return (X_usm, Y_usm, D_usm)
def test_caching_kernel_using_same_context(filter_str): """Test kernel caching for the scenario where different SYCL queues that share a SYCL context are used to submit a kernel. Args: filter_str: SYCL filter selector string """ global_size = 10 N = global_size def data_parallel_sum(a, b, c): i = dppy.get_global_id(0) c[i] = a[i] + b[i] a = np.array(np.random.random(N), dtype=np.float32) b = np.array(np.random.random(N), dtype=np.float32) c = np.ones_like(a) # Set the global queue to the default device so that the cached_kernel gets # created for that device dpctl.set_global_queue(filter_str) func = dppy.kernel(data_parallel_sum) default_queue = dpctl.get_current_queue() cached_kernel = func[global_size, dppy.DEFAULT_LOCAL_SIZE].specialize( func._get_argtypes(a, b, c), default_queue) for i in range(0, 10): # Each iteration create a fresh queue that will share the same context with dpctl.device_context(filter_str) as gpu_queue: _kernel = func[global_size, dppy.DEFAULT_LOCAL_SIZE].specialize( func._get_argtypes(a, b, c), gpu_queue) assert _kernel == cached_kernel
def test_device_array_args_gpu(self): c = np.ones_like(a) with dpctl.device_context("opencl:gpu"): data_parallel_sum[global_size, dppy.DEFAULT_LOCAL_SIZE](a, b, c) assert np.all(c == d)
def main(): blockdim = 512, 1 griddim = int(math.ceil(float(OPT_N) / blockdim[0])), 1 # Use the environment variable SYCL_DEVICE_FILTER to change the default device. # See https://github.com/intel/llvm/blob/sycl/sycl/doc/EnvironmentVariables.md#sycl_device_filter. device = dpctl.select_default_device() print("Using device ...") device.print_device_info() with dpctl.device_context(device): for i in range(iterations): black_scholes_dppy[blockdim, griddim]( callResult, putResult, stockPrice, optionStrike, optionYears, RISKFREE, VOLATILITY, ) print("callResult : \n", callResult) print("putResult : \n", putResult) print("Done...")
def test_dpctl_api(filter_str): device = dpctl.SyclDevice(filter_str) with dpctl.device_context(device): dpctl.lsplatform() dpctl.get_current_queue() dpctl.get_num_activated_queues() dpctl.is_in_device_context()
def test_create_program_from_source(self): oclSrc = " \ kernel void add(global int* a, global int* b, global int* c) { \ size_t index = get_global_id(0); \ c[index] = a[index] + b[index]; \ } \ kernel void axpy(global int* a, global int* b, global int* c, int d) { \ size_t index = get_global_id(0); \ c[index] = a[index] + d*b[index]; \ }" with dpctl.device_context("opencl:gpu:0"): q = dpctl.get_current_queue() prog = dpctl_prog.create_program_from_source(q, oclSrc) self.assertIsNotNone(prog) self.assertTrue(prog.has_sycl_kernel("add")) self.assertTrue(prog.has_sycl_kernel("axpy")) addKernel = prog.get_sycl_kernel("add") axpyKernel = prog.get_sycl_kernel("axpy") self.assertEqual(addKernel.get_function_name(), "add") self.assertEqual(axpyKernel.get_function_name(), "axpy") self.assertEqual(addKernel.get_num_args(), 3) self.assertEqual(axpyKernel.get_num_args(), 4)
def test_proper_lowering(filter_str): # We perform eager compilation at the site of # @dppy.kernel. This takes the default dpctl # queue which is level_zero backed. Level_zero # is not yet supported on Windows platform and # hence we skip these tests if the platform is # Windows regardless of which backend filter_str # specifies. if skip_if_win(): pytest.skip() # This will trigger eager compilation @dppy.kernel("void(float32[::1])") def twice(A): i = dppy.get_global_id(0) d = A[i] dppy.barrier(dppy.CLK_LOCAL_MEM_FENCE) # local mem fence A[i] = d * 2 N = 256 arr = np.random.random(N).astype(np.float32) orig = arr.copy() with dpctl.device_context(filter_str): twice[N, N // 2](arr) # The computation is correct? np.testing.assert_allclose(orig * 2, arr)
def test_dppy_func_ndarray(self): @dppy.func def g(a): return a + 1 @dppy.kernel def f(a, b): i = dppy.get_global_id(0) b[i] = g(a[i]) @dppy.kernel def h(a, b): i = dppy.get_global_id(0) b[i] = g(a[i]) + 1 a = np.ones(self.N) b = np.ones(self.N) device = dpctl.SyclDevice("opencl:gpu") with dpctl.device_context(device): f[self.N, dppy.DEFAULT_LOCAL_SIZE](a, b) assert np.all(b == 2) h[self.N, dppy.DEFAULT_LOCAL_SIZE](a, b) assert np.all(b == 3)
def test_local_memory(filter_str): blocksize = 10 if skip_if_win(): pytest.skip() @dppy.kernel("void(float32[::1])") def reverse_array(A): lm = dppy.local.array(shape=10, dtype=np.float32) i = dppy.get_global_id(0) # preload lm[i] = A[i] # barrier local or global will both work as we only have one work group dppy.barrier(dppy.CLK_LOCAL_MEM_FENCE) # local mem fence # write A[i] += lm[blocksize - 1 - i] arr = np.arange(blocksize).astype(np.float32) orig = arr.copy() with dpctl.device_context(filter_str): reverse_array[blocksize, blocksize](arr) expected = orig[::-1] + orig np.testing.assert_allclose(expected, arr)
def test_unary_ops(filter_str, unary_op, input_arrays, get_shape, capfd): a = input_arrays[0] op, name = unary_op if name != "argsort" and name != "copy": a = np.reshape(a, get_shape) if name == "cumprod" and (filter_str == "opencl:cpu:0" or a.dtype == np.int32 or is_gen12(filter_str)): pytest.skip() if name == "cumsum" and (filter_str == "opencl:cpu:0" or a.dtype == np.int32 or is_gen12(filter_str)): pytest.skip() if name == "mean" and is_gen12(filter_str): pytest.skip() if name == "argmax" and is_gen12(filter_str): pytest.skip() actual = np.empty(shape=a.shape, dtype=a.dtype) expected = np.empty(shape=a.shape, dtype=a.dtype) f = njit(op) device = dpctl.SyclDevice(filter_str) with dpctl.device_context(device), dpnp_debug(): actual = f(a) captured = capfd.readouterr() assert "dpnp implementation" in captured.out expected = op(a) np.testing.assert_allclose(actual, expected, rtol=1e-3, atol=0)
def sum_reduce(A): global_size = len(A) work_group_size = 64 nb_work_groups = global_size // work_group_size if (global_size % work_group_size) != 0: nb_work_groups += 1 partial_sums = np.zeros(nb_work_groups).astype(A.dtype) # Use the environment variable SYCL_DEVICE_FILTER to change the default device. # See https://github.com/intel/llvm/blob/sycl/sycl/doc/EnvironmentVariables.md#sycl_device_filter. device = dpctl.select_default_device() print("Using device ...") device.print_device_info() with dpctl.device_context(device): inp_buf = dpctl_mem.MemoryUSMShared(A.size * A.dtype.itemsize) inp_ndarray = np.ndarray(A.shape, buffer=inp_buf, dtype=A.dtype) np.copyto(inp_ndarray, A) partial_sums_buf = dpctl_mem.MemoryUSMShared( partial_sums.size * partial_sums.dtype.itemsize) partial_sums_ndarray = np.ndarray( partial_sums.shape, buffer=partial_sums_buf, dtype=partial_sums.dtype, ) np.copyto(partial_sums_ndarray, partial_sums) result = sum_recursive_reduction(global_size, work_group_size, inp_ndarray, partial_sums_ndarray) return result
def test_kernel_arg_accessor(filter_str, input_arrays, kernel): a, b, actual = input_arrays expected = a + b device = dpctl.SyclDevice(filter_str) with dpctl.device_context(device): call_kernel(global_size, local_size, a, b, actual, kernel) np.testing.assert_allclose(actual, expected, rtol=1e-5, atol=0)
def test_sum_reduction(self): # This test will only work for even case N = 1024 assert N % 2 == 0 A = np.array(np.random.random(N), dtype=np.float32) A_copy = A.copy() # at max we will require half the size of A to store sum R = np.array(np.random.random(math.ceil(N / 2)), dtype=np.float32) device = dpctl.SyclDevice("opencl:gpu") with dpctl.device_context(device): total = N while total > 1: # call kernel global_size = total // 2 reduction_kernel[global_size, dppy.DEFAULT_LOCAL_SIZE]( A, R, global_size ) total = total // 2 result = A_copy.sum() max_abs_err = result - R[0] assert max_abs_err < 1e-4
def main(): global_size = 10 N = global_size print("N", N) a = np.array(np.random.random(N), dtype=np.float32) b = np.array(np.random.random(N), dtype=np.float32) c = np.ones_like(a) # Use the environment variable SYCL_DEVICE_FILTER to change the default device. # See https://github.com/intel/llvm/blob/sycl/sycl/doc/EnvironmentVariables.md#sycl_device_filter. device = dpctl.select_default_device() print("Using device ...") device.print_device_info() with dpctl.device_context(device): da = dpt.usm_ndarray(a.shape, dtype=a.dtype, buffer="shared") da.usm_data.copy_from_host(a.reshape((-1)).view("|u1")) db = dpt.usm_ndarray(b.shape, dtype=b.dtype, buffer="shared") db.usm_data.copy_from_host(b.reshape((-1)).view("|u1")) dc = dpt.usm_ndarray(c.shape, dtype=c.dtype, buffer="shared") driver(da, db, dc, global_size) print("Done...")
def main(): global_size = 64 local_size = 32 N = global_size * local_size print("N", N) a = np.ones(N, dtype=np.float32) b = np.ones(N, dtype=np.float32) print("a:", a, hex(a.ctypes.data)) print("b:", b, hex(b.ctypes.data)) # Use the environment variable SYCL_DEVICE_FILTER to change # the default device. See # https://github.com/intel/llvm/blob/sycl/sycl/doc/EnvironmentVariables.md#sycl_device_filter. device = dpctl.select_default_device() print("Using device ...") device.print_device_info() with dpctl.device_context(device): c = f1(a, b) print("RESULT c:", c, hex(c.ctypes.data)) for i in range(N): if c[i] != 2.0: print("First index not equal to 2.0 was", i) break print("Done...")
def test_create_program_from_source(self): oclSrc = " \ kernel void axpy(global int* a, global int* b, global int* c, int d) { \ size_t index = get_global_id(0); \ c[index] = d*a[index] + b[index]; \ }" with dpctl.device_context("opencl:gpu:0"): q = dpctl.get_current_queue() prog = dpctl_prog.create_program_from_source(q, oclSrc) axpyKernel = prog.get_sycl_kernel("axpy") abuf = dpctl_mem.MemoryUSMShared(1024 * np.dtype("i").itemsize) bbuf = dpctl_mem.MemoryUSMShared(1024 * np.dtype("i").itemsize) cbuf = dpctl_mem.MemoryUSMShared(1024 * np.dtype("i").itemsize) a = np.ndarray((1024), buffer=abuf, dtype="i") b = np.ndarray((1024), buffer=bbuf, dtype="i") c = np.ndarray((1024), buffer=cbuf, dtype="i") a[:] = np.arange(1024) b[:] = np.arange(1024, 0, -1) c[:] = 0 d = 2 args = [] args.append(a.base) args.append(b.base) args.append(c.base) args.append(ctypes.c_int(d)) r = [1024] q.submit(axpyKernel, args, r) self.assertTrue(np.allclose(c, a * d + b))
def test_kernel_atomic_simple(filter_str, input_arrays, kernel_result_pair): a, dtype = input_arrays kernel, expected = kernel_result_pair device = dpctl.SyclDevice(filter_str) with dpctl.device_context(device): kernel[global_size, dppy.DEFAULT_LOCAL_SIZE](a) assert a[0] == expected