def test_create_program_from_source(self):
    """Create a program from OpenCL source on a ``"level_zero:gpu"`` queue.

    The source text defines two kernels, ``add`` and ``axpy``. The test
    only calls ``create_program_from_source``; it makes no assertion
    about the returned object.
    """
    kernel_source = " \
    kernel void add(global int* a, global int* b, global int* c) { \
        size_t index = get_global_id(0); \
        c[index] = a[index] + b[index]; \
    } \
    kernel void axpy(global int* a, global int* b, global int* c, int d) { \
        size_t index = get_global_id(0); \
        c[index] = a[index] + d*b[index]; \
    }"
    level_zero_queue = dpctl.SyclQueue("level_zero:gpu")
    dpctl_prog.create_program_from_source(level_zero_queue, kernel_source)
def test_create_program_from_source(self):
    """Run an OpenCL ``axpy`` kernel and compare against a NumPy reference.

    Three ``MemoryUSMShared`` allocations back the NumPy arrays ``a``,
    ``b`` and ``c``. The kernel computes ``c = d * a + b`` element-wise
    over 1024 work-items on the queue obtained from the
    ``"opencl:gpu:0"`` device context, and the result in ``c`` is checked
    with ``np.allclose``.
    """
    oclSrc = " \
    kernel void axpy(global int* a, global int* b, global int* c, int d) { \
        size_t index = get_global_id(0); \
        c[index] = d*a[index] + b[index]; \
    }"
    n_elems = 1024
    buf_bytes = n_elems * np.dtype("i").itemsize

    with dpctl.device_context("opencl:gpu:0"):
        q = dpctl.get_current_queue()
        prog = dpctl_prog.create_program_from_source(q, oclSrc)
        axpyKernel = prog.get_sycl_kernel("axpy")

        abuf = dpctl_mem.MemoryUSMShared(buf_bytes)
        bbuf = dpctl_mem.MemoryUSMShared(buf_bytes)
        cbuf = dpctl_mem.MemoryUSMShared(buf_bytes)
        a = np.ndarray((n_elems,), buffer=abuf, dtype="i")
        b = np.ndarray((n_elems,), buffer=bbuf, dtype="i")
        c = np.ndarray((n_elems,), buffer=cbuf, dtype="i")
        a[:] = np.arange(n_elems)
        b[:] = np.arange(n_elems, 0, -1)
        c[:] = 0
        d = 2

        # Kernel arguments: the memory objects backing the arrays, plus
        # the scalar wrapped in a ctypes int.
        args = [a.base, b.base, c.base, ctypes.c_int(d)]
        r = [n_elems]

        # Wait on the returned event before reading ``c`` on the host;
        # otherwise the comparison below can race with the kernel.
        q.submit(axpyKernel, args, r).wait()
        self.assertTrue(np.allclose(c, a * d + b))
def test_create_program_from_source(self):
    """Check the kernels exposed by a program built from OpenCL source.

    The program is created on the queue obtained from the
    ``"opencl:gpu:0"`` device context. The test asserts that the program
    is not ``None``, that it reports both the ``add`` and ``axpy``
    kernels, and that each kernel object reports the expected function
    name and argument count (3 for ``add``, 4 for ``axpy``).
    """
    kernel_source = " \
    kernel void add(global int* a, global int* b, global int* c) { \
        size_t index = get_global_id(0); \
        c[index] = a[index] + b[index]; \
    } \
    kernel void axpy(global int* a, global int* b, global int* c, int d) { \
        size_t index = get_global_id(0); \
        c[index] = a[index] + d*b[index]; \
    }"
    kernel_names = ("add", "axpy")
    expected_arg_counts = (3, 4)

    with dpctl.device_context("opencl:gpu:0"):
        queue = dpctl.get_current_queue()
        program = dpctl_prog.create_program_from_source(queue, kernel_source)
        self.assertIsNotNone(program)

        # Presence checks for every kernel come first, then the lookups,
        # then the per-kernel property checks.
        for kernel_name in kernel_names:
            self.assertTrue(program.has_sycl_kernel(kernel_name))

        kernels = [program.get_sycl_kernel(name) for name in kernel_names]

        for kernel, kernel_name in zip(kernels, kernel_names):
            self.assertEqual(kernel.get_function_name(), kernel_name)

        for kernel, arg_count in zip(kernels, expected_arg_counts):
            self.assertEqual(kernel.get_num_args(), arg_count)
def produce_event(profiling=False):
    """Submit a small ``add`` kernel on an ``"opencl:cpu"`` queue.

    Parameters
    ----------
    profiling : bool, optional
        When true, the queue is created with
        ``property="enable_profiling"``; otherwise no property is passed.

    Returns
    -------
    The object returned by ``SyclQueue.submit`` for the kernel launch.
    The function does not wait on it.
    """
    kernel_source = " \
    kernel void add(global int* a) { \
        size_t index = get_global_id(0); \
        a[index] = a[index] + 1; \
    }"
    queue_kwargs = {"property": "enable_profiling"} if profiling else {}
    queue = dpctl.SyclQueue("opencl:cpu", **queue_kwargs)

    program = dpctl_prog.create_program_from_source(queue, kernel_source)
    add_kernel = program.get_sycl_kernel("add")

    n_elems = 1024
    n_bytes = n_elems * np.dtype("i").itemsize
    usm_buf = dpctl_mem.MemoryUSMShared(n_bytes, queue=queue)
    data = np.ndarray(n_elems, buffer=usm_buf, dtype="i")
    data[:] = np.arange(n_elems)

    # The kernel takes the memory object backing ``data`` as its only
    # argument and runs over a 1-D range of ``n_elems`` work-items.
    event = queue.submit(add_kernel, [data.base], [n_elems])
    return event
def test_create_program_from_source(self):
    """Create a program from OpenCL source under ``"level0:gpu:0"``.

    The source text defines two kernels, ``add`` and ``axpy``. The
    program is created on the queue that is current inside the device
    context; no assertion is made about the result.
    """
    kernel_source = " \
    kernel void add(global int* a, global int* b, global int* c) { \
        size_t index = get_global_id(0); \
        c[index] = a[index] + b[index]; \
    } \
    kernel void axpy(global int* a, global int* b, global int* c, int d) { \
        size_t index = get_global_id(0); \
        c[index] = a[index] + d*b[index]; \
    }"
    with dpctl.device_context("level0:gpu:0"):
        current_queue = dpctl.get_current_queue()
        # The returned program is kept in a local but not inspected.
        program = dpctl_prog.create_program_from_source(
            current_queue, kernel_source
        )
def test_get_wait_list():
    """Check that ``get_wait_list`` works on an event with dependencies.

    Three kernels (``add_k``, ``sqrt_k``, ``sin_k``) are submitted to an
    ``"opencl:cpu"`` queue over the same buffer, each one listing the
    previous submission's event in ``dEvents``. The test then calls
    ``get_wait_list()`` on the last event, fails if that raises
    ``ValueError``, and asserts that the returned list is non-empty.

    The test is skipped when ``has_cpu()`` is false, instead of passing
    without having exercised anything.
    """
    if not has_cpu():
        pytest.skip("has_cpu() reported no CPU device")

    oclSrc = " \
    kernel void add_k(global float* a) { \
        size_t index = get_global_id(0); \
        a[index] = a[index] + 1; \
    } \
    kernel void sqrt_k(global float* a) { \
        size_t index = get_global_id(0); \
        a[index] = sqrt(a[index]); \
    } \
    kernel void sin_k(global float* a) { \
        size_t index = get_global_id(0); \
        a[index] = sin(a[index]); \
    }"
    q = dpctl.SyclQueue("opencl:cpu")
    prog = dpctl_prog.create_program_from_source(q, oclSrc)
    addKernel = prog.get_sycl_kernel("add_k")
    sqrtKernel = prog.get_sycl_kernel("sqrt_k")
    sinKernel = prog.get_sycl_kernel("sin_k")

    n_elems = 1024
    bufBytes = n_elems * np.dtype("f").itemsize
    abuf = dpctl_mem.MemoryUSMShared(bufBytes, queue=q)
    a = np.ndarray((n_elems,), buffer=abuf, dtype="f")
    a[:] = np.arange(n_elems)
    args = [a.base]
    r = [n_elems]

    # Chain the submissions so that each depends on the previous event.
    ev_1 = q.submit(addKernel, args, r)
    ev_2 = q.submit(sqrtKernel, args, r, dEvents=[ev_1])
    ev_3 = q.submit(sinKernel, args, r, dEvents=[ev_2])

    try:
        wait_list = ev_3.get_wait_list()
    except ValueError:
        pytest.fail("Failed to get a list of waiting events from SyclEvent")
    assert len(wait_list)
def test_create_program_from_source(ctype_str, dtype, ctypes_ctor):
    """Run a typed OpenCL ``axpy`` kernel over several range shapes.

    Parameters
    ----------
    ctype_str : str
        Type name spliced into the kernel source for the three pointer
        arguments and the scalar argument.
    dtype
        NumPy dtype of the host arrays; its ``itemsize`` sizes the
        buffers. Presumably matches ``ctype_str`` -- the pairing comes
        from the caller/parametrization, which is not visible here.
    ctypes_ctor : callable
        Called with the Python scalar ``d`` to build the scalar kernel
        argument.

    The test is skipped if an ``"opencl"`` queue with
    ``property="enable_profiling"`` cannot be created. The kernel is
    first submitted with a global range only (1-D, 2-D, 3-D), then with
    explicit global and local ranges. Each submission runs inside a
    ``dpctl.SyclTimer`` context, and the test checks both the output
    array and that the two reported timings are plain floats.
    """
    try:
        q = dpctl.SyclQueue("opencl", property="enable_profiling")
    except dpctl.SyclQueueCreationError:
        pytest.skip("OpenCL queue could not be created")
    # OpenCL's convention for indexing global_id is the opposite of
    # SYCL's (and dpctl's), which is why the kernel indexes with
    # get_global_id(0) while n_elems is the last entry of each range.
    oclSrc = (
        "kernel void axpy("
        " global " + ctype_str + " *a, global " + ctype_str + " *b,"
        " global " + ctype_str + " *c, " + ctype_str + " d) {"
        " size_t index = get_global_id(0);"
        " c[index] = d * a[index] + b[index];"
        "}"
    )
    prog = dpctl_prog.create_program_from_source(q, oclSrc)
    axpyKernel = prog.get_sycl_kernel("axpy")

    n_elems = 1024 * 512
    lws = 128
    bufBytes = n_elems * dtype.itemsize
    abuf = dpctl_mem.MemoryUSMShared(bufBytes, queue=q)
    bbuf = dpctl_mem.MemoryUSMShared(bufBytes, queue=q)
    cbuf = dpctl_mem.MemoryUSMShared(bufBytes, queue=q)
    a = np.ndarray((n_elems,), buffer=abuf, dtype=dtype)
    b = np.ndarray((n_elems,), buffer=bbuf, dtype=dtype)
    c = np.ndarray((n_elems,), buffer=cbuf, dtype=dtype)
    a[:] = np.arange(n_elems)
    b[:] = np.arange(n_elems, 0, -1)
    c[:] = 0
    d = 2
    args = [a.base, b.base, c.base, ctypes_ctor(d)]

    # The kernel only writes ``c``, so the expected result is the same
    # for every submission and is computed once.
    ref_c = a * np.array(d, dtype=dtype) + b

    assert n_elems % lws == 0

    # Global range only.
    for r in (
        [n_elems],
        [2, n_elems],
        [2, 2, n_elems],
    ):
        c[:] = 0
        timer = dpctl.SyclTimer()
        with timer(q):
            q.submit(axpyKernel, args, r).wait()
        host_dt, device_dt = timer.dt
        assert type(host_dt) is float and type(device_dt) is float
        assert np.allclose(c, ref_c), "Failed for {}".format(r)

    # Global and local ranges, with an explicit dependent-events list.
    for gr, lr in (
        ([n_elems], [lws]),
        ([2, n_elems], [2, lws // 2]),
        ([2, 2, n_elems], [2, 2, lws // 4]),
    ):
        c[:] = 0
        timer = dpctl.SyclTimer()
        with timer(q):
            q.submit(axpyKernel, args, gr, lr, [dpctl.SyclEvent()]).wait()
        host_dt, device_dt = timer.dt
        assert type(host_dt) is float and type(device_dt) is float
        assert np.allclose(c, ref_c), "Failed for {}, {}".format(gr, lr)