def sum_reduce(A): global_size = len(A) work_group_size = 64 nb_work_groups = global_size // work_group_size if (global_size % work_group_size) != 0: nb_work_groups += 1 partial_sums = np.zeros(nb_work_groups).astype(A.dtype) # Use the environment variable SYCL_DEVICE_FILTER to change the default device. # See https://github.com/intel/llvm/blob/sycl/sycl/doc/EnvironmentVariables.md#sycl_device_filter. device = dpctl.select_default_device() print("Using device ...") device.print_device_info() with dpctl.device_context(device): inp_buf = dpctl_mem.MemoryUSMShared(A.size * A.dtype.itemsize) inp_ndarray = np.ndarray(A.shape, buffer=inp_buf, dtype=A.dtype) np.copyto(inp_ndarray, A) partial_sums_buf = dpctl_mem.MemoryUSMShared( partial_sums.size * partial_sums.dtype.itemsize) partial_sums_ndarray = np.ndarray( partial_sums.shape, buffer=partial_sums_buf, dtype=partial_sums.dtype, ) np.copyto(partial_sums_ndarray, partial_sums) result = sum_recursive_reduction(global_size, work_group_size, inp_ndarray, partial_sums_ndarray) return result
def driver(): # measure running time times = list() xbuf = dpctl_mem.MemoryUSMShared(X.size * X.dtype.itemsize) x_ndarray = np.ndarray(X.shape, buffer=xbuf, dtype=X.dtype) np.copyto(x_ndarray, X) dbuf = dpctl_mem.MemoryUSMShared(D.size * D.dtype.itemsize) d_ndarray = np.ndarray(D.shape, buffer=dbuf, dtype=D.dtype) np.copyto(d_ndarray, D) for repeat in range(args.r): start = time() pairwise_distance[global_size, local_size](x_ndarray, d_ndarray, X.shape[0], X.shape[1]) end = time() total_time = end - start times.append(total_time) np.copyto(X, x_ndarray) np.copyto(D, d_ndarray) return times
def test_create_program_from_source(self): oclSrc = " \ kernel void axpy(global int* a, global int* b, global int* c, int d) { \ size_t index = get_global_id(0); \ c[index] = d*a[index] + b[index]; \ }" with dpctl.device_context("opencl:gpu:0"): q = dpctl.get_current_queue() prog = dpctl_prog.create_program_from_source(q, oclSrc) axpyKernel = prog.get_sycl_kernel("axpy") abuf = dpctl_mem.MemoryUSMShared(1024 * np.dtype("i").itemsize) bbuf = dpctl_mem.MemoryUSMShared(1024 * np.dtype("i").itemsize) cbuf = dpctl_mem.MemoryUSMShared(1024 * np.dtype("i").itemsize) a = np.ndarray((1024), buffer=abuf, dtype="i") b = np.ndarray((1024), buffer=bbuf, dtype="i") c = np.ndarray((1024), buffer=cbuf, dtype="i") a[:] = np.arange(1024) b[:] = np.arange(1024, 0, -1) c[:] = 0 d = 2 args = [] args.append(a.base) args.append(b.base) args.append(c.base) args.append(ctypes.c_int(d)) r = [1024] q.submit(axpyKernel, args, r) self.assertTrue(np.allclose(c, a * d + b))
def test_context_multi_device(): try: d = dpctl.SyclDevice("cpu") except ValueError: pytest.skip() if d.default_selector_score < 0: pytest.skip() n = d.max_compute_units n1 = n // 2 n2 = n - n1 if n1 == 0 or n2 == 0: pytest.skip() d1, d2 = d.create_sub_devices(partition=(n1, n2)) ctx = dpctl.SyclContext((d1, d2)) assert ctx.device_count == 2 assert type(repr(ctx)) is str q1 = dpctl.SyclQueue(ctx, d1) q2 = dpctl.SyclQueue(ctx, d2) import dpctl.memory as dpmem shmem_1 = dpmem.MemoryUSMShared(256, queue=q1) shmem_2 = dpmem.MemoryUSMDevice(256, queue=q2) shmem_2.copy_from_device(shmem_1) # create context for single sub-device ctx1 = dpctl.SyclContext(d1) q1 = dpctl.SyclQueue(ctx1, d1) shmem_1 = dpmem.MemoryUSMShared(256, queue=q1) cap = ctx1._get_capsule() cap2 = ctx1._get_capsule() del ctx1 del cap2 # exercise deleter of non-renamed capsule ctx2 = dpctl.SyclContext(cap) q2 = dpctl.SyclQueue(ctx2, d1) shmem_2 = dpmem.MemoryUSMDevice(256, queue=q2) shmem_2.copy_from_device(shmem_1)
def test_ctor_invalid(): m = dpm.MemoryUSMShared(12) with pytest.raises(ValueError): dpt.usm_ndarray((4, ), dtype="i4", buffer=m) m = dpm.MemoryUSMShared(64) with pytest.raises(ValueError): dpt.usm_ndarray((4, ), dtype="u1", buffer=m, strides={"not": "valid"})
def __init__(self, shape, dtype="d", host_buffer=None): nelems = np.prod(shape) bytes = nelems * np.dtype(dtype).itemsize shmem = dpmem.MemoryUSMShared(bytes) if isinstance(host_buffer, np.ndarray): shmem.copy_from_host(host_buffer.view(dtype="|u1")) self.arr = np.ndarray(shape, dtype=dtype, buffer=shmem)
def gen_option_params(n_opts, pl, ph, sl, sh, tl, th, rl, rh, vl, vh, dtype, queue=None): nbytes = n_opts * 5 * np.dtype(dtype).itemsize usm_mem = dpctl_mem.MemoryUSMShared(nbytes, queue=queue) params = np.ndarray(shape=(n_opts, 5), buffer=usm_mem, dtype=dtype) seed = 1234 bs.populate_params(params, pl, ph, sl, sh, tl, th, rl, rh, vl, vh, seed, queue=queue) return params
def gen_option_params(n_opts, pl, ph, sl, sh, tl, th, rl, rh, vl, vh, dtype): usm_mem = dpctl_mem.MemoryUSMShared(n_opts * 5 * np.dtype(dtype).itemsize) # usm_mem2 = dpctl_mem.MemoryUSMDevice(n_opts * 5 * np.dtype(dtype).itemsize) params = np.ndarray(shape=(n_opts, 5), buffer=usm_mem, dtype=dtype) seed = 1234 bs.populate_params(params, pl, ph, sl, sh, tl, th, rl, rh, vl, vh, seed) return params
def produce_event(profiling=False): oclSrc = " \ kernel void add(global int* a) { \ size_t index = get_global_id(0); \ a[index] = a[index] + 1; \ }" if profiling: q = dpctl.SyclQueue("opencl:cpu", property="enable_profiling") else: q = dpctl.SyclQueue("opencl:cpu") prog = dpctl_prog.create_program_from_source(q, oclSrc) addKernel = prog.get_sycl_kernel("add") bufBytes = 1024 * np.dtype("i").itemsize abuf = dpctl_mem.MemoryUSMShared(bufBytes, queue=q) a = np.ndarray((1024), buffer=abuf, dtype="i") a[:] = np.arange(1024) args = [] args.append(a.base) r = [1024] ev = q.submit(addKernel, args, r) return ev
def test_has_usm_memory(filter_str): a = np.ones(1023, dtype=np.float32) with dpctl.device_context(filter_str): # test usm_ndarray da = dpt.usm_ndarray(a.shape, dtype=a.dtype, buffer="shared") usm_mem = has_usm_memory(da) assert da.usm_data._pointer == usm_mem._pointer # test usm allocated numpy.ndarray buf = dpctl_mem.MemoryUSMShared(a.size * a.dtype.itemsize) ary_buf = np.ndarray(a.shape, buffer=buf, dtype=a.dtype) usm_mem = has_usm_memory(ary_buf) assert buf._pointer == usm_mem._pointer usm_mem = has_usm_memory(a) assert usm_mem is None
def test_get_wait_list(): if has_cpu(): oclSrc = " \ kernel void add_k(global float* a) { \ size_t index = get_global_id(0); \ a[index] = a[index] + 1; \ } \ kernel void sqrt_k(global float* a) { \ size_t index = get_global_id(0); \ a[index] = sqrt(a[index]); \ } \ kernel void sin_k(global float* a) { \ size_t index = get_global_id(0); \ a[index] = sin(a[index]); \ }" q = dpctl.SyclQueue("opencl:cpu") prog = dpctl_prog.create_program_from_source(q, oclSrc) addKernel = prog.get_sycl_kernel("add_k") sqrtKernel = prog.get_sycl_kernel("sqrt_k") sinKernel = prog.get_sycl_kernel("sin_k") bufBytes = 1024 * np.dtype("f").itemsize abuf = dpctl_mem.MemoryUSMShared(bufBytes, queue=q) a = np.ndarray((1024), buffer=abuf, dtype="f") a[:] = np.arange(1024) args = [] args.append(a.base) r = [1024] ev_1 = q.submit(addKernel, args, r) ev_2 = q.submit(sqrtKernel, args, r, dEvents=[ev_1]) ev_3 = q.submit(sinKernel, args, r, dEvents=[ev_2]) try: wait_list = ev_3.get_wait_list() except ValueError: pytest.fail( "Failed to get a list of waiting events from SyclEvent") assert len(wait_list)
def test_context_multi_device(): try: d = dpctl.SyclDevice("cpu") except ValueError: pytest.skip() if d.default_selector_score < 0: pytest.skip() n = d.max_compute_units n1 = n // 2 n2 = n - n1 if n1 == 0 or n2 == 0: pytest.skip() d1, d2 = d.create_sub_devices(partition=(n1, n2)) ctx = dpctl.SyclContext((d1, d2)) assert ctx.device_count == 2 q1 = dpctl.SyclQueue(ctx, d1) q2 = dpctl.SyclQueue(ctx, d2) import dpctl.memory as dpmem shmem_1 = dpmem.MemoryUSMShared(256, queue=q1) shmem_2 = dpmem.MemoryUSMDevice(256, queue=q2) shmem_2.copy_from_device(shmem_1)
# # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ Demonstrates SYCL USM memory usage in Python using dpctl.memory. """ import dpctl.memory as dpmem # allocate USM-shared byte-buffer ms = dpmem.MemoryUSMShared(16) # allocate USM-device byte-buffer md = dpmem.MemoryUSMDevice(16) # allocate USM-host byte-buffer mh = dpmem.MemoryUSMHost(16) # specify alignment mda = dpmem.MemoryUSMDevice(128, alignment=16) # allocate using given queue, # i.e. on the device and bound to the context stored in the queue mdq = dpmem.MemoryUSMDevice(256, queue=mda.sycl_queue) # information about device associate with USM buffer
ph, sl, sh, tl, th, rl, rh, vl, vh, seed, queue=queue) return params # ==== dry run === usm_mem = dpctl_mem.MemoryUSMShared(3 * 5 * np.dtype("d").itemsize) opts = np.ndarray((3, 5), buffer=usm_mem, dtype="d") # copy from Host NumPy to USM buffer opts[:, :] = np.array([ [81.2, 81.8, 29, 0.01, 0.02], [24.24, 22.1, 10, 0.02, 0.08], [100, 100, 30, 0.01, 0.12], ]) # GPU computation Xgpu = bs.black_scholes_price(opts) # compute prices in CPython X_ref = np.array([ref_python_black_scholes(*opt) for opt in opts], dtype="d") print( "Correctness check: allclose(Xgpu, Xref) == ",
# # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """Demonstrates host to device copy functions using dpctl.memory. """ import dpctl import dpctl.memory as dpmem import numpy as np ms = dpmem.MemoryUSMShared(32) md = dpmem.MemoryUSMDevice(32) host_buf = np.random.randint(0, 42, dtype=np.uint8, size=32) # copy host byte-like object to USM-device buffer md.copy_from_host(host_buf) # copy USM-device buffer to USM-shared buffer in parallel (using sycl::queue::memcpy) ms.copy_from_device(md) # build numpy array reusing host-accessible USM-shared memory X = np.ndarray((len(ms), ), buffer=ms, dtype=np.uint8) # Display Python object NumPy ndarray is viewing into print("numpy.ndarray.base: ", X.base)
def test_create_program_from_source(ctype_str, dtype, ctypes_ctor): try: q = dpctl.SyclQueue("opencl", property="enable_profiling") except dpctl.SyclQueueCreationError: pytest.skip("OpenCL queue could not be created") # OpenCL conventions for indexing global_id is opposite to # that of SYCL (and DPCTL) oclSrc = ("kernel void axpy(" " global " + ctype_str + " *a, global " + ctype_str + " *b," " global " + ctype_str + " *c, " + ctype_str + " d) {" " size_t index = get_global_id(0);" " c[index] = d * a[index] + b[index];" "}") prog = dpctl_prog.create_program_from_source(q, oclSrc) axpyKernel = prog.get_sycl_kernel("axpy") n_elems = 1024 * 512 lws = 128 bufBytes = n_elems * dtype.itemsize abuf = dpctl_mem.MemoryUSMShared(bufBytes, queue=q) bbuf = dpctl_mem.MemoryUSMShared(bufBytes, queue=q) cbuf = dpctl_mem.MemoryUSMShared(bufBytes, queue=q) a = np.ndarray((n_elems, ), buffer=abuf, dtype=dtype) b = np.ndarray((n_elems, ), buffer=bbuf, dtype=dtype) c = np.ndarray((n_elems, ), buffer=cbuf, dtype=dtype) a[:] = np.arange(n_elems) b[:] = np.arange(n_elems, 0, -1) c[:] = 0 d = 2 args = [a.base, b.base, c.base, ctypes_ctor(d)] assert n_elems % lws == 0 for r in ( [ n_elems, ], [2, n_elems], [2, 2, n_elems], ): c[:] = 0 timer = dpctl.SyclTimer() with timer(q): q.submit(axpyKernel, args, r).wait() ref_c = a * np.array(d, dtype=dtype) + b host_dt, device_dt = timer.dt assert type(host_dt) is float and type(device_dt) is float assert np.allclose(c, ref_c), "Failed for {}".format(r) for gr, lr in ( ( [ n_elems, ], [lws], ), ([2, n_elems], [2, lws // 2]), ([2, 2, n_elems], [2, 2, lws // 4]), ): c[:] = 0 timer = dpctl.SyclTimer() with timer(q): q.submit(axpyKernel, args, gr, lr, [dpctl.SyclEvent()]).wait() ref_c = a * np.array(d, dtype=dtype) + b host_dt, device_dt = timer.dt assert type(host_dt) is float and type(device_dt) is float assert np.allclose(c, ref_c), "Faled for {}, {}".formatg(r, lr)
def as_usm_obj(obj, queue=None, usm_type="shared", copy=True): """ Determine and return a SYCL device accesible object. We try to determine if the provided object defines a valid __sycl_usm_array_interface__ dictionary. If not, we create a USM memory of `usm_type` and try to copy the data `obj` holds. Only numpy.ndarray is supported currently as `obj` if the object is not already allocated using USM. Args: obj: Object to be tested and data copied from. usm_type: USM type used in case obj is not already allocated using USM. queue (dpctl.SyclQueue): SYCL queue to be used to allocate USM memory in case obj is not already USM allocated. copy (bool): Flag to determine if we copy data from obj. Returns: A Python object allocated using USM memory. Raises: TypeError: 1. If obj is not allocated on USM memory or is not of type numpy.ndarray, TypeError is raised. 2. If queue is not of type dpctl.SyclQueue. ValueError: 1. In case obj is not USM allocated, users need to pass the SYCL queue to be used for creating new memory. ValuieError is raised if queue argument is not provided. 2. If usm_type is not valid. 3. If dtype of the passed ndarray(obj) is not supported. """ usm_mem = has_usm_memory(obj) if queue is None: raise ValueError( "Queue can not be None. Please provide the SYCL queue to be used.") if not isinstance(queue, dpctl.SyclQueue): raise TypeError("queue has to be of dpctl.SyclQueue type. Got %s" % (type(queue))) if usm_mem is None: if not isinstance(obj, np.ndarray): raise TypeError("Obj is not USM allocated and is not of type " "numpy.ndarray. Obj type: %s" % (type(obj))) if obj.dtype not in [np.dtype(typ) for typ in supported_numpy_dtype]: raise ValueError("dtype is not supprted. Supported dtypes " "are: %s" % (supported_numpy_dtype)) size = np.prod(obj.shape) if usm_type == "shared": usm_mem = dpctl_mem.MemoryUSMShared(size * obj.dtype.itemsize, queue=queue) elif usm_type == "device": usm_mem = dpctl_mem.MemoryUSMDevice(size * obj.dtype.itemsize, queue=queue) elif usm_type == "host": usm_mem = dpctl_mem.MemoryUSMHost(size * obj.dtype.itemsize, queue=queue) else: raise ValueError("Supported usm_type are: 'shared', " "'device' and 'host'. Provided: %s" % (usm_type)) if copy: # Copy data from numpy.ndarray copy_from_numpy_to_usm_obj(usm_mem, obj) return usm_mem