Esempio n. 1
0
def sum_reduce(A):
    global_size = len(A)
    work_group_size = 64
    nb_work_groups = global_size // work_group_size
    if (global_size % work_group_size) != 0:
        nb_work_groups += 1

    partial_sums = np.zeros(nb_work_groups).astype(A.dtype)

    # Use the environment variable SYCL_DEVICE_FILTER to change the default device.
    # See https://github.com/intel/llvm/blob/sycl/sycl/doc/EnvironmentVariables.md#sycl_device_filter.
    device = dpctl.select_default_device()
    print("Using device ...")
    device.print_device_info()

    with dpctl.device_context(device):
        inp_buf = dpctl_mem.MemoryUSMShared(A.size * A.dtype.itemsize)
        inp_ndarray = np.ndarray(A.shape, buffer=inp_buf, dtype=A.dtype)
        np.copyto(inp_ndarray, A)

        partial_sums_buf = dpctl_mem.MemoryUSMShared(
            partial_sums.size * partial_sums.dtype.itemsize)
        partial_sums_ndarray = np.ndarray(
            partial_sums.shape,
            buffer=partial_sums_buf,
            dtype=partial_sums.dtype,
        )
        np.copyto(partial_sums_ndarray, partial_sums)

        result = sum_recursive_reduction(global_size, work_group_size,
                                         inp_ndarray, partial_sums_ndarray)

    return result
Esempio n. 2
0
def driver():
    # measure running time
    times = list()

    xbuf = dpctl_mem.MemoryUSMShared(X.size * X.dtype.itemsize)
    x_ndarray = np.ndarray(X.shape, buffer=xbuf, dtype=X.dtype)
    np.copyto(x_ndarray, X)

    dbuf = dpctl_mem.MemoryUSMShared(D.size * D.dtype.itemsize)
    d_ndarray = np.ndarray(D.shape, buffer=dbuf, dtype=D.dtype)
    np.copyto(d_ndarray, D)

    for repeat in range(args.r):
        start = time()
        pairwise_distance[global_size, local_size](x_ndarray, d_ndarray,
                                                   X.shape[0], X.shape[1])
        end = time()

        total_time = end - start
        times.append(total_time)

    np.copyto(X, x_ndarray)
    np.copyto(D, d_ndarray)

    return times
    def test_create_program_from_source(self):
        oclSrc = "                                                             \
        kernel void axpy(global int* a, global int* b, global int* c, int d) { \
            size_t index = get_global_id(0);                                   \
            c[index] = d*a[index] + b[index];                                  \
        }"

        with dpctl.device_context("opencl:gpu:0"):
            q = dpctl.get_current_queue()
            prog = dpctl_prog.create_program_from_source(q, oclSrc)
            axpyKernel = prog.get_sycl_kernel("axpy")

            abuf = dpctl_mem.MemoryUSMShared(1024 * np.dtype("i").itemsize)
            bbuf = dpctl_mem.MemoryUSMShared(1024 * np.dtype("i").itemsize)
            cbuf = dpctl_mem.MemoryUSMShared(1024 * np.dtype("i").itemsize)
            a = np.ndarray((1024), buffer=abuf, dtype="i")
            b = np.ndarray((1024), buffer=bbuf, dtype="i")
            c = np.ndarray((1024), buffer=cbuf, dtype="i")
            a[:] = np.arange(1024)
            b[:] = np.arange(1024, 0, -1)
            c[:] = 0
            d = 2
            args = []

            args.append(a.base)
            args.append(b.base)
            args.append(c.base)
            args.append(ctypes.c_int(d))

            r = [1024]

            q.submit(axpyKernel, args, r)
            self.assertTrue(np.allclose(c, a * d + b))
Esempio n. 4
0
def test_context_multi_device():
    try:
        d = dpctl.SyclDevice("cpu")
    except ValueError:
        pytest.skip()
    if d.default_selector_score < 0:
        pytest.skip()
    n = d.max_compute_units
    n1 = n // 2
    n2 = n - n1
    if n1 == 0 or n2 == 0:
        pytest.skip()
    d1, d2 = d.create_sub_devices(partition=(n1, n2))
    ctx = dpctl.SyclContext((d1, d2))
    assert ctx.device_count == 2
    assert type(repr(ctx)) is str
    q1 = dpctl.SyclQueue(ctx, d1)
    q2 = dpctl.SyclQueue(ctx, d2)
    import dpctl.memory as dpmem

    shmem_1 = dpmem.MemoryUSMShared(256, queue=q1)
    shmem_2 = dpmem.MemoryUSMDevice(256, queue=q2)
    shmem_2.copy_from_device(shmem_1)
    # create context for single sub-device
    ctx1 = dpctl.SyclContext(d1)
    q1 = dpctl.SyclQueue(ctx1, d1)
    shmem_1 = dpmem.MemoryUSMShared(256, queue=q1)
    cap = ctx1._get_capsule()
    cap2 = ctx1._get_capsule()
    del ctx1
    del cap2  # exercise deleter of non-renamed capsule
    ctx2 = dpctl.SyclContext(cap)
    q2 = dpctl.SyclQueue(ctx2, d1)
    shmem_2 = dpmem.MemoryUSMDevice(256, queue=q2)
    shmem_2.copy_from_device(shmem_1)
Esempio n. 5
0
def test_ctor_invalid():
    m = dpm.MemoryUSMShared(12)
    with pytest.raises(ValueError):
        dpt.usm_ndarray((4, ), dtype="i4", buffer=m)
    m = dpm.MemoryUSMShared(64)
    with pytest.raises(ValueError):
        dpt.usm_ndarray((4, ), dtype="u1", buffer=m, strides={"not": "valid"})
Esempio n. 6
0
 def __init__(self, shape, dtype="d", host_buffer=None):
     nelems = np.prod(shape)
     bytes = nelems * np.dtype(dtype).itemsize
     shmem = dpmem.MemoryUSMShared(bytes)
     if isinstance(host_buffer, np.ndarray):
         shmem.copy_from_host(host_buffer.view(dtype="|u1"))
     self.arr = np.ndarray(shape, dtype=dtype, buffer=shmem)
Esempio n. 7
0
def gen_option_params(n_opts,
                      pl,
                      ph,
                      sl,
                      sh,
                      tl,
                      th,
                      rl,
                      rh,
                      vl,
                      vh,
                      dtype,
                      queue=None):
    nbytes = n_opts * 5 * np.dtype(dtype).itemsize
    usm_mem = dpctl_mem.MemoryUSMShared(nbytes, queue=queue)
    params = np.ndarray(shape=(n_opts, 5), buffer=usm_mem, dtype=dtype)
    seed = 1234
    bs.populate_params(params,
                       pl,
                       ph,
                       sl,
                       sh,
                       tl,
                       th,
                       rl,
                       rh,
                       vl,
                       vh,
                       seed,
                       queue=queue)
    return params
Esempio n. 8
0
def gen_option_params(n_opts, pl, ph, sl, sh, tl, th, rl, rh, vl, vh, dtype):
    usm_mem = dpctl_mem.MemoryUSMShared(n_opts * 5 * np.dtype(dtype).itemsize)
    # usm_mem2 = dpctl_mem.MemoryUSMDevice(n_opts * 5 * np.dtype(dtype).itemsize)
    params = np.ndarray(shape=(n_opts, 5), buffer=usm_mem, dtype=dtype)
    seed = 1234
    bs.populate_params(params, pl, ph, sl, sh, tl, th, rl, rh, vl, vh, seed)
    return params
Esempio n. 9
0
def produce_event(profiling=False):
    oclSrc = "                                                                 \
            kernel void add(global int* a) {                                   \
                size_t index = get_global_id(0);                               \
                a[index] = a[index] + 1;                                       \
            }"

    if profiling:
        q = dpctl.SyclQueue("opencl:cpu", property="enable_profiling")
    else:
        q = dpctl.SyclQueue("opencl:cpu")
    prog = dpctl_prog.create_program_from_source(q, oclSrc)
    addKernel = prog.get_sycl_kernel("add")

    bufBytes = 1024 * np.dtype("i").itemsize
    abuf = dpctl_mem.MemoryUSMShared(bufBytes, queue=q)
    a = np.ndarray((1024), buffer=abuf, dtype="i")
    a[:] = np.arange(1024)
    args = []

    args.append(a.base)
    r = [1024]
    ev = q.submit(addKernel, args, r)

    return ev
Esempio n. 10
0
def test_has_usm_memory(filter_str):
    a = np.ones(1023, dtype=np.float32)

    with dpctl.device_context(filter_str):
        # test usm_ndarray
        da = dpt.usm_ndarray(a.shape, dtype=a.dtype, buffer="shared")
        usm_mem = has_usm_memory(da)
        assert da.usm_data._pointer == usm_mem._pointer

        # test usm allocated numpy.ndarray
        buf = dpctl_mem.MemoryUSMShared(a.size * a.dtype.itemsize)
        ary_buf = np.ndarray(a.shape, buffer=buf, dtype=a.dtype)
        usm_mem = has_usm_memory(ary_buf)
        assert buf._pointer == usm_mem._pointer

        usm_mem = has_usm_memory(a)
        assert usm_mem is None
Esempio n. 11
0
def test_get_wait_list():
    if has_cpu():
        oclSrc = "                                                             \
            kernel void add_k(global float* a) {                               \
                size_t index = get_global_id(0);                               \
                a[index] = a[index] + 1;                                       \
            }                                                                  \
            kernel void sqrt_k(global float* a) {                              \
                size_t index = get_global_id(0);                               \
                a[index] = sqrt(a[index]);                                     \
            }                                                                  \
            kernel void sin_k(global float* a) {                               \
                size_t index = get_global_id(0);                               \
                a[index] = sin(a[index]);                                      \
            }"

        q = dpctl.SyclQueue("opencl:cpu")
        prog = dpctl_prog.create_program_from_source(q, oclSrc)
        addKernel = prog.get_sycl_kernel("add_k")
        sqrtKernel = prog.get_sycl_kernel("sqrt_k")
        sinKernel = prog.get_sycl_kernel("sin_k")

        bufBytes = 1024 * np.dtype("f").itemsize
        abuf = dpctl_mem.MemoryUSMShared(bufBytes, queue=q)
        a = np.ndarray((1024), buffer=abuf, dtype="f")
        a[:] = np.arange(1024)
        args = []

        args.append(a.base)
        r = [1024]
        ev_1 = q.submit(addKernel, args, r)
        ev_2 = q.submit(sqrtKernel, args, r, dEvents=[ev_1])
        ev_3 = q.submit(sinKernel, args, r, dEvents=[ev_2])

        try:
            wait_list = ev_3.get_wait_list()
        except ValueError:
            pytest.fail(
                "Failed to get a list of waiting events from SyclEvent")
        assert len(wait_list)
Esempio n. 12
0
def test_context_multi_device():
    try:
        d = dpctl.SyclDevice("cpu")
    except ValueError:
        pytest.skip()
    if d.default_selector_score < 0:
        pytest.skip()
    n = d.max_compute_units
    n1 = n // 2
    n2 = n - n1
    if n1 == 0 or n2 == 0:
        pytest.skip()
    d1, d2 = d.create_sub_devices(partition=(n1, n2))
    ctx = dpctl.SyclContext((d1, d2))
    assert ctx.device_count == 2
    q1 = dpctl.SyclQueue(ctx, d1)
    q2 = dpctl.SyclQueue(ctx, d2)
    import dpctl.memory as dpmem

    shmem_1 = dpmem.MemoryUSMShared(256, queue=q1)
    shmem_2 = dpmem.MemoryUSMDevice(256, queue=q2)
    shmem_2.copy_from_device(shmem_1)
Esempio n. 13
0
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Demonstrates SYCL USM memory usage in Python using dpctl.memory.
"""

import dpctl.memory as dpmem

# allocate USM-shared byte-buffer
ms = dpmem.MemoryUSMShared(16)

# allocate USM-device byte-buffer
md = dpmem.MemoryUSMDevice(16)

# allocate USM-host byte-buffer
mh = dpmem.MemoryUSMHost(16)

# specify alignment
mda = dpmem.MemoryUSMDevice(128, alignment=16)

# allocate using given queue,
# i.e. on the device and bound to the context stored in the queue
mdq = dpmem.MemoryUSMDevice(256, queue=mda.sycl_queue)

# information about device associate with USM buffer
Esempio n. 14
0
                       ph,
                       sl,
                       sh,
                       tl,
                       th,
                       rl,
                       rh,
                       vl,
                       vh,
                       seed,
                       queue=queue)
    return params


# ==== dry run ===
usm_mem = dpctl_mem.MemoryUSMShared(3 * 5 * np.dtype("d").itemsize)
opts = np.ndarray((3, 5), buffer=usm_mem, dtype="d")
# copy from Host NumPy to USM buffer
opts[:, :] = np.array([
    [81.2, 81.8, 29, 0.01, 0.02],
    [24.24, 22.1, 10, 0.02, 0.08],
    [100, 100, 30, 0.01, 0.12],
])
# GPU computation
Xgpu = bs.black_scholes_price(opts)

# compute prices in CPython
X_ref = np.array([ref_python_black_scholes(*opt) for opt in opts], dtype="d")

print(
    "Correctness check: allclose(Xgpu, Xref) == ",
Esempio n. 15
0
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Demonstrates host to device copy functions using dpctl.memory.
"""

import dpctl
import dpctl.memory as dpmem
import numpy as np

ms = dpmem.MemoryUSMShared(32)
md = dpmem.MemoryUSMDevice(32)

host_buf = np.random.randint(0, 42, dtype=np.uint8, size=32)

# copy host byte-like object to USM-device buffer
md.copy_from_host(host_buf)

# copy USM-device buffer to USM-shared buffer in parallel (using sycl::queue::memcpy)
ms.copy_from_device(md)

# build numpy array reusing host-accessible USM-shared memory
X = np.ndarray((len(ms), ), buffer=ms, dtype=np.uint8)

# Display Python object NumPy ndarray is viewing into
print("numpy.ndarray.base: ", X.base)
Esempio n. 16
0
def test_create_program_from_source(ctype_str, dtype, ctypes_ctor):
    try:
        q = dpctl.SyclQueue("opencl", property="enable_profiling")
    except dpctl.SyclQueueCreationError:
        pytest.skip("OpenCL queue could not be created")
    # OpenCL conventions for indexing global_id is opposite to
    # that of SYCL (and DPCTL)
    oclSrc = ("kernel void axpy("
              "   global " + ctype_str + " *a, global " + ctype_str + " *b,"
              "   global " + ctype_str + " *c, " + ctype_str + " d) {"
              "   size_t index = get_global_id(0);"
              "   c[index] = d * a[index] + b[index];"
              "}")
    prog = dpctl_prog.create_program_from_source(q, oclSrc)
    axpyKernel = prog.get_sycl_kernel("axpy")

    n_elems = 1024 * 512
    lws = 128
    bufBytes = n_elems * dtype.itemsize
    abuf = dpctl_mem.MemoryUSMShared(bufBytes, queue=q)
    bbuf = dpctl_mem.MemoryUSMShared(bufBytes, queue=q)
    cbuf = dpctl_mem.MemoryUSMShared(bufBytes, queue=q)
    a = np.ndarray((n_elems, ), buffer=abuf, dtype=dtype)
    b = np.ndarray((n_elems, ), buffer=bbuf, dtype=dtype)
    c = np.ndarray((n_elems, ), buffer=cbuf, dtype=dtype)
    a[:] = np.arange(n_elems)
    b[:] = np.arange(n_elems, 0, -1)
    c[:] = 0
    d = 2
    args = [a.base, b.base, c.base, ctypes_ctor(d)]

    assert n_elems % lws == 0

    for r in (
        [
            n_elems,
        ],
        [2, n_elems],
        [2, 2, n_elems],
    ):
        c[:] = 0
        timer = dpctl.SyclTimer()
        with timer(q):
            q.submit(axpyKernel, args, r).wait()
            ref_c = a * np.array(d, dtype=dtype) + b
        host_dt, device_dt = timer.dt
        assert type(host_dt) is float and type(device_dt) is float
        assert np.allclose(c, ref_c), "Failed for {}".format(r)

    for gr, lr in (
        (
            [
                n_elems,
            ],
            [lws],
        ),
        ([2, n_elems], [2, lws // 2]),
        ([2, 2, n_elems], [2, 2, lws // 4]),
    ):
        c[:] = 0
        timer = dpctl.SyclTimer()
        with timer(q):
            q.submit(axpyKernel, args, gr, lr, [dpctl.SyclEvent()]).wait()
            ref_c = a * np.array(d, dtype=dtype) + b
        host_dt, device_dt = timer.dt
        assert type(host_dt) is float and type(device_dt) is float
        assert np.allclose(c, ref_c), "Faled for {}, {}".formatg(r, lr)
Esempio n. 17
0
def as_usm_obj(obj, queue=None, usm_type="shared", copy=True):
    """
    Determine and return a SYCL device accesible object.

    We try to determine if the provided object defines a valid
    __sycl_usm_array_interface__ dictionary.
    If not, we create a USM memory of `usm_type` and try to copy the data
    `obj` holds. Only numpy.ndarray is supported currently as `obj` if
    the object is not already allocated using USM.

    Args:
        obj: Object to be tested and data copied from.
        usm_type: USM type used in case obj is not already allocated using USM.
        queue (dpctl.SyclQueue): SYCL queue to be used to allocate USM
            memory in case obj is not already USM allocated.
        copy (bool): Flag to determine if we copy data from obj.

    Returns:
        A Python object allocated using USM memory.

    Raises:
        TypeError:
            1. If obj is not allocated on USM memory or is not of type
               numpy.ndarray, TypeError is raised.
            2. If queue is not of type dpctl.SyclQueue.
        ValueError:
            1. In case obj is not USM allocated, users need to pass
               the SYCL queue to be used for creating new memory. ValuieError
               is raised if queue argument is not provided.
            2. If usm_type is not valid.
            3. If dtype of the passed ndarray(obj) is not supported.
    """
    usm_mem = has_usm_memory(obj)

    if queue is None:
        raise ValueError(
            "Queue can not be None. Please provide the SYCL queue to be used.")
    if not isinstance(queue, dpctl.SyclQueue):
        raise TypeError("queue has to be of dpctl.SyclQueue type. Got %s" %
                        (type(queue)))

    if usm_mem is None:
        if not isinstance(obj, np.ndarray):
            raise TypeError("Obj is not USM allocated and is not of type "
                            "numpy.ndarray. Obj type: %s" % (type(obj)))

        if obj.dtype not in [np.dtype(typ) for typ in supported_numpy_dtype]:
            raise ValueError("dtype is not supprted. Supported dtypes "
                             "are: %s" % (supported_numpy_dtype))

        size = np.prod(obj.shape)
        if usm_type == "shared":
            usm_mem = dpctl_mem.MemoryUSMShared(size * obj.dtype.itemsize,
                                                queue=queue)
        elif usm_type == "device":
            usm_mem = dpctl_mem.MemoryUSMDevice(size * obj.dtype.itemsize,
                                                queue=queue)
        elif usm_type == "host":
            usm_mem = dpctl_mem.MemoryUSMHost(size * obj.dtype.itemsize,
                                              queue=queue)
        else:
            raise ValueError("Supported usm_type are: 'shared', "
                             "'device' and 'host'. Provided: %s" % (usm_type))

        if copy:
            # Copy data from numpy.ndarray
            copy_from_numpy_to_usm_obj(usm_mem, obj)

    return usm_mem