Example #1
def main():
    parser = argparse.ArgumentParser(description="Black-Scholes")
    parser.add_argument("--options",
                        dest="options",
                        type=int,
                        default=10000000)
    args = parser.parse_args()
    options = args.options

    if dpctl.has_gpu_queues():
        print("\nScheduling on OpenCL GPU\n")
        with dpctl.device_context("opencl:gpu") as gpu_queue:
            run(10)
    else:
        print("\nSkip scheduling on OpenCL GPU\n")
    # if dpctl.has_gpu_queues(dpctl.backend_type.level_zero):
    #    print("\nScheduling on Level Zero GPU\n")
    #    with dpctl.device_context("level0:gpu") as gpu_queue:
    #        run(10)
    # else:
    #    print("\nSkip scheduling on Level Zero GPU\n")
    if dpctl.has_cpu_queues():
        print("\nScheduling on OpenCL CPU\n")
        with dpctl.device_context("opencl:cpu") as cpu_queue:
            run(10)
    else:
        print("\nSkip scheduling on OpenCL CPU\n")
Example #2
def test_trigonometric_fn(filter_str, trig_op, input_arrays):
    # FIXME: Why does arccosh fail on Gen12 discrete graphics cards?
    if trig_op == "arccosh" and is_gen12(filter_str):
        pytest.skip()

    a, b = input_arrays
    trig_fn = getattr(np, trig_op)
    actual = np.empty(shape=a.shape, dtype=a.dtype)
    expected = np.empty(shape=a.shape, dtype=a.dtype)

    if trig_op == "arctan2":

        @njit
        def f(a, b):
            return trig_fn(a, b)

        device = dpctl.SyclDevice(filter_str)
        with dpctl.device_context(device), assert_auto_offloading():
            actual = f(a, b)
        expected = trig_fn(a, b)
    else:

        @njit
        def f(a):
            return trig_fn(a)

        device = dpctl.SyclDevice(filter_str)
        with dpctl.device_context(device), assert_auto_offloading():
            actual = f(a)
        expected = trig_fn(a)

    np.testing.assert_allclose(actual, expected, rtol=1e-5, atol=0)
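
The filter_str, trig_op, and input_arrays arguments are pytest fixtures defined elsewhere in the test suite, as are is_gen12() and assert_auto_offloading(). A minimal, hypothetical parametrization that could drive a test like this one (the fixture names match the test signature, but the device filters and values are assumptions):

import numpy as np
import pytest

list_of_filter_strs = ["opencl:gpu:0", "opencl:cpu:0", "level_zero:gpu:0"]
list_of_trig_ops = ["sin", "cos", "tan", "arctan2", "arccosh"]


@pytest.fixture(params=list_of_filter_strs)
def filter_str(request):
    return request.param


@pytest.fixture(params=list_of_trig_ops)
def trig_op(request):
    return request.param


@pytest.fixture
def input_arrays():
    # Values >= 1.0 keep functions such as arccosh well defined.
    a = np.linspace(1.0, 10.0, 1024, dtype=np.float64)
    b = np.linspace(1.0, 10.0, 1024, dtype=np.float64)
    return a, b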
Example #3
def test_get_current_device_type_inside_nested_device_ctxt():
    assert dpctl.get_current_device_type() is not None

    with dpctl.device_context("opencl:cpu:0"):
        assert dpctl.get_current_device_type() == dpctl.device_type.cpu

        with dpctl.device_context("opencl:gpu:0"):
            assert dpctl.get_current_device_type() == dpctl.device_type.gpu
        assert dpctl.get_current_device_type() == dpctl.device_type.cpu

    assert dpctl.get_current_device_type() is not None
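
Filter strings such as "opencl:cpu:0" are only valid if the corresponding device exists on the machine. A quick way to list the devices dpctl can see before running device-specific tests like the one above:

import dpctl

# Print backend, device type, and name for every device dpctl can enumerate.
for d in dpctl.get_devices():
    print(d.backend, d.device_type, d.name)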
Example #4
    def test_get_current_device_type_inside_nested_device_ctxt(self):
        self.assertNotEqual(dpctl.get_current_device_type(), None)

        with dpctl.device_context("opencl:cpu:0"):
            self.assertEqual(dpctl.get_current_device_type(), dpctl.device_type.cpu)

            with dpctl.device_context("opencl:gpu:0"):
                self.assertEqual(dpctl.get_current_device_type(), dpctl.device_type.gpu)
            self.assertEqual(dpctl.get_current_device_type(), dpctl.device_type.cpu)

        self.assertNotEqual(dpctl.get_current_device_type(), None)
Example #5
def test_caching_kernel_using_same_queue(filter_str):
    """Test kernel caching when the same queue is used to submit a kernel
    multiple times.

    Args:
        filter_str: SYCL filter selector string
    """
    global_size = 10
    N = global_size

    def data_parallel_sum(a, b, c):
        i = dppy.get_global_id(0)
        c[i] = a[i] + b[i]

    a = np.array(np.random.random(N), dtype=np.float32)
    b = np.array(np.random.random(N), dtype=np.float32)
    c = np.ones_like(a)

    with dpctl.device_context(filter_str) as gpu_queue:
        func = dppy.kernel(data_parallel_sum)
        cached_kernel = func[global_size, dppy.DEFAULT_LOCAL_SIZE].specialize(
            func._get_argtypes(a, b, c), gpu_queue)

        for i in range(10):
            _kernel = func[global_size, dppy.DEFAULT_LOCAL_SIZE].specialize(
                func._get_argtypes(a, b, c), gpu_queue)
            assert _kernel == cached_kernel
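
The specialize() call and _get_argtypes() used above are internal APIs exercised on purpose by this caching test. For comparison, the usual public way to launch the same kernel is a plain decorated call (a sketch using the same names):

with dpctl.device_context(filter_str):
    dppy.kernel(data_parallel_sum)[global_size, dppy.DEFAULT_LOCAL_SIZE](a, b, c)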
Example #6
def test_all(dtype, shape, filter_str):
    size = 1
    for i in range(len(shape)):
        size *= shape[i]

    # Enumerate all 2**size patterns of zero / non-zero entries via the bits
    # of i, so np.all() is exercised on every truth combination.
    for i in range(2**size):
        t = i

        a = np.empty(size, dtype=dtype)

        for j in range(size):
            a[j] = 0 if t % 2 == 0 else j + 1
            t = t >> 1

        a = a.reshape(shape)

        def fn(a):
            return np.all(a)

        f = njit(fn)
        with dpctl.device_context(filter_str), dpnp_debug():
            actual = f(a)

        expected = fn(a)
        np.testing.assert_allclose(actual, expected, rtol=1e-3, atol=0)
Example #7
    def test_dppy_fallback_false(self):
        @numba.jit
        def fill_value(i):
            return i

        def inner_call_fallback():
            x = 10
            a = np.empty(shape=x, dtype=np.float32)

            for i in numba.prange(x):
                a[i] = fill_value(i)

            return a

        try:
            config.DEBUG = 1
            config.FALLBACK_ON_CPU = 0
            with warnings.catch_warnings(record=True) as w:
                device = dpctl.SyclDevice("opencl:gpu")
                with dpctl.device_context(device):
                    dppy = numba.njit(parallel=True)(inner_call_fallback)
                    dppy_fallback_false = dppy()

        finally:
            ref_result = inner_call_fallback()
            config.FALLBACK_ON_CPU = 1
            config.DEBUG = 0

            # With CPU fallback disabled, the device result should differ from
            # the host reference.
            assert not np.array_equal(dppy_fallback_false, ref_result)
            assert "Failed to offload parfor" not in str(w[-1].message)
Example #8
def run_kmeans(
    arrayP,
    arrayPclusters,
    arrayC,
    arrayCsum,
    arrayCnumpoint,
    NUMBER_OF_POINTS,
    NUMBER_OF_CENTROIDS,
):

    with dpctl.device_context(base_kmeans_gpu_graph.get_device_selector()):
        for i in range(REPEAT):
            # for i1 in range(NUMBER_OF_CENTROIDS):
            #     arrayC[i1, 0] = arrayP[i1, 0]
            #     arrayC[i1, 1] = arrayP[i1, 1]

            arrayC, arrayCsum, arrayCnumpoint = kmeans(
                arrayP,
                arrayPclusters,
                arrayC,
                arrayCsum,
                arrayCnumpoint,
                NUMBER_OF_POINTS,
                NUMBER_OF_CENTROIDS,
            )
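
REPEAT, kmeans(), and base_kmeans_gpu_graph.get_device_selector() come from the surrounding benchmark harness and are not shown; in the excerpt the selector is accessed through the base_kmeans_gpu_graph module. A hypothetical stand-in for the missing pieces, assuming the device filter is taken from the SYCL_DEVICE_FILTER environment variable:

import os

REPEAT = 1  # assumed repetition count for the benchmark loop


def get_device_selector():
    # Hypothetical: read the target device from the environment, falling back
    # to a generic GPU filter string.
    return os.environ.get("SYCL_DEVICE_FILTER", "gpu")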
Example #9
    def test_multiple_prange(self):
        @njit
        def f(a, b):
            # dimensions must be provided as scalar
            m, n = a.shape
            for i in prange(m):
                val = 10
                for j in prange(n):
                    b[i, j] = a[i, j] * val

            for i in prange(m):
                for j in prange(n):
                    a[i, j] = a[i, j] * 10

        m = 8
        n = 8
        a = np.ones((m, n))
        b = np.ones((m, n))

        device = dpctl.SyclDevice("opencl:gpu")
        with assert_auto_offloading(
                parfor_offloaded=2), dpctl.device_context(device):
            f(a, b)

        assert np.all(b == 10)
        assert np.all(a == 10)
Example #10
def main():
    # Use the environment variable SYCL_DEVICE_FILTER to change the default device.
    # See https://github.com/intel/llvm/blob/sycl/sycl/doc/EnvironmentVariables.md#sycl_device_filter.
    device = dpctl.select_default_device()
    print("Using device ...")
    device.print_device_info()

    with dpctl.device_context(device):
        c = f1(a, b)

    print("c:", c, hex(c.ctypes.data))
    for i in range(N):
        for j in range(N):
            for k in range(N):
                for l in range(N):  # noqa
                    for m in range(N):
                        if c[i, j, k, l, m] != 2.0:
                            print(
                                "First index not equal to 2.0 was",
                                i,
                                j,
                                k,
                                l,
                                m,
                            )
                            break

    print("Done...")
Example #11
def local_memory():
    """
    This example demonstrates the usage of numba-dppy's `local.array`
    intrinsic function. The function is used to create a static array
    allocated on the devices local address space.
    """
    blocksize = 10

    @dppy.kernel
    def reverse_array(A):
        lm = dppy.local.array(shape=10, dtype=float32)
        i = dppy.get_global_id(0)

        # preload
        lm[i] = A[i]
        # A local or a global barrier both work here since there is only one work-group
        dppy.barrier(dppy.CLK_LOCAL_MEM_FENCE)  # local mem fence
        # write
        A[i] += lm[blocksize - 1 - i]

    arr = np.arange(blocksize).astype(np.float32)
    print(arr)

    # Use the environment variable SYCL_DEVICE_FILTER to change the default device.
    # See https://github.com/intel/llvm/blob/sycl/sycl/doc/EnvironmentVariables.md#sycl_device_filter.
    device = dpctl.select_default_device()
    print("Using device ...")
    device.print_device_info()

    with dpctl.device_context(device):
        reverse_array[blocksize, dppy.DEFAULT_LOCAL_SIZE](arr)

    # the output should be `orig[::-1] + orig`, i.e. [9, 9, 9, ...]
    print(arr)
Example #12
def no_arg_barrier_support():
    """
    This example demonstrates the usage of numba_dppy's ``barrier``
    intrinsic function. The ``barrier`` function is usable only inside
    a ``kernel`` and is equivalent to OpenCL's ``barrier`` function.
    """
    @dppy.kernel
    def twice(A):
        i = dppy.get_global_id(0)
        d = A[i]
        # no argument defaults to global mem fence
        dppy.barrier()
        A[i] = d * 2

    N = 10
    arr = np.arange(N).astype(np.float32)
    print(arr)

    # Use the environment variable SYCL_DEVICE_FILTER to change the default device.
    # See https://github.com/intel/llvm/blob/sycl/sycl/doc/EnvironmentVariables.md#sycl_device_filter.
    device = dpctl.select_default_device()
    print("Using device ...")
    device.print_device_info()

    with dpctl.device_context(device):
        twice[N, dppy.DEFAULT_LOCAL_SIZE](arr)

    # the output should be `arr * 2`, i.e. [0, 2, 4, 6, ...]
    print(arr)
Example #13
def main():
    size = 9
    scale = 3.0

    # Use the environment variable SYCL_DEVICE_FILTER to change the default device.
    # See https://github.com/intel/llvm/blob/sycl/sycl/doc/EnvironmentVariables.md#sycl_device_filter.
    device = dpctl.select_default_device()
    print("Using device ...")
    device.print_device_info()

    with dpctl.device_context(device):
        result = rand()
        # Random values in a given shape (3, 2)
        print(result)

        result = random_sample(size)
        # Array of shape (9,) with random floats in the
        # half-open interval [0.0, 1.0)
        print(result)

        result = random_exponential(scale, size)
        # Array of shape (9,) with samples from an exponential distribution
        print(result)

        result = random_normal(0.0, 0.1, size)
        # Array of shape (9,) with samples from a normal distribution
        print(result)

    print("Done...")
Example #14
def gen_data_usm(nopt, dims):
    X, Y, D = gen_data(nopt, dims)

    with dpctl.device_context(get_device_selector()) as gpu_queue:
        X_usm = dpt.usm_ndarray(
            X.shape,
            dtype=X.dtype,
            buffer="device",
            buffer_ctor_kwargs={"queue": gpu_queue},
        )
        Y_usm = dpt.usm_ndarray(
            Y.shape,
            dtype=Y.dtype,
            buffer="device",
            buffer_ctor_kwargs={"queue": gpu_queue},
        )
        D_usm = dpt.usm_ndarray(
            D.shape,
            dtype=D.dtype,
            buffer="device",
            buffer_ctor_kwargs={"queue": gpu_queue},
        )

    X_usm.usm_data.copy_from_host(X.reshape((-1)).view("u1"))
    Y_usm.usm_data.copy_from_host(Y.reshape((-1)).view("u1"))

    return (X_usm, Y_usm, D_usm)
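
gen_data() and get_device_selector() belong to the benchmark harness and are not shown. A hypothetical gen_data() consistent with the shapes used above; the output-buffer shape in particular is a guess:

import numpy as np


def gen_data(nopt, dims):
    X = np.random.random((nopt, dims))
    Y = np.random.random((nopt, dims))
    D = np.zeros((nopt, nopt))  # assumed shape for the result buffer
    return X, Y, D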
Example #15
def test_caching_kernel_using_same_context(filter_str):
    """Test kernel caching for the scenario where different SYCL queues that
    share a SYCL context are used to submit a kernel.

    Args:
        filter_str: SYCL filter selector string
    """
    global_size = 10
    N = global_size

    def data_parallel_sum(a, b, c):
        i = dppy.get_global_id(0)
        c[i] = a[i] + b[i]

    a = np.array(np.random.random(N), dtype=np.float32)
    b = np.array(np.random.random(N), dtype=np.float32)
    c = np.ones_like(a)

    # Set the global queue to the default device so that the cached_kernel gets
    # created for that device
    dpctl.set_global_queue(filter_str)
    func = dppy.kernel(data_parallel_sum)
    default_queue = dpctl.get_current_queue()
    cached_kernel = func[global_size, dppy.DEFAULT_LOCAL_SIZE].specialize(
        func._get_argtypes(a, b, c), default_queue)
    for i in range(0, 10):
        # Each iteration create a fresh queue that will share the same context
        with dpctl.device_context(filter_str) as gpu_queue:
            _kernel = func[global_size, dppy.DEFAULT_LOCAL_SIZE].specialize(
                func._get_argtypes(a, b, c), gpu_queue)
            assert _kernel == cached_kernel
Example #16
    def test_device_array_args_gpu(self):
        c = np.ones_like(a)

        with dpctl.device_context("opencl:gpu"):
            data_parallel_sum[global_size, dppy.DEFAULT_LOCAL_SIZE](a, b, c)

        assert np.all(c == d)
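
This unittest method relies on module-level globals (a, b, d, global_size) and a data_parallel_sum kernel that are not shown. A hypothetical module-level setup it could run against (the import alias follows the other examples; the package is numba_dppy, renamed numba_dpex in later releases):

import numpy as np
import numba_dppy as dppy

global_size = 64
a = np.random.random(global_size).astype(np.float32)
b = np.random.random(global_size).astype(np.float32)
d = a + b  # host reference the device result is compared against


@dppy.kernel
def data_parallel_sum(a, b, c):
    i = dppy.get_global_id(0)
    c[i] = a[i] + b[i]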
Example #17
def main():
    blockdim = 512, 1
    griddim = int(math.ceil(float(OPT_N) / blockdim[0])), 1

    # Use the environment variable SYCL_DEVICE_FILTER to change the default device.
    # See https://github.com/intel/llvm/blob/sycl/sycl/doc/EnvironmentVariables.md#sycl_device_filter.
    device = dpctl.select_default_device()
    print("Using device ...")
    device.print_device_info()

    with dpctl.device_context(device):
        for i in range(iterations):
            black_scholes_dppy[blockdim, griddim](
                callResult,
                putResult,
                stockPrice,
                optionStrike,
                optionYears,
                RISKFREE,
                VOLATILITY,
            )

    print("callResult : \n", callResult)
    print("putResult : \n", putResult)

    print("Done...")
Example #18
def test_dpctl_api(filter_str):
    device = dpctl.SyclDevice(filter_str)
    with dpctl.device_context(device):
        dpctl.lsplatform()
        dpctl.get_current_queue()
        dpctl.get_num_activated_queues()
        dpctl.is_in_device_context()
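
The same calls can be used interactively to inspect the runtime state; a short sketch (the filter string is only an example and must match a device present on the machine):

import dpctl

with dpctl.device_context("opencl:cpu"):
    dpctl.lsplatform()                        # print the available platforms
    q = dpctl.get_current_queue()             # queue activated by the context
    print(dpctl.get_num_activated_queues())   # nesting depth of device contexts
    print(dpctl.is_in_device_context())       # True inside the with-block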
Example #19
    def test_create_program_from_source(self):
        oclSrc = "                                                             \
        kernel void add(global int* a, global int* b, global int* c) {         \
            size_t index = get_global_id(0);                                   \
            c[index] = a[index] + b[index];                                    \
        }                                                                      \
        kernel void axpy(global int* a, global int* b, global int* c, int d) { \
            size_t index = get_global_id(0);                                   \
            c[index] = a[index] + d*b[index];                                  \
        }"
        with dpctl.device_context("opencl:gpu:0"):
            q = dpctl.get_current_queue()
            prog = dpctl_prog.create_program_from_source(q, oclSrc)
            self.assertIsNotNone(prog)

            self.assertTrue(prog.has_sycl_kernel("add"))
            self.assertTrue(prog.has_sycl_kernel("axpy"))

            addKernel = prog.get_sycl_kernel("add")
            axpyKernel = prog.get_sycl_kernel("axpy")

            self.assertEqual(addKernel.get_function_name(), "add")
            self.assertEqual(axpyKernel.get_function_name(), "axpy")
            self.assertEqual(addKernel.get_num_args(), 3)
            self.assertEqual(axpyKernel.get_num_args(), 4)
Example #20
def test_proper_lowering(filter_str):
    # We perform eager compilation at the site of
    # @dppy.kernel. This takes the default dpctl
    # queue which is level_zero backed. Level_zero
    # is not yet supported on Windows platform and
    # hence we skip these tests if the platform is
    # Windows regardless of which backend filter_str
    # specifies.
    if skip_if_win():
        pytest.skip()

    # This will trigger eager compilation
    @dppy.kernel("void(float32[::1])")
    def twice(A):
        i = dppy.get_global_id(0)
        d = A[i]
        dppy.barrier(dppy.CLK_LOCAL_MEM_FENCE)  # local mem fence
        A[i] = d * 2

    N = 256
    arr = np.random.random(N).astype(np.float32)
    orig = arr.copy()

    with dpctl.device_context(filter_str):
        twice[N, N // 2](arr)

    # Verify the computation is correct.
    np.testing.assert_allclose(orig * 2, arr)
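
skip_if_win() is a helper from the test suite; a minimal, hypothetical version consistent with how it is used above:

import platform


def skip_if_win():
    # Returns True on Windows so the caller can pytest.skip().
    return platform.system() == "Windows"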
Example #21
    def test_dppy_func_ndarray(self):
        @dppy.func
        def g(a):
            return a + 1

        @dppy.kernel
        def f(a, b):
            i = dppy.get_global_id(0)
            b[i] = g(a[i])

        @dppy.kernel
        def h(a, b):
            i = dppy.get_global_id(0)
            b[i] = g(a[i]) + 1

        a = np.ones(self.N)
        b = np.ones(self.N)

        device = dpctl.SyclDevice("opencl:gpu")
        with dpctl.device_context(device):
            f[self.N, dppy.DEFAULT_LOCAL_SIZE](a, b)

            assert np.all(b == 2)

            h[self.N, dppy.DEFAULT_LOCAL_SIZE](a, b)

            assert np.all(b == 3)
Example #22
def test_local_memory(filter_str):
    blocksize = 10

    if skip_if_win():
        pytest.skip()

    @dppy.kernel("void(float32[::1])")
    def reverse_array(A):
        lm = dppy.local.array(shape=10, dtype=np.float32)
        i = dppy.get_global_id(0)

        # preload
        lm[i] = A[i]
        # A local or a global barrier both work here since there is only one work-group
        dppy.barrier(dppy.CLK_LOCAL_MEM_FENCE)  # local mem fence
        # write
        A[i] += lm[blocksize - 1 - i]

    arr = np.arange(blocksize).astype(np.float32)
    orig = arr.copy()

    with dpctl.device_context(filter_str):
        reverse_array[blocksize, blocksize](arr)

    expected = orig[::-1] + orig
    np.testing.assert_allclose(expected, arr)
Example #23
def test_unary_ops(filter_str, unary_op, input_arrays, get_shape, capfd):
    a = input_arrays[0]
    op, name = unary_op
    if name != "argsort" and name != "copy":
        a = np.reshape(a, get_shape)
    if name == "cumprod" and (filter_str == "opencl:cpu:0"
                              or a.dtype == np.int32 or is_gen12(filter_str)):
        pytest.skip()
    if name == "cumsum" and (filter_str == "opencl:cpu:0"
                             or a.dtype == np.int32 or is_gen12(filter_str)):
        pytest.skip()
    if name == "mean" and is_gen12(filter_str):
        pytest.skip()
    if name == "argmax" and is_gen12(filter_str):
        pytest.skip()

    actual = np.empty(shape=a.shape, dtype=a.dtype)
    expected = np.empty(shape=a.shape, dtype=a.dtype)

    f = njit(op)
    device = dpctl.SyclDevice(filter_str)
    with dpctl.device_context(device), dpnp_debug():
        actual = f(a)
        captured = capfd.readouterr()
        assert "dpnp implementation" in captured.out

    expected = op(a)
    np.testing.assert_allclose(actual, expected, rtol=1e-3, atol=0)
Example #24
def sum_reduce(A):
    global_size = len(A)
    work_group_size = 64
    nb_work_groups = global_size // work_group_size
    if (global_size % work_group_size) != 0:
        nb_work_groups += 1

    partial_sums = np.zeros(nb_work_groups).astype(A.dtype)

    # Use the environment variable SYCL_DEVICE_FILTER to change the default device.
    # See https://github.com/intel/llvm/blob/sycl/sycl/doc/EnvironmentVariables.md#sycl_device_filter.
    device = dpctl.select_default_device()
    print("Using device ...")
    device.print_device_info()

    with dpctl.device_context(device):
        inp_buf = dpctl_mem.MemoryUSMShared(A.size * A.dtype.itemsize)
        inp_ndarray = np.ndarray(A.shape, buffer=inp_buf, dtype=A.dtype)
        np.copyto(inp_ndarray, A)

        partial_sums_buf = dpctl_mem.MemoryUSMShared(
            partial_sums.size * partial_sums.dtype.itemsize)
        partial_sums_ndarray = np.ndarray(
            partial_sums.shape,
            buffer=partial_sums_buf,
            dtype=partial_sums.dtype,
        )
        np.copyto(partial_sums_ndarray, partial_sums)

        result = sum_recursive_reduction(global_size, work_group_size,
                                         inp_ndarray, partial_sums_ndarray)

    return result
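
sum_recursive_reduction() and its kernel are defined elsewhere in the example. A simplified, hypothetical stand-in is sketched below: one tree reduction per work-group on the device, with the small array of partial sums finished on the host. It assumes global_size is a multiple of work_group_size and a work-group size of 64.

import numpy as np
import numba_dppy as dppy  # renamed numba_dpex in later releases


@dppy.kernel
def group_sum_kernel(inp, partial_sums):
    # Each work-group reduces its slice of inp into one entry of partial_sums.
    lid = dppy.get_local_id(0)
    gid = dppy.get_group_id(0)
    gsz = dppy.get_local_size(0)

    lsums = dppy.local.array(64, dtype=np.float32)  # assumes work-group size 64
    lsums[lid] = inp[dppy.get_global_id(0)]

    stride = gsz // 2
    while stride > 0:
        dppy.barrier(dppy.CLK_LOCAL_MEM_FENCE)
        if lid < stride:
            lsums[lid] += lsums[lid + stride]
        stride //= 2

    if lid == 0:
        partial_sums[gid] = lsums[0]


def sum_recursive_reduction(global_size, work_group_size, inp, partial_sums):
    # Single device pass, then finish on the host (no recursion in this sketch).
    group_sum_kernel[global_size, work_group_size](inp, partial_sums)
    return float(np.sum(partial_sums))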
Example #25
def test_kernel_arg_accessor(filter_str, input_arrays, kernel):
    a, b, actual = input_arrays
    expected = a + b
    device = dpctl.SyclDevice(filter_str)
    with dpctl.device_context(device):
        call_kernel(global_size, local_size, a, b, actual, kernel)
    np.testing.assert_allclose(actual, expected, rtol=1e-5, atol=0)
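
global_size, local_size, and call_kernel() come from the surrounding test module. A hypothetical launcher matching the call above:

global_size = 1024  # assumed launch parameters
local_size = 64


def call_kernel(global_size, local_size, a, b, c, kernel):
    # Configure the ND-range and submit the already-decorated @dppy.kernel.
    kernel[global_size, local_size](a, b, c)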
Example #26
    def test_sum_reduction(self):
        # This test will only work for even case
        N = 1024
        assert N % 2 == 0

        A = np.array(np.random.random(N), dtype=np.float32)
        A_copy = A.copy()
        # at max we will require half the size of A to store sum
        R = np.array(np.random.random(math.ceil(N / 2)), dtype=np.float32)

        device = dpctl.SyclDevice("opencl:gpu")
        with dpctl.device_context(device):
            total = N

            while total > 1:
                # call kernel
                global_size = total // 2
                reduction_kernel[global_size, dppy.DEFAULT_LOCAL_SIZE](
                    A, R, global_size
                )
                total = total // 2

            result = A_copy.sum()
            max_abs_err = abs(result - R[0])
            assert max_abs_err < 1e-4
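
reduction_kernel is defined elsewhere in the test module. A kernel consistent with the halving loop above would look roughly like this (sketch):

import numba_dppy as dppy  # renamed numba_dpex in later releases


@dppy.kernel
def reduction_kernel(A, R, stride):
    # Pairwise step: fold the upper half of the active region onto the lower
    # half, keeping the running sums in A for the next iteration.
    i = dppy.get_global_id(0)
    R[i] = A[i] + A[i + stride]
    A[i] = R[i]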
Example #27
def main():
    global_size = 10
    N = global_size
    print("N", N)

    a = np.array(np.random.random(N), dtype=np.float32)
    b = np.array(np.random.random(N), dtype=np.float32)
    c = np.ones_like(a)

    # Use the environment variable SYCL_DEVICE_FILTER to change the default device.
    # See https://github.com/intel/llvm/blob/sycl/sycl/doc/EnvironmentVariables.md#sycl_device_filter.
    device = dpctl.select_default_device()
    print("Using device ...")
    device.print_device_info()

    with dpctl.device_context(device):
        da = dpt.usm_ndarray(a.shape, dtype=a.dtype, buffer="shared")
        da.usm_data.copy_from_host(a.reshape((-1)).view("|u1"))

        db = dpt.usm_ndarray(b.shape, dtype=b.dtype, buffer="shared")
        db.usm_data.copy_from_host(b.reshape((-1)).view("|u1"))

        dc = dpt.usm_ndarray(c.shape, dtype=c.dtype, buffer="shared")

        driver(da, db, dc, global_size)

    print("Done...")
Example #28
def main():
    global_size = 64
    local_size = 32
    N = global_size * local_size
    print("N", N)

    a = np.ones(N, dtype=np.float32)
    b = np.ones(N, dtype=np.float32)

    print("a:", a, hex(a.ctypes.data))
    print("b:", b, hex(b.ctypes.data))

    # Use the environment variable SYCL_DEVICE_FILTER to change
    # the default device. See
    # https://github.com/intel/llvm/blob/sycl/sycl/doc/EnvironmentVariables.md#sycl_device_filter.
    device = dpctl.select_default_device()
    print("Using device ...")
    device.print_device_info()

    with dpctl.device_context(device):
        c = f1(a, b)

    print("RESULT c:", c, hex(c.ctypes.data))
    for i in range(N):
        if c[i] != 2.0:
            print("First index not equal to 2.0 was", i)
            break

    print("Done...")
Example #29
    def test_create_program_from_source(self):
        oclSrc = "                                                             \
        kernel void axpy(global int* a, global int* b, global int* c, int d) { \
            size_t index = get_global_id(0);                                   \
            c[index] = d*a[index] + b[index];                                  \
        }"

        with dpctl.device_context("opencl:gpu:0"):
            q = dpctl.get_current_queue()
            prog = dpctl_prog.create_program_from_source(q, oclSrc)
            axpyKernel = prog.get_sycl_kernel("axpy")

            abuf = dpctl_mem.MemoryUSMShared(1024 * np.dtype("i").itemsize)
            bbuf = dpctl_mem.MemoryUSMShared(1024 * np.dtype("i").itemsize)
            cbuf = dpctl_mem.MemoryUSMShared(1024 * np.dtype("i").itemsize)
            a = np.ndarray((1024), buffer=abuf, dtype="i")
            b = np.ndarray((1024), buffer=bbuf, dtype="i")
            c = np.ndarray((1024), buffer=cbuf, dtype="i")
            a[:] = np.arange(1024)
            b[:] = np.arange(1024, 0, -1)
            c[:] = 0
            d = 2
            args = []

            args.append(a.base)
            args.append(b.base)
            args.append(c.base)
            args.append(ctypes.c_int(d))

            r = [1024]

            q.submit(axpyKernel, args, r)
            self.assertTrue(np.allclose(c, a * d + b))
Example #30
def test_kernel_atomic_simple(filter_str, input_arrays, kernel_result_pair):
    a, dtype = input_arrays
    kernel, expected = kernel_result_pair
    device = dpctl.SyclDevice(filter_str)
    with dpctl.device_context(device):
        kernel[global_size, dppy.DEFAULT_LOCAL_SIZE](a)
    assert a[0] == expected