def test_single_device(device_idx, full_len, benchmark=False):
    """Run the ``sum`` kernel on one device and print the elapsed time.

    Args:
        device_idx: index into ``api.platforms[0].devices`` selecting the device.
        full_len: number of ``uint64`` elements in the input array; also passed
            as the second argument of the kernel call (presumably the global
            size -- confirm against the kernel API).
        benchmark: when true, the comparison against ``calc_ref`` is skipped.

    Raises:
        AssertionError: if ``benchmark`` is false and the device result differs
            from the reference.
    """
    pwr = 50

    a = numpy.arange(full_len).astype(numpy.uint64)

    context = Context.from_devices([api.platforms[0].devices[device_idx]])
    queue = Queue(context.device)

    program = Program(context.device, src)
    a_dev = Array.from_host(queue, a)

    # Synchronize before starting the clock so that previously queued work
    # (presumably the host-to-device upload) is not included in the timing.
    queue.synchronize()
    # perf_counter() is monotonic and high-resolution; time.time() follows the
    # wall clock and can jump if the system time is adjusted mid-measurement.
    t1 = time.perf_counter()
    program.kernel.sum(queue, full_len, None, a_dev, numpy.int32(pwr))
    queue.synchronize()
    t2 = time.perf_counter()
    print(f"Single device time (device {device_idx}):", t2 - t1)

    # The result is fetched regardless of the ``benchmark`` flag.
    a_res = a_dev.get(queue)

    if not benchmark:
        a_ref = calc_ref(a, pwr)
        assert (a_ref == a_res).all()
def test_queue(mock_or_real_context):
    """Check that a queue reports the device it was created on and can be synchronized."""
    context, _ = mock_or_real_context
    device = context.device

    queue = Queue(device)

    # The queue's device must compare equal to the first device of the context.
    assert queue.device == context.devices[0]
    queue.synchronize()
def test_allocate_and_copy(mock_or_real_context):
    """Exercise buffer allocation, host transfers, sub-regions and device-to-device copies."""
    context, _ = mock_or_real_context

    dtype = numpy.dtype('int32')
    itemsize = dtype.itemsize
    length = 100
    size = length * itemsize

    host_data = numpy.arange(length).astype(dtype)

    buf = Buffer.allocate(context.device, size)
    assert buf.size == size
    assert buf.offset == 0

    # Only checks that the attribute exists;
    # verifying its value would require running a kernel.
    assert buf.kernel_arg is not None

    queue = Queue(context.device)
    buf.set(queue, host_data)

    # Read the whole buffer back
    readback = numpy.empty_like(host_data)
    buf.get(queue, readback)
    queue.synchronize()
    assert (readback == host_data).all()

    # Read a sub-region: 50 elements starting at element 25
    region_start = 25
    region_len = 50
    region_end = region_start + region_len
    sub_buf = buf.get_sub_region(region_start * itemsize, region_len * itemsize)
    expected_region = host_data[region_start:region_end]
    region_readback = numpy.empty_like(expected_region)
    sub_buf.get(queue, region_readback)
    queue.synchronize()
    assert (region_readback == expected_region).all()

    # Write through the sub-region, then verify via the parent buffer
    new_region = (numpy.ones(region_len) * 100).astype(dtype)
    host_data[region_start:region_end] = new_region
    sub_buf.set(queue, new_region)
    buf.get(queue, readback)
    queue.synchronize()
    assert (readback == host_data).all()

    # Sub-region of a sub-region
    if context.api.id == cuda_api_id():
        # In OpenCL that leads to a segfault, but with CUDA we just emulate that with pointers.
        nested_start = 20
        nested_len = 20
        nested_data = (numpy.ones(nested_len) * 200).astype(dtype)
        host_begin = region_start + nested_start
        host_data[host_begin:host_begin + nested_len] = nested_data
        nested_buf = sub_buf.get_sub_region(nested_start * itemsize, nested_len * itemsize)
        nested_buf.set(queue, nested_data)
        buf.get(queue, readback)
        queue.synchronize()
        assert (readback == host_data).all()

    # Device-to-device copy: first with the default arguments, then with no_async=True
    copy_offset = 50
    copy_end = copy_offset + 100
    for set_kwargs in ({}, {'no_async': True}):
        dest = Buffer.allocate(context.device, size * 2)
        dest.set(queue, numpy.ones(length * 2, dtype))
        dest_view = dest.get_sub_region(copy_offset * itemsize, 100 * itemsize)
        dest_view.set(queue, buf, **set_kwargs)
        dest_readback = numpy.empty(length * 2, dtype)
        dest.get(queue, dest_readback)
        queue.synchronize()
        # The copied window matches the source; the surrounding ones are untouched.
        assert (dest_readback[copy_offset:copy_end] == host_data).all()
        assert (dest_readback[:copy_offset] == 1).all()
        assert (dest_readback[copy_end:] == 1).all()