def test_async_copy_to_device_and_back(self):
    """Round-trip data host -> device -> host through a single stream.

    The test expects every stream-bound copy to add one entry to
    ``stream._signals``, expects ``stream.synchronize()`` to leave that
    list empty, and expects the data read back to equal the data sent.
    """
    arr = np.arange(1024)

    hostarr = roc.coarsegrain_array(shape=arr.shape, dtype=arr.dtype)
    # Seed the source buffer with known values.  Without this the final
    # comparison only pits two never-written buffers against each other
    # and could pass even if neither copy moved any data.
    hostarr[:] = arr
    gotarr = roc.coarsegrain_array(shape=arr.shape, dtype=arr.dtype)

    stream = roc.stream()
    ct = len(stream._signals)

    # Host -> device on the stream: one new pending signal expected.
    devarr = roc.to_device(hostarr, stream=stream)
    self.assertEqual(ct + 1, len(stream._signals))

    # Device -> host on the same stream: a second pending signal expected.
    devarr.copy_to_host(gotarr, stream=stream)
    self.assertEqual(ct + 2, len(stream._signals))

    # After an explicit synchronize no signal should remain.
    stream.synchronize()
    self.assertEqual(0, len(stream._signals))

    np.testing.assert_equal(hostarr, gotarr)
    # Also compare against the reference values so a clobbered source
    # buffer cannot mask a bad round trip.
    np.testing.assert_equal(arr, gotarr)
def test_ctx_managed_stream(self):
    """Chain many add-one kernel launches under ``stream.auto_synchronize()``.

    Each iteration launches the kernel on the stream and then copies the
    result buffer back into the input buffer, so after ``repeat``
    iterations every element is expected to have grown by ``repeat``.
    The comparison runs after the ``with`` block has exited.
    """
    logger.info("context info: %s", roc.get_context().agent)

    @roc.jit("int32[:], int32[:]")
    def add1_kernel(dst, src):
        gid = roc.get_global_id(0)
        if gid < dst.size:
            dst[gid] = src[gid] + 1

    block_size = 256
    grid_size = 10**5
    total_items = block_size * grid_size
    repeat = 500

    reference = np.arange(total_items, dtype=np.int32)

    logger.info("make coarse_arr")
    host_input = roc.coarsegrain_array(
        shape=reference.shape, dtype=reference.dtype
    )
    host_input[:] = reference

    logger.info("make coarse_res_arr")
    host_output = roc.coarsegrain_array(
        shape=reference.shape, dtype=reference.dtype
    )
    host_output[:] = 0

    logger.info("make stream")
    stream = roc.stream()

    with stream.auto_synchronize():
        logger.info("make gpu_res_arr")
        dev_output = roc.device_array_like(host_input)

        logger.info("make gpu_arr")
        dev_input = roc.to_device(host_input, stream=stream)

        for step in range(repeat):
            logger.info("launch kernel: %d", step)
            add1_kernel[grid_size, block_size, stream](dev_output, dev_input)
            # Feed this round's output back in as the next round's input.
            dev_input.copy_to_device(dev_output, stream=stream)

        logger.info("get kernel result")
        dev_output.copy_to_host(host_output, stream=stream)
        logger.info("synchronize on ctx __exit__")

    logger.info("compare result")
    expected = host_input + repeat
    np.testing.assert_equal(host_output, expected)
def test_ctx_managed_stream(self):
    """Exercise a stream used through its ``auto_synchronize()`` context.

    An add-one kernel is launched ``n_rounds`` times on the stream; after
    each launch the result buffer is copied over the source buffer on the
    same stream.  Once the context has exited, the host result is
    compared with the initial data plus ``n_rounds``.
    """
    logger.info("context info: %s", roc.get_context().agent)

    @roc.jit("int32[:], int32[:]")
    def add1_kernel(dst, src):
        idx = roc.get_global_id(0)
        if idx < dst.size:
            dst[idx] = src[idx] + 1

    threads = 256
    groups = 10**5
    n_elems = threads * groups
    n_rounds = 500

    seed = np.arange(n_elems, dtype=np.int32)
    buf_shape = seed.shape
    buf_dtype = seed.dtype

    logger.info("make coarse_arr")
    src_host = roc.coarsegrain_array(shape=buf_shape, dtype=buf_dtype)
    src_host[:] = seed

    logger.info("make coarse_res_arr")
    res_host = roc.coarsegrain_array(shape=buf_shape, dtype=buf_dtype)
    res_host[:] = 0

    logger.info("make stream")
    stream = roc.stream()

    with stream.auto_synchronize():
        logger.info("make gpu_res_arr")
        res_dev = roc.device_array_like(src_host)

        logger.info("make gpu_arr")
        src_dev = roc.to_device(src_host, stream=stream)

        for round_no in range(n_rounds):
            logger.info("launch kernel: %d", round_no)
            add1_kernel[groups, threads, stream](res_dev, src_dev)
            # The incremented values become the source for the next round.
            src_dev.copy_to_device(res_dev, stream=stream)

        logger.info("get kernel result")
        res_dev.copy_to_host(res_host, stream=stream)
        logger.info("synchronize on ctx __exit__")

    logger.info("compare result")
    np.testing.assert_equal(res_host, src_host + n_rounds)
def test_async_copy_to_device(self):
    """Overwrite a device array from a host buffer via a stream.

    The test expects the stream-bound ``copy_to_device`` to add one entry
    to ``stream._signals``, and expects the following stream-less
    ``copy_to_host()`` to leave that list empty before returning data
    equal to the host buffer.
    """
    source = np.arange(1024)
    device_buf = roc.to_device(source)

    # allocate pinned array equivalent
    staged = roc.coarsegrain_array(shape=source.shape, dtype=source.dtype)
    staged[:] = source + 100

    stream = roc.stream()
    pending_before = len(stream._signals)

    device_buf.copy_to_device(staged, stream=stream)
    self.assertEqual(
        pending_before + 1,
        len(stream._signals),
        "no new async signal",
    )

    # implicit synchronization
    fetched = device_buf.copy_to_host()
    self.assertEqual(
        0,
        len(stream._signals),
        "missing implicit synchronization",
    )

    np.testing.assert_equal(staged, fetched)