def test_mempool(self):
    """Round-trip data through a GPU-only memory pool via a CPU pool
    that the GPU has been granted access to (``mempoolalloc`` path)."""
    count = 10  # number of doubles to allocate
    size_bytes = ctypes.sizeof(ctypes.c_double) * count

    # A memory pool local to the dGPU.
    gpu_ctx = Context(self.gpu)
    device_buf = gpu_ctx.mempoolalloc(size_bytes)

    # A CPU memory pool; the GPU is explicitly allowed to access it.
    cpu_ctx = Context(self.cpu)
    host_buf = cpu_ctx.mempoolalloc(size_bytes,
                                    allow_access_to=[gpu_ctx.agent])

    # Stage random data in the host pool, then push it to the device pool.
    expected = np.random.random(count).astype(np.float64)
    roc.hsa_memory_copy(host_buf.device_pointer, expected.ctypes.data,
                        expected.nbytes)
    roc.hsa_memory_copy(device_buf.device_pointer, host_buf.device_pointer,
                        expected.nbytes)

    # Zero the host pool and confirm the zeros through a ctypes view.
    zeros = np.zeros_like(expected)
    roc.hsa_memory_copy(host_buf.device_pointer, zeros.ctypes.data,
                        zeros.nbytes)
    host_view = (count * ctypes.c_double).from_address(
        host_buf.device_pointer.value)
    for value in host_view:
        self.assertEqual(value, 0)

    # Pull the data back from the dGPU and compare element-wise.
    roc.hsa_memory_copy(host_buf.device_pointer, device_buf.device_pointer,
                        expected.nbytes)
    for got, want in zip(host_view, expected):
        self.assertEqual(got, want)
def check_mempool_with_flags(self, finegrain):
    """Allocate from a CPU memory pool with GPU access enabled and the
    requested granularity.

    Parameters
    ----------
    finegrain : bool
        Whether to request a fine-grained memory pool.
    """
    dGPU_agent = self.gpu
    gpu_ctx = Context(dGPU_agent)
    CPU_agent = self.cpu
    cpu_ctx = Context(CPU_agent)
    # Get a mempool with the specific flags.
    # BUG FIX: `finegrain` was previously ignored, so both callers of this
    # helper exercised the same (default) pool; forward it to mempoolalloc,
    # which accepts a `finegrain` keyword (see test_mempool_amd_example).
    cpu_ctx.mempoolalloc(1024, allow_access_to=[gpu_ctx._agent],
                         finegrain=finegrain)
def test_memalloc(self):
    """Tests Context.memalloc() for a given, in the parlance of HSA,
    `component`. Testing includes specialisations for the supported
    components of dGPUs and APUs.

    NOTE(review): an identical second definition of ``test_memalloc``
    appears later in this file and shadows this one at class-creation
    time; this copy never runs. Consider deleting one of the two.
    """
    n = 10  # things to alloc
    nbytes = ctypes.sizeof(ctypes.c_double) * n
    # run if a dGPU is present
    if dgpu_present:
        # find a host accessible region
        dGPU_agent = self.gpu
        CPU_agent = self.cpu
        gpu_ctx = Context(dGPU_agent)
        # device-only allocation plus a host-accessible one on the GPU
        gpu_only_mem = gpu_ctx.memalloc(nbytes, hostAccessible=False)
        ha_mem = gpu_ctx.memalloc(nbytes, hostAccessible=True)
        # on dGPU systems, all host mem is host accessible
        cpu_ctx = Context(CPU_agent)
        cpu_mem = cpu_ctx.memalloc(nbytes, hostAccessible=True)
        # Test writing to allocated area: host -> host-accessible -> GPU-only
        src = np.random.random(n).astype(np.float64)
        roc.hsa_memory_copy(cpu_mem.device_pointer, src.ctypes.data,
                            src.nbytes)
        roc.hsa_memory_copy(ha_mem.device_pointer, cpu_mem.device_pointer,
                            src.nbytes)
        roc.hsa_memory_copy(gpu_only_mem.device_pointer,
                            ha_mem.device_pointer, src.nbytes)
        # clear the host-accessible buffer so the copy-back is observable
        z0 = np.zeros_like(src)
        roc.hsa_memory_copy(ha_mem.device_pointer, z0.ctypes.data, z0.nbytes)
        # ctypes view over the host-accessible buffer for verification
        ref = (n * ctypes.c_double).from_address(ha_mem.device_pointer.value)
        for k in range(n):
            self.assertEqual(ref[k], 0)
        # copy back from dGPU and check the original data survived
        roc.hsa_memory_copy(ha_mem.device_pointer,
                            gpu_only_mem.device_pointer, src.nbytes)
        for k in range(n):
            self.assertEqual(ref[k], src[k])
    else:
        #TODO: write APU variant
        pass
def test_memalloc(self):
    """Exercise ``Context.memalloc()`` for an HSA `component`, covering
    the dGPU specialisation (APU variant still TODO)."""
    count = 10  # number of doubles to allocate
    num_bytes = ctypes.sizeof(ctypes.c_double) * count
    # Only the dGPU path is implemented so far.
    if not dgpu_present:
        # TODO: write APU variant
        return

    # Device-only memory plus a host-accessible region on the same GPU.
    gpu_ctx = Context(self.gpu)
    device_only = gpu_ctx.memalloc(num_bytes, hostAccessible=False)
    staging = gpu_ctx.memalloc(num_bytes, hostAccessible=True)
    # On dGPU systems, all host memory is host accessible.
    cpu_ctx = Context(self.cpu)
    host_buf = cpu_ctx.memalloc(num_bytes, hostAccessible=True)

    # Write random data along host -> staging -> device-only.
    payload = np.random.random(count).astype(np.float64)
    roc.hsa_memory_copy(host_buf.device_pointer, payload.ctypes.data,
                        payload.nbytes)
    roc.hsa_memory_copy(staging.device_pointer, host_buf.device_pointer,
                        payload.nbytes)
    roc.hsa_memory_copy(device_only.device_pointer, staging.device_pointer,
                        payload.nbytes)

    # Zero the staging buffer and verify through a ctypes view.
    blank = np.zeros_like(payload)
    roc.hsa_memory_copy(staging.device_pointer, blank.ctypes.data,
                        blank.nbytes)
    view = (count * ctypes.c_double).from_address(
        staging.device_pointer.value)
    for value in view:
        self.assertEqual(value, 0)

    # Copy back from the dGPU and compare element-wise.
    roc.hsa_memory_copy(staging.device_pointer, device_only.device_pointer,
                        payload.nbytes)
    for got, want in zip(view, payload):
        self.assertEqual(got, want)
def test_mempool_amd_example(self):
    """Port of an AMD HSA sample: asynchronously copy data
    host -> GPU-local pool -> host using two async copies chained by
    signals, with a separate thread validating the completion signal.

    Fixes: removed the non-Pythonic trailing semicolons on the
    ``hsa_signal_store_relaxed`` calls and dropped stale commented-out
    debug prints; all executable tokens are otherwise unchanged.
    """
    dGPU_agent = self.gpu
    gpu_ctx = Context(dGPU_agent)
    CPU_agent = self.cpu
    cpu_ctx = Context(CPU_agent)
    kNumInt = 1024
    kSize = kNumInt * ctypes.sizeof(ctypes.c_int)
    dependent_signal = roc.create_signal(0)
    completion_signal = roc.create_signal(0)

    ## allocate host src and dst, allow gpu access
    flags = dict(allow_access_to=[gpu_ctx.agent], finegrain=False)
    host_src = cpu_ctx.mempoolalloc(kSize, **flags)
    host_dst = cpu_ctx.mempoolalloc(kSize, **flags)

    # there's a loop in `i` here over GPU hardware
    i = 0
    # get gpu local pool
    local_memory = gpu_ctx.mempoolalloc(kSize)

    # ctypes views over the host buffers for initialisation/verification
    host_src_view = (kNumInt * ctypes.c_int).from_address(
        host_src.device_pointer.value)
    host_dst_view = (kNumInt * ctypes.c_int).from_address(
        host_dst.device_pointer.value)
    host_src_view[:] = i + 2016 + np.arange(0, kNumInt, dtype=np.int32)
    host_dst_view[:] = np.zeros(kNumInt, dtype=np.int32)

    # Arm the completion signal; the validator waits for it to reach 0.
    roc.hsa_signal_store_relaxed(completion_signal, 1)

    q = queue.Queue()

    class validatorThread(threading.Thread):
        def run(self):
            # Block until completion_signal == 0, then report the wait
            # result back to the main thread via the queue.
            val = roc.hsa_signal_wait_acquire(
                completion_signal, enums.HSA_SIGNAL_CONDITION_EQ, 0,
                ctypes.c_uint64(-1), enums.HSA_WAIT_STATE_ACTIVE)
            q.put(val)  # wait_res

    # this could be a call on the signal itself:
    #   dependent_signal.store_relaxed(1)
    roc.hsa_signal_store_relaxed(dependent_signal, 1)

    h2l_start = threading.Semaphore(value=0)

    class l2hThread(threading.Thread):
        def run(self):
            # local -> host copy; depends on dependent_signal, decrements
            # completion_signal when done.
            dep_signal = drvapi.hsa_signal_t(dependent_signal._id)
            roc.hsa_amd_memory_async_copy(
                host_dst.device_pointer.value, cpu_ctx._agent._id,
                local_memory.device_pointer.value, gpu_ctx._agent._id,
                kSize, 1, ctypes.byref(dep_signal), completion_signal)
            h2l_start.release()  # signal h2l to start

    class h2lThread(threading.Thread):
        def run(self):
            h2l_start.acquire()  # to wait until l2h thread has started
            # host -> local copy; no dependencies, signals
            # dependent_signal when done.
            roc.hsa_amd_memory_async_copy(
                local_memory.device_pointer.value, gpu_ctx._agent._id,
                host_src.device_pointer.value, cpu_ctx._agent._id,
                kSize, 0, None, dependent_signal)

    timeout = 10  # 10 seconds timeout
    # init thread instances
    validator = validatorThread()
    l2h = l2hThread()
    h2l = h2lThread()
    # run them
    validator.start()
    l2h.start()
    h2l.start()
    # join
    l2h.join(timeout)
    h2l.join(timeout)
    validator.join(timeout)
    # verify: the wait returned 0 and the data round-tripped intact
    wait_res = q.get()
    self.assertEqual(wait_res, 0)
    np.testing.assert_allclose(host_dst_view, host_src_view)
def test_mempool_amd_example(self):
    """Port of an AMD HSA sample: asynchronously copy data
    host -> GPU-local pool -> host using two chained async copies, with
    a validator thread waiting on the final completion signal.

    The chaining works via signals: ``completion_signal`` is armed to 1
    and waited on until it reaches 0; the host->local copy signals
    ``dependent_signal`` on completion, which the local->host copy lists
    as its dependency before it decrements ``completion_signal``.
    """
    dGPU_agent = self.gpu
    gpu_ctx = Context(dGPU_agent)
    CPU_agent = self.cpu
    cpu_ctx = Context(CPU_agent)
    kNumInt = 1024
    kSize = kNumInt * ctypes.sizeof(ctypes.c_int)
    dependent_signal = roc.create_signal(0)
    completion_signal = roc.create_signal(0)
    ## allocate host src and dst, allow gpu access
    flags = dict(allow_access_to=[gpu_ctx.agent], finegrain=False)
    host_src = cpu_ctx.mempoolalloc(kSize, **flags)
    host_dst = cpu_ctx.mempoolalloc(kSize, **flags)
    # there's a loop in `i` here over GPU hardware
    i = 0
    # get gpu local pool
    local_memory = gpu_ctx.mempoolalloc(kSize)
    # ctypes views over the host buffers for init/verification
    host_src_view = (kNumInt * ctypes.c_int).from_address(
        host_src.device_pointer.value)
    host_dst_view = (kNumInt * ctypes.c_int).from_address(
        host_dst.device_pointer.value)
    host_src_view[:] = i + 2016 + np.arange(0, kNumInt, dtype=np.int32)
    host_dst_view[:] = np.zeros(kNumInt, dtype=np.int32)
    # print("GPU: %s"%gpu_ctx._agent.name)
    # print("CPU: %s"%cpu_ctx._agent.name)
    # Arm the completion signal; the validator waits for it to hit 0.
    roc.hsa_signal_store_relaxed(completion_signal, 1)
    q = queue.Queue()
    class validatorThread(threading.Thread):
        def run(self):
            # Block until completion_signal == 0 and hand the wait result
            # back to the main thread through the queue.
            val = roc.hsa_signal_wait_acquire(
                completion_signal, enums.HSA_SIGNAL_CONDITION_EQ, 0,
                ctypes.c_uint64(-1), enums.HSA_WAIT_STATE_ACTIVE)
            q.put(val) # wait_res
    # this could be a call on the signal itself:
    #   dependent_signal.store_relaxed(1)
    roc.hsa_signal_store_relaxed(dependent_signal, 1)
    h2l_start = threading.Semaphore(value=0)
    class l2hThread(threading.Thread):
        def run(self):
            # local -> host copy; lists dependent_signal as a dependency
            # and decrements completion_signal when the copy finishes.
            dep_signal = drvapi.hsa_signal_t(dependent_signal._id)
            roc.hsa_amd_memory_async_copy(
                host_dst.device_pointer.value, cpu_ctx._agent._id,
                local_memory.device_pointer.value, gpu_ctx._agent._id,
                kSize, 1, ctypes.byref(dep_signal), completion_signal)
            h2l_start.release() # signal h2l to start
    class h2lThread(threading.Thread):
        def run(self):
            h2l_start.acquire() # to wait until l2h thread has started
            # host -> local copy; no dependencies, completes into
            # dependent_signal, releasing the l2h copy above.
            roc.hsa_amd_memory_async_copy(
                local_memory.device_pointer.value, gpu_ctx._agent._id,
                host_src.device_pointer.value, cpu_ctx._agent._id,
                kSize, 0, None, dependent_signal)
    timeout = 10 # 10 seconds timeout
    #
    # init thread instances
    validator = validatorThread()
    l2h = l2hThread()
    h2l = h2lThread()
    # run them
    validator.start()
    l2h.start()
    h2l.start()
    # join
    l2h.join(timeout)
    h2l.join(timeout)
    validator.join(timeout)
    # verify: the wait returned 0 and the data round-tripped intact
    wait_res = q.get()
    self.assertEqual(wait_res, 0)
    np.testing.assert_allclose(host_dst_view, host_src_view)