def __call__(self, *args):
    self._sentry_resource_limit()

    ctx, symbol, kernargs, kernarg_region = self.bind()

    # Unpack pyobject values into ctypes scalar values
    expanded_values = []
    # contains lambdas to execute on return
    retr = []
    for ty, val in zip(self.argument_types, args):
        _unpack_argument(ty, val, expanded_values, retr)

    # Insert kernel arguments
    base = 0
    for av in expanded_values:
        # Adjust for alignment
        align = ctypes.sizeof(av)
        pad = _calc_padding_for_alignment(align, base)
        base += pad
        # Move to offset
        offseted = kernargs.value + base
        asptr = ctypes.cast(offseted, ctypes.POINTER(type(av)))
        # Assign value
        asptr[0] = av
        # Increment offset
        base += align

    # Actual kernel launch
    qq = ctx.default_queue

    if self.stream is None:
        hsa.implicit_sync()

    # Dispatch
    signal = None
    if self.stream is not None:
        signal = hsa.create_signal(1)
        qq.insert_barrier(self.stream._get_last_signal())

    qq.dispatch(symbol, kernargs, workgroup_size=self.local_size,
                grid_size=self.global_size, signal=signal)

    if self.stream is not None:
        self.stream._add_signal(signal)

    # Retrieve auto-converted arrays
    for wb in retr:
        wb()

    # Free the kernarg allocation
    if kernargs is not None:
        if self.stream is None:
            kernarg_region.free(kernargs)
        else:
            self.stream._add_callback(
                lambda: kernarg_region.free(kernargs))
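# A minimal sketch of the padding helper used above. The driver's own
# definition of _calc_padding_for_alignment is not shown in this section, so
# this (hypothetically named) version only illustrates the assumed behaviour:
# round `base` up to the next multiple of `align` and return the pad bytes.
def _calc_padding_for_alignment_sketch(align, base):
    # e.g. align=8, base=12 -> pad=4, so the next value lands at offset 16
    rmdr = base % align
    return 0 if rmdr == 0 else align - rmdr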
def test_mempool_amd_example(self):
    dGPU_agent = self.gpu
    gpu_ctx = Context(dGPU_agent)
    CPU_agent = self.cpu
    cpu_ctx = Context(CPU_agent)

    kNumInt = 1024
    kSize = kNumInt * ctypes.sizeof(ctypes.c_int)

    dependent_signal = roc.create_signal(0)
    completion_signal = roc.create_signal(0)

    # allocate host src and dst, allow gpu access
    flags = dict(allow_access_to=[gpu_ctx.agent], finegrain=False)
    host_src = cpu_ctx.mempoolalloc(kSize, **flags)
    host_dst = cpu_ctx.mempoolalloc(kSize, **flags)

    # the AMD example this is based on loops over GPU hardware in `i`
    i = 0

    # get gpu local pool
    local_memory = gpu_ctx.mempoolalloc(kSize)

    host_src_view = (kNumInt * ctypes.c_int).from_address(
        host_src.device_pointer.value)
    host_dst_view = (kNumInt * ctypes.c_int).from_address(
        host_dst.device_pointer.value)

    host_src_view[:] = i + 2016 + np.arange(0, kNumInt, dtype=np.int32)
    host_dst_view[:] = np.zeros(kNumInt, dtype=np.int32)

    # print("GPU: %s" % gpu_ctx._agent.name)
    # print("CPU: %s" % cpu_ctx._agent.name)

    roc.hsa_signal_store_relaxed(completion_signal, 1)

    q = queue.Queue()

    class validatorThread(threading.Thread):
        def run(self):
            val = roc.hsa_signal_wait_acquire(
                completion_signal,
                enums.HSA_SIGNAL_CONDITION_EQ,
                0,
                ctypes.c_uint64(-1),
                enums.HSA_WAIT_STATE_ACTIVE)
            q.put(val)  # wait_res

    # this could be a call on the signal itself:
    # dependent_signal.store_relaxed(1)
    roc.hsa_signal_store_relaxed(dependent_signal, 1)

    h2l_start = threading.Semaphore(value=0)

    class l2hThread(threading.Thread):
        def run(self):
            dep_signal = drvapi.hsa_signal_t(dependent_signal._id)
            roc.hsa_amd_memory_async_copy(
                host_dst.device_pointer.value, cpu_ctx._agent._id,
                local_memory.device_pointer.value, gpu_ctx._agent._id,
                kSize, 1, ctypes.byref(dep_signal),
                completion_signal)
            h2l_start.release()  # signal h2l to start

    class h2lThread(threading.Thread):
        def run(self):
            h2l_start.acquire()  # wait until the l2h thread has started
            roc.hsa_amd_memory_async_copy(
                local_memory.device_pointer.value, gpu_ctx._agent._id,
                host_src.device_pointer.value, cpu_ctx._agent._id,
                kSize, 0, None,
                dependent_signal)

    timeout = 10  # 10 second timeout

    # init thread instances
    validator = validatorThread()
    l2h = l2hThread()
    h2l = h2lThread()

    # run them
    validator.start()
    l2h.start()
    h2l.start()

    # join
    l2h.join(timeout)
    h2l.join(timeout)
    validator.join(timeout)

    # verify
    wait_res = q.get()
    self.assertEqual(wait_res, 0)
    np.testing.assert_allclose(host_dst_view, host_src_view)
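# For reference, the effective ordering in test_mempool_amd_example above,
# assuming the documented HSA semantics of hsa_amd_memory_async_copy (the
# copy starts once every dependent signal reaches 0 and decrements its
# completion signal when it finishes):
#
#   h2lThread:       host_src -> local_memory, no deps,
#                    completion = dependent_signal (1 -> 0)
#   l2hThread:       local_memory -> host_dst, depends on dependent_signal,
#                    completion = completion_signal (1 -> 0)
#   validatorThread: waits until completion_signal == 0, after which
#                    host_dst_view is compared against host_src_view.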
def test_coarse_grained_kernel_execution(self):
    """
    This tests the execution of a kernel on a dGPU using coarse memory
    regions for the buffers.
    NOTE: the code violates the HSA spec in that it uses a coarse region
    for kernargs; this is a performance hack.
    """
    from numba.roc.hsadrv.driver import BrigModule, Program, hsa, \
        Executable

    # get a brig file
    brig_file = get_brig_file()
    brig_module = BrigModule.from_file(brig_file)
    self.assertGreater(len(brig_module), 0)

    # use existing GPU regions for computation space
    gpu_regions = self.gpu.regions
    gpu_only_coarse_regions = list()
    gpu_host_accessible_coarse_regions = list()
    for r in gpu_regions:
        if r.supports(enums.HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED):
            if r.host_accessible:
                gpu_host_accessible_coarse_regions.append(r)
            else:
                gpu_only_coarse_regions.append(r)

    # check we have 1+ coarse gpu region(s) of each type
    self.assertGreater(len(gpu_only_coarse_regions), 0)
    self.assertGreater(len(gpu_host_accessible_coarse_regions), 0)

    # Compilation phase:
    # FIXME: this is dubious, assumes the launching agent is indexed first
    agent = roc.components[0]

    prog = Program()
    prog.add_module(brig_module)

    # get kernel and load
    code = prog.finalize(agent.isa)

    ex = Executable()
    ex.load(agent, code)
    ex.freeze()

    # extract symbols
    sym = ex.get_symbol(agent, "&__vector_copy_kernel")
    self.assertNotEqual(sym.kernel_object, 0)
    self.assertGreater(sym.kernarg_segment_size, 0)

    # attempt kernel execution
    import ctypes
    import numpy as np

    # Do memory allocations

    # allocate and initialise memory
    nelem = 1024 * 1024
    src = np.random.random(nelem).astype(np.float32)
    z0 = np.zeros_like(src)

    # alloc host accessible memory
    nbytes = ctypes.sizeof(ctypes.c_float) * nelem
    gpu_host_accessible_region = gpu_host_accessible_coarse_regions[0]
    host_in_ptr = gpu_host_accessible_region.allocate(nbytes)
    self.assertNotEqual(host_in_ptr.value, None,
                        "pointer must not be NULL")
    host_out_ptr = gpu_host_accessible_region.allocate(nbytes)
    self.assertNotEqual(host_out_ptr.value, None,
                        "pointer must not be NULL")

    # init mem with data
    roc.hsa_memory_copy(host_in_ptr, src.ctypes.data, src.nbytes)
    roc.hsa_memory_copy(host_out_ptr, z0.ctypes.data, z0.nbytes)

    # alloc gpu only memory
    gpu_only_region = gpu_only_coarse_regions[0]
    gpu_in_ptr = gpu_only_region.allocate(nbytes)
    self.assertNotEqual(gpu_in_ptr.value, None,
                        "pointer must not be NULL")
    gpu_out_ptr = gpu_only_region.allocate(nbytes)
    self.assertNotEqual(gpu_out_ptr.value, None,
                        "pointer must not be NULL")

    # copy memory from host accessible location to gpu only
    roc.hsa_memory_copy(gpu_in_ptr, host_in_ptr, src.nbytes)

    # Do kernargs

    # Find a coarse region (for better performance on dGPU) in which
    # to place kernargs. NOTE: this violates the HSA spec
    kernarg_regions = list()
    for r in gpu_host_accessible_coarse_regions:
        # NOTE: VIOLATION
        if r.supports(enums.HSA_REGION_GLOBAL_FLAG_KERNARG):
            kernarg_regions.append(r)
    self.assertGreater(len(kernarg_regions), 0)

    # use first region for args
    kernarg_region = kernarg_regions[0]

    kernarg_ptr = kernarg_region.allocate(
        2 * ctypes.sizeof(ctypes.c_void_p))
    self.assertNotEqual(kernarg_ptr.value, None,
                        "pointer must not be NULL")

    # wire in gpu memory
    argref = (2 * ctypes.c_size_t).from_address(kernarg_ptr.value)
    argref[0] = gpu_in_ptr.value
    argref[1] = gpu_out_ptr.value

    # signal (created but unused; the dispatch below passes signal=None)
    sig = roc.create_signal(1)

    # create queue and dispatch job
    queue = agent.create_queue_single(32)
    queue.dispatch(sym, kernarg_ptr, workgroup_size=(256, 1, 1),
                   grid_size=(nelem, 1, 1), signal=None)

    # copy result back to host accessible memory to check
    roc.hsa_memory_copy(host_out_ptr, gpu_out_ptr, src.nbytes)

    # check the data is recovered
    ref = (nelem * ctypes.c_float).from_address(host_out_ptr.value)
    np.testing.assert_equal(ref, src)

    # free
    roc.hsa_memory_free(host_in_ptr)
    roc.hsa_memory_free(host_out_ptr)
    roc.hsa_memory_free(gpu_in_ptr)
    roc.hsa_memory_free(gpu_out_ptr)
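# Hypothetical variant (a sketch, not what the test above exercises): the
# otherwise-unused `sig` could be passed as the completion signal so the
# dispatch runs asynchronously, then waited on with the same signal-wait
# call the mempool test uses. This assumes dispatch decrements the signal
# to 0 on completion:
#
#   queue.dispatch(sym, kernarg_ptr, workgroup_size=(256, 1, 1),
#                  grid_size=(nelem, 1, 1), signal=sig)
#   roc.hsa_signal_wait_acquire(sig, enums.HSA_SIGNAL_CONDITION_EQ, 0,
#                               ctypes.c_uint64(-1),
#                               enums.HSA_WAIT_STATE_ACTIVE)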