def test_create_program(self):
    """Finalize a BRIG module for this GPU's ISA, load it into a frozen
    executable, and check the kernel symbol reports a non-zero kernarg
    segment size."""
    kernel_name = '&__vector_copy_kernel'
    module = BrigModule.from_file(get_brig_file())
    prog = Program()
    prog.add_module(module)
    finalized_code = prog.finalize(self.gpu.isa)
    executable = Executable()
    executable.load(self.gpu, finalized_code)
    executable.freeze()
    kernel_sym = executable.get_symbol(self.gpu, kernel_name)
    self.assertGreater(kernel_sym.kernarg_segment_size, 0)
def test_from_file(self):
    """A BRIG module loaded from disk must be non-empty."""
    module = BrigModule.from_file(get_brig_file())
    self.assertGreater(len(module), 0)
def test_coarse_grained_kernel_execution(self):
    """
    This tests the execution of a kernel on a dGPU using coarse
    memory regions for the buffers.

    NOTE: the code violates the HSA spec in that it uses a coarse
    region for kernargs, this is a performance hack.
    """
    import ctypes
    import numpy as np
    # `hsa` was imported here but never used; dropped.
    from numba.roc.hsadrv.driver import BrigModule, Program, Executable

    # get a brig file
    brig_file = get_brig_file()
    brig_module = BrigModule.from_file(brig_file)
    self.assertGreater(len(brig_module), 0)

    # use existing GPU regions for computation space, partitioned into
    # coarse-grained device-only vs. coarse-grained host-accessible
    gpu_only_coarse_regions = []
    gpu_host_accessible_coarse_regions = []
    for r in self.gpu.regions:
        if r.supports(enums.HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED):
            if r.host_accessible:
                gpu_host_accessible_coarse_regions.append(r)
            else:
                gpu_only_coarse_regions.append(r)
    # check we have 1+ coarse gpu region(s) of each type
    self.assertGreater(len(gpu_only_coarse_regions), 0)
    self.assertGreater(len(gpu_host_accessible_coarse_regions), 0)

    # Compilation phase:
    # FIXME: this is dubious, assume launching agent is indexed first
    agent = roc.components[0]

    prog = Program()
    prog.add_module(brig_module)

    # get kernel and load
    code = prog.finalize(agent.isa)
    ex = Executable()
    ex.load(agent, code)
    ex.freeze()

    # extract symbols
    sym = ex.get_symbol(agent, "&__vector_copy_kernel")
    self.assertNotEqual(sym.kernel_object, 0)
    self.assertGreater(sym.kernarg_segment_size, 0)

    # allocate and initialise memory
    nelem = 1024 * 1024
    src = np.random.random(nelem).astype(np.float32)
    z0 = np.zeros_like(src)

    # alloc host accessible memory
    nbytes = ctypes.sizeof(ctypes.c_float) * nelem
    gpu_host_accessible_region = gpu_host_accessible_coarse_regions[0]
    host_in_ptr = gpu_host_accessible_region.allocate(nbytes)
    self.assertNotEqual(host_in_ptr.value, None,
                        "pointer must not be NULL")
    host_out_ptr = gpu_host_accessible_region.allocate(nbytes)
    self.assertNotEqual(host_out_ptr.value, None,
                        "pointer must not be NULL")

    # init mem with data
    roc.hsa_memory_copy(host_in_ptr, src.ctypes.data, src.nbytes)
    roc.hsa_memory_copy(host_out_ptr, z0.ctypes.data, z0.nbytes)

    # alloc gpu only memory
    gpu_only_region = gpu_only_coarse_regions[0]
    gpu_in_ptr = gpu_only_region.allocate(nbytes)
    self.assertNotEqual(gpu_in_ptr.value, None,
                        "pointer must not be NULL")
    gpu_out_ptr = gpu_only_region.allocate(nbytes)
    self.assertNotEqual(gpu_out_ptr.value, None,
                        "pointer must not be NULL")

    # copy memory from host accessible location to gpu only
    roc.hsa_memory_copy(gpu_in_ptr, host_in_ptr, src.nbytes)

    # Find a coarse region (for better performance on dGPU) in which
    # to place kernargs. NOTE: This violates the HSA spec
    kernarg_regions = [r for r in gpu_host_accessible_coarse_regions
                       if r.supports(enums.HSA_REGION_GLOBAL_FLAG_KERNARG)]
    self.assertGreater(len(kernarg_regions), 0)

    # use first region for args; room for two pointer-sized arguments
    kernarg_region = kernarg_regions[0]
    kernarg_ptr = kernarg_region.allocate(
        2 * ctypes.sizeof(ctypes.c_void_p))
    # BUGFIX: check the underlying address, not the ctypes wrapper
    # object — the wrapper itself is never None, so the old
    # `assertNotEqual(kernarg_ptr, None)` was vacuous.
    self.assertNotEqual(kernarg_ptr.value, None,
                        "pointer must not be NULL")

    # wire in gpu memory
    argref = (2 * ctypes.c_size_t).from_address(kernarg_ptr.value)
    argref[0] = gpu_in_ptr.value
    argref[1] = gpu_out_ptr.value

    # BUGFIX: removed dead `sig = roc.create_signal(1)` — the signal
    # was never passed to dispatch nor destroyed (leaked).
    # signal=None presumably lets dispatch manage its own completion
    # signal — TODO confirm against the driver's dispatch().
    queue = agent.create_queue_single(32)
    queue.dispatch(sym, kernarg_ptr, workgroup_size=(256, 1, 1),
                   grid_size=(nelem, 1, 1), signal=None)

    # copy result back to host accessible memory to check
    roc.hsa_memory_copy(host_out_ptr, gpu_out_ptr, src.nbytes)

    # check the data is recovered
    ref = (nelem * ctypes.c_float).from_address(host_out_ptr.value)
    np.testing.assert_equal(ref, src)

    # free all allocations (BUGFIX: kernarg_ptr was previously leaked)
    roc.hsa_memory_free(host_in_ptr)
    roc.hsa_memory_free(host_out_ptr)
    roc.hsa_memory_free(gpu_in_ptr)
    roc.hsa_memory_free(gpu_out_ptr)
    roc.hsa_memory_free(kernarg_ptr)
def test_coarse_grained_kernel_execution(self):
    """
    This tests the execution of a kernel on a dGPU using coarse
    memory regions for the buffers.

    NOTE: the code violates the HSA spec in that it uses a coarse
    region for kernargs, this is a performance hack.
    """
    # NOTE(review): this duplicates an identically-named test defined
    # earlier in the class; Python keeps only the last binding, so just
    # one copy ever runs — consider removing or renaming one of them.
    import ctypes
    import numpy as np
    from numba.roc.hsadrv.driver import BrigModule, Program, hsa,\
        Executable

    # load the finalizer input (BRIG) from disk
    module = BrigModule.from_file(get_brig_file())
    self.assertGreater(len(module), 0)

    # split this GPU's coarse grained regions by host visibility
    device_only_regions = []
    host_visible_regions = []
    for region in self.gpu.regions:
        if not region.supports(enums.HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED):
            continue
        if region.host_accessible:
            host_visible_regions.append(region)
        else:
            device_only_regions.append(region)
    # need at least one coarse region of each kind
    self.assertGreater(len(device_only_regions), 0)
    self.assertGreater(len(host_visible_regions), 0)

    # Compilation phase:
    # FIXME: this is dubious, assume launching agent is indexed first
    agent = roc.components[0]

    program = Program()
    program.add_module(module)
    finalized = program.finalize(agent.isa)
    executable = Executable()
    executable.load(agent, finalized)
    executable.freeze()

    # pull out the kernel symbol and sanity-check it
    kernel = executable.get_symbol(agent, "&__vector_copy_kernel")
    self.assertNotEqual(kernel.kernel_object, 0)
    self.assertGreater(kernel.kernarg_segment_size, 0)

    # host-side source data and an all-zero reference output
    count = 1024 * 1024
    src = np.random.random(count).astype(np.float32)
    zeros = np.zeros_like(src)
    nbytes = ctypes.sizeof(ctypes.c_float) * count

    # staging buffers in host accessible coarse memory
    staging = host_visible_regions[0]
    host_in = staging.allocate(nbytes)
    self.assertNotEqual(host_in.value, None, "pointer must not be NULL")
    host_out = staging.allocate(nbytes)
    self.assertNotEqual(host_out.value, None, "pointer must not be NULL")

    roc.hsa_memory_copy(host_in, src.ctypes.data, src.nbytes)
    roc.hsa_memory_copy(host_out, zeros.ctypes.data, zeros.nbytes)

    # device-only buffers for the actual kernel I/O
    device_region = device_only_regions[0]
    dev_in = device_region.allocate(nbytes)
    self.assertNotEqual(dev_in.value, None, "pointer must not be NULL")
    dev_out = device_region.allocate(nbytes)
    self.assertNotEqual(dev_out.value, None, "pointer must not be NULL")

    # stage the input onto the device
    roc.hsa_memory_copy(dev_in, host_in, src.nbytes)

    # Find a coarse region (for better performance on dGPU) in which
    # to place kernargs. NOTE: This violates the HSA spec
    kernarg_candidates = [region for region in host_visible_regions
                          if region.supports(
                              enums.HSA_REGION_GLOBAL_FLAG_KERNARG)]
    self.assertGreater(len(kernarg_candidates), 0)

    # two pointer-sized kernel arguments: input and output buffers
    kernarg_ptr = kernarg_candidates[0].allocate(
        2 * ctypes.sizeof(ctypes.c_void_p))
    self.assertNotEqual(kernarg_ptr, None, "pointer must not be NULL")

    args = (2 * ctypes.c_size_t).from_address(kernarg_ptr.value)
    args[0] = dev_in.value
    args[1] = dev_out.value

    # signal
    sig = roc.create_signal(1)

    # create queue and dispatch job
    queue = agent.create_queue_single(32)
    queue.dispatch(kernel, kernarg_ptr, workgroup_size=(256, 1, 1),
                   grid_size=(count, 1, 1), signal=None)

    # copy result back to host accessible memory to check
    roc.hsa_memory_copy(host_out, dev_out, src.nbytes)

    # the kernel copies src -> dev_out, so host_out must equal src
    result = (count * ctypes.c_float).from_address(host_out.value)
    np.testing.assert_equal(result, src)

    # free
    roc.hsa_memory_free(host_in)
    roc.hsa_memory_free(host_out)
    roc.hsa_memory_free(dev_in)
    roc.hsa_memory_free(dev_out)