Beispiel #1
0
    def test_create_program(self):
        brig_file = get_brig_file()
        symbol = '&__vector_copy_kernel'
        brig_module = BrigModule.from_file(brig_file)
        program = Program()
        program.add_module(brig_module)
        code = program.finalize(self.gpu.isa)

        ex = Executable()
        ex.load(self.gpu, code)
        ex.freeze()

        sym = ex.get_symbol(self.gpu, symbol)
        self.assertGreater(sym.kernarg_segment_size, 0)
Beispiel #2
0
    def test_create_program(self):
        brig_file = get_brig_file()
        symbol = '&__vector_copy_kernel'
        brig_module = BrigModule.from_file(brig_file)
        program = Program()
        program.add_module(brig_module)
        code = program.finalize(self.gpu.isa)

        ex = Executable()
        ex.load(self.gpu, code)
        ex.freeze()

        sym = ex.get_symbol(self.gpu, symbol)
        self.assertGreater(sym.kernarg_segment_size, 0)
Beispiel #3
0
 def test_from_file(self):
     brig_file = get_brig_file()
     brig_module = BrigModule.from_file(brig_file)
     self.assertGreater(len(brig_module), 0)
Beispiel #4
0
    def test_coarse_grained_kernel_execution(self):
        """
        This tests the execution of a kernel on a dGPU using coarse memory
        regions for the buffers.
        NOTE: the code violates the HSA spec in that it uses a coarse region
        for kernargs, this is a performance hack.
        """

        from numba.roc.hsadrv.driver import BrigModule, Program, hsa,\
                Executable

        # get a brig file
        brig_file = get_brig_file()
        brig_module = BrigModule.from_file(brig_file)
        self.assertGreater(len(brig_module), 0)

        # use existing GPU regions for computation space
        gpu_regions = self.gpu.regions
        gpu_only_coarse_regions = list()
        gpu_host_accessible_coarse_regions = list()
        for r in gpu_regions:
            if r.supports(enums.HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED):
                if r.host_accessible:
                    gpu_host_accessible_coarse_regions.append(r)
                else:
                    gpu_only_coarse_regions.append(r)

        # check we have 1+ coarse gpu region(s) of each type
        self.assertGreater(len(gpu_only_coarse_regions), 0)
        self.assertGreater(len(gpu_host_accessible_coarse_regions), 0)

        # Compilation phase:

        # FIXME: this is dubious, assume launching agent is indexed first
        agent = roc.components[0]

        prog = Program()
        prog.add_module(brig_module)

        # get kernel and load
        code = prog.finalize(agent.isa)

        ex = Executable()
        ex.load(agent, code)
        ex.freeze()

        # extract symbols
        sym = ex.get_symbol(agent, "&__vector_copy_kernel")
        self.assertNotEqual(sym.kernel_object, 0)
        self.assertGreater(sym.kernarg_segment_size, 0)

        # attempt kernel excution
        import ctypes
        import numpy as np

        # Do memory allocations

        # allocate and initialise memory
        nelem = 1024 * 1024

        src = np.random.random(nelem).astype(np.float32)
        z0 = np.zeros_like(src)

        # alloc host accessible memory
        nbytes = ctypes.sizeof(ctypes.c_float) * nelem
        gpu_host_accessible_region = gpu_host_accessible_coarse_regions[0]
        host_in_ptr = gpu_host_accessible_region.allocate(nbytes)
        self.assertNotEqual(host_in_ptr.value, None,
                "pointer must not be NULL")
        host_out_ptr = gpu_host_accessible_region.allocate(nbytes)
        self.assertNotEqual(host_out_ptr.value, None,
                "pointer must not be NULL")

        # init mem with data
        roc.hsa_memory_copy(host_in_ptr, src.ctypes.data, src.nbytes)
        roc.hsa_memory_copy(host_out_ptr, z0.ctypes.data, z0.nbytes)

        # alloc gpu only memory
        gpu_only_region = gpu_only_coarse_regions[0]
        gpu_in_ptr = gpu_only_region.allocate(nbytes)
        self.assertNotEqual(gpu_in_ptr.value, None, "pointer must not be NULL")
        gpu_out_ptr = gpu_only_region.allocate(nbytes)
        self.assertNotEqual(gpu_out_ptr.value, None,
            "pointer must not be NULL")

        # copy memory from host accessible location to gpu only
        roc.hsa_memory_copy(gpu_in_ptr, host_in_ptr, src.nbytes)

        # Do kernargs

        # Find a coarse region (for better performance on dGPU) in which
        # to place kernargs. NOTE: This violates the HSA spec
        kernarg_regions = list()
        for r in gpu_host_accessible_coarse_regions:
           # NOTE: VIOLATION
            if r.supports(enums.HSA_REGION_GLOBAL_FLAG_KERNARG):
                kernarg_regions.append(r)
        self.assertGreater(len(kernarg_regions), 0)

        # use first region for args
        kernarg_region = kernarg_regions[0]

        kernarg_ptr = kernarg_region.allocate(
                2 * ctypes.sizeof(ctypes.c_void_p))

        self.assertNotEqual(kernarg_ptr, None, "pointer must not be NULL")

        # wire in gpu memory
        argref = (2 * ctypes.c_size_t).from_address(kernarg_ptr.value)
        argref[0] = gpu_in_ptr.value
        argref[1] = gpu_out_ptr.value

        # signal
        sig = roc.create_signal(1)

        # create queue and dispatch job

        queue = agent.create_queue_single(32)
        queue.dispatch(sym, kernarg_ptr, workgroup_size=(256, 1, 1),
                           grid_size=(nelem, 1, 1),signal=None)

        # copy result back to host accessible memory to check
        roc.hsa_memory_copy(host_out_ptr, gpu_out_ptr, src.nbytes)

        # check the data is recovered
        ref = (nelem * ctypes.c_float).from_address(host_out_ptr.value)
        np.testing.assert_equal(ref, src)

        # free
        roc.hsa_memory_free(host_in_ptr)
        roc.hsa_memory_free(host_out_ptr)
        roc.hsa_memory_free(gpu_in_ptr)
        roc.hsa_memory_free(gpu_out_ptr)
Beispiel #5
0
 def test_from_file(self):
     brig_file = get_brig_file()
     brig_module = BrigModule.from_file(brig_file)
     self.assertGreater(len(brig_module), 0)
Beispiel #6
0
    def test_coarse_grained_kernel_execution(self):
        """
        This tests the execution of a kernel on a dGPU using coarse memory
        regions for the buffers.
        NOTE: the code violates the HSA spec in that it uses a coarse region
        for kernargs, this is a performance hack.
        """

        from numba.roc.hsadrv.driver import BrigModule, Program, hsa,\
                Executable

        # get a brig file
        brig_file = get_brig_file()
        brig_module = BrigModule.from_file(brig_file)
        self.assertGreater(len(brig_module), 0)

        # use existing GPU regions for computation space
        gpu_regions = self.gpu.regions
        gpu_only_coarse_regions = list()
        gpu_host_accessible_coarse_regions = list()
        for r in gpu_regions:
            if r.supports(enums.HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED):
                if r.host_accessible:
                    gpu_host_accessible_coarse_regions.append(r)
                else:
                    gpu_only_coarse_regions.append(r)

        # check we have 1+ coarse gpu region(s) of each type
        self.assertGreater(len(gpu_only_coarse_regions), 0)
        self.assertGreater(len(gpu_host_accessible_coarse_regions), 0)

        # Compilation phase:

        # FIXME: this is dubious, assume launching agent is indexed first
        agent = roc.components[0]

        prog = Program()
        prog.add_module(brig_module)

        # get kernel and load
        code = prog.finalize(agent.isa)

        ex = Executable()
        ex.load(agent, code)
        ex.freeze()

        # extract symbols
        sym = ex.get_symbol(agent, "&__vector_copy_kernel")
        self.assertNotEqual(sym.kernel_object, 0)
        self.assertGreater(sym.kernarg_segment_size, 0)

        # attempt kernel excution
        import ctypes
        import numpy as np

        # Do memory allocations

        # allocate and initialise memory
        nelem = 1024 * 1024

        src = np.random.random(nelem).astype(np.float32)
        z0 = np.zeros_like(src)

        # alloc host accessible memory
        nbytes = ctypes.sizeof(ctypes.c_float) * nelem
        gpu_host_accessible_region = gpu_host_accessible_coarse_regions[0]
        host_in_ptr = gpu_host_accessible_region.allocate(nbytes)
        self.assertNotEqual(host_in_ptr.value, None,
                            "pointer must not be NULL")
        host_out_ptr = gpu_host_accessible_region.allocate(nbytes)
        self.assertNotEqual(host_out_ptr.value, None,
                            "pointer must not be NULL")

        # init mem with data
        roc.hsa_memory_copy(host_in_ptr, src.ctypes.data, src.nbytes)
        roc.hsa_memory_copy(host_out_ptr, z0.ctypes.data, z0.nbytes)

        # alloc gpu only memory
        gpu_only_region = gpu_only_coarse_regions[0]
        gpu_in_ptr = gpu_only_region.allocate(nbytes)
        self.assertNotEqual(gpu_in_ptr.value, None, "pointer must not be NULL")
        gpu_out_ptr = gpu_only_region.allocate(nbytes)
        self.assertNotEqual(gpu_out_ptr.value, None,
                            "pointer must not be NULL")

        # copy memory from host accessible location to gpu only
        roc.hsa_memory_copy(gpu_in_ptr, host_in_ptr, src.nbytes)

        # Do kernargs

        # Find a coarse region (for better performance on dGPU) in which
        # to place kernargs. NOTE: This violates the HSA spec
        kernarg_regions = list()
        for r in gpu_host_accessible_coarse_regions:
            # NOTE: VIOLATION
            if r.supports(enums.HSA_REGION_GLOBAL_FLAG_KERNARG):
                kernarg_regions.append(r)
        self.assertGreater(len(kernarg_regions), 0)

        # use first region for args
        kernarg_region = kernarg_regions[0]

        kernarg_ptr = kernarg_region.allocate(2 *
                                              ctypes.sizeof(ctypes.c_void_p))

        self.assertNotEqual(kernarg_ptr, None, "pointer must not be NULL")

        # wire in gpu memory
        argref = (2 * ctypes.c_size_t).from_address(kernarg_ptr.value)
        argref[0] = gpu_in_ptr.value
        argref[1] = gpu_out_ptr.value

        # signal
        sig = roc.create_signal(1)

        # create queue and dispatch job

        queue = agent.create_queue_single(32)
        queue.dispatch(sym,
                       kernarg_ptr,
                       workgroup_size=(256, 1, 1),
                       grid_size=(nelem, 1, 1),
                       signal=None)

        # copy result back to host accessible memory to check
        roc.hsa_memory_copy(host_out_ptr, gpu_out_ptr, src.nbytes)

        # check the data is recovered
        ref = (nelem * ctypes.c_float).from_address(host_out_ptr.value)
        np.testing.assert_equal(ref, src)

        # free
        roc.hsa_memory_free(host_in_ptr)
        roc.hsa_memory_free(host_out_ptr)
        roc.hsa_memory_free(gpu_in_ptr)
        roc.hsa_memory_free(gpu_out_ptr)