Example #1
    def __call__(self, *args):
        self._sentry_resource_limit()

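        # bind() presumably returns the context, the loaded kernel symbol, a
        # pointer to the kernarg buffer and the region it was allocated from
        # (the buffer is freed back to that region at the end of the call)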
        ctx, symbol, kernargs, kernarg_region = self.bind()

        # Unpack pyobject values into ctypes scalar values
        expanded_values = []

        # contains lambdas to execute on return
        retr = []
        for ty, val in zip(self.argument_types, args):
            _unpack_argument(ty, val, expanded_values, retr)

        # Insert kernel arguments
        base = 0
        for av in expanded_values:
            # Adjust for alignment
            align = ctypes.sizeof(av)
            pad = _calc_padding_for_alignment(align, base)
            base += pad
            # Move to offset
            offseted = kernargs.value + base
            asptr = ctypes.cast(offseted, ctypes.POINTER(type(av)))
            # Assign value
            asptr[0] = av
            # Increment offset
            base += align

        # Actual Kernel launch
        qq = ctx.default_queue

        if self.stream is None:
            hsa.implicit_sync()

        # Dispatch: when running on a stream, chain behind the stream's last
        # signal via a barrier packet and attach a fresh completion signal
        signal = None
        if self.stream is not None:
            signal = hsa.create_signal(1)
            qq.insert_barrier(self.stream._get_last_signal())

        qq.dispatch(symbol,
                    kernargs,
                    workgroup_size=self.local_size,
                    grid_size=self.global_size,
                    signal=signal)

        if self.stream is not None:
            self.stream._add_signal(signal)

        # retrieve auto-converted arrays
        for wb in retr:
            wb()

        # Free kernel region
        if kernargs is not None:
            if self.stream is None:
                kernarg_region.free(kernargs)
            else:
                self.stream._add_callback(
                    lambda: kernarg_region.free(kernargs))
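
The packing loop in the example above rounds each write offset up to the argument's ctypes size via `_calc_padding_for_alignment`. A minimal sketch of what that helper is assumed to compute (illustrative only, not necessarily the exact numba.roc implementation):

    def _calc_padding_for_alignment(align, base):
        """Return the padding bytes that round `base` up to a multiple of `align`."""
        rmdr = base % align
        return 0 if rmdr == 0 else align - rmdr

    # e.g. _calc_padding_for_alignment(8, 12) == 4, so the next argument lands at offset 16
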
Example #2
    def test_mempool_amd_example(self):
        dGPU_agent = self.gpu
        gpu_ctx = Context(dGPU_agent)
        CPU_agent = self.cpu
        cpu_ctx = Context(CPU_agent)

        kNumInt = 1024
        kSize = kNumInt * ctypes.sizeof(ctypes.c_int)

        dependent_signal = roc.create_signal(0)
        completion_signal = roc.create_signal(0)

        ## allocate host src and dst, allow gpu access
        flags = dict(allow_access_to=[gpu_ctx.agent], finegrain=False)
        host_src = cpu_ctx.mempoolalloc(kSize, **flags)
        host_dst = cpu_ctx.mempoolalloc(kSize, **flags)

        # the AMD example this is based on loops in `i` over the available
        # GPUs; here a single GPU is exercised
        i = 0

        # get gpu local pool
        local_memory = gpu_ctx.mempoolalloc(kSize)

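        # ctypes array views over the host buffers so they can be filled
        # and checked with numpy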
        host_src_view = (kNumInt * ctypes.c_int).from_address(host_src.device_pointer.value)
        host_dst_view = (kNumInt * ctypes.c_int).from_address(host_dst.device_pointer.value)

        host_src_view[:] = i + 2016 + np.arange(0, kNumInt, dtype=np.int32)
        host_dst_view[:] = np.zeros(kNumInt, dtype=np.int32)

        # print("GPU: %s"%gpu_ctx._agent.name)
        # print("CPU: %s"%cpu_ctx._agent.name)

        roc.hsa_signal_store_relaxed(completion_signal, 1)

        q = queue.Queue()

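        # validatorThread blocks until completion_signal drops back to 0,
        # i.e. the GPU -> host copy has finished, then hands the observed
        # signal value to the main thread via the queue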
        class validatorThread(threading.Thread):
            def run(self):
                val = roc.hsa_signal_wait_acquire(
                    completion_signal,
                    enums.HSA_SIGNAL_CONDITION_EQ,
                    0,
                    ctypes.c_uint64(-1),
                    enums.HSA_WAIT_STATE_ACTIVE)

                q.put(val)  # wait_res

        # this could instead be a call on the signal itself, e.g.
        # dependent_signal.store_relaxed(1)
        roc.hsa_signal_store_relaxed(dependent_signal, 1)

        h2l_start = threading.Semaphore(value=0)

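        # l2hThread enqueues the GPU -> host copy; it waits on
        # dependent_signal (satisfied by the host -> GPU copy below) and
        # decrements completion_signal when the copy finishes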
        class l2hThread(threading.Thread):
            def run(self):
                dep_signal = drvapi.hsa_signal_t(dependent_signal._id)
                roc.hsa_amd_memory_async_copy(host_dst.device_pointer.value,
                                        cpu_ctx._agent._id,
                                        local_memory.device_pointer.value,
                                        gpu_ctx._agent._id, kSize, 1,
                                        ctypes.byref(dep_signal),
                                        completion_signal)
                h2l_start.release()  # signal h2l to start

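        # h2lThread enqueues the host -> GPU copy; its completion satisfies
        # dependent_signal, which in turn releases the l2h copy above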
        class h2lThread(threading.Thread):
            def run(self):
                h2l_start.acquire()  # wait until the l2h thread has enqueued its copy
                roc.hsa_amd_memory_async_copy(local_memory.device_pointer.value,
                                            gpu_ctx._agent._id,
                                            host_src.device_pointer.value,
                                            cpu_ctx._agent._id, kSize, 0,
                                            None,
                                            dependent_signal)

        timeout = 10  # 10 seconds timeout
        # init thread instances
        validator = validatorThread()
        l2h = l2hThread()
        h2l = h2lThread()
        # run them
        validator.start()
        l2h.start()
        h2l.start()
        # join
        l2h.join(timeout)
        h2l.join(timeout)
        validator.join(timeout)
        # verify
        wait_res = q.get()
        self.assertEqual(wait_res, 0)
        np.testing.assert_allclose(host_dst_view, host_src_view)
Example #3
    def test_coarse_grained_kernel_execution(self):
        """
        This tests the execution of a kernel on a dGPU using coarse memory
        regions for the buffers.
        NOTE: the code violates the HSA spec in that it uses a coarse region
        for kernargs; this is a performance hack.
        """

        from numba.roc.hsadrv.driver import BrigModule, Program, hsa,\
                Executable

        # get a brig file
        brig_file = get_brig_file()
        brig_module = BrigModule.from_file(brig_file)
        self.assertGreater(len(brig_module), 0)

        # use existing GPU regions for computation space
        gpu_regions = self.gpu.regions
        gpu_only_coarse_regions = list()
        gpu_host_accessible_coarse_regions = list()
        for r in gpu_regions:
            if r.supports(enums.HSA_REGION_GLOBAL_FLAG_COARSE_GRAINED):
                if r.host_accessible:
                    gpu_host_accessible_coarse_regions.append(r)
                else:
                    gpu_only_coarse_regions.append(r)

        # check we have 1+ coarse gpu region(s) of each type
        self.assertGreater(len(gpu_only_coarse_regions), 0)
        self.assertGreater(len(gpu_host_accessible_coarse_regions), 0)

        # Compilation phase:

        # FIXME: this is dubious; it assumes the launching agent is indexed first
        agent = roc.components[0]

        prog = Program()
        prog.add_module(brig_module)

        # get kernel and load
        code = prog.finalize(agent.isa)

        ex = Executable()
        ex.load(agent, code)
        ex.freeze()

        # extract symbols
        sym = ex.get_symbol(agent, "&__vector_copy_kernel")
        self.assertNotEqual(sym.kernel_object, 0)
        self.assertGreater(sym.kernarg_segment_size, 0)

        # attempt kernel execution
        import ctypes
        import numpy as np

        # Do memory allocations

        # allocate and initialise memory
        nelem = 1024 * 1024

        src = np.random.random(nelem).astype(np.float32)
        z0 = np.zeros_like(src)

        # alloc host accessible memory
        nbytes = ctypes.sizeof(ctypes.c_float) * nelem
        gpu_host_accessible_region = gpu_host_accessible_coarse_regions[0]
        host_in_ptr = gpu_host_accessible_region.allocate(nbytes)
        self.assertNotEqual(host_in_ptr.value, None,
                "pointer must not be NULL")
        host_out_ptr = gpu_host_accessible_region.allocate(nbytes)
        self.assertNotEqual(host_out_ptr.value, None,
                "pointer must not be NULL")

        # init mem with data
        roc.hsa_memory_copy(host_in_ptr, src.ctypes.data, src.nbytes)
        roc.hsa_memory_copy(host_out_ptr, z0.ctypes.data, z0.nbytes)

        # alloc gpu only memory
        gpu_only_region = gpu_only_coarse_regions[0]
        gpu_in_ptr = gpu_only_region.allocate(nbytes)
        self.assertNotEqual(gpu_in_ptr.value, None, "pointer must not be NULL")
        gpu_out_ptr = gpu_only_region.allocate(nbytes)
        self.assertNotEqual(gpu_out_ptr.value, None,
            "pointer must not be NULL")

        # copy memory from host accessible location to gpu only
        roc.hsa_memory_copy(gpu_in_ptr, host_in_ptr, src.nbytes)

        # Do kernargs

        # Find a coarse region (for better performance on dGPU) in which
        # to place kernargs. NOTE: This violates the HSA spec
        kernarg_regions = list()
        for r in gpu_host_accessible_coarse_regions:
            # NOTE: VIOLATION
            if r.supports(enums.HSA_REGION_GLOBAL_FLAG_KERNARG):
                kernarg_regions.append(r)
        self.assertGreater(len(kernarg_regions), 0)

        # use first region for args
        kernarg_region = kernarg_regions[0]

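        # the vector-copy kernel takes two pointer arguments (input and
        # output buffers), hence room for two void pointers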
        kernarg_ptr = kernarg_region.allocate(
                2 * ctypes.sizeof(ctypes.c_void_p))

        self.assertNotEqual(kernarg_ptr, None, "pointer must not be NULL")

        # wire in gpu memory
        argref = (2 * ctypes.c_size_t).from_address(kernarg_ptr.value)
        argref[0] = gpu_in_ptr.value
        argref[1] = gpu_out_ptr.value

        # signal (created here but not passed to the dispatch below)
        sig = roc.create_signal(1)

        # create queue and dispatch job

        queue = agent.create_queue_single(32)
        queue.dispatch(sym, kernarg_ptr, workgroup_size=(256, 1, 1),
                       grid_size=(nelem, 1, 1), signal=None)

        # copy result back to host accessible memory to check
        roc.hsa_memory_copy(host_out_ptr, gpu_out_ptr, src.nbytes)

        # check the data is recovered
        ref = (nelem * ctypes.c_float).from_address(host_out_ptr.value)
        np.testing.assert_equal(ref, src)

        # free
        roc.hsa_memory_free(host_in_ptr)
        roc.hsa_memory_free(host_out_ptr)
        roc.hsa_memory_free(gpu_in_ptr)
        roc.hsa_memory_free(gpu_out_ptr)