Example #1
    def test_mempool(self):
        n = 10 # things to alloc
        nbytes = ctypes.sizeof(ctypes.c_double) * n

        dGPU_agent = self.gpu
        CPU_agent = self.cpu

        # allocate a GPU memory pool
        gpu_ctx = Context(dGPU_agent)
        gpu_only_mem = gpu_ctx.mempoolalloc(nbytes)

        # allocate a CPU memory pool, allow the GPU access to it
        cpu_ctx = Context(CPU_agent)
        cpu_mem = cpu_ctx.mempoolalloc(nbytes, allow_access_to=[gpu_ctx.agent])

        ## Test writing to allocated area
        src = np.random.random(n).astype(np.float64)
        roc.hsa_memory_copy(cpu_mem.device_pointer, src.ctypes.data, src.nbytes)
        roc.hsa_memory_copy(gpu_only_mem.device_pointer, cpu_mem.device_pointer, src.nbytes)


        # clear
        z0 = np.zeros_like(src)
        roc.hsa_memory_copy(cpu_mem.device_pointer, z0.ctypes.data, z0.nbytes)
        ref = (n * ctypes.c_double).from_address(cpu_mem.device_pointer.value)
        for k in range(n):
            self.assertEqual(ref[k], 0)

        # copy back from dGPU
        roc.hsa_memory_copy(cpu_mem.device_pointer, gpu_only_mem.device_pointer, src.nbytes)
        for k in range(n):
            self.assertEqual(ref[k], src[k])
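
The verification idiom above, viewing the freshly copied buffer through a `ctypes` array built with `from_address` and comparing it element-wise against the NumPy source, can be tried without any HSA hardware. A minimal, hardware-free sketch in which an ordinary NumPy buffer and `ctypes.memmove` stand in for the memory pool and `hsa_memory_copy` (an illustration only, not the driver API):

    import ctypes
    import numpy as np

    n = 10
    src = np.random.random(n).astype(np.float64)

    # an ordinary NumPy buffer stands in for the host-accessible pool
    pool = np.zeros(n, dtype=np.float64)
    pool_ptr = pool.ctypes.data  # raw address, like cpu_mem.device_pointer.value

    # memmove plays the role of hsa_memory_copy here
    ctypes.memmove(pool_ptr, src.ctypes.data, src.nbytes)

    # view the buffer through a ctypes array, exactly as the test does
    ref = (n * ctypes.c_double).from_address(pool_ptr)
    assert all(ref[k] == src[k] for k in range(n))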
Example #2
    def check_mempool_with_flags(self, finegrain):
        dGPU_agent = self.gpu
        gpu_ctx = Context(dGPU_agent)

        CPU_agent = self.cpu
        cpu_ctx = Context(CPU_agent)

        # get a mempool allocation with the requested flags
        cpu_ctx.mempoolalloc(1024, allow_access_to=[gpu_ctx._agent],
                             finegrain=finegrain)
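
Presumably this checker is driven by a pair of small test methods covering both grain settings; a sketch of what such drivers could look like (the method names below are illustrative assumptions, not taken from the original suite):

    # hypothetical driver methods; the names are illustrative only
    def test_mempool_coarsegrained(self):
        self.check_mempool_with_flags(finegrain=False)

    def test_mempool_finegrained(self):
        self.check_mempool_with_flags(finegrain=True)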
Example #4
    def test_memalloc(self):
        """
            Tests Context.memalloc() for a given, in the parlance of HSA,\
            `component`. Testing includes specialisations for the supported
            components of dGPUs and APUs.
        """
        n = 10 # things to alloc
        nbytes = ctypes.sizeof(ctypes.c_double) * n

        # run if a dGPU is present
        if dgpu_present:
            # find a host accessible region
            dGPU_agent = self.gpu
            CPU_agent = self.cpu
            gpu_ctx = Context(dGPU_agent)
            gpu_only_mem = gpu_ctx.memalloc(nbytes, hostAccessible=False)
            ha_mem = gpu_ctx.memalloc(nbytes, hostAccessible=True)

            # on dGPU systems, all host mem is host accessible
            cpu_ctx = Context(CPU_agent)
            cpu_mem = cpu_ctx.memalloc(nbytes, hostAccessible=True)

            # Test writing to allocated area
            src = np.random.random(n).astype(np.float64)
            roc.hsa_memory_copy(cpu_mem.device_pointer, src.ctypes.data, src.nbytes)
            roc.hsa_memory_copy(ha_mem.device_pointer, cpu_mem.device_pointer, src.nbytes)
            roc.hsa_memory_copy(gpu_only_mem.device_pointer, ha_mem.device_pointer, src.nbytes)

            # clear
            z0 = np.zeros_like(src)
            roc.hsa_memory_copy(ha_mem.device_pointer, z0.ctypes.data, z0.nbytes)
            ref = (n * ctypes.c_double).from_address(ha_mem.device_pointer.value)
            for k in range(n):
                self.assertEqual(ref[k], 0)

            # copy back from dGPU
            roc.hsa_memory_copy(ha_mem.device_pointer, gpu_only_mem.device_pointer, src.nbytes)
            for k in range(n):
                self.assertEqual(ref[k], src[k])

        else:  # TODO: write APU variant
            pass
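
The in-body `if dgpu_present:` guard makes the test pass silently on machines without a discrete GPU. An alternative, sketched below under the assumption that the surrounding class is a standard `unittest.TestCase` with `unittest` imported at module level, is a skip decorator, which reports the test as skipped instead:

    # sketch: skip rather than silently pass when no dGPU is present
    @unittest.skipUnless(dgpu_present, "test requires a discrete GPU (dGPU)")
    def test_memalloc(self):
        ...  # dGPU body as above, without the `if dgpu_present:` branch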
Example #5
    def test_mempool(self):
        n = 10  # things to alloc
        nbytes = ctypes.sizeof(ctypes.c_double) * n

        dGPU_agent = self.gpu
        CPU_agent = self.cpu

        # allocate a GPU memory pool
        gpu_ctx = Context(dGPU_agent)
        gpu_only_mem = gpu_ctx.mempoolalloc(nbytes)

        # allocate a CPU memory pool, allow the GPU access to it
        cpu_ctx = Context(CPU_agent)
        cpu_mem = cpu_ctx.mempoolalloc(nbytes, allow_access_to=[gpu_ctx.agent])

        ## Test writing to allocated area
        src = np.random.random(n).astype(np.float64)
        roc.hsa_memory_copy(cpu_mem.device_pointer, src.ctypes.data,
                            src.nbytes)
        roc.hsa_memory_copy(gpu_only_mem.device_pointer,
                            cpu_mem.device_pointer, src.nbytes)

        # clear
        z0 = np.zeros_like(src)
        roc.hsa_memory_copy(cpu_mem.device_pointer, z0.ctypes.data, z0.nbytes)
        ref = (n * ctypes.c_double).from_address(cpu_mem.device_pointer.value)
        for k in range(n):
            self.assertEqual(ref[k], 0)

        # copy back from dGPU
        roc.hsa_memory_copy(cpu_mem.device_pointer,
                            gpu_only_mem.device_pointer, src.nbytes)
        for k in range(n):
            self.assertEqual(ref[k], src[k])
Example #6
    def test_memalloc(self):
        """
            Tests Context.memalloc() for a given, in the parlance of HSA,\
            `component`. Testing includes specialisations for the supported
            components of dGPUs and APUs.
        """
        n = 10  # things to alloc
        nbytes = ctypes.sizeof(ctypes.c_double) * n

        # run if a dGPU is present
        if dgpu_present:
            # find a host accessible region
            dGPU_agent = self.gpu
            CPU_agent = self.cpu
            gpu_ctx = Context(dGPU_agent)
            gpu_only_mem = gpu_ctx.memalloc(nbytes, hostAccessible=False)
            ha_mem = gpu_ctx.memalloc(nbytes, hostAccessible=True)

            # on dGPU systems, all host mem is host accessible
            cpu_ctx = Context(CPU_agent)
            cpu_mem = cpu_ctx.memalloc(nbytes, hostAccessible=True)

            # Test writing to allocated area
            src = np.random.random(n).astype(np.float64)
            roc.hsa_memory_copy(cpu_mem.device_pointer, src.ctypes.data,
                                src.nbytes)
            roc.hsa_memory_copy(ha_mem.device_pointer, cpu_mem.device_pointer,
                                src.nbytes)
            roc.hsa_memory_copy(gpu_only_mem.device_pointer,
                                ha_mem.device_pointer, src.nbytes)

            # clear
            z0 = np.zeros_like(src)
            roc.hsa_memory_copy(ha_mem.device_pointer, z0.ctypes.data,
                                z0.nbytes)
            ref = (n * ctypes.c_double).from_address(
                ha_mem.device_pointer.value)
            for k in range(n):
                self.assertEqual(ref[k], 0)

            # copy back from dGPU
            roc.hsa_memory_copy(ha_mem.device_pointer,
                                gpu_only_mem.device_pointer, src.nbytes)
            for k in range(n):
                self.assertEqual(ref[k], src[k])

        else:  # TODO: write APU variant
            pass
Example #7
    def test_mempool_amd_example(self):
        dGPU_agent = self.gpu
        gpu_ctx = Context(dGPU_agent)
        CPU_agent = self.cpu
        cpu_ctx = Context(CPU_agent)

        kNumInt = 1024
        kSize = kNumInt * ctypes.sizeof(ctypes.c_int)

        dependent_signal = roc.create_signal(0)
        completion_signal = roc.create_signal(0)

        ## allocate host src and dst, allow gpu access
        flags = dict(allow_access_to=[gpu_ctx.agent], finegrain=False)
        host_src = cpu_ctx.mempoolalloc(kSize, **flags)
        host_dst = cpu_ctx.mempoolalloc(kSize, **flags)

        # the original AMD example loops `i` over all GPU devices; a single
        # GPU is used here
        i = 0

        # get gpu local pool
        local_memory = gpu_ctx.mempoolalloc(kSize)

        host_src_view = (kNumInt * ctypes.c_int).from_address(host_src.device_pointer.value)
        host_dst_view = (kNumInt * ctypes.c_int).from_address(host_dst.device_pointer.value)

        host_src_view[:] = i + 2016 + np.arange(0, kNumInt, dtype=np.int32)
        host_dst_view[:] = np.zeros(kNumInt, dtype=np.int32)

        # print("GPU: %s"%gpu_ctx._agent.name)
        # print("CPU: %s"%cpu_ctx._agent.name)

        roc.hsa_signal_store_relaxed(completion_signal, 1)

        q = queue.Queue()

        class validatorThread(threading.Thread):
            def run(self):
                val = roc.hsa_signal_wait_acquire(
                    completion_signal,
                    enums.HSA_SIGNAL_CONDITION_EQ,
                    0,
                    ctypes.c_uint64(-1),
                    enums.HSA_WAIT_STATE_ACTIVE)

                q.put(val)  # wait_res

        # this could instead be a method call on the signal itself:
        # dependent_signal.store_relaxed(1)
        roc.hsa_signal_store_relaxed(dependent_signal, 1)

        h2l_start = threading.Semaphore(value=0)

        class l2hThread(threading.Thread):
            def run(self):
                dep_signal = drvapi.hsa_signal_t(dependent_signal._id)
                roc.hsa_amd_memory_async_copy(host_dst.device_pointer.value,
                                        cpu_ctx._agent._id,
                                        local_memory.device_pointer.value,
                                        gpu_ctx._agent._id, kSize, 1,
                                        ctypes.byref(dep_signal),
                                        completion_signal)
                h2l_start.release()  # signal h2l to start

        class h2lThread(threading.Thread):
            def run(self):
                h2l_start.acquire()  # to wait until l2h thread has started
                roc.hsa_amd_memory_async_copy(local_memory.device_pointer.value,
                                            gpu_ctx._agent._id,
                                            host_src.device_pointer.value,
                                            cpu_ctx._agent._id, kSize, 0,
                                            None,
                                            dependent_signal)

        timeout = 10  # 10 seconds timeout
        # init thread instances
        validator = validatorThread()
        l2h = l2hThread()
        h2l = h2lThread()
        # run them
        validator.start()
        l2h.start()
        h2l.start()
        # join
        l2h.join(timeout)
        h2l.join(timeout)
        validator.join(timeout)
        # verify
        wait_res = q.get()
        self.assertEqual(wait_res, 0)
        np.testing.assert_allclose(host_dst_view, host_src_view)
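
The three threads form a dependency chain: h2lThread copies host_src into GPU-local memory and its async copy decrements dependent_signal on completion; l2hThread lists that signal as a dependency, copies the data back into host_dst and decrements completion_signal; validatorThread waits for completion_signal to reach zero before the result is checked. A hardware-free sketch of the same ordering, with `threading.Event` standing in for the HSA signals (purely illustrative, not the driver API):

    import threading

    dependent = threading.Event()    # stands in for dependent_signal
    completed = threading.Event()    # stands in for completion_signal
    buf = {"local": None, "dst": None}
    src = list(range(1024))

    def h2l():                       # host -> local, then release the dependency
        buf["local"] = list(src)
        dependent.set()

    def l2h():                       # wait on the dependency, local -> host, complete
        dependent.wait()
        buf["dst"] = list(buf["local"])
        completed.set()

    def validator():                 # blocks until the completion "signal" fires
        completed.wait()

    threads = [threading.Thread(target=f) for f in (validator, l2h, h2l)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    assert buf["dst"] == src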
Example #8
    def test_mempool_amd_example(self):
        dGPU_agent = self.gpu
        gpu_ctx = Context(dGPU_agent)
        CPU_agent = self.cpu
        cpu_ctx = Context(CPU_agent)

        kNumInt = 1024
        kSize = kNumInt * ctypes.sizeof(ctypes.c_int)

        dependent_signal = roc.create_signal(0)
        completion_signal = roc.create_signal(0)

        ## allocate host src and dst, allow gpu access
        flags = dict(allow_access_to=[gpu_ctx.agent], finegrain=False)
        host_src = cpu_ctx.mempoolalloc(kSize, **flags)
        host_dst = cpu_ctx.mempoolalloc(kSize, **flags)

        # the original AMD example loops `i` over all GPU devices; a single
        # GPU is used here
        i = 0

        # get gpu local pool
        local_memory = gpu_ctx.mempoolalloc(kSize)

        host_src_view = (kNumInt * ctypes.c_int).from_address(
            host_src.device_pointer.value)
        host_dst_view = (kNumInt * ctypes.c_int).from_address(
            host_dst.device_pointer.value)

        host_src_view[:] = i + 2016 + np.arange(0, kNumInt, dtype=np.int32)
        host_dst_view[:] = np.zeros(kNumInt, dtype=np.int32)

        # print("GPU: %s"%gpu_ctx._agent.name)
        # print("CPU: %s"%cpu_ctx._agent.name)

        roc.hsa_signal_store_relaxed(completion_signal, 1)

        q = queue.Queue()

        class validatorThread(threading.Thread):
            def run(self):
                val = roc.hsa_signal_wait_acquire(
                    completion_signal, enums.HSA_SIGNAL_CONDITION_EQ, 0,
                    ctypes.c_uint64(-1), enums.HSA_WAIT_STATE_ACTIVE)

                q.put(val)  # wait_res

        # this could instead be a method call on the signal itself:
        # dependent_signal.store_relaxed(1)
        roc.hsa_signal_store_relaxed(dependent_signal, 1)

        h2l_start = threading.Semaphore(value=0)

        class l2hThread(threading.Thread):
            def run(self):
                dep_signal = drvapi.hsa_signal_t(dependent_signal._id)
                roc.hsa_amd_memory_async_copy(
                    host_dst.device_pointer.value, cpu_ctx._agent._id,
                    local_memory.device_pointer.value, gpu_ctx._agent._id,
                    kSize, 1, ctypes.byref(dep_signal), completion_signal)
                h2l_start.release()  # signal h2l to start

        class h2lThread(threading.Thread):
            def run(self):
                h2l_start.acquire()  # to wait until l2h thread has started
                roc.hsa_amd_memory_async_copy(
                    local_memory.device_pointer.value, gpu_ctx._agent._id,
                    host_src.device_pointer.value, cpu_ctx._agent._id, kSize,
                    0, None, dependent_signal)

        timeout = 10  # 10 seconds timeout
        # init thread instances
        validator = validatorThread()
        l2h = l2hThread()
        h2l = h2lThread()
        # run them
        validator.start()
        l2h.start()
        h2l.start()
        # join
        l2h.join(timeout)
        h2l.join(timeout)
        validator.join(timeout)
        # verify
        wait_res = q.get()
        self.assertEqual(wait_res, 0)
        np.testing.assert_allclose(host_dst_view, host_src_view)