Beispiel #1
0
    def test_matmul_fast(self):
        # Tiled matrix multiply on the HSA device: A and B are consumed one
        # (tile x tile) block at a time through hsa.shared arrays, and the
        # device result is compared against np.dot.
        tile = 20
        ntiles = 20

        @hsa.jit
        def matmulfast(A, B, C):
            # Global ids select the element of C this work-item produces;
            # local ids select its slot inside the staged tiles.
            row = hsa.get_global_id(0)
            col = hsa.get_global_id(1)
            lrow = hsa.get_local_id(0)
            lcol = hsa.get_local_id(1)

            tile_a = hsa.shared.array(shape=(tile, tile), dtype=float32)
            tile_b = hsa.shared.array(shape=(tile, tile), dtype=float32)

            # Work-items that fall outside C contribute nothing.
            if row >= C.shape[0] or col >= C.shape[1]:
                return

            acc = 0

            for step in range(ntiles):
                offset = step * tile
                # Stage one element of the current A tile and B tile each.
                tile_a[lrow, lcol] = A[row, lcol + offset]
                tile_b[lrow, lcol] = B[lrow + offset, col]
                # Every work-item must finish staging before anyone reads.
                # NOTE(review): the tiles are hsa.shared arrays but the
                # barrier passes the global fence flag -- confirm that this
                # is the intended fence.
                hsa.barrier(hsa.CLK_GLOBAL_MEM_FENCE)
                # Partial dot product over the staged tiles.
                for k in range(tile):
                    acc += tile_a[lrow, k] * tile_b[k, lcol]
                # Do not overwrite the tiles while others still read them.
                hsa.barrier(hsa.CLK_GLOBAL_MEM_FENCE)

            C[row, col] = acc

        size = ntiles * tile
        A = np.random.random((size, size)).astype(np.float32)
        B = np.random.random((size, size)).astype(np.float32)
        C = np.zeros_like(A)

        grid = (ntiles, ntiles)
        block = (tile, tile)

        # Launch twice and report both timings separately.
        for label in ("1st GPU time:", "2nd GPU time:"):
            with hsa.register(A, B, C):
                start = timer()
                matmulfast[grid, block](A, B, C)
                stop = timer()
                print(label, stop - start)

        start = timer()
        expected = np.dot(A, B)
        stop = timer()
        print("CPU time:", stop - start)
        np.testing.assert_allclose(expected, C, rtol=1e-5)
Beispiel #2
0
    def test_matmul_fast(self):
        # Blocked matrix multiply on the HSA device. Each pass of the outer
        # kernel loop stages one tile of A and one tile of B in hsa.shared
        # arrays; the final C is validated against np.dot.
        edge = 20
        passes = 20

        @hsa.jit
        def matmulfast(A, B, C):
            gx = hsa.get_global_id(0)
            gy = hsa.get_global_id(1)

            lx = hsa.get_local_id(0)
            ly = hsa.get_local_id(1)

            stage_a = hsa.shared.array(shape=(edge, edge), dtype=float32)
            stage_b = hsa.shared.array(shape=(edge, edge), dtype=float32)

            # Nothing to compute for work-items that lie outside C.
            if not (gx < C.shape[0] and gy < C.shape[1]):
                return

            total = 0

            for p in range(passes):
                base = p * edge
                # Each work-item copies one element of A and one of B.
                stage_a[lx, ly] = A[gx, ly + base]
                stage_b[lx, ly] = B[lx + base, gy]
                # Wait until the whole tile has been staged.
                # NOTE(review): the literal 1 is presumably a memory-fence
                # flag -- confirm its meaning against the hsa API.
                hsa.barrier(1)
                # Accumulate this tile's share of the dot product.
                for q in range(edge):
                    total += stage_a[lx, q] * stage_b[q, ly]
                # Wait until everyone has read the tile before refilling it.
                hsa.barrier(1)

            C[gx, gy] = total

        dim = passes * edge
        A = np.random.random((dim, dim)).astype(np.float32)
        B = np.random.random((dim, dim)).astype(np.float32)
        C = np.zeros_like(A)

        launch_grid = (passes, passes)
        launch_block = (edge, edge)

        # Two identical launches, each timed and reported on its own line.
        for caption in ("1st GPU time:", "2nd GPU time:"):
            with hsa.register(A, B, C):
                t0 = timer()
                matmulfast[launch_grid, launch_block](A, B, C)
                t1 = timer()
                print(caption, t1 - t0)

        t0 = timer()
        reference = np.dot(A, B)
        t1 = timer()
        print("CPU time:", t1 - t0)
        np.testing.assert_allclose(reference, C, rtol=1e-5)
 def launcher(support, samples, bandwidth, pdf):
     """Launch ``hsa_uni_kde`` with one block per element of ``support``.

     ``pdf``, ``support`` and ``samples`` must be 1-D and ``pdf`` must have
     as many elements as ``support``. Those three arrays are registered with
     ``hsa`` for the duration of the launch; ``bandwidth`` is passed through
     to the kernel as given.
     """
     for arr in (pdf, support, samples):
         assert arr.ndim == 1
     assert support.size == pdf.size

     n_threads = WAVESIZE * 8
     n_blocks = support.size
     with hsa.register(support, samples, pdf):
         hsa_uni_kde[n_blocks, n_threads](support, samples, bandwidth, pdf)
 def launcher(support, samples, bandwidth, pdf):
     """Validate the 1-D inputs and run ``hsa_uni_kde`` on them.

     The launch uses ``support.size`` blocks of ``WAVESIZE * 8`` threads.
     ``support``, ``samples`` and ``pdf`` are registered with ``hsa`` while
     the kernel runs; ``bandwidth`` is forwarded unchanged.
     """
     # Checked in the same order as the arguments are validated one by one.
     assert all(a.ndim == 1 for a in (pdf, support, samples))
     assert support.size == pdf.size

     group_size = WAVESIZE * 8
     group_count = support.size
     with hsa.register(support, samples, pdf):
         kernel = hsa_uni_kde[group_count, group_size]
         kernel(support, samples, bandwidth, pdf)
    def launcher(support, samples, bandwidths, pdf):
        """Launch ``hsa_multi_kde`` with enough blocks to cover ``support``.

        Requires ``support.shape[0] == pdf.size`` and that the second axis
        of ``support`` matches both the second axis of ``samples`` and
        ``bandwidths.size``. All four arrays are registered with ``hsa``
        for the duration of the launch.
        """
        n_points = support.shape[0]
        assert n_points == pdf.size
        n_dims = support.shape[1]
        assert n_dims == samples.shape[1]
        assert bandwidths.size == n_dims

        per_block = WAVESIZE * 4
        # Round up so a partial last block is still launched.
        n_blocks = (n_points + per_block - 1) // per_block

        with hsa.register(support, samples, bandwidths, pdf):
            hsa_multi_kde[n_blocks, per_block](support, samples, bandwidths, pdf)
    def launcher(support, samples, bandwidths, pdf):
        """Check the argument shapes, then run ``hsa_multi_kde``.

        ``pdf`` needs one element per row of ``support``; ``samples`` and
        ``bandwidths`` must agree with the second axis of ``support``. The
        launch uses ``WAVESIZE * 4`` threads per block and the smallest
        block count whose total thread count is at least
        ``support.shape[0]``.
        """
        rows = support.shape[0]
        assert rows == pdf.size
        cols = support.shape[1]
        assert cols == samples.shape[1]
        assert bandwidths.size == cols

        width = WAVESIZE * 4
        count = (rows + width - 1) // width  # ceiling of rows / width

        with hsa.register(support, samples, bandwidths, pdf):
            configured = hsa_multi_kde[count, width]
            configured(support, samples, bandwidths, pdf)
Beispiel #7
0
    def test_matmul_naive(self):
        # Straightforward matrix multiply on the HSA device: each work-item
        # computes one element of C as a full dot product. The result is
        # checked against np.dot.
        @hsa.jit
        def matmul(A, B, C):
            row = hsa.get_global_id(0)
            col = hsa.get_global_id(1)

            # Only work-items that map onto an element of C do any work.
            if row < C.shape[0] and col < C.shape[1]:
                acc = 0
                for k in range(A.shape[1]):
                    acc += A[row, k] * B[k, col]
                C[row, col] = acc

        size = 256
        A = np.random.random((size, size)).astype(np.float32)
        B = np.random.random((size, size)).astype(np.float32)
        C = np.zeros_like(A)

        grid = (size // 16, size // 16)
        block = (16, 16)

        # Launch twice and report both timings separately.
        for label in ("1st GPU time:", "2nd GPU time:"):
            with hsa.register(A, B, C):
                start = timer()
                matmul[grid, block](A, B, C)
                stop = timer()
                print(label, stop - start)

        start = timer()
        expected = np.dot(A, B)
        stop = timer()
        print("CPU time:", stop - start)
        np.testing.assert_allclose(expected, C, rtol=1e-5)
Beispiel #8
0
    def test_matmul_naive(self):
        # Untiled matrix multiply on the HSA device, one output element per
        # work-item, validated against np.dot on the host.
        @hsa.jit
        def matmul(A, B, C):
            r = hsa.get_global_id(0)
            c = hsa.get_global_id(1)

            # Skip work-items that do not correspond to an element of C.
            if not (r < C.shape[0] and c < C.shape[1]):
                return

            total = 0
            for inner in range(A.shape[1]):
                total += A[r, inner] * B[inner, c]

            C[r, c] = total

        dim = 256
        A = np.random.random((dim, dim)).astype(np.float32)
        B = np.random.random((dim, dim)).astype(np.float32)
        C = np.zeros_like(A)

        launch_grid = (dim // 16, dim // 16)
        launch_block = (16, 16)

        # Two identical launches, each timed and reported on its own line.
        for caption in ("1st GPU time:", "2nd GPU time:"):
            with hsa.register(A, B, C):
                t0 = timer()
                matmul[launch_grid, launch_block](A, B, C)
                t1 = timer()
                print(caption, t1 - t0)

        t0 = timer()
        reference = np.dot(A, B)
        t1 = timer()
        print("CPU time:", t1 - t0)
        np.testing.assert_allclose(reference, C, rtol=1e-5)