Ejemplo n.º 1
0
    def test_device_ir(A, B):
        n = A.shape[0]
        ib = tvm.tir.ir_builder.create()

        tx = te.thread_axis("threadIdx.x")
        ib.scope_attr(tx, "thread_extent", n)

        temp = ib.allocate(dtype, (n, ),
                           scope="shared.dyn")  # n is symbolic size

        Aptr = ib.buffer_ptr(A)
        Bptr = ib.buffer_ptr(B)

        temp[tx] = Aptr[tx]
        depth = tvm.tir.log2(cast(n, "float32"))

        with ib.for_range(0, cast(tvm.tir.ceil(depth), n.dtype)) as i:
            ib.emit(
                tvm.tir.Call(None, "tir.tvm_storage_sync",
                             tvm.runtime.convert(["shared"])))
            d = n >> (i + 1)
            with ib.if_scope(tx < d):
                temp[tx] += temp[tx + d]

        Bptr[0] = temp[0]
        return ib.get()
Ejemplo n.º 2
0
    def test_device_ir(A, B, C):
        n = A.shape[0]
        ib = tvm.tir.ir_builder.create()

        values_per_thread = 4
        tx = te.thread_axis("threadIdx.x")
        ib.scope_attr(tx, "thread_extent",
                      tvm.tir.indexdiv(n, values_per_thread))

        A_sh = ib.allocate(A.dtype, (n, ), scope="shared.dyn")  # fp16
        B_sh = ib.allocate(B.dtype, (n, ), scope="shared.dyn")  # fp32

        Aptr = ib.buffer_ptr(A)
        Bptr = ib.buffer_ptr(B)
        Cptr = ib.buffer_ptr(C)

        with ib.for_range(0, values_per_thread, kind="vectorize") as i:
            A_sh[tx * values_per_thread + i] = Aptr[tx * values_per_thread + i]
            B_sh[tx * values_per_thread + i] = Bptr[tx * values_per_thread + i]

        with ib.for_range(0, values_per_thread) as i:
            Cptr[tx * values_per_thread +
                 i] = (cast(A_sh[tx * values_per_thread + i], "float32") +
                       B_sh[tx * values_per_thread + i])

        return ib.get()
Ejemplo n.º 3
0
    def test_device_ir(A, B, C):
        n = A.shape[0]
        ib = tvm.tir.ir_builder.create()

        tx = te.thread_axis("threadIdx.x")
        ib.scope_attr(tx, "thread_extent", n)

        A_sh = ib.allocate(A.dtype, (n, ), scope="shared.dyn")  # i8
        B_sh = ib.allocate(B.dtype, (n, ), scope="shared.dyn")  # i16
        C_sh = ib.allocate(C.dtype, (n, ), scope="shared.dyn")  # i32

        Aptr = ib.buffer_ptr(A)
        Bptr = ib.buffer_ptr(B)
        Cptr = ib.buffer_ptr(C)

        A_sh[tx] = Aptr[tx]
        B_sh[tx] = Bptr[tx]

        C_sh[tx] = cast(A_sh[tx], "int32") + cast(B_sh[tx], "int32")
        Cptr[tx] = C_sh[tx]
        return ib.get()
Ejemplo n.º 4
0
    def do_copy(A, B, n):
        ib = tvm.tir.ir_builder.create()
        A = ib.buffer_ptr(A)
        B = ib.buffer_ptr(B)

        tx = te.thread_axis("threadIdx.x")
        bx = te.thread_axis("blockIdx.x")

        max_threads = 32
        ib.scope_attr(bx, "thread_extent", tvm.tir.indexdiv(n + max_threads - 1, max_threads))
        ib.scope_attr(tx, "thread_extent", max_threads)
        tid = bx * max_threads + tx

        with ib.if_scope(tid < n):
            B[tid] = cast(A[tid], "int32")

        return ib.get()
Ejemplo n.º 5
0
    def test_matmul_ir(A, B, C):
        ib = tvm.tir.ir_builder.create()

        tx = te.thread_axis("threadIdx.x")
        ty = te.thread_axis("threadIdx.y")
        bx = te.thread_axis("blockIdx.x")
        by = te.thread_axis("blockIdx.y")
        ib.scope_attr(tx, "thread_extent", block)
        ib.scope_attr(ty, "thread_extent", block)
        ib.scope_attr(bx, "thread_extent", n // block)
        ib.scope_attr(by, "thread_extent", n // block)

        A_sh = ib.allocate(A.dtype, (block, block),
                           scope="shared.dyn",
                           name="A_sh")  # fp16
        B_sh = ib.allocate(B.dtype, (block, block),
                           scope="shared.dyn",
                           name="B_sh")  # fp16
        # Create a dynamic shared memory for the accumulation.
        # This is for testing merging dynamic shared memory alloctions with different data type.
        # In practice, there is no need to allocate a shared memory for C.
        C_local = ib.allocate(C.dtype, (1, ), scope="local", name="C_local")
        C_sh = ib.allocate(C.dtype, (block, block),
                           scope="shared.dyn",
                           name="C_sh")  # fp32

        A_ptr = ib.buffer_ptr(A)
        B_ptr = ib.buffer_ptr(B)
        C_ptr = ib.buffer_ptr(C)

        C_local[0] = 0.0

        with ib.for_range(0, n // block, name="i") as i:
            A_sh[ty, tx] = A_ptr[by * block + ty, i * block + tx]
            B_sh[ty, tx] = B_ptr[i * block + ty, bx * block + tx]
            ib.emit(syncthread())

            with ib.for_range(0, block, name="k") as k:
                C_local[0] += cast(A_sh[ty, k] * B_sh[k, tx], "float32")
            ib.emit(syncthread())

        C_sh[ty, tx] = C_local[0]
        C_ptr[by * block + ty, bx * block + tx] = C_sh[ty, tx]

        return ib.get()