def transformed_element_func(a: ty.handle, c: ty.handle) -> None:
    A = tir.match_buffer(a, [16, 16])
    C = tir.match_buffer(c, [16, 16])

    for i_0 in range(0, 16):
        with tir.block([]):
            tir.reads([A[i_0, 0:16]])
            tir.writes([C[i_0, 0:16]])
            B = tir.alloc_buffer([16, 16])
            for j_0 in tir.serial(0, 16):
                with tir.block([16, 16], "") as [i, j]:
                    tir.bind(i, i_0)
                    tir.bind(j, j_0)
                    B[i, j] = A[i, j] + 1.0
            for j_0 in tir.serial(0, 16):
                with tir.block([16, 16], "") as [i, j]:
                    tir.bind(i, i_0)
                    tir.bind(j, j_0)
                    C[i, j] = B[i, j] * 2.0
Ejemplo n.º 2
0
def compacted_storage_align_func(a: ty.handle, c: ty.handle) -> None:
    A = tir.match_buffer(a, (16, 16), "float32")
    C = tir.match_buffer(c, (16, 16), "float32")
    for i in range(0, 16):
        with tir.block([]):
            tir.reads(A[i, 0:16])
            tir.writes(C[i, 0:16])
            B = tir.alloc_buffer((1, 16), strides=(31, 1), dtypes="float32")
            for j in range(0, 16):
                with tir.block() as []:
                    tir.reads(A[i, j])
                    tir.writes(B[0, j])
                    tir.block_attr({"buffer_dim_align": [[0, 0, 16, 15]]})
                    B[0, j] = A[i, j] + 1.0
            for j in range(0, 16):
                with tir.block() as []:
                    tir.reads(B[0, j])
                    tir.writes(C[i, j])
                    C[i, j] = B[0, j] * 2.0
Ejemplo n.º 3
0
def compacted_predicate_func(a: ty.handle, c: ty.handle) -> None:
    A = tir.match_buffer(a, (32), "float32")
    C = tir.match_buffer(c, (32), "float32")

    for i, j in tir.grid(5, 7):
        with tir.block([]) as []:
            tir.reads(A[i * 7 + j])
            tir.writes(C[i * 7 + j])
            tir.where(i * 7 + j < 32)
            C[i * 7 + j] = A[i * 7 + j] + 1.0
def matmul(a: ty.handle, b: ty.handle, c: ty.handle) -> None:
    A = tir.match_buffer(a, [128, 128])
    B = tir.match_buffer(b, [128, 128])
    C = tir.match_buffer(c, [128, 128])

    with tir.block([128, 128, tir.reduce_axis(0, 128)],
                   "update") as [vi, vj, vk]:
        with tir.init():
            C[vi, vj] = tir.float32(0)
        C[vi, vj] = C[vi, vj] + A[vi, vk] * B[vj, vk]
def matmul_not_same_buffer_access(a: ty.handle, b: ty.handle,
                                  c: ty.handle) -> None:
    A = tir.match_buffer(a, (128, 128))
    B = tir.match_buffer(b, (128, 128))
    C = tir.match_buffer(c, (128, 128))

    with tir.block([128, 128, tir.reduce_axis(0, 128)], "C") as [vi, vj, vk]:
        with tir.init():
            C[vi, vj] = 0.0
        C[vj, vi] = C[vj, vi] + A[vi, vk] * B[vk, vj]
    def main(a: ty.handle, b: ty.handle) -> None:
        A = tir.match_buffer(a, [64, 64, 64])
        B = tir.match_buffer(b, [64])

        with tir.block([64,
                        tir.reduce_axis(0, 64),
                        tir.reduce_axis(32, 64)]) as [i, j, k]:
            if (j == 0) and (k == 32):
                B[i] = tir.float32(0)
            B[i] += A[i, j, k]
Ejemplo n.º 7
0
def tir_conv2d(a: ty.handle, w: ty.handle, b: ty.handle) -> None:
    A = tir.match_buffer(a, [16, 16, 14, 14])
    W = tir.match_buffer(w, [16, 3, 3, 32])
    B = tir.match_buffer(b, [16, 32, 14, 14])
    Apad = tir.alloc_buffer([16, 16, 16, 16])

    with tir.block([16, 16, 16, 16], "Apad") as [nn, cc, yy, xx]:
        Apad[nn, cc, yy, xx] = tir.if_then_else(
            yy >= 1 and yy - 1 < 14 and xx >= 1 and xx - 1 < 14,
            A[nn, cc, yy - 1, xx - 1],
            0.0,
            dtype="float32",
        )
    with tir.block(
        [16, 32, 14, 14, tir.reduce_axis(0, 16), tir.reduce_axis(0, 3), tir.reduce_axis(0, 3)], "B"
    ) as [nn, ff, yy, xx, rc, ry, rx]:
        with tir.init():
            B[nn, ff, yy, xx] = 0.0
        B[nn, ff, yy, xx] += Apad[nn, rc, yy + ry, xx + rx] * W[rc, ry, rx, ff]
def block_in_opaque_block(a: ty.handle, b: ty.handle) -> None:
    A = tir.match_buffer(a, (128, 128), "float32")
    B = tir.match_buffer(b, (128, 128), "float32")
    with tir.block([128], "B") as vi:
        tir.reads([A[0:128, 0:128]])
        tir.writes([B[0:128, 0:128]])
        B[vi, 0] = A[vi, 0]
        if A[vi, 0] == 0.0:
            with tir.block([], "C"):
                tir.reads([A[0:128, 0:128]])
                tir.writes([B[0:128, 0:128]])
                with tir.block([128], "D") as vj:
                    B[vi, vj] = A[vi, vj] * 3.0
        else:
            with tir.block([], "E"):
                tir.reads([A[0:128, 0:128]])
                tir.writes([B[0:128, 0:128]])
                with tir.block([128], "F") as vj:
                    B[vi, vj] = A[vi, vj] * 2.0
Ejemplo n.º 9
0
def matmul_m_128(a: ty.handle, b: ty.handle, c: ty.handle) -> None:
    m = tir.var("int32")
    A = tir.match_buffer(a, [m, 128])
    B = tir.match_buffer(b, [m, 128])
    C = tir.match_buffer(c, [m, m])

    with tir.block([m, m, tir.reduce_axis(0, 128)], "update") as [vi, vj, vk]:
        with tir.init():
            C[vi, vj] = 0.0
        C[vi, vj] = C[vi, vj] + A[vi, vk] * B[vj, vk]
def elementwise_not_affine(a: ty.handle, b: ty.handle) -> None:
    A = tir.match_buffer(a, (128, 128, 128, 128))
    B = tir.match_buffer(b, (128, 128, 128, 128))
    for i, j, k, l in tir.grid(128, 128, 128, 8):
        with tir.block([128, 128, 128, 128], "B") as [vi, vj, vk, vl]:
            tir.bind(vi, i)
            tir.bind(vj, j)
            tir.bind(vk, k)
            tir.bind(vl, l * 16)
            B[vi, vj, vk, vl] = A[vi, vj, vk, vl] * 2.0
def elementwise_reordered2(a: ty.handle, b: ty.handle) -> None:
    A = tir.match_buffer(a, (128, 128, 128, 128))
    B = tir.match_buffer(b, (128, 128, 128, 128))
    for k, j, i, l in tir.grid(128, 128, 128, 128):
        with tir.block([128, 128, 128, 128], "B") as [vi, vj, vk, vl]:
            tir.bind(vi, i)
            tir.bind(vj, j)
            tir.bind(vk, k)
            tir.bind(vl, l)
            B[vi, vj, vk, vl] = A[vi, vj, vk, vl] * 2.0
Ejemplo n.º 12
0
def rowsum_not_quasi_affine(a: ty.handle, b: ty.handle) -> None:
    A = tir.match_buffer(a, (128, 128))
    B = tir.match_buffer(b, (128, ))

    for i, k in tir.grid(128, 16):
        with tir.block([128, tir.reduce_axis(0, 128)], "B") as [vi, vk]:
            tir.bind(vi, i)
            tir.bind(vk, tir.floordiv(k * k, 2))
            with tir.init():
                B[vi] = 0.0
            B[vi] = B[vi] + A[vi, vk]
def opaque_access_func() -> None:
    A = tir.alloc_buffer([1024])
    B = tir.alloc_buffer([1024])
    for i in tir.serial(0, 8):
        with tir.block([8]) as [v]:
            tir.bind(v, i)
            tir.reads([A[v * 128 : v * 128 + 128]])
            tir.writes([B[v * 128 : v * 128 + 128]])
            tir.evaluate(
                tir.call_extern("test", B.data, v * 128, 128, A.data, v * 128, 128, dtype="float32")
            )
Ejemplo n.º 14
0
def elementwise_split_case0(a: ty.handle, b: ty.handle) -> None:
    A = tir.match_buffer(a, [128, 128, 128])
    B = tir.match_buffer(b, [128, 128, 128])
    for i1, i2, i3, j1, j2, k1, k2 in tir.grid(2, 1, 64, 4, 32, 16, 8):
        with tir.block([128, 128, 128], "B") as [vi, vj, vk]:
            tir.bind(vi, ((i1 * 64) + i3))
            tir.bind(vj, ((j1 * 32) + j2))
            tir.bind(vk, ((k1 * 8) + k2))
            tir.reads([A[vi, vj, vk]])
            tir.writes([B[vi, vj, vk]])
            B[vi, vj, vk] = A[vi, vj, vk] * 2.0
Ejemplo n.º 15
0
def rowsum_transformed(a: ty.handle, b: ty.handle) -> None:
    A = tir.match_buffer(a, (128, 128))
    B = tir.match_buffer(b, (128, ))

    for io, ii_ko_fused, ki in tir.grid(32, 128, 4):
        with tir.block([128, tir.reduce_axis(0, 128)], "B") as [vi, vk]:
            tir.bind(vi, io * 4 + tir.floordiv(ii_ko_fused, 32))
            tir.bind(vk, tir.floormod(ii_ko_fused, 32) * 4 + ki)
            with tir.init():
                B[vi] = 0.0
            B[vi] = B[vi] + A[vi, vk]
def elementwise_reordered_with_predicate(a: ty.handle, b: ty.handle) -> None:
    A = tir.match_buffer(a, (128, 128, 128, 128))
    B = tir.match_buffer(b, (128, 128, 128, 128))
    for l, j, k, i in tir.grid(128, 128, 128, 128):
        with tir.block([128, 128, 128, 128], "B") as [vi, vj, vk, vl]:
            tir.where(i * 2097152 + j * 16384 + k * 128 + l < 100)
            tir.bind(vi, i)
            tir.bind(vj, j)
            tir.bind(vk, k)
            tir.bind(vl, l)
            B[vi, vj, vk, vl] = A[vi, vj, vk, vl] * 2.0
def elementwise_with_wrong_block_var_type(a: ty.handle, b: ty.handle) -> None:
    A = tir.match_buffer(a, (128, 128, 128))
    B = tir.match_buffer(b, (128, 128, 128))
    for i, j, k in tir.grid(128, 128, 128):
        with tir.block([128, 128, tir.scan_axis(0, 128)], "B") as [vi, vj, vk]:
            tir.bind(vi, i)
            tir.bind(vj, j)
            tir.bind(vk, k)
            tir.reads([A[vi, vj, vk]])
            tir.writes([B[vi, vj, vk]])
            B[vi, vj, vk] = A[vi, vj, vk] * 2.0
Ejemplo n.º 18
0
def elementwise_fused(a: ty.handle, b: ty.handle) -> None:
    A = tir.match_buffer(a, (128, 128, 128))
    B = tir.match_buffer(b, (128, 128, 128))
    for fused in tir.serial(0, 2097152):
        with tir.block([128, 128, 128], "B") as [vi, vj, vk]:
            tir.bind(vi, tir.floordiv(fused, 16384))
            tir.bind(vj, tir.floormod(tir.floordiv(fused, 128), 128))
            tir.bind(vk, tir.floormod(fused, 128))
            tir.reads([A[vi, vj, vk]])
            tir.writes([B[vi, vj, vk]])
            B[vi, vj, vk] = A[vi, vj, vk] * 2.0
def match_buffer_func() -> None:
    with tir.block([], "root"):
        A = tir.alloc_buffer((128, 128), "float32")
        B = tir.alloc_buffer((128, 128), "float32")
        tir.reads([])
        tir.writes([])
        # Need add read/write region manually to avoid triggering block access region detector
        with tir.block([8, 8], "block") as [vi, vj]:
            tir.reads(B[vi * 16 + 2 : vi * 16 + 12, vj * 16 + 2 : vj * 16 + 16])
            tir.writes(A[vi * 16 : vi * 16 + 16, vj * 16 : vj * 16 + 16])
            AA = tir.match_buffer(A[vi * 16 : vi * 16 + 16, vj * 16 : vj * 16 + 16], (16, 16))
            B0 = tir.match_buffer(B[vi * 16 + 2 : vi * 16 + 6, vj * 16 + 2 : vj * 16 + 6], (4, 4))
            B1 = tir.match_buffer(B[vi * 16 + 8 : vi * 16 + 12, vj * 16 + 8 : vj * 16 + 16], (4, 8))
            with tir.block([16, 16], "AAA") as [i, j]:
                tir.reads([])
                tir.writes(AA[i, j])
                AAA = tir.match_buffer(AA[i, j], ())
                AAA[()] = 1.0
            tir.evaluate(B0.data)
            tir.evaluate(B1.data)
Ejemplo n.º 20
0
def compacted_gpu_func(a: ty.handle, c: ty.handle) -> None:
    A = tir.match_buffer(a, (16, 16), "float32")
    C = tir.match_buffer(c, (16, 16), "float32")
    for i0 in tir.thread_binding(0, 4, thread="blockIdx.x"):
        for i1 in tir.thread_binding(0, 2, thread="threadIdx.x"):
            for i2 in tir.thread_binding(0, 2, thread="vthread"):
                with tir.block([]):
                    tir.reads(A[i0 * 4 + i1 * 2 + i2, 0:16])
                    tir.writes(C[i0 * 4 + i1 * 2 + i2, 0:16])
                    B = tir.alloc_buffer([1, 16], "float32", scope="local")
                    for j in range(0, 16):
                        with tir.block() as []:
                            tir.reads(A[i0 * 4 + i1 * 2 + i2, j])
                            tir.writes(B[0, j])
                            B[0, j] = A[i0 * 4 + i1 * 2 + i2, j] + 1.0
                    for j in range(0, 16):
                        with tir.block() as []:
                            tir.reads(B[0, j])
                            tir.writes(C[i0 * 4 + i1 * 2 + i2, j])
                            C[i0 * 4 + i1 * 2 + i2, j] = B[0, j] * 2.0
Ejemplo n.º 21
0
def buffer_shape_mismatch(a: ty.handle) -> None:
    A = tir.match_buffer(a, (8, 8))
    for i, j in tir.grid(8, 2):
        with tir.block([]):
            tir.reads([])
            tir.writes([A[i, j * 4:j * 4 + 4]])
            sub_A = tir.match_buffer(
                A[i, j * 4:j * 4 + 4],
                (5))  # error: shape mismatched between 4 and 5
            for jj in range(0, 4):
                sub_A[i, j * 4 + jj] = 1
def func() -> None:
    A = tir.alloc_buffer((128, 128), "float32")
    B = tir.alloc_buffer((128, 128), "float32")
    C = tir.alloc_buffer((128, 128), "float32")
    D = tir.alloc_buffer((128, 128), "float32")
    with tir.block([]):
        # Need add read/write region manually to avoid triggering block access region detector
        tir.reads([B[0, 0], C[0:16, 0:16], A[4:12, 4:12]])
        tir.writes([A[0:12, 0:12]])
        for i, j in tir.grid(8, 8):
            A[i, j] = B[0, 0] + C[0, 0]
        with tir.block([2, 2]) as [vi, vj]:
            tir.reads([
                A[vi * 4 + 4:vi * 4 + 8, vj * 4 + 4:vj * 4 + 8], C[12:16,
                                                                   12:16]
            ])
            tir.writes([A[vi * 4 + 4:vi * 4 + 8, vj * 4 + 4:vj * 4 + 8]])
            for i, j in tir.grid(4, 4):
                A[vi * 4 + 4 + i, vj * 4 + 4 + j] += C[i + 12, j + 12]
        tir.evaluate(D.data)
Ejemplo n.º 23
0
def compacted_symbolic_func(a: ty.handle, c: ty.handle, n: ty.int32,
                            m: ty.int32) -> None:
    A = tir.match_buffer(a, (n, m), "float32")
    C = tir.match_buffer(c, (n, m), "float32")

    for i in range(0, n):
        with tir.block([]):
            tir.reads(A[i, m])
            tir.writes(C[i, m])
            B = tir.alloc_buffer((m, ), "float32")
            for j in range(0, m):
                with tir.block([]) as []:
                    tir.reads(A[i, j])
                    tir.writes(B[j])
                    B[j] = A[i, j] + 1.0
            for j in range(0, m):
                with tir.block([]) as []:
                    tir.reads(B[j])
                    tir.writes(C[i, j])
                    C[i, j] = B[j] * 2.0
Ejemplo n.º 24
0
def blockized_2(a: ty.handle, c: ty.handle) -> None:
    A = tir.match_buffer(a, [128, 128], "float32")
    B = tir.alloc_buffer([128, 128], "float32")
    C = tir.match_buffer(c, [128, 128], "float32")
    for i_o, j_o in tir.grid(8, 8):
        with tir.block([8, 8], "B_outer") as [vio, vjo]:
            tir.bind(vio, i_o)
            tir.bind(vjo, j_o)
            tir.reads([A[vio * 16:vio * 16 + 16, vjo * 16:vjo * 16 + 16, ]])
            tir.writes([B[vio * 16:vio * 16 + 16, vjo * 16:vjo * 16 + 16]])
            for i_i, j_i in tir.grid(16, 16):
                with tir.block([128, 128], "B_inner") as [vi, vj]:
                    tir.bind(vi, vio * 16 + i_i)
                    tir.bind(vj, vjo * 16 + j_i)
                    B[vi, vj] = A[vi, vj] * 2.0
    for i_o, j_o, i_i, j_i in tir.grid(4, 4, 32, 32):
        with tir.block([128, 128], "C") as [vi, vj]:
            tir.bind(vi, i_o * 32 + i_i)
            tir.bind(vj, j_o * 32 + j_i)
            C[vi, vj] = B[vi, vj] + 1.0
Ejemplo n.º 25
0
def compacted_match_buffer_func(a: ty.handle, c: ty.handle) -> None:
    A = tir.match_buffer(a, (16, 16))
    C = tir.match_buffer(c, (16, 16))
    for i in range(0, 16):
        with tir.block([]):
            A0 = tir.match_buffer(A[i, 0:16], (16))
            C0 = tir.match_buffer(C[i, 0:16], (16))
            B = tir.alloc_buffer((1, 16))
            with tir.block([]):
                B0 = tir.match_buffer(B[0, 0:16], (16))
                for j in range(0, 16):
                    with tir.block([]) as []:
                        A1 = tir.match_buffer(A0[j], ())
                        B1 = tir.match_buffer(B0[j], ())
                        B1[()] = A1[()] + 1.0
            for j in range(0, 16):
                with tir.block([]) as []:
                    C1 = tir.match_buffer(C0[j], ())
                    B2 = tir.match_buffer(B[0, j], ())
                    C1[()] = B2[()] * 2.0
Ejemplo n.º 26
0
 def batch_matmul(  # pylint: disable=no-self-argument
         a: ty.handle, b: ty.handle, c: ty.handle) -> None:
     tir.func_attr({"global_symbol": "batch_matmul", "tir.noalias": True})
     A = tir.match_buffer(a, [16, 128, 128])
     B = tir.match_buffer(b, [16, 128, 128])
     C = tir.match_buffer(c, [16, 128, 128])
     with tir.block([16, 128, 128, tir.reduce_axis(0, 128)],
                    "update") as [vn, vi, vj, vk]:
         with tir.init():
             C[vn, vi, vj] = 0.0
         C[vn, vi, vj] = C[vn, vi, vj] + A[vn, vi, vk] * B[vn, vj, vk]
Ejemplo n.º 27
0
 def matmul(  # pylint: disable=no-self-argument
         a: ty.handle, b: ty.handle, c: ty.handle) -> None:
     tir.func_attr({"global_symbol": "matmul", "tir.noalias": True})
     A = tir.match_buffer(a, (1024, 1024), "float32")
     B = tir.match_buffer(b, (1024, 1024), "float32")
     C = tir.match_buffer(c, (1024, 1024), "float32")
     with tir.block([1024, 1024, tir.reduce_axis(0, 1024)],
                    "matmul") as [vi, vj, vk]:
         with tir.init():
             C[vi, vj] = 0.0
         C[vi, vj] = C[vi, vj] + A[vi, vk] * B[vk, vj]
def unschedulable_func(a: ty.handle, c: ty.handle) -> None:
    A = tir.match_buffer(a, (16, 16), "float32")
    C = tir.match_buffer(c, (16, 16), "float32")
    for i in range(0, 16):
        with tir.block([]):
            tir.reads(A[i, 0:16])
            tir.writes(C[i, 0:16])
            B = tir.alloc_buffer((16, 16), "float32")
            for j in range(0, 16):
                tir.store(B.data, i * 16 + j, A[i, j] + 1.0)
            for j in range(0, 16):
                C[i, j] = B[i, j] * 2.0
Ejemplo n.º 29
0
def blockized_after_compute_at(a: ty.handle, c: ty.handle) -> None:
    A = tir.match_buffer(a, [128, 128], "float32")
    B = tir.alloc_buffer([128, 128], "float32")
    C = tir.match_buffer(c, [128, 128], "float32")
    for i0_0, i1_0 in tir.grid(8, 8):
        for ax0, ax1 in tir.grid(16, 16):
            with tir.block([128, 128], "B") as [vi, vj]:
                tir.bind(vi, i0_0 * 16 + ax0)
                tir.bind(vj, i1_0 * 16 + ax1)
                B[vi, vj] = A[vi, vj] * 2.0
        with tir.block([8, 8], "C_outer") as [vi_o, vj_o]:
            tir.bind(vi_o, i0_0)
            tir.bind(vj_o, i1_0)
            tir.reads(
                [B[vi_o * 16:vi_o * 16 + 16, vj_o * 16:vj_o * 16 + 16, ]])
            tir.writes([C[vi_o * 16:vi_o * 16 + 16, vj_o * 16:vj_o * 16 + 16]])
            for i0_1, i1_1 in tir.grid(16, 16):
                with tir.block([128, 128], "C_inner") as [vi, vj]:
                    tir.bind(vi, vi_o * 16 + i0_1)
                    tir.bind(vj, vj_o * 16 + i1_1)
                    C[vi, vj] = B[vi, vj] + 1.0
Ejemplo n.º 30
0
def elementwise_with_anno(a: ty.handle, b: ty.handle) -> None:
    A = tir.match_buffer(a, (128, 128, 128))
    B = tir.match_buffer(b, (128, 128, 128))
    for i, j in tir.grid(128, 128):
        for k in tir.serial(0, 128, annotations={"useless_annotation": True}):
            with tir.block([128, 128, 128], "B") as [vi, vj, vk]:
                tir.bind(vi, i)
                tir.bind(vj, j)
                tir.bind(vk, k)
                tir.reads([A[vi, vj, vk]])
                tir.writes([B[vi, vj, vk]])
                B[vi, vj, vk] = A[vi, vj, vk] * 2.0