def multiple_reduction_blocks_rfactor(a: T.handle, f: T.handle) -> None: A = T.match_buffer(a, [16, 16, 16]) C = T.alloc_buffer([16, 16]) D = T.alloc_buffer([16, 16]) E = T.alloc_buffer([16, 16]) F = T.match_buffer(f, [16, 16]) C_rf = T.alloc_buffer([16, 16, 4]) for i, j1, k1o, k1i in T.grid(16, 16, 4, 4): with T.block([4, 16, 16, T.reduce_axis(0, 4)], "C_rf") as [vk1o, ci, cj, vk1i]: T.bind(vk1o, k1o) T.bind(ci, i) T.bind(cj, j1) T.bind(vk1i, k1i) with T.init(): C_rf[ci, cj, vk1o] = 0.0 C_rf[ci, cj, vk1o] = C_rf[ci, cj, vk1o] + A[ci, cj, ((vk1o * 4) + vk1i)] for i_1 in T.serial(0, 16): for j1_1 in T.serial(0, 16): for k1o_1 in T.serial(0, 4): with T.block([T.reduce_axis(0, 4), 16, 16], "C") as [vk1o_1, ci_1, cj_1]: T.bind(vk1o_1, k1o_1) T.bind(ci_1, i_1) T.bind(cj_1, j1_1) with T.init(): C[ci_1, cj_1] = 0.0 C[ci_1, cj_1] = C[ci_1, cj_1] + C_rf[ci_1, cj_1, vk1o_1] for k2o, k2i in T.grid(4, 4): with T.block([16, 16, T.reduce_axis(0, 16)], "D") as [di, dj, dk]: T.bind(di, i_1) T.bind(dj, j1_1) T.bind(dk, (k2o * 4) + k2i) with T.init(): D[di, dj] = 0.0 D[di, dj] = (D[di, dj] + A[di, dj, dk]) + C[di, dj] for j2 in T.serial(0, 16): for k3o, k3i in T.grid(4, 4): with T.block([16, 16, T.reduce_axis(0, 16)], "E") as [ei, ej, ek]: T.bind(ei, i_1) T.bind(ej, j2) T.bind(ek, (k3o * 4) + k3i) with T.init(): E[ei, ej] = 0.0 E[ei, ej] = (E[ei, ej] + A[ei, ej, ek]) + D[ei, ej] for k4o, k4i in T.grid(4, 4): with T.block([16, 16, T.reduce_axis(0, 16)], "F") as [fi, fj, fk]: T.bind(fi, i_1) T.bind(fj, j2) T.bind(fk, (k4o * 4) + k4i) with T.init(): F[fi, fj] = 0.0 F[fi, fj] = (F[fi, fj] + A[fi, fj, fk]) + E[fi, fj]
def square_sum(a: T.handle, c: T.handle) -> None: A = T.match_buffer(a, [16, 256, 256]) C = T.match_buffer(c, [16]) with T.block([16, T.reduce_axis(0, 256), T.reduce_axis(0, 256)], "C") as [b, i, j]: with T.init(): C[b] = 0.0 C[b] = C[b] + A[b, i, j] * A[b, i, j]
def main(a: T.handle, b: T.handle) -> None: A = T.match_buffer(a, [64, 64, 64]) B = T.match_buffer(b, [64]) with T.block([64, T.reduce_axis(0, 64), T.reduce_axis(32, 64)]) as [i, j, k]: if (j == 0) and (k == 32): B[i] = T.float32(0) B[i] += A[i, j, k]
def main(a: T.handle, b: T.handle) -> None: A = T.match_buffer(a, [64, 64, 64]) B = T.match_buffer(b, [64]) with T.block([64, T.reduce_axis(0, 64), T.reduce_axis(32, 64)]) as [i, j, k]: BB = T.match_buffer(B[i], ()) AA = T.match_buffer(A[i, 0:64, 0:64], (64, 64)) if (j == 0) and (k == 32): BB[()] = T.float32(0) BB[()] += AA[j, k]
def matmul_decompose4(a: T.handle, b: T.handle, c: T.handle) -> None: C = T.match_buffer(c, [128, 128], elem_offset=0, align=128, offset_factor=1) B = T.match_buffer(b, [128, 128], elem_offset=0, align=128, offset_factor=1) A = T.match_buffer(a, [128, 128], elem_offset=0, align=128, offset_factor=1) # body with T.block([], "root"): T.reads([]) T.writes([]) for i0_0 in T.serial(0, 16): for i0_1_init, i1_init in T.grid(8, 128): with T.block([128, 128], "update_init") as [vi_init, vj_init]: T.bind(vi_init, ((i0_0 * 8) + i0_1_init)) T.bind(vj_init, i1_init) C[vi_init, vj_init] = T.float32(0) for i0_1, i1, i2_0, i2_1 in T.grid(8, 128, 19, 7): with T.block([128, 128, T.reduce_axis(0, 128)], "update_update") as [ vi, vj, vk, ]: T.where((((i2_0 * 7) + i2_1) < 128)) T.bind(vi, ((i0_0 * 8) + i0_1)) T.bind(vj, i1) T.bind(vk, ((i2_0 * 7) + i2_1)) C[vi, vj] = C[vi, vj] + (A[vi, vk] * B[vj, vk])
def transformed_func() -> None: A = T.alloc_buffer([128, 128]) with T.block([128, 128], "") as [i, j]: A[i, j] = T.float32(0) with T.block([32, 32, T.reduce_axis(0, 32)], "") as [i, j, k]: B = T.alloc_buffer([128, 128]) if k == 0: for ii, jj in T.grid(4, 4): B[i * 4 + ii, j * 4 + jj] = A[i * 4 + ii, j * 4 + jj] for ii, jj in T.grid(4, 4): with T.block([], ""): T.reads([B[((i * 4) + ii), ((j * 4) + jj)]]) T.writes([B[((i * 4) + ii), ((j * 4) + jj)]]) C = T.alloc_buffer([128, 128]) for kk in T.serial(0, 4): B[((i * 4) + ii), ((j * 4) + jj)] = (B[((i * 4) + ii), ((j * 4) + jj)] + C[((i * 4) + ii), ((k * 4) + kk)]) for kk in T.serial(0, 4): with T.block([], ""): T.reads([ B[((i * 4) + ii), ((j * 4) + jj)], C[((i * 4) + ii), ((k * 4) + kk)], ]) T.writes([B[((i * 4) + ii), ((j * 4) + jj)]]) D = T.alloc_buffer([128, 128]) B[((i * 4) + ii), ((j * 4) + jj)] = B[((i * 4) + ii), ((j * 4) + jj)] + (D[((j * 4) + jj), ( (k * 4) + kk)] * C[((i * 4) + ii), ((k * 4) + kk)])
def multiple_reduction_blocks(a: T.handle, f: T.handle) -> None: A = T.match_buffer(a, (16, 16, 16)) C = T.alloc_buffer((16, 16)) D = T.alloc_buffer((16, 16)) E = T.alloc_buffer((16, 16)) F = T.match_buffer(f, (16, 16)) for i in T.serial(0, 16): for j1 in T.serial(0, 16): for k1o, k1i in T.grid(4, 4): with T.block([16, 16, T.reduce_axis(0, 16)], "C") as [ci, cj, ck]: T.bind(ci, i) T.bind(cj, j1) T.bind(ck, k1o * 4 + k1i) with T.init(): C[ci, cj] = 0.0 C[ci, cj] = C[ci, cj] + A[ci, cj, ck] for k2o, k2i in T.grid(4, 4): with T.block([16, 16, T.reduce_axis(0, 16)], "D") as [di, dj, dk]: T.bind(di, i) T.bind(dj, j1) T.bind(dk, k2o * 4 + k2i) with T.init(): D[di, dj] = 0.0 D[di, dj] = D[di, dj] + A[di, dj, dk] + C[di, dj] for j2 in T.serial(0, 16): for k3o, k3i in T.grid(4, 4): with T.block([16, 16, T.reduce_axis(0, 16)], "E") as [ei, ej, ek]: T.bind(ei, i) T.bind(ej, j2) T.bind(ek, k3o * 4 + k3i) with T.init(): E[ei, ej] = 0.0 E[ei, ej] = E[ei, ej] + A[ei, ej, ek] + D[ei, ej] for k4o, k4i in T.grid(4, 4): with T.block([16, 16, T.reduce_axis(0, 16)], "F") as [fi, fj, fk]: T.bind(fi, i) T.bind(fj, j2) T.bind(fk, k4o * 4 + k4i) with T.init(): F[fi, fj] = 0.0 F[fi, fj] = F[fi, fj] + A[fi, fj, fk] + E[fi, fj]
def rowsum_blockized(a: T.handle, b: T.handle) -> None: B = T.match_buffer(b, [32, 4]) A = T.match_buffer(a, [32, 4, 128]) for i0, i2_0 in T.grid(32, 16): with T.block([32, T.reduce_axis(0, 16)], "blockized_B") as [io, ko]: T.bind(io, i0) T.bind(ko, i2_0) with T.init(): for i1 in T.serial(0, 4): with T.block([4], "B_init") as [ii_init]: T.bind(ii_init, i1) B[io, ii_init] = 0.0 for i1_1, i2_1 in T.grid(4, 8): with T.block([4, T.reduce_axis(0, 128)], "B") as [ii, k]: T.bind(ii, i1_1) T.bind(k, ko * 8 + i2_1) B[io, ii] = B[io, ii] + A[io, ii, k]
def rowsum_zero_dim(a: T.handle, b: T.handle) -> None: A = T.match_buffer(a, [128]) B = T.match_buffer(b, []) with T.block([T.reduce_axis(0, 128)], "B") as [k]: with T.init(): B[()] = 0.0 B[()] = B[()] + A[k]
def rowsum_wrong_reduce_pattern2(a: T.handle, b: T.handle) -> None: A = T.match_buffer(a, (128, 128)) B = T.match_buffer(b, (128, )) with T.block([128, T.reduce_axis(0, 128)], "B") as [vi, vk]: with T.init(): B[vi] = 0.0 B[vi] = B[vi] - A[vi, vk]
def rowsum_not_compact_data_flow(a: T.handle, b: T.handle) -> None: A = T.match_buffer(a, (128, 128)) B = T.match_buffer(b, (128,)) with T.block([128, T.reduce_axis(0, 128)], "B") as [vi, vk]: with T.init(): B[vk] = 0.0 B[vk] = B[vk] + A[vi, vk]
def cuda_matmul_2(a: T.handle, b: T.handle, c: T.handle) -> None: # pylint: disable=undefined-loop-variable A = T.match_buffer(a, [2048, 2048], "float32") B = T.match_buffer(b, [2048, 2048], "float32") C = T.match_buffer(c, [2048, 2048], "float32") A_shared = T.alloc_buffer([2048, 2048], "float32", scope="shared") B_shared = T.alloc_buffer([2048, 2048], "float32", scope="shared") A_shared_local = T.alloc_buffer([2048, 2048], "float32", scope="local") B_shared_local = T.alloc_buffer([2048, 2048], "float32", scope="local") C_local = T.alloc_buffer([2048, 2048], "float32", scope="local") with T.block([2048, 2048], "A_shared") as [v0, v1]: A_shared[v0, v1] = A[v0, v1] with T.block([2048, 2048], "B_shared") as [v0, v1]: B_shared[v0, v1] = B[v0, v1] with T.block([2048, 2048], "B_shared_local") as [v0, v1]: B_shared_local[v0, v1] = B_shared[v0, v1] for by in T.thread_binding(0, 32, thread="blockIdx.y"): for bx in T.thread_binding(0, 32, thread="blockIdx.x"): for vy in T.thread_binding(0, 2, thread="vthread.y"): for vx in T.thread_binding(0, 2, thread="vthread.x"): for ty in T.thread_binding(0, 8, thread="threadIdx.y"): for tx in T.thread_binding(0, 8, thread="threadIdx.x"): for k_0 in T.serial(0, 256): for k_1 in T.unroll(0, 8): for i, j in T.grid(1, 4): with T.block( [2048, 2048], "A_shared_local") as [v0, v1]: T.bind(v0, k_0 * 8 + k_1 + i) T.bind( v1, by * 64 + vy * 32 + ty * 4 + j) A_shared_local[v0, v1] = A_shared[v0, v1] for _, i, j in T.grid(1, 4, 4): with T.block([ 2048, 2048, T.reduce_axis(0, 2048) ], "C") as [vi, vj, vk]: T.bind( vi, by * 64 + vy * 32 + ty * 4 + i) T.bind( vj, bx * 64 + vx * 32 + tx * 4 + j) T.bind(vk, k_0 * 8 + k_1) with T.init(): C_local[vi, vj] = T.float32(0) C_local[vi, vj] = C_local[ vi, vj] + A_shared_local[ vk, vi] * B_shared_local[ vk, vj] for i, j in T.grid(4, 4): with T.block([2048, 2048], "C_local") as [v0, v1]: T.bind(v0, by * 64 + vy * 32 + ty * 4 + i) T.bind(v1, bx * 64 + vx * 32 + tx * 4 + j) C[v0, v1] = C_local[v0, v1]
def main(a: T.handle, b: T.handle, c: T.handle) -> None: # pylint: disable=no-self-argument T.func_attr({"global_symbol": "main", "tir.noalias": True}) A = T.match_buffer(a, (16, 16), "float32") B = T.match_buffer(b, (16, 16), "float32") C = T.match_buffer(c, (16, 16), "float32") with T.block([16, 16, T.reduce_axis(0, 16)], "matmul") as [vi, vj, vk]: with T.init(): C[vi, vj] = 0.0 C[vi, vj] = C[vi, vj] + A[vi, vk] * B[vk, vj]
def tir_matmul(a: T.handle, b: T.handle, c: T.handle) -> None: A = T.match_buffer(a, (128, 128)) B = T.match_buffer(b, (128, 128)) C = T.match_buffer(c, (128, 128)) with T.block([128, 128, T.reduce_axis(0, 128)]) as [i, j, k]: with T.init(): C[i, j] = 0.0 C[i, j] += A[i, k] * B[j, k]
def matmul_128(a: T.handle, b: T.handle, c: T.handle) -> None: A = T.match_buffer(a, [128, 128]) B = T.match_buffer(b, [128, 128]) C = T.match_buffer(c, [128, 128]) with T.block([128, 128, T.reduce_axis(0, 128)], "update") as [vi, vj, vk]: with T.init(): C[vi, vj] = 0.0 C[vi, vj] = C[vi, vj] + A[vi, vk] * B[vj, vk]
def matmul(a: T.handle, b: T.handle, c: T.handle) -> None: A = T.match_buffer(a, (128, 128)) B = T.match_buffer(b, (128, 128)) C = T.match_buffer(c, (128, 128)) with T.block([128, 128, T.reduce_axis(0, 128)], "C") as [vi, vj, vk]: with T.init(): C[vi, vj] = 0.0 C[vi, vj] = C[vi, vj] + A[vi, vk] * B[vj, vk]
def Matmul(a: T.handle, b: T.handle, c: T.handle) -> None: T.func_attr({"global_symbol": "main"}) A = T.match_buffer(a, (128, 256), "float32") B = T.match_buffer(b, (256, 512), "float32") C = T.match_buffer(c, (128, 512), "float32") with T.block([128, 256, T.reduce_axis(0, 512)], "matmul") as [vi, vj, vk]: with T.init(): C[vi, vj] = 0.0 C[vi, vj] = C[vi, vj] + A[vi, vk] * B[vk, vj]
def matmul(a: T.handle, b: T.handle, c: T.handle) -> None: A = T.match_buffer(a, [128, 128]) B = T.match_buffer(b, [128, 128]) C = T.match_buffer(c, [128, 128]) for i, j in T.grid(128, 128): with T.block([128, 128], "init") as [vi, vj]: C[vi, vj] = T.float32(0) for k in range(0, 128): with T.block([128, 128, T.reduce_axis(0, 128)], "update") as [vi, vj, vk]: C[vi, vj] = C[vi, vj] + A[vi, vk] * B[vj, vk]
def matmul(a: T.handle, b: T.handle, c: T.handle, n: T.int32) -> None: m = T.var("int32") A = T.match_buffer(a, [m, n]) B = T.match_buffer(b, [m, n]) C = T.match_buffer(c, [m, m]) with T.block([m, m, T.reduce_axis(0, n)], "update") as [vi, vj, vk]: with T.init(): C[vi, vj] = 0.0 C[vi, vj] = C[vi, vj] + A[vi, vk] * B[vj, vk]
def matmul_rfactor(a: T.handle, b: T.handle, c: T.handle) -> None: A = T.match_buffer(a, [128, 128]) B = T.match_buffer(b, [128, 128]) C = T.match_buffer(c, [128, 128]) C_rf = T.alloc_buffer([4, 128, 128]) for i0, i1, i2_outer, i2_inner_outer, i2_inner_inner in T.grid( 128, 128, 4, 8, 4): with T.block([4, 128, 128, T.reduce_axis(0, 4), T.reduce_axis(0, 8)], "update_rf") as [ vi2_inner_inner, vi, vj, vi2_outer, vi2_inner_outer, ]: T.bind(vi2_inner_inner, i2_inner_inner) T.bind(vi, i0) T.bind(vj, i1) T.bind(vi2_outer, i2_outer) T.bind(vi2_inner_outer, i2_inner_outer) with T.init(): C_rf[vi2_inner_inner, vi, vj] = 0.0 C_rf[vi2_inner_inner, vi, vj] = C_rf[vi2_inner_inner, vi, vj] + (A[vi, ( ((vi2_outer * 32) + (vi2_inner_outer * 4)) + vi2_inner_inner)] * B[vj, ( ((vi2_outer * 32) + (vi2_inner_outer * 4)) + vi2_inner_inner)]) for i0_1, i1_1, i2_inner_inner_1 in T.grid(128, 128, 4): with T.block([T.reduce_axis(0, 4), 128, 128], "update") as [ vi2_inner_inner_1, vi_1, vj_1, ]: T.bind(vi2_inner_inner_1, i2_inner_inner_1) T.bind(vi_1, i0_1) T.bind(vj_1, i1_1) with T.init(): C[vi_1, vj_1] = 0.0 C[vi_1, vj_1] = C[vi_1, vj_1] + C_rf[vi2_inner_inner_1, vi_1, vj_1]
def main(a: T.handle, b: T.handle, c: T.handle) -> None: # pylint: disable=no-self-argument T.func_attr({"global_symbol": "main", "tir.noalias": True}) A = T.match_buffer(a, [16, 32, 32]) B = T.match_buffer(b, [16, 32, 32]) C = T.match_buffer(c, [16, 32, 32]) with T.block([16, 32, 32, T.reduce_axis(0, 32)], "update") as [vn, vi, vj, vk]: with T.init(): C[vn, vi, vj] = 0.0 C[vn, vi, vj] = C[vn, vi, vj] + A[vn, vi, vk] * B[vn, vj, vk]
def rowsum_cross_thread_reduction(a: T.handle, b: T.handle) -> None: A = T.match_buffer(a, (128, 128)) B = T.match_buffer(b, (128,)) for i0 in T.serial(0, 128): for i1 in T.thread_binding(0, 128, thread="threadIdx.x"): with T.block([128, T.reduce_axis(0, 128)], "B") as [vi, vk]: T.bind(vi, i0) T.bind(vk, i1) with T.init(): B[vi] = 0.0 B[vi] = B[vi] + A[vi, vk]
def rowsum_not_quasi_affine(a: T.handle, b: T.handle) -> None: A = T.match_buffer(a, (128, 128)) B = T.match_buffer(b, (128,)) for i, k in T.grid(128, 16): with T.block([128, T.reduce_axis(0, 128)], "B") as [vi, vk]: T.bind(vi, i) T.bind(vk, T.floordiv(k * k, 2)) with T.init(): B[vi] = 0.0 B[vi] = B[vi] + A[vi, vk]
def rowsum_unrolled(a: T.handle, b: T.handle) -> None: A = T.match_buffer(a, (128, 128)) B = T.match_buffer(b, (128,)) for i0 in T.unroll(0, 128): for i1 in T.serial(0, 128): with T.block([128, T.reduce_axis(0, 128)], "B") as [vi, vk]: T.bind(vi, i0) T.bind(vk, i1) with T.init(): B[vi] = 0.0 B[vi] = B[vi] + A[vi, vk]
def rowsum_transformed(a: T.handle, b: T.handle) -> None: A = T.match_buffer(a, (128, 128)) B = T.match_buffer(b, (128, )) for io, ii_ko_fused, ki in T.grid(32, 128, 4): with T.block([128, T.reduce_axis(0, 128)], "B") as [vi, vk]: T.bind(vi, io * 4 + T.floordiv(ii_ko_fused, 32)) T.bind(vk, T.floormod(ii_ko_fused, 32) * 4 + ki) with T.init(): B[vi] = 0.0 B[vi] = B[vi] + A[vi, vk]
def matmul_decompose1(a: T.handle, b: T.handle) -> None: A = T.match_buffer(a, [32, 4, 128], elem_offset=0, align=128, offset_factor=1) B = T.match_buffer(b, [32, 4], elem_offset=0, align=128, offset_factor=1) for i0 in T.serial(0, 32): with T.block([32], "blockized_B_init") as [io]: for i1 in T.serial(0, 4): with T.block([4], "B_init") as [ii]: B[io, ii] = T.float32(0) for i0, i2_o in T.grid(32, 16): with T.block([32, T.reduce_axis(0, 16)], "blockized_B_update") as [io, ko]: for i1, i2_i in T.grid(4, 8): with T.block([4, T.reduce_axis(0, 128)], "B") as [ii, k]: T.bind(ii, i1) T.bind(k, ((ko * 8) + i2_i)) B[io, ii] = B[io, ii] + A[io, ii, k]
def rowsum_predicate(a: T.handle, b: T.handle) -> None: A = T.match_buffer(a, [128, 128], dtype="float32") B = T.match_buffer(b, [128], dtype="float32") for i, k_0, k_1 in T.grid(128, 13, 10): with T.block([128, T.reduce_axis(0, 128)], "B") as [vi, vk]: T.where(k_0 * 10 + k_1 < 128) T.bind(vi, i) T.bind(vk, k_0 * 10 + k_1) with T.init(): B[vi] = 0.0 B[vi] = B[vi] + A[vi, vk]
def square_sum_rfactor(a: T.handle, c: T.handle) -> None: A = T.match_buffer(a, [16, 256, 256]) C = T.match_buffer(c, [16]) C_rf = T.alloc_buffer([16, 256]) for i0, i1, i2 in T.grid(16, 256, 256): with T.block([256, 16, T.reduce_axis(0, 256)], "C_rf") as [vi2, b, i]: T.bind(vi2, i2) T.bind(b, i0) T.bind(i, i1) with T.init(): C_rf[b, vi2] = 0.0 C_rf[b, vi2] = C_rf[b, vi2] + (A[b, i, vi2] * A[b, i, vi2]) for i0_1, i2_1 in T.grid(16, 256): with T.block([T.reduce_axis(0, 256), 16], "C") as [vi2_1, b_1]: T.bind(vi2_1, i2_1) T.bind(b_1, i0_1) with T.init(): C[b_1] = 0.0 C[b_1] = C[b_1] + C_rf[b_1, vi2_1]
def rowsum_not_serial(a: T.handle, b: T.handle) -> None: A = T.match_buffer(a, (128, 128)) B = T.match_buffer(b, (128, )) for i in T.serial(0, 128): for k in T.parallel(0, 128): with T.block([128, T.reduce_axis(0, 128)], "B") as [vi, vk]: T.bind(vi, i) T.bind(vk, k) with T.init(): B[vi] = 0.0 B[vi] = B[vi] + A[vi, vk]
def rowsum_predicate_rfactor(a: T.handle, b: T.handle) -> None: A = T.match_buffer(a, [128, 128], dtype="float32") B = T.match_buffer(b, [128], dtype="float32") B_rf = T.alloc_buffer([128, 13], dtype="float32") for i, k_0, k_1 in T.grid(128, 13, 10): with T.block([13, 128, T.reduce_axis(0, 10)], "B_rf") as [vk_0, vi, vk_1]: T.where(k_0 * 10 + k_1 < 128) T.bind(vk_0, k_0) T.bind(vi, i) T.bind(vk_1, k_1) with T.init(): B_rf[vi, vk_0] = T.float32(0) B_rf[vi, vk_0] = B_rf[vi, vk_0] + A[vi, vk_0 * 10 + vk_1] for i, k_0 in T.grid(128, 13): with T.block([T.reduce_axis(0, 13), 128], "B") as [vk_0, vi]: T.bind(vk_0, k_0) T.bind(vi, i) with T.init(): B[vi] = T.float32(0) B[vi] = B[vi] + B_rf[vi, vk_0]