def compacted_complex_func(a: ty.handle, c: ty.handle, n: ty.int32) -> None:
    # TIR fixture over an 8x8 float32 input A and an 8x8 float32 output C.
    # For every row i an opaque block allocates a (1, 8) scratch buffer B,
    # fills part of it from A through a flat store, then copies columns
    # 3-4 and 6-7 of B into row i of C.  `n` is accepted but never used.
    # NOTE(review): the name suggests this is the expected result of a
    # buffer-compaction pass -- confirm against the test that consumes it.
    A = tir.match_buffer(a, [8, 8], "float32")
    C = tir.match_buffer(c, [8, 8], "float32")
    for i in tir.serial(0, 8):
        with tir.block([]):
            # NOTE(review): both regions address column 8 of an 8-wide
            # buffer; kept exactly as written -- confirm this is intended.
            tir.reads(A[0, 8])
            tir.writes(C[0, 8])
            B = tir.alloc_buffer([1, 8], "float32")
            for j in tir.serial(0, 4):
                with tir.block([]):
                    tir.reads(A[i, j])
                    tir.writes(B[0, j])
                    # D is a (6, 1) scratch buffer private to this block.
                    D = tir.alloc_buffer([6, 1], "float32")
                    # Writes D[2..5, 0].
                    for k in tir.serial(4, 8):
                        D[k - 2, 0] = 1.0
                    # Reads D[0..1, 0]; B is written through its raw data
                    # pointer at flat offset j rather than B[0, j].
                    for k in tir.serial(2, 4):
                        tir.store(B.data, j, A[i, j] + D[k - 2, 0])
            # Copy columns 3 and 4 of B into row i of C.
            for j in tir.serial(3, 5):
                with tir.block([]):
                    tir.reads(B[0, j])
                    tir.writes(C[i, j])
                    C[i, j] = B[0, j]
            # Copy columns 6 and 7 of B into row i of C.
            for j in tir.serial(6, 8):
                with tir.block([]):
                    tir.reads(B[0, j])
                    tir.writes(C[i, j])
                    C[i, j] = B[0, j]
def buffer_opaque_access(b: ty.handle, c: ty.handle) -> None:
    # TIR fixture over two 16x16 float32 buffers B and C.
    # An opaque block (no declared reads, writes all of B) allocates a flat
    # 256-element float32 buffer A in "global" scope, fills it with 1, reads
    # it back via tir.load, and issues tvm_fill_fragment calls on B.data.
    # A second 16x16 block then copies B into C element-wise.
    B = tir.match_buffer(b, (16, 16), "float32")
    C = tir.match_buffer(c, (16, 16), "float32")

    with tir.block([]):
        tir.reads([])
        tir.writes([B[0:16, 0:16]])
        A = tir.allocate([256], "float32", "global")
        # A is addressed only through flat offsets i * 16 + j.
        for i, j in tir.grid(16, 16):
            tir.store(A, i * 16 + j, 1)
        for i in tir.serial(0, 16):
            for j in tir.serial(0, 16):
                tir.evaluate(tir.load("float32", A, i * 16 + j))
            # B is touched only through its data pointer here.
            for j in tir.serial(0, 16):
                tir.evaluate(
                    tir.tvm_fill_fragment(B.data, 16, 16, 16, 0, tir.float32(0), dtype="handle")
                )

    for i, j in tir.grid(16, 16):
        with tir.block([16, 16]) as [vi, vj]:
            tir.bind(vi, i)
            tir.bind(vj, j)
            C[vi, vj] = B[vi, vj]
def opaque_access_store(a: ty.handle, c: ty.handle) -> None:
    # TIR fixture over 128x128 buffers: block "B" computes B = A * 2.0 into
    # an intermediate buffer, and block "C" consumes B both through a normal
    # buffer access and through raw-pointer (opaque) load/store.
    # Block "C" declares that it reads all of B and writes all of C.
    A = tir.match_buffer(a, (128, 128))
    B = tir.alloc_buffer((128, 128))
    C = tir.match_buffer(c, (128, 128))
    with tir.block([128, 128], "B") as [vi, vj]:
        B[vi, vj] = A[vi, vj] * 2.0
    with tir.block([128, 128], "C") as [vi, vj]:
        tir.reads(B[0:128, 0:128])
        tir.writes(C[0:128, 0:128])
        # Opaque store into C via its data pointer at flat offset
        # vi * 128 + vj.
        tir.store(C.data, vi * 128 + vj, B[vi, vj] + 1.0)
        # Opaque load from B via its data pointer.  B is 128 wide, so the
        # flat offset uses the same stride of 128 as the store above
        # (previously 16, which addressed a different element of B).
        C[vi, vj] = tir.load("float32", B.data, vi * 128 + vj) + 1.0
def unschedulable_func(a: ty.handle, c: ty.handle) -> None:
    # TIR fixture over 16x16 float32 buffers A (input) and C (output).
    # For every row i an opaque block reads A[i, 0:16] and writes C[i, 0:16],
    # staging A + 1.0 in a (16, 16) buffer B allocated inside the block and
    # then writing B * 2.0 to C.
    A = tir.match_buffer(a, [16, 16], "float32")
    C = tir.match_buffer(c, [16, 16], "float32")
    for i in tir.serial(0, 16):
        with tir.block([]):
            tir.reads(A[i, 0:16])
            tir.writes(C[i, 0:16])
            B = tir.alloc_buffer([16, 16], "float32")
            # B is filled through its raw data pointer at flat offset
            # i * 16 + j, not through B[i, j].
            for j in tir.serial(0, 16):
                tir.store(B.data, i * 16 + j, A[i, j] + 1.0)
            # B is read back through ordinary buffer indexing.
            for j in tir.serial(0, 16):
                C[i, j] = B[i, j] * 2.0
def opaque_access(a: ty.handle, b: ty.handle) -> None:
    # TIR fixture with two 16x16 blocks that touch their float32 buffers
    # only through raw data pointers.  Each block declares no reads and a
    # write region covering its whole buffer.
    A = tir.match_buffer(a, (16, 16), "float32")
    B = tir.match_buffer(b, (16, 16), "float32")
    # Block "A": flat store of 1 at offset vi * 16 + vj.
    with tir.block([16, 16], "A") as [vi, vj]:
        tir.reads([])
        tir.writes(A[0:16, 0:16])
        tir.store(A.data, vi * 16 + vj, 1)
    # Block "B": tvm_fill_fragment on B.data, passing vi * 16 + vj as the
    # last positional argument.
    with tir.block([16, 16], "B") as [vi, vj]:
        tir.reads([])
        tir.writes(B[0:16, 0:16])
        tir.evaluate(
            tir.tvm_fill_fragment(B.data, 16, 16, 16, 0, vi * 16 + vj, dtype="handle")
        )
def opaque_access_split(a: ty.handle, b: ty.handle) -> None:
    # TIR fixture: two 16x16 blocks, each driven by a (16, 4, 4) loop nest
    # in which the second block axis is rebuilt as j0 * 4 + j1.
    # Both blocks access their buffer only through its data pointer and
    # declare no reads and a write region covering the whole buffer.
    # NOTE(review): presumably the expected form of a 16x16 opaque-access
    # function after its inner loop is split by 4 -- confirm with the test.
    A = tir.match_buffer(a, (16, 16))
    B = tir.match_buffer(b, (16, 16))
    for i, j0, j1 in tir.grid(16, 4, 4):
        with tir.block([16, 16], "A") as [vi, vj]:
            tir.bind(vi, i)
            tir.bind(vj, j0 * 4 + j1)
            tir.reads([])
            tir.writes([A[0:16, 0:16]])
            # Flat store of 1 at offset vi * 16 + vj; the trailing 1 is the
            # fourth positional argument of tir.store.
            tir.store(A.data, vi * 16 + vj, 1, 1)
    for i, j0, j1 in tir.grid(16, 4, 4):
        with tir.block([16, 16], "B") as [vi, vj]:
            tir.bind(vi, i)
            tir.bind(vj, j0 * 4 + j1)
            tir.reads([])
            tir.writes([B[0:16, 0:16]])
            tir.evaluate(
                tir.tvm_fill_fragment(B.data, 16, 16, 16, 0, vi * 16 + vj, dtype="handle")
            )
def opaque_access_fused(a: ty.handle, b: ty.handle) -> None:
    # TIR fixture: two 16x16 blocks, each driven by a single 256-iteration
    # loop whose index is decomposed into (floordiv 16, floormod 16) to bind
    # the two block axes.  Both blocks access their buffer only through its
    # data pointer and declare no reads and a whole-buffer write region.
    # NOTE(review): presumably the expected form of a 16x16 opaque-access
    # function after its two loops are fused -- confirm with the test.
    A = tir.match_buffer(a, (16, 16))
    B = tir.match_buffer(b, (16, 16))
    for i_j_fused in range(0, 256):
        with tir.block([16, 16], "A") as [vi, vj]:
            tir.bind(vi, tir.floordiv(i_j_fused, 16))
            tir.bind(vj, tir.floormod(i_j_fused, 16))
            tir.reads([])
            tir.writes([A[0:16, 0:16]])
            # Flat store of 1 at offset vi * 16 + vj; the trailing 1 is the
            # fourth positional argument of tir.store.
            tir.store(A.data, vi * 16 + vj, 1, 1)
    for i_j_fused in range(0, 256):
        with tir.block([16, 16], "B") as [vi, vj]:
            tir.bind(vi, tir.floordiv(i_j_fused, 16))
            tir.bind(vj, tir.floormod(i_j_fused, 16))
            tir.reads([])
            tir.writes([B[0:16, 0:16]])
            tir.evaluate(
                tir.tvm_fill_fragment(B.data, 16, 16, 16, 0, vi * 16 + vj, dtype="handle")
            )