def transformed_opaque_access(a: T.handle, b: T.handle) -> None:
    # Direct-access counterpart of `opaque_access`: the loop nests and the
    # declared write regions are the same, but `T.intrin_test` is called on
    # `A.data` / `B.data` with explicit offset, stride and shape arguments
    # instead of going through matched sub-buffers.
    # NOTE(review): presumably the expected result of lowering
    # `opaque_access` -- confirm against the test that compares the two.
    A = T.match_buffer(a, (32, 64, 128))
    B = T.match_buffer(b, (64, 64, 64))
    for i, j, k in T.grid(2, 64, 8):
        with T.block():
            T.reads([])
            T.writes(A[i * 16:i * 16 + 16, j, k * 16:k * 16 + 16])
            # Offset of A[i * 16, j, k * 16] for a compact (32, 64, 128)
            # layout: i * 16 * 8192 + j * 128 + k * 16. The next four
            # arguments are the literal values 8192, 128, 16, 1 that
            # `opaque_access` obtains from sub_A.strides[0:2] and
            # sub_A.shape[0:2].
            T.evaluate(
                T.intrin_test(
                    A.data,
                    i * 131072 + j * 128 + k * 16,
                    8192,
                    128,
                    16,
                    1,
                    dtype="handle",
                ))
    for i, j, k in T.grid(64, 2, 8):
        with T.block():
            T.reads([])
            T.writes(B[i, j * 32:j * 32 + 32, k * 8:k * 8 + 8])
            # Offset of B[i, j * 32, k * 8] for a compact (64, 64, 64)
            # layout: i * 4096 + j * 32 * 64 + k * 8, followed by
            # strides (64, 1) and tile shape (32, 8).
            T.evaluate(
                T.intrin_test(
                    B.data,
                    i * 4096 + j * 2048 + k * 8,
                    64,
                    1,
                    32,
                    8,
                    dtype="handle",
                ))
def symbolic_match(a: T.handle, b: T.handle, n: T.int32, m: T.int32) -> None:
    # Fixture whose buffer shapes and matched regions are expressed in terms
    # of the symbolic parameters `n` and `m`.
    # For each i in [0, n) a block matches an (m, m) region of A and a
    # (2, m * 4) region of B, stores 1 into every element of the A region,
    # and calls `T.intrin_test` on the B region's metadata.
    A = T.match_buffer(a, (n * m, m))
    B = T.match_buffer(b, (n * 2, m * 4))
    for i in range(0, n):
        with T.block():
            T.reads([])
            # NOTE(review): the declared A write region spans rows
            # i * m : i * m + n, while the region matched into sub_A below
            # spans i * m : i * m + m. `transformed_symbolic_match` declares
            # the same region, so this may be intentional -- confirm before
            # changing either side.
            T.writes([A[i * m:i * m + n, 0:m], B[i * n:i * n + 2, 0:m * 4]])
            # Stride variables bound by the match_buffer of sub_B below.
            Bs_0 = T.var("int32")
            Bs_1 = T.var("int32")
            sub_A = T.match_buffer(A[i * m:i * m + m, 0:m], (m, m), offset_factor=1)
            sub_B = T.match_buffer(B[i * n:i * n + 2, 0:m * 4], (2, m * 4), strides=[Bs_0, Bs_1], offset_factor=1)
            for ii, jj in T.grid(m, m):
                sub_A[ii, jj] = 1
            # The loop variable j is not used in the body: the same call is
            # issued on each of the four iterations.
            for j in range(0, 4):
                T.evaluate(
                    T.intrin_test(
                        sub_B.data,
                        sub_B.elem_offset,
                        sub_B.strides[0],
                        sub_B.strides[1],
                        sub_B.shape[0],
                        sub_B.shape[1],
                        dtype="handle",
                    ))
def transformed_recursive_match(a: T.handle, b: T.handle) -> None:
    # Direct-access counterpart of `recursive_match`: two nested blocks tile
    # the last two axes of A and B into 16x16 and then 4x4 regions, with all
    # accesses written against A and B themselves rather than nested
    # sub-buffers.
    # NOTE(review): presumably the expected result of lowering
    # `recursive_match` -- confirm against the test that compares the two.
    A = T.match_buffer(a, (64, 64, 64))
    B = T.match_buffer(b, (64, 64, 64))
    for i, j, k in T.grid(64, 4, 4):
        with T.block():
            T.reads([])
            # Outer block: one 16x16 tile of plane i in each buffer.
            T.writes([
                A[i, j * 16:j * 16 + 16, k * 16:k * 16 + 16],
                B[i, j * 16:j * 16 + 16, k * 16:k * 16 + 16],
            ])
            for jj, kk in T.grid(4, 4):
                with T.block():
                    T.reads([])
                    # Inner block: one 4x4 tile inside the outer 16x16 tile.
                    T.writes([
                        A[i, j * 16 + jj * 4:j * 16 + jj * 4 + 4, k * 16 + kk * 4:k * 16 + kk * 4 + 4, ],
                        B[i, j * 16 + jj * 4:j * 16 + jj * 4 + 4, k * 16 + kk * 4:k * 16 + kk * 4 + 4, ],
                    ])
                    # Offset of A[i, j * 16 + jj * 4, k * 16 + kk * 4] for a
                    # compact (64, 64, 64) layout:
                    # i * 4096 + (j * 16 + jj * 4) * 64 + k * 16 + kk * 4,
                    # followed by strides (64, 1) and tile shape (4, 4).
                    T.evaluate(
                        T.intrin_test(
                            A.data,
                            i * 4096 + j * 1024 + jj * 256 + k * 16 + kk * 4,
                            64,
                            1,
                            4,
                            4,
                            dtype="handle",
                        ))
                    # Element-wise store of 1 into the 4x4 tile of B.
                    for jjj, kkk in T.grid(4, 4):
                        B[i, j * 16 + jj * 4 + jjj, k * 16 + kk * 4 + kkk] = 1
def recursive_match(a: T.handle, b: T.handle) -> None:
    # Fixture with two levels of `T.match_buffer`: a 16x16 sub-buffer is
    # matched from each of A and B in the outer block, and a 4x4 sub-buffer
    # is matched from those sub-buffers in the inner block.
    # The A side is consumed through `T.intrin_test` (buffer metadata), the
    # B side through element-wise stores.
    # Blocks use the argument-less `T.block()` form, consistent with
    # `transformed_recursive_match` and most other functions in this file.
    A = T.match_buffer(a, (64, 64, 64))
    B = T.match_buffer(b, (64, 64, 64))
    for i, j, k in T.grid(64, 4, 4):
        with T.block():
            T.reads([])
            T.writes(
                [
                    A[i, j * 16 : j * 16 + 16, k * 16 : k * 16 + 16],
                    B[i, j * 16 : j * 16 + 16, k * 16 : k * 16 + 16],
                ]
            )
            # Stride variables bound by the match_buffer of sub_A below.
            As_0 = T.var("int32")
            As_1 = T.var("int32")
            sub_A = T.match_buffer(
                A[i, j * 16 : j * 16 + 16, k * 16 : k * 16 + 16],
                (16, 16),
                strides=[As_0, As_1],
                offset_factor=1,
            )
            sub_B = T.match_buffer(
                B[i, j * 16 : j * 16 + 16, k * 16 : k * 16 + 16],
                (16, 16),
                offset_factor=1,
            )
            for jj, kk in T.grid(4, 4):
                with T.block():
                    T.reads([])
                    T.writes(
                        [
                            sub_A[jj * 4 : jj * 4 + 4, kk * 4 : kk * 4 + 4],
                            sub_B[jj * 4 : jj * 4 + 4, kk * 4 : kk * 4 + 4],
                        ]
                    )
                    # Stride variables bound by the match_buffer of
                    # sub_sub_A below.
                    Ass_0 = T.var("int32")
                    Ass_1 = T.var("int32")
                    # Second-level match: regions are taken from the
                    # first-level sub-buffers, not from A / B directly.
                    sub_sub_A = T.match_buffer(
                        sub_A[jj * 4 : jj * 4 + 4, kk * 4 : kk * 4 + 4],
                        (4, 4),
                        strides=[Ass_0, Ass_1],
                        offset_factor=1,
                    )
                    sub_sub_B = T.match_buffer(
                        sub_B[jj * 4 : jj * 4 + 4, kk * 4 : kk * 4 + 4],
                        (4, 4),
                        offset_factor=1,
                    )
                    T.evaluate(
                        T.intrin_test(
                            sub_sub_A.data,
                            sub_sub_A.elem_offset,
                            sub_sub_A.strides[0],
                            sub_sub_A.strides[1],
                            sub_sub_A.shape[0],
                            sub_sub_A.shape[1],
                            dtype="handle",
                        )
                    )
                    # Element-wise store of 1 into the 4x4 tile of B via the
                    # innermost sub-buffer.
                    for jjj, kkk in T.grid(4, 4):
                        sub_sub_B[jjj, kkk] = 1
def high_dim_opaque_access_with_source_strides(a: T.handle) -> None:
    # Fixture in which the source buffer A is declared with explicit,
    # non-compact strides [2576, 80, 1] and a 2-D (16, 16) sub-buffer is
    # matched from a 3-D region whose leading axis is the single index i.
    # The sub-buffer is only consumed through `T.intrin_test`, which
    # receives its data pointer, element offset, strides and shape.
    # The block uses the argument-less `T.block()` form, consistent with
    # `transformed_high_dim_opaque_access_with_source_strides` and most
    # other functions in this file.
    A = T.match_buffer(a, (16, 32, 64), strides=[2576, 80, 1])
    for i, j, k in T.grid(16, 2, 4):
        with T.block():
            # Stride variables bound by the match_buffer of sub_A below.
            As_0 = T.var("int32")
            As_1 = T.var("int32")
            T.reads([])
            T.writes(A[i, j * 16 : j * 16 + 16, k * 16 : k * 16 + 16])
            sub_A = T.match_buffer(
                A[i, j * 16 : j * 16 + 16, k * 16 : k * 16 + 16],
                (16, 16),
                strides=[As_0, As_1],
                offset_factor=1,
            )
            T.evaluate(
                T.intrin_test(
                    sub_A.data,
                    sub_A.elem_offset,
                    sub_A.strides[0],
                    sub_A.strides[1],
                    sub_A.shape[0],
                    sub_A.shape[1],
                    dtype="handle",
                )
            )
def opaque_access(a: T.handle, b: T.handle) -> None:
    # Fixture with two loop nests, each matching a sub-buffer from a region
    # and passing only its metadata (data pointer, element offset, first two
    # strides, first two shape extents) to `T.intrin_test`.
    #   * A: 3-D (16, 1, 16) sub-buffer with literal strides [8192, 128, 1].
    #   * B: 2-D (32, 8) sub-buffer with stride variables Bs_0 / Bs_1.
    # Blocks use the argument-less `T.block()` form, consistent with
    # `transformed_opaque_access` and most other functions in this file.
    A = T.match_buffer(a, (32, 64, 128))
    B = T.match_buffer(b, (64, 64, 64))
    for i, j, k in T.grid(2, 64, 8):
        with T.block():
            T.reads([])
            T.writes(A[i * 16 : i * 16 + 16, j, k * 16 : k * 16 + 16])
            # The strides given here equal the compact strides of a
            # (32, 64, 128) buffer: 64 * 128, 128, 1.
            sub_A = T.match_buffer(
                A[i * 16 : i * 16 + 16, j, k * 16 : k * 16 + 16],
                (16, 1, 16),
                strides=[8192, 128, 1],
                offset_factor=1,
            )
            # Only the first two strides and extents are forwarded, i.e.
            # (8192, 128) and (16, 1) for this sub-buffer.
            T.evaluate(
                T.intrin_test(
                    sub_A.data,
                    sub_A.elem_offset,
                    sub_A.strides[0],
                    sub_A.strides[1],
                    sub_A.shape[0],
                    sub_A.shape[1],
                    dtype="handle",
                )
            )
    for i, j, k in T.grid(64, 2, 8):
        with T.block():
            # Stride variables bound by the match_buffer of sub_B below.
            Bs_0 = T.var("int32")
            Bs_1 = T.var("int32")
            T.reads([])
            T.writes(B[i, j * 32 : j * 32 + 32, k * 8 : k * 8 + 8])
            sub_B = T.match_buffer(
                B[i, j * 32 : j * 32 + 32, k * 8 : k * 8 + 8],
                (32, 8),
                strides=[Bs_0, Bs_1],
                offset_factor=1,
            )
            T.evaluate(
                T.intrin_test(
                    sub_B.data,
                    sub_B.elem_offset,
                    sub_B.strides[0],
                    sub_B.strides[1],
                    sub_B.shape[0],
                    sub_B.shape[1],
                    dtype="handle",
                )
            )
def transformed_high_dim_opaque_access(a: T.handle) -> None:
    # Direct-access fixture: for each (i, j, k) a block declares a write to
    # a 16x16 tile of plane i of A and calls `T.intrin_test` on `A.data`
    # with an explicit offset, strides and tile shape.
    # NOTE(review): presumably the expected lowering of a
    # `high_dim_opaque_access` fixture that is not in this part of the
    # file -- confirm.
    A = T.match_buffer(a, (16, 32, 64))
    for i, j, k in T.grid(16, 2, 4):
        with T.block():
            T.reads([])
            T.writes(A[i, j * 16:j * 16 + 16, k * 16:k * 16 + 16])
            # Offset of A[i, j * 16, k * 16] for a compact (16, 32, 64)
            # layout: i * 2048 + j * 16 * 64 + k * 16, followed by
            # strides (64, 1) and tile shape (16, 16).
            T.evaluate(
                T.intrin_test(
                    A.data,
                    i * 2048 + j * 1024 + k * 16,
                    64,
                    1,
                    16,
                    16,
                    dtype="handle",
                ))
def transformed_high_dim_opaque_access_with_source_strides(
        a: T.handle) -> None:
    # Direct-access counterpart of
    # `high_dim_opaque_access_with_source_strides`: same buffer declaration
    # (explicit strides [2576, 80, 1]), loop nest and write region, but
    # `T.intrin_test` is called on `A.data` with literal arguments.
    # NOTE(review): presumably the expected result of lowering that
    # function -- confirm against the test that compares the two.
    A = T.match_buffer(a, (16, 32, 64), strides=[2576, 80, 1])
    for i, j, k in T.grid(16, 2, 4):
        with T.block():
            T.reads([])
            T.writes(A[i, j * 16:j * 16 + 16, k * 16:k * 16 + 16])
            # Offset of A[i, j * 16, k * 16] using the declared strides:
            # i * 2576 + j * 16 * 80 + k * 16 * 1, followed by the last two
            # declared strides (80, 1) and tile shape (16, 16).
            T.evaluate(
                T.intrin_test(
                    A.data,
                    i * 2576 + j * 1280 + k * 16,
                    80,
                    1,
                    16,
                    16,
                    dtype="handle",
                ))
def transformed_rank0_buffer(a: T.handle, b: T.handle) -> None:
    # Direct-access counterpart of `rank0_buffer`: each block stores 1 into
    # A[i, j] and calls `T.intrin_test` on `B.data` with the offset of
    # B[i, j], without the rank-0 sub-buffers used in `rank0_buffer`.
    # NOTE(review): presumably the expected result of lowering
    # `rank0_buffer` -- confirm against the test that compares the two.
    A = T.match_buffer(a, (8, 8))
    B = T.match_buffer(b, (8, 8))
    for i, j in T.grid(8, 8):
        with T.block():
            T.reads([])
            T.writes([A[i, j], B[i, j]])
            A[i, j] = 1
            # Offset of B[i, j] for a compact (8, 8) layout is i * 8 + j.
            # The four zeros are the same literal arguments that
            # `rank0_buffer` passes after the element offset.
            T.evaluate(
                T.intrin_test(
                    B.data,
                    i * 8 + j,
                    0,
                    0,
                    0,
                    0,
                    dtype="handle",
                ))
def rank0_buffer(a: T.handle, b: T.handle) -> None:
    # Fixture that matches rank-0 (shape `()`) sub-buffers from single
    # elements A[i, j] and B[i, j] of two (8, 8) buffers.
    # sub_A is written through the empty-tuple index; sub_B is only
    # consumed through `T.intrin_test`, which receives its data pointer
    # and element offset.
    A = T.match_buffer(a, (8, 8))
    B = T.match_buffer(b, (8, 8))
    for i, j in T.grid(8, 8):
        with T.block():
            T.reads([])
            T.writes([A[i, j], B[i, j]])
            sub_A = T.match_buffer(A[i, j], (), offset_factor=1)
            sub_B = T.match_buffer(B[i, j], (), offset_factor=1)
            # A rank-0 buffer has no axes, so it is indexed with `()`.
            sub_A[()] = 1
            # Literal zeros are passed in the stride / shape argument
            # positions; no strides or shape of sub_B are read here.
            T.evaluate(
                T.intrin_test(
                    sub_B.data,
                    sub_B.elem_offset,
                    0,
                    0,
                    0,
                    0,
                    dtype="handle",
                ))
def transformed_symbolic_match(a: T.handle, b: T.handle, n: T.int32, m: T.int32) -> None:
    # Direct-access counterpart of `symbolic_match`: same symbolic buffer
    # shapes, loop and declared write regions, with the stores and the
    # `T.intrin_test` arguments written against A and B directly.
    # NOTE(review): presumably the expected result of lowering
    # `symbolic_match` -- confirm against the test that compares the two.
    A = T.match_buffer(a, (n * m, m))
    B = T.match_buffer(b, (n * 2, m * 4))
    for i in range(0, n):
        with T.block():
            T.reads([])
            # NOTE(review): the declared A write region spans rows
            # i * m : i * m + n, while the stores below touch rows
            # i * m : i * m + m. `symbolic_match` declares the same region,
            # so this may be intentional -- confirm before changing either.
            T.writes([A[i * m:i * m + n, 0:m], B[i * n:i * n + 2, 0:m * 4]])
            for ii, jj in T.grid(m, m):
                A[i * m + ii, jj] = 1
            # The loop variable j is not used in the body: the same call is
            # issued on each of the four iterations.
            for j in range(0, 4):
                # Offset of B[i * n, 0] for a compact (n * 2, m * 4) layout
                # is i * n * (m * 4); it is followed by strides (m * 4, 1)
                # and region shape (2, m * 4).
                T.evaluate(
                    T.intrin_test(
                        B.data,
                        i * n * (m * 4),
                        m * 4,
                        1,
                        2,
                        m * 4,
                        dtype="handle",
                    ))