Example #1
def unified_element_wise_thread_x(a: T.handle, b: T.handle,
                                  c: T.handle) -> None:
    thread_x = T.env_thread("threadIdx.x")
    block_x = T.env_thread("blockIdx.x")
    A = T.match_buffer(a, [128, 128])
    B = T.match_buffer(b, [128, 128])
    C = T.match_buffer(c, [128, 128])
    T.launch_thread(block_x, 128)
    with T.launch_thread(thread_x, 4):
        for j0_1 in T.serial(0, 32):
            T.store(
                B.data,
                block_x * 128 + thread_x * 32 + j0_1,
                T.load("float32", A.data, block_x * 128 + thread_x * 32 + j0_1)
                * 2.0,
                True,
            )
    T.launch_thread(thread_x, 4)
    for j1_1 in T.serial(0, 32):
        T.store(
            C.data,
            block_x * 128 + thread_x * 32 + j1_1,
            T.load("float32", A.data, block_x * 128 + thread_x * 32 + j1_1) +
            1.0,
            True,
        )
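Viewed as plain array math, the two thread launches above implement an element-wise map over the flattened 128x128 buffers. A NumPy sketch of the semantics (function name illustrative, not from the original):

import numpy as np

def element_wise_ref(A: np.ndarray):
    # block_x * 128 + thread_x * 32 + j0_1 enumerates all 128 * 128 elements,
    # so the two launches compute B = A * 2 and C = A + 1 element-wise.
    return A * 2.0, A + 1.0

A = np.random.rand(128, 128).astype("float32")
B, C = element_wise_ref(A)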
Example #2
def unified_element_wise_kernels_with_different_size(a: T.handle, b: T.handle,
                                                     c: T.handle,
                                                     d: T.handle) -> None:
    block_x = T.env_thread("blockIdx.x")
    thread_x = T.env_thread("threadIdx.x")
    block_x_1 = T.env_thread("blockIdx.x")
    thread_x_1 = T.env_thread("threadIdx.x")
    A = T.match_buffer(a, [128, 128])
    B = T.match_buffer(b, [128, 128])
    C = T.match_buffer(c, [256, 256])
    D = T.match_buffer(d, [256, 256])
    with T.launch_thread(block_x, 128):
        T.launch_thread(thread_x, 128)
        T.store(
            B.data,
            block_x * 128 + thread_x,
            T.load("float32", A.data, block_x * 128 + thread_x) * 2.0,
            True,
        )
    T.launch_thread(block_x_1, 256)
    T.launch_thread(thread_x_1, 256)
    T.store(
        D.data,
        block_x_1 * 256 + thread_x_1,
        T.load("float32", C.data, block_x_1 * 256 + thread_x_1) + 1.0,
        True,
    )
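The unified_* functions in Examples #1 and #2 have the shape of UnifyThreadBinding output: within each kernel, all bindings of one physical thread axis share a single var. A minimal, untested sketch (assuming tvm.tir.transform.UnifyThreadBinding and TVMScript are available; the `before` function is illustrative, compare Examples #20 and #25 for the actual pre-unified forms):

import tvm
from tvm.script import tir as T

@T.prim_func
def before(a: T.handle, b: T.handle, c: T.handle) -> None:
    i = T.env_thread("blockIdx.x")
    j0 = T.env_thread("threadIdx.x")
    j1 = T.env_thread("threadIdx.x")  # second binding of the same physical axis
    A = T.match_buffer(a, [4, 4])
    B = T.match_buffer(b, [4, 4])
    C = T.match_buffer(c, [4, 4])
    T.launch_thread(i, 4)
    with T.launch_thread(j0, 4):
        B[i, j0] = A[i, j0] * 2.0
    T.launch_thread(j1, 4)
    C[i, j1] = A[i, j1] + 1.0

# j0 and j1 bind the same thread axis with equal extents, so the pass can
# merge them into one var, as in unified_element_wise_thread_x above.
mod = tvm.tir.transform.UnifyThreadBinding()(tvm.IRModule({"main": before}))
print(mod.script())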
Example #3
 def tvmgen_default_fused_cast_subtract_fixed_point_multiply_add_clip_cast_cast(
         placeholder: T.handle, placeholder_1: T.handle,
         T_cast: T.handle) -> None:
     # function attr dict
     T.func_attr({
         "global_symbol":
         "tvmgen_default_fused_cast_subtract_fixed_point_multiply_add_clip_cast_cast",
         "tir.noalias": True
     })
     placeholder_2 = T.match_buffer(placeholder, [1, 75, 75, 64],
                                    dtype="uint8")
     placeholder_3 = T.match_buffer(placeholder_1, [64], dtype="int32")
     T_cast_1 = T.match_buffer(T_cast, [1, 75, 75, 64], dtype="int16")
     # body
     for ax0_ax1_fused, ax2, ax3_outer, ax3_inner in T.grid(75, 75, 4, 16):
         T.store(
             T_cast_1.data,
             ax0_ax1_fused * 4800 + ax2 * 64 + ax3_outer * 16 + ax3_inner,
             T.cast(
                 T.cast(
                     T.max(
                         T.min(
                             T.q_multiply_shift(
                                 T.cast(
                                     T.load(
                                         "uint8", placeholder_2.data,
                                         ax0_ax1_fused * 4800 + ax2 * 64 +
                                         ax3_outer * 16 + ax3_inner),
                                     "int32") - 94,
                                 1843157232,
                                 31,
                                 1,
                                 dtype="int32") +
                             T.load("int32", placeholder_3.data,
                                    ax3_outer * 16 + ax3_inner),
                             255),
                         0),
                     "uint8"),
                 "int16"),
             True)
Example #4
 def main(a: T.handle, b: T.handle) -> None:
     # function attr dict
     T.func_attr({"global_symbol": "main", "tir.noalias": True})
     # var definition
     threadIdx_x = T.env_thread("threadIdx.x")
     threadIdx_y = T.env_thread("threadIdx.y")
     blockIdx_x = T.env_thread("blockIdx.x")
     blockIdx_y = T.env_thread("blockIdx.y")
     blockIdx_z = T.env_thread("blockIdx.z")
     A = T.match_buffer(a, [14, 14, 256, 256], dtype="float32")
     B = T.match_buffer(b, [14, 14, 512, 256], dtype="float32")
     # body
     T.launch_thread(blockIdx_z, 196)
     B_local = T.allocate([6400000], "float32", "local")
     Apad_shared = T.allocate([512], "float32", "shared")
     Apad_shared_local = T.allocate([8], "float32", "local")
     T.launch_thread(blockIdx_y, 8)
     T.launch_thread(blockIdx_x, 4)
     T.launch_thread(threadIdx_y, 8)
     T.launch_thread(threadIdx_x, 8)
     for ff_c_init, nn_c_init in T.grid(8, 8):
         T.store(B_local, ff_c_init * 8 + nn_c_init, T.float32(0), True)
     for rc_outer, ry, rx in T.grid(32, 3, 3):
         for ax3_inner_outer in T.serial(0, 2):
             T.store(Apad_shared, T.ramp(threadIdx_y * 64 + threadIdx_x * 8 + ax3_inner_outer * 4, 1, 4), T.if_then_else(1 <= blockIdx_z // 14 + ry and blockIdx_z // 14 + ry < 15 and 1 <= rx + blockIdx_z % 14 and rx + blockIdx_z % 14 < 15, T.load("float32x4", A.data, T.ramp(ry * 917504 + blockIdx_z * 65536 + rx * 65536 + rc_outer * 2048 + threadIdx_y * 256 + blockIdx_x * 64 + threadIdx_x * 8 + ax3_inner_outer * 4 - 983040, 1, 4), T.broadcast(True, 4)), T.broadcast(T.float32(0), 4), dtype="float32x4"), T.broadcast(True, 4))
         for rc_inner in T.serial(0, 8):
             for ax3 in T.serial(0, 8):
                 T.store(Apad_shared_local, ax3, T.load("float32", Apad_shared, rc_inner * 64 + threadIdx_x * 8 + ax3), True)
             for ff_c, nn_c in T.grid(8, 8):
                 T.store(B_local, ff_c * 8 + nn_c, T.load("float32", B_local, ff_c * 8 + nn_c) + T.load("float32", Apad_shared_local, nn_c), True)
     for ff_inner_inner_inner, nn_inner_inner_inner in T.grid(8, 8):
         T.store(B.data, blockIdx_z * 131072 + blockIdx_y * 16384 + threadIdx_y * 2048 + ff_inner_inner_inner * 256 + blockIdx_x * 64 + threadIdx_x * 8 + nn_inner_inner_inner, T.load("float32", B_local, ff_inner_inner_inner * 8 + nn_inner_inner_inner), True)
Example #5
def compacted_complex_func(a: T.handle, c: T.handle, n: T.int32) -> None:
    A = T.match_buffer(a, (8, 8), "float32")
    C = T.match_buffer(c, (8, 8), "float32")
    for i in range(0, 8):
        with T.block():
            T.reads(A[0, 8])
            T.writes(C[0, 8])
            B = T.alloc_buffer((1, 8), "float32")
            for j in range(0, 4):
                with T.block():
                    D = T.alloc_buffer((6, 1), "float32")
                    T.reads(A[i, j])
                    T.writes(B[0, j])
                    for k in range(4, 8):
                        D[k - 2, 0] = 1.0
                    for k in range(2, 4):
                        T.store(B.data, j, A[i, j] + D[k - 2, 0])
            for j in range(3, 5):
                with T.block():
                    T.reads(B[0, j])
                    T.writes(C[i, j])
                    C[i, j] = B[0, j]
            for j in range(6, 8):
                with T.block():
                    T.reads(B[0, j])
                    T.writes(C[i, j])
                    C[i, j] = B[0, j]
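The compacted_* naming indicates expected output of buffer compaction. A small, untested sketch (assuming tvm.tir.transform.CompactBufferAllocation; the `before` function is illustrative) of the pass shrinking an over-allocated intermediate:

import tvm
from tvm.script import tir as T

@T.prim_func
def before(a: T.handle, c: T.handle) -> None:
    A = T.match_buffer(a, (8, 8), "float32")
    C = T.match_buffer(c, (8, 8), "float32")
    for i in range(0, 8):
        with T.block():
            T.reads(A[i, 0:8])
            T.writes(C[i, 0:8])
            B = T.alloc_buffer((8, 8), "float32")  # only row 0 is ever touched
            for j in range(0, 8):
                B[0, j] = A[i, j] + 1.0
            for j in range(0, 8):
                C[i, j] = B[0, j] * 2.0

mod = tvm.tir.transform.CompactBufferAllocation()(tvm.IRModule({"main": before}))
print(mod.script())  # B should be compacted to shape (1, 8)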
Example #6
def buffer_opaque_access(b: T.handle, c: T.handle) -> None:
    B = T.match_buffer(b, [16, 16], "float32")
    C = T.match_buffer(c, [16, 16], "float32")

    with T.block([]):
        T.reads([])
        T.writes(B[0:16, 0:16])
        A = T.allocate([256], "float32", "global")
        for i, j in T.grid(16, 16):
            T.store(A, i * 16 + j, 1)
        for i in range(0, 16):
            for j in range(0, 16):
                T.evaluate(T.load("float32", A, i * 16 + j))
            for j in range(0, 16):
                T.evaluate(
                    T.tvm_fill_fragment(B.data,
                                        16,
                                        16,
                                        16,
                                        0,
                                        T.float32(0),
                                        dtype="handle"))

    for i, j in T.grid(16, 16):
        with T.block([16, 16]) as [vi, vj]:
            T.bind(vi, i)
            T.bind(vj, j)
            C[vi, vj] = B[vi, vj]
Example #7
 def tvmgen_default_fused_cast_subtract(placeholder_2: T.handle,
                                        placeholder_3: T.handle,
                                        T_subtract: T.handle) -> None:
     # function attr dict
     T.func_attr({
         "global_symbol": "tvmgen_default_fused_cast_subtract",
         "tir.noalias": True
     })
     placeholder_4 = T.match_buffer(placeholder_2, [1, 224, 224, 3],
                                    dtype="uint8",
                                    elem_offset=0,
                                    align=128,
                                    offset_factor=1)
     placeholder_5 = T.match_buffer(placeholder_3, [],
                                    dtype="int16",
                                    elem_offset=0,
                                    align=128,
                                    offset_factor=1)
     T_subtract_1 = T.match_buffer(T_subtract, [1, 224, 224, 3],
                                   dtype="int16",
                                   elem_offset=0,
                                   align=128,
                                   offset_factor=1)
     # body
     for ax0_ax1_fused_1 in T.serial(0, 224):
         for ax2_1, ax3_inner_1 in T.grid(224, 3):
             T.store(T_subtract_1.data, (((ax0_ax1_fused_1 * 672) +
                                          (ax2_1 * 3)) + ax3_inner_1),
                     (T.cast(
                         T.load("uint8", placeholder_4.data,
                                (((ax0_ax1_fused_1 * 672) +
                                  (ax2_1 * 3)) + ax3_inner_1)), "int16") -
                      T.load("int16", placeholder_5.data, 0)), True)
Example #8
 def threadpool_nested_parallel_loop(
         A: T.Buffer[(4, 4), "float32"], B: T.Buffer[(4, 4),
                                                     "float32"]) -> None:
     T.func_attr({"global_symbol": "main", "tir.noalias": True})
     for i in T.parallel(4):
         for j in T.parallel(4):
             T.store(B.data, i * 4 + j,
                     T.load("float32", A.data, i * 4 + j) * 2.0)
Example #9
def opaque_access_store(a: T.handle, c: T.handle) -> None:
    A = T.match_buffer(a, (128, 128))
    B = T.alloc_buffer((128, 128))
    C = T.match_buffer(c, (128, 128))
    with T.block([128, 128], "B") as [vi, vj]:
        B[vi, vj] = A[vi, vj] * 2.0
    with T.block([128, 128], "C") as [vi, vj]:
        T.reads(B[0:128, 0:128])
        T.writes(C[0:128, 0:128])
        T.store(C.data, vi * 128 + vj, B[vi, vj] + 1.0)
        C[vi, vj] = T.load("float32", B.data, vi * 16 + vj) + 1.0
Example #10
def unschedulable_func(a: T.handle, c: T.handle) -> None:
    A = T.match_buffer(a, (16, 16), "float32")
    C = T.match_buffer(c, (16, 16), "float32")
    for i in range(0, 16):
        with T.block():
            T.reads(A[i, 0:16])
            T.writes(C[i, 0:16])
            B = T.alloc_buffer((16, 16), "float32")
            for j in range(0, 16):
                T.store(B.data, i * 16 + j, A[i, j] + 1.0)
            for j in range(0, 16):
                C[i, j] = B[i, j] * 2.0
Example #11
def partitioned_concat(a: T.handle, b: T.handle, c: T.handle) -> None:
    T.func_attr({
        "from_legacy_te_schedule": True,
        "global_symbol": "main",
        "tir.noalias": True
    })
    A = T.match_buffer(a, [16], dtype="float32")
    B = T.match_buffer(b, [16], dtype="float32")
    C = T.match_buffer(c, [32], dtype="float32")
    for i in T.serial(0, 16):
        T.store(C.data, i, T.load("float32", A.data, i), True)
    for i in T.serial(0, 16):
        T.store(C.data, i + 16, T.load("float32", B.data, i), True)
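The two loops implement a 1-D concatenation; in NumPy terms (names illustrative):

import numpy as np

a = np.random.rand(16).astype("float32")
b = np.random.rand(16).astype("float32")
c = np.concatenate([a, b])  # C[0:16] = A, C[16:32] = B, as in the two loops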
Example #12
def opaque_access_reorder(a: T.handle, b: T.handle) -> None:
    A = T.match_buffer(a, [16, 16], "float32")
    B = T.match_buffer(b, [16, 16], "float32")
    for j, i in T.grid(16, 16):
        with T.block("A"):
            vi, vj = T.axis.remap("SS", [i, j])
            T.reads([])
            T.writes([A[0:16, 0:16]])
            T.store(A.data, vi * 16 + vj, 1)
    for j, i in T.grid(16, 16):
        with T.block("B"):
            vi, vj = T.axis.remap("SS", [i, j])
            T.reads([])
            T.writes([B[0:16, 0:16]])
            T.evaluate(T.tvm_fill_fragment(B.data, 16, 16, 16, 0, vi * 16 + vj, dtype="handle"))
Example #13
def opaque_access_store(a: T.handle, c: T.handle) -> None:
    A = T.match_buffer(a, (128, 128))
    B = T.alloc_buffer((128, 128))
    C = T.match_buffer(c, (128, 128))
    for i, j in T.grid(128, 128):
        with T.block("B"):
            vi, vj = T.axis.remap("SS", [i, j])
            B[vi, vj] = A[vi, vj] * 2.0
    for i, j in T.grid(128, 128):
        with T.block("C"):
            vi, vj = T.axis.remap("SS", [i, j])
            T.reads(B[0:128, 0:128])
            T.writes(C[0:128, 0:128])
            T.store(C.data, vi * 128 + vj, B[vi, vj] + 1.0)
            C[vi, vj] = T.load("float32", B.data, vi * 16 + vj) + 1.0
Example #14
def unified_element_wise_vthread_x(a: T.handle, b: T.handle) -> None:
    vthread_x = T.env_thread("vthread.x")
    thread_x = T.env_thread("threadIdx.x")
    A = T.match_buffer(a, [128, 128])
    B = T.match_buffer(b, [128, 128])
    T.launch_thread(vthread_x, 2)
    T.launch_thread(thread_x, 64)
    T.launch_thread(vthread_x, 2)
    for j_1 in T.serial(0, 64):
        T.store(
            B.data,
            vthread_x * 8256 + thread_x * 128 + j_1,
            T.load("float32", A.data, vthread_x * 8256 + thread_x * 128 + j_1)
            * 2.0,
            True,
        )
Example #15
def element_wise_two_thread_x_in_same_kernel_not_equal(a: T.handle,
                                                       b: T.handle,
                                                       c: T.handle) -> None:
    i = T.env_thread("blockIdx.x")
    j0 = T.env_thread("threadIdx.x")
    j1 = T.env_thread("threadIdx.x")
    A = T.match_buffer(a, [128, 128])
    B = T.match_buffer(b, [128, 128])
    C = T.match_buffer(c, [128, 64])
    T.launch_thread(i, 128)
    with T.launch_thread(j0, 128):
        T.store(B.data, i * 128 + j0,
                T.load("float32", A.data, i * 128 + j0) * 2.0, True)
    T.launch_thread(j1, 64)
    T.store(C.data, i * 64 + j1,
            T.load("float32", A.data, i * 128 + j1) + 1.0, True)
Example #16
def opaque_access_split(a: T.handle, b: T.handle) -> None:
    A = T.match_buffer(a, (16, 16))
    B = T.match_buffer(b, (16, 16))
    for i, j0, j1 in T.grid(16, 4, 4):
        with T.block("A"):
            vi = T.axis.S(16, i)
            vj = T.axis.S(16, j0 * 4 + j1)
            T.reads([])
            T.writes([A[0:16, 0:16]])
            T.store(A.data, ((vi * 16) + vj), 1, 1)
    for i, j0, j1 in T.grid(16, 4, 4):
        with T.block("B"):
            vi = T.axis.S(16, i)
            vj = T.axis.S(16, j0 * 4 + j1)
            T.reads([])
            T.writes([B[0:16, 0:16]])
            T.evaluate(T.tvm_fill_fragment(B.data, 16, 16, 16, 0, ((vi * 16) + vj), dtype="handle"))
Example #17
def compacted_opaque_access_annotated_func(a: T.handle) -> None:
    A = T.match_buffer(a, (1024, ), "float32")
    with T.block():
        B = T.alloc_buffer((1024, ), dtype="float32")
        C = T.alloc_buffer((520, ), dtype="float32")
        for i in range(0, 512):
            with T.block():
                # no annotation: an opaque access is assumed to cover the full region
                T.reads([])
                T.writes([])
                T.store(B.data, i, A[i], True)
            with T.block():
                # opaque accesses are treated as touching only the annotated
                # regions, even if they are not compatible with the actual
                # buffer accesses.
                T.reads([B[i]])
                T.writes([C[i:i + 9]])
                T.store(C.data, i, T.load("float32", B.data, i))
Example #18
def opaque_access_fused(a: T.handle, b: T.handle) -> None:
    A = T.match_buffer(a, [16, 16])
    B = T.match_buffer(b, [16, 16])
    for i_j_fused in T.serial(0, 256):
        with T.block("A"):
            vi = T.axis.S(16, T.floordiv(i_j_fused, 16))
            vj = T.axis.S(16, T.floormod(i_j_fused, 16))
            T.reads([])
            T.writes([A[0:16, 0:16]])
            T.store(A.data, ((vi * 16) + vj), 1, 1)
    for i_j_fused in T.serial(0, 256):
        with T.block("B"):
            vi = T.axis.S(16, T.floordiv(i_j_fused, 16))
            vj = T.axis.S(16, T.floormod(i_j_fused, 16))
            T.reads([])
            T.writes([B[0:16, 0:16]])
            T.evaluate(T.tvm_fill_fragment(B.data, 16, 16, 16, 0, ((vi * 16) + vj), dtype="handle"))
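The reordered, split, and fused loop nests in opaque_access_reorder, opaque_access_split, and opaque_access_fused are the shapes produced by TensorIR schedule primitives. A minimal sketch on a plain block (the `two_d_fill` function is illustrative; tir.Schedule and its fuse/split/reorder primitives are the real API):

import tvm
from tvm import tir
from tvm.script import tir as T

@T.prim_func
def two_d_fill(a: T.handle) -> None:
    A = T.match_buffer(a, (16, 16), "float32")
    for i, j in T.grid(16, 16):
        with T.block("A"):
            vi, vj = T.axis.remap("SS", [i, j])
            A[vi, vj] = 1.0

sch = tir.Schedule(two_d_fill)
i, j = sch.get_loops(sch.get_block("A"))
sch.fuse(i, j)  # one i_j_fused loop with floordiv/floormod bindings,
                # the shape seen in opaque_access_fused
print(sch.mod.script())
# sch.split(j, factors=[4, 4]) and sch.reorder(j, i) produce the
# opaque_access_split / opaque_access_reorder shapes analogously.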
Example #19
def element_wise_vthread_x(a: T.handle, b: T.handle) -> None:
    i_0 = T.env_thread("vthread.x")
    i_1 = T.env_thread("threadIdx.x")
    j_0 = T.env_thread("vthread.x")
    A = T.match_buffer(a, [128, 128])
    B = T.match_buffer(b, [128, 128])
    T.launch_thread(i_0, 2)
    T.launch_thread(i_1, 64)
    T.launch_thread(j_0, 2)
    for j_1 in T.serial(0, 64):
        T.store(
            B.data,
            i_0 * 8192 + i_1 * 128 + j_0 * 64 + j_1,
            T.load("float32", A.data, i_0 * 8192 + i_1 * 128 + j_0 * 64 + j_1)
            * 2.0,
            True,
        )
Example #20
def element_wise_kernels_with_different_size(a: T.handle, b: T.handle,
                                             c: T.handle, d: T.handle) -> None:
    i0 = T.env_thread("blockIdx.x")
    j0 = T.env_thread("threadIdx.x")
    i1 = T.env_thread("blockIdx.x")
    j1 = T.env_thread("threadIdx.x")
    A = T.match_buffer(a, [128, 128])
    B = T.match_buffer(b, [128, 128])
    C = T.match_buffer(c, [256, 256])
    D = T.match_buffer(d, [256, 256])
    with T.launch_thread(i0, 128):
        T.launch_thread(j0, 128)
        T.store(B.data, i0 * 128 + j0,
                T.load("float32", A.data, i0 * 128 + j0) * 2.0, True)
    T.launch_thread(i1, 256)
    T.launch_thread(j1, 256)
    T.store(D.data, i1 * 256 + j1,
            T.load("float32", C.data, i1 * 256 + j1) + 1.0, True)
Example #21
def opaque_access(a: T.handle, b: T.handle) -> None:
    A = T.match_buffer(a, [16, 16], "float32")
    B = T.match_buffer(b, [16, 16], "float32")
    with T.block([16, 16], "A") as [vi, vj]:
        T.reads([])
        T.writes([A[0:16, 0:16]])
        T.store(A.data, vi * 16 + vj, 1)
    with T.block([16, 16], "B") as [vi, vj]:
        T.reads([])
        T.writes([B[0:16, 0:16]])
        T.evaluate(
            T.tvm_fill_fragment(B.data,
                                16,
                                16,
                                16,
                                0,
                                vi * 16 + vj,
                                dtype="handle"))
Example #22
 def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast(
         placeholder_4: T.handle, placeholder_5: T.handle,
         placeholder_6: T.handle, T_cast_2: T.handle) -> None:
     # function attr dict
     T.func_attr({
         "global_symbol":
         "tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast",
         "tir.noalias": True
     })
     placeholder_7 = T.match_buffer(placeholder_4, [1, 75, 75, 64],
                                    dtype="int16")
     placeholder_8 = T.match_buffer(placeholder_5, [1, 1, 64, 64],
                                    dtype="int16")
     placeholder_9 = T.match_buffer(placeholder_6, [1, 1, 1, 64],
                                    dtype="int32")
     T_cast_3 = T.match_buffer(T_cast_2, [1, 75, 75, 64], dtype="int16")
     # body
     PaddedInput = T.allocate([360000], "int16", "global")
     for i0_i1_fused, i2, i3 in T.grid(75, 75, 64):
         T.store(
             PaddedInput, i0_i1_fused * 4800 + i2 * 64 + i3,
             T.load("int16", placeholder_7.data,
                    i0_i1_fused * 4800 + i2 * 64 + i3), True)
     for ax0_ax1_fused_ax2_fused in T.serial(0, 5625):
         Conv2dOutput = T.allocate([64], "int32", "global")
         for ff in T.serial(0, 64):
             T.store(Conv2dOutput, ff, 0, True)
             for rc in T.serial(0, 64):
                 T.store(
                     Conv2dOutput, ff,
                     T.load("int32", Conv2dOutput, ff) + T.cast(
                         T.load("int16", PaddedInput,
                                ax0_ax1_fused_ax2_fused * 64 + rc), "int32")
                     * T.cast(
                         T.load("int16", placeholder_8.data, rc * 64 + ff),
                         "int32"), True)
         for ax3_inner_1 in T.serial(0, 64):
             T.store(
                 T_cast_3.data, ax0_ax1_fused_ax2_fused * 64 + ax3_inner_1,
                 T.cast(
                     T.cast(
                         T.max(
                             T.min(
                                 T.q_multiply_shift(
                                     T.load("int32", Conv2dOutput,
                                            ax3_inner_1) +
                                     T.load("int32", placeholder_9.data,
                                            ax3_inner_1),
                                     1843106743,
                                     31,
                                     -6,
                                     dtype="int32"), 255), 0), "uint8"),
                     "int16"), True)
Example #23
def partitioned_concat_3(
    placeholder: T.Buffer[(1, 64, 28, 28), "int8"],
    placeholder_1: T.Buffer[(1, 32, 28, 28), "int8"],
    placeholder_2: T.Buffer[(1, 32, 28, 28), "int8"],
    T_concat: T.Buffer[(1, 128, 28, 28), "int8"],
) -> None:
    for i1, i2, i3 in T.grid(64, 28, 28):
        T.store(
            T_concat.data,
            i1 * 784 + i2 * 28 + i3,
            T.load("int8", placeholder.data, i1 * 784 + i2 * 28 + i3),
            True,
        )
    for i1, i2, i3 in T.grid(32, 28, 28):
        T.store(
            T_concat.data,
            i1 * 784 + i2 * 28 + i3 + 50176,
            T.load("int8", placeholder_1.data, i1 * 784 + i2 * 28 + i3),
            True,
        )
    for i1, i2, i3 in T.grid(32, 28, 28):
        T.store(
            T_concat.data,
            i1 * 784 + i2 * 28 + i3 + 75264,
            T.load("int8", placeholder_2.data, i1 * 784 + i2 * 28 + i3),
            True,
        )
Example #24
def concat_func_3(
    placeholder: T.Buffer[(1, 64, 28, 28), "int8"],
    placeholder_1: T.Buffer[(1, 32, 28, 28), "int8"],
    placeholder_2: T.Buffer[(1, 32, 28, 28), "int8"],
    T_concat: T.Buffer[(1, 128, 28, 28), "int8"],
) -> None:
    for i1 in T.serial(128, annotations={"pragma_loop_partition_hint": 1}):
        for i2, i3 in T.grid(28, 28):
            if 96 <= i1:
                T.store(
                    T_concat.data,
                    i1 * 784 + i2 * 28 + i3,
                    T.load("int8", placeholder_2.data,
                           i1 * 784 + i2 * 28 + i3 - 75264),
                    True,
                )
            if 64 <= i1 and i1 < 96:
                T.store(
                    T_concat.data,
                    i1 * 784 + i2 * 28 + i3,
                    T.load("int8", placeholder_1.data,
                           i1 * 784 + i2 * 28 + i3 - 50176),
                    True,
                )
            if i1 < 64:
                T.store(
                    T_concat.data,
                    i1 * 784 + i2 * 28 + i3,
                    T.load("int8", placeholder.data, i1 * 784 + i2 * 28 + i3),
                    True,
                )
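concat_func_3 is the pre-partitioning form of Example #23: LoopPartition uses the pragma_loop_partition_hint annotation plus the constant i1 bounds to split the loop into three condition-free copies. An untested sketch of invoking the pass on a smaller illustrative function (`hinted_copy` is not from the original; the pass and config key are real):

import tvm
from tvm.script import tir as T

@T.prim_func
def hinted_copy(a: T.Buffer[(16,), "float32"],
                c: T.Buffer[(32,), "float32"]) -> None:
    T.func_attr({"global_symbol": "main", "tir.noalias": True})
    for i in T.serial(32, annotations={"pragma_loop_partition_hint": 1}):
        if i < 16:
            c[i] = a[i]
        if 16 <= i:
            c[i] = a[i - 16] + 1.0

with tvm.transform.PassContext(
        config={"tir.LoopPartition": {"partition_const_loop": True}}):
    mod = tvm.tir.transform.LoopPartition()(tvm.IRModule({"main": hinted_copy}))
print(mod.script())  # the i loop should split into two condition-free halves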
Example #25
def element_wise_thread_x(a: T.handle, b: T.handle, c: T.handle) -> None:
    j1_0 = T.env_thread("threadIdx.x")
    j0_0 = T.env_thread("threadIdx.x")
    i = T.env_thread("blockIdx.x")
    A = T.match_buffer(a, [128, 128])
    B = T.match_buffer(b, [128, 128])
    C = T.match_buffer(c, [128, 128])
    T.launch_thread(i, 128)
    with T.launch_thread(j0_0, 4):
        for j0_1 in T.serial(0, 32):
            T.store(
                B.data,
                i * 128 + j0_0 * 32 + j0_1,
                T.load("float32", A.data, i * 128 + j0_0 * 32 + j0_1) * 2.0,
                True,
            )
    T.launch_thread(j1_0, 4)
    for j1_1 in T.serial(0, 32):
        T.store(
            C.data,
            i * 128 + j1_0 * 32 + j1_1,
            T.load("float32", A.data, i * 128 + j1_0 * 32 + j1_1) + 1.0,
            True,
        )
Example #26
 def tvmgen_default_fused_nn_max_pool2d_cast(placeholder_28: T.handle,
                                             T_cast_6: T.handle) -> None:
     # function attr dict
     T.func_attr({
         "global_symbol": "tvmgen_default_fused_nn_max_pool2d_cast",
         "tir.noalias": True
     })
     placeholder_29 = T.match_buffer(placeholder_28, [1, 112, 112, 64],
                                     dtype="uint8",
                                     elem_offset=0,
                                     align=128,
                                     offset_factor=1)
     T_cast_7 = T.match_buffer(T_cast_6, [1, 56, 56, 64],
                               dtype="int16",
                               elem_offset=0,
                               align=128,
                               offset_factor=1)
     # body
     tensor_2 = T.allocate([200704], "uint8", "global")
     for ax0_ax1_fused_4 in T.serial(0, 56):
         for ax2_4 in T.serial(0, 56):
             for ax3_init in T.serial(0, 64):
                 T.store(tensor_2, (((ax0_ax1_fused_4 * 3584) +
                                     (ax2_4 * 64)) + ax3_init), T.uint8(0),
                         True)
             for rv0_rv1_fused_1, ax3_2 in T.grid(9, 64):
                 T.store(
                     tensor_2,
                     (((ax0_ax1_fused_4 * 3584) + (ax2_4 * 64)) + ax3_2),
                     T.max(
                         T.load("uint8", tensor_2,
                                (((ax0_ax1_fused_4 * 3584) +
                                  (ax2_4 * 64)) + ax3_2)),
                         T.if_then_else(
                             ((((ax0_ax1_fused_4 * 2) +
                                T.floordiv(rv0_rv1_fused_1, 3)) < 112) and
                              (((ax2_4 * 2) +
                                T.floormod(rv0_rv1_fused_1, 3)) < 112)),
                             T.load("uint8", placeholder_29.data, (((
                                 ((ax0_ax1_fused_4 * 14336) +
                                  (T.floordiv(rv0_rv1_fused_1, 3) * 7168)) +
                                 (ax2_4 * 128)) + (T.floormod(
                                     rv0_rv1_fused_1, 3) * 64)) + ax3_2)),
                             T.uint8(0),
                             dtype="uint8")), True)
     for ax0_ax1_fused_5 in T.serial(0, 56):
         for ax2_5, ax3_3 in T.grid(56, 64):
             T.store(
                 T_cast_7.data,
                 (((ax0_ax1_fused_5 * 3584) + (ax2_5 * 64)) + ax3_3),
                 T.cast(
                     T.load("uint8", tensor_2, (((ax0_ax1_fused_5 * 3584) +
                                                 (ax2_5 * 64)) + ax3_3)),
                     "int16"), True)
Example #27
 def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast_1(placeholder_10: T.handle, placeholder_11: T.handle, placeholder_12: T.handle, T_cast_4: T.handle) -> None:
     # function attr dict
     T.func_attr({"global_symbol": "tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast_1", "tir.noalias": True})
     placeholder_13 = T.match_buffer(placeholder_10, [1, 75, 75, 64], dtype="int16")
     placeholder_14 = T.match_buffer(placeholder_11, [3, 3, 64, 64], dtype="int16")
     placeholder_15 = T.match_buffer(placeholder_12, [1, 1, 1, 64], dtype="int32")
     T_cast_5 = T.match_buffer(T_cast_4, [1, 75, 75, 64], dtype="int16")
     # body
     PaddedInput_1 = T.allocate([379456], "int16", "global")
     for i0_i1_fused_1, i2_1, i3_1 in T.grid(77, 77, 64):
         T.store(PaddedInput_1, i0_i1_fused_1 * 4928 + i2_1 * 64 + i3_1, T.if_then_else(1 <= i0_i1_fused_1 and i0_i1_fused_1 < 76 and 1 <= i2_1 and i2_1 < 76, T.load("int16", placeholder_13.data, i0_i1_fused_1 * 4800 + i2_1 * 64 + i3_1 - 4864), T.int16(0), dtype="int16"), True)
     for ax0_ax1_fused_ax2_fused_1 in T.serial(0, 5625):
         Conv2dOutput_1 = T.allocate([64], "int32", "global")
         for ff_1 in T.serial(0, 64):
             T.store(Conv2dOutput_1, ff_1, 0, True)
             for ry, rx, rc_1 in T.grid(3, 3, 64):
                 T.store(Conv2dOutput_1, ff_1, T.load("int32", Conv2dOutput_1, ff_1) + T.cast(T.load("int16", PaddedInput_1, T.floordiv(ax0_ax1_fused_ax2_fused_1, 75) * 4928 + ry * 4928 + rx * 64 + T.floormod(ax0_ax1_fused_ax2_fused_1, 75) * 64 + rc_1), "int32") * T.cast(T.load("int16", placeholder_14.data, ry * 12288 + rx * 4096 + rc_1 * 64 + ff_1), "int32"), True)
         for ax3_inner_2 in T.serial(0, 64):
             T.store(T_cast_5.data, ax0_ax1_fused_ax2_fused_1 * 64 + ax3_inner_2, T.cast(T.cast(T.max(T.min(T.q_multiply_shift(T.load("int32", Conv2dOutput_1, ax3_inner_2) + T.load("int32", placeholder_15.data, ax3_inner_2), 1608879842, 31, -7, dtype="int32"), 255), 0), "uint8"), "int16"), True)
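This has the same structure as Example #22 but with a 3x3 kernel and one pixel of zero padding (PaddedInput_1 is 77x77). A NumPy reference for the int32 accumulation (names illustrative; requantization again follows the q_multiply_shift reference):

import numpy as np

def conv2d_nhwc_hwio_ref(x: np.ndarray, w: np.ndarray, pad: int) -> np.ndarray:
    # x: (H, W, Cin) int16; w: (kh, kw, Cin, Cout) int16; stride 1,
    # symmetric zero padding, int32 accumulation as in the kernel above.
    kh, kw, cin, cout = w.shape
    xp = np.pad(x, ((pad, pad), (pad, pad), (0, 0))).astype(np.int32)
    h_out, w_out = x.shape[0], x.shape[1]
    out = np.zeros((h_out, w_out, cout), dtype=np.int32)
    wm = w.reshape(-1, cout).astype(np.int32)
    for i in range(h_out):
        for j in range(w_out):
            out[i, j, :] = xp[i:i + kh, j:j + kw, :].reshape(-1) @ wm
    return out

x = np.random.randint(-8, 8, (75, 75, 64)).astype("int16")
w = np.random.randint(-8, 8, (3, 3, 64, 64)).astype("int16")
acc = conv2d_nhwc_hwio_ref(x, w, pad=1)  # (75, 75, 64) int32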
Example #28
 def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast(placeholder_62: T.handle, placeholder_63: T.handle, placeholder_64: T.handle, T_cast_20: T.handle) -> None:
     # function attr dict
     T.func_attr({"global_symbol": "tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast", "tir.noalias": True})
     placeholder_65 = T.match_buffer(placeholder_62, [1, 224, 224, 3], dtype="int16", elem_offset=0, align=128, offset_factor=1)
     placeholder_66 = T.match_buffer(placeholder_63, [7, 7, 3, 64], dtype="int16", elem_offset=0, align=128, offset_factor=1)
     placeholder_67 = T.match_buffer(placeholder_64, [1, 1, 1, 64], dtype="int32", elem_offset=0, align=128, offset_factor=1)
     T_cast_21 = T.match_buffer(T_cast_20, [1, 112, 112, 64], dtype="uint8", elem_offset=0, align=128, offset_factor=1)
     # body
     PaddedInput_7 = T.allocate([157323], "int16", "global")
     for i0_i1_fused_7 in T.serial(0, 229):
         for i2_7, i3_7 in T.grid(229, 3):
             T.store(PaddedInput_7, (((i0_i1_fused_7*687) + (i2_7*3)) + i3_7), T.if_then_else(((((2 <= i0_i1_fused_7) and (i0_i1_fused_7 < 226)) and (2 <= i2_7)) and (i2_7 < 226)), T.load("int16", placeholder_65.data, ((((i0_i1_fused_7*672) + (i2_7*3)) + i3_7) - 1350)), T.int16(0), dtype="int16"), True)
     for ax0_ax1_fused_ax2_fused_7 in T.serial(0, 12544):
         Conv2dOutput_7 = T.allocate([64], "int32", "global")
         for ff_3 in T.serial(0, 64):
             T.store(Conv2dOutput_7, ff_3, 0, True)
             for ry_2, rx_2, rc_7 in T.grid(7, 7, 3):
                 T.store(Conv2dOutput_7, ff_3, (T.load("int32", Conv2dOutput_7, ff_3) + (T.cast(T.load("int16", PaddedInput_7, (((((T.floordiv(ax0_ax1_fused_ax2_fused_7, 112)*1374) + (ry_2*687)) + (T.floormod(ax0_ax1_fused_ax2_fused_7, 112)*6)) + (rx_2*3)) + rc_7)), "int32")*T.cast(T.load("int16", placeholder_66.data, ((((ry_2*1344) + (rx_2*192)) + (rc_7*64)) + ff_3)), "int32"))), True)
         for ax3_inner_7 in T.serial(0, 64):
             T.store(T_cast_21.data, ((ax0_ax1_fused_ax2_fused_7*64) + ax3_inner_7), T.cast(T.max(T.min(T.q_multiply_shift((T.load("int32", Conv2dOutput_7, ax3_inner_7) + T.load("int32", placeholder_67.data, ax3_inner_7)), 1939887962, 31, -9, dtype="int32"), 255), 0), "uint8"), True)
Example #29
 def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_add_clip_cast_cast_subtract_fixed_point_15934180698220515269_(placeholder_16: T.handle, placeholder_17: T.handle, placeholder_18: T.handle, T_add: T.handle) -> None:
     # function attr dict
     T.func_attr({"global_symbol": "tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_add_clip_cast_cast_subtract_fixed_point_15934180698220515269_", "tir.noalias": True})
     placeholder_19 = T.match_buffer(placeholder_16, [1, 75, 75, 64], dtype="int16")
     placeholder_20 = T.match_buffer(placeholder_17, [1, 1, 64, 256], dtype="int16")
     placeholder_21 = T.match_buffer(placeholder_18, [1, 1, 1, 256], dtype="int32")
     T_add_1 = T.match_buffer(T_add, [1, 75, 75, 256], dtype="int32")
     # body
     PaddedInput_2 = T.allocate([360000], "int16", "global")
     for i0_i1_fused_2, i2_2, i3_2 in T.grid(75, 75, 64):
         T.store(PaddedInput_2, i0_i1_fused_2 * 4800 + i2_2 * 64 + i3_2, T.load("int16", placeholder_19.data, i0_i1_fused_2 * 4800 + i2_2 * 64 + i3_2), True)
     for ax0_ax1_fused_ax2_fused_2 in T.serial(0, 5625):
         Conv2dOutput_2 = T.allocate([64], "int32", "global")
         for ax3_outer_1 in T.serial(0, 4):
             for ff_2 in T.serial(0, 64):
                 T.store(Conv2dOutput_2, ff_2, 0, True)
                 for rc_2 in T.serial(0, 64):
                     T.store(Conv2dOutput_2, ff_2, T.load("int32", Conv2dOutput_2, ff_2) + T.cast(T.load("int16", PaddedInput_2, ax0_ax1_fused_ax2_fused_2 * 64 + rc_2), "int32") * T.cast(T.load("int16", placeholder_20.data, rc_2 * 256 + ax3_outer_1 * 64 + ff_2), "int32"), True)
             for ax3_inner_3 in T.serial(0, 64):
                 T.store(T_add_1.data, ax0_ax1_fused_ax2_fused_2 * 256 + ax3_outer_1 * 64 + ax3_inner_3, T.q_multiply_shift(T.cast(T.cast(T.max(T.min(T.q_multiply_shift(T.load("int32", Conv2dOutput_2, ax3_inner_3) + T.load("int32", placeholder_21.data, ax3_outer_1 * 64 + ax3_inner_3), 1711626602, 31, -8, dtype="int32") + 132, 255), 0), "uint8"), "int32") - 132, 2094289803, 31, -2, dtype="int32") + 136, True)
Example #30
 def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_add_clip_cast_cast_subtract_fixed_point_4200876283395191415_(placeholder_22: T.handle, placeholder_23: T.handle, placeholder_24: T.handle, placeholder_25: T.handle, T_cast_6: T.handle) -> None:
     # function attr dict
     T.func_attr({"global_symbol": "tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_add_clip_cast_cast_subtract_fixed_point_4200876283395191415_", "tir.noalias": True})
     placeholder_29 = T.match_buffer(placeholder_22, [1, 75, 75, 64], dtype="int16")
     placeholder_27 = T.match_buffer(placeholder_23, [1, 1, 64, 256], dtype="int16")
     placeholder_26 = T.match_buffer(placeholder_24, [1, 1, 1, 256], dtype="int32")
     placeholder_28 = T.match_buffer(placeholder_25, [1, 75, 75, 256], dtype="int32")
     T_cast_7 = T.match_buffer(T_cast_6, [1, 75, 75, 256], dtype="uint8")
     # body
     PaddedInput_3 = T.allocate([360000], "int16", "global")
     for i0_i1_fused_3, i2_3, i3_3 in T.grid(75, 75, 64):
         T.store(PaddedInput_3, i0_i1_fused_3 * 4800 + i2_3 * 64 + i3_3, T.load("int16", placeholder_29.data, i0_i1_fused_3 * 4800 + i2_3 * 64 + i3_3), True)
     for ax0_ax1_fused_ax2_fused_3 in T.serial(0, 5625):
         Conv2dOutput_3 = T.allocate([64], "int32", "global")
         for ax3_outer_2 in T.serial(0, 4):
             for ff_3 in T.serial(0, 64):
                 T.store(Conv2dOutput_3, ff_3, 0, True)
                 for rc_3 in T.serial(0, 64):
                     T.store(Conv2dOutput_3, ff_3, T.load("int32", Conv2dOutput_3, ff_3) + T.cast(T.load("int16", PaddedInput_3, ax0_ax1_fused_ax2_fused_3 * 64 + rc_3), "int32") * T.cast(T.load("int16", placeholder_27.data, rc_3 * 256 + ax3_outer_2 * 64 + ff_3), "int32"), True)
             for ax3_inner_4 in T.serial(0, 64):
                 T.store(T_cast_7.data, ax0_ax1_fused_ax2_fused_3 * 256 + ax3_outer_2 * 64 + ax3_inner_4, T.cast(T.max(T.min(T.q_multiply_shift(T.cast(T.cast(T.max(T.min(T.q_multiply_shift(T.load("int32", Conv2dOutput_3, ax3_inner_4) + T.load("int32", placeholder_26.data, ax3_outer_2 * 64 + ax3_inner_4), 1343014664, 31, -8, dtype="int32") + 136, 255), 0), "uint8"), "int32") - 136, 1073903788, 31, 1, dtype="int32") + T.load("int32", placeholder_28.data, ax0_ax1_fused_ax2_fused_3 * 256 + ax3_outer_2 * 64 + ax3_inner_4), 255), 0), "uint8"), True)