Example #1
0
def boolean_handling_after(a: T.Buffer[10, "int8"],
                           b: T.Buffer[10, "int8"]) -> None:
    # Element-wise copy of `a` into `b` over 10 elements; each value is cast
    # to "bool" and then back to "int8" before it is stored.
    # Both parameters are typed as flat int8 buffers, while the pre-flattened
    # views registered below use dtype "bool".
    T.preflattened_buffer(a, [10], dtype="bool", data=a.data)
    T.preflattened_buffer(b, [10], dtype="bool", data=b.data)
    # body
    for i0 in T.serial(10):
        # int8 -> bool -> int8 round trip on every element
        b[i0] = T.cast(T.cast(a[i0], "bool"), "int8")
Example #2
0
 def main(placeholder: T.Buffer[(8192,), "int8"], ethosu_write: T.Buffer[(2048,), "int8"]) -> None:
     # Straight-line sequence of extern calls: four rounds, each made of two
     # "ethosu_copy" calls followed by one "ethosu_conv2d" call.
     # Every conv call reads from placeholder[0]; the four calls write to
     # ethosu_write at offsets 0, 2, 4 and 6 respectively.
     # The positional arguments are forwarded verbatim to the externs; their
     # meaning is defined by the callee and is not visible in this function.
     # function attr dict
     T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
     # Copy sources, declared with shape and dtype only (no data pointer given).
     buffer = T.buffer_decl([128], "uint8")
     buffer_1 = T.buffer_decl([32], "uint8")
     buffer_2 = T.buffer_decl([112], "uint8")
     buffer_3 = T.buffer_decl([32], "uint8")
     buffer_4 = T.buffer_decl([112], "uint8")
     buffer_5 = T.buffer_decl([32], "uint8")
     buffer_6 = T.buffer_decl([112], "uint8")
     buffer_7 = T.buffer_decl([32], "uint8")
     # The flat parameters are registered with their 4-D pre-flattened shapes.
     T.preflattened_buffer(placeholder, [1, 16, 16, 32], "int8", data=placeholder.data)
     T.preflattened_buffer(ethosu_write, [1, 16, 16, 8], "int8", data=ethosu_write.data)
     # body
     p1_global = T.allocate([128], "uint8", "global", annotations={"disable_lower_builtin":True})
     p2_global = T.allocate([32], "uint8", "global", annotations={"disable_lower_builtin":True})
     # Second declarations over the same data pointers as p1_global / p2_global;
     # p1_global_1 is declared with 112 elements against the 128 allocated.
     p1_global_1 = T.buffer_decl([112], dtype="uint8", data=p1_global.data)
     p2_global_1 = T.buffer_decl([32], dtype="uint8", data=p2_global.data)
     # Round 1: copies go to p1_global / p2_global; conv writes ethosu_write[0].
     T.evaluate(T.call_extern("ethosu_copy", buffer[0], 128, p1_global[0], dtype="handle"))
     T.evaluate(T.call_extern("ethosu_copy", buffer_1[0], 32, p2_global[0], dtype="handle"))
     T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p1_global[0], 128, 12, p2_global[0], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
     # Round 2: copies go to p1_global_1 / p2_global_1; conv writes ethosu_write[2].
     T.evaluate(T.call_extern("ethosu_copy", buffer_2[0], 112, p1_global_1[0], dtype="handle"))
     T.evaluate(T.call_extern("ethosu_copy", buffer_3[0], 32, p2_global_1[0], dtype="handle"))
     T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[2], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p1_global_1[0], 112, 12, p2_global_1[0], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
     # Round 3: same destinations as round 2; conv writes ethosu_write[4].
     T.evaluate(T.call_extern("ethosu_copy", buffer_4[0], 112, p1_global_1[0], dtype="handle"))
     T.evaluate(T.call_extern("ethosu_copy", buffer_5[0], 32, p2_global_1[0], dtype="handle"))
     T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[4], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p1_global_1[0], 112, 12, p2_global_1[0], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
     # Round 4: same destinations again; conv writes ethosu_write[6].
     T.evaluate(T.call_extern("ethosu_copy", buffer_6[0], 112, p1_global_1[0], dtype="handle"))
     T.evaluate(T.call_extern("ethosu_copy", buffer_7[0], 32, p2_global_1[0], dtype="handle"))
     T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 2, 16, 0, 16, ethosu_write[6], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p1_global_1[0], 112, 12, p2_global_1[0], 32, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
def flattened_unit_loop_func(a: T.handle, c: T.handle) -> None:
    # Stores A[k] + 1.0 into C[k] for the 32 float32 elements, visiting them
    # through a 4 x 8 grid whose index is linearised as x * 8 + z.
    # Note: `(32)` is a parenthesised int, not a one-element tuple.
    A = T.match_buffer(a, (32), "float32")
    C = T.match_buffer(c, (32), "float32")
    T.preflattened_buffer(A, (32), "float32", data=A.data)
    T.preflattened_buffer(C, (32), "float32", data=C.data)

    for x, z in T.grid(4, 8):
        # 4 * 8 == 32, so the flat index covers 0..31 exactly once
        C[x * 8 + z] = A[x * 8 + z] + 1.0
Example #4
0
 def main(placeholder_3: T.Buffer[(960,), "int8"], ethosu_write_1: T.Buffer[(1024,), "int8"]) -> None:
     # Body is a single "ethosu_conv2d" extern call that is handed
     # placeholder_3[120] and ethosu_write_1[0], plus buffer[0] / buffer_1[0]
     # together with the literals 848 and 160 (matching their declared sizes).
     # function attr dict
     T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
     # Declared with shape and dtype only (no data pointer given).
     buffer = T.buffer_decl([848], "uint8")
     buffer_1 = T.buffer_decl([160], "uint8")
     # The flat parameters are registered with their 4-D pre-flattened shapes.
     T.preflattened_buffer(placeholder_3, [1, 10, 12, 8], 'int8', data=placeholder_3.data)
     T.preflattened_buffer(ethosu_write_1, [1, 8, 8, 16], 'int8', data=ethosu_write_1.data)
     # body
     # Positional arguments are forwarded verbatim; their meaning is defined by
     # the extern and is not visible here.
     T.evaluate(T.call_extern("ethosu_conv2d", "int8", 8, 8, 4, 8, 0, 8, placeholder_3[120], 0, 0, 0, T.float32(0.5), 10, "NHWC", 96, 8, 1, "int8", 8, 8, 16, 8, 0, 8, ethosu_write_1[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 16, 1, 3, 3, 1, 1, 1, 1, buffer[0], 848, T.int8(-1), T.int8(-1), 12, buffer_1[0], 160, T.int8(-1), T.int8(-1), 1, 1, 1, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
Example #5
0
 def main(placeholder_3: T.Buffer[(315,), "int8"], ethosu_write_1: T.Buffer[(240,), "int8"]) -> None:
     # Body is a single "ethosu_conv2d" extern call that is handed
     # placeholder_3[146] and ethosu_write_1[0], plus buffer_1[0] / buffer[0]
     # together with the literals 656 and 160 (matching their declared sizes).
     # function attr dict
     T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
     # Declared with shape and dtype only (no data pointer given).
     buffer = T.buffer_decl([160], "uint8")
     buffer_1 = T.buffer_decl([656], "uint8")
     # The flat parameters are registered with their 4-D pre-flattened shapes.
     T.preflattened_buffer(placeholder_3, [1, 7, 9, 5], 'int8', data=placeholder_3.data)
     T.preflattened_buffer(ethosu_write_1, [1, 3, 5, 16], 'int8', data=ethosu_write_1.data)
     # body
     # Positional arguments are forwarded verbatim; their meaning is defined by
     # the extern and is not visible here.
     T.evaluate(T.call_extern("ethosu_conv2d", "int8", 3, 5, 3, 3, 0, 5, placeholder_3[146], 0, 0, 0, T.float32(0.5), 10, "NHWC", 45, 5, 1, "int8", 3, 5, 16, 3, 0, 5, ethosu_write_1[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 80, 16, 1, 3, 3, 1, 1, 1, 1, buffer_1[0], 656, 12, buffer[0], 160, 1, 1, 1, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
def flattened_predicate_func(a: T.handle, c: T.handle) -> None:
    # Stores A[k] + 1.0 into C[k] for the 32 float32 elements, iterating a
    # 5 x 7 grid and guarding each store with an explicit bound check.
    # Note: `(32)` is a parenthesised int, not a one-element tuple.
    A = T.match_buffer(a, (32), "float32")
    C = T.match_buffer(c, (32), "float32")
    T.preflattened_buffer(A, (32), "float32", data=A.data)
    T.preflattened_buffer(C, (32), "float32", data=C.data)

    for i, j in T.grid(5, 7):
        # The grid has 5 * 7 == 35 points, three more than the 32 elements;
        # the predicate skips flat indices 32..34.
        if i * 7 + j < 32:
            C[i * 7 + j] = A[i * 7 + j] + 1.0
 def tvm_test_cpacked(A: T.handle, B: T.handle, C: T.handle,
                      device_context: T.handle) -> T.handle:
     # Binds each of the handles A, B and C to a single-element float32 buffer
     # and registers a matching pre-flattened view for it. The only statement
     # in the body evaluates the handle C.
     # `device_context` and the *_0pre names are not referenced again in the
     # visible body.
     A_0 = T.match_buffer(A, (1, ), dtype="float32")
     A_0pre = T.preflattened_buffer(A_0, (1, ), dtype="float32")
     B_0 = T.match_buffer(B, (1, ), dtype="float32")
     B_0pre = T.preflattened_buffer(B_0, (1, ), dtype="float32")
     C_0 = T.match_buffer(C, (1, ), dtype="float32")
     C_0pre = T.preflattened_buffer(C_0, (1, ), dtype="float32")
     T.evaluate(C)
Example #8
0
def flattened_elementwise_func(a: T.handle, c: T.handle) -> None:
    # Computes C[k] = (A[k] + 1.0) * 2.0 over 256 float32 elements, processed
    # as 16 rows of 16 with a per-row 16-element temporary.
    A = T.match_buffer(a, 256, "float32")
    C = T.match_buffer(c, 256, "float32")
    # The flat buffers are registered with a (16, 16) pre-flattened shape.
    T.preflattened_buffer(A, (16, 16), dtype="float32", data=A.data)
    T.preflattened_buffer(C, (16, 16), dtype="float32", data=C.data)
    for i in T.serial(0, 16):
        # The temporary is allocated inside the row loop.
        B_new = T.allocate([16], "float32", "global")
        # First pass: B_new[j] = A[i, j] + 1.0 (flat index i * 16 + j)
        for j in T.serial(0, 16):
            B_new[j] = A[((i * 16) + j)] + 1.0
        # Second pass: C[i, j] = B_new[j] * 2.0
        for j in T.serial(0, 16):
            C[((i * 16) + j)] = B_new[j] * 2.0
Example #9
0
 def main(placeholder: T.Buffer[(8192,), "int8"], buffer1: T.Buffer[(368,), "uint8"], buffer2: T.Buffer[(96,), "uint8"], ethosu_write: T.Buffer[(2048,), "int8"]) -> None:
     # Two "ethosu_copy" extern calls (buffer1 -> p1, buffer2 -> p2) followed by
     # one "ethosu_conv2d" extern call. The conv call receives each staged
     # buffer at two offsets: p1[0] / p1[192] and p2[0] / p2[48].
     # function attr dict
     T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
     # buffer definition
     T.preflattened_buffer(placeholder, [1, 16, 16, 32], dtype="int8", data=placeholder.data)
     T.preflattened_buffer(ethosu_write, [1, 16, 16, 8], dtype="int8", data=ethosu_write.data)
     # body
     p1 = T.allocate([368], "uint8", "global")
     p2 = T.allocate([96], "uint8", "global") 
     T.evaluate(T.call_extern("ethosu_copy", buffer1[0], 368, p1[0], dtype="handle"))
     T.evaluate(T.call_extern("ethosu_copy", buffer2[0], 96, p2[0], dtype="handle"))
     # 192 + 176 == 368 and 48 + 48 == 96, i.e. the literals following each
     # pair of offsets add up to the allocated sizes of p1 and p2.
     T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 8, 32, 16, 0, 8, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 8, 8, 16, 0, 8, ethosu_write[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, p1[0], 192, p1[192], 176, 12, p2[0], 48, p2[48], 48, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
def flattened_multi_alloc_func(a: T.handle, d: T.handle) -> None:
    # For each of 32 float32 elements computes
    #   B[i] = A[i] + 1.0;  C[i] = A[i] + B[i];  D[i] = C[i] * 2.0
    # using two temporaries that are allocated inside the loop body.
    # Note: `(32)` is a parenthesised int, not a one-element tuple.
    A = T.match_buffer(a, (32), "float32")
    D = T.match_buffer(d, (32), "float32")
    T.preflattened_buffer(A, (32), "float32", data=A.data)
    T.preflattened_buffer(D, (32), "float32", data=D.data)

    for i in range(0, 32):
        # Both temporaries are 32 elements wide although only index i is used
        # in a given iteration.
        B = T.allocate((32, ), "float32", "global")
        C = T.allocate((32, ), "float32", "global")
        B[i] = A[i] + 1.0
        C[i] = A[i] + B[i]
        D[i] = C[i] * 2.0
Example #11
0
def flattened_multi_alloc_func(a: T.handle, d: T.handle) -> None:
    # For each of 128 float32 elements (visited as a 4 x 32 grid, flat index
    # i * 32 + j) computes
    #   B[k] = A[k] + 1.0;  C[k] = A[k] + B[k];  D[k] = C[k] * 2.0
    # using two temporaries that are allocated inside the loop body.
    A = T.match_buffer(a, 128, "float32")
    D = T.match_buffer(d, 128, "float32")
    # The flat buffers are registered with a (4, 32) pre-flattened shape.
    T.preflattened_buffer(A, (4, 32), "float32", data=A.data)
    T.preflattened_buffer(D, (4, 32), "float32", data=D.data)

    for i, j in T.grid(4, 32):
        # Both temporaries are 128 elements wide although only one index is
        # used in a given iteration.
        B = T.allocate([128], "float32", "global")
        C = T.allocate([128], "float32", "global")
        B[i * 32 + j] = A[i * 32 + j] + 1.0
        C[i * 32 + j] = A[i * 32 + j] + B[i * 32 + j]
        D[i * 32 + j] = C[i * 32 + j] * 2.0
Example #12
0
def flattened_symbolic_func(a: T.handle, c: T.handle, n: T.int32,
                            m: T.int32) -> None:
    # Computes C[k] = (A[k] + 1.0) * 2.0 over n * m float32 elements, where
    # n and m are int32 parameters. Each of the n rows uses an m-element
    # temporary.
    A = T.match_buffer(a, n * m, "float32")
    C = T.match_buffer(c, n * m, "float32")
    # The flat buffers are registered with an (n, m) pre-flattened shape.
    T.preflattened_buffer(A, (n, m), "float32", data=A.data)
    T.preflattened_buffer(C, (n, m), "float32", data=C.data)

    for i in range(0, n):
        # The temporary is allocated inside the row loop, sized by m.
        B = T.allocate([m], "float32", "global")
        # First pass: B[j] = A[i, j] + 1.0 (flat index i * m + j)
        for j in range(0, m):
            B[j] = A[i * m + j] + 1.0
        # Second pass: C[i, j] = B[j] * 2.0
        for j in range(0, m):
            C[i * m + j] = B[j] * 2.0
Example #13
0
 def main(placeholder: T.Buffer[(1024,), "int8"], ethosu_write: T.Buffer[(32768,), "int8"]) -> None:
     # Two chained "ethosu_conv2d" extern calls: the first is handed
     # placeholder[0] and the intermediate ethosu_write_1[0]; the second is
     # handed ethosu_write_1[0] and the output parameter ethosu_write[0].
     # function attr dict
     T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
     # Declared with shape and dtype only (no data pointer given).
     buffer = T.buffer_decl([1456], "uint8")
     buffer_1 = T.buffer_decl([352], "uint8")
     buffer_2 = T.buffer_decl([11040], "uint8")
     buffer_3 = T.buffer_decl([272], "uint8")
     # The flat parameters are registered with 5-D pre-flattened shapes.
     T.preflattened_buffer(placeholder, [1, 8, 1, 8, 16], 'int8', data=placeholder.data)
     T.preflattened_buffer(ethosu_write, [1, 32, 2, 32, 16], 'int8', data=ethosu_write.data)
     # body
     # Intermediate buffer shared by the two calls.
     ethosu_write_1 = T.allocate([12288], "int8", "global", annotations={"disable_lower_builtin":True})
     # Positional arguments are forwarded verbatim; their meaning is defined by
     # the extern and is not visible here.
     T.evaluate(T.call_extern("ethosu_conv2d", "int8", 8, 8, 3, 8, 0, 8, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHCWB16", 128, 16, 1, "int8", 16, 16, 35, 16, 0, 16, ethosu_write_1[0], 0, 0, 0, T.float32(0.25), 14, "NHCWB16", 768, 16, 256, 3, 3, 1, 1, 1, 1, buffer[0], 1456, 12, buffer_1[0], 352, 1, 1, 1, 1, "NONE", 0, 0, "TFL", "NEAREST", 0, 0, 0, dtype="handle"))
     T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 35, 16, 0, 16, ethosu_write_1[0], 0, 0, 0, T.float32(0.5), 10, "NHCWB16", 768, 16, 256, "int8", 32, 32, 26, 32, 0, 32, ethosu_write[0], 0, 0, 0, T.float32(0.25), 14, "NHCWB16", 1024, 16, 512, 3, 3, 1, 1, 1, 1, buffer_2[0], 11040, 12, buffer_3[0], 272, 1, 1, 1, 1, "NONE", 0, 0, "TFL", "NEAREST", 0, 0, 0, dtype="handle"))
Example #14
0
def flattened_strided_buffer_func(a: T.handle, c: T.handle) -> None:
    # Computes C[k] = (A[k] + 1.0) * 2.0 over 256 float32 elements in four
    # outer steps of 64 elements, going through a 68-element temporary.
    A = T.match_buffer(a, (256, ), "float32")
    C = T.match_buffer(c, (256, ), "float32")
    # The flat buffers are registered with a [16, 16] pre-flattened shape.
    T.preflattened_buffer(A, [16, 16], dtype="float32", data=A.data)
    T.preflattened_buffer(C, [16, 16], dtype="float32", data=C.data)
    for i0 in T.serial(0, 4):
        # The temporary is indexed with a row step of 17 (4 * 17 == 68) while
        # A and C use a row step of 16, so one slot per row is never touched.
        B_new = T.allocate([68], "float32", "global")
        # First pass: fill the temporary from A.
        for i1 in T.serial(0, 4):
            for j in T.serial(0, 16):
                B_new[i1 * 17 + j] = A[i0 * 64 + i1 * 16 + j] + 1.0
        # Second pass: write the doubled values to C.
        for i1 in T.serial(0, 4):
            for j in T.serial(0, 16):
                C[i0 * 64 + i1 * 16 + j] = B_new[i1 * 17 + j] * 2.0
Example #15
0
 def main(placeholder_3: T.Buffer[(8192,), "int8"], ethosu_write_1: T.Buffer[(2048,), "int8"]) -> None:
     # Two "ethosu_copy" extern calls (buffer_1 -> placeholder_global,
     # buffer -> placeholder_d_global) followed by one "ethosu_conv2d" extern
     # call that is handed both staged buffers.
     # function attr dict
     T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
     # Declared with shape and dtype only (no data pointer given).
     buffer = T.buffer_decl([80], "uint8")
     buffer_1 = T.buffer_decl([304], "uint8")
     # The flat parameters are registered with their 4-D pre-flattened shapes.
     T.preflattened_buffer(placeholder_3, [1, 16, 16, 32], dtype="int8", data=placeholder_3.data)
     T.preflattened_buffer(ethosu_write_1, [1, 16, 16, 8], dtype="int8", data=ethosu_write_1.data)
     # body
     # Copy destinations; sizes match the sources (304 and 80).
     placeholder_global = T.allocate([304], "uint8", "global", annotations={"disable_lower_builtin": True})
     placeholder_d_global = T.allocate([80], "uint8", "global", annotations={"disable_lower_builtin": True})
     T.evaluate(T.call_extern("ethosu_copy", buffer_1[0], 304, placeholder_global[0], dtype="handle"))
     T.evaluate(T.call_extern("ethosu_copy", buffer[0], 80, placeholder_d_global[0], dtype="handle"))
     # Positional arguments are forwarded verbatim; their meaning is defined by
     # the extern and is not visible here.
     T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder_3[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 8, 16, 0, 16, ethosu_write_1[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, placeholder_global[0], 304, T.int8(-1), T.int8(-1), 12, placeholder_d_global[0], 80, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
Example #16
0
 def main(placeholder: T.Buffer[(192,), "int8"], ethosu_write: T.Buffer[(8192,), "int8"]) -> None:
     # Four "ethosu_conv2d" extern calls arranged as two pairs. In each pair
     # the first call is handed a slice of `placeholder` and the intermediate
     # ethosu_write_1[0]; the second is handed ethosu_write_1[0] and a slice
     # of `ethosu_write`.
     # Pair 1 uses placeholder[0] / ethosu_write[0]; pair 2 uses
     # placeholder[96] / ethosu_write[4096].
     # function attr dict
     T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
     # Declared with shape and dtype only (no data pointer given).
     buffer = T.buffer_decl([160], "uint8")
     buffer_1 = T.buffer_decl([320], "uint8")
     buffer_2 = T.buffer_decl([304], "uint8")
     buffer_3 = T.buffer_decl([80], "uint8")
     # The flat parameters are registered with their 4-D pre-flattened shapes.
     T.preflattened_buffer(placeholder, [1, 8, 8, 3], 'int8', data=placeholder.data)
     T.preflattened_buffer(ethosu_write, [1, 32, 32, 8], 'int8', data=ethosu_write.data)
     # body
     # Intermediate buffer, reused at offset 0 by both pairs.
     ethosu_write_1 = T.allocate([4096], "int8", "global", annotations={"disable_lower_builtin":True})
     # Pair 1
     T.evaluate(T.call_extern("ethosu_conv2d", "int8", 4, 8, 3, 4, 0, 8, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 24, 3, 1, "int8", 8, 16, 32, 8, 0, 16, ethosu_write_1[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 512, 32, 1, 1, 1, 1, 1, 1, 1, buffer[0], 160, 12, buffer_1[0], 320, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "ZEROS", 0, 0, 0, dtype="handle"))
     T.evaluate(T.call_extern("ethosu_conv2d", "int8", 8, 16, 32, 8, 0, 16, ethosu_write_1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 32, 8, 16, 0, 32, ethosu_write[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 256, 8, 1, 1, 1, 1, 1, 1, 1, buffer_2[0], 304, 12, buffer_3[0], 80, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "ZEROS", 0, 0, 0, dtype="handle"))
     # Pair 2
     T.evaluate(T.call_extern("ethosu_conv2d", "int8", 4, 8, 3, 4, 0, 8, placeholder[96], 0, 0, 0, T.float32(0.5), 10, "NHWC", 24, 3, 1, "int8", 8, 16, 32, 8, 0, 16, ethosu_write_1[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 512, 32, 1, 1, 1, 1, 1, 1, 1, buffer[0], 160, 12, buffer_1[0], 320, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "ZEROS", 0, 0, 0, dtype="handle"))
     T.evaluate(T.call_extern("ethosu_conv2d", "int8", 8, 16, 32, 8, 0, 16, ethosu_write_1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 32, 8, 16, 0, 32, ethosu_write[4096], 0, 0, 0, T.float32(0.25), 14, "NHWC", 256, 8, 1, 1, 1, 1, 1, 1, 1, buffer_2[0], 304, 12, buffer_3[0], 80, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "ZEROS", 0, 0, 0, dtype="handle"))
Example #17
0
 def main(placeholder_5: T.Buffer[(192,), "int8"], ethosu_write_1: T.Buffer[(512,), "int8"]) -> None:
     # Four "ethosu_conv2d" extern calls arranged as two pairs that go through
     # the intermediate buffer ethosu_write_2.
     # Pair 1 uses placeholder_5[0], ethosu_write_2[256] and ethosu_write_1[0];
     # pair 2 uses placeholder_5[48], ethosu_write_2[0] and ethosu_write_1[256].
     # function attr dict
     T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
     # Declared with shape and dtype only (no data pointer given).
     buffer = T.buffer_decl([80], "uint8")
     buffer_1 = T.buffer_decl([320], "uint8")
     buffer_2 = T.buffer_decl([1312], "uint8")
     buffer_3 = T.buffer_decl([2608], "uint8")
     # The flat parameters are registered with their 4-D pre-flattened shapes.
     T.preflattened_buffer(placeholder_5, [1, 8, 8, 3], 'int8', data=placeholder_5.data)
     T.preflattened_buffer(ethosu_write_1, [1, 8, 8, 8], 'int8', data=ethosu_write_1.data)
     # body
     ethosu_write_2 = T.allocate([1536], "int8", "global", annotations={"disable_lower_builtin": True})
     # Pair 1 (the four literals after buffer_1 / buffer sizes are 1, 1, 0, 1)
     T.evaluate(T.call_extern("ethosu_conv2d", "int8", 6, 8, 3, 6, 0, 8, placeholder_5[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 24, 3, 1, "int8", 5, 8, 32, 5, 0, 8, ethosu_write_2[256], 0, 0, 0, T.float32(0.25), 14, "NHWC", 256, 32, 1, 3, 3, 1, 1, 1, 1, buffer_2[0], 1312, 12, buffer_1[0], 320, 1, 1, 0, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
     T.evaluate(T.call_extern("ethosu_conv2d", "int8", 5, 8, 32, 5, 0, 8, ethosu_write_2[256], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 32, 1, "int8", 4, 8, 8, 4, 0, 8, ethosu_write_1[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 64, 8, 1, 3, 3, 1, 1, 1, 1, buffer_3[0], 2608, 12, buffer[0], 80, 1, 1, 0, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
     # Pair 2 (the corresponding four literals are 0, 1, 1, 1)
     T.evaluate(T.call_extern("ethosu_conv2d", "int8", 6, 8, 3, 6, 0, 8, placeholder_5[48], 0, 0, 0, T.float32(0.5), 10, "NHWC", 24, 3, 1, "int8", 5, 8, 32, 5, 0, 8, ethosu_write_2[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 256, 32, 1, 3, 3, 1, 1, 1, 1, buffer_2[0], 1312, 12, buffer_1[0], 320, 0, 1, 1, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
     T.evaluate(T.call_extern("ethosu_conv2d", "int8", 5, 8, 32, 5, 0, 8, ethosu_write_2[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 32, 1, "int8", 4, 8, 8, 4, 0, 8, ethosu_write_1[256], 0, 0, 0, T.float32(0.25), 14, "NHWC", 64, 8, 1, 3, 3, 1, 1, 1, 1, buffer_3[0], 2608, 12, buffer[0], 80, 0, 1, 1, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
Example #18
0
 def main(placeholder_5: T.Buffer[(1024,), "int8"], ethosu_write_1: T.Buffer[(2048,), "int8"]) -> None:
     # Four "ethosu_conv2d" extern calls arranged as two pairs that go through
     # the intermediate buffer ethosu_write_2.
     # Pair 1 uses placeholder_5[0], ethosu_write_2[384] and ethosu_write_1[0];
     # pair 2 uses placeholder_5[256], ethosu_write_2[0] and ethosu_write_1[1024].
     # function attr dict
     T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
     # Declared with shape and dtype only (no data pointer given).
     buffer = T.buffer_decl([1456], "uint8")
     buffer_1 = T.buffer_decl([352], "uint8")
     buffer_2 = T.buffer_decl([272], "uint8")
     buffer_3 = T.buffer_decl([11040], "uint8")
     # The flat parameters are registered with 5-D pre-flattened shapes.
     T.preflattened_buffer(placeholder_5, [1, 8, 1, 8, 16], 'int8', data=placeholder_5.data)
     T.preflattened_buffer(ethosu_write_1, [1, 8, 2, 8, 16], 'int8', data=ethosu_write_1.data)
     # body
     ethosu_write_2 = T.allocate([2304], "int8", "global", annotations={"disable_lower_builtin": True})
     # Pair 1
     T.evaluate(T.call_extern("ethosu_conv2d", "int8", 6, 8, 3, 6, 0, 8, placeholder_5[0], 0, 0, 0, T.float32(0.5), 10, "NHCWB16", 128, 16, 1, "int8", 5, 8, 35, 5, 0, 8, ethosu_write_2[384], 0, 0, 0, T.float32(0.25), 14, "NHCWB16", 384, 16, 128, 3, 3, 1, 1, 1, 1, buffer[0], 1456, T.int8(-1), T.int8(-1), 12, buffer_1[0], 352, T.int8(-1), T.int8(-1), 1, 1, 0, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
     T.evaluate(T.call_extern("ethosu_conv2d", "int8", 5, 8, 35, 5, 0, 8, ethosu_write_2[384], 0, 0, 0, T.float32(0.5), 10, "NHCWB16", 384, 16, 128, "int8", 4, 8, 26, 4, 0, 8, ethosu_write_1[0], 0, 0, 0, T.float32(0.25), 14, "NHCWB16", 256, 16, 128, 3, 3, 1, 1, 1, 1, buffer_3[0], 11040, T.int8(-1), T.int8(-1), 12, buffer_2[0], 272, T.int8(-1), T.int8(-1), 1, 1, 0, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
     # Pair 2
     T.evaluate(T.call_extern("ethosu_conv2d", "int8", 6, 8, 3, 6, 0, 8, placeholder_5[256], 0, 0, 0, T.float32(0.5), 10, "NHCWB16", 128, 16, 1, "int8", 5, 8, 35, 5, 0, 8, ethosu_write_2[0], 0, 0, 0, T.float32(0.25), 14, "NHCWB16", 384, 16, 128, 3, 3, 1, 1, 1, 1, buffer[0], 1456, T.int8(-1), T.int8(-1), 12, buffer_1[0], 352, T.int8(-1), T.int8(-1), 0, 1, 1, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
     T.evaluate(T.call_extern("ethosu_conv2d", "int8", 5, 8, 35, 5, 0, 8, ethosu_write_2[0], 0, 0, 0, T.float32(0.5), 10, "NHCWB16", 384, 16, 128, "int8", 4, 8, 26, 4, 0, 8, ethosu_write_1[1024], 0, 0, 0, T.float32(0.25), 14, "NHCWB16", 256, 16, 128, 3, 3, 1, 1, 1, 1, buffer_3[0], 11040, T.int8(-1), T.int8(-1), 12, buffer_2[0], 272, T.int8(-1), T.int8(-1), 0, 1, 1, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
Example #19
0
def flattened_gpu_func(a: T.handle, c: T.handle) -> None:
    # Computes C[k] = (A[k] + 1.0) * 2.0 over 256 float32 elements. The flat
    # index is built from three thread variables plus an inner loop of 16:
    #   k = i0 * 64 + i1 * 32 + i2 * 16 + j
    A = T.match_buffer(a, 256, "float32")
    C = T.match_buffer(c, 256, "float32")
    # The flat buffers are registered with a (16, 16) pre-flattened shape.
    T.preflattened_buffer(A, (16, 16), dtype="float32", data=A.data)
    T.preflattened_buffer(C, (16, 16), dtype="float32", data=C.data)

    # Thread variables tagged "blockIdx.x", "threadIdx.x" and "vthread".
    i0 = T.env_thread("blockIdx.x")
    i1 = T.env_thread("threadIdx.x")
    i2 = T.env_thread("vthread")

    # Launched with 4, 2 and 2 respectively (4 * 2 * 2 * 16 == 256).
    T.launch_thread(i0, 4)
    T.launch_thread(i1, 2)
    T.launch_thread(i2, 2)
    # 16-element temporary allocated with the "local" scope string.
    B = T.allocate([16], "float32", "local")
    for j in range(0, 16):
        B[j] = A[i0 * 64 + i1 * 32 + i2 * 16 + j] + 1.0
    for j in range(0, 16):
        C[i0 * 64 + i1 * 32 + i2 * 16 + j] = B[j] * 2.0
Example #20
0
 def main(placeholder_5: T.Buffer[(8192,), "int8"], ethosu_write_1: T.Buffer[(4096,), "int8"]) -> None:
     # Four "ethosu_copy" extern calls stage buffer..buffer_3 into four
     # separately allocated destinations, followed by two "ethosu_conv2d"
     # extern calls. Both conv calls read placeholder_5[0]; the first is handed
     # ethosu_write_1[0] and the *_iter_0 buffers, the second
     # ethosu_write_1[10] and the *_iter_1 buffers.
     # function attr dict
     T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
     # Declared with shape and dtype only (no data pointer given).
     buffer = T.buffer_decl([416], "uint8")
     buffer_1 = T.buffer_decl([112], "uint8")
     buffer_2 = T.buffer_decl([272], "uint8")
     buffer_3 = T.buffer_decl([64], "uint8")
     # The flat parameters are registered with their 4-D pre-flattened shapes.
     T.preflattened_buffer(placeholder_5, [1, 16, 16, 32], dtype="int8", data=placeholder_5.data)
     T.preflattened_buffer(ethosu_write_1, [1, 16, 16, 16], dtype="int8", data=ethosu_write_1.data)
     # body
     # Copy destinations; each size matches its source buffer.
     placeholder_global_unrolled_iter_0 = T.allocate([416], "uint8", "global", annotations={"disable_lower_builtin": True})
     placeholder_d_global_unrolled_iter_0 = T.allocate([112], "uint8", "global", annotations={"disable_lower_builtin": True})
     placeholder_global_unrolled_iter_1 = T.allocate([272], "uint8", "global", annotations={"disable_lower_builtin": True})
     placeholder_d_global_unrolled_iter_1 = T.allocate([64],  "uint8", "global", annotations={"disable_lower_builtin": True})
     # All four copies are issued before either conv call.
     T.evaluate(T.call_extern("ethosu_copy", buffer[0], 416, placeholder_global_unrolled_iter_0[0], dtype="handle"))
     T.evaluate(T.call_extern("ethosu_copy", buffer_1[0], 112, placeholder_d_global_unrolled_iter_0[0], dtype="handle"))
     T.evaluate(T.call_extern("ethosu_copy", buffer_2[0], 272, placeholder_global_unrolled_iter_1[0], dtype="handle"))
     T.evaluate(T.call_extern("ethosu_copy", buffer_3[0], 64, placeholder_d_global_unrolled_iter_1[0], dtype="handle"))
     T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder_5[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 10, 16, 0, 16, ethosu_write_1[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 256, 16, 1, 1, 1, 1, 1, 1, 1, placeholder_global_unrolled_iter_0[0], 416, T.int8(-1), T.int8(-1), 12, placeholder_d_global_unrolled_iter_0[0], 112, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
     T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, placeholder_5[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 6, 16, 0, 16, ethosu_write_1[10], 0, 0, 0, T.float32(0.25), 14, "NHWC", 256, 16, 1, 1, 1, 1, 1, 1, 1, placeholder_global_unrolled_iter_1[0], 272, T.int8(-1), T.int8(-1), 12, placeholder_d_global_unrolled_iter_1[0], 64, T.int8(-1), T.int8(-1), 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
 def main(inputs: T.Buffer[(8192,), "float32"], weight: T.Buffer[(2097152,), "float32"], conv2d_transpose_nhwc: T.Buffer[(16384,), "float32"]) -> None:
     # Thread-bound kernel over flat float32 buffers. Each thread keeps an
     # 8-element accumulator, and for each of 16 outer steps the threads fill
     # two "shared" staging buffers and accumulate products of their elements.
     # The accumulators are finally written to conv2d_transpose_nhwc.
     # function attr dict
     T.func_attr({"global_symbol": "main", "tir.noalias": True})
     # var definition
     threadIdx_x = T.env_thread("threadIdx.x")
     blockIdx_x = T.env_thread("blockIdx.x")
     # The flat parameters are registered with their 4-D pre-flattened shapes.
     T.preflattened_buffer(inputs, [1, 4, 4, 512], dtype="float32", data=inputs.data)
     T.preflattened_buffer(weight, [4, 4, 512, 256], dtype="float32", data=weight.data)
     T.preflattened_buffer(conv2d_transpose_nhwc, [1, 8, 8, 256], dtype="float32", data=conv2d_transpose_nhwc.data)
     # body
     T.launch_thread(blockIdx_x, 64)
     # Accumulator uses the "local" scope string; staging buffers use "shared".
     conv2d_transpose_nhwc_local = T.allocate([8], "float32", "local")
     PadInput_shared = T.allocate([768], "float32", "shared")
     weight_shared = T.allocate([4096], "float32", "shared")
     T.launch_thread(threadIdx_x, 32)
     # Zero all 8 accumulator slots (index i1 * 4 + i2_3 * 2 + i2_4, 2 * 2 * 2).
     for i2_3_init, i1_4_init, i2_4_init in T.grid(2, 2, 2):
         conv2d_transpose_nhwc_local[i1_4_init * 4 + i2_3_init * 2 + i2_4_init] = T.float32(0)
     for i6_0 in T.serial(16):
         # Stage inputs: 24 * 32 == 768 slots; positions failing the range
         # predicate are filled with 0 instead of reading `inputs`.
         for ax0_ax1_ax2_ax3_fused_0 in T.serial(24):
             PadInput_shared[ax0_ax1_ax2_ax3_fused_0 * 32 + threadIdx_x] = T.if_then_else(4 <= ax0_ax1_ax2_ax3_fused_0 and ax0_ax1_ax2_ax3_fused_0 < 20 and 1 <= blockIdx_x // 32 * 2 + ax0_ax1_ax2_ax3_fused_0 % 4 and blockIdx_x // 32 * 2 + ax0_ax1_ax2_ax3_fused_0 % 4 < 5, inputs[blockIdx_x // 32 * 1024 + ax0_ax1_ax2_ax3_fused_0 * 512 + i6_0 * 32 + threadIdx_x - 2560], T.float32(0), dtype="float32")
         # Stage weights: both sides are indexed with T.ramp(base, 1, 4);
         # 32 * 32 * 4 == 4096 slots.
         for ax0_ax1_ax2_ax3_fused_0 in T.serial(32):
             weight_shared[T.ramp(ax0_ax1_ax2_ax3_fused_0 * 128 + threadIdx_x * 4, 1, 4)] = weight[T.ramp(ax0_ax1_ax2_ax3_fused_0 // 2 * 131072 + i6_0 * 8192 + ax0_ax1_ax2_ax3_fused_0 % 2 * 4096 + threadIdx_x // 2 * 256 + blockIdx_x % 32 * 8 + threadIdx_x % 2 * 4, 1, 4)]
         # Accumulate: the staged input term is replaced by 0 unless both
         # (i1_4 + i4_2) and (i2_4 + i5_2) are even.
         for i6_1, i2_3, i4_2, i5_2, i6_2, i1_4, i2_4 in T.grid(4, 2, 4, 4, 8, 2, 2):
             conv2d_transpose_nhwc_local[i1_4 * 4 + i2_3 * 2 + i2_4] = conv2d_transpose_nhwc_local[i1_4 * 4 + i2_3 * 2 + i2_4] + T.if_then_else((i1_4 + i4_2) % 2 == 0 and (i2_4 + i5_2) % 2 == 0, PadInput_shared[threadIdx_x // 8 * 128 + (i1_4 + i4_2) // 2 * 128 + (i2_4 + i5_2) // 2 * 32 + i2_3 * 32 + i6_1 * 8 + i6_2], T.float32(0), dtype="float32") * weight_shared[i6_1 * 64 + i6_2 * 8 + threadIdx_x % 8 + 3840 - i5_2 * 256 - i4_2 * 1024]
     # Write back the 8 accumulators (ax1 * 4 + ax2 over a 2 x 4 grid).
     for ax1, ax2 in T.grid(2, 4):
         conv2d_transpose_nhwc[threadIdx_x // 8 * 4096 + ax1 * 2048 + blockIdx_x // 32 * 1024 + ax2 * 256 + blockIdx_x % 32 * 8 + threadIdx_x % 8] = conv2d_transpose_nhwc_local[ax1 * 4 + ax2]
def partitioned_concat_3(
    placeholder: T.Buffer[(50176,), "int8"],
    placeholder_1: T.Buffer[(25088,), "int8"],
    placeholder_2: T.Buffer[(25088,), "int8"],
    T_concat: T.Buffer[(100352,), "int8"],
) -> None:
    # Copies the three int8 inputs back-to-back into T_concat using three
    # separate, unconditional loop nests:
    #   placeholder   -> T_concat[0 ...]
    #   placeholder_1 -> T_concat[50176 ...]   (50176 == 64 * 784)
    #   placeholder_2 -> T_concat[75264 ...]   (75264 == 96 * 784)
    T.preflattened_buffer(placeholder, [1, 64, 28, 28], "int8", data=placeholder.data)
    T.preflattened_buffer(placeholder_1, [1, 32, 28, 28], "int8", data=placeholder_1.data)
    T.preflattened_buffer(placeholder_2, [1, 32, 28, 28], "int8", data=placeholder_2.data)
    T.preflattened_buffer(T_concat, [1, 128, 28, 28], "int8", data=T_concat.data)
    # Flat index is i1 * 784 + i2 * 28 + i3, with 784 == 28 * 28.
    for i1, i2, i3 in T.grid(64, 28, 28):
        T_concat[i1 * 784 + i2 * 28 + i3] = placeholder[i1 * 784 + i2 * 28 + i3]
    for i1, i2, i3 in T.grid(32, 28, 28):
        T_concat[i1 * 784 + i2 * 28 + i3 + 50176] = placeholder_1[i1 * 784 + i2 * 28 + i3]
    for i1, i2, i3 in T.grid(32, 28, 28):
        T_concat[i1 * 784 + i2 * 28 + i3 + 75264] = placeholder_2[i1 * 784 + i2 * 28 + i3]
def concat_func_3(
    placeholder: T.Buffer[(50176,), "int8"],
    placeholder_1: T.Buffer[(25088,), "int8"],
    placeholder_2: T.Buffer[(25088,), "int8"],
    T_concat: T.Buffer[(100352,), "int8"],
) -> None:
    # Copies the three int8 inputs back-to-back into T_concat with a single
    # loop nest over all 128 values of i1, selecting the source buffer by the
    # range i1 falls in. The outer loop carries the
    # "pragma_loop_partition_hint" annotation.
    T.preflattened_buffer(placeholder, (1, 64, 28, 28), "int8", data=placeholder.data)
    T.preflattened_buffer(placeholder_1, (1, 32, 28, 28), "int8", data=placeholder_1.data)
    T.preflattened_buffer(placeholder_2, (1, 32, 28, 28), "int8", data=placeholder_2.data)
    T.preflattened_buffer(T_concat, (1, 128, 28, 28), "int8", data=T_concat.data)
    for i1 in T.serial(128, annotations={"pragma_loop_partition_hint": 1}):
        for i2, i3 in T.grid(28, 28):
            # The three conditions are mutually exclusive and together cover
            # 0 <= i1 < 128; the source index is rebased by the start offset
            # of its segment (75264 == 96 * 784, 50176 == 64 * 784).
            if 96 <= i1:
                T_concat[i1 * 784 + i2 * 28 + i3] = placeholder_2[i1 * 784 + i2 * 28 + i3 - 75264]
            if 64 <= i1 and i1 < 96:
                T_concat[i1 * 784 + i2 * 28 + i3] = placeholder_1[i1 * 784 + i2 * 28 + i3 - 50176]
            if i1 < 64:
                T_concat[i1 * 784 + i2 * 28 + i3] = placeholder[i1 * 784 + i2 * 28 + i3]
def partitioned_concat(
    A: T.Buffer[(16,), "float32"], B: T.Buffer[(16,), "float32"], C: T.Buffer[(32,), "float32"]
) -> None:
    # Fills the 32-element C from A and B using two separate 16-iteration
    # loops: C[0..15] from A, C[16..31] from B.
    T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
    T.preflattened_buffer(A, [16], data=A.data)
    T.preflattened_buffer(B, [16], data=B.data)
    T.preflattened_buffer(C, [32], data=C.data)
    for i in T.serial(0, 16):
        C[i] = A[i]
    for i in T.serial(0, 16):
        # NOTE(review): B is declared with 16 elements but is read at i + 16
        # (16..31) here. Confirm this is the intended fixture and not meant
        # to be B[i].
        C[i + 16] = B[i + 16]
Example #25
0
def tir_matmul(
    A: T.Buffer[(16384,), "float32"],
    B: T.Buffer[(16384,), "float32"],
    C: T.Buffer[(16384,), "float32"],
) -> None:
    # Computes C[x, y] = sum over k of A[x, k] * B[y, k] for 128 x 128 float32
    # data stored flat with a row step of 128. Both A and B are indexed with
    # k as the fastest-varying position.
    # function attr dict
    T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
    # The flat parameters are registered with a [128, 128] pre-flattened shape.
    T.preflattened_buffer(A, [128, 128], dtype="float32", data=A.data)
    T.preflattened_buffer(B, [128, 128], dtype="float32", data=B.data)
    T.preflattened_buffer(C, [128, 128], dtype="float32", data=C.data)
    # body
    for x, y in T.grid(128, 128):
        # Reset the output element, then accumulate into it in place.
        C[x * 128 + y] = T.float32(0)
        for k in T.serial(128):
            C[x * 128 + y] = C[x * 128 + y] + A[x * 128 + k] * B[y * 128 + k]
Example #26
0
 def main(
     A: T.Buffer[(16384, ), "float32"],
     B: T.Buffer[(16384, ), "float32"],
     C: T.Buffer[(16384, ), "float32"],
 ) -> None:
     # Computes C[x, y] = sum over k of A[x, k] * B[y, k] for 128 x 128 float32
     # data stored flat with a row step of 128. Both A and B are indexed with
     # k as the fastest-varying position.
     # function attr dict
     T.func_attr({"global_symbol": "main", "tir.noalias": True})
     # The flat parameters are registered with a [128, 128] pre-flattened shape
     # (no dtype argument is passed here).
     T.preflattened_buffer(A, [128, 128], data=A.data)
     T.preflattened_buffer(B, [128, 128], data=B.data)
     T.preflattened_buffer(C, [128, 128], data=C.data)
     # body
     for x, y in T.grid(128, 128):
         # Reset the output element, then accumulate into it in place.
         C[x * 128 + y] = 0.0
         for k in T.serial(0, 128):
             C[x * 128 +
               y] = C[x * 128 + y] + A[x * 128 + k] * B[y * 128 + k]
Example #27
0
 def main(placeholder: T.Buffer[(1536,), "int8"], placeholder_1: T.Buffer[(1280,), "int8"], T_concat: T.Buffer[(4096,), "int8"]) -> None:
     # Four "ethosu_conv2d" extern calls that go through the intermediate
     # buffer T_concat_1:
     #   1. placeholder_1[0] and T_concat_1[192]
     #   2. T_concat_1[192]  and T_concat[352]
     #   3. placeholder[0]   and T_concat_1[0]
     #   4. T_concat_1[0]    and T_concat[0]
     # function attr dict
     T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
     # Declared with shape and dtype only (no data pointer given); each conv
     # call is handed its own pair of these.
     buffer = T.buffer_decl([2992], "uint8")
     buffer_1 = T.buffer_decl([160], "uint8")
     buffer_2 = T.buffer_decl([2992], "uint8")
     buffer_3 = T.buffer_decl([160], "uint8")
     buffer_4 = T.buffer_decl([2992], "uint8")
     buffer_5 = T.buffer_decl([160], "uint8")
     buffer_6 = T.buffer_decl([2992], "uint8")
     buffer_7 = T.buffer_decl([160], "uint8")
     # The flat parameters are registered with their 4-D pre-flattened shapes.
     T.preflattened_buffer(placeholder, [1, 8, 12, 16], "int8", data=placeholder.data)
     T.preflattened_buffer(placeholder_1, [1, 8, 10, 16], "int8", data=placeholder_1.data)
     T.preflattened_buffer(T_concat, [1, 8, 32, 16], "int8", data=T_concat.data)
     # body
     T_concat_1 = T.allocate([2816], "int8", "global", annotations={"disable_lower_builtin":True})
     # Positional arguments are forwarded verbatim; their meaning is defined by
     # the extern and is not visible here.
     T.evaluate(T.call_extern("ethosu_conv2d", "int8", 8, 10, 16, 8, 0, 10, placeholder_1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 160, 16, 1, "int8", 8, 10, 16, 8, 0, 10, T_concat_1[192], 0, 0, 0, T.float32(0.25), 14, "NHWC", 352, 16, 1, 3, 3, 1, 1, 1, 1, buffer[0], 2992, 12, buffer_1[0], 160, 1, 1, 1, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
     T.evaluate(T.call_extern("ethosu_conv2d", "int8", 8, 10, 16, 8, 0, 10, T_concat_1[192], 0, 0, 0, T.float32(0.5), 10, "NHWC", 352, 16, 1, "int8", 8, 10, 16, 8, 0, 10, T_concat[352], 0, 0, 0, T.float32(0.25), 14, "NHWC", 512, 16, 1, 3, 3, 1, 1, 1, 1, buffer_2[0], 2992, 12, buffer_3[0], 160, 1, 1, 1, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
     T.evaluate(T.call_extern("ethosu_conv2d", "int8", 8, 12, 16, 8, 0, 12, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 192, 16, 1, "int8", 8, 12, 16, 8, 0, 12, T_concat_1[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 352, 16, 1, 3, 3, 1, 1, 1, 1, buffer_4[0], 2992, 12, buffer_5[0], 160, 1, 1, 1, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
     T.evaluate(T.call_extern("ethosu_conv2d", "int8", 8, 22, 16, 8, 0, 22, T_concat_1[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 352, 16, 1, "int8", 8, 22, 16, 8, 0, 22, T_concat[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 512, 16, 1, 3, 3, 1, 1, 1, 1, buffer_6[0], 2992, 12, buffer_7[0], 160, 1, 1, 1, 1, "NONE", 0, 0, "TFL", "NONE", 0, 0, 0, dtype="handle"))
Example #28
0
# Error-case fixture: binds the handle `foo` to a one-element buffer with
# T.match_buffer, then calls T.preflattened_buffer with the string "bar" as
# `offset_factor`. The trailing `check_error` comment on the closing line
# carries the diagnostic text associated with that call.
# NOTE(review): if the test harness locates the failing statement by line
# offset inside this function, adding or removing lines in the body would
# shift it; confirm before editing the body layout.
def preflattened_buffer_map_offset_factor_nonint(foo: T.handle):
    foo_1 = T.match_buffer(foo, [1])
    T.preflattened_buffer(
        foo_1, [1], offset_factor="bar"
    )  # check_error: offset_factor: want int or IntImm, got 'bar'
Example #29
0
# Binds the handles `A` and `B` to one-element buffers, attaches a
# T.preflattened_buffer declaration to each, and stores A_1[0] into B_1[0].
# NOTE(review): presumably the accepted-input counterpart to the non-int
# `offset_factor` error case; confirm against the test that consumes this.
def preflattened_buffer_map(A: T.handle, B: T.handle):
    A_1 = T.match_buffer(A, [1])
    # `align` and `offset_factor` are passed as T.int32 / T.int64 values here
    # rather than as plain Python ints.
    T.preflattened_buffer(A_1, [1], align=T.int32(1), offset_factor=T.int64(2))
    B_1 = T.match_buffer(B, [1])
    # Only the buffer and shape are given; no optional keyword arguments.
    T.preflattened_buffer(B_1, [1])
    B_1[0] = A_1[0]
Example #30
0
 # TIR function over two flat int8 buffers: `placeholder` (301056 elements)
 # and `ethosu_write` (75264 elements). The body issues, in order, four
 # `ethosu_copy` extern calls, two `ethosu_conv2d` extern calls and one
 # `ethosu_binary_elementwise` extern call whose operator string is "ADD".
 # NOTE(review): the positional argument layout of each extern call is defined
 # by the extern's own spec, which is not visible here; the per-group notes
 # below are assumptions to confirm against that spec.
 def main(placeholder: T.Buffer[(301056, ), "int8"],
          ethosu_write: T.Buffer[(75264, ), "int8"]) -> None:
     # Function attributes: legacy-TE-schedule flag, global symbol name and
     # the noalias flag.
     T.func_attr({
         "from_legacy_te_schedule": True,
         "global_symbol": "main",
         "tir.noalias": True
     })
     # Preflattened declarations give the flat parameters 4-D shapes;
     # 1*56*56*96 == 301056 and 1*56*56*24 == 75264 match the flat sizes.
     T.preflattened_buffer(placeholder, [1, 56, 56, 96],
                           dtype='int8',
                           data=placeholder.data)
     T.preflattened_buffer(ethosu_write, [1, 56, 56, 24],
                           dtype='int8',
                           data=ethosu_write.data)
     # Four uint8 buffers declared with only a shape and dtype; each is the
     # first buffer argument of one ethosu_copy call below.
     buffer1 = T.buffer_decl([2608], "uint8")
     buffer2 = T.buffer_decl([240], "uint8")
     buffer3 = T.buffer_decl([736], "uint8")
     buffer4 = T.buffer_decl([240], "uint8")
     # Global-scope uint8 allocations p1..p4, sized to match buffer1..buffer4
     # (2608, 240, 736, 240). Every allocation in this function carries the
     # "disable_lower_builtin" annotation.
     p1 = T.allocate([2608],
                     "uint8",
                     "global",
                     annotations={"disable_lower_builtin": True})
     p2 = T.allocate([240],
                     "uint8",
                     "global",
                     annotations={"disable_lower_builtin": True})
     p3 = T.allocate([736],
                     "uint8",
                     "global",
                     annotations={"disable_lower_builtin": True})
     p4 = T.allocate([240],
                     "uint8",
                     "global",
                     annotations={"disable_lower_builtin": True})
     # Two int8 global allocations of 75264 elements, the same size as
     # `ethosu_write`. p5 is referenced by all three compute calls below and
     # p6 by the last two.
     # NOTE(review): presumably intermediate feature maps; confirm.
     p5 = T.allocate([75264],
                     "int8",
                     "global",
                     annotations={"disable_lower_builtin": True})
     p6 = T.allocate([75264],
                     "int8",
                     "global",
                     annotations={"disable_lower_builtin": True})
     # ethosu_copy calls: each passes (buffer_i[0], length, p_i[0]), where the
     # length equals the declared size of both buffers.
     T.evaluate(
         T.call_extern("ethosu_copy",
                       buffer1[0],
                       2608,
                       p1[0],
                       dtype="handle"))
     T.evaluate(
         T.call_extern("ethosu_copy",
                       buffer2[0],
                       240,
                       p2[0],
                       dtype="handle"))
     T.evaluate(
         T.call_extern("ethosu_copy",
                       buffer3[0],
                       736,
                       p3[0],
                       dtype="handle"))
     T.evaluate(
         T.call_extern("ethosu_copy",
                       buffer4[0],
                       240,
                       p4[0],
                       dtype="handle"))
     # First ethosu_conv2d call: references placeholder[0] and p5[0] as its
     # tensor operands, and p1[0] / p2[0] with lengths 2608 / 240.
     T.evaluate(
         T.call_extern("ethosu_conv2d",
                       # Group referencing placeholder[0]; note 5376 == 56 * 96.
                       # NOTE(review): presumably the input feature-map
                       # descriptor; confirm against the extern spec.
                       "int8",
                       56,
                       56,
                       96,
                       56,
                       0,
                       56,
                       placeholder[0],
                       0,
                       0,
                       0,
                       T.float32(0.5),
                       10,
                       "NHWC",
                       5376,
                       96,
                       1,
                       # Group referencing p5[0]; note 1344 == 56 * 24.
                       # NOTE(review): presumably the output feature-map
                       # descriptor; confirm.
                       "int8",
                       56,
                       56,
                       24,
                       56,
                       0,
                       56,
                       p5[0],
                       0,
                       0,
                       0,
                       T.float32(0.25),
                       14,
                       "NHWC",
                       1344,
                       24,
                       1,
                       # Six integer arguments, all 1.
                       # NOTE(review): presumably kernel size, strides and
                       # dilation; confirm.
                       1,
                       1,
                       1,
                       1,
                       1,
                       1,
                       # p1[0] with its length (2608), followed by two
                       # T.int8(-1) arguments and the integer 12.
                       p1[0],
                       2608,
                       T.int8(-1),
                       T.int8(-1),
                       12,
                       # p2[0] with its length (240), followed by two
                       # T.int8(-1) arguments.
                       p2[0],
                       240,
                       T.int8(-1),
                       T.int8(-1),
                       # Remaining trailing arguments.
                       # NOTE(review): presumably padding, activation,
                       # rounding and upscale settings; confirm.
                       0,
                       0,
                       0,
                       0,
                       "NONE",
                       0,
                       0,
                       "TFL",
                       "NONE",
                       0,
                       0,
                       0,
                       dtype="handle"))
     # Second ethosu_conv2d call: same argument structure as the first, but
     # referencing p5[0] and p6[0] as tensor operands, and p3[0] / p4[0] with
     # lengths 736 / 240.
     T.evaluate(
         T.call_extern("ethosu_conv2d",
                       # Group referencing p5[0].
                       "int8",
                       56,
                       56,
                       24,
                       56,
                       0,
                       56,
                       p5[0],
                       0,
                       0,
                       0,
                       T.float32(0.5),
                       10,
                       "NHWC",
                       1344,
                       24,
                       1,
                       # Group referencing p6[0].
                       "int8",
                       56,
                       56,
                       24,
                       56,
                       0,
                       56,
                       p6[0],
                       0,
                       0,
                       0,
                       T.float32(0.25),
                       14,
                       "NHWC",
                       1344,
                       24,
                       1,
                       # Six integer arguments, all 1.
                       1,
                       1,
                       1,
                       1,
                       1,
                       1,
                       # p3[0] with its length (736), two T.int8(-1)
                       # arguments and the integer 12.
                       p3[0],
                       736,
                       T.int8(-1),
                       T.int8(-1),
                       12,
                       # p4[0] with its length (240) and two T.int8(-1)
                       # arguments.
                       p4[0],
                       240,
                       T.int8(-1),
                       T.int8(-1),
                       # Remaining trailing arguments, identical to those of
                       # the first conv2d call.
                       0,
                       0,
                       0,
                       0,
                       "NONE",
                       0,
                       0,
                       "TFL",
                       "NONE",
                       0,
                       0,
                       0,
                       dtype="handle"))
     # ethosu_binary_elementwise call with operator string "ADD": three
     # structurally identical groups reference p5[0], p6[0] and
     # ethosu_write[0], in that order.
     # NOTE(review): presumably two inputs followed by the output; confirm.
     T.evaluate(
         T.call_extern("ethosu_binary_elementwise",
                       # Group referencing p5[0].
                       "int8",
                       56,
                       56,
                       24,
                       56,
                       0,
                       56,
                       p5[0],
                       0,
                       0,
                       0,
                       T.float32(1),
                       0,
                       "NHWC",
                       1344,
                       24,
                       1,
                       # Group referencing p6[0].
                       "int8",
                       56,
                       56,
                       24,
                       56,
                       0,
                       56,
                       p6[0],
                       0,
                       0,
                       0,
                       T.float32(1),
                       0,
                       "NHWC",
                       1344,
                       24,
                       1,
                       # Group referencing ethosu_write[0].
                       "int8",
                       56,
                       56,
                       24,
                       56,
                       0,
                       56,
                       ethosu_write[0],
                       0,
                       0,
                       0,
                       T.float32(1),
                       0,
                       "NHWC",
                       1344,
                       24,
                       1,
                       # Operator string followed by the trailing arguments.
                       "ADD",
                       0,
                       "NONE",
                       0,
                       0,
                       "TFL",
                       0,
                       0,
                       0,
                       dtype="handle"))