Exemple #1
0
 def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_add_clip_cast_cast_subtract_fixed_point_15934180698220515269_(
         placeholder_16: T.handle, placeholder_17: T.handle,
         placeholder_18: T.handle, T_add: T.handle) -> None:
     # Fused 1x1 conv2d (75x75 spatial, 64 -> 256 channels) + per-channel
     # bias add, followed by a requantize round-trip: q_multiply_shift,
     # +132, clip to [0, 255], cast uint8 -> int32, -132, a second
     # q_multiply_shift, +136; the result is stored as int32.
     # NOTE(review): assumes `T` is tvm.script.tir and a @T.prim_func
     # decorator lives outside this chunk — confirm against the full file.
     # function attr dict
     T.func_attr({
         "global_symbol":
         "tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_add_clip_cast_cast_subtract_fixed_point_15934180698220515269_",
         "tir.noalias": True
     })
     placeholder_19 = T.match_buffer(placeholder_16, [360000],
                                     dtype="int16")
     placeholder_20 = T.match_buffer(placeholder_17, [16384], dtype="int16")
     placeholder_21 = T.match_buffer(placeholder_18, [256], dtype="int32")
     T_add_1 = T.match_buffer(T_add, [1440000], dtype="int32")
     # body
     # "Pad" stage is an identity copy: a 1x1 kernel needs no halo.
     PaddedInput_2 = T.allocate([360000], "int16", "global")
     for i0_i1_fused_2, i2_2, i3_2 in T.grid(75, 75, 64):
         PaddedInput_2[i0_i1_fused_2 * 4800 + i2_2 * 64 +
                       i3_2] = placeholder_19[i0_i1_fused_2 * 4800 +
                                              i2_2 * 64 + i3_2]
     # One spatial position per iteration (5625 = 75 * 75).
     for ax0_ax1_fused_ax2_fused_2 in T.serial(0, 5625):
         Conv2dOutput_2 = T.allocate([64], "int32", "global")
         # 256 output channels processed as 4 blocks of 64.
         for ax3_outer_1 in T.serial(0, 4):
             for ff_2 in T.serial(0, 64):
                 Conv2dOutput_2[ff_2] = 0
                 for rc_2 in T.serial(0, 64):
                     Conv2dOutput_2[ff_2] = Conv2dOutput_2[ff_2] + T.cast(
                         PaddedInput_2[ax0_ax1_fused_ax2_fused_2 * 64 +
                                       rc_2],
                         "int32") * T.cast(
                             placeholder_20[rc_2 * 256 + ax3_outer_1 * 64 +
                                            ff_2], "int32")
             # Bias add + double fixed-point requantize for this block.
             for ax3_inner_3 in T.serial(0, 64):
                 T_add_1[
                     ax0_ax1_fused_ax2_fused_2 * 256 + ax3_outer_1 * 64 +
                     ax3_inner_3] = T.q_multiply_shift(T.cast(
                         T.cast(
                             T.max(
                                 T.min(
                                     T.q_multiply_shift(
                                         Conv2dOutput_2[ax3_inner_3] +
                                         placeholder_21[ax3_outer_1 * 64 +
                                                        ax3_inner_3],
                                         1711626602,
                                         31,
                                         -8,
                                         dtype="int32") + 132, 255), 0),
                             "uint8"), "int32") - 132,
                                                       2094289803,
                                                       31,
                                                       -2,
                                                       dtype="int32") + 136
Exemple #2
0
 def tvmgen_default_fused_cast_subtract_fixed_point_multiply_add_clip_cast_cast(
         placeholder: T.handle, placeholder_1: T.handle,
         T_cast: T.handle) -> None:
     # Elementwise dequant/requant over a 75x75x64 tensor: cast uint8
     # input to int32, subtract the zero point (94), rescale via
     # q_multiply_shift, add a per-channel term, clip to [0, 255], then
     # cast uint8 -> int16.
     # NOTE(review): assumes `T` is tvm.script.tir and a @T.prim_func
     # decorator lives outside this chunk — confirm against the full file.
     # function attr dict
     T.func_attr({
         "global_symbol":
         "tvmgen_default_fused_cast_subtract_fixed_point_multiply_add_clip_cast_cast",
         "tir.noalias": True
     })
     placeholder_2 = T.match_buffer(placeholder, [360000], dtype="uint8")
     placeholder_3 = T.match_buffer(placeholder_1, [64], dtype="int32")
     T_cast_1 = T.match_buffer(T_cast, [360000], dtype="int16")
     # body
     # 64 channels iterated as 4 blocks of 16 (ax3_outer * 16 + ax3_inner).
     for ax0_ax1_fused, ax2, ax3_outer, ax3_inner in T.grid(75, 75, 4, 16):
         T_cast_1[
             ax0_ax1_fused * 4800 + ax2 * 64 + ax3_outer * 16 +
             ax3_inner] = T.cast(
                 T.cast(
                     T.max(
                         T.min(
                             T.q_multiply_shift(T.cast(
                                 placeholder_2[ax0_ax1_fused * 4800 +
                                               ax2 * 64 + ax3_outer * 16 +
                                               ax3_inner], "int32") - 94,
                                                1843157232,
                                                31,
                                                1,
                                                dtype="int32") +
                             placeholder_3[ax3_outer * 16 + ax3_inner],
                             255), 0), "uint8"), "int16")
def primfunc_local_allocates(placeholder_162: T.handle, placeholder_163: T.handle, placeholder_164: T.handle, T_cast_76: T.handle) -> None:
    # Fused 3x3 depthwise conv2d (14x14x512, pad=1) + per-channel bias add
    # + fixed-point requantize + clip to [0, 255] + uint8/int16 casts,
    # written with *scoped* `with T.allocate(...)` intermediates so each
    # temporary's live range is explicit.
    # NOTE(review): assumes `T` is tvm.script.tir and a @T.prim_func
    # decorator lives outside this chunk — confirm against the full file.
    # function attr dict
    T.func_attr({"global_symbol": "fused_nn_conv2d_add_cast_fixed_point_multiply_clip_cast_cast_9", "tir.noalias": True})
    placeholder_165 = T.match_buffer(placeholder_162, [100352], dtype="int16", elem_offset=0, align=128, offset_factor=1)
    placeholder_166 = T.match_buffer(placeholder_163, [4608], dtype="int16", elem_offset=0, align=128, offset_factor=1)
    placeholder_167 = T.match_buffer(placeholder_164, [512], dtype="int32", elem_offset=0, align=128, offset_factor=1)
    T_cast_77 = T.match_buffer(T_cast_76, [100352], dtype="int16", elem_offset=0, align=128, offset_factor=1)
    # Constant workspace; never read below — presumably kept for
    # allocation/planning analysis.  TODO(review): confirm it is intentional.
    sid_21 = T.allocate_const([0,1,2,3,4,5,6,7], "int8", [8])
    # body
    # Zero-pad the 14x14 input to 16x16 (halo of 1 for the 3x3 kernel).
    PaddedInput_25 = T.allocate([131072], "int16", "global")
    for i1_35, i2_46, i3_47 in T.grid(16, 16, 512):
        PaddedInput_25[(((i1_35*8192) + (i2_46*512)) + i3_47)] = T.if_then_else(((((1 <= i1_35) and (i1_35 < 15)) and (1 <= i2_46)) and (i2_46 < 15)), placeholder_165[((((i1_35*7168) + (i2_46*512)) + i3_47) - 7680)], T.int16(0), dtype="int16")
    T_add_11 = T.allocate([100352], "int32", "global")
    with T.allocate([100352], "int32", "global") as DepthwiseConv2d_11:
        # 3x3 depthwise conv: each of the 512 channels is convolved
        # independently (weights 4608 = 3 * 3 * 512).
        for i_11, j_11, c_11 in T.grid(14, 14, 512):
            DepthwiseConv2d_11[(((i_11*7168) + (j_11*512)) + c_11)] = 0
            for di_11, dj_11 in T.grid(3, 3):
                DepthwiseConv2d_11[(((i_11*7168) + (j_11*512)) + c_11)] = (DepthwiseConv2d_11[(((i_11*7168) + (j_11*512)) + c_11)] + (PaddedInput_25[(((((i_11*8192) + (di_11*8192)) + (j_11*512)) + (dj_11*512)) + c_11)].astype("int32")*placeholder_166[(((di_11*1536) + (dj_11*512)) + c_11)].astype("int32")))
        # Per-channel bias add.
        for ax1_44, ax2_45, ax3_47 in T.grid(14, 14, 512):
            T_add_11[(((ax1_44*7168) + (ax2_45*512)) + ax3_47)] = (DepthwiseConv2d_11[(((ax1_44*7168) + (ax2_45*512)) + ax3_47)] + placeholder_167[ax3_47])
    compute_22 = T.allocate([100352], "int32", "global")
    with T.allocate([100352], "int32", "global") as T_cast_78:
        for ax1_45, ax2_46, ax3_48 in T.grid(14, 14, 512):
            T_cast_78[(((ax1_45*7168) + (ax2_46*512)) + ax3_48)] = T_add_11[(((ax1_45*7168) + (ax2_46*512)) + ax3_48)]
        # Fixed-point requantize.
        for i1_36, i2_47, i3_48 in T.grid(14, 14, 512):
            compute_22[(((i1_36*7168) + (i2_47*512)) + i3_48)] = T.q_multiply_shift(T_cast_78[(((i1_36*7168) + (i2_47*512)) + i3_48)], 1948805937, 31, -5, dtype="int32")
    T_cast_79 = T.allocate([100352], "uint8", "global")
    with T.allocate([100352], "int32", "global") as compute_23:
        # Clip to the uint8 range [0, 255] before the narrowing cast.
        # FIX(review): the original wrote T.max(T.max(v, 255), 0), which
        # simplifies to max(v, 255) — it never clips the upper bound and
        # forces a floor of 255.  Every other requantize in this file
        # clips with T.max(T.min(v, 255), 0); made this one consistent.
        for i1_37, i2_48, i3_49 in T.grid(14, 14, 512):
            compute_23[(((i1_37*7168) + (i2_48*512)) + i3_49)] = T.max(T.min(compute_22[(((i1_37*7168) + (i2_48*512)) + i3_49)], 255), 0)
        for ax1_46, ax2_47, ax3_49 in T.grid(14, 14, 512):
            T_cast_79[(((ax1_46*7168) + (ax2_47*512)) + ax3_49)] = compute_23[(((ax1_46*7168) + (ax2_47*512)) + ax3_49)].astype("uint8")
    # Widen the clipped uint8 values back to the int16 output buffer.
    for ax1_47, ax2_48, ax3_50 in T.grid(14, 14, 512):
        T_cast_77[(((ax1_47*7168) + (ax2_48*512)) + ax3_50)] = T_cast_79[(((ax1_47*7168) + (ax2_48*512)) + ax3_50)].astype("int16")
def primfunc_global_allocates(placeholder_144: T.handle, placeholder_145: T.handle, placeholder_146: T.handle, T_cast_48: T.handle) -> None:
    # Fused 3x3 depthwise conv2d (14x14x512, pad=1) + per-channel bias add
    # + fixed-point requantize + clip to [0, 255] + uint8/int16 casts.
    # All intermediates are flat function-scope allocations; the conv
    # accumulator and the pad buffer are reused in-place by later stages.
    # NOTE(review): assumes `T` is tvm.script.tir and a @T.prim_func
    # decorator lives outside this chunk — confirm against the full file.
    # function attr dict
    T.func_attr({"global_symbol": "fused_nn_conv2d_add_cast_fixed_point_multiply_clip_cast_cast_13", "tir.noalias": True})
    placeholder_147 = T.match_buffer(placeholder_144, [100352], dtype="int16", elem_offset=0, align=128, offset_factor=1)
    placeholder_148 = T.match_buffer(placeholder_145, [4608], dtype="int16", elem_offset=0, align=128, offset_factor=1)
    placeholder_149 = T.match_buffer(placeholder_146, [512], dtype="int32", elem_offset=0, align=128, offset_factor=1)
    T_cast_49 = T.match_buffer(T_cast_48, [100352], dtype="int16", elem_offset=0, align=128, offset_factor=1)
    # body
    # Zero-pad the 14x14 input to 16x16 (halo of 1 for the 3x3 kernel).
    PaddedInput_22 = T.allocate([131072], "int16", "global")
    DepthwiseConv2d_9 = T.allocate([100352], "int32", "global")
    for i1_29, i2_39, i3_40 in T.grid(16, 16, 512):
        PaddedInput_22[(((i1_29*8192) + (i2_39*512)) + i3_40)] = T.if_then_else(((((1 <= i1_29) and (i1_29 < 15)) and (1 <= i2_39)) and (i2_39 < 15)), placeholder_147[((((i1_29*7168) + (i2_39*512)) + i3_40) - 7680)], T.int16(0), dtype="int16")
    # 3x3 depthwise conv: each of the 512 channels is convolved
    # independently (weights 4608 = 3 * 3 * 512).
    for i_9, j_9, c_9 in T.grid(14, 14, 512):
        DepthwiseConv2d_9[(((i_9*7168) + (j_9*512)) + c_9)] = 0
        for di_9, dj_9 in T.grid(3, 3):
            DepthwiseConv2d_9[(((i_9*7168) + (j_9*512)) + c_9)] = (DepthwiseConv2d_9[(((i_9*7168) + (j_9*512)) + c_9)] + (PaddedInput_22[(((((i_9*8192) + (di_9*8192)) + (j_9*512)) + (dj_9*512)) + c_9)].astype("int32")*placeholder_148[(((di_9*1536) + (dj_9*512)) + c_9)].astype("int32")))
    # Per-channel bias add, in place.
    for ax1_27, ax2_28, ax3_30 in T.grid(14, 14, 512):
        DepthwiseConv2d_9[(((ax1_27*7168) + (ax2_28*512)) + ax3_30)] = (DepthwiseConv2d_9[(((ax1_27*7168) + (ax2_28*512)) + ax3_30)] + placeholder_149[ax3_30])
    # Fixed-point requantize, in place.
    for i1_30, i2_40, i3_41 in T.grid(14, 14, 512):
        DepthwiseConv2d_9[(((i1_30*7168) + (i2_40*512)) + i3_41)] = T.q_multiply_shift(DepthwiseConv2d_9[(((i1_30*7168) + (i2_40*512)) + i3_41)], 1269068532, 31, -4, dtype="int32")
    # Clip to the uint8 range [0, 255] before the narrowing cast.
    # FIX(review): the original wrote T.max(T.max(v, 255), 0), which
    # simplifies to max(v, 255) — it never clips the upper bound and
    # forces a floor of 255.  Every other requantize in this file clips
    # with T.max(T.min(v, 255), 0); made this one consistent.
    for i1_31, i2_41, i3_42 in T.grid(14, 14, 512):
        DepthwiseConv2d_9[(((i1_31*7168) + (i2_41*512)) + i3_42)] = T.max(T.min(DepthwiseConv2d_9[(((i1_31*7168) + (i2_41*512)) + i3_42)], 255), 0)
    # Reuse the int16 pad buffer to stage the uint8-cast values (clipped
    # values fit in int16), then widen into the int16 output.
    for ax1_28, ax2_29, ax3_31 in T.grid(14, 14, 512):
        PaddedInput_22[(((ax1_28*7168) + (ax2_29*512)) + ax3_31)] = DepthwiseConv2d_9[(((ax1_28*7168) + (ax2_29*512)) + ax3_31)].astype("uint8")
    for ax1_29, ax2_30, ax3_32 in T.grid(14, 14, 512):
        T_cast_49[(((ax1_29*7168) + (ax2_30*512)) + ax3_32)] = PaddedInput_22[(((ax1_29*7168) + (ax2_30*512)) + ax3_32)].astype("int16")
Exemple #5
0
 def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast(
         placeholder_4: T.handle, placeholder_5: T.handle,
         placeholder_6: T.handle, T_cast_2: T.handle) -> None:
     # Fused 1x1 conv2d (75x75 spatial, 64 -> 64 channels) + bias add +
     # fixed-point requantize + clip to [0, 255] + cast uint8 -> int16,
     # written in the older T.load/T.store (flat-index) TVMScript dialect.
     # NOTE(review): assumes `T` is tvm.script.tir and a @T.prim_func
     # decorator lives outside this chunk — confirm against the full file.
     # function attr dict
     T.func_attr({
         "global_symbol":
         "tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast",
         "tir.noalias": True
     })
     placeholder_7 = T.match_buffer(placeholder_4, [1, 75, 75, 64],
                                    dtype="int16")
     placeholder_8 = T.match_buffer(placeholder_5, [1, 1, 64, 64],
                                    dtype="int16")
     placeholder_9 = T.match_buffer(placeholder_6, [1, 1, 1, 64],
                                    dtype="int32")
     T_cast_3 = T.match_buffer(T_cast_2, [1, 75, 75, 64], dtype="int16")
     # body
     # "Pad" stage is an identity copy: a 1x1 kernel needs no halo.
     PaddedInput = T.allocate([360000], "int16", "global")
     for i0_i1_fused, i2, i3 in T.grid(75, 75, 64):
         T.store(
             PaddedInput, i0_i1_fused * 4800 + i2 * 64 + i3,
             T.load("int16", placeholder_7.data,
                    i0_i1_fused * 4800 + i2 * 64 + i3), True)
     # One spatial position per iteration (5625 = 75 * 75).
     for ax0_ax1_fused_ax2_fused in T.serial(0, 5625):
         Conv2dOutput = T.allocate([64], "int32", "global")
         for ff in T.serial(0, 64):
             T.store(Conv2dOutput, ff, 0, True)
             for rc in T.serial(0, 64):
                 T.store(
                     Conv2dOutput, ff,
                     T.load("int32", Conv2dOutput, ff) + T.cast(
                         T.load("int16", PaddedInput,
                                ax0_ax1_fused_ax2_fused * 64 + rc), "int32")
                     * T.cast(
                         T.load("int16", placeholder_8.data, rc * 64 + ff),
                         "int32"), True)
         # Bias add + requantize + clip + narrow/widen casts per channel.
         for ax3_inner_1 in T.serial(0, 64):
             T.store(
                 T_cast_3.data, ax0_ax1_fused_ax2_fused * 64 + ax3_inner_1,
                 T.cast(
                     T.cast(
                         T.max(
                             T.min(
                                 T.q_multiply_shift(
                                     T.load("int32", Conv2dOutput,
                                            ax3_inner_1) +
                                     T.load("int32", placeholder_9.data,
                                            ax3_inner_1),
                                     1843106743,
                                     31,
                                     -6,
                                     dtype="int32"), 255), 0), "uint8"),
                     "int16"), True)
Exemple #6
0
 def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast_1(
         placeholder_10: T.handle, placeholder_11: T.handle,
         placeholder_12: T.handle, T_cast_4: T.handle) -> None:
     # Fused 3x3 conv2d (pad=1, 75x75 spatial, 64 -> 64 channels) + bias
     # add + fixed-point requantize + clip to [0, 255] + cast to int16.
     # The input is zero-padded onto a 77x77 canvas (379456 = 77*77*64).
     # NOTE(review): assumes `T` is tvm.script.tir and a @T.prim_func
     # decorator lives outside this chunk — confirm against the full file.
     # function attr dict
     T.func_attr({
         "global_symbol":
         "tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast_1",
         "tir.noalias": True
     })
     placeholder_13 = T.match_buffer(placeholder_10, [360000],
                                     dtype="int16")
     placeholder_14 = T.match_buffer(placeholder_11, [36864], dtype="int16")
     placeholder_15 = T.match_buffer(placeholder_12, [64], dtype="int32")
     T_cast_5 = T.match_buffer(T_cast_4, [360000], dtype="int16")
     # body
     # Zero-pad: rows/cols 1..75 copy the input, the 1-wide border is 0.
     PaddedInput_1 = T.allocate([379456], "int16", "global")
     for i0_i1_fused_1, i2_1, i3_1 in T.grid(77, 77, 64):
         PaddedInput_1[i0_i1_fused_1 * 4928 + i2_1 * 64 +
                       i3_1] = T.if_then_else(
                           1 <= i0_i1_fused_1 and i0_i1_fused_1 < 76
                           and 1 <= i2_1 and i2_1 < 76,
                           placeholder_13[i0_i1_fused_1 * 4800 + i2_1 * 64 +
                                          i3_1 - 4864],
                           T.int16(0),
                           dtype="int16")
     # One spatial position per iteration (5625 = 75 * 75).
     for ax0_ax1_fused_ax2_fused_1 in T.serial(0, 5625):
         Conv2dOutput_1 = T.allocate([64], "int32", "global")
         for ff_1 in T.serial(0, 64):
             Conv2dOutput_1[ff_1] = 0
             # Weights flattened from [3, 3, 64, 64]:
             # ry * 12288 + rx * 4096 + rc * 64 + ff.
             for ry, rx, rc_1 in T.grid(3, 3, 64):
                 Conv2dOutput_1[ff_1] = Conv2dOutput_1[ff_1] + T.cast(
                     PaddedInput_1[
                         T.floordiv(ax0_ax1_fused_ax2_fused_1, 75) * 4928 +
                         ry * 4928 + rx * 64 +
                         T.floormod(ax0_ax1_fused_ax2_fused_1, 75) * 64 +
                         rc_1], "int32") * T.cast(
                             placeholder_14[ry * 12288 + rx * 4096 +
                                            rc_1 * 64 + ff_1], "int32")
         # Bias add + requantize + clip + narrow/widen casts per channel.
         for ax3_inner_2 in T.serial(0, 64):
             T_cast_5[ax0_ax1_fused_ax2_fused_1 * 64 +
                      ax3_inner_2] = T.cast(
                          T.cast(
                              T.max(
                                  T.min(
                                      T.q_multiply_shift(
                                          Conv2dOutput_1[ax3_inner_2] +
                                          placeholder_15[ax3_inner_2],
                                          1608879842,
                                          31,
                                          -7,
                                          dtype="int32"), 255), 0),
                              "uint8"), "int16")
Exemple #7
0
 def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast(
         placeholder_4: T.handle, placeholder_5: T.handle,
         placeholder_6: T.handle, T_cast_2: T.handle) -> None:
     # Fused 1x1 conv2d (75x75 spatial, 64 -> 64 channels) + bias add +
     # fixed-point requantize + clip to [0, 255] + cast uint8 -> int16,
     # written in the buffer-subscript TVMScript dialect over flat buffers.
     # NOTE(review): assumes `T` is tvm.script.tir and a @T.prim_func
     # decorator lives outside this chunk — confirm against the full file.
     # function attr dict
     T.func_attr({
         "global_symbol":
         "tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast",
         "tir.noalias": True
     })
     placeholder_7 = T.match_buffer(placeholder_4, [360000], dtype="int16")
     placeholder_8 = T.match_buffer(placeholder_5, [4096], dtype="int16")
     placeholder_9 = T.match_buffer(placeholder_6, [64], dtype="int32")
     T_cast_3 = T.match_buffer(T_cast_2, [360000], dtype="int16")
     # body
     # "Pad" stage is an identity copy: a 1x1 kernel needs no halo.
     PaddedInput = T.allocate([360000], "int16", "global")
     for i0_i1_fused, i2, i3 in T.grid(75, 75, 64):
         PaddedInput[i0_i1_fused * 4800 + i2 * 64 +
                     i3] = placeholder_7[i0_i1_fused * 4800 + i2 * 64 + i3]
     # One spatial position per iteration (5625 = 75 * 75).
     for ax0_ax1_fused_ax2_fused in T.serial(0, 5625):
         Conv2dOutput = T.allocate([64], "int32", "global")
         for ff in T.serial(0, 64):
             Conv2dOutput[ff] = 0
             for rc in T.serial(0, 64):
                 Conv2dOutput[ff] = Conv2dOutput[ff] + T.cast(
                     PaddedInput[ax0_ax1_fused_ax2_fused * 64 + rc],
                     "int32") * T.cast(placeholder_8[rc * 64 + ff], "int32")
         # Bias add + requantize + clip + narrow/widen casts per channel.
         for ax3_inner_1 in T.serial(0, 64):
             T_cast_3[ax0_ax1_fused_ax2_fused * 64 + ax3_inner_1] = T.cast(
                 T.cast(
                     T.max(
                         T.min(
                             T.q_multiply_shift(Conv2dOutput[ax3_inner_1] +
                                                placeholder_9[ax3_inner_1],
                                                1843106743,
                                                31,
                                                -6,
                                                dtype="int32"), 255), 0),
                     "uint8"), "int16")
Exemple #8
0
 def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast(
         placeholder_62: T.handle, placeholder_63: T.handle,
         placeholder_64: T.handle, T_cast_20: T.handle) -> None:
     # Fused 7x7 conv2d, stride 2 (224x224x3 -> 112x112x64) + bias add +
     # fixed-point requantize + clip to [0, 255] + cast to uint8.  The
     # input is zero-padded onto a 229x229 canvas (157323 = 229*229*3)
     # with asymmetric padding (input occupies rows/cols 2..225).
     # NOTE(review): assumes `T` is tvm.script.tir and a @T.prim_func
     # decorator lives outside this chunk — confirm against the full file.
     # function attr dict
     T.func_attr({
         "global_symbol":
         "tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_clip_cast",
         "tir.noalias": True
     })
     placeholder_65 = T.match_buffer(placeholder_62, [1, 224, 224, 3],
                                     dtype="int16",
                                     elem_offset=0,
                                     align=128,
                                     offset_factor=1)
     placeholder_66 = T.match_buffer(placeholder_63, [7, 7, 3, 64],
                                     dtype="int16",
                                     elem_offset=0,
                                     align=128,
                                     offset_factor=1)
     placeholder_67 = T.match_buffer(placeholder_64, [1, 1, 1, 64],
                                     dtype="int32",
                                     elem_offset=0,
                                     align=128,
                                     offset_factor=1)
     T_cast_21 = T.match_buffer(T_cast_20, [1, 112, 112, 64],
                                dtype="uint8",
                                elem_offset=0,
                                align=128,
                                offset_factor=1)
     # body
     PaddedInput_7 = T.allocate([157323], "int16", "global")
     for i0_i1_fused_7 in T.serial(0, 229):
         for i2_7, i3_7 in T.grid(229, 3):
             T.store(
                 PaddedInput_7,
                 (((i0_i1_fused_7 * 687) + (i2_7 * 3)) + i3_7),
                 T.if_then_else(
                     ((((2 <= i0_i1_fused_7) and (i0_i1_fused_7 < 226)) and
                       (2 <= i2_7)) and (i2_7 < 226)),
                     T.load("int16", placeholder_65.data,
                            ((((i0_i1_fused_7 * 672) +
                               (i2_7 * 3)) + i3_7) - 1350)),
                     T.int16(0),
                     dtype="int16"), True)
     # One output position per iteration (12544 = 112 * 112); stride 2 is
     # expressed by the index factors 1374 (= 2 rows * 687) and 6
     # (= 2 cols * 3 channels).
     for ax0_ax1_fused_ax2_fused_7 in T.serial(0, 12544):
         Conv2dOutput_7 = T.allocate([64], "int32", "global")
         for ff_3 in T.serial(0, 64):
             T.store(Conv2dOutput_7, ff_3, 0, True)
             for ry_2, rx_2, rc_7 in T.grid(7, 7, 3):
                 T.store(
                     Conv2dOutput_7, ff_3,
                     (T.load("int32", Conv2dOutput_7, ff_3) + (T.cast(
                         T.load("int16", PaddedInput_7, ((
                             (((T.floordiv(ax0_ax1_fused_ax2_fused_7, 112) *
                                1374) + (ry_2 * 687)) +
                              (T.floormod(ax0_ax1_fused_ax2_fused_7, 112) *
                               6)) +
                             (rx_2 * 3)) + rc_7)), "int32") * T.cast(
                                 T.load("int16", placeholder_66.data,
                                        ((((ry_2 * 1344) + (rx_2 * 192)) +
                                          (rc_7 * 64)) + ff_3)), "int32"))),
                     True)
         # Bias add + requantize + clip + uint8 cast per output channel.
         for ax3_inner_7 in T.serial(0, 64):
             T.store(
                 T_cast_21.data,
                 ((ax0_ax1_fused_ax2_fused_7 * 64) + ax3_inner_7),
                 T.cast(
                     T.max(
                         T.min(
                             T.q_multiply_shift(
                                 (T.load("int32", Conv2dOutput_7,
                                         ax3_inner_7) +
                                  T.load("int32", placeholder_67.data,
                                         ax3_inner_7)),
                                 1939887962,
                                 31,
                                 -9,
                                 dtype="int32"), 255), 0), "uint8"), True)
Exemple #9
0
 def tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_add_clip_cast_cast_subtract_fixed_point_4200876283395191415_(
         placeholder_22: T.handle, placeholder_23: T.handle,
         placeholder_24: T.handle, placeholder_25: T.handle,
         T_cast_6: T.handle) -> None:
     # Fused 1x1 conv2d (75x75 spatial, 64 -> 256 channels) + bias add,
     # then a double requantize (q_multiply_shift, +136, clip, uint8
     # round-trip, -136, second q_multiply_shift), an elementwise
     # residual add of placeholder_28, and a final clip to [0, 255] with
     # a uint8 cast into T_cast_7.
     # NOTE(review): assumes `T` is tvm.script.tir and a @T.prim_func
     # decorator lives outside this chunk — confirm against the full file.
     # function attr dict
     T.func_attr({
         "global_symbol":
         "tvmgen_default_fused_nn_conv2d_add_fixed_point_multiply_add_clip_cast_cast_subtract_fixed_point_4200876283395191415_",
         "tir.noalias": True
     })
     placeholder_29 = T.match_buffer(placeholder_22, [360000],
                                     dtype="int16")
     placeholder_27 = T.match_buffer(placeholder_23, [16384], dtype="int16")
     placeholder_26 = T.match_buffer(placeholder_24, [256], dtype="int32")
     placeholder_28 = T.match_buffer(placeholder_25, [1440000],
                                     dtype="int32")
     T_cast_7 = T.match_buffer(T_cast_6, [1440000], dtype="uint8")
     # body
     # "Pad" stage is an identity copy: a 1x1 kernel needs no halo.
     PaddedInput_3 = T.allocate([360000], "int16", "global")
     for i0_i1_fused_3, i2_3, i3_3 in T.grid(75, 75, 64):
         PaddedInput_3[i0_i1_fused_3 * 4800 + i2_3 * 64 +
                       i3_3] = placeholder_29[i0_i1_fused_3 * 4800 +
                                              i2_3 * 64 + i3_3]
     # One spatial position per iteration (5625 = 75 * 75).
     for ax0_ax1_fused_ax2_fused_3 in T.serial(0, 5625):
         Conv2dOutput_3 = T.allocate([64], "int32", "global")
         # 256 output channels processed as 4 blocks of 64.
         for ax3_outer_2 in T.serial(0, 4):
             for ff_3 in T.serial(0, 64):
                 Conv2dOutput_3[ff_3] = 0
                 for rc_3 in T.serial(0, 64):
                     Conv2dOutput_3[ff_3] = Conv2dOutput_3[ff_3] + T.cast(
                         PaddedInput_3[ax0_ax1_fused_ax2_fused_3 * 64 +
                                       rc_3],
                         "int32") * T.cast(
                             placeholder_27[rc_3 * 256 + ax3_outer_2 * 64 +
                                            ff_3], "int32")
             # Double requantize + residual add + final clip per channel.
             for ax3_inner_4 in T.serial(0, 64):
                 T_cast_7[
                     ax0_ax1_fused_ax2_fused_3 * 256 + ax3_outer_2 * 64 +
                     ax3_inner_4] = T.cast(
                         T.max(
                             T.min(
                                 T.q_multiply_shift(T.cast(
                                     T.cast(
                                         T.max(
                                             T.min(
                                                 T.q_multiply_shift(
                                                     Conv2dOutput_3[
                                                         ax3_inner_4] +
                                                     placeholder_26[
                                                         ax3_outer_2 * 64 +
                                                         ax3_inner_4],
                                                     1343014664,
                                                     31,
                                                     -8,
                                                     dtype="int32") + 136,
                                                 255), 0), "uint8"),
                                     "int32") - 136,
                                                    1073903788,
                                                    31,
                                                    1,
                                                    dtype="int32") +
                                 placeholder_28[ax0_ax1_fused_ax2_fused_3 *
                                                256 + ax3_outer_2 * 64 +
                                                ax3_inner_4], 255), 0),
                         "uint8")
def fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast_2(
        placeholder_30: T.handle, placeholder_31: T.handle,
        placeholder_32: T.handle, T_cast_8: T.handle) -> None:
    # Fused 1x1 conv2d (28x28 spatial, 192 -> 16 channels) + bias add +
    # fixed-point requantize + clip to [0, 255] + cast uint8 -> int16,
    # with T.parallel outer loops and a single-element accumulator, in
    # the T.load/T.store TVMScript dialect.
    # NOTE(review): assumes `T` is tvm.script.tir and a @T.prim_func
    # decorator lives outside this chunk — confirm against the full file.
    # function attr dict
    T.func_attr({
        "global_symbol":
        "fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast_2",
        "tir.noalias": True
    })
    placeholder_33 = T.match_buffer(placeholder_30, [1, 28, 28, 192],
                                    dtype="int16",
                                    elem_offset=0,
                                    align=128,
                                    offset_factor=1)
    placeholder_34 = T.match_buffer(placeholder_31, [1, 1, 192, 16],
                                    dtype="int16",
                                    elem_offset=0,
                                    align=128,
                                    offset_factor=1)
    placeholder_35 = T.match_buffer(placeholder_32, [1, 1, 1, 16],
                                    dtype="int32",
                                    elem_offset=0,
                                    align=128,
                                    offset_factor=1)
    T_cast_9 = T.match_buffer(T_cast_8, [1, 28, 28, 16],
                              dtype="int16",
                              elem_offset=0,
                              align=128,
                              offset_factor=1)
    # body
    # "Pad" stage is an identity copy: a 1x1 kernel needs no halo.
    PaddedInput_3 = T.allocate([1, 28, 28, 192], "int16", "global")
    for i0_i1_fused_3 in T.parallel(0, 28):
        for i2_3, i3_3 in T.grid(28, 192):
            T.store(
                PaddedInput_3,
                (((i0_i1_fused_3 * 5376) + (i2_3 * 192)) + i3_3),
                T.load("int16", placeholder_33.data,
                       (((i0_i1_fused_3 * 5376) + (i2_3 * 192)) + i3_3)), True)
    # One spatial position per parallel iteration (784 = 28 * 28).
    for ax0_ax1_fused_ax2_fused_3 in T.parallel(0, 784):
        for ax3_2 in T.serial(0, 16):
            # Scalar accumulator for a single output channel.
            Conv2dOutput_3 = T.allocate([1, 1, 1, 1], "int32", "global")
            T.store(Conv2dOutput_3, 0, 0, True)
            for rc_3 in T.serial(0, 192):
                T.store(Conv2dOutput_3, 0,
                        (T.load("int32", Conv2dOutput_3, 0) + (T.cast(
                            T.load("int16", PaddedInput_3,
                                   ((ax0_ax1_fused_ax2_fused_3 * 192) + rc_3)),
                            "int32") * T.cast(
                                T.load("int16", placeholder_34.data,
                                       ((rc_3 * 16) + ax3_2)), "int32"))),
                        True)
            # Bias add + requantize + clip + narrow/widen casts.
            T.store(
                T_cast_9.data, ((ax0_ax1_fused_ax2_fused_3 * 16) + ax3_2),
                T.cast(
                    T.cast(
                        T.max(
                            T.min(
                                T.q_multiply_shift(
                                    (T.load("int32", Conv2dOutput_3, 0) +
                                     T.load("int32", placeholder_35.data,
                                            ax3_2)),
                                    1764006585,
                                    31,
                                    -7,
                                    dtype="int32"), 255), 0), "uint8"),
                    "int16"), True)
def fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast_2(placeholder_30: T.handle, placeholder_31: T.handle, placeholder_32: T.handle, T_cast_8: T.handle) -> None:
    # Fused 1x1 conv2d (28x28 spatial, 192 -> 16 channels) + bias add +
    # fixed-point requantize + clip to [0, 255] + cast uint8 -> int16,
    # over flat buffers with T.parallel outer loops and a scalar
    # accumulator (buffer-subscript dialect).
    # NOTE(review): assumes `T` is tvm.script.tir and a @T.prim_func
    # decorator lives outside this chunk — confirm against the full file.
    # function attr dict
    T.func_attr({"global_symbol": "fused_nn_conv2d_add_fixed_point_multiply_clip_cast_cast_2", "tir.noalias": True})
    placeholder_33 = T.match_buffer(placeholder_30, [150528], dtype="int16", elem_offset=0, align=128, offset_factor=1)
    placeholder_34 = T.match_buffer(placeholder_31, [3072], dtype="int16", elem_offset=0, align=128, offset_factor=1)
    placeholder_35 = T.match_buffer(placeholder_32, [16], dtype="int32", elem_offset=0, align=128, offset_factor=1)
    T_cast_9 = T.match_buffer(T_cast_8, [12544], dtype="int16", elem_offset=0, align=128, offset_factor=1)
    # body
    # "Pad" stage is an identity copy: a 1x1 kernel needs no halo.
    PaddedInput_3 = T.allocate([150528], "int16", "global")
    for i0_i1_fused_3 in T.parallel(0, 28):
        for i2_3, i3_3 in T.grid(28, 192):
            PaddedInput_3[(((i0_i1_fused_3*5376) + (i2_3*192)) + i3_3) ] = placeholder_33[(((i0_i1_fused_3*5376) + (i2_3*192)) + i3_3)]
    # One spatial position per parallel iteration (784 = 28 * 28).
    for ax0_ax1_fused_ax2_fused_3 in T.parallel(0, 784):
        for ax3_2 in T.serial(0, 16):
            # Scalar accumulator for a single output channel.
            Conv2dOutput_3 = T.allocate([1], "int32", "global")
            Conv2dOutput_3[0] = 0
            for rc_3 in T.serial(0, 192):
                Conv2dOutput_3[0] = (Conv2dOutput_3[0] + (T.cast(PaddedInput_3[((ax0_ax1_fused_ax2_fused_3*192) + rc_3)], "int32")*T.cast(placeholder_34[((rc_3*16) + ax3_2)], "int32")))
            # Bias add + requantize + clip + narrow/widen casts.
            T_cast_9[((ax0_ax1_fused_ax2_fused_3*16) + ax3_2)] = T.cast(T.cast(T.max(T.min(T.q_multiply_shift((Conv2dOutput_3[0] + placeholder_35[ax3_2]), 1764006585, 31, -7, dtype="int32"), 255), 0), "uint8"), "int16")