def main(a: T.handle, b: T.handle) -> None:
    # function attr dict
    T.func_attr({"global_symbol": "main", "tir.noalias": True})
    # var definition
    threadIdx_x = T.env_thread("threadIdx.x")
    threadIdx_y = T.env_thread("threadIdx.y")
    blockIdx_x = T.env_thread("blockIdx.x")
    blockIdx_y = T.env_thread("blockIdx.y")
    blockIdx_z = T.env_thread("blockIdx.z")
    A = T.match_buffer(a, [14, 14, 256, 256], dtype="float32")
    B = T.match_buffer(b, [14, 14, 512, 256], dtype="float32")
    # body
    T.launch_thread(blockIdx_z, 196)
    B_local = T.allocate([64], "float32", "local")
    Apad_shared = T.allocate([512], "float32", "shared")
    Apad_shared_local = T.allocate([8], "float32", "local")
    T.launch_thread(blockIdx_y, 8)
    T.launch_thread(blockIdx_x, 4)
    T.launch_thread(threadIdx_y, 8)
    T.launch_thread(threadIdx_x, 8)
    for ff_c_init, nn_c_init in T.grid(8, 8):
        T.store(B_local, ff_c_init * 8 + nn_c_init, T.float32(0), True)
    for rc_outer, ry, rx in T.grid(32, 3, 3):
        for ax3_inner_outer in T.serial(0, 2):
            T.store(
                Apad_shared,
                T.ramp(threadIdx_y * 64 + threadIdx_x * 8 + ax3_inner_outer * 4, 1, 4),
                T.if_then_else(
                    1 <= blockIdx_z // 14 + ry
                    and blockIdx_z // 14 + ry < 15
                    and 1 <= rx + blockIdx_z % 14
                    and rx + blockIdx_z % 14 < 15,
                    T.load(
                        "float32x4",
                        A.data,
                        T.ramp(
                            ry * 917504 + blockIdx_z * 65536 + rx * 65536
                            + rc_outer * 2048 + threadIdx_y * 256 + blockIdx_x * 64
                            + threadIdx_x * 8 + ax3_inner_outer * 4 - 983040,
                            1,
                            4,
                        ),
                        T.broadcast(True, 4),
                    ),
                    T.broadcast(T.float32(0), 4),
                    dtype="float32x4",
                ),
                T.broadcast(True, 4),
            )
        for rc_inner in T.serial(0, 8):
            for ax3 in T.serial(0, 8):
                T.store(
                    Apad_shared_local,
                    ax3,
                    T.load("float32", Apad_shared, rc_inner * 64 + threadIdx_x * 8 + ax3),
                    True,
                )
            for ff_c, nn_c in T.grid(8, 8):
                T.store(
                    B_local,
                    ff_c * 8 + nn_c,
                    T.load("float32", B_local, ff_c * 8 + nn_c)
                    + T.load("float32", Apad_shared_local, nn_c),
                    True,
                )
    for ff_inner_inner_inner, nn_inner_inner_inner in T.grid(8, 8):
        T.store(
            B.data,
            blockIdx_z * 131072 + blockIdx_y * 16384 + threadIdx_y * 2048
            + ff_inner_inner_inner * 256 + blockIdx_x * 64 + threadIdx_x * 8
            + nn_inner_inner_inner,
            T.load("float32", B_local, ff_inner_inner_inner * 8 + nn_inner_inner_inner),
            True,
        )
def tvmgen_default_fused_cast_subtract(placeholder_2: T.handle, placeholder_3: T.handle, T_subtract: T.handle) -> None:
    # function attr dict
    T.func_attr({"global_symbol": "tvmgen_default_fused_cast_subtract", "tir.noalias": True})
    placeholder_4 = T.match_buffer(placeholder_2, [1, 224, 224, 3], dtype="uint8", elem_offset=0, align=128, offset_factor=1)
    placeholder_5 = T.match_buffer(placeholder_3, [], dtype="int16", elem_offset=0, align=128, offset_factor=1)
    T_subtract_1 = T.match_buffer(T_subtract, [1, 224, 224, 3], dtype="int16", elem_offset=0, align=128, offset_factor=1)
    # body
    for ax0_ax1_fused_1 in T.serial(0, 224):
        for ax2_1, ax3_inner_1 in T.grid(224, 3):
            T.store(
                T_subtract_1.data,
                ax0_ax1_fused_1 * 672 + ax2_1 * 3 + ax3_inner_1,
                T.cast(
                    T.load("uint8", placeholder_4.data, ax0_ax1_fused_1 * 672 + ax2_1 * 3 + ax3_inner_1),
                    "int16",
                ) - T.load("int16", placeholder_5.data, 0),
                True,
            )
def partitioned_concat_3(
    placeholder: T.Buffer[(1, 64, 28, 28), "int8"],
    placeholder_1: T.Buffer[(1, 32, 28, 28), "int8"],
    placeholder_2: T.Buffer[(1, 32, 28, 28), "int8"],
    T_concat: T.Buffer[(1, 128, 28, 28), "int8"],
) -> None:
    for i1, i2, i3 in T.grid(64, 28, 28):
        T.store(
            T_concat.data,
            i1 * 784 + i2 * 28 + i3,
            T.load("int8", placeholder.data, i1 * 784 + i2 * 28 + i3),
            True,
        )
    for i1, i2, i3 in T.grid(32, 28, 28):
        T.store(
            T_concat.data,
            i1 * 784 + i2 * 28 + i3 + 50176,
            T.load("int8", placeholder_1.data, i1 * 784 + i2 * 28 + i3),
            True,
        )
    for i1, i2, i3 in T.grid(32, 28, 28):
        T.store(
            T_concat.data,
            i1 * 784 + i2 * 28 + i3 + 75264,
            T.load("int8", placeholder_2.data, i1 * 784 + i2 * 28 + i3),
            True,
        )
def unified_element_wise_kernels_with_different_size(
    a: T.handle, b: T.handle, c: T.handle, d: T.handle
) -> None:
    block_x = T.env_thread("blockIdx.x")
    thread_x = T.env_thread("threadIdx.x")
    block_x_1 = T.env_thread("blockIdx.x")
    thread_x_1 = T.env_thread("threadIdx.x")
    A = T.match_buffer(a, [128, 128])
    B = T.match_buffer(b, [128, 128])
    C = T.match_buffer(c, [256, 256])
    D = T.match_buffer(d, [256, 256])
    with T.launch_thread(block_x, 128):
        T.launch_thread(thread_x, 128)
        T.store(
            B.data,
            block_x * 128 + thread_x,
            T.load("float32", A.data, block_x * 128 + thread_x) * 2.0,
            True,
        )
    T.launch_thread(block_x_1, 256)
    T.launch_thread(thread_x_1, 256)
    T.store(
        D.data,
        block_x_1 * 256 + thread_x_1,
        T.load("float32", C.data, block_x_1 * 256 + thread_x_1) + 1.0,
        True,
    )
def concat_func_3(
    placeholder: T.Buffer[(1, 64, 28, 28), "int8"],
    placeholder_1: T.Buffer[(1, 32, 28, 28), "int8"],
    placeholder_2: T.Buffer[(1, 32, 28, 28), "int8"],
    T_concat: T.Buffer[(1, 128, 28, 28), "int8"],
) -> None:
    for i1 in T.serial(128, annotations={"pragma_loop_partition_hint": 1}):
        for i2, i3 in T.grid(28, 28):
            if 96 <= i1:
                T.store(
                    T_concat.data,
                    i1 * 784 + i2 * 28 + i3,
                    T.load("int8", placeholder_2.data, i1 * 784 + i2 * 28 + i3 - 75264),
                    True,
                )
            if 64 <= i1 and i1 < 96:
                T.store(
                    T_concat.data,
                    i1 * 784 + i2 * 28 + i3,
                    T.load("int8", placeholder_1.data, i1 * 784 + i2 * 28 + i3 - 50176),
                    True,
                )
            if i1 < 64:
                T.store(
                    T_concat.data,
                    i1 * 784 + i2 * 28 + i3,
                    T.load("int8", placeholder.data, i1 * 784 + i2 * 28 + i3),
                    True,
                )
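# Hedged sketch (not part of the original fixtures): concat_func_3 above carries a
# "pragma_loop_partition_hint" annotation, which the LoopPartition pass consumes.
# A harness like the one below, assuming the public tvm.tir.transform API, would
# split the guarded single loop into per-input loops such as partitioned_concat_3.
import tvm

def run_loop_partition(mod: tvm.IRModule) -> tvm.IRModule:
    # partition_const_loop lets the pass split loops with constant extents.
    with tvm.transform.PassContext(
        config={"tir.LoopPartition": {"partition_const_loop": True}}
    ):
        mod = tvm.tir.transform.LoopPartition()(mod)
        mod = tvm.tir.transform.Simplify()(mod)
    return mod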
def tvmgen_default_fused_cast_subtract_fixed_point_multiply_add_clip_cast_cast(
    placeholder: T.handle, placeholder_1: T.handle, T_cast: T.handle
) -> None:
    # function attr dict
    T.func_attr({
        "global_symbol": "tvmgen_default_fused_cast_subtract_fixed_point_multiply_add_clip_cast_cast",
        "tir.noalias": True,
    })
    placeholder_2 = T.match_buffer(placeholder, [1, 75, 75, 64], dtype="uint8")
    placeholder_3 = T.match_buffer(placeholder_1, [64], dtype="int32")
    T_cast_1 = T.match_buffer(T_cast, [1, 75, 75, 64], dtype="int16")
    # body
    for ax0_ax1_fused, ax2, ax3_outer, ax3_inner in T.grid(75, 75, 4, 16):
        T.store(
            T_cast_1.data,
            ax0_ax1_fused * 4800 + ax2 * 64 + ax3_outer * 16 + ax3_inner,
            T.cast(
                T.cast(
                    T.max(
                        T.min(
                            T.q_multiply_shift(
                                T.cast(
                                    T.load(
                                        "uint8",
                                        placeholder_2.data,
                                        ax0_ax1_fused * 4800 + ax2 * 64 + ax3_outer * 16 + ax3_inner,
                                    ),
                                    "int32",
                                ) - 94,
                                1843157232,
                                31,
                                1,
                                dtype="int32",
                            ) + T.load("int32", placeholder_3.data, ax3_outer * 16 + ax3_inner),
                            255,
                        ),
                        0,
                    ),
                    "uint8",
                ),
                "int16",
            ),
            True,
        )
def unified_element_wise_thread_x(a: T.handle, b: T.handle, c: T.handle) -> None:
    thread_x = T.env_thread("threadIdx.x")
    block_x = T.env_thread("blockIdx.x")
    A = T.match_buffer(a, [128, 128])
    B = T.match_buffer(b, [128, 128])
    C = T.match_buffer(c, [128, 128])
    T.launch_thread(block_x, 128)
    with T.launch_thread(thread_x, 4):
        for j0_1 in T.serial(0, 32):
            T.store(
                B.data,
                block_x * 128 + thread_x * 32 + j0_1,
                T.load("float32", A.data, block_x * 128 + thread_x * 32 + j0_1) * 2.0,
                True,
            )
    T.launch_thread(thread_x, 4)
    for j1_1 in T.serial(0, 32):
        T.store(
            C.data,
            block_x * 128 + thread_x * 32 + j1_1,
            T.load("float32", A.data, block_x * 128 + thread_x * 32 + j1_1) + 1.0,
            True,
        )
def flattened_elementwise_func(a: T.handle, c: T.handle) -> None:
    A = T.match_buffer(a, (16, 16), "float32")
    C = T.match_buffer(c, (16, 16), "float32")
    for i in T.serial(0, 16):
        B_new = T.allocate([16], "float32", "global")
        for j in T.serial(0, 16):
            B_new[j] = T.load("float32", A.data, i * 16 + j) + 1.0
        for j in T.serial(0, 16):
            C.data[i * 16 + j] = T.load("float32", B_new, j) * 2.0
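# Hedged sketch: flattened_elementwise_func above is the flattened form one would
# expect after buffer flattening. The helper below is an assumption-based harness
# (the names `before` and `expected` are illustrative, not from the source).
import tvm

def check_flatten_buffer(before: tvm.IRModule, expected: tvm.IRModule) -> None:
    # Run the flattening pass and compare against the expected fixture.
    after = tvm.tir.transform.FlattenBuffer()(before)
    tvm.ir.assert_structural_equal(after, expected)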
def main(placeholder: ty.handle, placeholder_1: ty.handle, placeholder_2: ty.handle, ethosu_write: ty.handle) -> None:
    # function attr dict
    tir.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
    ethosu_write_1 = tir.match_buffer(ethosu_write, [1, 8, 6, 16], dtype="int8", elem_offset=0, align=128, offset_factor=1)
    buffer = tir.match_buffer(placeholder_2, [160], dtype="uint8", elem_offset=0, align=128, offset_factor=1)
    placeholder_3 = tir.match_buffer(placeholder, [192], dtype="int8", elem_offset=0, align=128, offset_factor=1)
    buffer_1 = tir.match_buffer(placeholder_1, [848], dtype="uint8", elem_offset=0, align=128, offset_factor=1)
    # body
    tir.evaluate(tir.call_extern("ethosu_conv2d", "int8", 5, 6, 4, 5, 0, 6, tir.load("int8", placeholder_3.data, 0), 0, 0, 0, tir.float32(0.5), 10, "NHWC", 24, 4, 1, "int8", 4, 6, 16, 4, 0, 6, tir.load("int8", ethosu_write_1.data, 0), 0, 0, 0, tir.float32(0.25), 14, "NHWC", 96, 16, 1, 3, 3, 1, 1, 1, 1, tir.load("uint8", buffer_1.data, 0), 848, 12, tir.load("uint8", buffer.data, 0), 160, 1, 1, 0, 1, "NONE", 0, 0, "NONE", dtype="handle"))
    tir.evaluate(tir.call_extern("ethosu_conv2d", "int8", 5, 6, 4, 5, 0, 6, tir.load("int8", placeholder_3.data, 72), 0, 0, 0, tir.float32(0.5), 10, "NHWC", 24, 4, 1, "int8", 4, 6, 16, 4, 0, 6, tir.load("int8", ethosu_write_1.data, 384), 0, 0, 0, tir.float32(0.25), 14, "NHWC", 96, 16, 1, 3, 3, 1, 1, 1, 1, tir.load("uint8", buffer_1.data, 0), 848, 12, tir.load("uint8", buffer.data, 0), 160, 0, 1, 1, 1, "NONE", 0, 0, "NONE", dtype="handle"))
def flattened_multi_alloc_func(a: T.handle, d: T.handle) -> None:
    A = T.match_buffer(a, (32,), "float32")
    D = T.match_buffer(d, (32,), "float32")
    for i in range(0, 32):
        B = T.allocate((32,), "float32", "global")
        C = T.allocate((32,), "float32", "global")
        B[i] = T.load("float32", A.data, i) + 1.0
        C[i] = T.load("float32", A.data, i) + T.load("float32", B, i)
        D.data[i] = T.load("float32", C, i) * 2.0
def main(placeholder_3: T.Buffer[(1, 16, 16, 32), "int8"], ethosu_write_1: T.Buffer[(1, 16, 16, 8), "int8"]) -> None:
    # function attr dict
    T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
    buffer = T.buffer_var("uint8", "")
    buffer_1 = T.buffer_var("uint8", "")
    # body
    placeholder_global = T.allocate([304], "uint8", "global", annotations={"disable_lower_builtin": True})
    placeholder_d_global = T.allocate([80], "uint8", "global", annotations={"disable_lower_builtin": True})
    T.evaluate(T.call_extern("ethosu_copy", T.load("uint8", buffer_1, 0), 304, T.load("uint8", placeholder_global, 0), dtype="handle"))
    T.evaluate(T.call_extern("ethosu_copy", T.load("uint8", buffer, 0), 80, T.load("uint8", placeholder_d_global, 0), dtype="handle"))
    T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, T.load("int8", placeholder_3.data, 0), 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 8, 16, 0, 16, T.load("int8", ethosu_write_1.data, 0), 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, T.load("uint8", placeholder_global, 0), 304, 12, T.load("uint8", placeholder_d_global, 0), 80, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", dtype="handle"))
def flattened_symbolic_func(a: T.handle, c: T.handle, n: T.int32, m: T.int32) -> None:
    A = T.match_buffer(a, (n, m), "float32")
    C = T.match_buffer(c, (n, m), "float32")
    for i in range(0, n):
        B = T.allocate([m], "float32", "global")
        for j in range(0, m):
            B[j] = T.load("float32", A.data, i * m + j) + 1.0
        for j in range(0, m):
            C.data[i * m + j] = T.load("float32", B, j) * 2.0
def tvmgen_default_fused_nn_max_pool2d_cast(placeholder_28: T.handle, T_cast_6: T.handle) -> None:
    # function attr dict
    T.func_attr({"global_symbol": "tvmgen_default_fused_nn_max_pool2d_cast", "tir.noalias": True})
    placeholder_29 = T.match_buffer(placeholder_28, [1, 112, 112, 64], dtype="uint8", elem_offset=0, align=128, offset_factor=1)
    T_cast_7 = T.match_buffer(T_cast_6, [1, 56, 56, 64], dtype="int16", elem_offset=0, align=128, offset_factor=1)
    # body
    tensor_2 = T.allocate([200704], "uint8", "global")
    for ax0_ax1_fused_4 in T.serial(0, 56):
        for ax2_4 in T.serial(0, 56):
            for ax3_init in T.serial(0, 64):
                T.store(tensor_2, ax0_ax1_fused_4 * 3584 + ax2_4 * 64 + ax3_init, T.uint8(0), True)
            for rv0_rv1_fused_1, ax3_2 in T.grid(9, 64):
                T.store(
                    tensor_2,
                    ax0_ax1_fused_4 * 3584 + ax2_4 * 64 + ax3_2,
                    T.max(
                        T.load("uint8", tensor_2, ax0_ax1_fused_4 * 3584 + ax2_4 * 64 + ax3_2),
                        T.if_then_else(
                            ax0_ax1_fused_4 * 2 + T.floordiv(rv0_rv1_fused_1, 3) < 112
                            and ax2_4 * 2 + T.floormod(rv0_rv1_fused_1, 3) < 112,
                            T.load(
                                "uint8",
                                placeholder_29.data,
                                ax0_ax1_fused_4 * 14336
                                + T.floordiv(rv0_rv1_fused_1, 3) * 7168
                                + ax2_4 * 128
                                + T.floormod(rv0_rv1_fused_1, 3) * 64
                                + ax3_2,
                            ),
                            T.uint8(0),
                            dtype="uint8",
                        ),
                    ),
                    True,
                )
    for ax0_ax1_fused_5 in T.serial(0, 56):
        for ax2_5, ax3_3 in T.grid(56, 64):
            T.store(
                T_cast_7.data,
                ax0_ax1_fused_5 * 3584 + ax2_5 * 64 + ax3_3,
                T.cast(T.load("uint8", tensor_2, ax0_ax1_fused_5 * 3584 + ax2_5 * 64 + ax3_3), "int16"),
                True,
            )
def test_tir_fma(A: T.handle, B: T.handle, C: T.handle, d: T.handle) -> None:
    # function attr dict
    T.func_attr({"global_symbol": "test_fma", "tir.noalias": True})
    n = T.var("int32")
    stride = T.var("int32")
    stride_1 = T.var("int32")
    stride_2 = T.var("int32")
    stride_3 = T.var("int32")
    A_1 = T.match_buffer(A, [n], strides=[stride], elem_offset=0, align=128, offset_factor=1, type="auto")
    B_1 = T.match_buffer(B, [n], strides=[stride_1], elem_offset=0, align=128, offset_factor=1, type="auto")
    C_1 = T.match_buffer(C, [n], strides=[stride_2], elem_offset=0, align=128, offset_factor=1, type="auto")
    d_1 = T.match_buffer(d, [n], strides=[stride_3], elem_offset=0, align=128, offset_factor=1, type="auto")
    # body
    for i in T.serial(0, n):
        d_1.data[i * stride_3] = (
            T.load("float32", A_1.data, i * stride)
            * T.load("float32", B_1.data, i * stride_1)
        ) + T.load("float32", C_1.data, i * stride_2)
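# Hedged sketch: fixtures like test_tir_fma typically come from TVMScript roundtrip
# tests. A minimal check, assuming PrimFunc.script() and tvm.script.from_source are
# available in the TVM build at hand:
import tvm

def check_roundtrip(func: tvm.tir.PrimFunc) -> None:
    # Print the function back to TVMScript text and reparse it.
    reparsed = tvm.script.from_source(func.script())
    tvm.ir.assert_structural_equal(func, reparsed, map_free_vars=True)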
def flattened_strided_buffer_func(a: T.handle, c: T.handle) -> None:
    A = T.match_buffer(a, (16, 16), "float32")
    C = T.match_buffer(c, (16, 16), "float32")
    for i0 in T.serial(0, 4):
        B_new = T.allocate([68], "float32", "global")
        for i1 in T.serial(0, 4):
            for j in T.serial(0, 16):
                B_new[i1 * 17 + j] = T.load("float32", A.data, i0 * 64 + i1 * 16 + j) + 1.0
        for i1 in T.serial(0, 4):
            for j in T.serial(0, 16):
                C.data[i0 * 64 + i1 * 16 + j] = T.load("float32", B_new, i1 * 17 + j) * 2.0
def main(placeholder: T.handle, placeholder_1: T.handle, placeholder_2: T.handle, ethosu_conv2d: T.handle) -> None:
    # function attr dict
    T.func_attr({"global_symbol": "main", "tir.noalias": True})
    placeholder_3 = T.match_buffer(placeholder, [1, 16, 16, 32], dtype="uint8", elem_offset=0, align=128, offset_factor=1)
    ethosu_conv2d_1 = T.match_buffer(ethosu_conv2d, [1, 16, 16, 8], dtype="uint8", elem_offset=0, align=128, offset_factor=1)
    placeholder_5 = T.match_buffer(placeholder_2, [8], dtype="int32", elem_offset=0, align=128, offset_factor=1)
    placeholder_4 = T.match_buffer(placeholder_1, [8, 1, 1, 32], dtype="uint8", elem_offset=0, align=128, offset_factor=1)
    # body
    placeholder_global = T.allocate([256], "uint8", "global")
    placeholder_d_global = T.allocate([8], "int32", "global")
    T.evaluate(T.call_extern("ethosu_copy", T.load("uint8", placeholder_4.data, 0), 256, T.load("uint8", placeholder_global, 0), dtype="handle"))
    T.evaluate(T.call_extern("ethosu_copy", T.load("int32", placeholder_5.data, 0), 8, T.load("int32", placeholder_d_global, 0), dtype="handle"))
    T.evaluate(T.call_extern("ethosu_conv2d", "uint8", 16, 16, 32, 16, 0, 16, T.load("uint8", placeholder_3.data, 0), 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "uint8", 16, 16, 8, 16, 0, 16, T.load("uint8", ethosu_conv2d_1.data, 0), 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, T.load("uint8", placeholder_global, 0), 0, 12, T.load("uint8", placeholder_d_global, 0), 0, 0, 0, 0, 0, "CLIP", 0, 255, "NONE", dtype="handle"))
def partitioned_concat(a: T.handle, b: T.handle, c: T.handle) -> None:
    T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
    A = T.match_buffer(a, [16], dtype="float32")
    B = T.match_buffer(b, [16], dtype="float32")
    C = T.match_buffer(c, [32], dtype="float32")
    for i in T.serial(0, 16):
        T.store(C.data, i, T.load("float32", A.data, i), True)
    for i in T.serial(0, 16):
        T.store(C.data, i + 16, T.load("float32", B.data, i), True)
def main(placeholder: T.handle, placeholder_1: T.handle, placeholder_2: T.handle, placeholder_3: T.handle, placeholder_4: T.handle, ethosu_write: T.handle) -> None:
    # function attr dict
    T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
    buffer = T.match_buffer(placeholder_3, [160], dtype="uint8", elem_offset=0, align=128, offset_factor=1)
    ethosu_write_1 = T.match_buffer(ethosu_write, [1, 16, 16, 8], dtype="int8", elem_offset=0, align=128, offset_factor=1)
    placeholder_5 = T.match_buffer(placeholder, [1, 16, 16, 32], dtype="int8", elem_offset=0, align=128, offset_factor=1)
    buffer_1 = T.match_buffer(placeholder_1, [592], dtype="uint8", elem_offset=0, align=128, offset_factor=1)
    buffer_2 = T.match_buffer(placeholder_2, [160], dtype="uint8", elem_offset=0, align=128, offset_factor=1)
    buffer_3 = T.match_buffer(placeholder_4, [80], dtype="uint8", elem_offset=0, align=128, offset_factor=1)
    # body
    ethosu_write_2 = T.allocate([4096], "int8", "global")
    T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, T.load("int8", placeholder_5.data, 0), 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 16, 16, 0, 16, T.load("int8", ethosu_write_2, 0), 0, 0, 0, T.float32(0.25), 14, "NHWC", 256, 16, 1, 1, 1, 1, 1, 1, 1, T.load("uint8", buffer_1.data, 0), 592, 12, T.load("uint8", buffer_2.data, 0), 160, 0, 0, 0, 0, "NONE", 0, 0, "NONE", dtype="handle"))
    T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 16, 16, 0, 16, T.load("int8", ethosu_write_2, 0), 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 16, 1, "int8", 16, 16, 8, 16, 0, 16, T.load("int8", ethosu_write_1.data, 0), 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 8, 1, 1, 1, 1, 1, 1, 1, T.load("uint8", buffer.data, 0), 160, 12, T.load("uint8", buffer_3.data, 0), 80, 0, 0, 0, 0, "NONE", 0, 0, "NONE", dtype="handle"))
def main(a: T.handle, b: T.handle, c: T.handle) -> None:
    # function attr dict
    T.func_attr({"global_symbol": "main", "tir.noalias": True})
    A = T.match_buffer(a, [128, 128])
    B = T.match_buffer(b, [128, 128])
    C = T.match_buffer(c, [128, 128])
    # body
    for x, y in T.grid(128, 128):
        C.data[x * 128 + y] = 0.0
        for k in T.serial(0, 128):
            C.data[x * 128 + y] = T.load("float32", C.data, x * 128 + y) + T.load(
                "float32", A.data, x * 128 + k
            ) * T.load("float32", B.data, y * 128 + k)
def main(placeholder: ty.handle, placeholder_1: ty.handle, placeholder_2: ty.handle, placeholder_3: ty.handle, placeholder_4: ty.handle, ethosu_write: ty.handle) -> None:
    # function attr dict
    tir.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
    buffer = tir.match_buffer(placeholder_4, [80], dtype="uint8", elem_offset=0, align=128, offset_factor=1)
    buffer_1 = tir.match_buffer(placeholder_2, [320], dtype="uint8", elem_offset=0, align=128, offset_factor=1)
    buffer_2 = tir.match_buffer(placeholder_1, [1312], dtype="uint8", elem_offset=0, align=128, offset_factor=1)
    buffer_3 = tir.match_buffer(placeholder_3, [2608], dtype="uint8", elem_offset=0, align=128, offset_factor=1)
    placeholder_5 = tir.match_buffer(placeholder, [1, 8, 8, 3], dtype="int8", elem_offset=0, align=128, offset_factor=1)
    ethosu_write_1 = tir.match_buffer(ethosu_write, [1, 8, 8, 8], dtype="int8", elem_offset=0, align=128, offset_factor=1)
    # body
    ethosu_write_2 = tir.allocate([1536], "int8", "global")
    tir.evaluate(tir.call_extern("ethosu_conv2d", "int8", 6, 8, 3, 6, 0, 8, tir.load("int8", placeholder_5.data, 0), 0, 0, 0, tir.float32(0.5), 10, "NHWC", 24, 3, 1, "int8", 5, 8, 32, 5, 0, 8, tir.load("int8", ethosu_write_2, 256), 0, 0, 0, tir.float32(0.25), 14, "NHWC", 256, 32, 1, 3, 3, 1, 1, 1, 1, tir.load("uint8", buffer_2.data, 0), 1312, 12, tir.load("uint8", buffer_1.data, 0), 320, 1, 1, 0, 1, "NONE", 0, 0, "NONE", dtype="handle"))
    tir.evaluate(tir.call_extern("ethosu_conv2d", "int8", 5, 8, 32, 5, 0, 8, tir.load("int8", ethosu_write_2, 256), 0, 0, 0, tir.float32(0.5), 10, "NHWC", 256, 32, 1, "int8", 4, 8, 8, 4, 0, 8, tir.load("int8", ethosu_write_1.data, 0), 0, 0, 0, tir.float32(0.25), 14, "NHWC", 64, 8, 1, 3, 3, 1, 1, 1, 1, tir.load("uint8", buffer_3.data, 0), 2608, 12, tir.load("uint8", buffer.data, 0), 80, 1, 1, 0, 1, "NONE", 0, 0, "NONE", dtype="handle"))
    tir.evaluate(tir.call_extern("ethosu_conv2d", "int8", 6, 8, 3, 6, 0, 8, tir.load("int8", placeholder_5.data, 48), 0, 0, 0, tir.float32(0.5), 10, "NHWC", 24, 3, 1, "int8", 5, 8, 32, 5, 0, 8, tir.load("int8", ethosu_write_2, 0), 0, 0, 0, tir.float32(0.25), 14, "NHWC", 256, 32, 1, 3, 3, 1, 1, 1, 1, tir.load("uint8", buffer_2.data, 0), 1312, 12, tir.load("uint8", buffer_1.data, 0), 320, 0, 1, 1, 1, "NONE", 0, 0, "NONE", dtype="handle"))
    tir.evaluate(tir.call_extern("ethosu_conv2d", "int8", 5, 8, 32, 5, 0, 8, tir.load("int8", ethosu_write_2, 0), 0, 0, 0, tir.float32(0.5), 10, "NHWC", 256, 32, 1, "int8", 4, 8, 8, 4, 0, 8, tir.load("int8", ethosu_write_1.data, 256), 0, 0, 0, tir.float32(0.25), 14, "NHWC", 64, 8, 1, 3, 3, 1, 1, 1, 1, tir.load("uint8", buffer_3.data, 0), 2608, 12, tir.load("uint8", buffer.data, 0), 80, 0, 1, 1, 1, "NONE", 0, 0, "NONE", dtype="handle"))
def main(placeholder: T.handle, placeholder_1: T.handle, placeholder_2: T.handle, placeholder_3: T.handle, placeholder_4: T.handle, ethosu_write: T.handle) -> None:
    # function attr dict
    T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
    buffer = T.match_buffer(placeholder_3, [304], dtype="uint8", elem_offset=0, align=128, offset_factor=1)
    placeholder_5 = T.match_buffer(placeholder, [1, 8, 8, 3], dtype="int8", elem_offset=0, align=128, offset_factor=1)
    buffer_1 = T.match_buffer(placeholder_4, [80], dtype="uint8", elem_offset=0, align=128, offset_factor=1)
    buffer_2 = T.match_buffer(placeholder_2, [320], dtype="uint8", elem_offset=0, align=128, offset_factor=1)
    ethosu_write_1 = T.match_buffer(ethosu_write, [1, 8, 8, 8], dtype="int8", elem_offset=0, align=128, offset_factor=1)
    buffer_3 = T.match_buffer(placeholder_1, [160], dtype="uint8", elem_offset=0, align=128, offset_factor=1)
    # body
    ethosu_write_2 = T.allocate([1024], "int8", "global", annotations={"disable_lower_builtin": True})
    T.evaluate(T.call_extern("ethosu_conv2d", "int8", 8, 4, 3, 8, 0, 4, T.load("int8", placeholder_5.data, 0), 0, 0, 0, T.float32(0.5), 10, "NHWC", 24, 3, 1, "int8", 8, 4, 32, 8, 0, 4, T.load("int8", ethosu_write_2, 0), 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 32, 1, 1, 1, 1, 1, 1, 1, T.load("uint8", buffer_3.data, 0), 160, 12, T.load("uint8", buffer_2.data, 0), 320, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", dtype="handle"))
    T.evaluate(T.call_extern("ethosu_conv2d", "int8", 8, 4, 32, 8, 0, 4, T.load("int8", ethosu_write_2, 0), 0, 0, 0, T.float32(0.5), 10, "NHWC", 128, 32, 1, "int8", 8, 4, 8, 8, 0, 4, T.load("int8", ethosu_write_1.data, 0), 0, 0, 0, T.float32(0.25), 14, "NHWC", 64, 8, 1, 1, 1, 1, 1, 1, 1, T.load("uint8", buffer.data, 0), 304, 12, T.load("uint8", buffer_1.data, 0), 80, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", dtype="handle"))
    T.evaluate(T.call_extern("ethosu_conv2d", "int8", 8, 4, 3, 8, 0, 4, T.load("int8", placeholder_5.data, 12), 0, 0, 0, T.float32(0.5), 10, "NHWC", 24, 3, 1, "int8", 8, 4, 32, 8, 0, 4, T.load("int8", ethosu_write_2, 0), 0, 0, 0, T.float32(0.25), 14, "NHWC", 128, 32, 1, 1, 1, 1, 1, 1, 1, T.load("uint8", buffer_3.data, 0), 160, 12, T.load("uint8", buffer_2.data, 0), 320, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", dtype="handle"))
    T.evaluate(T.call_extern("ethosu_conv2d", "int8", 8, 4, 32, 8, 0, 4, T.load("int8", ethosu_write_2, 0), 0, 0, 0, T.float32(0.5), 10, "NHWC", 128, 32, 1, "int8", 8, 4, 8, 8, 0, 4, T.load("int8", ethosu_write_1.data, 32), 0, 0, 0, T.float32(0.25), 14, "NHWC", 64, 8, 1, 1, 1, 1, 1, 1, 1, T.load("uint8", buffer.data, 0), 304, 12, T.load("uint8", buffer_1.data, 0), 80, 0, 0, 0, 0, "NONE", 0, 0, "TFL", "NONE", dtype="handle"))
def main(placeholder: ty.handle, placeholder_1: ty.handle, placeholder_2: ty.handle, placeholder_3: ty.handle, placeholder_4: ty.handle, ethosu_write: ty.handle) -> None:
    # function attr dict
    tir.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
    buffer = tir.match_buffer(placeholder_1, [1456], dtype="uint8", elem_offset=0, align=128, offset_factor=1)
    buffer_1 = tir.match_buffer(placeholder_2, [352], dtype="uint8", elem_offset=0, align=128, offset_factor=1)
    placeholder_5 = tir.match_buffer(placeholder, [1, 8, 1, 8, 16], dtype="int8", elem_offset=0, align=128, offset_factor=1)
    ethosu_write_1 = tir.match_buffer(ethosu_write, [1, 8, 2, 8, 16], dtype="int8", elem_offset=0, align=128, offset_factor=1)
    buffer_2 = tir.match_buffer(placeholder_4, [272], dtype="uint8", elem_offset=0, align=128, offset_factor=1)
    buffer_3 = tir.match_buffer(placeholder_3, [11040], dtype="uint8", elem_offset=0, align=128, offset_factor=1)
    # body
    ethosu_write_2 = tir.allocate([2304], "int8", "global")
    tir.evaluate(tir.call_extern("ethosu_conv2d", "int8", 6, 8, 3, 6, 0, 8, tir.load("int8", placeholder_5.data, 0), 0, 0, 0, tir.float32(0.5), 10, "NHCWB16", 128, 16, 1, "int8", 5, 8, 35, 5, 0, 8, tir.load("int8", ethosu_write_2, 384), 0, 0, 0, tir.float32(0.25), 14, "NHCWB16", 384, 16, 128, 3, 3, 1, 1, 1, 1, tir.load("uint8", buffer.data, 0), 1456, 12, tir.load("uint8", buffer_1.data, 0), 352, 1, 1, 0, 1, "NONE", 0, 0, "NONE", dtype="handle"))
    tir.evaluate(tir.call_extern("ethosu_conv2d", "int8", 5, 8, 35, 5, 0, 8, tir.load("int8", ethosu_write_2, 384), 0, 0, 0, tir.float32(0.5), 10, "NHCWB16", 384, 16, 128, "int8", 4, 8, 26, 4, 0, 8, tir.load("int8", ethosu_write_1.data, 0), 0, 0, 0, tir.float32(0.25), 14, "NHCWB16", 256, 16, 128, 3, 3, 1, 1, 1, 1, tir.load("uint8", buffer_3.data, 0), 11040, 12, tir.load("uint8", buffer_2.data, 0), 272, 1, 1, 0, 1, "NONE", 0, 0, "NONE", dtype="handle"))
    tir.evaluate(tir.call_extern("ethosu_conv2d", "int8", 6, 8, 3, 6, 0, 8, tir.load("int8", placeholder_5.data, 256), 0, 0, 0, tir.float32(0.5), 10, "NHCWB16", 128, 16, 1, "int8", 5, 8, 35, 5, 0, 8, tir.load("int8", ethosu_write_2, 0), 0, 0, 0, tir.float32(0.25), 14, "NHCWB16", 384, 16, 128, 3, 3, 1, 1, 1, 1, tir.load("uint8", buffer.data, 0), 1456, 12, tir.load("uint8", buffer_1.data, 0), 352, 0, 1, 1, 1, "NONE", 0, 0, "NONE", dtype="handle"))
    tir.evaluate(tir.call_extern("ethosu_conv2d", "int8", 5, 8, 35, 5, 0, 8, tir.load("int8", ethosu_write_2, 0), 0, 0, 0, tir.float32(0.5), 10, "NHCWB16", 384, 16, 128, "int8", 4, 8, 26, 4, 0, 8, tir.load("int8", ethosu_write_1.data, 1024), 0, 0, 0, tir.float32(0.25), 14, "NHCWB16", 256, 16, 128, 3, 3, 1, 1, 1, 1, tir.load("uint8", buffer_3.data, 0), 11040, 12, tir.load("uint8", buffer_2.data, 0), 272, 0, 1, 1, 1, "NONE", 0, 0, "NONE", dtype="handle"))
def main(placeholder: T.handle, placeholder_1: T.handle, placeholder_2: T.handle, placeholder_3: T.handle, placeholder_4: T.handle, ethosu_conv2d: T.handle) -> None:
    # function attr dict
    T.func_attr({"global_symbol": "main", "tir.noalias": True})
    placeholder_9 = T.match_buffer(placeholder_3, [1, 1, 32, 8], dtype="uint8", elem_offset=0, align=128, offset_factor=1)
    ethosu_conv2d_1 = T.match_buffer(ethosu_conv2d, [1, 8, 8, 8], dtype="uint8", elem_offset=0, align=128, offset_factor=1)
    placeholder_7 = T.match_buffer(placeholder_1, [1, 1, 3, 32], dtype="uint8", elem_offset=0, align=128, offset_factor=1)
    placeholder_6 = T.match_buffer(placeholder, [1, 8, 8, 3], dtype="uint8", elem_offset=0, align=128, offset_factor=1)
    placeholder_8 = T.match_buffer(placeholder_2, [32], dtype="int32", elem_offset=0, align=128, offset_factor=1)
    placeholder_5 = T.match_buffer(placeholder_4, [8], dtype="int32", elem_offset=0, align=128, offset_factor=1)
    # body
    ethosu_conv2d_2 = T.allocate([1024], "uint8", "global")
    ethosu_conv2d_3 = T.allocate([2048], "uint8", "global")
    T.evaluate(T.call_extern("ethosu_conv2d", "uint8", 4, 8, 3, 4, 0, 8, T.load("uint8", placeholder_6.data, 0), 0, 0, 0, T.float32(0.5), 10, "NHWC", 24, 3, 1, "uint8", 4, 8, 32, 4, 0, 8, T.load("uint8", ethosu_conv2d_2, 0), 0, 0, 0, T.float32(0.25), 14, "NHWC", 256, 32, 1, 1, 1, 1, 1, 1, 1, T.load("uint8", placeholder_7.data, 0), 0, 12, T.load("uint8", placeholder_8.data, 0), 0, 0, 0, 0, 0, "NONE", 0, 0, "NONE", dtype="uint8"))
    T.evaluate(T.call_extern("ethosu_conv2d", "uint8", 4, 8, 32, 4, 0, 8, T.load("uint8", ethosu_conv2d_2, 0), 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 32, 1, "uint8", 4, 8, 8, 4, 0, 8, T.load("uint8", ethosu_conv2d_1.data, 0), 0, 0, 0, T.float32(0.25), 14, "NHWC", 64, 8, 1, 1, 1, 1, 1, 1, 1, T.load("uint8", placeholder_9.data, 0), 0, 12, T.load("uint8", placeholder_5.data, 0), 0, 0, 0, 0, 0, "CLIP", 0, 255, "NONE", dtype="uint8"))
    T.evaluate(T.call_extern("ethosu_conv2d", "uint8", 4, 8, 3, 4, 0, 8, T.load("uint8", placeholder_6.data, 96), 0, 0, 0, T.float32(0.5), 10, "NHWC", 24, 3, 1, "uint8", 4, 8, 32, 4, 0, 8, T.load("uint8", ethosu_conv2d_2, 0), 0, 0, 0, T.float32(0.25), 14, "NHWC", 256, 32, 1, 1, 1, 1, 1, 1, 1, T.load("uint8", placeholder_7.data, 0), 0, 12, T.load("uint8", placeholder_8.data, 0), 0, 0, 0, 0, 0, "CLIP", 0, 255, "NONE", dtype="uint8"))
    T.evaluate(T.call_extern("ethosu_conv2d", "uint8", 4, 8, 32, 4, 0, 8, T.load("uint8", ethosu_conv2d_2, 0), 0, 0, 0, T.float32(0.5), 10, "NHWC", 256, 32, 1, "uint8", 4, 8, 8, 4, 0, 8, T.load("uint8", ethosu_conv2d_1.data, 256), 0, 0, 0, T.float32(0.25), 14, "NHWC", 64, 8, 1, 1, 1, 1, 1, 1, 1, T.load("uint8", placeholder_9.data, 0), 0, 12, T.load("uint8", placeholder_5.data, 0), 0, 0, 0, 0, 0, "CLIP", 0, 255, "NONE", dtype="uint8"))
def flattened_predicate_func(a: T.handle, c: T.handle) -> None:
    A = T.match_buffer(a, (32,), "float32")
    C = T.match_buffer(c, (32,), "float32")
    for i, j in T.grid(5, 7):
        if i * 7 + j < 32:
            C.data[i * 7 + j] = T.load("float32", A.data, i * 7 + j) + 1.0
def element_wise_two_thread_x_in_same_kernel_not_equal(a: T.handle, b: T.handle, c: T.handle) -> None:
    i = T.env_thread("blockIdx.x")
    j0 = T.env_thread("threadIdx.x")
    j1 = T.env_thread("threadIdx.x")
    A = T.match_buffer(a, [128, 128])
    B = T.match_buffer(b, [128, 128])
    C = T.match_buffer(c, [128, 64])
    T.launch_thread(i, 128)
    with T.launch_thread(j0, 128):
        T.store(B.data, i * 128 + j0, T.load("float32", A.data, i * 128 + j0) * 2.0, True)
    T.launch_thread(j1, 64)
    T.store(C.data, i * 64 + j1, T.load("float32", A.data, i * 128 + j1) + 1.0, True)
def buffer_opaque_access(b: T.handle, c: T.handle) -> None:
    B = T.match_buffer(b, [16, 16], "float32")
    C = T.match_buffer(c, [16, 16], "float32")
    with T.block([]):
        T.reads([])
        T.writes(B[0:16, 0:16])
        A = T.allocate([256], "float32", "global")
        for i, j in T.grid(16, 16):
            T.store(A, i * 16 + j, 1)
        for i in range(0, 16):
            for j in range(0, 16):
                T.evaluate(T.load("float32", A, i * 16 + j))
            for j in range(0, 16):
                T.evaluate(T.tvm_fill_fragment(B.data, 16, 16, 16, 0, T.float32(0), dtype="handle"))
    for i, j in T.grid(16, 16):
        with T.block([16, 16]) as [vi, vj]:
            T.bind(vi, i)
            T.bind(vj, j)
            C[vi, vj] = B[vi, vj]
def flattened_gpu_func(a: T.handle, c: T.handle) -> None:
    A = T.match_buffer(a, (16, 16), "float32")
    C = T.match_buffer(c, (16, 16), "float32")
    i0 = T.env_thread("blockIdx.x")
    i1 = T.env_thread("threadIdx.x")
    i2 = T.env_thread("vthread")
    T.launch_thread(i0, 4)
    T.launch_thread(i1, 2)
    T.launch_thread(i2, 2)
    B = T.allocate([16], "float32", "local")
    for j in range(0, 16):
        B[j] = T.load("float32", A.data, i0 * 64 + i1 * 32 + i2 * 16 + j) + 1.0
    for j in range(0, 16):
        C.data[i0 * 64 + i1 * 32 + i2 * 16 + j] = T.load("float32", B, j) * 2.0
def fail_match_load(a: T.handle) -> None:
    A = T.match_buffer(a, (8, 8))
    for i, j in T.grid(8, 8):
        with T.block([]):
            T.reads(A[i, j])
            T.writes([])
            sub_A = T.match_buffer(A[i, j], ())
            T.evaluate(T.load("float32", sub_A.data, 0))
def threadpool_nested_parallel_loop(
    A: T.Buffer[(4, 4), "float32"], B: T.Buffer[(4, 4), "float32"]
) -> None:
    T.func_attr({"global_symbol": "main", "tir.noalias": True})
    for i in T.parallel(4):
        for j in T.parallel(4):
            T.store(B.data, i * 4 + j, T.load("float32", A.data, i * 4 + j) * 2.0)
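# Hedged sketch: nested T.parallel loops like the ones above are exercised by
# lowering for a CPU target. The wrapper below is an assumption, not the original
# test driver.
import tvm

def build_threadpool_func(mod: tvm.IRModule):
    # Build for the LLVM CPU target, where T.parallel maps onto the runtime threadpool.
    return tvm.build(mod, target="llvm")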
def element_wise_kernels_with_different_size(a: T.handle, b: T.handle, c: T.handle, d: T.handle) -> None:
    i0 = T.env_thread("blockIdx.x")
    j0 = T.env_thread("threadIdx.x")
    i1 = T.env_thread("blockIdx.x")
    j1 = T.env_thread("threadIdx.x")
    A = T.match_buffer(a, [128, 128])
    B = T.match_buffer(b, [128, 128])
    C = T.match_buffer(c, [256, 256])
    D = T.match_buffer(d, [256, 256])
    with T.launch_thread(i0, 128):
        T.launch_thread(j0, 128)
        T.store(B.data, i0 * 128 + j0, T.load("float32", A.data, i0 * 128 + j0) * 2.0, True)
    T.launch_thread(i1, 256)
    T.launch_thread(j1, 256)
    T.store(D.data, i1 * 256 + j1, T.load("float32", C.data, i1 * 256 + j1) + 1.0, True)
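# Hedged sketch: element_wise_kernels_with_different_size above and the earlier
# unified_element_wise_kernels_with_different_size are the kind of before/after pair
# that thread-binding unification produces. Assuming the public pass API:
import tvm

def unify_thread_binding(mod: tvm.IRModule) -> tvm.IRModule:
    # Merge duplicate env_thread vars (e.g., i0/i1 -> one blockIdx.x) per kernel.
    return tvm.tir.transform.UnifyThreadBinding()(mod)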