# Copy Buffers
# ~~~~~~~~~~~~
# One specificity of hardware accelerators is that on-chip memory has to be
# explicitly managed.
# This means that we'll need to describe intermediate tensors :code:`A_buf`
# and :code:`B_buf` that can have a different memory scope than the original
# placeholder tensors :code:`A` and :code:`B`.
#
# Later in the scheduling phase, we can tell the compiler that :code:`A_buf`
# and :code:`B_buf` will live in the VTA's on-chip buffers (SRAM), while
# :code:`A` and :code:`B` will live in main memory (DRAM).
# We describe :code:`A_buf` and :code:`B_buf` as the result of a compute
# operation that is the identity function.
# This can later be interpreted by the compiler as a cached read operation.

# A copy buffer
A_buf = te.compute((o, m, env.BATCH, env.BLOCK_OUT), lambda *i: A(*i), "A_buf")
# B copy buffer
B_buf = te.compute((o, m, env.BATCH, env.BLOCK_OUT), lambda *i: B(*i), "B_buf")
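
######################################################################
# As a hedged sketch (not executed here, and not necessarily the exact
# schedule used later in this tutorial), the scheduling phase would pin the
# copy stages to one of VTA's on-chip SRAM scopes and lower them to DMA
# loads, using the scope and pragma names from the VTA environment that the
# other examples in this document also use, e.g.::
#
#     s[A_buf].set_scope(env.acc_scope)                    # A_buf lives in SRAM
#     s[A_buf].pragma(s[A_buf].op.axis[0], env.dma_copy)   # DRAM -> SRAM load
#     s[B_buf].set_scope(env.acc_scope)
#     s[B_buf].pragma(s[B_buf].op.axis[0], env.dma_copy)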

######################################################################
# Vector Addition
# ~~~~~~~~~~~~~~~
# Now we're ready to describe the vector addition result tensor :code:`C`
# with another compute operation.
# The compute function takes the shape of the tensor, as well as a lambda
# function that describes the computation rule for each position of the tensor.
#
# No computation happens during this phase, as we are only declaring how
# the computation should be done.

# Describe the in-VTA vector addition
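# A minimal sketch, assuming the same (o, m, env.BATCH, env.BLOCK_OUT) shape as
# the copy buffers above and the accumulator dtype name (:code:`env.acc_dtype`)
# used by the other VTA examples in this document: the sum is accumulated in a
# wide intermediate buffer and then cast back to the narrow input dtype for the
# result tensor :code:`C`.
C_buf = te.compute(
    (o, m, env.BATCH, env.BLOCK_OUT),
    lambda *i: A_buf(*i).astype(env.acc_dtype) + B_buf(*i).astype(env.acc_dtype),
    name="C_buf",
)
C = te.compute(
    (o, m, env.BATCH, env.BLOCK_OUT),
    lambda *i: C_buf(*i).astype(env.inp_dtype),
    name="C",
)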
def test_buffer_broadcast_expr():
    n0, m0, x = te.size_var('n0'), te.size_var('m0'), te.size_var('x')
    n1, m1 = te.size_var('n1'), te.size_var('m1')
    o0, o1 = te.size_var('o0'), te.size_var('o1')

    A = te.placeholder((m0, n0), name='A')
    B = te.placeholder((m1, n1), name='B')
    C = te.compute((o0, o1 // x), lambda i, j: A[i, j] + B[i, j], name='C')

    Ab = tvm.tir.decl_buffer(A.shape,
                             A.dtype,
                             name="Ab",
                             buffer_type="auto_broadcast")
    Bb = tvm.tir.decl_buffer(B.shape,
                             B.dtype,
                             name="Bb",
                             buffer_type="auto_broadcast")
    Cc = tvm.tir.decl_buffer(C.shape,
                             C.dtype,
                             name="Cc",
                             buffer_type="auto_broadcast")
    s = te.create_schedule(C.op)

    def check_stride():
        if not tvm.runtime.enabled("llvm"):
            return
        fadd = tvm.build(s, [A, B, C, o1, x],
                         target='llvm',
                         name='bcast_add',
                         binds={
                             A: Ab,
                             B: Bb,
                             C: Cc
                         })
        ctx = tvm.cpu(0)
        a = tvm.nd.array(np.random.uniform(size=(2, 4)).astype(A.dtype), ctx)
        b = tvm.nd.array(np.random.uniform(size=(2, 4)).astype(B.dtype), ctx)
        c = tvm.nd.array(np.zeros((2, 4), dtype=C.dtype), ctx)
        fadd(a, b, c, 4, 1)
        tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy())

    def check_no_stride():
        if not tvm.runtime.enabled("llvm"):
            return
        fadd = tvm.build(s, [A, B, C, o1, x],
                         target='llvm',
                         name='bcast_add',
                         binds={
                             A: Ab,
                             B: Bb,
                             C: Cc
                         })
        ctx = tvm.cpu(0)
        a = tvm.nd.array(np.random.uniform(size=(1, 4)).astype(A.dtype), ctx)
        b = tvm.nd.array(np.random.uniform(size=(2, 4)).astype(B.dtype), ctx)
        c = tvm.nd.array(np.zeros((2, 4), dtype=C.dtype), ctx)
        fadd(a, b, c, 4, 1)
        tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy())

    def check_auto_bind():
        if not tvm.runtime.enabled("llvm"):
            return
        # Let build bind buffers
        fadd = tvm.build(s, [A, B, C, o1, x], target='llvm', name='bcast_add')
        ctx = tvm.cpu(0)
        a = tvm.nd.array(np.random.uniform(size=(1, 4)).astype(A.dtype), ctx)
        b = tvm.nd.array(np.random.uniform(size=(2, 4)).astype(B.dtype), ctx)
        c = tvm.nd.array(np.zeros((2, 4), dtype=C.dtype), ctx)
        fadd(a, b, c, 4, 1)
        tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy())

    check_stride()
    check_no_stride()
    check_auto_bind()
######################################################################
# Example #3
def gemm_int8(n, m, l):
    A = te.placeholder((n, l), name="A", dtype="int8")
    B = te.placeholder((m, l), name="B", dtype="int8")

    k = te.reduce_axis((0, l), name="k")
    C = te.compute(
        (n, m),
        lambda i, j: te.sum(A[i, k].astype("int32") * B[j, k].astype("int32"),
                            axis=k),
        name="C",
    )

    cfg = autotvm.get_config()
    s = te.create_schedule(C.op)
    y, x = C.op.axis

    AA = s.cache_read(A, "shared", [C])
    BB = s.cache_read(B, "shared", [C])
    AL = s.cache_read(AA, "local", [C])
    BL = s.cache_read(BB, "local", [C])
    CC = s.cache_write(C, "local")

    k = CC.op.reduce_axis[0]

    cfg.define_split(
        "tile_k",
        cfg.axis(k),
        num_outputs=3,
        filter=lambda entity: entity.size[2] == 4 and entity.size[0] * 2 >=
        entity.size[1],
    )

    ko, kt, ki = cfg["tile_k"].apply(s, CC, k)

    s[CC].tensorize(ki, intrin_dp4a)

    block_x = te.thread_axis("blockIdx.x")
    block_y = te.thread_axis("blockIdx.y")
    thread_x = te.thread_axis("threadIdx.x")
    thread_y = te.thread_axis("threadIdx.y")

    def block_size_filter(entity):
        return (entity.size[0] * 2 >= entity.size[1] * 2
                and entity.size[1] <= 16 and entity.size[3] <= 4)

    cfg.define_split("tile_y",
                     cfg.axis(y),
                     num_outputs=4,
                     filter=block_size_filter)
    cfg.define_split("tile_x",
                     cfg.axis(x),
                     num_outputs=4,
                     filter=block_size_filter)
    by, tyz, ty, yi = cfg["tile_y"].apply(s, C, y)
    bx, txz, tx, xi = cfg["tile_x"].apply(s, C, x)

    s[C].bind(by, block_y)
    s[C].bind(bx, block_x)
    s[C].bind(tyz, te.thread_axis("vthread"))
    s[C].bind(txz, te.thread_axis("vthread"))
    s[C].bind(ty, thread_y)
    s[C].bind(tx, thread_x)
    s[C].reorder(by, bx, tyz, txz, ty, tx, yi, xi)

    s[CC].compute_at(s[C], tx)

    yo, xo = CC.op.axis
    s[CC].reorder(ko, kt, yo, xo, ki)
    s[CC].unroll(kt)

    for stage in [AL, BL]:
        s[stage].compute_at(s[CC], kt)
        _, xi = s[stage].split(stage.op.axis[1], factor=4)
        s[stage].vectorize(xi)
        s[stage].double_buffer()

    cfg.define_knob("storage_align", [16, 48])
    for stage in [AA, BB]:
        s[stage].storage_align(s[stage].op.axis[0], cfg["storage_align"].val,
                               0)
        s[stage].compute_at(s[CC], ko)

        fused = s[stage].fuse(*s[stage].op.axis)
        ty, tx = s[stage].split(fused, nparts=cfg["tile_y"].size[2])
        tx, xi = s[stage].split(tx, nparts=cfg["tile_x"].size[2])
        _, xi = s[stage].split(xi, factor=16)

        s[stage].bind(ty, thread_y)
        s[stage].bind(tx, thread_x)
        s[stage].vectorize(xi)

    cfg.define_knob("auto_unroll_max_step", [512, 1500])
    s[C].pragma(by, "auto_unroll_max_step", cfg["auto_unroll_max_step"].val)
    s[C].pragma(by, "unroll_explicit", False)

    cfg.add_flop(n * m * l * 2)
    return s, [A, B, C]
def schedule_hwnc_tensorcore_cuda(cfg, s, Conv):
    """Schedule tensorcore template"""
    packed_data, packed_kernel = s[Conv].op.input_tensors
    ic, kh, kw, ii = s[Conv].op.reduce_axis
    pad_data = s[packed_data].op.input_tensors[0]

    block_x = te.thread_axis("blockIdx.x")
    block_y = te.thread_axis("blockIdx.y")
    block_z = te.thread_axis("blockIdx.z")
    thread_x = te.thread_axis("threadIdx.x")
    thread_y = te.thread_axis("threadIdx.y")
    thread_z = te.thread_axis("threadIdx.z")

    # Designate the memory hierarchy
    AS = s.cache_read(packed_data, "shared", [Conv])
    WS = s.cache_read(packed_kernel, "shared", [Conv])
    AF = s.cache_read(AS, "wmma.matrix_a", [Conv])
    WF = s.cache_read(WS, "wmma.matrix_b", [Conv])
    ConvF = s.cache_write(Conv, "wmma.accumulator")

    if Conv.op in s.outputs:
        output = Conv
        ConvS = s.cache_read(ConvF, "shared", [Conv])
        OL = ConvS
    else:
        output = s.outputs[0].output(0)
        s[Conv].set_scope("shared")
        OL = Conv

    out_dtype = Conv.dtype

    if isinstance(
            packed_kernel.op,
            te.tensor.ComputeOp) and packed_kernel.name == "packed_kernel":
        if autotvm.GLOBAL_SCOPE.in_tuning:
            s[packed_kernel].pragma(s[packed_kernel].op.axis[0],
                                    "debug_skip_region")
        else:
            with Target("cuda"):
                schedule_injective_from_existing(s, packed_kernel)

    if isinstance(pad_data.op,
                  te.tensor.ComputeOp) and "pad" in pad_data.op.tag:
        s[pad_data].compute_inline()
        data = pad_data.op.input_tensors[0]

        if autotvm.GLOBAL_SCOPE.in_tuning:
            # skip this part during tuning to make records accurate
            # this part will be pre-computed during NNVM's pre-compute optimization pass
            s[pad_data].pragma(s[pad_data].op.axis[0], "debug_skip_region")
    else:
        data = pad_data
        s[data].compute_inline()

    data_dtype = data.dtype
    kernel_dtype = packed_kernel.dtype

    # Schedule for autotvm
    cfg.define_knob("block_row_warps", [1, 2, 4])
    cfg.define_knob("block_col_warps", [1, 2, 4])
    cfg.define_knob("warp_row_tiles", [1, 2, 4, 8, 16])
    cfg.define_knob("warp_col_tiles", [1, 2, 4, 8, 16])
    cfg.define_knob("chunk", [1, 2, 4, 8])
    cfg.define_knob("fuse_pack", [0, 1])
    cfg.define_knob("split_block_k_nums", [1, 2, 4, 8, 16, 32])
    cfg.define_knob("vector_ws", [1, 8])
    cfg.define_knob("vector_as", [1, 8, 16])

    block_row_warps = cfg["block_row_warps"].val
    block_col_warps = cfg["block_col_warps"].val
    warp_row_tiles = cfg["warp_row_tiles"].val
    warp_col_tiles = cfg["warp_col_tiles"].val
    chunk = cfg["chunk"].val
    vector_as = cfg["vector_as"].val
    vector_ws = cfg["vector_ws"].val
    split_block_k_nums = cfg["split_block_k_nums"].val
    fuse_pack = cfg["fuse_pack"].val

    if not fuse_pack:
        s[packed_data].compute_inline()
    else:
        with Target("cuda"):
            schedule_injective_from_existing(s, packed_data)

    if data_dtype in ["int4", "uint4"]:
        wmma_m = wmma_n = 8
        wmma_k = 32
    else:
        wmma_m = 8
        wmma_n = 32
        wmma_k = 16

    warp_size = 32

    # Schedule for output
    if len(s[output].op.axis) == 4:
        (
            hc,
            wc,
            nc,
            oc,
        ) = output.op.axis
        nc, nnc = s[output].split(nc, factor=wmma_m)
        oc, ooc = s[output].split(oc, factor=wmma_n)
    else:
        hc, wc, nc, oc, nnc, ooc = output.op.axis

    kernel_scope, hc = s[output].split(hc, nparts=1)

    block_k = s[output].fuse(hc, wc)
    block_k, split_block_k = s[output].split(block_k,
                                             factor=split_block_k_nums)
    nc, nci = s[output].split(nc, factor=warp_row_tiles)
    block_i, nc = s[output].split(nc, factor=block_row_warps)
    oc, oci = s[output].split(oc, factor=warp_col_tiles)
    block_j, oc = s[output].split(oc, factor=block_col_warps)
    s[output].reorder(block_k, split_block_k, block_i, block_j, nc, oc, nci,
                      oci, nnc, ooc)
    t = s[output].fuse(nnc, ooc)
    _, tx = s[output].split(t, factor=warp_size)
    s[output].bind(block_k, block_z)
    s[output].bind(block_i, block_x)
    s[output].bind(block_j, block_y)
    s[output].bind(tx, thread_x)
    s[output].bind(nc, thread_y)
    s[output].bind(oc, thread_z)

    # Schedule wmma store
    s[OL].compute_at(s[output], block_j)
    hc, wc, nc, oc, nnc, ooc = OL.op.axis
    oc, oci = s[OL].split(oc, factor=warp_col_tiles)
    _, oc = s[OL].split(oc, factor=block_col_warps)
    nc, nci = s[OL].split(nc, factor=warp_row_tiles)
    _, nc = s[OL].split(nc, factor=block_row_warps)
    s[OL].reorder(nc, oc, nci, oci, nnc, ooc)
    s[OL].bind(nc, thread_y)
    s[OL].bind(oc, thread_z)

    # Schedule local computation
    s[ConvF].compute_at(s[OL], oc)
    _, _, n, o, nnf, oof = ConvF.op.axis
    ko, ki = s[ConvF].split(ic, factor=chunk)
    s[ConvF].reorder(ko, kh, ki, kw, n, o, nnf, oof, ii)

    cfg.define_reorder("reorder_inner", [ko, kh], policy="all")
    cfg["reorder_inner"].apply(s, ConvF, [ko, kh])
    cfg["reorder_inner"].apply(s, ConvF, [ki, kw])

    cfg.define_knob("compute_at_AS", [0, 1, 2, 3])
    cfg.define_knob("compute_at_WS", [0, 1, 2, 3])
    compute_at_AS = cfg["compute_at_AS"].val
    compute_at_WS = cfg["compute_at_WS"].val

    # Move intermediate computation into each output compute tile
    s[AF].compute_at(s[ConvF], kw)
    s[WF].compute_at(s[ConvF], kw)

    # Schedule for A's shared memory
    if compute_at_AS == 0:
        s[AS].compute_at(s[ConvF], ki)
    elif compute_at_AS == 1:
        s[AS].compute_at(s[ConvF], kw)
    elif compute_at_AS == 2:
        s[AS].compute_at(s[ConvF], ko)
    else:
        s[AS].compute_at(s[ConvF], kh)
    _, _, n, _, nn, ii = AS.op.axis
    tx, xo = s[AS].split(n, nparts=block_row_warps)
    ty, _ = s[AS].split(xo, nparts=block_col_warps)
    t = s[AS].fuse(nn, ii)
    to, ti = s[AS].split(t, nparts=warp_size)
    ti, _t = s[AS].split(ti, factor=vector_as)
    s[AS].bind(tx, thread_y)
    s[AS].bind(ty, thread_z)
    s[AS].bind(to, thread_x)
    s[AS].vectorize(_t)

    # Schedule for W's shared memory
    if compute_at_WS == 0:
        s[WS].compute_at(s[ConvF], ki)
    elif compute_at_WS == 1:
        s[WS].compute_at(s[ConvF], kw)
    elif compute_at_WS == 2:
        s[WS].compute_at(s[ConvF], ko)
    else:
        s[WS].compute_at(s[ConvF], kh)
    s[WS].compute_at(s[ConvF], kw)
    kh, kw, ic, o, ii, oo = WS.op.axis
    tx, xo = s[WS].split(o, nparts=block_row_warps)
    ty, _ = s[WS].split(xo, nparts=block_col_warps)
    t = s[WS].fuse(ii, oo)
    to, ti = s[WS].split(t, nparts=warp_size)
    ti, _t = s[WS].split(ti, factor=vector_ws)
    s[WS].bind(tx, thread_y)
    s[WS].bind(ty, thread_z)
    s[WS].bind(to, thread_x)
    s[WS].vectorize(ti)

    # double buffer
    cfg.define_knob("AS_double_buffer", [0, 1])
    cfg.define_knob("WS_double_buffer", [0, 1])
    if cfg["AS_double_buffer"].val:
        s[AS].double_buffer()
    if cfg["WS_double_buffer"].val:
        s[WS].double_buffer()

    # unroll
    cfg.define_knob("auto_unroll_max_step", [0, 512, 1500])
    s[output].pragma(kernel_scope, "auto_unroll_max_step",
                     cfg["auto_unroll_max_step"].val)
    s[output].pragma(kernel_scope, "unroll_explicit", False)

    shape = (wmma_m, wmma_n, wmma_k)

    AS_shape = (wmma_m, wmma_k)
    AL_shape = (wmma_m, wmma_k)
    WS_shape = (wmma_n, wmma_k)
    WL_shape = (wmma_n, wmma_k)
    CL_shape = (wmma_m, wmma_n)
    CS_shape = (wmma_m, wmma_n)

    AL_gemm = te.placeholder(AL_shape, name="A", dtype=data_dtype)
    WL_gemm = te.placeholder(WL_shape, name="B", dtype=kernel_dtype)
    k_gemm = te.reduce_axis((0, wmma_k), name="k")
    CL_compute = te.compute(
        CL_shape,
        lambda ii, jj: te.sum((AL_gemm[ii, k_gemm].astype("int32") * WL_gemm[
            jj, k_gemm].astype("int32")),
                              axis=k_gemm),
        name="C",
    )

    AL_strides = [wmma_k, 1]
    AS_strides = [wmma_k, 1]
    WL_strides = [wmma_k, 1]
    WS_strides = [wmma_k, 1]
    CL_strides = [wmma_n, 1]
    CS_strides = [wmma_n, 1]

    s[AF].tensorize(
        AF.op.axis[-2],
        intrin_wmma_load_matrix_A(AL_strides, AS_strides, shape, "row_major",
                                  AS_shape, AL_shape, data_dtype),
    )

    s[WF].tensorize(
        WF.op.axis[-2],
        intrin_wmma_load_matrix_W(WL_strides, WS_strides, shape, "col_major",
                                  WS_shape, WL_shape, kernel_dtype),
    )

    s[OL].tensorize(
        nnc,
        intrin_wmma_store_matrix(CS_strides, CL_strides, shape, out_dtype,
                                 CL_shape, CS_shape))

    s[ConvF].tensorize(
        nnf,
        intrin_wmma_gemm(AL_gemm, WL_gemm, CL_compute, AL_strides, WL_strides,
                         CL_strides, shape),
    )

    return s
######################################################################
# Example #5
def _intrin_popcount(m, k_i, w_b, x_b, unipolar):
    pack_dtype = "uint8"
    w = te.placeholder((w_b, m, k_i), dtype=pack_dtype, name="w")
    x = te.placeholder(
        (
            x_b,
            k_i,
        ),
        dtype=pack_dtype,
        name="x",
    )
    k = te.reduce_axis((0, k_i), name="k")
    bw = te.reduce_axis((0, w_b), name="bw")
    bx = te.reduce_axis((0, x_b), name="bx")
    if unipolar:
        dtype = "int16"
        z = te.compute(
            (m, ),
            lambda i: te.sum(
                (tvm.tir.popcount(w[bw, i, k].astype(dtype) & x[bx, k].astype(
                    dtype)) - tvm.tir.popcount(~w[bw, i, k].astype(dtype) & x[
                        bx, k].astype(dtype))) << (bw + bx).astype(dtype),
                axis=[bw, bx, k],
            ),
            name="z",
        )
    else:
        dtype = "uint16"
        z = te.compute(
            (m, ),
            lambda i: te.sum(
                tvm.tir.popcount(w[bw, i, k].astype(dtype) & x[bx, k].astype(
                    dtype)) << (bw + bx).astype(dtype),
                axis=[bw, bx, k],
            ),
            name="z",
        )
    Wb = tvm.tir.decl_buffer(w.shape,
                             w.dtype,
                             name="W",
                             offset_factor=k_i,
                             strides=[te.var("ldw"),
                                      te.var("ldw"),
                                      1])  # stride can be inferred
    Xb = tvm.tir.decl_buffer(x.shape,
                             x.dtype,
                             name="X",
                             offset_factor=k_i,
                             strides=[te.var("ldw"), 1])
    Zb = tvm.tir.decl_buffer(z.shape,
                             z.dtype,
                             name="Z",
                             offset_factor=1,
                             strides=[1])

    def _intrin_func(ins, outs):
        ww, xx = ins
        zz = outs[0]

        args_2 = tvm.tir.const(2, "uint32")

        if unipolar:
            vpadd = "llvm.arm.neon.vpadd.v8i8"
            vpadalu = "llvm.arm.neon.vpadals.v16i8.v8i16"
            full_dtype = "int8x16"
            half_dtype = "int8x8"
            return_dtype = "int16x8"
        else:
            vpadd = "llvm.arm.neon.vpadd.v8u8"
            vpadalu = "llvm.arm.neon.vpadalu.v16u8.v8u16"
            full_dtype = "uint8x16"
            half_dtype = "uint8x8"
            return_dtype = "uint16x8"

        def _instr(index):
            irb = tvm.tir.ir_builder.create()
            if index == 1:  # reduce reset
                irb.emit(zz.vstore(0, tvm.tir.const(0, return_dtype)))
                return irb.get()
            # body and reduce update
            cnts8 = [None] * 8
            cnts4 = [None] * 4
            cnts2 = [None] * 2
            for bw in range(w_b):
                for bx in range(x_b):
                    if k_i == 16:
                        for i in range(m):
                            w_ = ww.vload([bw, i, 0],
                                          "uint8x16").astype(full_dtype)
                            x_ = xx.vload([bx, 0],
                                          "uint8x16").astype(full_dtype)
                            if unipolar:
                                cnts = tvm.tir.popcount(
                                    w_ & x_) - tvm.tir.popcount(~w_ & x_)
                            else:
                                cnts = tvm.tir.popcount(w_ & x_)
                            upper_half = tvm.tir.call_intrin(
                                half_dtype, "tir.vectorhigh", cnts)
                            lower_half = tvm.tir.call_intrin(
                                half_dtype, "tir.vectorlow", cnts)
                            cnts8[i] = upper_half + lower_half
                        for i in range(m // 2):
                            cnts4[i] = tvm.tir.call_llvm_pure_intrin(
                                half_dtype, vpadd, args_2, cnts8[i * 2],
                                cnts8[i * 2 + 1])
                        for i in range(m // 4):
                            cnts2[i] = tvm.tir.call_llvm_pure_intrin(
                                half_dtype, vpadd, args_2, cnts4[i * 2],
                                cnts4[i * 2 + 1])
                        cnts = tvm.tir.call_intrin(full_dtype,
                                                   "tir.vectorcombine",
                                                   cnts2[0], cnts2[1])
                        shifted_cnts = cnts << tvm.tir.const(
                            bw + bx, pack_dtype)
                        out = tvm.tir.call_llvm_pure_intrin(
                            return_dtype, vpadalu, args_2,
                            zz.vload(0, return_dtype), shifted_cnts)
                    else:  # ki == 8
                        for i in range(m):
                            w_ = ww.vload([bw, i, 0],
                                          "uint8x8").astype(half_dtype)
                            x_ = xx.vload([bx, 0],
                                          "uint8x8").astype(half_dtype)
                            if unipolar:
                                cnts8[i] = tvm.tir.popcount(
                                    w_ & x_) - tvm.tir.popcount(~w_ & x_)
                            else:
                                cnts8[i] = tvm.tir.popcount(w_ & x_)
                        for i in range(m // 2):
                            cnts4[i] = tvm.tir.call_llvm_pure_intrin(
                                half_dtype, vpadd, args_2, cnts8[i * 2],
                                cnts8[i * 2 + 1])
                        for i in range(m // 4):
                            cnts2[i] = tvm.tir.call_llvm_pure_intrin(
                                half_dtype, vpadd, args_2, cnts4[i * 2],
                                cnts4[i * 2 + 1])
                        cnts = tvm.tir.call_intrin(full_dtype,
                                                   "tir.vectorcombine",
                                                   cnts2[0], cnts2[1])
                        shifted_cnts = cnts << tvm.tir.const(
                            bw + bx, pack_dtype)
                        out = tvm.tir.call_llvm_pure_intrin(
                            return_dtype, vpadalu, args_2,
                            zz.vload(0, return_dtype), shifted_cnts)
                    irb.emit(zz.vstore(0, out))
            return irb.get()

        # body, reset, update
        return _instr(0), _instr(1), _instr(2)

    buffer_params = {"offset_factor": 1}
    return te.decl_tensor_intrin(z.op,
                                 _intrin_func,
                                 binds={
                                     w: Wb,
                                     x: Xb,
                                     z: Zb
                                 },
                                 default_buffer_params=buffer_params)
######################################################################
# Example #6
def test_gemm():
    # graph
    nn = 2048
    n = te.var("n")
    n = tvm.runtime.convert(nn)
    m, l = n, n
    A = te.placeholder((l, n), name="A")
    B = te.placeholder((l, m), name="B")
    k = te.reduce_axis((0, l), name="k")
    C = te.compute((m, n),
                   lambda ii, jj: te.sum(A[k, jj] * B[k, ii], axis=k),
                   name="C")

    # schedule
    s = te.create_schedule(C.op)
    AA = s.cache_read(A, "shared", [C])
    BB = s.cache_read(B, "shared", [C])
    AL = s.cache_read(AA, "local", [C])
    BL = s.cache_read(BB, "local", [C])
    CC = s.cache_write(C, "local")

    scale = 8
    num_thread = 8
    block_factor = scale * num_thread
    block_x = te.thread_axis("blockIdx.x")
    thread_x = te.thread_axis((0, num_thread), "threadIdx.x")
    block_y = te.thread_axis("blockIdx.y")
    thread_y = te.thread_axis((0, num_thread), "threadIdx.y")
    thread_xz = te.thread_axis((0, 2), "vthread", name="vx")
    thread_yz = te.thread_axis((0, 2), "vthread", name="vy")

    by, yi = s[C].split(C.op.axis[0], factor=block_factor)
    bx, xi = s[C].split(C.op.axis[1], factor=block_factor)
    s[C].bind(by, block_y)
    s[C].bind(bx, block_x)
    s[C].reorder(by, bx, yi, xi)

    tyz, yi = s[C].split(yi, nparts=2)
    ty, yi = s[C].split(yi, nparts=num_thread)
    txz, xi = s[C].split(xi, nparts=2)
    tx, xi = s[C].split(xi, nparts=num_thread)
    s[C].bind(tyz, thread_yz)
    s[C].bind(txz, thread_xz)
    s[C].bind(ty, thread_y)
    s[C].bind(tx, thread_x)
    s[C].reorder(tyz, txz, ty, tx, yi, xi)
    s[CC].compute_at(s[C], tx)

    yo, xo = CC.op.axis
    ko, ki = s[CC].split(k, factor=8)
    kt, ki = s[CC].split(ki, factor=1)
    s[CC].reorder(ko, kt, ki, yo, xo)
    s[AA].compute_at(s[CC], ko)
    s[BB].compute_at(s[CC], ko)
    s[CC].unroll(kt)
    s[AL].compute_at(s[CC], kt)
    s[BL].compute_at(s[CC], kt)
    # Schedule for A's shared memory load
    ty, xi = s[AA].split(s[AA].op.axis[0], nparts=num_thread)
    _, xi = s[AA].split(s[AA].op.axis[1], factor=num_thread * 4)
    tx, xi = s[AA].split(xi, nparts=num_thread)
    s[AA].bind(ty, thread_y)
    s[AA].bind(tx, thread_x)
    s[AA].vectorize(xi)
    # Schedule for B's shared memory load
    ty, xi = s[BB].split(s[BB].op.axis[0], nparts=num_thread)
    _, xi = s[BB].split(s[BB].op.axis[1], factor=num_thread * 4)
    tx, xi = s[BB].split(xi, nparts=num_thread)
    s[BB].bind(ty, thread_y)
    s[BB].bind(tx, thread_x)
    s[BB].vectorize(xi)
    s[AA].double_buffer()
    s[BB].double_buffer()

    # correctness
    def check_device(device):
        dev = tvm.device(device, 0)
        if not dev.exist:
            print("Skip because %s is not enabled" % device)
            return
        print("Device %s" % device)
        f = tvm.build(s, [A, B, C], device)
        # launch the kernel.
        n, m, l = nn, nn, nn
        a_np = np.random.uniform(size=(n, l)).astype(A.dtype)
        b_np = np.random.uniform(size=(m, l)).astype(B.dtype)
        a = tvm.nd.array(a_np, dev)
        b = tvm.nd.array(b_np, dev)
        c = tvm.nd.array(np.zeros((n, m), dtype=C.dtype), dev)
        for i in range(2):
            f(a, b, c)
        tvm.testing.assert_allclose(c.numpy(), np.dot(b_np.T, a_np), rtol=1e-5)

        num_flops = 2 * nn * nn * nn
        num_runs = 10
        timer_f = f.time_evaluator(f.entry_name, dev, number=num_runs)
        t = timer_f(a, b, c).mean
        GFLOPS = num_flops / (t * 1e3) / 1e6
        print("average time cost of %d runs = %g ms, %g GFLOPS." %
              (num_runs, t * 1e3, GFLOPS))

    for device in ["cuda", "opencl", "rocm", "nvptx", "vulkan"]:
        with tvm.transform.PassContext(
                config={
                    "tir.UnrollLoop": {
                        "auto_max_step": 128,
                        "explicit_unroll": device != "cuda"
                    }
                }):
            check_device(device)
######################################################################
# Example #7
def te_element_wise():
    A = te.placeholder((128, 128), name="A")
    B = te.compute((128, 128), lambda x, y: A[x, y] * 2, name="B")
    C = te.compute((128, 128), lambda x, y: B[x, y] + 1, name="C")
    return [A, C]
######################################################################
# Example #8
        def check_padded_load(pad_before, pad_after, test_name=None):
            # declare
            n = 3
            m = 5
            x = te.placeholder((n, m, env.BATCH, env.BLOCK_OUT),
                               name="x",
                               dtype=env.acc_dtype)
            x_buf = topi.nn.pad(x, pad_before, pad_after, name="y")
            # insert no-op that won't be optimized away
            y_buf = te.compute(
                (
                    n + pad_before[0] + pad_after[0],
                    m + pad_before[1] + pad_after[1],
                    env.BATCH,
                    env.BLOCK_OUT,
                ),
                lambda *i: x_buf(*i) >> 0,
                "y_buf",
            )
            y = te.compute(
                (
                    n + pad_before[0] + pad_after[0],
                    m + pad_before[1] + pad_after[1],
                    env.BATCH,
                    env.BLOCK_OUT,
                ),
                lambda *i: y_buf(*i).astype(env.inp_dtype),
                "y",
            )
            # schedule
            s = te.create_schedule(y.op)
            s[x_buf].set_scope(env.acc_scope)
            s[x_buf].pragma(x_buf.op.axis[0], env.dma_copy)
            s[y_buf].set_scope(env.acc_scope)
            s[y_buf].pragma(y_buf.op.axis[0], env.alu)
            s[y].pragma(y.op.axis[0], env.dma_copy)
            # build
            with vta.build_config():
                mod = vta.build(
                    s, [x, y],
                    tvm.target.Target("ext_dev", host=env.target_host))

            if not remote:
                return
            temp = utils.tempdir()
            mod.save(temp.relpath("padded_load.o"))
            remote.upload(temp.relpath("padded_load.o"))
            f = remote.load_module("padded_load.o")
            # verify
            dev = remote.ext_dev(0)
            x_np = np.random.randint(0,
                                     10,
                                     size=(n, m, env.BATCH,
                                           env.BLOCK_OUT)).astype(x.dtype)
            y_np = np.zeros((
                n + pad_before[0] + pad_after[0],
                m + pad_before[1] + pad_after[1],
                env.BATCH,
                env.BLOCK_OUT,
            )).astype(y.dtype)
            y_np[pad_before[0]:pad_before[0] + n,
                 pad_before[1]:pad_before[1] + m, :] = x_np
            x_nd = tvm.nd.array(x_np, dev)
            y_nd = tvm.nd.empty(y_np.shape, device=dev, dtype=y_np.dtype)

            if env.TARGET in ["sim", "tsim"]:
                simulator.clear_stats()

            f(x_nd, y_nd)

            np.testing.assert_equal(y_np, y_nd.numpy())

            if env.TARGET in ["sim", "tsim"]:
                sim_stats = simulator.stats()
                print("Padded {} load execution statistics:".format(test_name))
                for k, v in sim_stats.items():
                    print("\t{:<16}: {:>16}".format(k, v))
######################################################################
# Example #9
def _sparse_dense_sp_rhs_bsrmm(data, weight_data, weight_indices, weight_indptr,
                               data_layout, weight_layout, output_layout):
    if data_layout == 'hwc':
        (m, k) = get_const_tuple(data.shape)
    elif data_layout == 'chw':
        (k, m) = get_const_tuple(data.shape)
    
    if weight_layout == 'oi':
        (nnz, bs_o, bs_i) = get_const_tuple(weight_data.shape)
    elif weight_layout == 'io':
        (nnz, bs_i, bs_o) = get_const_tuple(weight_data.shape)

    (num_blocks_plus_1,) = get_const_tuple(weight_indptr.shape)
    num_blocks = num_blocks_plus_1 - 1

    def _compute_block_hwc(i, nb_j, j):
        row_start = weight_indptr[nb_j]
        row_end = weight_indptr[nb_j + 1]
        row_elems = row_end - row_start
        elem_idx = te.reduce_axis((0, row_elems), name="elem_idx")
        block_offset = row_start + elem_idx
        c = te.reduce_axis((0, bs_i), name="c")
        block_j = weight_indices[block_offset]
        if weight_layout == 'oi':
            block_ij_val = weight_data[block_offset][j][c]
        elif weight_layout == 'io':
            block_ij_val = weight_data[block_offset][c][j]
        if data_layout == 'hwc':
            x_val = data[i, bs_i * block_j + c]
        elif data_layout == 'chw':
            x_val = data[bs_i * block_j + c, i]
        return te.sum(block_ij_val * x_val, axis=[elem_idx, c])

    def _compute_block_chw(nb_j, j, i):
        row_start = weight_indptr[nb_j]
        row_end = weight_indptr[nb_j + 1]
        row_elems = row_end - row_start
        elem_idx = te.reduce_axis((0, row_elems), name="elem_idx")
        block_offset = row_start + elem_idx
        c = te.reduce_axis((0, bs_i), name="c")
        block_j = weight_indices[block_offset]
        if weight_layout == 'oi':
            block_ij_val = weight_data[block_offset][j][c]
        elif weight_layout == 'io':
            block_ij_val = weight_data[block_offset][c][j]
        if data_layout == 'hwc':
            x_val = data[i, bs_i * block_j + c]
        elif data_layout == 'chw':
            x_val = data[bs_i * block_j + c, i]
        return te.sum(block_ij_val * x_val, axis=[elem_idx, c])

    idxd = tvm.tir.indexdiv
    idxm = tvm.tir.indexmod

    if output_layout == 'hwc':
        bsrmm_block = te.compute(
            (m, num_blocks, bs_o),
            _compute_block_hwc,
            tag="sparse_dense_v2_block_hwc",
            attrs={"FLOP": 2 * m * nnz * bs_o * bs_i},
        )
        return te.compute(
            (m, num_blocks * bs_o),
            lambda m, n: bsrmm_block[m, idxd(n, bs_o), idxm(n, bs_o)],
            tag="sparse_dense_v2_hwc",
        )
    elif output_layout == 'chw':
        bsrmm_block = te.compute(
            (num_blocks, bs_o, m),
            _compute_block_chw,
            tag="sparse_dense_v2_block_chw",
            attrs={"FLOP": 2 * m * nnz * bs_o * bs_i},
        )
        return te.compute(
            (num_blocks * bs_o, m),
            lambda n, m: bsrmm_block[idxd(n, bs_o), idxm(n, bs_o), m],
            tag="sparse_dense_v2_chw",
        )
######################################################################
# Example #10
    def _run(env, remote):
        m = 8
        n = 10
        # compute
        a = te.placeholder((m, n, env.BATCH, env.BLOCK_OUT),
                           name="a",
                           dtype=env.acc_dtype)
        a_buf = te.compute((m, n, env.BATCH, env.BLOCK_OUT), lambda *i: a(*i),
                           "a_buf")  # DRAM->SRAM
        max_buf = te.compute((m, n, env.BATCH, env.BLOCK_OUT),
                             lambda *i: tvm.te.max(a_buf(*i), 0),
                             "res_buf")  # relu
        min_buf = te.compute(
            (m, n, env.BATCH, env.BLOCK_OUT),
            lambda *i: tvm.te.min(max_buf(*i), (1 << (env.INP_WIDTH - 1)) - 1),
            "max_buf",
        )  # relu
        res = te.compute(
            (m, n, env.BATCH, env.BLOCK_OUT),
            lambda *i: min_buf(*i).astype(env.inp_dtype),
            "min_buf",
        )  # SRAM->DRAM
        # schedule
        s = te.create_schedule(res.op)
        s[a_buf].set_scope(env.acc_scope)  # SRAM
        s[a_buf].pragma(a_buf.op.axis[0], env.dma_copy)  # DRAM->SRAM
        s[max_buf].set_scope(env.acc_scope)  # SRAM
        s[min_buf].set_scope(env.acc_scope)  # SRAM
        s[max_buf].pragma(max_buf.op.axis[0], env.alu)  # compute
        s[min_buf].pragma(min_buf.op.axis[0], env.alu)  # compute
        s[res].pragma(res.op.axis[0], env.dma_copy)  # SRAM->DRAM
        # build
        with vta.build_config():
            mod = vta.build(s, [a, res],
                            tvm.target.Target("ext_dev", host=env.target_host))
        if not remote:
            return
        temp = utils.tempdir()
        mod.save(temp.relpath("load_act.o"))
        remote.upload(temp.relpath("load_act.o"))
        f = remote.load_module("load_act.o")
        # verify
        dev = remote.ext_dev(0)
        a_np = np.random.randint(-256,
                                 256,
                                 size=(m, n, env.BATCH,
                                       env.BLOCK_OUT)).astype(a.dtype)
        res_np = np.clip(a_np, 0, (1 <<
                                   (env.INP_WIDTH - 1)) - 1).astype(res.dtype)
        a_nd = tvm.nd.array(a_np, dev)
        res_nd = tvm.nd.array(
            np.zeros((m, n, env.BATCH, env.BLOCK_OUT)).astype(res.dtype), dev)

        if env.TARGET in ["sim", "tsim"]:
            simulator.clear_stats()

        f(a_nd, res_nd)

        np.testing.assert_equal(res_np, res_nd.numpy())

        if env.TARGET in ["sim", "tsim"]:
            sim_stats = simulator.stats()
            print("Relu execution statistics:")
            for k, v in sim_stats.items():
                print("\t{:<16}: {:>16}".format(k, v))
######################################################################
# Example #11
    def _run(env, remote):
        m = 2
        n = 8
        imm_shift = np.random.randint(0, 8)
        imm_scale = np.random.randint(1, 5)
        # compute
        a = te.placeholder((m, n, env.BATCH, env.BLOCK_OUT),
                           name="a",
                           dtype=env.acc_dtype)
        a_buf = te.compute((m, n, env.BATCH, env.BLOCK_OUT), lambda *i: a(*i),
                           "a_buf")  # DRAM->SRAM
        res_shift = te.compute((m, n, env.BATCH, env.BLOCK_OUT),
                               lambda *i: a_buf(*i) + imm_shift,
                               "res_shift")  # compute
        res_scale = te.compute((m, n, env.BATCH, env.BLOCK_OUT),
                               lambda *i: res_shift(*i) >> imm_scale,
                               "res_scale")  # compute
        res = te.compute((m, n, env.BATCH, env.BLOCK_OUT),
                         lambda *i: res_scale(*i).astype(env.inp_dtype),
                         "res")  # SRAM->DRAM
        # schedule
        s = te.create_schedule(res.op)
        s[a_buf].set_scope(env.acc_scope)  # SRAM
        s[res_shift].set_scope(env.acc_scope)  # SRAM
        s[res_scale].set_scope(env.acc_scope)  # SRAM
        s[a_buf].pragma(a_buf.op.axis[0], env.dma_copy)  # DRAM->SRAM
        s[res_shift].pragma(res_shift.op.axis[0], env.alu)  # compute
        s[res_scale].pragma(res_scale.op.axis[0], env.alu)  # compute
        s[res].pragma(res.op.axis[0], env.dma_copy)  # SRAM->DRAM
        # build
        mod = vta.build(s, [a, res],
                        tvm.target.Target("ext_dev", host=env.target_host))
        if not remote:
            return
        temp = utils.tempdir()
        mod.save(temp.relpath("load_act.o"))
        remote.upload(temp.relpath("load_act.o"))
        f = remote.load_module("load_act.o")
        # verify
        dev = remote.ext_dev(0)
        a_np = np.random.randint(-10,
                                 10,
                                 size=(m, n, env.BATCH,
                                       env.BLOCK_OUT)).astype(a.dtype)
        res_np = np.right_shift((a_np + imm_shift), imm_scale)
        res_np = res_np.astype(res.dtype)
        a_nd = tvm.nd.array(a_np, dev)
        res_nd = tvm.nd.array(
            np.zeros((m, n, env.BATCH, env.BLOCK_OUT)).astype(res.dtype), dev)

        if env.TARGET in ["sim", "tsim"]:
            simulator.clear_stats()

        f(a_nd, res_nd)

        np.testing.assert_equal(res_np, res_nd.numpy())

        if env.TARGET in ["sim", "tsim"]:
            sim_stats = simulator.stats()
            print("Shift and scale execution statistics:")
            for k, v in sim_stats.items():
                print("\t{:<16}: {:>16}".format(k, v))
######################################################################
# Example #12
        def check_alu(tvm_op, np_op=None, use_imm=False, test_name=None):
            """Test ALU"""
            m = 8
            n = 8
            imm = np.random.randint(1, 5)
            # compute
            a = te.placeholder((m, n, env.BATCH, env.BLOCK_OUT),
                               name="a",
                               dtype=env.acc_dtype)
            a_buf = te.compute((m, n, env.BATCH, env.BLOCK_OUT),
                               lambda *i: a(*i), "a_buf")  # DRAM->SRAM
            if use_imm:
                res_buf = te.compute((m, n, env.BATCH, env.BLOCK_OUT),
                                     lambda *i: tvm_op(a_buf(*i), imm),
                                     "res_buf")  # compute
            else:
                b = te.placeholder((m, n, env.BATCH, env.BLOCK_OUT),
                                   name="b",
                                   dtype=env.acc_dtype)
                b_buf = te.compute((m, n, env.BATCH, env.BLOCK_OUT),
                                   lambda *i: b(*i), "b_buf")  # DRAM->SRAM
                res_buf = te.compute(
                    (m, n, env.BATCH, env.BLOCK_OUT),
                    lambda *i: tvm_op(a_buf(*i), b_buf(*i)),
                    "res_buf",
                )  # compute
            res = te.compute(
                (m, n, env.BATCH, env.BLOCK_OUT),
                lambda *i: res_buf(*i).astype(env.inp_dtype),
                "res",
            )  # SRAM->DRAM
            # schedule
            s = te.create_schedule(res.op)
            s[a_buf].set_scope(env.acc_scope)  # SRAM
            s[a_buf].pragma(a_buf.op.axis[0], env.dma_copy)  # DRAM->SRAM
            s[res_buf].set_scope(env.acc_scope)  # SRAM
            s[res_buf].pragma(res_buf.op.axis[0], env.alu)  # compute
            s[res].pragma(res.op.axis[0], env.dma_copy)  # SRAM->DRAM
            if not use_imm:
                s[b_buf].set_scope(env.acc_scope)  # SRAM
                s[b_buf].pragma(b_buf.op.axis[0], env.dma_copy)  # DRAM->SRAM

            if not remote:
                return

            # build
            with vta.build_config():
                if use_imm:
                    mod = vta.build(
                        s, [a, res],
                        tvm.target.Target("ext_dev", host=env.target_host))
                else:
                    mod = vta.build(
                        s, [a, b, res],
                        tvm.target.Target("ext_dev", host=env.target_host))
            temp = utils.tempdir()
            mod.save(temp.relpath("load_act.o"))
            remote.upload(temp.relpath("load_act.o"))
            f = remote.load_module("load_act.o")
            # verify
            dev = remote.ext_dev(0)
            a_np = np.random.randint(-16,
                                     16,
                                     size=(m, n, env.BATCH,
                                           env.BLOCK_OUT)).astype(a.dtype)
            if use_imm:
                res_np = np_op(a_np, imm) if np_op else tvm_op(a_np, imm)
            else:
                b_np = np.random.randint(-16,
                                         16,
                                         size=(m, n, env.BATCH,
                                               env.BLOCK_OUT)).astype(b.dtype)
                res_np = np_op(a_np, b_np) if np_op else tvm_op(a_np, b_np)
            res_np = res_np.astype(res.dtype)
            a_nd = tvm.nd.array(a_np, dev)
            res_nd = tvm.nd.array(
                np.zeros((m, n, env.BATCH, env.BLOCK_OUT)).astype(res.dtype),
                dev)

            if env.TARGET in ["sim", "tsim"]:
                simulator.clear_stats()

            if use_imm:
                f(a_nd, res_nd)
            else:
                b_nd = tvm.nd.array(b_np, dev)
                f(a_nd, b_nd, res_nd)

            np.testing.assert_equal(res_np, res_nd.numpy())

            if env.TARGET in ["sim", "tsim"]:
                sim_stats = simulator.stats()
                print("ALU {} execution statistics:".format(test_name))
                for k, v in sim_stats.items():
                    print("\t{:<16}: {:>16}".format(k, v))
######################################################################
# Example #13
    def _run(env, remote):
        # declare
        o = 4
        n = 1
        m = 4
        x = te.placeholder((o, n, env.BATCH, env.BLOCK_IN),
                           name="x",
                           dtype=env.inp_dtype)
        w = te.placeholder((m, n, env.BLOCK_OUT, env.BLOCK_IN),
                           name="w",
                           dtype=env.wgt_dtype)
        x_buf = te.compute((o, n, env.BATCH, env.BLOCK_IN), lambda *i: x(*i),
                           "x_buf")
        w_buf = te.compute((m, n, env.BLOCK_OUT, env.BLOCK_IN),
                           lambda *i: w(*i), "w_buf")
        ko = te.reduce_axis((0, n), name="ko")
        ki = te.reduce_axis((0, env.BLOCK_IN), name="ki")
        y_gem = te.compute(
            (o, m, env.BATCH, env.BLOCK_OUT),
            lambda bo, co, bi, ci: te.sum(
                x_buf[bo, ko, bi, ki].astype(env.acc_dtype) * w_buf[
                    co, ko, ci, ki].astype(env.acc_dtype),
                axis=[ko, ki],
            ),
            name="y_gem",
        )
        y_shf = te.compute((o, m, env.BATCH, env.BLOCK_OUT),
                           lambda *i: y_gem(*i) >> 8,
                           name="y_shf")
        y_max = te.compute((o, m, env.BATCH, env.BLOCK_OUT),
                           lambda *i: tvm.te.max(y_shf(*i), 0),
                           "y_max")  # relu
        y_min = te.compute(
            (o, m, env.BATCH, env.BLOCK_OUT),
            lambda *i: tvm.te.min(y_max(*i), (1 << (env.INP_WIDTH - 1)) - 1),
            "y_min",
        )  # relu
        y = te.compute((o, m, env.BATCH, env.BLOCK_OUT),
                       lambda *i: y_min(*i).astype(env.inp_dtype),
                       name="y")

        if not remote:
            return

        def verify(s, name=None):
            # Build with the CSE pass disabled as otherwise it would complicate the test
            with vta.build_config(disabled_pass={"tir.CommonSubexprElimTIR"}):
                mod = vta.build(
                    s, [x, w, y],
                    tvm.target.Target("ext_dev", host=env.target_host))
            temp = utils.tempdir()
            mod.save(temp.relpath("gemm.o"))
            remote.upload(temp.relpath("gemm.o"))
            f = remote.load_module("gemm.o")
            # verify
            dev = remote.ext_dev(0)
            x_np = np.random.randint(-128,
                                     128,
                                     size=(o, n, env.BATCH,
                                           env.BLOCK_IN)).astype(x.dtype)
            w_np = np.random.randint(-128,
                                     128,
                                     size=(m, n, env.BLOCK_OUT,
                                           env.BLOCK_IN)).astype(w.dtype)
            y_np = np.zeros((o, m, env.BATCH, env.BLOCK_OUT)).astype(y.dtype)
            x_nd = tvm.nd.array(x_np, dev)
            w_nd = tvm.nd.array(w_np, dev)
            y_nd = tvm.nd.array(y_np, dev)
            y_np = y_np.astype(env.acc_dtype)
            for b in range(o):
                for i in range(m):
                    for j in range(n):
                        y_np[b, i, :] += np.dot(
                            x_np[b, j, :].astype(env.acc_dtype),
                            w_np[i, j].T.astype(env.acc_dtype))
            y_np = np.right_shift(y_np, 8)
            y_np = np.clip(y_np, 0, (1 <<
                                     (env.INP_WIDTH - 1)) - 1).astype(y.dtype)

            if env.TARGET in ["sim", "tsim"]:
                simulator.clear_stats()

            f(x_nd, w_nd, y_nd)

            np.testing.assert_equal(y_np, y_nd.numpy())

            if env.TARGET in ["sim", "tsim"]:
                sim_stats = simulator.stats()
                print("GEMM schedule:{} execution statistics:".format(name))
                for k, v in sim_stats.items():
                    print("\t{:<16}: {:>16}".format(k, v))

        def test_schedule1():
            # default schedule with no smt
            s = te.create_schedule(y.op)
            # set the scope of the SRAM buffers
            s[x_buf].set_scope(env.inp_scope)
            s[w_buf].set_scope(env.wgt_scope)
            s[y_gem].set_scope(env.acc_scope)
            s[y_shf].set_scope(env.acc_scope)
            s[y_max].set_scope(env.acc_scope)
            s[y_min].set_scope(env.acc_scope)
            # set pragmas for DMA transfer and ALU ops
            s[x_buf].compute_at(s[y_gem], ko)
            s[x_buf].pragma(s[x_buf].op.axis[0], env.dma_copy)
            s[w_buf].compute_at(s[y_gem], ko)
            s[w_buf].pragma(s[w_buf].op.axis[0], env.dma_copy)
            s[y_shf].pragma(s[y_shf].op.axis[0], env.alu)
            s[y_max].pragma(s[y_max].op.axis[0], env.alu)
            s[y_min].pragma(s[y_min].op.axis[0], env.alu)
            s[y].pragma(s[y].op.axis[0], env.dma_copy)
            # tensorization
            s[y_gem].reorder(
                ko,
                s[y_gem].op.axis[0],
                s[y_gem].op.axis[1],
                s[y_gem].op.axis[2],
                s[y_gem].op.axis[3],
                ki,
            )
            s[y_gem].tensorize(s[y_gem].op.axis[2], env.gemm)
            verify(s, name="default")

        def test_smt():
            # test smt schedule
            s = te.create_schedule(y.op)
            s[x_buf].set_scope(env.inp_scope)
            s[w_buf].set_scope(env.wgt_scope)
            s[y_gem].set_scope(env.acc_scope)
            s[y_shf].set_scope(env.acc_scope)
            s[y_max].set_scope(env.acc_scope)
            s[y_min].set_scope(env.acc_scope)
            abo, aco, abi, aci = s[y].op.axis
            abo1, abo2 = s[y].split(abo, nparts=2)
            s[y].bind(abo1, te.thread_axis("cthread"))
            s[y_gem].compute_at(s[y], abo1)
            s[y_shf].compute_at(s[y], abo1)
            s[y_max].compute_at(s[y], abo1)
            s[y_min].compute_at(s[y], abo1)
            s[y_gem].reorder(
                ko,
                s[y_gem].op.axis[0],
                s[y_gem].op.axis[1],
                s[y_gem].op.axis[2],
                s[y_gem].op.axis[3],
                ki,
            )
            s[y_gem].tensorize(s[y_gem].op.axis[2], env.gemm)
            s[y_shf].pragma(s[y_shf].op.axis[0], env.alu)
            s[y_max].pragma(s[y_max].op.axis[0], env.alu)
            s[y_min].pragma(s[y_min].op.axis[0], env.alu)
            s[x_buf].compute_at(s[y_gem], ko)
            s[x_buf].pragma(s[x_buf].op.axis[0], env.dma_copy)
            s[w_buf].compute_at(s[y_gem], ko)
            s[w_buf].pragma(s[w_buf].op.axis[0], env.dma_copy)
            s[y].pragma(abo2, env.dma_copy)
            verify(s, name="smt")

        test_schedule1()
        test_smt()
######################################################################
# Example #14
def _schedule_dense_tensorcore(cfg, s, C):
    """Schedule dense operator using Tensorcore"""
    A, B = s[C].op.input_tensors
    batch, out_dim = get_const_tuple(C.shape)
    out_dtype = C.dtype
    s[A].compute_inline()
    s[B].compute_inline()

    # Explicit memory access
    AS = s.cache_read(A, "shared", [C])
    BS = s.cache_read(B, "shared", [C])
    AF = s.cache_read(AS, "wmma.matrix_a", [C])
    BF = s.cache_read(BS, "wmma.matrix_b", [C])
    CF = s.cache_write(C, "wmma.accumulator")
    CS = s.cache_read(CF, "shared", [C])

    # fallback support
    target = tvm.target.Target.current()
    if cfg.is_fallback:
        ref_log = autotvm.tophub.load_reference_log(target.kind.name,
                                                    target.model,
                                                    "dense_tensorcore.cuda")
        cfg.fallback_with_reference_log(ref_log)

    # Deal with op fusion, such as bias and relu
    if C.op not in s.outputs:
        s[C].compute_inline()
        C = s.outputs[0].output(0)

    # create tuning space
    cfg.define_knob("block_row_warps", [1, 2, 4])
    cfg.define_knob("block_col_warps", [1, 2, 4])
    cfg.define_knob("warp_row_tiles", [1, 2, 4])
    cfg.define_knob("warp_col_tiles", [1, 2, 4])
    cfg.define_knob("chunk", [1, 2, 4, 8])
    cfg.define_knob("offset", [0, 8])
    cfg.define_knob("offsetCS", [0, 8])
    cfg.define_knob("vec", [1, 2, 4, 8])

    # Ensure that the default parameters are applicable when autotvm is not in use
    if batch % 32 == 0 and out_dim % 8 == 0:
        cfg.define_knob("wmma_m", [32, 16, 8])
    elif batch % 16 == 0 and out_dim % 16 == 0:
        cfg.define_knob("wmma_m", [16, 8, 32])
    elif batch % 8 == 0 and out_dim % 32 == 0:
        cfg.define_knob("wmma_m", [8, 16, 32])

    warp_size = 32
    wmma_k = 16
    block_row_warps = cfg["block_row_warps"].val
    block_col_warps = cfg["block_col_warps"].val
    warp_row_tiles = cfg["warp_row_tiles"].val
    warp_col_tiles = cfg["warp_col_tiles"].val
    chunk = cfg["chunk"].val
    offset = cfg["offset"].val
    offsetCS = cfg["offsetCS"].val
    wmma_m = cfg["wmma_m"].val
    vec = cfg["vec"].val

    if wmma_m == 16:
        wmma_n = 16
    elif wmma_m == 8:
        wmma_n = 32
    elif wmma_m == 32:
        wmma_n = 8

    # Define the stride of intrin functions
    AS_align = chunk * wmma_k + offset
    BS_align = chunk * wmma_k + offset
    CS_align = warp_col_tiles * block_col_warps * wmma_n + offsetCS
    AS_stride = [AS_align, 1]
    BS_stride = [BS_align, 1]
    AF_stride = [wmma_k, 1]
    BF_stride = [wmma_k, 1]
    CF_stride = [warp_col_tiles * wmma_n, 1]
    CS_stride = [CS_align, 1]

    block_x = te.thread_axis("blockIdx.x")
    block_y = te.thread_axis("blockIdx.y")
    thread_x = te.thread_axis("threadIdx.x")
    thread_y = te.thread_axis("threadIdx.y")
    thread_z = te.thread_axis("threadIdx.z")

    # Schedule for dense computation
    block_factor_b = wmma_m * warp_row_tiles * block_row_warps
    block_factor_o = wmma_n * warp_col_tiles * block_col_warps
    b, o = C.op.axis
    block_i, bc = s[C].split(b, factor=block_factor_b)
    block_j, oc = s[C].split(o, factor=block_factor_o)
    s[C].reorder(block_i, block_j, bc, oc)
    t = s[C].fuse(bc, oc)
    t, vi = s[C].split(t, factor=vec)
    t, tx = s[C].split(t, factor=warp_size)
    t, ty = s[C].split(t, factor=block_row_warps)
    t, tz = s[C].split(t, factor=block_col_warps)
    s[C].bind(block_i, block_x)
    s[C].bind(block_j, block_y)
    s[C].bind(tz, thread_z)
    s[C].bind(ty, thread_y)
    s[C].bind(tx, thread_x)
    s[C].vectorize(vi)

    # Schedule for wmma store
    s[CS].compute_at(s[C], block_j)
    bb, oo = CS.op.axis
    s[CS].storage_align(bb, CS_align - 1, CS_align)
    bb, bbi = s[CS].split(bb, factor=wmma_m)
    oo, ooi = s[CS].split(oo, factor=wmma_n)
    bb, bbii = s[CS].split(bb, factor=warp_row_tiles)
    oo, ooii = s[CS].split(oo, factor=warp_col_tiles)
    s[CS].reorder(bb, oo, bbii, ooii, bbi, ooi)

    # Schedule for wmma computation
    s[CF].compute_at(s[CS], oo)
    warp_i, warp_j = CF.op.axis
    warp_i, _ii = s[CF].split(warp_i, factor=wmma_m)
    warp_j, _jj = s[CF].split(warp_j, factor=wmma_n)
    (k, ) = CF.op.reduce_axis
    k, _k = s[CF].split(k, factor=wmma_k)
    ko, ki = s[CF].split(k, factor=chunk)
    s[CF].reorder(ko, ki, warp_i, warp_j, _ii, _jj, _k)

    # Schedule for wmma_matrix_a load
    s[AF].compute_at(s[CF], ki)
    b, i = AF.op.axis
    b, b_ii = s[AF].split(b, factor=wmma_m)
    i, i_jj = s[AF].split(i, factor=wmma_k)
    s[AF].reorder(b, i, b_ii, i_jj)

    # Schedule for wmma_matrix_b load
    s[BF].compute_at(s[CF], ki)
    o, i = BF.op.axis
    o, o_ii = s[BF].split(o, factor=wmma_n)
    i, i_ii = s[BF].split(i, factor=wmma_k)
    s[BF].reorder(o, i, o_ii, i_ii)

    # Schedule for A's and B's shared memory load
    def shared_shedule(stage, strides):
        s[stage].compute_at(s[CF], ko)
        xo, yo = stage.op.axis
        s[stage].storage_align(xo, strides - 1, strides)
        t = s[stage].fuse(xo, yo)
        t, vi = s[stage].split(t, factor=vec)
        t, tx = s[stage].split(t, factor=warp_size)
        t, ty = s[stage].split(t, factor=block_row_warps)
        _, tz = s[stage].split(t, factor=block_col_warps)
        s[stage].bind(ty, thread_y)
        s[stage].bind(tz, thread_z)
        s[stage].bind(tx, thread_x)
        s[stage].vectorize(vi)

    shared_shedule(AS, AS_align)
    shared_shedule(BS, BS_align)

    shape = (wmma_m, wmma_n, wmma_k)
    in_dtype = "float16"
    AL_gemm = te.placeholder((wmma_m, wmma_k), name="AL_gemm", dtype=in_dtype)
    BL_gemm = te.placeholder((wmma_n, wmma_k), name="BL_gemm", dtype=in_dtype)
    k_gemm = te.reduce_axis((0, wmma_k), name="k_gemm")
    CL_compute = te.compute(
        (wmma_m, wmma_n),
        lambda ii, jj: te.sum(
            AL_gemm[ii, k_gemm].astype(out_dtype) * BL_gemm[jj, k_gemm].astype(
                out_dtype),
            axis=k_gemm,
        ),
        name="CL_compute",
    )

    # lower the computation loops down to TensorCore hardware intrinsics
    # by mapping the dense tensorcore to tensor intrinsics
    s[AF].tensorize(
        b_ii,
        intrin_wmma_load_matrix_A(AF_stride, AS_stride, shape, "row_major",
                                  (wmma_m, wmma_k), (wmma_m, wmma_k),
                                  "float16"),
    )
    s[BF].tensorize(
        o_ii,
        intrin_wmma_load_matrix_W(BF_stride, BS_stride, shape, "col_major",
                                  (wmma_n, wmma_k), (wmma_n, wmma_k),
                                  "float16"),
    )
    s[CF].tensorize(
        _ii,
        intrin_wmma_gemm(AL_gemm, BL_gemm, CL_compute, AF_stride, BF_stride,
                         CF_stride, shape))
    s[CS].tensorize(
        bbi,
        intrin_wmma_store_matrix(CS_stride, CF_stride, shape, out_dtype,
                                 (wmma_m, wmma_n), (wmma_m, wmma_n)),
    )
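The CL_compute declaration above only describes the per-fragment GEMM that intrin_wmma_gemm pattern-matches during tensorization. As a rough NumPy sketch of that fragment-level computation (the fragment sizes here are illustrative assumptions; 16x16x16 is a common fp16 WMMA shape):

import numpy as np

# One WMMA fragment multiply-accumulate in plain NumPy (illustrative sizes).
wmma_m, wmma_n, wmma_k = 16, 16, 16
A_frag = np.random.rand(wmma_m, wmma_k).astype("float16")
B_frag = np.random.rand(wmma_n, wmma_k).astype("float16")
# CL_compute sums over k of A[ii, k] * B[jj, k], i.e. A @ B.T with fp32 accumulation.
C_frag = A_frag.astype("float32") @ B_frag.astype("float32").T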
Example #15
def roi_align_nchw(data, rois, pooled_size, spatial_scale, sample_ratio=-1):
    """ROI align operator in NCHW layout.

    Parameters
    ----------
    data : tvm.te.Tensor
        4-D with shape [batch, channel, height, width]

    rois : tvm.te.Tensor
        2-D with shape [num_roi, 5]. The last dimension should be in format of
        [batch_index, w_start, h_start, w_end, h_end]

    pooled_size : int or list/tuple of two ints
        output size, or [out_height, out_width]

    spatial_scale : float
        Ratio of input feature map height (or width) to raw image height (or width). Equals the
        reciprocal of the total stride in the convolutional layers, and should be in range (0.0, 1.0]

    sample_ratio : int
        Optional sampling ratio of ROI align, using adaptive size by default.

    Returns
    -------
    output : tvm.te.Tensor
        4-D with shape [num_roi, channel, pooled_size, pooled_size]
    """
    dtype = rois.dtype
    _, channel, height, width = get_const_tuple(data.shape)
    num_roi, _ = get_const_tuple(rois.shape)

    if isinstance(pooled_size, int):
        pooled_size_h = pooled_size_w = pooled_size
    else:
        pooled_size_h, pooled_size_w = pooled_size

    def _bilinear(i, c, y, x):
        outside = tvm.tir.any(y < -1.0, x < -1.0, y > height, x > width)
        y = tvm.te.max(y, 0.0)
        x = tvm.te.max(x, 0.0)
        val = bilinear_sample_nchw(data, (i, c, y, x), height - 1, width - 1)
        return tvm.tir.if_then_else(outside, 0.0, val)

    def _sample(i, c, ph, pw):
        roi = rois[i]
        batch_index = roi[0].astype("int32")
        roi_start_w, roi_start_h, roi_end_w, roi_end_h = roi[1], roi[2], roi[
            3], roi[4]
        roi_start_h *= spatial_scale
        roi_end_h *= spatial_scale
        roi_start_w *= spatial_scale
        roi_end_w *= spatial_scale

        # force malformed ROIs to be 1x1
        roi_h = tvm.te.max(roi_end_h - roi_start_h, tvm.tir.const(1.0, dtype))
        roi_w = tvm.te.max(roi_end_w - roi_start_w, tvm.tir.const(1.0, dtype))

        bin_h = roi_h / pooled_size_h
        bin_w = roi_w / pooled_size_w

        if sample_ratio > 0:
            roi_bin_grid_h = roi_bin_grid_w = tvm.tir.const(
                sample_ratio, "int32")
        else:
            roi_bin_grid_h = te.ceil(roi_h / pooled_size_h).astype("int32")
            roi_bin_grid_w = te.ceil(roi_w / pooled_size_w).astype("int32")

        count = roi_bin_grid_h * roi_bin_grid_w
        rh = te.reduce_axis((0, roi_bin_grid_h))
        rw = te.reduce_axis((0, roi_bin_grid_w))
        roi_start_h += ph * bin_h
        roi_start_w += pw * bin_w
        return te.sum(
            _bilinear(
                batch_index,
                c,
                roi_start_h + (rh + 0.5) * bin_h / roi_bin_grid_h,
                roi_start_w + (rw + 0.5) * bin_w / roi_bin_grid_w,
            ) / count,
            axis=[rh, rw],
        )

    return te.compute((num_roi, channel, pooled_size_h, pooled_size_w),
                      _sample,
                      tag="pool,roi_align_nchw")
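A minimal usage sketch for the operator above (shapes and parameters are illustrative assumptions, not values from the original source):

data = te.placeholder((1, 256, 56, 56), name="data")
rois = te.placeholder((100, 5), name="rois")
# 7x7 output bins per ROI, feature map at 1/16 of the image resolution,
# and a fixed 2x2 sampling grid per bin.
out = roi_align_nchw(data, rois, pooled_size=7, spatial_scale=1.0 / 16, sample_ratio=2)
s = te.create_schedule(out.op)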
Example #16
def _decl_winograd(cfg, data, kernel, strides, padding, dilation, out_dtype, tile_size):
    N, CI, IH, IW = get_const_tuple(data.shape)
    if isinstance(N, tvm.tir.Any):
        N = tvm.te.size_var("n")
    if not isinstance(IH, int) or not isinstance(IW, int):
        raise RuntimeError("ARM winograd conv2d doesn't support dynamic input height or width.")

    if isinstance(dilation, int):
        dilation_h = dilation_w = dilation
    else:
        dilation_h, dilation_w = dilation

    if len(kernel.shape) == 4:
        if dilation_h != 1 or dilation_w != 1:
            kernel = nn.dilate(kernel, (1, 1, dilation_h, dilation_w))
        pre_computed = False
        CO, _, KH, KW = get_const_tuple(kernel.shape)
    else:
        assert (dilation_h, dilation_w) == (1, 1), "Does not support dilation"
        pre_computed = True
        H_CAT, W_CAT, CO, CI, VC = get_const_tuple(kernel.shape)
        CO *= VC
        KH, KW = H_CAT - tile_size + 1, W_CAT - tile_size + 1
    HSTR, WSTR = strides if isinstance(strides, (tuple, list)) else (strides, strides)
    pt, pl, pb, pr = get_pad_tuple(padding, (KH, KW))

    assert KH == 3 and KW == 3 and HSTR == 1 and WSTR == 1
    data_pad = nn.pad(data, (0, 0, pt, pl), (0, 0, pb, pr), name="data_pad")

    idxd = tvm.tir.indexdiv
    idxm = tvm.tir.indexmod

    r = KW
    m = tile_size
    alpha = m + r - 1
    A, B, G = winograd_transform_matrices(m, r, out_dtype)

    K = CO
    C = CI

    H = (IH + pt + pb - 3) // HSTR + 1
    W = (IW + pl + pr - 3) // WSTR + 1
    nH, nW = (H + m - 1) // m, (W + m - 1) // m
    P = N * nH * nW

    # TODO(@kevinthesun): Support tuning/optimization for dynamic shape.
    tile_p = P if isinstance(N, int) else nH * nW
    cfg.define_split("tile_p", cfg.axis(tile_p), num_outputs=2, filter=lambda x: x.size[-1] <= 16)
    cfg.define_split("tile_k", cfg.axis(K), num_outputs=2, filter=lambda x: x.size[-1] <= 16)
    VP = cfg["tile_p"].size[-1]
    VK = cfg["tile_k"].size[-1]

    # pack input tile
    input_tile = te.compute(
        (C, idxd(P, VP), alpha, alpha, VP),
        lambda c, b, eps, nu, bb: data_pad[
            idxd(b * VP + bb, nH * nW),
            c,
            idxm(idxd(b * VP + bb, nW), nH) * m + eps,
            idxm(b * VP + bb, nW) * m + nu,
        ],
        name="d",
    )

    if autotvm.GLOBAL_SCOPE.in_tuning:
        VC = cfg["tile_k"].size[-1]
        kvshape = (KH + tile_size - 1, KW + tile_size - 1, idxd(CO, VC), CI, VC)
        U = tvm.te.placeholder(kvshape, kernel.dtype, name="U")
    else:
        # transform kernel
        if pre_computed:
            U = kernel
        else:
            r_kh = te.reduce_axis((0, KH), "r_kh")
            r_kw = te.reduce_axis((0, KW), "r_kw")
            U = te.compute(
                (alpha, alpha, idxd(K, VK), C, VK),
                lambda eps, nu, k, c, kk: te.sum(
                    kernel[k * VK + kk][c][r_kh][r_kw].astype(out_dtype)
                    * G[eps][r_kh]
                    * G[nu][r_kw],
                    axis=[r_kh, r_kw],
                ),
                name="U",
            )

    # transform image
    r_eps = te.reduce_axis((0, alpha), "r_eps")
    r_nu = te.reduce_axis((0, alpha), "r_nu")
    V = te.compute(
        (alpha, alpha, idxd(P, VP), C, VP),
        lambda eps, nu, b, c, bb: te.sum(
            input_tile[c][b][r_eps][r_nu][bb].astype(out_dtype) * B[r_eps][eps] * B[r_nu][nu],
            axis=[r_eps, r_nu],
        ),
        name="V",
    )

    # batch gemm
    c = te.reduce_axis((0, C), name="c")
    M = te.compute(
        (alpha, alpha, K, P),
        lambda eps, nu, k, b: te.sum(
            U[eps][nu][idxd(k, VK)][c][idxm(k, VK)] * V[eps][nu][idxd(b, VP)][c][idxm(b, VP)],
            axis=c,
        ),
        name="M",
    )

    # inverse transform
    r_eps = te.reduce_axis((0, alpha), "r_eps")
    r_nu = te.reduce_axis((0, alpha), "r_nu")
    Y = te.compute(
        (K, P, m, m),
        lambda k, b, vh, vw: te.sum(
            M[r_eps][r_nu][k][b] * A[r_eps][vh] * A[r_nu][vw], axis=[r_eps, r_nu]
        ),
        name="Y",
    )

    # unpack output
    output = te.compute(
        (N, K, H, W),
        lambda n, k, h, w: Y[k][n * nH * nW + idxd(h, m) * nW + idxd(w, m), idxm(h, m), idxm(w, m)],
        name="output",
        tag="winograd_conv2d_output",
    )

    # we have to manually assign effective GFLOP for winograd
    if isinstance(N, int):
        cfg.add_flop(2 * N * K * H * W * KH * KW * C)
    return output
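As a quick sanity check on the tile arithmetic above: each transformed tile has side alpha = m + r - 1, so with (for example) tile_size m = 4 and a 3x3 kernel, every 6x6 input tile yields a 4x4 output tile, and an H x W output needs nH * nW tiles per image. A small illustrative computation (numbers are assumptions, not from the source):

m, r = 4, 3                                   # tile_size and kernel size
alpha = m + r - 1                             # 6: side of the transformed tile
H = W = 56
nH, nW = (H + m - 1) // m, (W + m - 1) // m   # 14 x 14 tiles
P = 1 * nH * nW                               # tiles per image when N = 1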
Example #17
def conv2d_compute(
    ifm: te.Tensor,
    weight: te.Tensor,
    scale_bias: te.Tensor,
    lut: te.Tensor,
    ifm_scale: float,
    ifm_zero_point: int,
    weight_zero_point: int,
    ofm_scale: float,
    ofm_zero_point: int,
    strides: Tuple[int, int],
    padding: Tuple[int, int, int, int],
    dilation: Union[Tuple[int, int], List[int]],
    activation: str,
    clip_min: int,
    clip_max: int,
    rounding_mode: str,
    upscale: str,
    ifm_layout: str,
    ofm_layout: str,
) -> te.Tensor:
    """A compute operator representing the capabilities of a 2D convolution for the NPU.

    Parameters
    ----------
    ifm : te.Tensor
        The Input Feature Map tensor (IFM).
    weight : te.Tensor
        The weight tensor.
    scale_bias : te.Tensor
        The packed per-channel weight scale and bias tensor.
    lut : te.Tensor
        The look-up table of values to use if activation = "LUT".
    ifm_scale : float
        The quantization scale for the Input Feature Map tensor.
    ifm_zero_point : int
        The quantization zero point for the Input Feature Map tensor.
    weight_zero_point : int
        The quantization zero point for the weight tensor.
    ofm_scale : float
        The quantization scale for the Output Feature Map tensor.
    ofm_zero_point : int
        The quantization zero point for the Output Feature Map tensor.
    strides : tuple
        The 2 dimensional strides as (stride_height, stride_width).
    padding : tuple
        The 4 dimensional padding as (pad_top, pad_left, pad_bottom, pad_right).
    dilation : Union[Tuple[int, int], List[int]]
        The 2 dimensional dilation as (dilation_height, dilation_width).
    activation : str
        The activation function to use.
            "NONE" - no activation function.
            "CLIP" - clip the output between clip_min and clip_max.
            "TANH" - tanh activation function.
            "SIGMOID" - sigmoid activation function.
            "LUT" - use a look-up table to perform the activation function.
    clip_min : int
        The minimum clipping value if activation = "CLIP".
    clip_max : int
        The maximum clipping value if activation = "CLIP".
    rounding_mode : str
        The rounding mode to apply to the Output Feature Map tensor.
            "TFL" - Tensorflow Lite rounding scheme.
            "TRUNCATE" - Truncate towards zero.
            "NATURAL" - Round to nearest value, with x.5 rounded up towards +infinity.
    upscale : str
        The 2x2 upscaling mode to apply to the Input Feature Map tensor.
            "NONE" - no upscaling.
            "NEAREST" - upscale using nearest neighbour.
            "ZEROS" - upscale using zeros.
            "NATURAL" - Round to nearest value, with x.5 rounded up towards +infinity.
    ifm_layout : str
        The layout of the Input Feature Map tensor. Can be "NHWC" or "NHCWB16".
    ofm_layout : str
        The layout of the Output Feature Map tensor. Can be "NHWC" or "NHCWB16".

    Returns
    -------
    te.Tensor
        The OFM tensor.

    """
    assert ifm.shape[0] == 1
    assert ifm_layout in {"NHWC", "NHCWB16"}
    assert ofm_layout in {"NHWC", "NHCWB16"}

    stride_h, stride_w = strides
    dilation_h, dilation_w = dilation
    ofm_channels, kernel_h, kernel_w, ifm_channels = weight.shape

    # Compute operation for the IFM DMA pipeline
    dmaed_ifm = dma_ifm_compute(ifm, ifm_layout, ifm_zero_point, ifm_scale,
                                weight.shape[3], padding)

    # 2D Convolution compute operation
    dilated_kernel_h = (kernel_h - 1) * dilation_h + 1
    dilated_kernel_w = (kernel_w - 1) * dilation_w + 1
    ofm_height = (dmaed_ifm.shape[1] - dilated_kernel_h) // stride_h + 1
    ofm_width = (dmaed_ifm.shape[2] - dilated_kernel_w) // stride_w + 1
    rc = te.reduce_axis((0, ifm_channels), name="rc")
    rh = te.reduce_axis((0, kernel_h), name="ry")
    rw = te.reduce_axis((0, kernel_w), name="rx")

    conv2d_attrs = {
        "op": "ethosu_conv2d",
        "weight_zero_point": weight_zero_point,
        "activation": activation,
        "upscale": upscale,
        "clip_min": clip_min,
        "clip_max": clip_max,
        "rounding_mode": rounding_mode,
        "stride_h": stride_h,
        "stride_w": stride_w,
        "dilation_h": dilation_h,
        "dilation_w": dilation_w,
    }

    # This is a trick to insert the LUT tensor into the TE graph if LUT is present
    lut_expr = (lut[0] +
                lut[255]).astype(ifm.dtype) if activation in ("TANH",
                                                              "LUT") else 0

    # Add the LUT tensor to the attributes to be able to later tell which tensor is the LUT
    if activation in ("TANH", "LUT"):
        conv2d_attrs["lut"] = lut

    conv = te.compute(
        (1, ofm_height, ofm_width, ofm_channels),
        lambda nn, hh, ww, cc: te.sum(
            dmaed_ifm(nn, hh * stride_h + rh * dilation_h, ww * stride_w + rw *
                      dilation_w, rc).astype(ifm.dtype) * weight[
                          cc, rh, rw, rc].astype(ifm.dtype)
            # This is a trick to load 10 elements of the scale_bias at once, not accurate maths
            + (scale_bias[cc, 0] * scale_bias[cc, 9] + lut_expr).astype(ifm.
                                                                        dtype),
            axis=[rh, rw, rc],
        ),
        name="ethosu_conv2d",
        attrs=conv2d_attrs,
    )

    # Compute operation for the OFM DMA pipeline
    return dma_ofm_compute(conv, ofm_layout, ofm_zero_point, ofm_scale,
                           ofm_channels)
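A hypothetical invocation of the compute above (all shapes and quantization parameters here are illustrative assumptions, not values from the original source):

ifm = te.placeholder((1, 56, 56, 32), dtype="int8", name="ifm")
weight = te.placeholder((16, 3, 3, 32), dtype="int8", name="weight")
scale_bias = te.placeholder((16, 10), dtype="uint8", name="scale_bias")
lut = te.placeholder((256,), dtype="int8", name="lut")
ofm = conv2d_compute(
    ifm, weight, scale_bias, lut,
    ifm_scale=1.0, ifm_zero_point=0, weight_zero_point=0,
    ofm_scale=1.0, ofm_zero_point=0,
    strides=(1, 1), padding=(1, 1, 1, 1), dilation=(1, 1),
    activation="NONE", clip_min=0, clip_max=0,
    rounding_mode="TFL", upscale="NONE",
    ifm_layout="NHWC", ofm_layout="NHWC",
)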
Example #18
def _conv3d_ncdhw(cfg, data, kernel, strides, padding, dilation, layout, out_dtype):
    out_dtype = data.dtype if out_dtype is None else out_dtype

    assert isinstance(dilation, int) or len(dilation) == 3
    if isinstance(dilation, int):
        dilation_d, dilation_h, dilation_w = (dilation, dilation, dilation)
    else:
        dilation_d, dilation_h, dilation_w = dilation

    DSTR, HSTR, WSTR = strides
    batch_size, in_channel, in_depth, in_height, in_width = get_const_tuple(data.shape)
    num_filter, _, kernel_depth, kernel_height, kernel_width = get_const_tuple(kernel.shape)

    dilated_kernel_d = (kernel_depth - 1) * dilation_d + 1
    dilated_kernel_h = (kernel_height - 1) * dilation_h + 1
    dilated_kernel_w = (kernel_width - 1) * dilation_w + 1

    pad_front, pad_top, pad_left, pad_back, pad_down, pad_right = get_pad_tuple3d(
        padding, (dilated_kernel_d, dilated_kernel_h, dilated_kernel_w)
    )

    pad_d = pad_front + pad_back
    pad_h = pad_top + pad_down
    pad_w = pad_left + pad_right

    pad_depth = in_depth + pad_d
    pad_height = in_height + pad_h
    pad_width = in_width + pad_w

    out_depth = simplify((in_depth + pad_d - dilated_kernel_d) // DSTR + 1)
    out_height = simplify((in_height + pad_h - dilated_kernel_h) // HSTR + 1)
    out_width = simplify((in_width + pad_w - dilated_kernel_w) // WSTR + 1)

    # pack data
    DOPAD = pad_d != 0 or pad_h != 0 or pad_w != 0
    if DOPAD:
        data_pad = pad(
            data,
            (0, 0, pad_front, pad_top, pad_left),
            (0, 0, pad_back, pad_down, pad_right),
            name="data_pad",
        )
    else:
        data_pad = data

    # fetch schedule
    ic_bn, oc_bn = cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1]

    shape = (batch_size, in_channel // ic_bn, pad_depth, pad_height, ic_bn, pad_width)
    data_vec = te.compute(
        shape, lambda n, C, d, h, c, w: data_pad[n, C * ic_bn + c, d, h, w], name="data_vec"
    )

    # pack kernel
    shape = (
        num_filter // oc_bn,
        in_channel // ic_bn,
        kernel_depth,
        kernel_height,
        kernel_width,
        ic_bn,
        oc_bn,
    )
    kernel_vec = te.compute(
        shape,
        lambda CO, CI, d, h, w, ci, co: kernel[CO * oc_bn + co, CI * ic_bn + ci, d, h, w],
        name="kernel_vec",
    )

    # convolution
    oshape = (batch_size, num_filter // oc_bn, out_depth, out_height, out_width, oc_bn)
    unpack_shape = (batch_size, num_filter, out_depth, out_height, out_width)

    ic = te.reduce_axis((0, in_channel), name="ic")
    kh = te.reduce_axis((0, kernel_height), name="kh")
    kw = te.reduce_axis((0, kernel_width), name="kw")
    kd = te.reduce_axis((0, kernel_depth), name="kd")
    idxmod = tvm.tir.indexmod
    idxdiv = tvm.tir.indexdiv

    conv = te.compute(
        oshape,
        lambda n, oc_chunk, od, oh, ow, oc_block: te.sum(
            data_vec[
                n,
                idxdiv(ic, ic_bn),
                od * DSTR + kd * dilation_d,
                oh * HSTR + kh * dilation_h,
                idxmod(ic, ic_bn),
                ow * WSTR + kw * dilation_w,
            ].astype(out_dtype)
            * kernel_vec[
                oc_chunk, idxdiv(ic, ic_bn), kd, kh, kw, idxmod(ic, ic_bn), oc_block
            ].astype(out_dtype),
            axis=[ic, kd, kh, kw],
        ),
        name="conv",
    )
    conv_unpacked = te.compute(
        unpack_shape,
        lambda n, c, d, h, w: conv[n, idxdiv(c, oc_bn), d, h, w, idxmod(c, oc_bn)].astype(
            out_dtype
        ),
        name="output_unpack",
        tag="conv3d_ncdhw",
    )
    return conv_unpacked
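The output extents follow the usual dilated-convolution formula; a worked example with illustrative numbers:

# out = (in + pad - dilated_kernel) // stride + 1, with
# dilated_kernel = (kernel - 1) * dilation + 1.
in_depth, pad_d, kernel_depth, dilation_d, DSTR = 16, 2, 3, 1, 1
dilated_kernel_d = (kernel_depth - 1) * dilation_d + 1          # 3
out_depth = (in_depth + pad_d - dilated_kernel_d) // DSTR + 1   # 16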
Example #19
def te_func():
    a = te.placeholder((), name="a", dtype="int32")
    b = te.placeholder((), name="b", dtype="int32")
    c = te.compute(a.shape, lambda *i: a(*i) + b(*i), name="c")
    return [a, b, c]
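A minimal sketch of one plausible way such a helper is used (an assumption, not from the original source): lowering the zero-dimensional add into a TensorIR PrimFunc.

a, b, c = te_func()
prim_func = te.create_prim_func([a, b, c])  # lower the TE DAG to a PrimFunc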
Example #20
def conv1d_ncw(data,
               kernel,
               strides=1,
               padding='VALID',
               dilation=1,
               out_dtype=None):
    """ 1D convolution forward operator for NCW layout.

    Parameters
    ----------
    data : tvm.te.Tensor
        3-D with shape [batch, in_channel, in_width]

    kernel : tvm.te.Tensor
        3-D with shape [num_filter, in_channel, filter_size]

    strides : int or tuple
        The spatial stride along width

    padding : int, tuple, or str
        Padding size can be an integer for equal padding,
        a tuple of (left, right) or a string in ['VALID', 'SAME'].

    dilation : int or tuple
        Dilation rate if convolution should be dilated.

    out_dtype : str
        The output data type. If None then output is same type as input.
    """
    if out_dtype is None:
        out_dtype = data.dtype
    if isinstance(strides, (tuple, list)):
        strides = strides[0]
    if isinstance(dilation, (tuple, list)):
        dilation = dilation[0]

    batch, in_channels, data_width = data.shape
    out_channels, _, kernel_size = kernel.shape

    # Compute the output shape
    dilated_kernel_size = (kernel_size - 1) * dilation + 1
    pad_left, pad_right = get_pad_tuple1d(padding, (dilated_kernel_size, ))
    out_channels = simplify(out_channels)
    out_width = simplify(
        (data_width - dilated_kernel_size + pad_left + pad_right) // strides +
        1)

    # Apply padding
    pad_before = [0, 0, pad_left]
    pad_after = [0, 0, pad_right]
    temp = pad(data, pad_before, pad_after, name='pad_temp')

    # Compute graph
    rc = te.reduce_axis((0, in_channels), name='rc')
    rw = te.reduce_axis((0, kernel_size), name='rw')

    return te.compute(
        (batch, out_channels, out_width),
        lambda b, c, w: te.sum(temp[b, rc, w * strides + rw * dilation].astype(
            out_dtype) * kernel[c, rc, rw].astype(out_dtype),
                               axis=[rc, rw]),
        tag="conv1d_ncw")
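Usage sketch with illustrative shapes (8 sequences of width 128 with 16 channels, convolved with 32 filters of width 3):

data = te.placeholder((8, 16, 128), name="data")
kernel = te.placeholder((32, 16, 3), name="kernel")
out = conv1d_ncw(data, kernel, strides=1, padding="SAME", dilation=1)
s = te.create_schedule(out.op)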
Example #21
def get_template_op(**kwargs):
  if 'COMPUTE_V1' not in os.environ:
    raise Exception("Environment variable `COMPUTE_V1` is not set")
  program = os.environ['COMPUTE_V1'].strip()
  assert program.startswith('- '), "The computing expression doesn't start with proper prefix: - ..."

  global placeholders, output_saver
  placeholders, output_saver = {}, {"outputs": []}

  program = program[2:].strip()
  if program:
    exec('import tvm; from tvm import topi; ' + program, globals())

    inputs = sorted(list(placeholders.values()), key=lambda x: x.name)
    outputs = sorted(output_saver["outputs"], key=lambda x: x.op.name)

    anno, options = program.find('## @'), []
    if anno >= 0:
      program, options = program[:anno].strip(), program[program.index(':', anno) + 1:].strip().split('|')

    if len(outputs) > 1:
      def to_list(shape):
        return [int(d) for d in shape]
      for i in range(1, len(outputs)):
        assert to_list(outputs[0].shape) == to_list(outputs[i].shape), "Shape sizes for multiple outputs should be equal: %s v.s. %s" % (to_list(outputs[0].shape), to_list(outputs[i].shape))
      outputs = te.compute(outputs[0].shape, lambda *X: [v[X] for v in outputs], name=intermediate_output)
    sch = te.create_schedule([outputs[i].op for i in range(len(outputs))])

    def get_device_props():
      props = tvm.runtime.ndarray.gpu(0)
      with open('%s/device_properties.cfg' % os.environ['ANTARES_DRIVER_PATH'], 'r') as fp:
        mem_bandwith = []
        while True:
          line = fp.readline()
          if not line:
            break
          key, val = line.split(': ')
          if key in ('GlobalMemoryBusWidth', 'MemoryClockRate'):
            mem_bandwith.append(float(val))
        mem_bandwith = 'inf' if not mem_bandwith else np.prod(mem_bandwith) * 2.5e-7
        props.mem_bandwith = float(mem_bandwith)
      return props

    if not hasattr(AntaresGlobal, 'auto_config'):
      AntaresGlobal.auto_config = AutoConfig()

    def _callback(explicit_ops):
      attrs = Mock()
      attrs.device_props = get_device_props()
      attrs.inputs = list(inputs)
      attrs.outputs = list(outputs)
      attrs.explicit_ops = explicit_ops
      attrs.scheduler = sch
      attrs.auto_config = AntaresGlobal.auto_config
      attrs.backend = backend
      attrs.ir = program
      attrs.options = options
      attrs.blend = ''
      attrs.get_extent = lambda axis: int(axis.dom.extent)

      def get_lower():
        return str(tvm.lower(sch, attrs.inputs + attrs.outputs, simple_mode=True)).split('#[metadata]')[0]

      attrs.get_lower = get_lower
      AntaresGlobal.attrs = attrs
      do_native_scheduling(attrs)

    traverse_inline(sch, outputs[0].op, _callback)
    return sch, AntaresGlobal.attrs.inputs + AntaresGlobal.attrs.outputs
Example #22
def group_conv2d_nchw_spatial_pack(cfg,
                                   data,
                                   kernel,
                                   strides,
                                   padding,
                                   dilation,
                                   groups,
                                   out_dtype="float32"):
    """
    Compute group conv2d with NCHW layout, using GSPC algorithm.
    https://arxiv.org/abs/2006.09791
    """
    assert isinstance(dilation, int) or len(dilation) == 2
    if isinstance(dilation, int):
        dilation_h, dilation_w = dilation, dilation
    else:
        dilation_h, dilation_w = dilation

    assert isinstance(padding, int) or len(padding) == 2 or len(padding) == 4
    if isinstance(padding, int):
        pad_top, pad_left, pad_bottom, pad_right = padding, padding, padding, padding
    elif len(padding) == 2:
        hpad, wpad = padding
        pad_top, pad_bottom = hpad, hpad
        pad_left, pad_right = wpad, wpad
    else:
        pad_top, pad_left, pad_bottom, pad_right = padding

    hpad = pad_top + pad_bottom
    wpad = pad_left + pad_right

    assert isinstance(strides, int) or len(strides) == 2
    if isinstance(strides, int):
        stride_h, stride_w = strides, strides
    else:
        stride_h, stride_w = strides

    batch_size, in_channel, in_height, in_width = get_const_tuple(data.shape)
    out_channel, kernel_depth, k_height, k_width = get_const_tuple(
        kernel.shape)

    pad_height = in_height + pad_top + pad_bottom
    pad_width = in_width + pad_left + pad_right

    dilated_kernel_h = (k_height - 1) * dilation_h + 1
    dilated_kernel_w = (k_width - 1) * dilation_w + 1
    out_height = (in_height + pad_top + pad_bottom -
                  dilated_kernel_h) // stride_h + 1
    out_width = (in_width + pad_left + pad_right -
                 dilated_kernel_w) // stride_w + 1

    kernels_per_group = out_channel // groups

    cfg.define_split("tile_ic", in_channel, num_outputs=2)
    cfg.define_split("tile_oc", out_channel, num_outputs=2)
    cfg.define_split("tile_ow",
                     out_width,
                     num_outputs=2,
                     filter=lambda y: y.size[-1] <= 64)
    cfg.define_knob("unroll_kw", [True, False])

    # If no config was set, we can fallback to default config.
    if cfg.is_fallback:
        _get_default_config(
            cfg,
            te.placeholder((batch_size, in_channel, in_height, in_width),
                           dtype=data.dtype),
            te.placeholder(
                (out_channel, in_channel // groups, k_height, k_width),
                dtype=kernel.dtype),
            strides,
            padding,
            groups,
            out_dtype,
        )

    oc_bn = cfg["tile_oc"].size[-1]
    ic_bn = cfg["tile_ic"].size[-1]

    # pack data
    DOPAD = hpad != 0 or wpad != 0
    if DOPAD:
        data_pad = pad(data, (0, 0, pad_top, pad_left),
                       (0, 0, pad_bottom, pad_right),
                       name="data_pad")
    else:
        data_pad = data

    shape = (groups, batch_size, kernel_depth // ic_bn, pad_height, ic_bn,
             pad_width)

    data_vec = te.compute(
        shape,
        lambda g, n, C, h, c, w: data_pad[n, C * ic_bn + c + kernel_depth * g,
                                          h, w],
        name="data_vec",
    )

    # pack kernel
    shape = (
        groups,
        kernels_per_group // oc_bn,
        kernel_depth // ic_bn,
        k_height,
        k_width,
        ic_bn,
        oc_bn,
    )

    kernel_vec = te.compute(
        shape,
        lambda g, out_channel, in_channel, h, w, ci, co: kernel[
            (out_channel * oc_bn + co + g * kernels_per_group
             ), in_channel * ic_bn + ci, h, w],
        name="kernel_vec",
    )

    # convolution
    oshape = (groups, batch_size, kernels_per_group // oc_bn, out_height,
              out_width, oc_bn)
    unpack_shape = (batch_size, out_channel, out_height, out_width)

    ic = te.reduce_axis((0, (kernel_depth)), name="ic")
    kh = te.reduce_axis((0, k_height), name="kh")
    kw = te.reduce_axis((0, k_width), name="kw")

    idxmod = tvm.tir.indexmod
    idxdiv = tvm.tir.indexdiv
    conv = te.compute(
        oshape,
        lambda g, n, oc_chunk, oh, ow, oc_block: te.sum(
            data_vec[g, n,
                     idxdiv(ic, ic_bn), oh * stride_h + kh * dilation_h,
                     idxmod(ic, ic_bn), ow * stride_w + kw * dilation_w, ].
            astype(out_dtype) * kernel_vec[g, oc_chunk,
                                           idxdiv(ic, ic_bn), kh, kw,
                                           idxmod(ic, ic_bn), oc_block].astype(
                                               out_dtype),
            axis=[ic, kh, kw],
        ),
        name="conv",
    )

    unpack = te.compute(
        unpack_shape,
        lambda n, c, h, w: conv[
            idxdiv(c, kernels_per_group), n,
            idxmod(idxdiv(c, oc_bn), (kernels_per_group // oc_bn)), h, w,
            idxmod(idxmod(c, oc_bn), kernels_per_group), ].astype(out_dtype),
        name="output_unpack",
        tag="group_conv2d_nchw",
    )

    return unpack
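To make the packing bookkeeping above concrete, a small illustrative example of the group-related shape arithmetic (numbers are assumptions):

in_channel, out_channel, groups = 64, 128, 4
kernel_depth = in_channel // groups        # 16 input channels per group
kernels_per_group = out_channel // groups  # 32 filters per group
# The kernel layout is [out_channel, in_channel // groups, k_height, k_width],
# which is why kernel_depth above is read from the kernel shape.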
Example #23
def hwnc_tensorcore_cuda(cfg,
                         Input,
                         Filter,
                         stride,
                         padding,
                         dilation,
                         out_dtype="int32"):
    """Compute declaration for tensorcore"""
    assert isinstance(stride, int) or len(stride) == 2
    assert isinstance(dilation, int) or len(dilation) == 2

    if isinstance(stride, int):
        stride_h = stride_w = stride
    else:
        stride_h, stride_w = stride

    if isinstance(dilation, int):
        dilation_h = dilation_w = dilation
    else:
        dilation_h, dilation_w = dilation

    in_dtype = Input.dtype

    if in_dtype in ["int4", "uint4"]:
        wmma_n = wmma_m = 8
        wmma_k = 32
    else:
        wmma_m = 8
        wmma_n = 32
        wmma_k = 16

    pre_computed = len(Filter.shape) == 6
    in_height, in_width, batch, in_channels = get_const_tuple(Input.shape)
    if pre_computed:
        kernel_h, kernel_w, oc_chunk, _, oc_block_factor, _ = get_const_tuple(
            Filter.shape)
        num_filter = oc_block_factor * oc_chunk
    else:
        kernel_h, kernel_w, num_filter, _ = get_const_tuple(Filter.shape)

    if in_dtype in ["int4", "uint4"]:
        assert batch % 8 == 0 and in_channels % 32 == 0 and num_filter % 8 == 0
    else:
        assert batch % 8 == 0 and in_channels % 16 == 0 and num_filter % 32 == 0, (
            "The shape of (batch, in_channels, num_filter) "
            "must be multiple of (8, 16, 32) for int8, "
            "and (8, 32, 8) for int4")

    # compute the output shape
    dilated_kernel_h = (kernel_h - 1) * dilation_h + 1
    dilated_kernel_w = (kernel_w - 1) * dilation_w + 1

    pad_top, pad_left, pad_down, pad_right = get_pad_tuple(
        padding, (dilated_kernel_h, dilated_kernel_w))

    out_channels = num_filter
    out_height = simplify(
        (in_height - dilated_kernel_h + pad_top + pad_down) // stride_h + 1)
    out_width = simplify(
        (in_width - dilated_kernel_w + pad_left + pad_right) // stride_w + 1)

    cfg.add_flop(2 * batch * out_height * out_width * out_channels *
                 in_channels * kernel_h * kernel_w)

    # Input feature map: (H, W, N, IC, n, ic)
    data_shape = (in_height, in_width, batch // wmma_m, in_channels // wmma_k,
                  wmma_m, wmma_k)

    # Kernel: (H, W, OC, IC, oc, ic)
    kernel_shape = (
        kernel_h,
        kernel_w,
        out_channels // wmma_n,
        in_channels // wmma_k,
        wmma_n,
        wmma_k,
    )

    # Reduction axes
    kh = te.reduce_axis((0, kernel_h), name="kh")
    kw = te.reduce_axis((0, kernel_w), name="kw")
    ic = te.reduce_axis((0, in_channels // wmma_k), name="ic")
    ii = te.reduce_axis((0, wmma_k), name="ii")

    if pre_computed:
        packed_kernel = Filter
    else:
        packed_kernel = te.compute(
            kernel_shape,
            lambda kh, kw, o, i, oo, ii: Filter[kh, kw, o * wmma_n + oo, i *
                                                wmma_k + ii],
            name="packed_kernel",
        )

    packed_data = te.compute(
        data_shape, lambda h, w, n, i, nn, ii: Input[h, w, n * wmma_m + nn, i *
                                                     wmma_k + ii])

    pad_before = [pad_top, pad_left, 0, 0, 0, 0]
    pad_after = [pad_down, pad_right, 0, 0, 0, 0]
    pad_data = pad(packed_data, pad_before, pad_after, name="pad_data")

    Conv = te.compute(
        (out_height, out_width, batch // wmma_m, out_channels // wmma_n,
         wmma_m, wmma_n),
        lambda h, w, n, o, nn, oo: te.sum(
            (pad_data[h * stride_h + kh, w * stride_w + kw, n, ic, nn, ii].
             astype("int32") * packed_kernel[kh, kw, o, ic, oo, ii].astype(
                 "int32")),
            axis=[ic, kh, kw, ii],
        ),
        name="Conv",
        tag="conv2d_HWNCnc_tensorcore",
    )
    return Conv
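Illustrative placeholder shapes that satisfy the int8 constraints asserted above (batch % 8 == 0, in_channels % 16 == 0, num_filter % 32 == 0); the cfg argument would normally be supplied by an AutoTVM task rather than constructed by hand:

# HWNC input and (kernel_h, kernel_w, num_filter, in_channels) kernel,
# int8 case (shapes are assumptions).
Input = te.placeholder((56, 56, 8, 64), dtype="int8", name="Input")
Filter = te.placeholder((3, 3, 32, 64), dtype="int8", name="Filter")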
Example #24
def conv2d_winograd_nhwc_auto_scheduler_test(N,
                                             H,
                                             W,
                                             CI,
                                             CO,
                                             kernel_size=3,
                                             stride=1,
                                             padding=0,
                                             dilation=1):
    tile_size = 4
    inputs = te.placeholder((N, H, W, CI), name="inputs")
    N, H, W, CI = get_const_tuple(inputs.shape)
    if isinstance(dilation, int):
        dilation_h = dilation_w = dilation
    else:
        dilation_h, dilation_w = dilation

    assert (dilation_h, dilation_w) == (1, 1), "Does not support dilation"

    KH = KW = kernel_size
    HPAD, WPAD, _, _ = topi.nn.get_pad_tuple(padding, (KH, KW))
    HSTR, WSTR = (stride, stride) if isinstance(stride, int) else stride
    assert HSTR == 1 and WSTR == 1 and KH == KW

    data_pad = topi.nn.pad(inputs, (0, HPAD, WPAD, 0), (0, HPAD, WPAD, 0),
                           name="data_pad")

    r = KW
    m = tile_size
    alpha = m + r - 1
    A, B, G = winograd_transform_matrices(m, r, "float32")

    H = (H + 2 * HPAD - KH) // HSTR + 1
    W = (W + 2 * WPAD - KW) // WSTR + 1
    nH, nW = (H + m - 1) // m, (W + m - 1) // m
    P = N * nH * nW
    r_kh = te.reduce_axis((0, KH), name="r_kh")
    r_kw = te.reduce_axis((0, KW), name="r_kw")
    kshape = (alpha, alpha, CI, CO)
    kernel_pack = te.placeholder(kshape, inputs.dtype, name="weight")

    idxdiv = te.indexdiv
    idxmod = te.indexmod
    # pack input tile
    input_tile = te.compute(
        (alpha, alpha, P, CI),
        lambda eps, nu, p, ci: data_pad[idxdiv(p, (nH * nW))][idxmod(
            idxdiv(p, nW), nH) * m + eps][idxmod(p, nW) * m + nu][ci],
        name="input_tile",
    )

    # transform data
    r_a = te.reduce_axis((0, alpha), "r_a")
    r_b = te.reduce_axis((0, alpha), "r_b")
    data_pack = te.compute(
        (alpha, alpha, P, CI),
        lambda eps, nu, p, ci: te.sum(input_tile[r_a][r_b][p][ci] * B[r_a][eps]
                                      * B[r_b][nu],
                                      axis=[r_a, r_b]),
        name="data_pack",
        attrs={
            "auto_scheduler_simplify_const_tensor_indices":
            ["eps", "nu", "r_a", "r_b"]
        },
    )

    # do batch gemm
    ci = te.reduce_axis((0, CI), name="ci")
    bgemm = te.compute(
        (alpha, alpha, P, CO),
        lambda eps, nu, p, co: te.sum(data_pack[eps][nu][p][ci] * kernel_pack[
            eps][nu][ci][co],
                                      axis=[ci]),
        name="bgemm",
    )

    # inverse transform
    r_a = te.reduce_axis((0, alpha), "r_a")
    r_b = te.reduce_axis((0, alpha), "r_b")
    inverse = te.compute(
        (m, m, P, CO),
        lambda vh, vw, p, co: te.sum(
            bgemm[r_a][r_b][p][co] * A[r_a][vh] * A[r_b][vw], axis=[r_a, r_b]),
        name="inverse",
        attrs={
            "auto_scheduler_simplify_const_tensor_indices":
            ["vh", "vw", "r_a", "r_b"]
        },
    )

    # output
    output = te.compute(
        (N, H, W, CO),
        lambda n, h, w, co: inverse[idxmod(h, m),
                                    idxmod(w, m), n * nH * nW + idxdiv(h, m) *
                                    nW + idxdiv(w, m), co],
        name="conv2d_winograd",
    )

    return [inputs, kernel_pack, output]
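A usage sketch (illustrative shapes) that builds the DAG this test function returns:

inputs, kernel_pack, output = conv2d_winograd_nhwc_auto_scheduler_test(
    N=1, H=56, W=56, CI=64, CO=64, kernel_size=3, stride=1, padding=1)
s = te.create_schedule(output.op)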
Example #25
def bitserial_conv2d_nhwc(
    cfg,
    data,
    kernel,
    stride,
    padding,
    activation_bits,
    weight_bits,
    pack_dtype,
    out_dtype,
    unipolar,
):
    """ Compute convolution with pack on spatial axes. """
    assert data.shape[0].value == 1, "spatial pack convolution only supports batch size=1"
    assert pack_dtype == "uint8", "only support packing into uint8 bits"
    assert out_dtype == "int16", "only support output type of int16"

    N, H, W, CI = get_const_tuple(data.shape)
    if len(kernel.shape) == 4:
        KH, KW, _, CO = get_const_tuple(kernel.shape)
        CI_packed = CI // 8
    else:
        KH, KW, KB, CI_packed, CO = get_const_tuple(kernel.shape)

    if isinstance(padding, int) or (isinstance(padding, (tuple, list))
                                    and len(padding) == 2):
        TPAD, LPAD, DPAD, RPAD = get_pad_tuple(padding, kernel)
    else:
        TPAD, LPAD, DPAD, RPAD = padding

    if isinstance(stride, (tuple, list)):
        HSTR, WSTR = stride
    else:
        HSTR, WSTR = stride, stride
    HCAT, WCAT = KH - 1, KW - 1

    PAD_H = H + (TPAD + DPAD)
    PAD_W = W + (LPAD + RPAD)
    OH = (PAD_H - KH) // HSTR + 1
    OW = (PAD_W - KW) // WSTR + 1
    oshape = (1, OH, OW, CO)

    idxd = tvm.tir.indexdiv
    idxm = tvm.tir.indexmod

    # Pad input channels of weights and data when it is not a multiple of 8
    if CI_packed % 8 != 0:
        CI_PAD = CI_packed % 8
        CI_packed += CI_PAD
    else:
        CI_PAD = 0

    # ==================== define configuration space ====================
    n, oh, ow, co = cfg.axis(N), cfg.axis(OH), cfg.axis(OW), cfg.axis(CO)
    ci, kh, kw = cfg.reduce_axis(CI_packed), cfg.reduce_axis(
        KH), cfg.reduce_axis(KW)
    ib, kb = cfg.reduce_axis(activation_bits), cfg.reduce_axis(weight_bits)

    co, vc = cfg.define_split("tile_co",
                              co,
                              num_outputs=2,
                              filter=lambda x: x.size[-1] == 8)
    oh, vh = cfg.define_split("tile_oh",
                              oh,
                              num_outputs=2,
                              filter=lambda x: x.size[-1] >= 2)
    ow, vw = cfg.define_split("tile_ow",
                              ow,
                              num_outputs=2,
                              filter=lambda x: x.size[-1] >= 2)
    ci_o, ci_i = cfg.define_split(
        "tile_ci",
        ci,
        num_outputs=2,
        filter=lambda x: x.size[-1] == 8 or x.size[-1] == 16)
    re_axes = cfg.define_reorder(
        "reorder_0",
        [n, oh, ow, co, vh, vw, kh, kw, ci_o, kb, ib, vc, ci_i],
        policy="candidate",
        candidate=[
            [n, oh, ow, co, vh, vw, kh, kw, ci_o, kb, ib, vc, ci_i],
            [n, oh, ow, co, vh, vw, kw, kh, ci_o, kb, ib, vc, ci_i],
        ],
    )
    # binary ops
    cfg.add_flop(2 * N * OH * OW * CO * CI * KH * KW *
                 binary_op_multiplier(pack_dtype))
    # ====================

    VC = cfg["tile_co"].size[-1]
    VH = cfg["tile_oh"].size[-1]
    VW = cfg["tile_ow"].size[-1]

    data_q = bitpack(data,
                     activation_bits,
                     pack_axis=3,
                     bit_axis=3,
                     pack_type="uint8")

    kernel_vec = _kernel_vec_spatial_pack_nhwc(kernel, weight_bits, VC,
                                               len(kernel.shape) == 4)
    idxm = tvm.tir.indexmod
    if idxm(kernel_vec.shape[-1], 8) != 0 and CI_PAD != 0:
        kernel_vec = pad(kernel_vec, [0, 0, 0, 0, 0, 0],
                         [0, 0, 0, 0, 0, CI_PAD])

    N, H, W, IB, CI = data_q.shape
    OCO, KH, KW, KB, VC, CI = kernel_vec.shape

    dvshape = (
        N,
        PAD_H // (VH * HSTR),
        PAD_W // (VW * WSTR),
        VH * HSTR + HCAT,
        VW * WSTR + WCAT,
        IB,
        CI,
    )
    ovshape = (1, OH // VH, OW // VW, CO // VC, VH, VW, VC)

    if TPAD != 0 and RPAD != 0:
        data_pad = pad(data_q, (0, TPAD, LPAD, 0, 0),
                       (0, DPAD, RPAD, 0, CI_PAD),
                       name="data_pad")
    elif CI_PAD != 0:
        data_pad = pad(data_q, (0, 0, 0, 0, 0), (0, 0, 0, 0, CI_PAD),
                       name="data_pad")
    else:
        data_pad = data_q

    data_vec = te.compute(
        dvshape,
        lambda n, h, w, vh, vw, b, ci: data_pad[n][h * VH * HSTR + vh][
            w * VW * WSTR + vw][b][ci],
        name="data_vec",
    )
    ci = te.reduce_axis((0, CI), name="ci")
    dh = te.reduce_axis((0, KH), name="dh")
    dw = te.reduce_axis((0, KW), name="dw")
    ib = te.reduce_axis((0, IB), name="ib")
    kb = te.reduce_axis((0, KB), name="kb")

    def _bipolar_conv(n, h, w, co, vh, vw, vc):
        return te.sum(
            (tvm.tir.popcount(kernel_vec[co, dh, dw, kb, vc,
                                         ci].astype("uint16")
                              & data_vec[n, h, w, vh * HSTR + dh, vw * WSTR +
                                         dw, ib, ci].astype("uint16")) <<
             (kb + ib).astype("uint16")),
            axis=[dh, dw, kb, ib, ci],
        )

    def _unipolar_conv(n, h, w, co, vh, vw, vc):
        return te.sum(
            ((tvm.tir.popcount(kernel_vec[co, dh, dw, kb, vc,
                                          ci].astype("int16")
                               & data_vec[n, h, w, vh * HSTR + dh, vw * WSTR +
                                          dw, ib, ci].astype("int16")) - tvm.
              tir.popcount(~kernel_vec[co, dh, dw, kb, vc, ci].astype("int16")
                           & data_vec[n, h, w, vh * HSTR + dh, vw * WSTR + dw,
                                      ib, ci]).astype("int16")) <<
             (kb + ib).astype("int16")),
            axis=[dh, dw, kb, ib, ci],
        )

    if unipolar:
        conv_vec = te.compute(ovshape,
                              _unipolar_conv,
                              name="conv_vec",
                              tag="unipolar")
    else:
        conv_vec = te.compute(ovshape,
                              _bipolar_conv,
                              name="conv_vec",
                              tag="bipolar")

    conv = te.compute(
        oshape,
        lambda n, h, w, co: conv_vec[n,
                                     idxd(h, VH),
                                     idxd(w, VW),
                                     idxd(co, VC),
                                     idxm(h, VH),
                                     idxm(w, VW),
                                     idxm(co, VC)].astype(out_dtype),
        name="conv",
        tag="spatial_bitserial_conv_nhwc",
    )

    return conv
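The bipolar and unipolar reductions above boil down to popcounts over AND-ed bit-planes; a plain-Python sketch of a single bit-plane dot product (values are illustrative):

w, x = 0b10110010, 0b11010110                          # one weight byte, one data byte
bipolar = bin(w & x).count("1")                        # matching set bits
unipolar = bipolar - bin((~w & 0xFF) & x).count("1")   # weight zeros act as -1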
Example #26
def test_rpc_remote_module():
    if not tvm.runtime.enabled("rpc"):
        return
    # graph
    n = tvm.runtime.convert(102)
    A = te.placeholder((n, ), name="A")
    B = te.compute(A.shape, lambda *i: A(*i) + 1.0, name="B")
    s = te.create_schedule(B.op)

    server0 = rpc.Server("localhost", key="x0")
    server1 = rpc.Server("localhost", key="x1")

    client = rpc.connect(
        server0.host,
        server0.port,
        key="x0",
        session_constructor_args=[
            "rpc.Connect", server1.host, server1.port, "x1"
        ],
    )

    def check_remote(remote):
        temp = util.tempdir()
        ctx = remote.cpu(0)
        f = tvm.build(s, [A, B], "llvm", name="myadd")
        path_dso = temp.relpath("dev_lib.so")
        f.export_library(path_dso)
        remote.upload(path_dso)
        f1 = remote.load_module("dev_lib.so")
        a = tvm.nd.array(np.random.uniform(size=102).astype(A.dtype), ctx)
        b = tvm.nd.array(np.zeros(102, dtype=A.dtype), ctx)
        time_f = f1.time_evaluator(f1.entry_name, remote.cpu(0), number=10)
        cost = time_f(a, b).mean
        print("%g secs/op" % cost)
        np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)

        # Download the file from the remote
        path_tar = temp.relpath("dev_lib.tar")
        f.export_library(path_tar)
        remote.upload(path_tar)
        local_download_path = temp.relpath("dev_lib.download.so")
        with open(local_download_path, "wb") as fo:
            fo.write(remote.download_linked_module("dev_lib.tar"))
        fupdated = tvm.runtime.load_module(local_download_path)
        a = tvm.nd.array(
            np.random.uniform(size=102).astype(A.dtype), tvm.cpu(0))
        b = tvm.nd.array(np.zeros(102, dtype=A.dtype), tvm.cpu(0))
        fupdated(a, b)
        np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)

    def check_minrpc():
        if tvm.get_global_func("rpc.CreatePipeClient",
                               allow_missing=True) is None:
            return
        # export to minrpc
        temp = util.tempdir()
        f = tvm.build(s, [A, B], "llvm --system-lib", name="myadd")
        path_minrpc = temp.relpath("dev_lib.minrpc")
        f.export_library(path_minrpc, rpc.with_minrpc(cc.create_executable))

        with pytest.raises(RuntimeError):
            rpc.PopenSession("filenotexist")

        # start the minrpc session.
        remote = tvm.rpc.PopenSession(path_minrpc)
        ctx = remote.cpu(0)
        f1 = remote.system_lib()

        a = tvm.nd.array(np.random.uniform(size=102).astype(A.dtype), ctx)
        b = tvm.nd.array(np.zeros(102, dtype=A.dtype), ctx)
        time_f = f1.time_evaluator("myadd", remote.cpu(0), number=1)
        cost = time_f(a, b).mean
        np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)

        # change to not executable
        os.chmod(path_minrpc, stat.S_IRUSR)
        with pytest.raises(RuntimeError):
            rpc.PopenSession(path_minrpc)

    def check_remote_link_cl(remote):
        """Test function to run remote code such as cl

        This is not enabled because there is a forking issue in the TVM runtime
        when the server launches after the OpenCL runtime initializes. We leave
        it as an example of how to do RPC when we want to do linking on the
        remote.
        """
        if not tvm.testing.device_enabled("opencl"):
            print("Skip because opencl is not enabled")
            return
        temp = util.tempdir()
        ctx = remote.cl(0)
        s = te.create_schedule(B.op)
        xo, xi = s[B].split(B.op.axis[0], factor=32)
        s[B].bind(xo, te.thread_axis("blockIdx.x"))
        s[B].bind(xi, te.thread_axis("threadIdx.x"))
        f = tvm.build(s, [A, B], "opencl", target_host="llvm", name="myadd")
        # Option 1: save modules separately and rely on remote compiler
        path_o = temp.relpath("myadd.o")
        path_cl = temp.relpath("myadd.cl")
        path_json = temp.relpath("myadd.tvm_meta.json")
        f.save(path_o)
        f.imported_modules[0].save(path_cl)
        remote.upload(path_o)
        remote.upload(path_cl)
        # upload meta data
        remote.upload(path_json)
        fhost = remote.load_module("myadd.o")
        fdev = remote.load_module("myadd.cl")
        fhost.import_module(fdev)
        a = tvm.nd.array(np.random.uniform(size=102).astype(A.dtype), ctx)
        b = tvm.nd.array(np.zeros(102, dtype=A.dtype), ctx)
        fhost(a, b)
        np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)
        # Option 2: export library as a tar ball then handled by remote compiler
        path_tar = temp.relpath("myadd.tar")
        f.export_library(path_tar)
        remote.upload(path_tar)
        fhost = remote.load_module("myadd.tar")
        a = tvm.nd.array(np.random.uniform(size=102).astype(A.dtype), ctx)
        b = tvm.nd.array(np.zeros(102, dtype=A.dtype), ctx)
        fhost(a, b)
        np.testing.assert_equal(b.asnumpy(), a.asnumpy() + 1)

    check_remote(rpc.LocalSession())
    check_remote(client)
    check_minrpc()
Example #27
def non_max_suppression(
    data,
    valid_count,
    indices,
    max_output_size=-1,
    iou_threshold=0.5,
    force_suppress=False,
    top_k=-1,
    coord_start=2,
    score_index=1,
    id_index=0,
    return_indices=True,
    invalid_to_bottom=False,
):
    """Non-maximum suppression operator for object detection.

    Parameters
    ----------
    data : tvm.te.Tensor
        3-D tensor with shape [batch_size, num_anchors, 6] or [batch_size, num_anchors, 5].

    valid_count : tvm.te.Tensor
        1-D tensor for valid number of boxes.

    indices : tvm.te.Tensor
        2-D tensor with shape [batch_size, num_anchors].

    max_output_size : optional, int or tvm.te.Tensor
        Max number of output valid boxes for each instance.
        Return all valid boxes if the value of max_output_size is less than 0.

    iou_threshold : optional, float
        Non-maximum suppression threshold.

    force_suppress : optional, boolean
        Whether to suppress all detections regardless of class_id.

    top_k : optional, int
        Keep maximum top k detections before nms, -1 for no limit.

    coord_start : required, int
        Start index of the consecutive 4 coordinates.

    score_index: optional, int
        Index of the scores/confidence of boxes.

    id_index : optional, int
        index of the class categories, -1 to disable.

    return_indices : optional, boolean
        Whether to return box indices in input data.

    invalid_to_bottom : optional, boolean
        Whether to move all valid bounding boxes to the top.

    Returns
    -------
    out : tvm.te.Tensor or tuple of tvm.te.Tensor
        3-D tensor with shape [batch_size, num_anchors, 6]
        or [batch_size, num_anchors, 5]. If return_indices is True, out is a
        tuple of two 2-D tensors with shapes [batch_size, num_anchors] and
        [batch_size, num_valid_anchors] respectively.

    Example
    --------
    .. code-block:: python

        # An example to use non_max_suppression
        dshape = (1, 5, 6)
        data = te.placeholder(dshape, name="data")
        valid_count = te.placeholder((dshape[0],), dtype="int32", name="valid_count")
        iou_threshold = 0.7
        force_suppress = True
        top_k = -1
        out = non_max_suppression(data, valid_count, indices, iou_threshold=iou_threshold,
                                  force_suppress=force_suppress, top_k=top_k)
        np_data = np.random.uniform(dshape)
        np_valid_count = np.array([4])
        s = topi.generic.schedule_nms(out)
        f = tvm.build(s, [data, valid_count, out], "llvm")
        ctx = tvm.cpu()
        tvm_data = tvm.nd.array(np_data, ctx)
        tvm_valid_count = tvm.nd.array(np_valid_count, ctx)
        tvm_out = tvm.nd.array(np.zeros(dshape, dtype=data.dtype), ctx)
        f(tvm_data, tvm_valid_count, tvm_out)
    """
    batch_size = data.shape[0]
    num_anchors = data.shape[1]
    if isinstance(max_output_size, int):
        max_output_size = tvm.tir.const(max_output_size, dtype="int32")
    score_axis = score_index
    score_shape = (batch_size, num_anchors)
    score_tensor = te.compute(score_shape, lambda i, j: data[i, j, score_axis])
    sort_tensor = argsort(score_tensor,
                          valid_count=valid_count,
                          axis=1,
                          is_ascend=False)

    out, box_indices = hybrid_nms(
        data,
        sort_tensor,
        valid_count,
        indices,
        batch_size,
        num_anchors,
        max_output_size,
        tvm.tir.const(iou_threshold, dtype=data.dtype),
        tvm.tir.const(force_suppress, dtype="bool"),
        tvm.tir.const(top_k, dtype="int32"),
        tvm.tir.const(coord_start, dtype="int32"),
        tvm.tir.const(score_index, dtype="int32"),
        tvm.tir.const(id_index, dtype="int32"),
        tvm.tir.const(return_indices, dtype="bool"),
        zero=tvm.tir.const(0, dtype=data.dtype),
        one=tvm.tir.const(1, dtype=data.dtype),
    )
    if return_indices:
        return hybrid_rearrange_indices_out(
            box_indices,
            one=tvm.tir.const(1, dtype="int32"),
            batch_size=batch_size,
            num_anchors=num_anchors,
        )

    if invalid_to_bottom:
        out = hybrid_rearrange_box_out(
            out,
            one=tvm.tir.const(1, dtype=data.dtype),
            batch_size=batch_size,
            num_anchors=num_anchors,
        )
    return out
Example #28
def test_cache_read_write(android_serial_number, tvm_tracker_host,
                          tvm_tracker_port, adb_server_socket):
    size = 128
    outer_shape = (size, )
    factor = 16
    inner_shape = (factor, )
    dtype = "int8"

    x = te.placeholder(shape=outer_shape, dtype=dtype, name="x")
    y = te.placeholder(shape=outer_shape, dtype=dtype, name="y")
    z = te.compute(outer_shape, lambda i: x[i] + y[i], name="z")
    s = te.create_schedule(z.op)

    x_global = s.cache_read(x, "global.vtcm", [z])
    y_global = s.cache_read(y, "global.vtcm", [z])
    z_global = s.cache_write(z, "global.vtcm")

    zouter, zinner = s[z_global].split(z_global.op.axis[0], factor=factor)

    s[x_global].compute_at(s[z_global], zouter)
    s[y_global].compute_at(s[z_global], zouter)

    mem_copy_read = intrin_mem_copy(inner_shape, dtype, "global.vtcm",
                                    "global")

    (cache_read_x, ) = s[x_global].op.axis
    s[x_global].tensorize(cache_read_x, mem_copy_read)

    (cache_read_y, ) = s[y_global].op.axis
    s[y_global].tensorize(cache_read_y, mem_copy_read)

    mem_copy_write = intrin_mem_copy(outer_shape, dtype, "global",
                                     "global.vtcm")

    (cache_write_z, ) = s[z].op.axis
    s[z].tensorize(cache_write_z, mem_copy_write)

    print(tvm.lower(s, [x, y, z]))

    target_hexagon = tvm.target.hexagon("v68", link_params=True)
    func = tvm.build(s, [x, y, z],
                     tvm.target.Target(target_hexagon, host=target_hexagon),
                     name="dmacpy")
    temp = utils.tempdir()
    dso_binary = "test_binary.so"
    dso_binary_path = temp.relpath(dso_binary)
    func.save(dso_binary_path)

    if not android_serial_number:
        pytest.skip(
            "Skip hardware test since ANDROID_SERIAL_NUMBER is not set.")

    rpc_info = {
        "rpc_tracker_host": tvm_tracker_host,
        "rpc_tracker_port": tvm_tracker_port,
        "rpc_server_port": 7070,
        "adb_server_socket": adb_server_socket,
    }
    launcher = HexagonLauncher(serial_number=android_serial_number,
                               rpc_info=rpc_info)
    launcher.upload(dso_binary_path, dso_binary)
    launcher.start_server()

    with launcher.start_session() as sess:
        mod = launcher.load_module(dso_binary, sess)
        xt = tvm.nd.array(np.random.randint(-128,
                                            high=127,
                                            size=size,
                                            dtype=x.dtype),
                          device=sess.device)
        yt = tvm.nd.array(np.random.randint(-128,
                                            high=127,
                                            size=size,
                                            dtype=x.dtype),
                          device=sess.device)
        zt = tvm.nd.array(np.random.randint(-128,
                                            high=127,
                                            size=size,
                                            dtype=x.dtype),
                          device=sess.device)
        mod["dmacpy"](xt, yt, zt)
    launcher.stop_server()

    ref = xt.numpy() + yt.numpy()
    np.testing.assert_equal(zt.numpy(), ref)
Example #29
def conv2d_transpose_nchw(cfg, data, kernel, stride, padding, out_dtype,
                          output_padding):
    """Transposed 2D convolution nchw forward operator.

    Parameters
    ----------
    cfg: ConfigEntity
        The config for this template
    data : tvm.te.Tensor
        4-D with shape [batch, in_channel, in_height, in_width]
    kernel : tvm.te.Tensor
        4-D with shape [in_channel, num_filter, filter_height, filter_width]
    stride : tuple of two ints
        The spatial stride along height and width
    padding : int or str
        Padding size, or ['VALID', 'SAME']
    out_dtype: str
        The output type. This is used in mixed precision
    output_padding : tuple of two ints
        Used to disambiguate output shape.

    Returns
    -------
    Output : tvm.te.Tensor
        4-D with shape [batch, out_channel, out_height, out_width]
    """
    batch, inp_channels, inp_height, inp_width = get_const_tuple(data.shape)
    _, out_channels, kernel_height, kernel_width = get_const_tuple(
        kernel.shape)
    stride_height, stride_width = stride
    outpad_height, outpad_width = output_padding
    assert outpad_height < stride_height and outpad_width < stride_width
    cfg.stride = stride
    pad_top, pad_left, pad_bottom, pad_right = nn.get_pad_tuple(
        padding, (kernel_height, kernel_width))

    out_width = (inp_width - 1) * stride_width + \
        kernel_width - pad_left - pad_right + outpad_width
    pad_left = kernel_width - 1 - pad_left
    pad_right = kernel_width - 1 - pad_right
    dilated_width = stride_width * (inp_width - 1) + 1

    out_height = (inp_height - 1) * stride_height + \
        kernel_height - pad_top - pad_bottom + outpad_height
    pad_top = kernel_height - 1 - pad_top
    pad_bottom = kernel_height - 1 - pad_bottom
    dilated_height = stride_height * (inp_height - 1) + 1

    # compute pad
    data = te.compute(
        (batch, inp_channels, pad_top + dilated_height + pad_bottom,
         pad_left + dilated_width + pad_right),
        lambda n, c, y, x: tvm.tir.if_then_else(
            tvm.tir.all(x >= pad_left, x < pad_left + dilated_width,
                        tvm.tir.indexmod(x - pad_left, stride_width).equal(0),
                        y >= pad_top, y < pad_top + dilated_height,
                        tvm.tir.indexmod(y - pad_top, stride_height).equal(0)),
            data[n, c,
                 tvm.tir.indexdiv(y - pad_top, stride_height),
                 tvm.tir.indexdiv(x - pad_left, stride_width)],
            tvm.tir.const(0., "float32")),
        name='data_pad')

    # compute transposed conv
    dc = te.reduce_axis((0, inp_channels), name='dc')
    dh = te.reduce_axis((0, kernel_height), name='dh')
    dw = te.reduce_axis((0, kernel_width), name='dw')
    data_out = te.compute(
        (batch, out_channels, out_height, out_width),
        lambda b, c, h, w: te.sum(data[b, dc, h + dh, w + dw].astype(
            out_dtype) * kernel[dc, c, kernel_height - 1 - dh, kernel_width - 1
                                - dw].astype(out_dtype),
                                  axis=[dc, dh, dw]),
        tag="conv2d_transpose_nchw")

    return data_out
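
######################################################################
# A minimal usage sketch for the operator above. Since the function takes
# an AutoTVM-style :code:`cfg` but only assigns :code:`cfg.stride`, a
# :code:`FallbackConfigEntity` can stand in for the tuner-provided config.
# The shapes, names, and imports below are illustrative assumptions, not
# part of the original snippet.

from tvm import te
from tvm.autotvm.task.space import FallbackConfigEntity
from tvm.topi.utils import get_const_tuple

data = te.placeholder((1, 8, 16, 16), name="data")     # [batch, in_channel, in_height, in_width]
kernel = te.placeholder((8, 16, 3, 3), name="kernel")  # [in_channel, num_filter, filter_height, filter_width]

cfg = FallbackConfigEntity()
out = conv2d_transpose_nchw(cfg, data, kernel, stride=(2, 2), padding=1,
                            out_dtype="float32", output_padding=(1, 1))

# out_height = (16 - 1) * 2 + 3 - 1 - 1 + 1 = 32 (and likewise for the width),
# so the declared output is a (1, 16, 32, 32) tensor.
print(get_const_tuple(out.shape))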


def test_rpc_module():
    # graph
    n = tvm.runtime.convert(1024)
    A = te.placeholder((n,), name="A")
    B = te.compute(A.shape, lambda *i: A(*i) + 1.0, name="B")
    a_np = np.random.uniform(size=1024).astype(A.dtype)
    temp = utils.tempdir()

    # Establish remote connection with target hardware
    tracker = rpc.connect_tracker(tracker_host, tracker_port)
    remote = tracker.request(key, priority=0, session_timeout=60)

    # Compile the graph for the CPU target
    s = te.create_schedule(B.op)
    xo, xi = s[B].split(B.op.axis[0], factor=64)
    s[B].parallel(xi)
    s[B].pragma(xo, "parallel_launch_point")
    s[B].pragma(xi, "parallel_barrier_when_finish")
    f = tvm.build(s, [A, B], target, name="myadd_cpu")
    path_dso_cpu = temp.relpath("cpu_lib.so")
    f.export_library(path_dso_cpu, ndk.create_shared)

    # Execute the portable graph on the CPU target
    print("Run CPU test ...")
    dev = remote.cpu(0)
    remote.upload(path_dso_cpu)
    f2 = remote.load_module("cpu_lib.so")
    a = tvm.nd.array(a_np, dev)
    b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), dev)
    time_f = f2.time_evaluator(f2.entry_name, dev, number=10)
    cost = time_f(a, b).mean
    print("%g secs/op\n" % cost)
    np.testing.assert_equal(b.numpy(), a.numpy() + 1)

    # Compile the graph for the OpenCL target
    if test_opencl:
        s = te.create_schedule(B.op)
        xo, xi = s[B].split(B.op.axis[0], factor=64)
        s[B].bind(xi, te.thread_axis("threadIdx.x"))
        s[B].bind(xo, te.thread_axis("blockIdx.x"))
        # Build the dynamic lib for the OpenCL device, with the CPU target as host.
        # To run on the CPU only, build with the host target alone instead of "opencl".
        f = tvm.build(s, [A, B], "opencl", target_host=target, name="myadd")
        path_dso_cl = temp.relpath("dev_lib_cl.so")
        f.export_library(path_dso_cl, ndk.create_shared)

        print("Run GPU(OpenCL Flavor) test ...")
        dev = remote.cl(0)
        remote.upload(path_dso_cl)
        f1 = remote.load_module("dev_lib_cl.so")
        a = tvm.nd.array(a_np, dev)
        b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), dev)
        time_f = f1.time_evaluator(f1.entry_name, dev, number=10)
        cost = time_f(a, b).mean
        print("%g secs/op\n" % cost)
        np.testing.assert_equal(b.numpy(), a.numpy() + 1)

    # Compile the graph for the Vulkan target
    if test_vulkan:
        s = te.create_schedule(B.op)
        xo, xi = s[B].split(B.op.axis[0], factor=64)
        s[B].bind(xi, te.thread_axis("threadIdx.x"))
        s[B].bind(xo, te.thread_axis("blockIdx.x"))
        # Build the dynamic lib for the Vulkan device, with the CPU target as host.
        # To run on the CPU only, build with the host target alone instead of "vulkan".
        f = tvm.build(s, [A, B], "vulkan", target_host=target, name="myadd")
        path_dso_vulkan = temp.relpath("dev_lib_vulkan.so")
        f.export_library(path_dso_vulkan, ndk.create_shared)

        print("Run GPU(Vulkan Flavor) test ...")
        dev = remote.vulkan(0)
        remote.upload(path_dso_vulkan)
        f1 = remote.load_module("dev_lib_vulkan.so")
        a = tvm.nd.array(a_np, dev)
        b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), dev)
        time_f = f1.time_evaluator(f1.entry_name, dev, number=10)
        cost = time_f(a, b).mean
        print("%g secs/op\n" % cost)
        np.testing.assert_equal(b.numpy(), a.numpy() + 1)
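
######################################################################
# The test above relies on module-level settings (:code:`tracker_host`,
# :code:`tracker_port`, :code:`key`, :code:`target`, :code:`test_opencl`,
# :code:`test_vulkan`) and imports (:code:`rpc`, :code:`ndk`, :code:`utils`)
# defined outside this snippet. The configuration below is a sketch of one
# plausible setup, modeled on TVM's Android RPC test; adjust the tracker
# address, device key, and target triple for your own device.

import os
import numpy as np
import tvm
from tvm import te, rpc
from tvm.contrib import ndk, utils

# Address and port of the RPC tracker, assumed to come from the environment.
tracker_host = os.environ.get("TVM_TRACKER_HOST", "127.0.0.1")
tracker_port = int(os.environ.get("TVM_TRACKER_PORT", "9190"))
# Device key the Android RPC server registered with the tracker.
key = "android"

# Host-side target triple for the remote CPU;
# run `adb shell cat /proc/cpuinfo` to confirm the architecture.
arch = "arm64"
target = "llvm -mtriple=%s-linux-android" % arch

# Whether to also run the OpenCL and Vulkan flavors of the test.
test_opencl = False
test_vulkan = False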