Code Example #1
def conv_normal(print_ir):
    print("----- CONV2D End-to-End Test-------")
    with vta.build_config():
        s = vta.top.schedule_packed_conv2d([res])
        if print_ir:
            print(vta.lower(s, [data, kernel, bias, res], simple_mode=True))
    cost = verify(s, True)
    gops = (num_ops / cost.mean) / float(10 ** 9)
    print("\tTime cost = %g sec/op, %g GOPS" % (cost.mean, gops))
Code Example #2
def conv_normal(print_ir):
    print("----- CONV2D End-to-End Test-------")
    with vta.build_config():
        s = vta.top.schedule_packed_conv2d([res])
        if print_ir:
            print(
                vta.lower(s, [data, kernel, bias, res],
                          simple_mode=True))
    cost = verify(s, True)
    gops = (num_ops / cost.mean) / float(10**9)
    print("\tTime cost = %g sec/op, %g GOPS" % (cost.mean, gops))
Code Example #3
# Here we choose to move the outermost reduction axis all the way out.
# This dictates that we first iterate over input channels, then batch
# dimensions, and finally output channels.
# Lastly, we apply the tensorization scheduling primitive :code:`tensorize`
# along the outer axis of the inner-most matrix-matrix multiplication tensor
# block.
# We print the finalized schedule that is ready for code-generation
# by the VTA runtime JIT compiler.

s[C_buf].reorder(
    ko, s[C_buf].op.axis[0], s[C_buf].op.axis[1], s[C_buf].op.axis[2], s[C_buf].op.axis[3], ki
)
s[C_buf].tensorize(s[C_buf].op.axis[2], env.gemm)

# Let's take a look at the finalized schedule
print(vta.lower(s, [A, B, C], simple_mode=True))

######################################################################
# This concludes the scheduling portion of this tutorial.

######################################################################
# TVM Compilation
# ---------------
# After we have finished specifying the schedule, we can compile it
# into a TVM function.

# Build GEMM VTA kernel
my_gemm = vta.build(s, [A, B, C], "ext_dev", env.target_host, name="my_gemm")

# Write the compiled module into an object file.
temp = util.tempdir()
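The excerpt stops right after the temporary directory is created. In the tutorial this snippet comes from, the compiled module is then written to disk and shipped to the device over RPC; a minimal sketch of that continuation, assuming the `remote` RPC session opened earlier in the tutorial is still available:

# Continuation sketch (assumes `my_gemm`, `temp`, and an open RPC session `remote`).
my_gemm.save(temp.relpath("gemm.o"))    # write the compiled kernel to an object file
remote.upload(temp.relpath("gemm.o"))   # send it to the device
f = remote.load_module("gemm.o")        # load it back as a callable remote module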
Code Example #4
def run_group_conv2d(env,
                     remote,
                     wl,
                     target,
                     check_correctness=True,
                     print_ir=False,
                     samples=4):

    # Workload assertions
    assert wl.hpad == wl.wpad

    # Perform packing only if we are targeting the accelerator
    if "arm_cpu" in target.keys:
        data_pack = False
        layout = "NCHW"
        fcompute = topi.nn.group_conv2d_nchw
        fschedule = topi.generic.schedule_group_conv2d_nchw
    elif "vta" in target.keys:
        data_pack = True
        layout = "NCHW%dn%dc" % (env.BATCH, env.BLOCK_IN)
        fcompute = vta.top.group_conv2d_packed
        fschedule = vta.top.schedule_group_conv2d_packed

    # Derive shapes depending upon packing
    CI_G = wl.in_filter // wl.groups
    a_shape = (wl.batch, wl.in_filter, wl.height, wl.width)
    w_shape = (wl.out_filter, CI_G, wl.hkernel, wl.wkernel)
    b_shape = (wl.batch, wl.out_filter, 1, 1)
    if data_pack:
        data_shape = (wl.batch // env.BATCH, wl.in_filter // env.BLOCK_IN,
                      wl.height, wl.width, env.BATCH, env.BLOCK_IN)
        kernel_shape = (wl.out_filter // env.BLOCK_OUT, CI_G // env.BLOCK_IN,
                        wl.hkernel, wl.wkernel, env.BLOCK_OUT, env.BLOCK_IN)
        bias_shape = (wl.batch // env.BATCH, wl.out_filter // env.BLOCK_OUT, 1,
                      1, env.BATCH, env.BLOCK_OUT)
    else:
        data_shape = a_shape
        kernel_shape = w_shape
        bias_shape = b_shape
    data = te.placeholder(data_shape, name="data", dtype=env.inp_dtype)
    kernel = te.placeholder(kernel_shape, name="kernel", dtype=env.wgt_dtype)
    bias = te.placeholder(bias_shape, name="bias", dtype=env.acc_dtype)
    padding = relay.nn.get_pad_tuple2d((wl.hpad, wl.wpad))

    # Define base computation schedule
    with target:
        res = fcompute(data, kernel, (wl.hstride, wl.wstride), padding, (1, 1),
                       wl.groups, env.acc_dtype)
        res = topi.right_shift(res, 8)
        res = topi.add(res, bias)
        res = my_clip(res, 0, (1 << env.OUT_WIDTH - 1) - 1)
        res = topi.cast(res, env.out_dtype)
        # Derive base schedule
        s = fschedule([res])
        if print_ir:
            print(vta.lower(s, [data, kernel, bias, res], simple_mode=True))

    # Derive number of ops
    fout_height = (wl.height + 2 * wl.hpad - wl.hkernel) // wl.hstride + 1
    fout_width = (wl.width + 2 * wl.wpad - wl.wkernel) // wl.wstride + 1
    num_ops = 2 * wl.batch * fout_height * fout_width * wl.hkernel * wl.wkernel * \
        wl.out_filter * wl.in_filter // wl.groups

    def get_ref_data():
        # derive min max for act, wgt, and bias types (max non inclusive)
        a_min, a_max = 0 - (1 << (env.INP_WIDTH - 1)), (1 <<
                                                        (env.INP_WIDTH - 1))
        w_min, w_max = 0 - (1 << (env.WGT_WIDTH - 1)), (1 <<
                                                        (env.WGT_WIDTH - 1))
        b_min, b_max = 0 - 1 << (env.INP_WIDTH + env.WGT_WIDTH -
                                 2), 1 << (env.INP_WIDTH + env.WGT_WIDTH - 2)
        a_np = np.random.randint(a_min, a_max, size=a_shape).astype(data.dtype)
        w_np = np.random.randint(w_min, w_max,
                                 size=w_shape).astype(kernel.dtype)
        b_np = np.random.randint(b_min, b_max,
                                 size=b_shape).astype(env.acc_dtype)
        r_np = tvm.topi.testing.conv2d_nchw_python(
            a_np.astype(env.acc_dtype), w_np.astype(env.acc_dtype),
            (wl.hstride, wl.wstride), wl.hpad, wl.groups).astype(env.acc_dtype)
        return a_np, w_np, b_np, r_np

    # Data in original format
    data_np, kernel_np, bias_np, res_ref = get_ref_data()
    if data_pack:
        data_np = data_np.reshape(wl.batch // env.BATCH, env.BATCH,
                                  wl.in_filter // env.BLOCK_IN, env.BLOCK_IN,
                                  wl.height, wl.width).transpose(
                                      (0, 2, 4, 5, 1, 3))
        kernel_np = kernel_np.reshape(wl.out_filter // env.BLOCK_OUT,
                                      env.BLOCK_OUT, CI_G // env.BLOCK_IN,
                                      env.BLOCK_IN, wl.hkernel,
                                      wl.wkernel).transpose((0, 2, 4, 5, 1, 3))
        bias_np = bias_np.reshape(wl.batch // env.BATCH,
                                  wl.out_filter // env.BLOCK_OUT, 1, 1,
                                  env.BATCH, env.BLOCK_OUT)

    # Build
    if "vta" in target.keys:
        mod = vta.build(s, [data, kernel, bias, res],
                        target=target,
                        target_host=env.target_host,
                        name="conv2d")
    else:
        mod = tvm.build(s, [data, kernel, bias, res],
                        target=target,
                        target_host=env.target_host,
                        name="conv2d")
    temp = util.tempdir()
    mod.save(temp.relpath("conv2d.o"))
    remote.upload(temp.relpath("conv2d.o"))
    f = remote.load_module("conv2d.o")
    ctx = remote.context(str(target))

    res_np = np.zeros(topi.util.get_const_tuple(res.shape)).astype(res.dtype)
    data_arr = tvm.nd.array(data_np, ctx)
    kernel_arr = tvm.nd.array(kernel_np, ctx)
    bias_arr = tvm.nd.array(bias_np, ctx)
    res_arr = tvm.nd.array(res_np, ctx)
    time_f = f.time_evaluator("conv2d", ctx, number=samples)

    # In vta sim mode, collect simulator runtime statistics
    stats = {}
    cost = None
    if env.TARGET in ["sim", "tsim"]:
        # Check if we're in local RPC mode (allows us to rebuild the
        # runtime on the fly when varying the VTA designs)
        local_rpc = int(os.environ.get("VTA_LOCAL_SIM_RPC", "0"))
        if local_rpc:
            if env.TARGET == "sim":
                remote.get_function("vta.simulator.profiler_clear")()
            else:
                remote.get_function("vta.tsim.profiler_clear")()
            cost = time_f(data_arr, kernel_arr, bias_arr, res_arr)
            if env.TARGET == "sim":
                stats = json.loads(
                    remote.get_function("vta.simulator.profiler_status")())
            else:
                stats = json.loads(
                    remote.get_function("vta.tsim.profiler_status")())
        else:
            simulator.clear_stats()
            cost = time_f(data_arr, kernel_arr, bias_arr, res_arr)
            stats = simulator.stats()
    else:
        cost = time_f(data_arr, kernel_arr, bias_arr, res_arr)

    # Check correctness
    correct = False
    if check_correctness:
        res_orig = res_arr.asnumpy()
        if data_pack:
            res_orig = res_orig.transpose(
                (0, 4, 1, 5, 2, 3)).reshape(wl.batch, wl.out_filter,
                                            fout_height, fout_width)
            bias_np = bias_np.transpose(
                (0, 4, 1, 5, 2, 3)).reshape(wl.batch, wl.out_filter, 1, 1)
        res_ref = res_ref >> env.WGT_WIDTH
        res_ref += bias_np
        res_ref = np.clip(res_ref, 0, (1 << env.OUT_WIDTH - 1) - 1)
        res_ref = res_ref.astype(env.out_dtype)
        correct = np.allclose(res_orig, res_ref)

    gops = (num_ops / cost.mean) / float(10**9)
    status = "PASSED" if correct else "FAILED"
    if "arm_cpu" in target.keys:
        device = "CPU"
    elif "vta" in target.keys:
        device = "VTA"
    print("%s GROUP CONV2D TEST %s: Time cost = %g sec/op, %g GOPS" %
          (device, status, cost.mean, gops))

    return correct, cost, stats
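The benchmark above relies on a `my_clip` helper that is not part of the excerpt. In the VTA benchmark tests it is a small two-stage clip written with `te.compute`, so that the min and max each lower to a single vector ALU instruction; a sketch of that helper, assuming the standard `tvm`/`te` APIs:

import tvm
from tvm import te

def my_clip(x, a_min, a_max):
    """Clip in two stages (min, then max) so each maps to one vector ALU op."""
    const_min = tvm.tir.const(a_min, x.dtype)
    const_max = tvm.tir.const(a_max, x.dtype)
    x = te.compute(x.shape, lambda *i: tvm.te.min(x(*i), const_max), name="clipA")
    x = te.compute(x.shape, lambda *i: tvm.te.max(x(*i), const_min), name="clipB")
    return x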
Code Example #5
# The last phase is to lower the computation loops down to VTA hardware
# intrinsics by mapping the matrix multiplication to tensor intrinsics,
# and mapping the shift and clipping computations to the vector ALU.

# Apply tensorization over the batch tensor tile axis
s[res_gemm].tensorize(b_tns, env.gemm)

# Add an ALU pragma over the shift and clipping operations
s[res_shr].pragma(s[res_shr].op.axis[0], env.alu)
s[res_min].pragma(s[res_min].op.axis[0], env.alu)
s[res_max].pragma(s[res_max].op.axis[0], env.alu)

# Let's look at the final lowered TVM schedule after lowering memory
# loads/stores down to DMA copy intrinsics, and the computation down to
# VTA compute intrinsics.
print(vta.lower(s, [data, weight, res], simple_mode=True))

######################################################################
# TVM Compilation and Verification
# --------------------------------
# After specifying the schedule, we can compile it into a TVM function.
# We save the module so we can send it over RPC.
# We run the function and verify it against a numpy implementation to
# ensure correctness.

# Compile the TVM module
my_gemm = vta.build(s, [data, weight, res],
                    "ext_dev",
                    env.target_host,
                    name="my_gemm")
temp = util.tempdir()
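The comment above promises an RPC upload, a run on the device, and a numpy check, but the excerpt ends at the temporary directory. A condensed sketch of that flow, where `data_packed`, `weight_packed`, `output_shape`, and the pre-computed reference `res_ref` are assumptions standing in for the tutorial's actual arrays:

# Sketch (assumes `remote`, `my_gemm`, `temp`, packed numpy inputs, and a numpy
# reference `res_ref` that applies the same shift/clip/cast as the schedule).
my_gemm.save(temp.relpath("gemm.o"))
remote.upload(temp.relpath("gemm.o"))
f = remote.load_module("gemm.o")
ctx = remote.ext_dev(0)                 # VTA device context on the board/simulator

data_nd = tvm.nd.array(data_packed, ctx)
weight_nd = tvm.nd.array(weight_packed, ctx)
res_nd = tvm.nd.array(np.zeros(output_shape).astype(res.dtype), ctx)
f(data_nd, weight_nd, res_nd)           # execute the GEMM on VTA

np.testing.assert_equal(res_ref, res_nd.asnumpy())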
Code Example #6
def run_pooling(env,
                remote,
                wl,
                target,
                check_correctness=True,
                print_ir=False,
                samples=10):

    # Workload assertions
    assert wl.hpad == wl.wpad
    pool_type = 'max'

    # Perform packing only if we are targeting the accelerator
    if "arm_cpu" in target.keys:
        data_pack = False
        layout = "NCHW"
        #pooling_fcompute = topi.arm_cpu.pooling_nchw_spatial_pack
        pooling_fcompute = topi.nn.pool
        #pooling_fschedule = topi.arm_cpu.schedule_pooling_nchw_spatial_pack
        pooling_fschedule = topi.generic.schedule_pool
    elif "vta" in target.keys:
        data_pack = True
        layout = "NCHW%dn%dc" % (env.BATCH, env.BLOCK_IN)
        pooling_fcompute = vta.top.pooling_packed
        pooling_fschedule = vta.top.schedule_pooling_packed

    # Derive shapes depending upon packing
    a_shape = (wl.batch, wl.in_filter, wl.height, wl.width)
    w_shape = (wl.out_filter, wl.in_filter, wl.hkernel, wl.wkernel)
    # output shape
    b_shape = (wl.batch, wl.out_filter, 1, 1)
    if data_pack:
        data_shape = (wl.batch // env.BATCH, wl.in_filter // env.BLOCK_IN,
                      wl.height, wl.width, env.BATCH, env.BLOCK_IN)
        kernel_shape = (wl.out_filter // env.BLOCK_OUT,
                        wl.in_filter // env.BLOCK_IN, wl.hkernel, wl.wkernel,
                        env.BLOCK_OUT, env.BLOCK_IN)
        bias_shape = (wl.batch // env.BATCH, wl.out_filter // env.BLOCK_OUT, 1,
                      1, env.BATCH, env.BLOCK_OUT)
    else:
        data_shape = a_shape
        kernel_shape = w_shape
        bias_shape = b_shape
    data = te.placeholder(data_shape, name="data", dtype=env.inp_dtype)
    kernel = te.placeholder(kernel_shape, name="kernel", dtype=env.wgt_dtype)
    bias = te.placeholder(bias_shape, name="bias", dtype=env.acc_dtype)
    padding = relay.nn.get_pad_tuple2d((wl.hpad, wl.wpad))

    # Define base computation schedule
    with target:
        res = topi.nn.pool(data,
                           kernel=[3, 3],
                           stride=[2, 2],
                           padding=padding,
                           pool_type=pool_type,
                           layout="NCHW")
        #       res = topi.right_shift(res, 8)
        #       res = topi.add(res, bias)
        #       res = my_clip(res, 0, (1 << env.OUT_WIDTH - 1) - 1)
        #       res = topi.cast(res, env.out_dtype)
        # Derive base schedule
        s = pooling_fschedule([res], layout)
        if print_ir:
            print(vta.lower(s, [data, kernel, bias, res], simple_mode=True))
    # get output shape
    _, oc, oh, ow = get_const_tuple(res.shape)
    # Derive number of ops
    fout_height = (wl.height + 2 * wl.hpad - wl.hkernel) // wl.hstride + 1
    fout_width = (wl.width + 2 * wl.wpad - wl.wkernel) // wl.wstride + 1
    num_ops = 2 * wl.batch * fout_height * fout_width * wl.hkernel * wl.wkernel * wl.out_filter * wl.in_filter

    # @memoize("vta.tests.test_benchmark_topi.pooling.verify_nchw")
    def get_ref_data():
        # derive min max for act, wgt, and bias types (max non inclusive)
        a_min, a_max = 0 - (1 << (env.INP_WIDTH - 1)), (1 <<
                                                        (env.INP_WIDTH - 1))
        b_min, b_max = 0 - 1 << (env.INP_WIDTH + env.WGT_WIDTH -
                                 2), 1 << (env.INP_WIDTH + env.WGT_WIDTH - 2)
        a_np = np.random.randint(a_min, a_max, size=a_shape).astype(data.dtype)

        pad_shape = (wl.batch, wl.in_filter, wl.height + wl.hpad * 2,
                     wl.width + wl.wpad * 2)
        pad_np = np.zeros(shape=pad_shape).astype(data.dtype)
        no_zero = (range(wl.batch), range(wl.in_filter),
                   (range(wl.hpad, wl.height + wl.hpad)),
                   (range(wl.wpad, wl.width + wl.wpad)))
        pad_np[np.ix_(*no_zero)] = a_np
        b_shape = (wl.batch, oc, oh, ow)
        b_np = np.random.randint(b_min, b_max,
                                 size=b_shape).astype(env.acc_dtype)
        kw, kh = 3, 3
        sw, sh = 2, 2
        for i in range(oh):
            for j in range(ow):
                b_np[:, :, i, j] = np.max(pad_np[:, :, i * sh:i * sh + kh,
                                                 j * sw:j * sw + kw],
                                          axis=(2, 3))
        b_np = np.maximum(b_np, 0.0)
        return a_np, pad_np, b_np

    # Data in original format
    data_np, _, res_ref = get_ref_data()

    # Build
    if "vta" in target.keys:
        mod = vta.build(s, [data, res],
                        target=target,
                        target_host=env.target_host,
                        name="pooling")
    else:
        mod = tvm.build(s, [data, res],
                        target=target,
                        target_host=env.target_host,
                        name="pooling")
    temp = util.tempdir()
    mod.save(temp.relpath("pooling.o"))
    remote.upload(temp.relpath("pooling.o"))
    f = remote.load_module("pooling.o")
    ctx = remote.context(str(target))

    res_np = np.zeros(topi.util.get_const_tuple(res.shape)).astype(res.dtype)
    data_arr = tvm.nd.array(data_np, ctx)
    res_arr = tvm.nd.array(res_np, ctx)
    time_f = f.time_evaluator("pooling", ctx, number=samples)

    # In vta sim mode, collect simulator runtime statistics
    stats = {}
    cost = None
    if env.TARGET in ["sim", "tsim"]:
        # Check if we're in local RPC mode (allows us to rebuild the
        # runtime on the fly when varying the VTA designs)
        local_rpc = int(os.environ.get("VTA_LOCAL_SIM_RPC", "0"))
        if local_rpc:
            if env.TARGET == "sim":
                remote.get_function("vta.simulator.profiler_clear")()
            else:
                remote.get_function("vta.tsim.profiler_clear")()
            cost = time_f(data_arr, res_arr)
            if env.TARGET == "sim":
                stats = json.loads(
                    remote.get_function("vta.simulator.profiler_status")())
            else:
                stats = json.loads(
                    remote.get_function("vta.tsim.profiler_status")())
        else:
            simulator.clear_stats()
            cost = time_f(data_arr, res_arr)
            stats = simulator.stats()
    else:
        cost = time_f(data_arr, res_arr)
        print(cost)

    # Check correctness
    correct = False
    if check_correctness:
        res_orig = res_arr.asnumpy()
        res_orig = np.maximum(res_orig, 0.0)
        res_ref = res_ref.astype(env.out_dtype)
        res_orig = res_orig.astype(env.out_dtype)
        correct = np.allclose(res_orig, res_ref)

    gops = (num_ops / cost.mean) / float(10**9)
    status = "PASSED" if correct else "FAILED"
    if "arm_cpu" in target.keys:
        device = "CPU"
    elif "vta" in target.keys:
        device = "VTA"
    print("%s POOLING TEST %s: Time cost = %g sec/op" %
          (device, status, cost.mean))

    return correct, cost, stats
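`run_pooling` only reads attributes from `wl`, so any record exposing the right field names works. A hypothetical workload definition that satisfies the `wl.*` accesses above (the field values are illustrative, not taken from the original test suite):

from collections import namedtuple

# Hypothetical workload record; the field names mirror the wl.* accesses above.
Workload = namedtuple(
    "Workload",
    ["batch", "height", "width", "in_filter", "out_filter",
     "hkernel", "wkernel", "hpad", "wpad", "hstride", "wstride"])

# e.g. a 1x64x56x56 input with a 3x3 window, stride 2, pad 1
wl = Workload(batch=1, height=56, width=56, in_filter=64, out_filter=64,
              hkernel=3, wkernel=3, hpad=1, wpad=1, hstride=2, wstride=2)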
Code Example #7
# The last phase is to lower the computation loops down to VTA hardware
# intrinsics by mapping the 2D convolution to tensor intrinsics,
# and mapping the shift and clipping computations to the vector ALU.

# Apply tensorization over the batch tensor tile axis
s[res_conv].tensorize(b_tns, env.gemm)

# Add an ALU pragma over the shift and clipping operations
s[res_shr].pragma(s[res_shr].op.axis[0], env.alu)
s[res_min].pragma(s[res_min].op.axis[0], env.alu)
s[res_max].pragma(s[res_max].op.axis[0], env.alu)

# Let's look at the final lowered TVM schedule after lowering memory
# loads/stores down to DMA copy intrinsics, and the computation down to
# VTA compute intrinsics.
print(vta.lower(s, [data, kernel, res], simple_mode=True))

######################################################################
# TVM Compilation and Verification
# --------------------------------
# After specifying the schedule, we can compile it into a TVM function.
# We save the module so we can send it over RPC.
# We run the function and verify it against a numpy implementation to
# ensure correctness.

# This library facilitates 2D convolution testing
from tvm.topi.testing import conv2d_nchw_python

# Compile the TVM module
my_conv = vta.build(s, [data, kernel, res], "ext_dev", env.target_host, name="my_conv")
temp = util.tempdir()
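`conv2d_nchw_python` imported above is the numpy reference that the full tutorial later compares against the VTA output; the excerpt stops before that point. A hedged sketch of the reference check, where `data_np`, `kernel_np`, `res_nd`, the strides, and the pads are assumptions standing in for the tutorial's variables:

# Sketch of the numpy verification (variable names are placeholders).
res_ref = conv2d_nchw_python(
    data_np.astype(env.acc_dtype),      # promote inputs to accumulator precision
    kernel_np.astype(env.acc_dtype),
    (stride_h, stride_w),
    (pad_h, pad_w)).astype(env.acc_dtype)

# Apply the same fixed-point post-processing as the schedule (shift, clip, cast),
# then compare with the result copied back from the device.
res_ref = res_ref >> 8
res_ref = np.clip(res_ref, 0, (1 << env.OUT_WIDTH - 1) - 1).astype(env.out_dtype)
np.testing.assert_equal(res_ref, res_nd.asnumpy().reshape(res_ref.shape))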
Code Example #8
File: matrix_multiply.py Project: bddppq/tvm
# along the outer axis of the inner-most matrix-matrix multiplication tensor
# block.
# We print the finalized schedule that is ready for code-generation
# by the VTA runtime JIT compiler.

s[C_buf].reorder(
    ko,
    s[C_buf].op.axis[0],
    s[C_buf].op.axis[1],
    s[C_buf].op.axis[2],
    s[C_buf].op.axis[3],
    ki)
s[C_buf].tensorize(s[C_buf].op.axis[2], env.gemm)

# Let's take a look at the finalized schedule
print(vta.lower(s, [A, B, C], simple_mode=True))

######################################################################
# This concludes the scheduling portion of this tutorial.

######################################################################
# TVM Compilation
# ---------------
# After we have finished specifying the schedule, we can compile it
# into a TVM function.

# Build GEMM VTA kernel
my_gemm = vta.build(s, [A, B, C], "ext_dev", env.target_host, name="my_gemm")

# Write the compiled module into an object file.
temp = util.tempdir()
Code Example #9
def run_gemm(
    env,
    remote,
    target,
    batch_size,
    in_feat,
    out_feat,
    check_correctness=True,
    print_ir=True,
    samples=4,
):

    # Perform packing only if we are targeting the accelerator
    if "arm_cpu" in target.keys:
        data_pack = False
    elif "vta" in target.keys:
        data_pack = True

    # Derive shapes depending upon packing
    a_shape = (batch_size, in_feat)
    w_shape = (out_feat, in_feat)
    if data_pack:
        data_shape = (batch_size // env.BATCH, in_feat // env.BLOCK_IN,
                      env.BATCH, env.BLOCK_IN)
        kernel_shape = (
            out_feat // env.BLOCK_OUT,
            in_feat // env.BLOCK_IN,
            env.BLOCK_OUT,
            env.BLOCK_IN,
        )
        fcompute = vta.top.dense_packed
        fschedule = vta.top.schedule_dense_packed
    else:
        data_shape = a_shape
        kernel_shape = w_shape
        fcompute = topi.x86.dense_nopack
        fschedule = topi.x86.schedule_dense_nopack
    data = te.placeholder(data_shape, name="data", dtype=env.inp_dtype)
    kernel = te.placeholder(kernel_shape, name="kernel", dtype=env.wgt_dtype)

    # Define base computation schedule
    with target:
        res = fcompute(data, kernel, None, env.acc_dtype)
        res = topi.right_shift(res, 8)
        res = my_clip(res, 0, (1 << env.OUT_WIDTH - 1) - 1)
        res = topi.cast(res, env.out_dtype)
        # Derive base schedule
        s = fschedule([res])
        if print_ir:
            print(vta.lower(s, [data, kernel, res], simple_mode=True))

    # Derive number of ops
    num_ops = 2 * batch_size * in_feat * out_feat

    # @memoize("vta.tests.test_benchmark_topi.dense.verify")
    def get_ref_data():
        # derive min max for act, wgt types (max non inclusive)
        a_min, a_max = 0 - (1 << (env.INP_WIDTH - 1)), (1 <<
                                                        (env.INP_WIDTH - 1))
        w_min, w_max = 0 - (1 << (env.WGT_WIDTH - 1)), (1 <<
                                                        (env.WGT_WIDTH - 1))
        a_np = np.random.randint(a_min, a_max, size=a_shape).astype(data.dtype)
        w_np = np.random.randint(w_min, w_max,
                                 size=w_shape).astype(kernel.dtype)

        r_np = np.dot(a_np.astype(env.acc_dtype),
                      w_np.T.astype(env.acc_dtype)).astype(env.acc_dtype)
        return a_np, w_np, r_np

    # Data in original format
    data_np, kernel_np, res_ref = get_ref_data()
    if data_pack:
        data_np = data_np.reshape(batch_size // env.BATCH, env.BATCH,
                                  in_feat // env.BLOCK_IN,
                                  env.BLOCK_IN).transpose((0, 2, 1, 3))
        kernel_np = kernel_np.reshape(out_feat // env.BLOCK_OUT, env.BLOCK_OUT,
                                      in_feat // env.BLOCK_IN,
                                      env.BLOCK_IN).transpose((0, 2, 1, 3))

    # Build
    if "vta" in target.keys:
        mod = vta.build(s, [data, kernel, res],
                        target=target,
                        target_host=env.target_host,
                        name="dense")
    else:
        mod = tvm.build(s, [data, kernel, res],
                        target=target,
                        target_host=env.target_host,
                        name="dense")
    temp = utils.tempdir()
    mod.save(temp.relpath("dense.o"))
    remote.upload(temp.relpath("dense.o"))
    f = remote.load_module("dense.o")
    dev = remote.device(str(target))

    res_np = np.zeros(topi.utils.get_const_tuple(res.shape)).astype(res.dtype)
    data_arr = tvm.nd.array(data_np, dev)
    kernel_arr = tvm.nd.array(kernel_np, dev)
    res_arr = tvm.nd.array(res_np, dev)
    time_f = f.time_evaluator("dense", dev, number=samples)

    # In vta sim mode, collect simulator runtime statistics
    stats = {}
    cost = None
    if env.TARGET in ["sim", "tsim"]:
        # Check if we're in local RPC mode (allows us to rebuild the
        # runtime on the fly when varying the VTA designs)
        local_rpc = int(os.environ.get("VTA_LOCAL_SIM_RPC", "0"))
        if local_rpc:
            if env.TARGET == "sim":
                remote.get_function("vta.simulator.profiler_clear")()
            else:
                remote.get_function("vta.tsim.profiler_clear")()
            cost = time_f(data_arr, kernel_arr, res_arr)
            if env.TARGET == "sim":
                stats = json.loads(
                    remote.get_function("vta.simulator.profiler_status")())
            else:
                stats = json.loads(
                    remote.get_function("vta.tsim.profiler_status")())
        else:
            simulator.clear_stats()
            cost = time_f(data_arr, kernel_arr, res_arr)
            stats = simulator.stats()
    else:
        cost = time_f(data_arr, kernel_arr, res_arr)

    # Check correctness
    correct = False
    if check_correctness:
        res_orig = res_arr.numpy()
        if data_pack:
            res_orig = res_orig.reshape(batch_size, out_feat)
        res_ref = res_ref >> 8
        res_ref = np.clip(res_ref, 0, (1 << env.OUT_WIDTH - 1) - 1)
        res_ref = res_ref.astype(env.out_dtype)
        correct = np.allclose(res_orig, res_ref)

    gops = (num_ops / cost.mean) / float(10**9)
    status = "PASSED" if correct else "FAILED"
    if "arm_cpu" in target.keys:
        device = "CPU"
    elif "vta" in target.keys:
        device = "VTA"
    print("%s DENSE TEST %s: Time cost = %g sec/op, %g GOPS" %
          (device, status, cost.mean, gops))

    return correct, cost, stats
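In the VTA test suite this kind of benchmark is driven through `vta.testing.run`, which constructs the environment and the RPC session (simulator or board) and then invokes the benchmark body. A hedged sketch of such a driver, with illustrative sizes:

import vta
import vta.testing
from tvm import autotvm

def test_gemm():
    def _run(env, remote):
        target = env.target                      # VTA target from the environment
        with autotvm.tophub.context(target):     # pick up any pre-tuned schedules
            run_gemm(env, remote, target, 16, 512, 512)
    vta.testing.run(_run)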
Code Example #10
File: convolution_opt.py Project: bddppq/tvm
# The last phase is to lower the computation loops down to VTA hardware
# intrinsics by mapping the 2D convolution to tensor intrinsics,
# and mapping the shift and clipping computations to the vector ALU.

# Apply tensorization over the batch tensor tile axis
s[res_conv].tensorize(b_tns, env.gemm)

# Add an ALU pragma over the shift and clipping operations
s[res_shr].pragma(s[res_shr].op.axis[0], env.alu)
s[res_min].pragma(s[res_min].op.axis[0], env.alu)
s[res_max].pragma(s[res_max].op.axis[0], env.alu)

# Let's look at the final lowered TVM schedule after lowering memory
# loads/stores down to DMA copy intrinsics, and the computation down to
# VTA compute intrinsics.
print(vta.lower(s, [data, kernel, res], simple_mode=True))

######################################################################
# TVM Compilation and Verification
# --------------------------------
# After specifying the schedule, we can compile it into a TVM function.
# We save the module so we can send it over RPC.
# We run the function and verify it against a numpy implementation to
# ensure correctness.

# This library facilitates 2D convolution testing
from topi.testing import conv2d_nchw_python

# Compile the TVM module
my_conv = vta.build(s, [data, kernel, res], "ext_dev", env.target_host, name="my_conv")
temp = util.tempdir()
Code Example #11
def run_conv2d_transpose(
    env, remote, wl, target, check_correctness=True, print_ir=False, samples=4
):

    # Workload assertions
    assert wl.hpad == wl.wpad

    # Perform packing only if we are targeting the accelerator
    if "arm_cpu" in target.keys:
        data_pack = False
        layout = "NCHW"
        fcompute = topi.arm_cpu.conv2d_transpose_nchw
        fschedule = topi.arm_cpu.schedule_conv2d_transpose_nchw
    elif "vta" in target.keys:
        data_pack = True
        layout = "NCHW%dn%dc" % (env.BATCH, env.BLOCK_IN)
        fcompute = vta.top.conv2d_transpose_packed
        fschedule = vta.top.schedule_conv2d_transpose_packed

    # Derive shapes depending upon packing

    a_shape = (wl.batch, wl.in_filter, wl.height, wl.width)
    w_shape = (wl.in_filter, wl.out_filter, wl.hkernel, wl.wkernel)
    if data_pack:
        data_shape = (
            wl.batch // env.BATCH,
            wl.in_filter // env.BLOCK_IN,
            wl.height,
            wl.width,
            env.BATCH,
            env.BLOCK_IN,
        )
        kernel_shape = (
            wl.out_filter // env.BLOCK_OUT,
            wl.in_filter // env.BLOCK_IN,
            wl.hkernel,
            wl.wkernel,
            env.BLOCK_OUT,
            env.BLOCK_IN,
        )
    else:
        data_shape = a_shape
        kernel_shape = w_shape
    data = te.placeholder(data_shape, name="data", dtype=env.inp_dtype)
    kernel = te.placeholder(kernel_shape, name="kernel", dtype=env.wgt_dtype)
    padding = relay.nn.get_pad_tuple2d((wl.hpad, wl.wpad))

    # Define base computation schedule
    with target:

        res = fcompute(
            data, kernel, (wl.hstride, wl.wstride), padding, env.acc_dtype, (wl.o_hpad, wl.o_wpad)
        )
        res = topi.right_shift(res, env.WGT_WIDTH)
        res = my_clip(res, 0, (1 << env.OUT_WIDTH - 1) - 1)
        res = topi.cast(res, env.out_dtype)
        # Derive base schedule
        s = fschedule([res])
        if print_ir:
            print(vta.lower(s, [data, kernel, res], simple_mode=True))

    # Derive number of ops
    fout_height = (wl.height - 1) * wl.hstride - 2 * wl.hpad + wl.hkernel + wl.o_hpad
    fout_width = (wl.width - 1) * wl.wstride - 2 * wl.wpad + wl.wkernel + wl.o_wpad
    num_ops = (
        2
        * wl.batch
        * fout_height
        * fout_width
        * wl.hkernel
        * wl.wkernel
        * wl.out_filter
        * wl.in_filter
    )

    # @memoize("vta.tests.test_benchmark_topi.conv2d.verify_nhwc")
    def get_ref_data():
        # derive min max for act and wgt types (max non inclusive)
        a_min, a_max = 0 - (1 << (env.INP_WIDTH - 1)), (1 << (env.INP_WIDTH - 1))
        w_min, w_max = 0 - (1 << (env.WGT_WIDTH - 1)), (1 << (env.WGT_WIDTH - 1))
        a_np = np.random.randint(a_min, a_max, size=a_shape).astype(data.dtype)
        w_np = np.random.randint(
            w_min, w_max, size=(wl.in_filter, wl.out_filter, wl.hkernel, wl.wkernel)
        ).astype(kernel.dtype)
        r_np = tvm.topi.testing.conv2d_transpose_nchw_python(
            a_np.astype(env.acc_dtype),
            w_np.astype(env.acc_dtype),
            (wl.hstride, wl.wstride),
            wl.hpad,
            (wl.o_hpad, wl.o_wpad),
        ).astype(env.acc_dtype)
        return a_np, w_np, r_np

    # Data in original format
    data_np, kernel_np, res_ref = get_ref_data()
    if data_pack:
        data_np = data_np.reshape(
            wl.batch // env.BATCH,
            env.BATCH,
            wl.in_filter // env.BLOCK_IN,
            env.BLOCK_IN,
            wl.height,
            wl.width,
        ).transpose((0, 2, 4, 5, 1, 3))
        kernel_np = kernel_np.reshape(
            wl.in_filter // env.BLOCK_IN,
            env.BLOCK_IN,
            wl.out_filter // env.BLOCK_OUT,
            env.BLOCK_OUT,
            wl.hkernel,
            wl.wkernel,
        ).transpose((2, 0, 4, 5, 3, 1))
        kernel_np = np.flip(kernel_np, 2)
        kernel_np = np.flip(kernel_np, 3)

    # Build
    if "vta" in target.keys:
        with vta.build_config(disabled_pass={"tir.CommonSubexprElimTIR"}):
            mod = vta.build(
                s,
                [data, kernel, res],
                target=target,
                target_host=env.target_host,
                name="conv2d_transpose",
            )
    else:
        mod = tvm.build(
            s,
            [data, kernel, res],
            target=target,
            target_host=env.target_host,
            name="conv2d_transpose",
        )
    temp = utils.tempdir()
    mod.save(temp.relpath("conv2d_transpose.o"))
    remote.upload(temp.relpath("conv2d_transpose.o"))
    f = remote.load_module("conv2d_transpose.o")
    dev = remote.device(str(target))

    res_np = np.zeros(topi.utils.get_const_tuple(res.shape)).astype(res.dtype)
    data_arr = tvm.nd.array(data_np, dev)
    kernel_arr = tvm.nd.array(kernel_np, dev)
    res_arr = tvm.nd.array(res_np, dev)
    time_f = f.time_evaluator("conv2d_transpose", dev, number=samples)

    # In vta sim mode, collect simulator runtime statistics
    stats = {}
    cost = None
    if env.TARGET in ["sim", "tsim"]:
        # Check if we're in local RPC mode (allows us to rebuild the
        # runtime on the fly when varying the VTA designs)
        local_rpc = int(os.environ.get("VTA_LOCAL_SIM_RPC", "0"))
        if local_rpc:
            if env.TARGET == "sim":
                remote.get_function("vta.simulator.profiler_clear")()
            else:
                remote.get_function("vta.tsim.profiler_clear")()
            cost = time_f(data_arr, kernel_arr, res_arr)
            if env.TARGET == "sim":
                stats = json.loads(remote.get_function("vta.simulator.profiler_status")())
            else:
                stats = json.loads(remote.get_function("vta.tsim.profiler_status")())
        else:
            simulator.clear_stats()
            cost = time_f(data_arr, kernel_arr, res_arr)
            stats = simulator.stats()
    else:
        cost = time_f(data_arr, kernel_arr, res_arr)

    # Check correctness
    correct = False
    if check_correctness:
        res_orig = res_arr.numpy()
        if data_pack:
            res_orig = res_orig.transpose((0, 4, 1, 5, 2, 3)).reshape(
                wl.batch, wl.out_filter, fout_height, fout_width
            )
        res_ref = res_ref >> env.WGT_WIDTH
        res_ref = np.clip(res_ref, 0, (1 << env.OUT_WIDTH - 1) - 1)
        res_ref = res_ref.astype(env.out_dtype)
        correct = np.allclose(res_orig, res_ref)

    gops = (num_ops / cost.mean) / float(10**9)
    status = "PASSED" if correct else "FAILED"
    if "arm_cpu" in target.keys:
        device = "CPU"
    elif "vta" in target.keys:
        device = "VTA"
    print("%s CONV2D TEST %s: Time cost = %g sec/op, %g GOPS" % (device, status, cost.mean, gops))

    return correct, cost, stats
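Unlike the other benchmarks, this one also reads output-padding fields (`wl.o_hpad`, `wl.o_wpad`) from the workload, so its workload record needs two extra fields. A hypothetical definition with illustrative DCGAN-style values:

from collections import namedtuple

# Hypothetical transposed-conv workload; note the extra o_hpad / o_wpad fields
# consumed by run_conv2d_transpose above.
Workload = namedtuple(
    "Workload",
    ["batch", "height", "width", "in_filter", "out_filter",
     "hkernel", "wkernel", "hpad", "wpad", "hstride", "wstride",
     "o_hpad", "o_wpad"])

wl = Workload(batch=1, height=4, width=4, in_filter=1024, out_filter=512,
              hkernel=4, wkernel=4, hpad=1, wpad=1, hstride=2, wstride=2,
              o_hpad=0, o_wpad=0)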