Esempio n. 1
0
def run_conv2d(env,
               remote,
               wl,
               target,
               check_correctness=True,
               print_ir=False,
               samples=4):

    # Workload assertions
    assert wl.hpad == wl.wpad

    # Perform packing only if we are targeting the accelerator
    if "arm_cpu" in target.keys:
        data_pack = False
        layout = "NCHW"
    elif "vta" in target.keys:
        data_pack = True
        layout = "NCHW%dn%dc" % (env.BATCH, env.BLOCK_IN)

    # Derive shapes depending upon packing
    a_shape = (wl.batch, wl.in_filter, wl.height, wl.width)
    w_shape = (wl.out_filter, wl.in_filter, wl.hkernel, wl.wkernel)
    b_shape = (wl.batch, wl.out_filter, 1, 1)
    if data_pack:
        data_shape = (wl.batch // env.BATCH, wl.in_filter // env.BLOCK_IN,
                      wl.height, wl.width, env.BATCH, env.BLOCK_IN)
        kernel_shape = (wl.out_filter // env.BLOCK_OUT,
                        wl.in_filter // env.BLOCK_IN, wl.hkernel, wl.wkernel,
                        env.BLOCK_OUT, env.BLOCK_IN)
        bias_shape = (wl.batch // env.BATCH, wl.out_filter // env.BLOCK_OUT, 1,
                      1, env.BATCH, env.BLOCK_OUT)
    else:
        data_shape = a_shape
        kernel_shape = w_shape
        bias_shape = b_shape
    data = tvm.placeholder(data_shape, name="data", dtype=env.inp_dtype)
    kernel = tvm.placeholder(kernel_shape, name="kernel", dtype=env.wgt_dtype)
    bias = tvm.placeholder(bias_shape, name="bias", dtype=env.acc_dtype)

    # Define base computation schedule
    with target:
        res = topi.nn.conv2d(data, kernel, (wl.hstride, wl.wstride),
                             (wl.hpad, wl.wpad), (1, 1), layout, env.acc_dtype)
        res = topi.right_shift(res, 8)
        res = topi.add(res, bias)
        res = my_clip(res, 0, (1 << env.OUT_WIDTH - 1) - 1)
        res = topi.cast(res, env.out_dtype)
        # Derive base schedule
        s = topi.generic.schedule_conv2d_nchw([res])
        if print_ir:
            print(vta.lower(s, [data, kernel, bias, res], simple_mode=True))

    # Derive number of ops
    fout_height = (wl.height + 2 * wl.hpad - wl.hkernel) // wl.hstride + 1
    fout_width = (wl.width + 2 * wl.wpad - wl.wkernel) // wl.wstride + 1
    num_ops = 2 * wl.batch * fout_height * fout_width * wl.hkernel * wl.wkernel * wl.out_filter * wl.in_filter

    # @memoize("vta.tests.test_benchmark_topi.conv2d.verify_nhwc")
    def get_ref_data():
        # derive min max for act, wgt, and bias types (max non inclusive)
        a_min, a_max = 0 - (1 << (env.INP_WIDTH - 1)), (1 <<
                                                        (env.INP_WIDTH - 1))
        w_min, w_max = 0 - (1 << (env.WGT_WIDTH - 1)), (1 <<
                                                        (env.WGT_WIDTH - 1))
        b_min, b_max = 0 - 1 << (env.INP_WIDTH + env.WGT_WIDTH -
                                 2), 1 << (env.INP_WIDTH + env.WGT_WIDTH - 2)
        a_np = np.random.randint(a_min, a_max, size=a_shape).astype(data.dtype)
        w_np = np.random.randint(w_min, w_max,
                                 size=w_shape).astype(kernel.dtype)
        b_np = np.random.randint(b_min, b_max,
                                 size=b_shape).astype(env.acc_dtype)
        r_np = topi.testing.conv2d_nchw_python(a_np.astype(env.acc_dtype),
                                               w_np.astype(env.acc_dtype),
                                               (wl.hstride, wl.wstride),
                                               wl.hpad).astype(env.acc_dtype)
        return a_np, w_np, b_np, r_np

    # Data in original format
    data_np, kernel_np, bias_np, res_ref = get_ref_data()
    if data_pack:
        data_np = data_np.reshape(wl.batch // env.BATCH, env.BATCH,
                                  wl.in_filter // env.BLOCK_IN, env.BLOCK_IN,
                                  wl.height, wl.width).transpose(
                                      (0, 2, 4, 5, 1, 3))
        kernel_np = kernel_np.reshape(wl.out_filter // env.BLOCK_OUT,
                                      env.BLOCK_OUT,
                                      wl.in_filter // env.BLOCK_IN,
                                      env.BLOCK_IN, wl.hkernel,
                                      wl.wkernel).transpose((0, 2, 4, 5, 1, 3))
        bias_np = bias_np.reshape(wl.batch // env.BATCH,
                                  wl.out_filter // env.BLOCK_OUT, 1, 1,
                                  env.BATCH, env.BLOCK_OUT)

    # Build
    if "vta" in target.keys:
        mod = vta.build(s, [data, kernel, bias, res],
                        target=target,
                        target_host=env.target_host,
                        name="conv2d")
    else:
        mod = tvm.build(s, [data, kernel, bias, res],
                        target=target,
                        target_host=env.target_host,
                        name="conv2d")
    temp = util.tempdir()
    mod.save(temp.relpath("conv2d.o"))
    remote.upload(temp.relpath("conv2d.o"))
    f = remote.load_module("conv2d.o")
    ctx = remote.context(str(target))

    res_np = np.zeros(topi.util.get_const_tuple(res.shape)).astype(res.dtype)
    data_arr = tvm.nd.array(data_np, ctx)
    kernel_arr = tvm.nd.array(kernel_np, ctx)
    bias_arr = tvm.nd.array(bias_np, ctx)
    res_arr = tvm.nd.array(res_np, ctx)
    time_f = f.time_evaluator("conv2d", ctx, number=samples)

    # In vta sim mode, collect simulator runtime statistics
    stats = {}
    cost = None
    if env.TARGET == "sim":
        # Check if we're in local RPC mode (allows us to rebuild the
        # runtime on the fly when varying the VTA designs)
        local_rpc = int(os.environ.get("VTA_LOCAL_SIM_RPC", "0"))
        if local_rpc:
            remote.get_function("vta.simulator.profiler_clear")()
            cost = time_f(data_arr, kernel_arr, bias_arr, res_arr)
            stats = json.loads(
                remote.get_function("vta.simulator.profiler_status")())
        else:
            simulator.clear_stats()
            cost = time_f(data_arr, kernel_arr, bias_arr, res_arr)
            stats = simulator.stats()
    else:
        cost = time_f(data_arr, kernel_arr, bias_arr, res_arr)

    # Check correctness
    correct = False
    if check_correctness:
        res_orig = res_arr.asnumpy()
        if data_pack:
            res_orig = res_orig.transpose(
                (0, 4, 1, 5, 2, 3)).reshape(wl.batch, wl.out_filter,
                                            fout_height, fout_width)
        res_ref = res_ref >> 8
        res_ref += bias_np.reshape(wl.out_filter, 1, 1)
        res_ref = np.clip(res_ref, 0, (1 << env.OUT_WIDTH - 1) - 1)
        res_ref = res_ref.astype(env.out_dtype)
        correct = np.allclose(res_orig, res_ref)

    gops = (num_ops / cost.mean) / float(10**9)
    status = "PASSED" if correct else "FAILED"
    if "arm_cpu" in target.keys:
        device = "CPU"
    elif "vta" in target.keys:
        device = "VTA"
    print("%s CONV2D TEST %s: Time cost = %g sec/op, %g GOPS" %
          (device, status, cost.mean, gops))

    return correct, cost, stats
Esempio n. 2
0
res_ref = res_ref >> env.INP_WIDTH
res_ref = np.clip(res_ref, 0, inp_max)
res_ref = res_ref.astype(res.dtype)
res_ref = res_ref.reshape((
    batch_size // env.BATCH,
    env.BATCH,
    out_channels // env.BLOCK_OUT,
    env.BLOCK_OUT,
    fout_height,
    fout_width,
)).transpose((0, 2, 4, 5, 1, 3))
tvm.testing.assert_allclose(res_ref, res_nd.numpy())

# Print stats
if env.TARGET in ["sim", "tsim"]:
    sim_stats = simulator.stats()
    print("Execution statistics:")
    for k, v in sim_stats.items():
        print("\t{:<16}: {:>16}".format(k, v))

print("Successful 2D convolution test!")

######################################################################
# Summary
# -------
# This tutorial demonstrates how TVM scheduling primitives can be used to
# lower 2D convolution onto hardware accelerator intrinsics, making
# use of hardware specific optimizations, such as latency hiding with
# virtual threading.
#
Esempio n. 3
0
def main(model,
         start_pack,
         stop_pack,
         data_shape=(1, 3, 224, 224),
         dtype='float32'):
    # Make sure that TVM was compiled with RPC=1
    assert tvm.module.enabled("rpc")

    ######################################################################
    # Define the platform and model targets
    # -------------------------------------
    # Execute on CPU vs. VTA, and define the model.

    # Load VTA parameters from the vta/config/vta_config.json file
    env = vta.get_env()

    # Set ``device=arm_cpu`` to run inference on the CPU
    # or ``device=vta`` to run inference on the FPGA.
    device = "vta"
    target = env.target if device == "vta" else env.target_vta_cpu

    # Name of Gluon model to compile
    # The ``start_pack`` and ``stop_pack`` labels indicate where
    # to start and end the graph packing relay pass: in other words
    # where to start and finish offloading to VTA.

    ######################################################################
    # Obtain an execution remote
    # ---------------------------------
    # When target is 'pynq', reconfigure FPGA and runtime.
    # Otherwise, if target is 'sim', execute locally.
    print(f"Target is {env.TARGET}")
    if env.TARGET in ["sim", "tsim"]:
        remote = rpc.LocalSession()
    else:
        print(f"Error, incorrect target for benchmarking: {env.TARGET}")

    # Get execution context from remote
    ctx = remote.ext_dev(0) if device == "vta" else remote.cpu(0)

    ######################################################################
    # Build the inference graph runtime
    # ---------------------------------
    # Grab ResNet-18 model from Gluon model zoo and compile with Relay.
    # The compilation steps are:
    #    1) Front end translation from MxNet into Relay module.
    #    2) Apply 8-bit quantization: here we skip the first conv layer,
    #       and dense layer which will both be executed in fp32 on the CPU.
    #    3) Perform graph packing to alter the data layout for tensorization.
    #    4) Perform constant folding to reduce number of operators (e.g. eliminate
    #       batch norm multiply).
    #    5) Perform relay build to object file.
    #    6) Load the object file onto remote (FPGA device).
    #    7) Generate graph runtime, `m`.

    # Load pre-configured AutoTVM schedules
    with autotvm.tophub.context(target):

        # Populate the shape and data type dictionary for ResNet input
        dtype_dict = {"data": 'float32'}
        shape_dict = {"data": data_shape}

        # Measure build start time
        build_start = time.time()

        # Start front end compilation
        if model == 'resnet':
            mod, params = test_resnet_mxnet(env)
        elif model == 'yolo':
            mod, params = test_yolo_darknet()
        elif model == 'lenet':
            mod, params = lenet()
        elif model == 'mobilenet':
            mod, params = mobilenet()
        else:
            print(f"Error, incorrect model name: {model}")

        ### Need to bind params

        # Update shape and type dictionary
        shape_dict.update({k: v.shape for k, v in params.items()})
        dtype_dict.update({k: str(v.dtype) for k, v in params.items()})
        with relay.quantize.qconfig(global_scale=8.0, skip_conv_layers=[0]):
            relay_prog = relay.quantize.quantize(mod['main'], params=params)

        print(f"Finishing quantizing graph")
        # Perform graph packing and constant folding for VTA target
        if target.device_name == "vta":
            assert env.BLOCK_IN == env.BLOCK_OUT
            relay_prog = graph_pack(relay_prog,
                                    env.BATCH,
                                    env.BLOCK_OUT,
                                    env.WGT_WIDTH,
                                    start_name=start_pack,
                                    stop_name=stop_pack)

        print(f"Finishing packing graph")

        # Compile Relay program with AlterOpLayout disabled
        with relay.build_config(opt_level=3, disabled_pass={"AlterOpLayout"}):
            if target.device_name != "vta":
                graph, lib, params = relay.build(relay_prog,
                                                 target=target,
                                                 params=params,
                                                 target_host=env.target_host)
            else:
                with vta.build_config():
                    graph, lib, params = relay.build(
                        relay_prog,
                        target=target,
                        params=params,
                        target_host=env.target_host)

        # Measure Relay build time
        build_time = time.time() - build_start
        print(model + " inference graph built in {0:.2f}s!".format(build_time))

        # Send the inference library over to the remote RPC server
        temp = util.tempdir()
        lib.save(temp.relpath("graphlib.o"))
        remote.upload(temp.relpath("graphlib.o"))
        lib = remote.load_module("graphlib.o")

        # Graph runtime
        m = graph_runtime.create(graph, lib, ctx)
    #
    # # Set the network parameters and inputs
    data = np.random.uniform(size=data_shape).astype(dtype)

    m.set_input(**params)
    m.set_input('data', tvm.nd.array(data.astype(dtype)))

    # Perform inference and gather execution statistics
    # More on: https://docs.tvm.ai/api/python/module.html#tvm.module.Module.time_evaluator
    num = 1  # number of times we run module for a single measurement
    rep = 1  # number of measurements (we derive std dev from this)
    timer = m.module.time_evaluator("run", ctx, number=num, repeat=rep)

    if env.TARGET in ["sim", "tsim"]:
        simulator.clear_stats()
        timer()
        sim_stats = simulator.stats()
        print("\nExecution statistics:")
        for k, v in sim_stats.items():
            # Since we execute the workload many times, we need to normalize stats
            # Note that there is always one warm up run
            # Therefore we divide the overall stats by (num * rep + 1)
            print("\t{:<16}: {:>16}".format(k, v // (num * rep + 1)))
    else:
        tcost = timer()
        std = np.std(tcost.results) * 1000
        mean = tcost.mean * 1000
        print("\nPerformed inference in %.2fms (std = %.2f) for %d samples" %
              (mean, std, env.BATCH))
        print("Average per sample inference time: %.2fms" % (mean / env.BATCH))
Esempio n. 4
0
        def check_padded_load(pad_before, pad_after, test_name=None):
            # declare
            n = 3
            m = 5
            x = te.placeholder((n, m, env.BATCH, env.BLOCK_OUT),
                               name="x",
                               dtype=env.acc_dtype)
            x_buf = topi.nn.pad(x, pad_before, pad_after, name="y")
            # insert no-op that won't be optimized away
            y_buf = te.compute(
                (
                    n + pad_before[0] + pad_after[0],
                    m + pad_before[1] + pad_after[1],
                    env.BATCH,
                    env.BLOCK_OUT,
                ),
                lambda *i: x_buf(*i) >> 0,
                "y_buf",
            )
            y = te.compute(
                (
                    n + pad_before[0] + pad_after[0],
                    m + pad_before[1] + pad_after[1],
                    env.BATCH,
                    env.BLOCK_OUT,
                ),
                lambda *i: y_buf(*i).astype(env.inp_dtype),
                "y",
            )
            # schedule
            s = te.create_schedule(y.op)
            s[x_buf].set_scope(env.acc_scope)
            s[x_buf].pragma(x_buf.op.axis[0], env.dma_copy)
            s[y_buf].set_scope(env.acc_scope)
            s[y_buf].pragma(y_buf.op.axis[0], env.alu)
            s[y].pragma(y.op.axis[0], env.dma_copy)
            # build
            with vta.build_config():
                mod = vta.build(
                    s, [x, y],
                    tvm.target.Target("ext_dev", host=env.target_host))

            if not remote:
                return
            temp = utils.tempdir()
            mod.save(temp.relpath("padded_load.o"))
            remote.upload(temp.relpath("padded_load.o"))
            f = remote.load_module("padded_load.o")
            # verify
            dev = remote.ext_dev(0)
            x_np = np.random.randint(0,
                                     10,
                                     size=(n, m, env.BATCH,
                                           env.BLOCK_OUT)).astype(x.dtype)
            y_np = np.zeros((
                n + pad_before[0] + pad_after[0],
                m + pad_before[1] + pad_after[1],
                env.BATCH,
                env.BLOCK_OUT,
            )).astype(y.dtype)
            y_np[pad_before[0]:pad_before[0] + n,
                 pad_before[1]:pad_before[1] + m, :] = x_np
            x_nd = tvm.nd.array(x_np, dev)
            y_nd = tvm.nd.empty(y_np.shape, device=dev, dtype=y_np.dtype)

            if env.TARGET in ["sim", "tsim"]:
                simulator.clear_stats()

            f(x_nd, y_nd)

            np.testing.assert_equal(y_np, y_nd.numpy())

            if env.TARGET in ["sim", "tsim"]:
                sim_stats = simulator.stats()
                print("Padded {} load execution statistics:".format(test_name))
                for k, v in sim_stats.items():
                    print("\t{:<16}: {:>16}".format(k, v))
Esempio n. 5
0
    def _run(env, remote):
        m = 2
        n = 8
        imm_shift = np.random.randint(0, 8)
        imm_scale = np.random.randint(1, 5)
        # compute
        a = te.placeholder((m, n, env.BATCH, env.BLOCK_OUT),
                           name="a",
                           dtype=env.acc_dtype)
        a_buf = te.compute((m, n, env.BATCH, env.BLOCK_OUT), lambda *i: a(*i),
                           "a_buf")  # DRAM->SRAM
        res_shift = te.compute((m, n, env.BATCH, env.BLOCK_OUT),
                               lambda *i: a_buf(*i) + imm_shift,
                               "res_shift")  # compute
        res_scale = te.compute((m, n, env.BATCH, env.BLOCK_OUT),
                               lambda *i: res_shift(*i) >> imm_scale,
                               "res_scale")  # compute
        res = te.compute((m, n, env.BATCH, env.BLOCK_OUT),
                         lambda *i: res_scale(*i).astype(env.inp_dtype),
                         "res")  # SRAM->DRAM
        # schedule
        s = te.create_schedule(res.op)
        s[a_buf].set_scope(env.acc_scope)  # SRAM
        s[res_shift].set_scope(env.acc_scope)  # SRAM
        s[res_scale].set_scope(env.acc_scope)  # SRAM
        s[a_buf].pragma(a_buf.op.axis[0], env.dma_copy)  # DRAM->SRAM
        s[res_shift].pragma(res_shift.op.axis[0], env.alu)  # compute
        s[res_scale].pragma(res_scale.op.axis[0], env.alu)  # compute
        s[res].pragma(res.op.axis[0], env.dma_copy)  # SRAM->DRAM
        # build
        mod = vta.build(s, [a, res],
                        tvm.target.Target("ext_dev", host=env.target_host))
        if not remote:
            return
        temp = utils.tempdir()
        mod.save(temp.relpath("load_act.o"))
        remote.upload(temp.relpath("load_act.o"))
        f = remote.load_module("load_act.o")
        # verify
        dev = remote.ext_dev(0)
        a_np = np.random.randint(-10,
                                 10,
                                 size=(m, n, env.BATCH,
                                       env.BLOCK_OUT)).astype(a.dtype)
        res_np = np.right_shift((a_np + imm_shift), imm_scale)
        res_np = res_np.astype(res.dtype)
        a_nd = tvm.nd.array(a_np, dev)
        res_nd = tvm.nd.array(
            np.zeros((m, n, env.BATCH, env.BLOCK_OUT)).astype(res.dtype), dev)

        if env.TARGET in ["sim", "tsim"]:
            simulator.clear_stats()

        f(a_nd, res_nd)

        np.testing.assert_equal(res_np, res_nd.numpy())

        if env.TARGET in ["sim", "tsim"]:
            sim_stats = simulator.stats()
            print("Shift and scale execution statistics:")
            for k, v in sim_stats.items():
                print("\t{:<16}: {:>16}".format(k, v))
Esempio n. 6
0
    def _run(env, remote):
        m = 8
        n = 10
        # compute
        a = te.placeholder((m, n, env.BATCH, env.BLOCK_OUT),
                           name="a",
                           dtype=env.acc_dtype)
        a_buf = te.compute((m, n, env.BATCH, env.BLOCK_OUT), lambda *i: a(*i),
                           "a_buf")  # DRAM->SRAM
        max_buf = te.compute((m, n, env.BATCH, env.BLOCK_OUT),
                             lambda *i: tvm.te.max(a_buf(*i), 0),
                             "res_buf")  # relu
        min_buf = te.compute(
            (m, n, env.BATCH, env.BLOCK_OUT),
            lambda *i: tvm.te.min(max_buf(*i), (1 << (env.INP_WIDTH - 1)) - 1),
            "max_buf",
        )  # relu
        res = te.compute(
            (m, n, env.BATCH, env.BLOCK_OUT),
            lambda *i: min_buf(*i).astype(env.inp_dtype),
            "min_buf",
        )  # SRAM->DRAM
        # schedule
        s = te.create_schedule(res.op)
        s[a_buf].set_scope(env.acc_scope)  # SRAM
        s[a_buf].pragma(a_buf.op.axis[0], env.dma_copy)  # DRAM->SRAM
        s[max_buf].set_scope(env.acc_scope)  # SRAM
        s[min_buf].set_scope(env.acc_scope)  # SRAM
        s[max_buf].pragma(max_buf.op.axis[0], env.alu)  # compute
        s[min_buf].pragma(min_buf.op.axis[0], env.alu)  # compute
        s[res].pragma(res.op.axis[0], env.dma_copy)  # SRAM->DRAM
        # build
        with vta.build_config():
            mod = vta.build(s, [a, res],
                            tvm.target.Target("ext_dev", host=env.target_host))
        if not remote:
            return
        temp = utils.tempdir()
        mod.save(temp.relpath("load_act.o"))
        remote.upload(temp.relpath("load_act.o"))
        f = remote.load_module("load_act.o")
        # verify
        dev = remote.ext_dev(0)
        a_np = np.random.randint(-256,
                                 256,
                                 size=(m, n, env.BATCH,
                                       env.BLOCK_OUT)).astype(a.dtype)
        res_np = np.clip(a_np, 0, (1 <<
                                   (env.INP_WIDTH - 1)) - 1).astype(res.dtype)
        a_nd = tvm.nd.array(a_np, dev)
        res_nd = tvm.nd.array(
            np.zeros((m, n, env.BATCH, env.BLOCK_OUT)).astype(res.dtype), dev)

        if env.TARGET in ["sim", "tsim"]:
            simulator.clear_stats()

        f(a_nd, res_nd)

        np.testing.assert_equal(res_np, res_nd.numpy())

        if env.TARGET in ["sim", "tsim"]:
            sim_stats = simulator.stats()
            print("Relu execution statistics:")
            for k, v in sim_stats.items():
                print("\t{:<16}: {:>16}".format(k, v))
Esempio n. 7
0
        def check_alu(tvm_op, np_op=None, use_imm=False, test_name=None):
            """Test ALU"""
            m = 8
            n = 8
            imm = np.random.randint(1, 5)
            # compute
            a = te.placeholder((m, n, env.BATCH, env.BLOCK_OUT),
                               name="a",
                               dtype=env.acc_dtype)
            a_buf = te.compute((m, n, env.BATCH, env.BLOCK_OUT),
                               lambda *i: a(*i), "a_buf")  # DRAM->SRAM
            if use_imm:
                res_buf = te.compute((m, n, env.BATCH, env.BLOCK_OUT),
                                     lambda *i: tvm_op(a_buf(*i), imm),
                                     "res_buf")  # compute
            else:
                b = te.placeholder((m, n, env.BATCH, env.BLOCK_OUT),
                                   name="b",
                                   dtype=env.acc_dtype)
                b_buf = te.compute((m, n, env.BATCH, env.BLOCK_OUT),
                                   lambda *i: b(*i), "b_buf")  # DRAM->SRAM
                res_buf = te.compute(
                    (m, n, env.BATCH, env.BLOCK_OUT),
                    lambda *i: tvm_op(a_buf(*i), b_buf(*i)),
                    "res_buf",
                )  # compute5B
            res = te.compute(
                (m, n, env.BATCH, env.BLOCK_OUT),
                lambda *i: res_buf(*i).astype(env.inp_dtype),
                "res",
            )  # SRAM->DRAM
            # schedule
            s = te.create_schedule(res.op)
            s[a_buf].set_scope(env.acc_scope)  # SRAM
            s[a_buf].pragma(a_buf.op.axis[0], env.dma_copy)  # DRAM->SRAM
            s[res_buf].set_scope(env.acc_scope)  # SRAM
            s[res_buf].pragma(res_buf.op.axis[0], env.alu)  # compute
            s[res].pragma(res.op.axis[0], env.dma_copy)  # SRAM->DRAM
            if not use_imm:
                s[b_buf].set_scope(env.acc_scope)  # SRAM
                s[b_buf].pragma(b_buf.op.axis[0], env.dma_copy)  # DRAM->SRAM

            if not remote:
                return

            # build
            with vta.build_config():
                if use_imm:
                    mod = vta.build(
                        s, [a, res],
                        tvm.target.Target("ext_dev", host=env.target_host))
                else:
                    mod = vta.build(
                        s, [a, b, res],
                        tvm.target.Target("ext_dev", host=env.target_host))
            temp = utils.tempdir()
            mod.save(temp.relpath("load_act.o"))
            remote.upload(temp.relpath("load_act.o"))
            f = remote.load_module("load_act.o")
            # verify
            dev = remote.ext_dev(0)
            a_np = np.random.randint(-16,
                                     16,
                                     size=(m, n, env.BATCH,
                                           env.BLOCK_OUT)).astype(a.dtype)
            if use_imm:
                res_np = np_op(a_np, imm) if np_op else tvm_op(a_np, imm)
            else:
                b_np = np.random.randint(-16,
                                         16,
                                         size=(m, n, env.BATCH,
                                               env.BLOCK_OUT)).astype(b.dtype)
                res_np = np_op(a_np, b_np) if np_op else tvm_op(a_np, b_np)
            res_np = res_np.astype(res.dtype)
            a_nd = tvm.nd.array(a_np, dev)
            res_nd = tvm.nd.array(
                np.zeros((m, n, env.BATCH, env.BLOCK_OUT)).astype(res.dtype),
                dev)

            if env.TARGET in ["sim", "tsim"]:
                simulator.clear_stats()

            if use_imm:
                f(a_nd, res_nd)
            else:
                b_nd = tvm.nd.array(b_np, dev)
                f(a_nd, b_nd, res_nd)

            np.testing.assert_equal(res_np, res_nd.numpy())

            if env.TARGET in ["sim", "tsim"]:
                sim_stats = simulator.stats()
                print("ALU {} execution statistics:".format(test_name))
                for k, v in sim_stats.items():
                    print("\t{:<16}: {:>16}".format(k, v))