Example #1
def _build_func_common(measure_input,
                       runtime=None,
                       check_gpu=None,
                       build_option=None):
    """Common part for building a configuration"""
    target, task, config = measure_input
    target, task.target_host = Target.check_and_update_host_consist(
        target, task.target_host)

    with target:
        s, args = task.instantiate(config)

        # check the validity of the template and code hash consistency
        if not config.valid():
            raise InstantiationError(config.errors)

        opts = build_option or {}
        if check_gpu:  # Add verify pass to filter out invalid configs in advance.
            opts["tir.add_lower_pass"] = [(2, gpu_verify_pass(**check_gpu))]

        # if target is vta, we need to use vta build
        if (hasattr(measure_input.target, "device_name")
                and measure_input.target.device_name == "vta"):
            # pylint: disable=import-outside-toplevel
            import vta

            func = vta.build(s, args, target_host=task.target_host)
        else:
            with tvm.ir.transform.PassContext(config=opts):
                func = build(s,
                             args,
                             target_host=task.target_host,
                             runtime=runtime)
    return func, tuple((get_const_tuple(x.shape), x.dtype) for x in args)
Example #2
    def _run(env, remote):
        # declare
        n = 21
        m = 20
        pad_before = [0, 1, 0, 0]
        pad_after = [1, 3, 0, 0]
        x = tvm.placeholder((n, m, env.BATCH, env.BLOCK_OUT),
                            name="x",
                            dtype=env.acc_dtype)
        x_buf = topi.nn.pad(x, pad_before, pad_after, name="y")
        # insert no-op that won't be optimized away
        y_buf = tvm.compute(
            (n + pad_before[0] + pad_after[0],
             m + pad_before[1] + pad_after[1], env.BATCH, env.BLOCK_OUT),
            lambda *i: x_buf(*i) >> 0, "y_buf")
        y = tvm.compute(
            (n + pad_before[0] + pad_after[0],
             m + pad_before[1] + pad_after[1], env.BATCH, env.BLOCK_OUT),
            lambda *i: y_buf(*i).astype(env.inp_dtype), "y")
        # schedule
        s = tvm.create_schedule(y.op)
        s[x_buf].set_scope(env.acc_scope)
        s[x_buf].pragma(x_buf.op.axis[0], env.dma_copy)
        s[y_buf].set_scope(env.acc_scope)
        s[y_buf].pragma(y_buf.op.axis[0], env.alu)
        s[y].pragma(y.op.axis[0], env.dma_copy)
        # build
        with vta.build_config():
            mod = vta.build(s, [x, y], "ext_dev", env.target_host)

        if not remote:
            return
        temp = util.tempdir()
        mod.save(temp.relpath("padded_load.o"))
        remote.upload(temp.relpath("padded_load.o"))
        f = remote.load_module("padded_load.o")
        # verify
        ctx = remote.ext_dev(0)
        x_np = np.random.randint(1, 2, size=(n, m, env.BATCH,
                                             env.BLOCK_OUT)).astype(x.dtype)
        y_np = np.zeros((n + pad_before[0] + pad_after[0],
                         m + pad_before[1] + pad_after[1], env.BATCH,
                         env.BLOCK_OUT)).astype(y.dtype)
        y_np[pad_before[0]:pad_before[0] + n,
             pad_before[1]:pad_before[1] + m, :] = x_np
        x_nd = tvm.nd.array(x_np, ctx)
        y_nd = tvm.nd.empty(y_np.shape, ctx=ctx, dtype=y_np.dtype)

        if env.TARGET in ["sim", "tsim"]:
            simulator.clear_stats()

        f(x_nd, y_nd)

        np.testing.assert_equal(y_np, y_nd.asnumpy())

        if env.TARGET in ["sim", "tsim"]:
            sim_stats = simulator.stats()
            print("Padded load execution statistics:")
            for k, v in sim_stats.items():
                print("\t{:<16}: {:>16}".format(k, v))
Example #3
def _build_func_common(measure_input,
                       check_gpu=None,
                       cuda_arch=None,
                       build_option=None):
    """Common part for building a configuration"""
    target, task, config = measure_input
    with target:
        s, args = task.instantiate(config)

        # check the validity of the template and code hash consistency
        if not config.valid():
            raise InstantiationError(config.errors)

        opts = build_option or {}
        if check_gpu:  # Add verify pass to filter out invalid configs in advance.
            opts["add_lower_pass"] = [(2, gpu_verify_pass(**check_gpu))]
        if cuda_arch:
            set_cuda_target_arch(cuda_arch)

        # if target is vta, we need to use vta build
        if hasattr(measure_input.target, 'device_name') and \
            measure_input.target.device_name == 'vta':
            # pylint: disable=import-outside-toplevel
            import vta
            func = vta.build(s, args, target_host=task.target_host)
        else:
            with build_config(**opts):
                func = build(s, args, target_host=task.target_host)
    return func, tuple((get_const_tuple(x.shape), x.dtype) for x in args)
Example #4
        def verify(s):
            mod = vta.build(s, [x, w, y], "ext_dev", env.target_host)
            temp = util.tempdir()
            mod.save(temp.relpath("gemm.o"))
            remote.upload(temp.relpath("gemm.o"))
            f = remote.load_module("gemm.o")
            # verify
            ctx = remote.ext_dev(0)
            x_np = np.random.randint(
                -128, 128, size=(o, n, env.BATCH, env.BLOCK_IN)).astype(x.dtype)
            w_np = np.random.randint(
                -128, 128, size=(m, n, env.BLOCK_OUT, env.BLOCK_IN)).astype(w.dtype)
            y_np = np.zeros((o, m, env.BATCH, env.BLOCK_OUT)).astype(y.dtype)
            x_nd = tvm.nd.array(x_np, ctx)
            w_nd = tvm.nd.array(w_np, ctx)
            y_nd = tvm.nd.array(y_np, ctx)
            y_np = y_np.astype(env.acc_dtype)
            for b in range(o):
                for i in range(m):
                    for j in range(n):
                        y_np[b,i,:] += np.dot(x_np[b,j,:].astype(env.acc_dtype),
                                              w_np[i,j].T.astype(env.acc_dtype))
            y_np = np.right_shift(y_np, 8)
            y_np = np.clip(y_np, 0, (1<<(env.INP_WIDTH-1))-1).astype(y.dtype)

            if env.TARGET == "sim":
                simulator.clear_stats()
                f(x_nd, w_nd, y_nd)
                print(simulator.stats())
            else:
                f(x_nd, w_nd, y_nd)

            np.testing.assert_equal(y_np, y_nd.asnumpy())
Example #5
    def _run(env, remote):
        m = 8
        n = 10
        # compute
        a = tvm.placeholder((m, n, env.BATCH, env.BLOCK_OUT),
                            name="a",
                            dtype=env.acc_dtype)
        a_buf = tvm.compute((m, n, env.BATCH, env.BLOCK_OUT), lambda *i: a(*i),
                            "a_buf")  # DRAM->SRAM
        max_buf = tvm.compute((m, n, env.BATCH, env.BLOCK_OUT),
                              lambda *i: tvm.max(a_buf(*i), 0),
                              "res_buf")  # relu
        min_buf = tvm.compute((m, n, env.BATCH, env.BLOCK_OUT),
                              lambda *i: tvm.min(max_buf(*i),
                                                 (1 <<
                                                  (env.INP_WIDTH - 1)) - 1),
                              "max_buf")  # relu
        res = tvm.compute((m, n, env.BATCH, env.BLOCK_OUT),
                          lambda *i: min_buf(*i).astype(env.inp_dtype),
                          "min_buf")  # SRAM->DRAM
        # schedule
        s = tvm.create_schedule(res.op)
        s[a_buf].set_scope(env.acc_scope)  # SRAM
        s[a_buf].pragma(a_buf.op.axis[0], env.dma_copy)  # DRAM->SRAM
        s[max_buf].set_scope(env.acc_scope)  # SRAM
        s[min_buf].set_scope(env.acc_scope)  # SRAM
        s[max_buf].pragma(max_buf.op.axis[0], env.alu)  # compute
        s[min_buf].pragma(min_buf.op.axis[0], env.alu)  # compute
        s[res].pragma(res.op.axis[0], env.dma_copy)  # SRAM->DRAM
        # build
        with vta.build_config():
            mod = vta.build(s, [a, res], "ext_dev", env.target_host)
        if not remote:
            return
        temp = util.tempdir()
        mod.save(temp.relpath("load_act.o"))
        remote.upload(temp.relpath("load_act.o"))
        f = remote.load_module("load_act.o")
        # verify
        ctx = remote.ext_dev(0)
        a_np = np.random.randint(-256,
                                 256,
                                 size=(m, n, env.BATCH,
                                       env.BLOCK_OUT)).astype(a.dtype)
        res_np = np.clip(a_np, 0, (1 <<
                                   (env.INP_WIDTH - 1)) - 1).astype(res.dtype)
        a_nd = tvm.nd.array(a_np, ctx)
        res_nd = tvm.nd.array(
            np.zeros((m, n, env.BATCH, env.BLOCK_OUT)).astype(res.dtype), ctx)

        if env.TARGET == "tsim":
            simulator.tsim_init("libvta_hw")

        f(a_nd, res_nd)

        np.testing.assert_equal(res_np, res_nd.asnumpy())

        if env.TARGET == "tsim":
            print("Relu test took {} clock cycles".format(
                simulator.tsim_cycles()))
Example #6
    def _run(env, remote):
        m = 2
        n = 8
        imm_shift = np.random.randint(0, 8)
        imm_scale = np.random.randint(1, 5)
        # compute
        a = tvm.placeholder((m, n, env.BATCH, env.BLOCK_OUT),
                            name="a",
                            dtype=env.acc_dtype)
        a_buf = tvm.compute((m, n, env.BATCH, env.BLOCK_OUT), lambda *i: a(*i),
                            "a_buf")  # DRAM->SRAM
        res_shift = tvm.compute((m, n, env.BATCH, env.BLOCK_OUT),
                                lambda *i: a_buf(*i) + imm_shift,
                                "res_shift")  # compute
        res_scale = tvm.compute((m, n, env.BATCH, env.BLOCK_OUT),
                                lambda *i: res_shift(*i) >> imm_scale,
                                "res_scale")  # compute
        res = tvm.compute((m, n, env.BATCH, env.BLOCK_OUT),
                          lambda *i: res_scale(*i).astype(env.inp_dtype),
                          "res")  # SRAM->DRAM
        # schedule
        s = tvm.create_schedule(res.op)
        s[a_buf].set_scope(env.acc_scope)  # SRAM
        s[res_shift].set_scope(env.acc_scope)  # SRAM
        s[res_scale].set_scope(env.acc_scope)  # SRAM
        s[a_buf].pragma(a_buf.op.axis[0], env.dma_copy)  # DRAM->SRAM
        s[res_shift].pragma(res_shift.op.axis[0], env.alu)  # compute
        s[res_scale].pragma(res_scale.op.axis[0], env.alu)  # compute
        s[res].pragma(res.op.axis[0], env.dma_copy)  # SRAM->DRAM
        # build
        mod = vta.build(s, [a, res], "ext_dev", env.target_host)
        if not remote:
            return
        temp = util.tempdir()
        mod.save(temp.relpath("load_act.o"))
        remote.upload(temp.relpath("load_act.o"))
        f = remote.load_module("load_act.o")
        # verify
        ctx = remote.ext_dev(0)
        a_np = np.random.randint(-10,
                                 10,
                                 size=(m, n, env.BATCH,
                                       env.BLOCK_OUT)).astype(a.dtype)
        res_np = np.right_shift((a_np + imm_shift), imm_scale)
        res_np = res_np.astype(res.dtype)
        a_nd = tvm.nd.array(a_np, ctx)
        res_nd = tvm.nd.array(
            np.zeros((m, n, env.BATCH, env.BLOCK_OUT)).astype(res.dtype), ctx)

        if env.TARGET == "tsim":
            simulator.tsim_init("libvta_hw")

        f(a_nd, res_nd)

        np.testing.assert_equal(res_np, res_nd.asnumpy())

        if env.TARGET == "tsim":
            print("Shift/scale test took {} clock cycles".format(
                simulator.tsim_cycles()))
Example #7
    def _run(env, remote):
        m = 8
        n = 10
        # compute
        a = te.placeholder((m, n, env.BATCH, env.BLOCK_OUT), name="a", dtype=env.acc_dtype)
        a_buf = te.compute(
            (m, n, env.BATCH, env.BLOCK_OUT), lambda *i: a(*i), "a_buf"
        )  # DRAM->SRAM
        max_buf = te.compute(
            (m, n, env.BATCH, env.BLOCK_OUT), lambda *i: tvm.te.max(a_buf(*i), 0), "res_buf"
        )  # relu
        min_buf = te.compute(
            (m, n, env.BATCH, env.BLOCK_OUT),
            lambda *i: tvm.te.min(max_buf(*i), (1 << (env.INP_WIDTH - 1)) - 1),
            "max_buf",
        )  # relu
        res = te.compute(
            (m, n, env.BATCH, env.BLOCK_OUT),
            lambda *i: min_buf(*i).astype(env.inp_dtype),
            "min_buf",
        )  # SRAM->DRAM
        # schedule
        s = te.create_schedule(res.op)
        s[a_buf].set_scope(env.acc_scope)  # SRAM
        s[a_buf].pragma(a_buf.op.axis[0], env.dma_copy)  # DRAM->SRAM
        s[max_buf].set_scope(env.acc_scope)  # SRAM
        s[min_buf].set_scope(env.acc_scope)  # SRAM
        s[max_buf].pragma(max_buf.op.axis[0], env.alu)  # compute
        s[min_buf].pragma(min_buf.op.axis[0], env.alu)  # compute
        s[res].pragma(res.op.axis[0], env.dma_copy)  # SRAM->DRAM
        # build
        with vta.build_config():
            mod = vta.build(s, [a, res], "ext_dev", env.target_host)
        if not remote:
            return
        temp = utils.tempdir()
        mod.save(temp.relpath("load_act.o"))
        remote.upload(temp.relpath("load_act.o"))
        f = remote.load_module("load_act.o")
        # verify
        dev = remote.ext_dev(0)
        a_np = np.random.randint(-256, 256, size=(m, n, env.BATCH, env.BLOCK_OUT)).astype(a.dtype)
        res_np = np.clip(a_np, 0, (1 << (env.INP_WIDTH - 1)) - 1).astype(res.dtype)
        a_nd = tvm.nd.array(a_np, dev)
        res_nd = tvm.nd.array(np.zeros((m, n, env.BATCH, env.BLOCK_OUT)).astype(res.dtype), dev)

        if env.TARGET in ["sim", "tsim"]:
            simulator.clear_stats()

        f(a_nd, res_nd)

        np.testing.assert_equal(res_np, res_nd.numpy())

        if env.TARGET in ["sim", "tsim"]:
            sim_stats = simulator.stats()
            print("Relu execution statistics:")
            for k, v in sim_stats.items():
                print("\t{:<16}: {:>16}".format(k, v))
Example #8
    def _run(env, remote):
        m = 2
        n = 8
        imm_shift = np.random.randint(0, 8)
        imm_scale = np.random.randint(1, 5)
        # compute
        a = te.placeholder((m, n, env.BATCH, env.BLOCK_OUT), name="a", dtype=env.acc_dtype)
        a_buf = te.compute(
            (m, n, env.BATCH, env.BLOCK_OUT), lambda *i: a(*i), "a_buf"
        )  # DRAM->SRAM
        res_shift = te.compute(
            (m, n, env.BATCH, env.BLOCK_OUT), lambda *i: a_buf(*i) + imm_shift, "res_shift"
        )  # compute
        res_scale = te.compute(
            (m, n, env.BATCH, env.BLOCK_OUT), lambda *i: res_shift(*i) >> imm_scale, "res_scale"
        )  # compute
        res = te.compute(
            (m, n, env.BATCH, env.BLOCK_OUT), lambda *i: res_scale(*i).astype(env.inp_dtype), "res"
        )  # SRAM->DRAM
        # schedule
        s = te.create_schedule(res.op)
        s[a_buf].set_scope(env.acc_scope)  # SRAM
        s[res_shift].set_scope(env.acc_scope)  # SRAM
        s[res_scale].set_scope(env.acc_scope)  # SRAM
        s[a_buf].pragma(a_buf.op.axis[0], env.dma_copy)  # DRAM->SRAM
        s[res_shift].pragma(res_shift.op.axis[0], env.alu)  # compute
        s[res_scale].pragma(res_scale.op.axis[0], env.alu)  # compute
        s[res].pragma(res.op.axis[0], env.dma_copy)  # SRAM->DRAM
        # build
        mod = vta.build(s, [a, res], "ext_dev", env.target_host)
        if not remote:
            return
        temp = utils.tempdir()
        mod.save(temp.relpath("load_act.o"))
        remote.upload(temp.relpath("load_act.o"))
        f = remote.load_module("load_act.o")
        # verify
        dev = remote.ext_dev(0)
        a_np = np.random.randint(-10, 10, size=(m, n, env.BATCH, env.BLOCK_OUT)).astype(a.dtype)
        res_np = np.right_shift((a_np + imm_shift), imm_scale)
        res_np = res_np.astype(res.dtype)
        a_nd = tvm.nd.array(a_np, dev)
        res_nd = tvm.nd.array(np.zeros((m, n, env.BATCH, env.BLOCK_OUT)).astype(res.dtype), dev)

        if env.TARGET in ["sim", "tsim"]:
            simulator.clear_stats()

        f(a_nd, res_nd)

        np.testing.assert_equal(res_np, res_nd.numpy())

        if env.TARGET in ["sim", "tsim"]:
            sim_stats = simulator.stats()
            print("Shift and scale execution statistics:")
            for k, v in sim_stats.items():
                print("\t{:<16}: {:>16}".format(k, v))
Example #9
    def _run(env, remote):
        n = 6
        x = tvm.placeholder(
            (n, n, env.BATCH, env.BLOCK_OUT),
            name="x",
            dtype=env.acc_dtype)
        x_buf = tvm.compute(
            (n, n, env.BATCH, env.BLOCK_OUT),
            lambda *i: x(*i), "x_buf")
        # insert no-op that won't be optimized away
        y_buf = tvm.compute(
            (n, n, env.BATCH, env.BLOCK_OUT),
            lambda *i: x_buf(*i)>>0, "y_buf")
        y = tvm.compute(
            (n, n, env.BATCH, env.BLOCK_OUT),
            lambda *i: y_buf(*i).astype(env.inp_dtype), "y")
        # schedule
        s = tvm.create_schedule(y.op)
        s[x_buf].set_scope(env.acc_scope)
        s[x_buf].pragma(x_buf.op.axis[0], env.dma_copy)
        s[y_buf].set_scope(env.acc_scope)
        s[y_buf].pragma(y_buf.op.axis[0], env.alu)
        s[y].pragma(y.op.axis[0], env.dma_copy)

        # verification
        with vta.build_config():
            m = vta.build(s, [x, y], "ext_dev", env.target_host)

        if not remote:
            return
        temp = util.tempdir()
        m.save(temp.relpath("load_act.o"))
        remote.upload(temp.relpath("load_act.o"))
        f = remote.load_module("load_act.o")
        # verify
        ctx = remote.ext_dev(0)
        x_np = np.random.randint(
            1, 10, size=(n, n, env.BATCH, env.BLOCK_OUT)).astype(x.dtype)
        y_np = x_np.astype(y.dtype)
        x_nd = tvm.nd.array(x_np, ctx)
        y_nd = tvm.nd.empty(y_np.shape, ctx=ctx, dtype=y_np.dtype)

        if env.TARGET in ["sim", "tsim"]:
            simulator.clear_stats()

        f(x_nd, y_nd)

        np.testing.assert_equal(y_np, y_nd.asnumpy())

        if env.TARGET in ["sim", "tsim"]:
            sim_stats = simulator.stats()
            print("Save load execution statistics:")
            for k, v in sim_stats.items():
                print("\t{:<16}: {:>16}".format(k, v))
Example #10
        def verify(s, check_correctness):
            mod = vta.build(s, [data, kernel, bias, res],
                            "ext_dev",
                            env.target_host,
                            name="conv2d")
            temp = util.tempdir()

            mod.save(temp.relpath("conv2d.o"))
            remote.upload(temp.relpath("conv2d.o"))
            f = remote.load_module("conv2d.o")
            # verify
            ctx = remote.ext_dev(0)
            # Data in original format
            data_orig, kernel_orig, res_ref = get_ref_data()
            bias_orig = (np.random.uniform(size=(wl.out_filter, )) *
                         4).astype("int32")
            bias_orig = np.abs(bias_orig)

            data_packed = data_orig.reshape(batch_size // env.BATCH, env.BATCH,
                                            wl.in_filter // env.BLOCK_IN,
                                            env.BLOCK_IN, wl.height,
                                            wl.width).transpose(
                                                (0, 2, 4, 5, 1, 3))
            kernel_packed = kernel_orig.reshape(wl.out_filter // env.BLOCK_OUT,
                                                env.BLOCK_OUT,
                                                wl.in_filter // env.BLOCK_IN,
                                                env.BLOCK_IN, wl.hkernel,
                                                wl.wkernel).transpose(
                                                    (0, 2, 4, 5, 1, 3))
            bias_packed = bias_orig.reshape(1, wl.out_filter // env.BLOCK_OUT,
                                            1, 1, env.BATCH, env.BLOCK_OUT)
            res_shape = topi.util.get_const_tuple(res.shape)

            res_np = np.zeros(res_shape).astype(res.dtype)
            data_arr = tvm.nd.array(data_packed, ctx)
            kernel_arr = tvm.nd.array(kernel_packed, ctx)
            bias_arr = tvm.nd.array(bias_packed, ctx)
            res_arr = tvm.nd.array(res_np, ctx)
            time_f = f.time_evaluator("conv2d", ctx, number=5)
            cost = time_f(data_arr, kernel_arr, bias_arr, res_arr)
            res_unpack = res_arr.asnumpy().transpose(
                (0, 4, 1, 5, 2, 3)).reshape(batch_size, wl.out_filter,
                                            fout_height, fout_width)
            if check_correctness:
                assert wl.hpad == wl.wpad
                stride = (wl.hstride, wl.wstride)
                padding = wl.hpad
                res_ref = res_ref >> 8
                res_ref += bias_orig.reshape(wl.out_filter, 1, 1)
                res_ref = np.clip(res_ref, 0, 127).astype("int8")
                tvm.testing.assert_allclose(res_unpack, res_ref)
            return cost
Example #11
def _build_func_common(measure_input,
                       runtime=None,
                       check_gpu=None,
                       build_option=None):
    """Common part for building a configuration"""
    target, task, config = measure_input
    target, task.target_host = Target.canon_target_and_host(
        target, task.target_host)

    with target:
        s, args = task.instantiate(config)

        # check the validity of the template and code hash consistency
        if not config.valid():
            raise InstantiationError(config.errors)

        # if target is vta, we need to use vta build
        if (hasattr(measure_input.target, "device_name")
                and measure_input.target.device_name == "vta"):
            # pylint: disable=import-outside-toplevel
            import vta

            func = vta.build(s, args, target_host=task.target_host)
        else:
            current_pass_context: tvm.ir.transform.PassContext = (
                tvm.ir.transform.PassContext.current())
            current_config = dict(current_pass_context.config)
            if build_option is not None:
                current_config.update(build_option)

            if "tir.add_lower_pass" in current_config:
                current_add_lower_pass = list(
                    current_config["tir.add_lower_pass"])
            else:
                current_add_lower_pass = []
            if check_gpu:
                current_add_lower_pass.append(
                    (2, gpu_verify_pass(**check_gpu)))
            current_config["tir.add_lower_pass"] = current_add_lower_pass

            with tvm.ir.transform.PassContext(
                    opt_level=current_pass_context.opt_level,
                    required_pass=current_pass_context.required_pass,
                    disabled_pass=current_pass_context.disabled_pass,
                    instruments=current_pass_context.instruments,
                    config=current_config,
            ):
                func = build(s,
                             args,
                             target_host=task.target_host,
                             runtime=runtime)
    return func, tuple((get_const_tuple(x.shape), x.dtype) for x in args)
Example #12
        def verify(s, name=None):
            # Build with the CSE pass disabled as otherwise it would complicate the test
            with vta.build_config(disabled_pass={"tir.CommonSubexprElimTIR"}):
                mod = vta.build(
                    s, [x, w, y],
                    tvm.target.Target("ext_dev", host=env.target_host))
            temp = utils.tempdir()
            mod.save(temp.relpath("gemm.o"))
            remote.upload(temp.relpath("gemm.o"))
            f = remote.load_module("gemm.o")
            # verify
            dev = remote.ext_dev(0)
            x_np = np.random.randint(-128,
                                     128,
                                     size=(o, n, env.BATCH,
                                           env.BLOCK_IN)).astype(x.dtype)
            w_np = np.random.randint(-128,
                                     128,
                                     size=(m, n, env.BLOCK_OUT,
                                           env.BLOCK_IN)).astype(w.dtype)
            y_np = np.zeros((o, m, env.BATCH, env.BLOCK_OUT)).astype(y.dtype)
            x_nd = tvm.nd.array(x_np, dev)
            w_nd = tvm.nd.array(w_np, dev)
            y_nd = tvm.nd.array(y_np, dev)
            y_np = y_np.astype(env.acc_dtype)
            for b in range(o):
                for i in range(m):
                    for j in range(n):
                        y_np[b, i, :] += np.dot(
                            x_np[b, j, :].astype(env.acc_dtype),
                            w_np[i, j].T.astype(env.acc_dtype))
            y_np = np.right_shift(y_np, 8)
            y_np = np.clip(y_np, 0, (1 <<
                                     (env.INP_WIDTH - 1)) - 1).astype(y.dtype)

            if env.TARGET in ["sim", "tsim"]:
                simulator.clear_stats()

            f(x_nd, w_nd, y_nd)

            np.testing.assert_equal(y_np, y_nd.numpy())

            if env.TARGET in ["sim", "tsim"]:
                sim_stats = simulator.stats()
                print("GEMM schedule:{} execution statistics:".format(name))
                for k, v in sim_stats.items():
                    print("\t{:<16}: {:>16}".format(k, v))
Example #13
        def verify(s, check_correctness):
            mod = vta.build(s, [data, kernel, bias, res], "ext_dev",
                            env.target_host, name="conv2d")
            temp = util.tempdir()

            mod.save(temp.relpath("conv2d.o"))
            remote.upload(temp.relpath("conv2d.o"))
            f = remote.load_module("conv2d.o")
            # verify
            ctx = remote.ext_dev(0)
            # Data in original format
            data_orig, kernel_orig, res_ref = get_ref_data()
            bias_orig = (np.random.uniform(size=(wl.out_filter,)) * 4).astype("int32")
            bias_orig = np.abs(bias_orig)

            data_packed = data_orig.reshape(
                batch_size//env.BATCH, env.BATCH,
                wl.in_filter//env.BLOCK_IN, env.BLOCK_IN,
                wl.height, wl.width).transpose((0, 2, 4, 5, 1, 3))
            kernel_packed = kernel_orig.reshape(
                wl.out_filter//env.BLOCK_OUT, env.BLOCK_OUT,
                wl.in_filter//env.BLOCK_IN, env.BLOCK_IN,
                wl.hkernel, wl.wkernel).transpose((0, 2, 4, 5, 1, 3))
            bias_packed = bias_orig.reshape(
                1, wl.out_filter // env.BLOCK_OUT, 1, 1, env.BATCH, env.BLOCK_OUT)
            res_shape = topi.util.get_const_tuple(res.shape)

            res_np = np.zeros(res_shape).astype(res.dtype)
            data_arr = tvm.nd.array(data_packed, ctx)
            kernel_arr = tvm.nd.array(kernel_packed, ctx)
            bias_arr = tvm.nd.array(bias_packed, ctx)
            res_arr = tvm.nd.array(res_np, ctx)
            time_f = f.time_evaluator("conv2d", ctx, number=5)
            cost = time_f(data_arr, kernel_arr, bias_arr, res_arr)
            res_unpack = res_arr.asnumpy().transpose(
                (0, 4, 1, 5, 2, 3)).reshape(batch_size, wl.out_filter, fout_height, fout_width)
            if check_correctness:
                assert wl.hpad == wl.wpad
                stride = (wl.hstride, wl.wstride)
                padding = wl.hpad
                res_ref = res_ref >> 8
                res_ref += bias_orig.reshape(wl.out_filter, 1, 1)
                res_ref = np.clip(res_ref, 0, 127).astype("int8")
                tvm.testing.assert_allclose(res_unpack, res_ref)
            return cost
Example #14
        def verify(s, name=None):
            mod = vta.build(s, [x, w, y], "ext_dev", env.target_host)
            temp = util.tempdir()
            mod.save(temp.relpath("gemm.o"))
            remote.upload(temp.relpath("gemm.o"))
            f = remote.load_module("gemm.o")
            # verify
            ctx = remote.ext_dev(0)
            x_np = np.random.randint(-128,
                                     128,
                                     size=(o, n, env.BATCH,
                                           env.BLOCK_IN)).astype(x.dtype)
            w_np = np.random.randint(-128,
                                     128,
                                     size=(m, n, env.BLOCK_OUT,
                                           env.BLOCK_IN)).astype(w.dtype)
            y_np = np.zeros((o, m, env.BATCH, env.BLOCK_OUT)).astype(y.dtype)
            x_nd = tvm.nd.array(x_np, ctx)
            w_nd = tvm.nd.array(w_np, ctx)
            y_nd = tvm.nd.array(y_np, ctx)
            y_np = y_np.astype(env.acc_dtype)
            for b in range(o):
                for i in range(m):
                    for j in range(n):
                        y_np[b, i, :] += np.dot(
                            x_np[b, j, :].astype(env.acc_dtype),
                            w_np[i, j].T.astype(env.acc_dtype))
            y_np = np.right_shift(y_np, 8)
            y_np = np.clip(y_np, 0, (1 <<
                                     (env.INP_WIDTH - 1)) - 1).astype(y.dtype)

            if env.TARGET in ["sim", "tsim"]:
                simulator.clear_stats()

            f(x_nd, w_nd, y_nd)

            np.testing.assert_equal(y_np, y_nd.asnumpy())

            if env.TARGET in ["sim", "tsim"]:
                sim_stats = simulator.stats()
                print("GEMM schedule:{} execution statistics:".format(name))
                for k, v in sim_stats.items():
                    print("\t{:<16}: {:>16}".format(k, v))
Example #15
s[C_buf].tensorize(s[C_buf].op.axis[2], env.gemm)

# Let's take a look at the finalized schedule
print(vta.lower(s, [A, B, C], simple_mode=True))

######################################################################
# This concludes the scheduling portion of this tutorial.

######################################################################
# TVM Compilation
# ---------------
# After we have finished specifying the schedule, we can compile it
# into a TVM function.

# Build GEMM VTA kernel
my_gemm = vta.build(s, [A, B, C], "ext_dev", env.target_host, name="my_gemm")

# Write the compiled module into an object file.
temp = util.tempdir()
my_gemm.save(temp.relpath("gemm.o"))

# Send the executable over RPC
remote.upload(temp.relpath("gemm.o"))

# Load the compiled module
f = remote.load_module("gemm.o")

######################################################################
# Running the Function
# --------------------
# The compiled TVM function uses a concise C API and can be invoked from any language.
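
# (The scraped snippet ends here. As a rough, illustrative sketch -- not part of
# the original example -- invoking the compiled kernel over RPC follows the same
# pattern as the other examples on this page: allocate arrays on the remote VTA
# context, call the loaded module, and check the result on the host. A_orig,
# B_orig and C_ref below are placeholder names for the host-side inputs and the
# expected output.)
ctx = remote.ext_dev(0)                       # remote VTA device context
A_nd = tvm.nd.array(A_orig, ctx)              # copy packed inputs to the device
B_nd = tvm.nd.array(B_orig, ctx)
C_nd = tvm.nd.array(np.zeros(C_ref.shape).astype(C_ref.dtype), ctx)
f(A_nd, B_nd, C_nd)                           # run the compiled GEMM kernel
np.testing.assert_equal(C_ref, C_nd.asnumpy())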
Example #16
# Let's look at the final lowered TVM schedule after lowering memory
# loads/stores down to DMA copy intrinsics, and the computation down to
# VTA compute intrinsics.
print(vta.lower(s, [data, weight, res], simple_mode=True))

######################################################################
# TVM Compilation and Verification
# --------------------------------
# After specifying the schedule, we can compile it into a TVM function.
# We save the module so we can send it over RPC.
# We run the function and verify it against a numpy implementation to
# ensure correctness.

# Compile the TVM module
my_gemm = vta.build(s, [data, weight, res],
                    "ext_dev",
                    env.target_host,
                    name="my_gemm")
temp = util.tempdir()
my_gemm.save(temp.relpath("gemm.o"))
remote.upload(temp.relpath("gemm.o"))
f = remote.load_module("gemm.o")

# Get the remote device context
ctx = remote.ext_dev(0)

# Initialize the data and weight arrays randomly in the int range of [-128, 128)
data_np = np.random.randint(-128, 128,
                            size=(batch_size, in_channels)).astype(data.dtype)
weight_np = np.random.randint(-128, 128,
                              size=(out_channels,
                                    in_channels)).astype(weight.dtype)
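
# (The snippet is cut off after the input initialization. A condensed,
# illustrative sketch of the run-and-verify steps the text above describes;
# data_packed, weight_packed and output_shape are placeholders for the inputs
# repacked into VTA's 4D layout -- see the run_gemm example further down this
# page for the actual reshape/transpose pattern.)
data_nd = tvm.nd.array(data_packed, ctx)
weight_nd = tvm.nd.array(weight_packed, ctx)
res_nd = tvm.nd.array(np.zeros(output_shape).astype(res.dtype), ctx)
f(data_nd, weight_nd, res_nd)  # invoke the compiled GEMM

# Host-side reference: a plain matrix multiply followed by the right-shift/clip
# requantization used throughout these examples; repack it to the 4D layout
# before comparing against res_nd.asnumpy().
res_ref = np.dot(data_np.astype(env.acc_dtype), weight_np.T.astype(env.acc_dtype))
res_ref = np.clip(res_ref >> 8, 0, (1 << (env.INP_WIDTH - 1)) - 1).astype(res.dtype)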
Example #17
def run_group_conv2d(env,
                     remote,
                     wl,
                     target,
                     check_correctness=True,
                     print_ir=False,
                     samples=4):

    # Workload assertions
    assert wl.hpad == wl.wpad

    # Perform packing only if we are targeting the accelerator
    if "arm_cpu" in target.keys:
        data_pack = False
        layout = "NCHW"
        fcompute = topi.nn.group_conv2d_nchw
        fschedule = topi.generic.schedule_group_conv2d_nchw
    elif "vta" in target.keys:
        data_pack = True
        layout = "NCHW%dn%dc" % (env.BATCH, env.BLOCK_IN)
        fcompute = vta.top.group_conv2d_packed
        fschedule = vta.top.schedule_group_conv2d_packed

    # Derive shapes depending upon packing
    CI_G = wl.in_filter // wl.groups
    a_shape = (wl.batch, wl.in_filter, wl.height, wl.width)
    w_shape = (wl.out_filter, CI_G, wl.hkernel, wl.wkernel)
    b_shape = (wl.batch, wl.out_filter, 1, 1)
    if data_pack:
        data_shape = (wl.batch // env.BATCH, wl.in_filter // env.BLOCK_IN,
                      wl.height, wl.width, env.BATCH, env.BLOCK_IN)
        kernel_shape = (wl.out_filter // env.BLOCK_OUT, CI_G // env.BLOCK_IN,
                        wl.hkernel, wl.wkernel, env.BLOCK_OUT, env.BLOCK_IN)
        bias_shape = (wl.batch // env.BATCH, wl.out_filter // env.BLOCK_OUT, 1,
                      1, env.BATCH, env.BLOCK_OUT)
    else:
        data_shape = a_shape
        kernel_shape = w_shape
        bias_shape = b_shape
    data = te.placeholder(data_shape, name="data", dtype=env.inp_dtype)
    kernel = te.placeholder(kernel_shape, name="kernel", dtype=env.wgt_dtype)
    bias = te.placeholder(bias_shape, name="bias", dtype=env.acc_dtype)
    padding = relay.nn.get_pad_tuple2d((wl.hpad, wl.wpad))

    # Define base computation schedule
    with target:
        res = fcompute(data, kernel, (wl.hstride, wl.wstride), padding, (1, 1),
                       wl.groups, env.acc_dtype)
        res = topi.right_shift(res, 8)
        res = topi.add(res, bias)
        res = my_clip(res, 0, (1 << env.OUT_WIDTH - 1) - 1)
        res = topi.cast(res, env.out_dtype)
        # Derive base schedule
        s = fschedule([res])
        if print_ir:
            print(vta.lower(s, [data, kernel, bias, res], simple_mode=True))

    # Derive number of ops
    fout_height = (wl.height + 2 * wl.hpad - wl.hkernel) // wl.hstride + 1
    fout_width = (wl.width + 2 * wl.wpad - wl.wkernel) // wl.wstride + 1
    num_ops = 2 * wl.batch * fout_height * fout_width * wl.hkernel * wl.wkernel * \
        wl.out_filter * wl.in_filter // wl.groups

    def get_ref_data():
        # derive min max for act, wgt, and bias types (max non inclusive)
        a_min, a_max = -(1 << (env.INP_WIDTH - 1)), 1 << (env.INP_WIDTH - 1)
        w_min, w_max = -(1 << (env.WGT_WIDTH - 1)), 1 << (env.WGT_WIDTH - 1)
        b_min, b_max = (-(1 << (env.INP_WIDTH + env.WGT_WIDTH - 2)),
                        1 << (env.INP_WIDTH + env.WGT_WIDTH - 2))
        a_np = np.random.randint(a_min, a_max, size=a_shape).astype(data.dtype)
        w_np = np.random.randint(w_min, w_max,
                                 size=w_shape).astype(kernel.dtype)
        b_np = np.random.randint(b_min, b_max,
                                 size=b_shape).astype(env.acc_dtype)
        r_np = tvm.topi.testing.conv2d_nchw_python(
            a_np.astype(env.acc_dtype), w_np.astype(env.acc_dtype),
            (wl.hstride, wl.wstride), wl.hpad, wl.groups).astype(env.acc_dtype)
        return a_np, w_np, b_np, r_np

    # Data in original format
    data_np, kernel_np, bias_np, res_ref = get_ref_data()
    if data_pack:
        data_np = data_np.reshape(wl.batch // env.BATCH, env.BATCH,
                                  wl.in_filter // env.BLOCK_IN, env.BLOCK_IN,
                                  wl.height, wl.width).transpose(
                                      (0, 2, 4, 5, 1, 3))
        kernel_np = kernel_np.reshape(wl.out_filter // env.BLOCK_OUT,
                                      env.BLOCK_OUT, CI_G // env.BLOCK_IN,
                                      env.BLOCK_IN, wl.hkernel,
                                      wl.wkernel).transpose((0, 2, 4, 5, 1, 3))
        bias_np = bias_np.reshape(wl.batch // env.BATCH,
                                  wl.out_filter // env.BLOCK_OUT, 1, 1,
                                  env.BATCH, env.BLOCK_OUT)

    # Build
    if "vta" in target.keys:
        mod = vta.build(s, [data, kernel, bias, res],
                        target=target,
                        target_host=env.target_host,
                        name="conv2d")
    else:
        mod = tvm.build(s, [data, kernel, bias, res],
                        target=target,
                        target_host=env.target_host,
                        name="conv2d")
    temp = util.tempdir()
    mod.save(temp.relpath("conv2d.o"))
    remote.upload(temp.relpath("conv2d.o"))
    f = remote.load_module("conv2d.o")
    ctx = remote.context(str(target))

    res_np = np.zeros(topi.util.get_const_tuple(res.shape)).astype(res.dtype)
    data_arr = tvm.nd.array(data_np, ctx)
    kernel_arr = tvm.nd.array(kernel_np, ctx)
    bias_arr = tvm.nd.array(bias_np, ctx)
    res_arr = tvm.nd.array(res_np, ctx)
    time_f = f.time_evaluator("conv2d", ctx, number=samples)

    # In vta sim mode, collect simulator runtime statistics
    stats = {}
    cost = None
    if env.TARGET in ["sim", "tsim"]:
        # Check if we're in local RPC mode (allows us to rebuild the
        # runtime on the fly when varying the VTA designs)
        local_rpc = int(os.environ.get("VTA_LOCAL_SIM_RPC", "0"))
        if local_rpc:
            if env.TARGET == "sim":
                remote.get_function("vta.simulator.profiler_clear")()
            else:
                remote.get_function("vta.tsim.profiler_clear")()
            cost = time_f(data_arr, kernel_arr, bias_arr, res_arr)
            if env.TARGET == "sim":
                stats = json.loads(
                    remote.get_function("vta.simulator.profiler_status")())
            else:
                stats = json.loads(
                    remote.get_function("vta.tsim.profiler_status")())
        else:
            simulator.clear_stats()
            cost = time_f(data_arr, kernel_arr, bias_arr, res_arr)
            stats = simulator.stats()
    else:
        cost = time_f(data_arr, kernel_arr, bias_arr, res_arr)

    # Check correctness
    correct = False
    if check_correctness:
        res_orig = res_arr.asnumpy()
        if data_pack:
            res_orig = res_orig.transpose(
                (0, 4, 1, 5, 2, 3)).reshape(wl.batch, wl.out_filter,
                                            fout_height, fout_width)
            bias_np = bias_np.transpose(
                (0, 4, 1, 5, 2, 3)).reshape(wl.batch, wl.out_filter, 1, 1)
        res_ref = res_ref >> env.WGT_WIDTH
        res_ref += bias_np
        res_ref = np.clip(res_ref, 0, (1 << env.OUT_WIDTH - 1) - 1)
        res_ref = res_ref.astype(env.out_dtype)
        correct = np.allclose(res_orig, res_ref)

    gops = (num_ops / cost.mean) / float(10**9)
    status = "PASSED" if correct else "FAILED"
    if "arm_cpu" in target.keys:
        device = "CPU"
    elif "vta" in target.keys:
        device = "VTA"
    print("%s GROUP CONV2D TEST %s: Time cost = %g sec/op, %g GOPS" %
          (device, status, cost.mean, gops))

    return correct, cost, stats
Example #18
# VTA compute intrinsics.
print(vta.lower(s, [data, kernel, res], simple_mode=True))

######################################################################
# TVM Compilation and Verification
# --------------------------------
# After specifying the schedule, we can compile it into a TVM function.
# We save the module so we can send it over RPC.
# We run the function and verify it against a numpy implementation to
# ensure correctness.

# This library facilitates 2D convolution testing
from tvm.topi.testing import conv2d_nchw_python

# Compile the TVM module
my_conv = vta.build(s, [data, kernel, res], "ext_dev", env.target_host, name="my_conv")
temp = util.tempdir()
my_conv.save(temp.relpath("conv2d.o"))
remote.upload(temp.relpath("conv2d.o"))
f = remote.load_module("conv2d.o")

# Get the remote device context
ctx = remote.ext_dev(0)

# Initialize the data and kernel arrays randomly in the int range
# of [-128, 128) in NCHW layout
data_np = np.random.randint(
    -128, 128,
    size=(batch_size, in_channels, height, width)).astype(data.dtype)
kernel_np = np.random.randint(
    -128, 128,
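
# (The snippet is truncated above, in the middle of initializing kernel_np. A
# rough, illustrative sketch of the verification this tutorial text describes,
# using the conv2d_nchw_python helper imported earlier; data_packed,
# kernel_packed, output_shape, stride and pad are placeholders for values
# defined in the full tutorial.)
data_nd = tvm.nd.array(data_packed, ctx)
kernel_nd = tvm.nd.array(kernel_packed, ctx)
res_nd = tvm.nd.array(np.zeros(output_shape).astype(res.dtype), ctx)
f(data_nd, kernel_nd, res_nd)  # run the compiled convolution

# Host-side reference convolution in NCHW layout
res_ref = conv2d_nchw_python(
    data_np.astype(env.acc_dtype), kernel_np.astype(env.acc_dtype), stride, pad)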
Example #19
def run_pooling(env,
                remote,
                wl,
                target,
                check_correctness=True,
                print_ir=False,
                samples=10):

    # Workload assertions
    assert wl.hpad == wl.wpad
    pool_type = 'max'

    # Perform packing only if we are targeting the accelerator
    if "arm_cpu" in target.keys:
        data_pack = False
        layout = "NCHW"
        #pooling_fcompute = topi.arm_cpu.pooling_nchw_spatial_pack
        pooling_fcompute = topi.nn.pool
        #pooling_fschedule = topi.arm_cpu.schedule_pooling_nchw_spatial_pack
        pooling_fschedule = topi.generic.schedule_pool
    elif "vta" in target.keys:
        data_pack = True
        layout = "NCHW%dn%dc" % (env.BATCH, env.BLOCK_IN)
        pooling_fcompute = vta.top.pooling_packed
        pooling_fschedule = vta.top.schedule_pooling_packed

    # Derive shapes depending upon packing
    a_shape = (wl.batch, wl.in_filter, wl.height, wl.width)
    w_shape = (wl.out_filter, wl.in_filter, wl.hkernel, wl.wkernel)
    # output shape
    b_shape = (wl.batch, wl.out_filter, 1, 1)
    if data_pack:
        data_shape = (wl.batch // env.BATCH, wl.in_filter // env.BLOCK_IN,
                      wl.height, wl.width, env.BATCH, env.BLOCK_IN)
        kernel_shape = (wl.out_filter // env.BLOCK_OUT,
                        wl.in_filter // env.BLOCK_IN, wl.hkernel, wl.wkernel,
                        env.BLOCK_OUT, env.BLOCK_IN)
        bias_shape = (wl.batch // env.BATCH, wl.out_filter // env.BLOCK_OUT, 1,
                      1, env.BATCH, env.BLOCK_OUT)
    else:
        data_shape = a_shape
        kernel_shape = w_shape
        bias_shape = b_shape
    data = te.placeholder(data_shape, name="data", dtype=env.inp_dtype)
    kernel = te.placeholder(kernel_shape, name="kernel", dtype=env.wgt_dtype)
    bias = te.placeholder(bias_shape, name="bias", dtype=env.acc_dtype)
    padding = relay.nn.get_pad_tuple2d((wl.hpad, wl.wpad))

    # Define base computation schedule
    with target:
        res = topi.nn.pool(data,
                           kernel=[3, 3],
                           stride=[2, 2],
                           padding=padding,
                           pool_type=pool_type,
                           layout="NCHW")
        #       res = topi.right_shift(res, 8)
        #       res = topi.add(res, bias)
        #       res = my_clip(res, 0, (1 << env.OUT_WIDTH - 1) - 1)
        #       res = topi.cast(res, env.out_dtype)
        # Derive base schedule
        s = pooling_fschedule([res], layout)
        if print_ir:
            print(vta.lower(s, [data, kernel, bias, res], simple_mode=True))
    # get output shape
    _, oc, oh, ow = get_const_tuple(res.shape)
    # Derive number of ops
    fout_height = (wl.height + 2 * wl.hpad - wl.hkernel) // wl.hstride + 1
    fout_width = (wl.width + 2 * wl.wpad - wl.wkernel) // wl.wstride + 1
    num_ops = 2 * wl.batch * fout_height * fout_width * wl.hkernel * wl.wkernel * wl.out_filter * wl.in_filter

    # @memoize("vta.tests.test_benchmark_topi.pooling.verify_nchw")
    def get_ref_data():
        # derive min max for act, wgt, and bias types (max non inclusive)
        a_min, a_max = -(1 << (env.INP_WIDTH - 1)), 1 << (env.INP_WIDTH - 1)
        b_min, b_max = (-(1 << (env.INP_WIDTH + env.WGT_WIDTH - 2)),
                        1 << (env.INP_WIDTH + env.WGT_WIDTH - 2))
        a_np = np.random.randint(a_min, a_max, size=a_shape).astype(data.dtype)

        pad_shape = (wl.batch, wl.in_filter, wl.height + wl.hpad * 2,
                     wl.width + wl.wpad * 2)
        pad_np = np.zeros(shape=pad_shape).astype(data.dtype)
        no_zero = (range(wl.batch), range(wl.in_filter),
                   (range(wl.hpad, wl.height + wl.hpad)),
                   (range(wl.wpad, wl.width + wl.wpad)))
        pad_np[np.ix_(*no_zero)] = a_np
        b_shape = (wl.batch, oc, oh, ow)
        b_np = np.random.randint(b_min, b_max,
                                 size=b_shape).astype(env.acc_dtype)
        kw, kh = 3, 3
        sw, sh = 2, 2
        for i in range(oh):
            for j in range(ow):
                b_np[:, :, i, j] = np.max(pad_np[:, :, i * sh:i * sh + kh,
                                                 j * sw:j * sw + kw],
                                          axis=(2, 3))
        b_np = np.maximum(b_np, 0.0)
        return a_np, pad_np, b_np

    # Data in original format
    data_np, _, res_ref = get_ref_data()

    # Build
    if "vta" in target.keys:
        mod = vta.build(s, [data, res],
                        target=target,
                        target_host=env.target_host,
                        name="pooling")
    else:
        mod = tvm.build(s, [data, res],
                        target=target,
                        target_host=env.target_host,
                        name="pooling")
    temp = util.tempdir()
    mod.save(temp.relpath("pooling.o"))
    remote.upload(temp.relpath("pooling.o"))
    f = remote.load_module("pooling.o")
    ctx = remote.context(str(target))

    res_np = np.zeros(topi.util.get_const_tuple(res.shape)).astype(res.dtype)
    data_arr = tvm.nd.array(data_np, ctx)
    res_arr = tvm.nd.array(res_np, ctx)
    time_f = f.time_evaluator("pooling", ctx, number=samples)

    # In vta sim mode, collect simulator runtime statistics
    stats = {}
    cost = None
    if env.TARGET in ["sim", "tsim"]:
        # Check if we're in local RPC mode (allows us to rebuild the
        # runtime on the fly when varying the VTA designs)
        local_rpc = int(os.environ.get("VTA_LOCAL_SIM_RPC", "0"))
        if local_rpc:
            if env.TARGET == "sim":
                remote.get_function("vta.simulator.profiler_clear")()
            else:
                remote.get_function("vta.tsim.profiler_clear")()
            cost = time_f(data_arr, res_arr)
            if env.TARGET == "sim":
                stats = json.loads(
                    remote.get_function("vta.simulator.profiler_status")())
            else:
                stats = json.loads(
                    remote.get_function("vta.tsim.profiler_status")())
        else:
            simulator.clear_stats()
            cost = time_f(data_arr, res_arr)
            stats = simulator.stats()
    else:
        cost = time_f(data_arr, res_arr)
        print(cost)

    # Check correctness
    correct = False
    if check_correctness:
        res_orig = res_arr.asnumpy()
        res_orig = np.maximum(res_orig, 0.0)
        res_ref = res_ref.astype(env.out_dtype)
        res_orig = res_orig.astype(env.out_dtype)
        correct = np.allclose(res_orig, res_ref)

    gops = (num_ops / cost.mean) / float(10**9)
    status = "PASSED" if correct else "FAILED"
    if "arm_cpu" in target.keys:
        device = "CPU"
    elif "vta" in target.keys:
        device = "VTA"
    print("%s POOLING TEST %s: Time cost = %g sec/op" %
          (device, status, cost.mean))

    return correct, cost, stats
Example #20
# Let's look at the final lowered TVM schedule after lowering memory
# loads/stores down to DMA copy intrinsics, and the computation down to
# VTA compute intrinsics.
print(vta.lower(s, [data, weight, res], simple_mode=True))

######################################################################
# TVM Compilation and Verification
# --------------------------------
# After specifying the schedule, we can compile it into a TVM function.
# We save the module so we can send it over RPC.
# We run the function and verify it against a numpy implementation to
# ensure correctness.

# Compile the TVM module
my_gemm = vta.build(
    s, [data, weight, res], tvm.target.Target("ext_dev", host=env.target_host), name="my_gemm"
)
temp = utils.tempdir()
my_gemm.save(temp.relpath("gemm.o"))
remote.upload(temp.relpath("gemm.o"))
f = remote.load_module("gemm.o")

# Get the remote device context
ctx = remote.ext_dev(0)

# Initialize the data and weight arrays randomly in the int range of [-128, 128)
data_np = np.random.randint(-128, 128, size=(batch_size, in_channels)).astype(data.dtype)
weight_np = np.random.randint(-128, 128, size=(out_channels, in_channels)).astype(weight.dtype)

# Apply packing to the data and weight arrays from a 2D to a 4D packed layout
data_packed = data_np.reshape(
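# (The example is cut off in the middle of the packing step. The 2D -> 4D
# packing the comment above refers to generally follows the reshape/transpose
# pattern used by the run_gemm example elsewhere on this page -- shown here as
# an illustrative sketch, assuming batch_size, in_channels and out_channels
# match the placeholder shapes.)
data_packed = data_np.reshape(
    batch_size // env.BATCH, env.BATCH,
    in_channels // env.BLOCK_IN, env.BLOCK_IN).transpose((0, 2, 1, 3))
weight_packed = weight_np.reshape(
    out_channels // env.BLOCK_OUT, env.BLOCK_OUT,
    in_channels // env.BLOCK_IN, env.BLOCK_IN).transpose((0, 2, 1, 3))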
Example #21
# This concludes the scheduling portion of this tutorial.

######################################################################
# TVM Compilation
# ---------------
# After we have finished specifying the schedule, we can compile it
# into a TVM function. By default TVM compiles into a type-erased
# function that can be directly called from the Python side.
#
# In the following line, we use :code:`tvm.build` to create a function.
# The build function takes the schedule, the desired signature of the
# function (including the inputs and outputs) as well as the target language
# we want to compile to.
#
my_vadd = vta.build(s, [A, B, C],
                    tvm.target.Target("ext_dev", host=env.target_host),
                    name="my_vadd")

######################################################################
# Saving the Module
# ~~~~~~~~~~~~~~~~~
# TVM lets us save our module into a file so it can be loaded back later. This
# is called ahead-of-time compilation and allows us to save some compilation
# time.
# More importantly, this allows us to cross-compile the executable on our
# development machine and send it over to the Pynq FPGA board over RPC for
# execution.

# Write the compiled module into an object file.
temp = utils.tempdir()
my_vadd.save(temp.relpath("vadd.o"))
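
# (The snippet stops after saving the object file. The RPC steps described in
# the text above would then look roughly like this -- an illustrative sketch,
# assuming `remote` is an open RPC session to the board and a_np/b_np are host
# arrays matching the A/B placeholders.)
remote.upload(temp.relpath("vadd.o"))   # send the cross-compiled module to the board
f = remote.load_module("vadd.o")        # load it on the remote device

ctx = remote.ext_dev(0)
a_nd = tvm.nd.array(a_np, ctx)
b_nd = tvm.nd.array(b_np, ctx)
c_nd = tvm.nd.array(np.zeros(a_np.shape).astype(C.dtype), ctx)
f(a_nd, b_nd, c_nd)                     # run the vector add on VTA
np.testing.assert_equal(c_nd.numpy(), (a_np + b_np).astype(C.dtype))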
Example #22
s[C_buf].tensorize(s[C_buf].op.axis[2], env.gemm)

# Let's take a look at the finalized schedule
print(vta.lower(s, [A, B, C], simple_mode=True))

######################################################################
# This concludes the scheduling portion of this tutorial.

######################################################################
# TVM Compilation
# ---------------
# After we have finished specifying the schedule, we can compile it
# into a TVM function.

# Build GEMM VTA kernel
my_gemm = vta.build(s, [A, B, C], "ext_dev", env.target_host, name="my_gemm")

# Write the compiled module into an object file.
temp = util.tempdir()
my_gemm.save(temp.relpath("gemm.o"))

# Send the executable over RPC
remote.upload(temp.relpath("gemm.o"))

# Load the compiled module
f = remote.load_module("gemm.o")

######################################################################
# Running the Function
# --------------------
# The compiled TVM function uses a concise C API and can be invoked from any language.
Example #23
        def check_alu(tvm_op, np_op=None, use_imm=False):
            """Test ALU"""
            m = 8
            n = 8
            imm = np.random.randint(1, 5)
            # compute
            a = tvm.placeholder((m, n, env.BATCH, env.BLOCK_OUT),
                                name="a",
                                dtype=env.acc_dtype)
            a_buf = tvm.compute((m, n, env.BATCH, env.BLOCK_OUT),
                                lambda *i: a(*i), "a_buf")  #DRAM->SRAM
            if use_imm:
                res_buf = tvm.compute((m, n, env.BATCH, env.BLOCK_OUT),
                                      lambda *i: tvm_op(a_buf(*i), imm),
                                      "res_buf")  #compute
            else:
                b = tvm.placeholder((m, n, env.BATCH, env.BLOCK_OUT),
                                    name="b",
                                    dtype=env.acc_dtype)
                b_buf = tvm.compute((m, n, env.BATCH, env.BLOCK_OUT),
                                    lambda *i: b(*i), "b_buf")  #DRAM->SRAM
                res_buf = tvm.compute((m, n, env.BATCH, env.BLOCK_OUT),
                                      lambda *i: tvm_op(a_buf(*i), b_buf(*i)),
                                      "res_buf")  #compute5B
            res = tvm.compute((m, n, env.BATCH, env.BLOCK_OUT),
                              lambda *i: res_buf(*i).astype(env.inp_dtype),
                              "res")  #SRAM->DRAM
            # schedule
            s = tvm.create_schedule(res.op)
            s[a_buf].set_scope(env.acc_scope)  # SRAM
            s[a_buf].pragma(a_buf.op.axis[0], env.dma_copy)  # DRAM->SRAM
            s[res_buf].set_scope(env.acc_scope)  # SRAM
            s[res_buf].pragma(res_buf.op.axis[0], env.alu)  # compute
            s[res].pragma(res.op.axis[0], env.dma_copy)  # SRAM->DRAM
            if not use_imm:
                s[b_buf].set_scope(env.acc_scope)  # SRAM
                s[b_buf].pragma(b_buf.op.axis[0], env.dma_copy)  # DRAM->SRAM

            if not remote:
                return

            # build
            with vta.build_config():
                if use_imm:
                    mod = vta.build(s, [a, res], "ext_dev", env.target_host)
                else:
                    mod = vta.build(s, [a, b, res], "ext_dev", env.target_host)
            temp = util.tempdir()
            mod.save(temp.relpath("load_act.o"))
            remote.upload(temp.relpath("load_act.o"))
            f = remote.load_module("load_act.o")
            # verify
            ctx = remote.ext_dev(0)
            a_np = np.random.randint(-16,
                                     16,
                                     size=(m, n, env.BATCH,
                                           env.BLOCK_OUT)).astype(a.dtype)
            if use_imm:
                res_np = np_op(a_np, imm) if np_op else tvm_op(a_np, imm)
            else:
                b_np = np.random.randint(-16,
                                         16,
                                         size=(m, n, env.BATCH,
                                               env.BLOCK_OUT)).astype(b.dtype)
                res_np = np_op(a_np, b_np) if np_op else tvm_op(a_np, b_np)
            res_np = res_np.astype(res.dtype)
            a_nd = tvm.nd.array(a_np, ctx)
            res_nd = tvm.nd.array(
                np.zeros((m, n, env.BATCH, env.BLOCK_OUT)).astype(res.dtype),
                ctx)

            if env.TARGET == "tsim":
                simulator.tsim_init("libvta_hw")

            if use_imm:
                f(a_nd, res_nd)
            else:
                b_nd = tvm.nd.array(b_np, ctx)
                f(a_nd, b_nd, res_nd)
            np.testing.assert_equal(res_np, res_nd.asnumpy())
Example #24
def run_gemm(
    env,
    remote,
    target,
    batch_size,
    in_feat,
    out_feat,
    check_correctness=True,
    print_ir=True,
    samples=4,
):

    # Perform packing only if we are targeting the accelerator
    if "arm_cpu" in target.keys:
        data_pack = False
    elif "vta" in target.keys:
        data_pack = True

    # Derive shapes depending upon packing
    a_shape = (batch_size, in_feat)
    w_shape = (out_feat, in_feat)
    if data_pack:
        data_shape = (batch_size // env.BATCH, in_feat // env.BLOCK_IN,
                      env.BATCH, env.BLOCK_IN)
        kernel_shape = (
            out_feat // env.BLOCK_OUT,
            in_feat // env.BLOCK_IN,
            env.BLOCK_OUT,
            env.BLOCK_IN,
        )
        fcompute = vta.top.dense_packed
        fschedule = vta.top.schedule_dense_packed
    else:
        data_shape = a_shape
        kernel_shape = w_shape
        fcompute = topi.x86.dense_nopack
        fschedule = topi.x86.schedule_dense_nopack
    data = te.placeholder(data_shape, name="data", dtype=env.inp_dtype)
    kernel = te.placeholder(kernel_shape, name="kernel", dtype=env.wgt_dtype)

    # Define base computation schedule
    with target:
        res = fcompute(data, kernel, None, env.acc_dtype)
        res = topi.right_shift(res, 8)
        res = my_clip(res, 0, (1 << env.OUT_WIDTH - 1) - 1)
        res = topi.cast(res, env.out_dtype)
        # Derive base schedule
        s = fschedule([res])
        if print_ir:
            print(vta.lower(s, [data, kernel, res], simple_mode=True))

    # Derive number of ops
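    # (each of the batch_size * out_feat outputs takes in_feat multiplies and
    # in_feat additions, hence the factor of 2 below)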
    num_ops = 2 * batch_size * in_feat * out_feat

    # @memoize("vta.tests.test_benchmark_topi.dense.verify")
    def get_ref_data():
        # derive min max for act, wgt types (max non inclusive)
        a_min, a_max = 0 - (1 << (env.INP_WIDTH - 1)), (1 <<
                                                        (env.INP_WIDTH - 1))
        w_min, w_max = 0 - (1 << (env.WGT_WIDTH - 1)), (1 <<
                                                        (env.WGT_WIDTH - 1))
        a_np = np.random.randint(a_min, a_max, size=a_shape).astype(data.dtype)
        w_np = np.random.randint(w_min, w_max,
                                 size=w_shape).astype(kernel.dtype)

        r_np = np.dot(a_np.astype(env.acc_dtype),
                      w_np.T.astype(env.acc_dtype)).astype(env.acc_dtype)
        return a_np, w_np, r_np

    # Data in original format
    data_np, kernel_np, res_ref = get_ref_data()
    if data_pack:
        data_np = data_np.reshape(batch_size // env.BATCH, env.BATCH,
                                  in_feat // env.BLOCK_IN,
                                  env.BLOCK_IN).transpose((0, 2, 1, 3))
        kernel_np = kernel_np.reshape(out_feat // env.BLOCK_OUT, env.BLOCK_OUT,
                                      in_feat // env.BLOCK_IN,
                                      env.BLOCK_IN).transpose((0, 2, 1, 3))
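        # Both tensors are now in VTA's packed layout:
        #   data   -> (batch_size // BATCH, in_feat // BLOCK_IN, BATCH, BLOCK_IN)
        #   kernel -> (out_feat // BLOCK_OUT, in_feat // BLOCK_IN, BLOCK_OUT, BLOCK_IN)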

    # Build
    if "vta" in target.keys:
        mod = vta.build(s, [data, kernel, res],
                        target=target,
                        target_host=env.target_host,
                        name="dense")
    else:
        mod = tvm.build(s, [data, kernel, res],
                        target=target,
                        target_host=env.target_host,
                        name="dense")
    temp = utils.tempdir()
    mod.save(temp.relpath("dense.o"))
    remote.upload(temp.relpath("dense.o"))
    f = remote.load_module("dense.o")
    dev = remote.device(str(target))

    res_np = np.zeros(topi.utils.get_const_tuple(res.shape)).astype(res.dtype)
    data_arr = tvm.nd.array(data_np, dev)
    kernel_arr = tvm.nd.array(kernel_np, dev)
    res_arr = tvm.nd.array(res_np, dev)
    time_f = f.time_evaluator("dense", dev, number=samples)

    # In vta sim mode, collect simulator runtime statistics
    stats = {}
    cost = None
    if env.TARGET in ["sim", "tsim"]:
        # Check if we're in local RPC mode (allows us to rebuild the
        # runtime on the fly when varying the VTA designs)
        local_rpc = int(os.environ.get("VTA_LOCAL_SIM_RPC", "0"))
        if local_rpc:
            if env.TARGET == "sim":
                remote.get_function("vta.simulator.profiler_clear")()
            else:
                remote.get_function("vta.tsim.profiler_clear")()
            cost = time_f(data_arr, kernel_arr, res_arr)
            if env.TARGET == "sim":
                stats = json.loads(
                    remote.get_function("vta.simulator.profiler_status")())
            else:
                stats = json.loads(
                    remote.get_function("vta.tsim.profiler_status")())
        else:
            simulator.clear_stats()
            cost = time_f(data_arr, kernel_arr, res_arr)
            stats = simulator.stats()
    else:
        cost = time_f(data_arr, kernel_arr, res_arr)

    # Check correctness
    correct = False
    if check_correctness:
        res_orig = res_arr.numpy()
        if data_pack:
            res_orig = res_orig.reshape(batch_size, out_feat)
        res_ref = res_ref >> 8
        res_ref = np.clip(res_ref, 0, (1 << (env.OUT_WIDTH - 1)) - 1)
        res_ref = res_ref.astype(env.out_dtype)
        correct = np.allclose(res_orig, res_ref)

    gops = (num_ops / cost.mean) / float(10**9)
    status = "PASSED" if correct else "FAILED"
    if "arm_cpu" in target.keys:
        device = "CPU"
    elif "vta" in target.keys:
        device = "VTA"
    print("%s DENSE TEST %s: Time cost = %g sec/op, %g GOPS" %
          (device, status, cost.mean, gops))

    return correct, cost, stats
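# A minimal driver sketch (assumptions: `import vta.testing` and
# `from tvm import autotvm` are available, as in the VTA benchmark tests, and
# run_gemm above is in scope). vta.testing.run supplies (env, remote) for the
# configured VTA target; we pick a CPU or VTA target before benchmarking.
def test_gemm(device="vta", batch=16, in_feat=512, out_feat=512):
    def _run(env, remote):
        target = env.target if device == "vta" else env.target_vta_cpu
        with autotvm.tophub.context(target):  # load pre-tuned schedules if available
            run_gemm(env, remote, target, batch, in_feat, out_feat)

    vta.testing.run(_run)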
def run_conv2d_transpose(
    env, remote, wl, target, check_correctness=True, print_ir=False, samples=4
):

    # Workload assertions
    assert wl.hpad == wl.wpad

    # Perform packing only if we are targeting the accelerator
    if "arm_cpu" in target.keys:
        data_pack = False
        layout = "NCHW"
        fcompute = topi.arm_cpu.conv2d_transpose_nchw
        fschedule = topi.arm_cpu.schedule_conv2d_transpose_nchw
    elif "vta" in target.keys:
        data_pack = True
        layout = "NCHW%dn%dc" % (env.BATCH, env.BLOCK_IN)
        fcompute = vta.top.conv2d_transpose_packed
        fschedule = vta.top.schedule_conv2d_transpose_packed

    # Derive shapes depending upon packing

    a_shape = (wl.batch, wl.in_filter, wl.height, wl.width)
    w_shape = (wl.in_filter, wl.out_filter, wl.hkernel, wl.wkernel)
    if data_pack:
        data_shape = (
            wl.batch // env.BATCH,
            wl.in_filter // env.BLOCK_IN,
            wl.height,
            wl.width,
            env.BATCH,
            env.BLOCK_IN,
        )
        kernel_shape = (
            wl.out_filter // env.BLOCK_OUT,
            wl.in_filter // env.BLOCK_IN,
            wl.hkernel,
            wl.wkernel,
            env.BLOCK_OUT,
            env.BLOCK_IN,
        )
    else:
        data_shape = a_shape
        kernel_shape = w_shape
    data = te.placeholder(data_shape, name="data", dtype=env.inp_dtype)
    kernel = te.placeholder(kernel_shape, name="kernel", dtype=env.wgt_dtype)
    padding = relay.nn.get_pad_tuple2d((wl.hpad, wl.wpad))

    # Define base computation schedule
    with target:

        res = fcompute(
            data, kernel, (wl.hstride, wl.wstride), padding, env.acc_dtype, (wl.o_hpad, wl.o_wpad)
        )
        res = topi.right_shift(res, env.WGT_WIDTH)
        res = my_clip(res, 0, (1 << (env.OUT_WIDTH - 1)) - 1)
        res = topi.cast(res, env.out_dtype)
        # Derive base schedule
        s = fschedule([res])
        if print_ir:
            print(vta.lower(s, [data, kernel, res], simple_mode=True))

    # Derive number of ops
    fout_height = (wl.height - 1) * wl.hstride - 2 * wl.hpad + wl.hkernel + wl.o_hpad
    fout_width = (wl.width - 1) * wl.wstride - 2 * wl.wpad + wl.wkernel + wl.o_wpad
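    # (standard transposed-convolution output size:
    #  (in - 1) * stride - 2 * pad + kernel + output_padding)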
    num_ops = (
        2
        * wl.batch
        * fout_height
        * fout_width
        * wl.hkernel
        * wl.wkernel
        * wl.out_filter
        * wl.in_filter
    )

    # @memoize("vta.tests.test_benchmark_topi.conv2d.verify_nhwc")
    def get_ref_data():
        # derive min max for act and wgt types (max non inclusive)
        a_min, a_max = 0 - (1 << (env.INP_WIDTH - 1)), (1 << (env.INP_WIDTH - 1))
        w_min, w_max = 0 - (1 << (env.WGT_WIDTH - 1)), (1 << (env.WGT_WIDTH - 1))
        a_np = np.random.randint(a_min, a_max, size=a_shape).astype(data.dtype)
        w_np = np.random.randint(
            w_min, w_max, size=(wl.in_filter, wl.out_filter, wl.hkernel, wl.wkernel)
        ).astype(kernel.dtype)
        r_np = tvm.topi.testing.conv2d_transpose_nchw_python(
            a_np.astype(env.acc_dtype),
            w_np.astype(env.acc_dtype),
            (wl.hstride, wl.wstride),
            wl.hpad,
            (wl.o_hpad, wl.o_wpad),
        ).astype(env.acc_dtype)
        return a_np, w_np, r_np

    # Data in original format
    data_np, kernel_np, res_ref = get_ref_data()
    if data_pack:
        data_np = data_np.reshape(
            wl.batch // env.BATCH,
            env.BATCH,
            wl.in_filter // env.BLOCK_IN,
            env.BLOCK_IN,
            wl.height,
            wl.width,
        ).transpose((0, 2, 4, 5, 1, 3))
        kernel_np = kernel_np.reshape(
            wl.in_filter // env.BLOCK_IN,
            env.BLOCK_IN,
            wl.out_filter // env.BLOCK_OUT,
            env.BLOCK_OUT,
            wl.hkernel,
            wl.wkernel,
        ).transpose((2, 0, 4, 5, 3, 1))
        kernel_np = np.flip(kernel_np, 2)
        kernel_np = np.flip(kernel_np, 3)
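        # The kernel is now in the packed layout
        #   (out_filter // BLOCK_OUT, in_filter // BLOCK_IN, hkernel, wkernel, BLOCK_OUT, BLOCK_IN)
        # and spatially flipped, since the packed transposed-convolution compute
        # expects the kernel pre-flipped along its spatial axes.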

    # Build
    if "vta" in target.keys:
        with vta.build_config(disabled_pass={"tir.CommonSubexprElimTIR"}):
            mod = vta.build(
                s,
                [data, kernel, res],
                target=target,
                target_host=env.target_host,
                name="conv2d_transpose",
            )
    else:
        mod = tvm.build(
            s,
            [data, kernel, res],
            target=target,
            target_host=env.target_host,
            name="conv2d_transpose",
        )
    temp = utils.tempdir()
    mod.save(temp.relpath("conv2d_transpose.o"))
    remote.upload(temp.relpath("conv2d_transpose.o"))
    f = remote.load_module("conv2d_transpose.o")
    dev = remote.device(str(target))

    res_np = np.zeros(topi.utils.get_const_tuple(res.shape)).astype(res.dtype)
    data_arr = tvm.nd.array(data_np, dev)
    kernel_arr = tvm.nd.array(kernel_np, dev)
    res_arr = tvm.nd.array(res_np, dev)
    time_f = f.time_evaluator("conv2d_transpose", dev, number=samples)

    # In vta sim mode, collect simulator runtime statistics
    stats = {}
    cost = None
    if env.TARGET in ["sim", "tsim"]:
        # Check if we're in local RPC mode (allows us to rebuild the
        # runtime on the fly when varying the VTA designs)
        local_rpc = int(os.environ.get("VTA_LOCAL_SIM_RPC", "0"))
        if local_rpc:
            if env.TARGET == "sim":
                remote.get_function("vta.simulator.profiler_clear")()
            else:
                remote.get_function("vta.tsim.profiler_clear")()
            cost = time_f(data_arr, kernel_arr, res_arr)
            if env.TARGET == "sim":
                stats = json.loads(remote.get_function("vta.simulator.profiler_status")())
            else:
                stats = json.loads(remote.get_function("vta.tsim.profiler_status")())
        else:
            simulator.clear_stats()
            cost = time_f(data_arr, kernel_arr, res_arr)
            stats = simulator.stats()
    else:
        cost = time_f(data_arr, kernel_arr, res_arr)

    # Check correctness
    correct = False
    if check_correctness:
        res_orig = res_arr.numpy()
        if data_pack:
            res_orig = res_orig.transpose((0, 4, 1, 5, 2, 3)).reshape(
                wl.batch, wl.out_filter, fout_height, fout_width
            )
        res_ref = res_ref >> env.WGT_WIDTH
        res_ref = np.clip(res_ref, 0, (1 << (env.OUT_WIDTH - 1)) - 1)
        res_ref = res_ref.astype(env.out_dtype)
        correct = np.allclose(res_orig, res_ref)

    gops = (num_ops / cost.mean) / float(10**9)
    status = "PASSED" if correct else "FAILED"
    if "arm_cpu" in target.keys:
        device = "CPU"
    elif "vta" in target.keys:
        device = "VTA"
    print("%s CONV2D TEST %s: Time cost = %g sec/op, %g GOPS" % (device, status, cost.mean, gops))

    return correct, cost, stats
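# A minimal sketch of the workload record consumed by run_conv2d_transpose
# above (the field names follow the attributes accessed in the function; the
# concrete numbers below are illustrative only).
from collections import namedtuple

Conv2DTransposeWorkload = namedtuple(
    "Conv2DTransposeWorkload",
    ["batch", "height", "width", "in_filter", "out_filter",
     "hkernel", "wkernel", "hpad", "wpad", "hstride", "wstride",
     "o_hpad", "o_wpad"],
)

# e.g. a DCGAN-style deconvolution layer (illustrative values)
wkl = Conv2DTransposeWorkload(1, 4, 4, 1024, 256, 4, 4, 1, 1, 2, 2, 0, 0)
# run_conv2d_transpose(env, remote, wkl, target)  # env/remote/target from the test harness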
Example #26
######################################################################
# This concludes the scheduling portion of this tutorial.

######################################################################
# TVM Compilation
# ---------------
# After we have finished specifying the schedule, we can compile it
# into a TVM function. By default TVM compiles into a type-erased
# function that can be called directly from the Python side.
#
# In the following line, we use :code:`vta.build` (a thin wrapper around
# :code:`tvm.build`) to create a function. The build function takes the
# schedule, the desired signature of the function (including the inputs
# and outputs), as well as the target language we want to compile to.
#
my_vadd = vta.build(s, [A, B, C], "ext_dev", env.target_host, name="my_vadd")

######################################################################
# Saving the Module
# ~~~~~~~~~~~~~~~~~
# TVM lets us save our module into a file so it can be loaded back later. This
# is called ahead-of-time compilation and allows us to save some compilation
# time.
# More importantly, this allows us to cross-compile the executable on our
# development machine and send it over to the Pynq FPGA board over RPC for
# execution.

# Write the compiled module into an object file.
temp = util.tempdir()
my_vadd.save(temp.relpath("vadd.o"))
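# A hedged sketch of the RPC steps that typically follow (assumptions:
# `remote` is an open RPC session to the board, and A, B, C are the
# placeholders the schedule was built from).
remote.upload(temp.relpath("vadd.o"))
f = remote.load_module("vadd.o")

# Allocate arrays on the remote VTA device and run the compiled kernel.
ctx = remote.ext_dev(0)
shape = tuple(int(d) for d in A.shape)
a_np = np.random.randint(0, 5, size=shape).astype(A.dtype)
b_np = np.random.randint(0, 5, size=shape).astype(B.dtype)
a_nd = tvm.nd.array(a_np, ctx)
b_nd = tvm.nd.array(b_np, ctx)
c_nd = tvm.nd.array(np.zeros(shape).astype(C.dtype), ctx)
f(a_nd, b_nd, c_nd)
# For an elementwise vector add, the result can be checked against numpy:
np.testing.assert_equal(c_nd.asnumpy(), (a_np + b_np).astype(C.dtype))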
Example #27
######################################################################
# TVM Compilation and Verification
# --------------------------------
# After specifying the schedule, we can compile it into a TVM function.
# We save the module so we can send it over RPC.
# We run the function and verify it against a numpy implementation to
# ensure correctness.

# This library facilitates 2D convolution testing
from tvm.topi.testing import conv2d_nchw_python

# Compile the TVM module
with vta.build_config(disabled_pass={"tir.CommonSubexprElimTIR"}):
    my_conv = vta.build(s, [data, kernel, res],
                        tvm.target.Target("ext_dev", host=env.target_host),
                        name="my_conv")
temp = utils.tempdir()
my_conv.save(temp.relpath("conv2d.o"))
remote.upload(temp.relpath("conv2d.o"))
f = remote.load_module("conv2d.o")

# Get the remote device context
ctx = remote.ext_dev(0)

# Initialize the data and kernel arrays randomly in the int range
# of (-128, 128] in NCHW layout
data_np = np.random.randint(-128,
                            128,
                            size=(batch_size, in_channels, height,
                                  width)).astype(data.dtype)
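# A hedged sketch of the verification promised above (assumed names:
# out_channels, kernel_h/kernel_w, stride_h/stride_w and pad_h/pad_w stand in
# for values defined earlier in the full tutorial; only the pattern matters).
kernel_np = np.random.randint(-128,
                              128,
                              size=(out_channels, in_channels, kernel_h,
                                    kernel_w)).astype(kernel.dtype)
# Reference result from the numpy implementation imported above.
ref_np = conv2d_nchw_python(data_np.astype(env.acc_dtype),
                            kernel_np.astype(env.acc_dtype),
                            (stride_h, stride_w),
                            (pad_h, pad_w)).astype(env.acc_dtype)
# After running the compiled `f` on the packed device arrays, the unpacked
# output is compared against `ref_np` with the same shift/clip/cast applied
# in the schedule, e.g. via np.testing.assert_allclose.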
Example #28
# VTA compute intrinsics.
print(vta.lower(s, [data, kernel, res], simple_mode=True))

######################################################################
# TVM Compilation and Verification
# --------------------------------
# After specifying the schedule, we can compile it into a TVM function.
# We save the module so we can send it over RPC.
# We run the function and verify it against a numpy implementation to
# ensure correctness.

# This library facilitates 2D convolution testing
from topi.testing import conv2d_nchw_python

# Compile the TVM module
my_conv = vta.build(s, [data, kernel, res], "ext_dev", env.target_host, name="my_conv")
temp = util.tempdir()
my_conv.save(temp.relpath("conv2d.o"))
remote.upload(temp.relpath("conv2d.o"))
f = remote.load_module("conv2d.o")

# Get the remote device context
ctx = remote.ext_dev(0)

# Initialize the data and kernel arrays randomly in the int range
# of (-128, 128] in NCHW layout
data_np = np.random.randint(
    -128, 128,
    size=(batch_size, in_channels, height, width)).astype(data.dtype)
kernel_np = np.random.randint(
    -128, 128,
Example #30
        def check_alu(tvm_op, np_op=None, use_imm=False, test_name=None):
            """Test ALU"""
            m = 8
            n = 8
            imm = np.random.randint(1, 5)
            # compute
            a = te.placeholder((m, n, env.BATCH, env.BLOCK_OUT), name="a", dtype=env.acc_dtype)
            a_buf = te.compute(
                (m, n, env.BATCH, env.BLOCK_OUT), lambda *i: a(*i), "a_buf"
            )  # DRAM->SRAM
            if use_imm:
                res_buf = te.compute(
                    (m, n, env.BATCH, env.BLOCK_OUT), lambda *i: tvm_op(a_buf(*i), imm), "res_buf"
                )  # compute
            else:
                b = te.placeholder((m, n, env.BATCH, env.BLOCK_OUT), name="b", dtype=env.acc_dtype)
                b_buf = te.compute(
                    (m, n, env.BATCH, env.BLOCK_OUT), lambda *i: b(*i), "b_buf"
                )  # DRAM->SRAM
                res_buf = te.compute(
                    (m, n, env.BATCH, env.BLOCK_OUT),
                    lambda *i: tvm_op(a_buf(*i), b_buf(*i)),
                    "res_buf",
                )  # compute
            res = te.compute(
                (m, n, env.BATCH, env.BLOCK_OUT),
                lambda *i: res_buf(*i).astype(env.inp_dtype),
                "res",
            )  # SRAM->DRAM
            # schedule
            s = te.create_schedule(res.op)
            s[a_buf].set_scope(env.acc_scope)  # SRAM
            s[a_buf].pragma(a_buf.op.axis[0], env.dma_copy)  # DRAM->SRAM
            s[res_buf].set_scope(env.acc_scope)  # SRAM
            s[res_buf].pragma(res_buf.op.axis[0], env.alu)  # compute
            s[res].pragma(res.op.axis[0], env.dma_copy)  # SRAM->DRAM
            if not use_imm:
                s[b_buf].set_scope(env.acc_scope)  # SRAM
                s[b_buf].pragma(b_buf.op.axis[0], env.dma_copy)  # DRAM->SRAM

            if not remote:
                return

            # build
            with vta.build_config():
                if use_imm:
                    mod = vta.build(s, [a, res], "ext_dev", env.target_host)
                else:
                    mod = vta.build(s, [a, b, res], "ext_dev", env.target_host)
            temp = utils.tempdir()
            mod.save(temp.relpath("load_act.o"))
            remote.upload(temp.relpath("load_act.o"))
            f = remote.load_module("load_act.o")
            # verify
            dev = remote.ext_dev(0)
            a_np = np.random.randint(-16, 16, size=(m, n, env.BATCH, env.BLOCK_OUT)).astype(a.dtype)
            if use_imm:
                res_np = np_op(a_np, imm) if np_op else tvm_op(a_np, imm)
            else:
                b_np = np.random.randint(-16, 16, size=(m, n, env.BATCH, env.BLOCK_OUT)).astype(
                    b.dtype
                )
                res_np = np_op(a_np, b_np) if np_op else tvm_op(a_np, b_np)
            res_np = res_np.astype(res.dtype)
            a_nd = tvm.nd.array(a_np, dev)
            res_nd = tvm.nd.array(np.zeros((m, n, env.BATCH, env.BLOCK_OUT)).astype(res.dtype), dev)

            if env.TARGET in ["sim", "tsim"]:
                simulator.clear_stats()

            if use_imm:
                f(a_nd, res_nd)
            else:
                b_nd = tvm.nd.array(b_np, dev)
                f(a_nd, b_nd, res_nd)

            np.testing.assert_equal(res_np, res_nd.numpy())

            if env.TARGET in ["sim", "tsim"]:
                sim_stats = simulator.stats()
                print("ALU {} execution statistics:".format(test_name))
                for k, v in sim_stats.items():
                    print("\t{:<16}: {:>16}".format(k, v))