Example #1
def _lower(mod,
           target,
           params):
    """ Helper to lower VTA properly.
    """
    # pylint: disable=import-outside-toplevel
    import tvm  # for catching tvm.TVMError below
    from tvm import relay
    from tvm.relay.backend import graph_runtime_codegen

    if hasattr(target, 'device_name') and target.device_name == "vta":
        import vta
        with vta.build_config(opt_level=3, disabled_pass={"AlterOpLayout"}):
            mod, _ = relay.optimize(mod, target, params)
            grc = graph_runtime_codegen.GraphRuntimeCodegen(None, target)
            grc.codegen(mod["main"])
            return

    # default case
    # Try graph codegen first to extract autotvm tasks.
    # If failed to compile, then fallback to use VM compiler.
    # TODO: Currently VM compiler is likely to stack overflow for large models.
    try:
        opt_mod, _ = relay.optimize(mod, target, params)
        grc = graph_runtime_codegen.GraphRuntimeCodegen(None, target)
        grc.codegen(opt_mod["main"])
    except tvm.TVMError:
        compiler = relay.vm.VMCompiler()
        if params:
            compiler.set_params(params)
        compiler.lower(mod, target=target)
Example #2
def _lower(mod, target, params):
    """Helper to lower VTA properly."""
    # pylint: disable=import-outside-toplevel
    import tvm  # for catching tvm.TVMError below
    from copy import deepcopy  # clone the module before optimization mutates it
    from tvm import relay
    from tvm.relay.backend import graph_executor_codegen

    if hasattr(target, "device_name") and target.device_name == "vta":
        import vta

        with vta.build_config(opt_level=3, disabled_pass={"AlterOpLayout"}):
            mod, _ = relay.optimize(mod, target, params)
            grc = graph_executor_codegen.GraphExecutorCodegen(None, target)
            grc.codegen(mod["main"])
            return

    # default case
    # Try graph codegen first to extract autotvm tasks.
    # If failed to compile, then fallback to use VM compiler.
    # TODO: Currently VM compiler is likely to stack overflow for large models.
    try:
        # TODO(jwfromm) Remove this once AlterOpLayout bug that mutates
        # source module is fixed. Until then, create a clone.
        mod_clone = deepcopy(mod)
        opt_mod, _ = relay.optimize(mod_clone, target, params)
        grc = graph_executor_codegen.GraphExecutorCodegen(None, target)
        grc.codegen(opt_mod["main"])
    except tvm.TVMError as e:
        print("Get errors with GraphExecutorCodegen for task extraction. "
              "Fallback to VMCompiler. Error details:\n%s" % str(e))
        mod_clone = deepcopy(mod)
        compiler = relay.vm.VMCompiler()
        if params:
            compiler.set_params(params)
        compiler.lower(mod_clone, target=target)
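In the TVM codebase these _lower helpers are invoked indirectly during AutoTVM task extraction rather than called by hand. A minimal sketch of that surrounding call, assuming a Relay module mod, its params, and a VTA target already exist (none of these are defined in the snippet), mirroring the usage shown in Example #20 below:

from tvm import autotvm, relay

# Task extraction walks the program, lowers it with a helper like _lower,
# and collects one tuning task per matched operator (conv2d in this sketch).
tasks = autotvm.task.extract_from_program(
    mod, params=params, ops=(relay.op.nn.conv2d,), target=target
)
print("extracted %d tasks" % len(tasks))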
Example #3
    def _run(env, remote):
        m = 8
        n = 10
        # compute
        a = tvm.placeholder((m, n, env.BATCH, env.BLOCK_OUT),
                            name="a",
                            dtype=env.acc_dtype)
        a_buf = tvm.compute((m, n, env.BATCH, env.BLOCK_OUT), lambda *i: a(*i),
                            "a_buf")  # DRAM->SRAM
        max_buf = tvm.compute((m, n, env.BATCH, env.BLOCK_OUT),
                              lambda *i: tvm.max(a_buf(*i), 0),
                              "res_buf")  # relu
        min_buf = tvm.compute((m, n, env.BATCH, env.BLOCK_OUT),
                              lambda *i: tvm.min(max_buf(*i),
                                                 (1 <<
                                                  (env.INP_WIDTH - 1)) - 1),
                              "max_buf")  # relu
        res = tvm.compute((m, n, env.BATCH, env.BLOCK_OUT),
                          lambda *i: min_buf(*i).astype(env.inp_dtype),
                          "min_buf")  # SRAM->DRAM
        # schedule
        s = tvm.create_schedule(res.op)
        s[a_buf].set_scope(env.acc_scope)  # SRAM
        s[a_buf].pragma(a_buf.op.axis[0], env.dma_copy)  # DRAM->SRAM
        s[max_buf].set_scope(env.acc_scope)  # SRAM
        s[min_buf].set_scope(env.acc_scope)  # SRAM
        s[max_buf].pragma(max_buf.op.axis[0], env.alu)  # compute
        s[min_buf].pragma(min_buf.op.axis[0], env.alu)  # compute
        s[res].pragma(res.op.axis[0], env.dma_copy)  # SRAM->DRAM
        # build
        with vta.build_config():
            mod = vta.build(s, [a, res], "ext_dev", env.target_host)
        if not remote:
            return
        temp = util.tempdir()
        mod.save(temp.relpath("load_act.o"))
        remote.upload(temp.relpath("load_act.o"))
        f = remote.load_module("load_act.o")
        # verify
        ctx = remote.ext_dev(0)
        a_np = np.random.randint(-256,
                                 256,
                                 size=(m, n, env.BATCH,
                                       env.BLOCK_OUT)).astype(a.dtype)
        res_np = np.clip(a_np, 0, (1 <<
                                   (env.INP_WIDTH - 1)) - 1).astype(res.dtype)
        a_nd = tvm.nd.array(a_np, ctx)
        res_nd = tvm.nd.array(
            np.zeros((m, n, env.BATCH, env.BLOCK_OUT)).astype(res.dtype), ctx)

        if env.TARGET == "tsim":
            simulator.tsim_init("libvta_hw")

        f(a_nd, res_nd)

        np.testing.assert_equal(res_np, res_nd.asnumpy())

        if env.TARGET == "tsim":
            print("Relu test took {} clock cycles".format(
                simulator.tsim_cycles()))
Example #4
    def _run(env, remote):
        # declare
        n = 21
        m = 20
        pad_before = [0, 1, 0, 0]
        pad_after = [1, 3, 0, 0]
        x = tvm.placeholder((n, m, env.BATCH, env.BLOCK_OUT),
                            name="x",
                            dtype=env.acc_dtype)
        x_buf = topi.nn.pad(x, pad_before, pad_after, name="y")
        # insert no-op that won't be optimized away
        y_buf = tvm.compute(
            (n + pad_before[0] + pad_after[0],
             m + pad_before[1] + pad_after[1], env.BATCH, env.BLOCK_OUT),
            lambda *i: x_buf(*i) >> 0, "y_buf")
        y = tvm.compute(
            (n + pad_before[0] + pad_after[0],
             m + pad_before[1] + pad_after[1], env.BATCH, env.BLOCK_OUT),
            lambda *i: y_buf(*i).astype(env.inp_dtype), "y")
        # schedule
        s = tvm.create_schedule(y.op)
        s[x_buf].set_scope(env.acc_scope)
        s[x_buf].pragma(x_buf.op.axis[0], env.dma_copy)
        s[y_buf].set_scope(env.acc_scope)
        s[y_buf].pragma(y_buf.op.axis[0], env.alu)
        s[y].pragma(y.op.axis[0], env.dma_copy)
        # build
        with vta.build_config():
            mod = vta.build(s, [x, y], "ext_dev", env.target_host)

        if not remote:
            return
        temp = util.tempdir()
        mod.save(temp.relpath("padded_load.o"))
        remote.upload(temp.relpath("padded_load.o"))
        f = remote.load_module("padded_load.o")
        # verify
        ctx = remote.ext_dev(0)
        x_np = np.random.randint(1, 2, size=(n, m, env.BATCH,
                                             env.BLOCK_OUT)).astype(x.dtype)
        y_np = np.zeros((n + pad_before[0] + pad_after[0],
                         m + pad_before[1] + pad_after[1], env.BATCH,
                         env.BLOCK_OUT)).astype(y.dtype)
        y_np[pad_before[0]:pad_before[0] + n,
             pad_before[1]:pad_before[1] + m, :] = x_np
        x_nd = tvm.nd.array(x_np, ctx)
        y_nd = tvm.nd.empty(y_np.shape, ctx=ctx, dtype=y_np.dtype)

        if env.TARGET in ["sim", "tsim"]:
            simulator.clear_stats()

        f(x_nd, y_nd)

        np.testing.assert_equal(y_np, y_nd.asnumpy())

        if env.TARGET in ["sim", "tsim"]:
            sim_stats = simulator.stats()
            print("Padded load execution statistics:")
            for k, v in sim_stats.items():
                print("\t{:<16}: {:>16}".format(k, v))
Example #5
    def _run(env, remote):
        m = 8
        n = 10
        # compute
        a = te.placeholder((m, n, env.BATCH, env.BLOCK_OUT), name="a", dtype=env.acc_dtype)
        a_buf = te.compute(
            (m, n, env.BATCH, env.BLOCK_OUT), lambda *i: a(*i), "a_buf"
        )  # DRAM->SRAM
        max_buf = te.compute(
            (m, n, env.BATCH, env.BLOCK_OUT), lambda *i: tvm.te.max(a_buf(*i), 0), "res_buf"
        )  # relu
        min_buf = te.compute(
            (m, n, env.BATCH, env.BLOCK_OUT),
            lambda *i: tvm.te.min(max_buf(*i), (1 << (env.INP_WIDTH - 1)) - 1),
            "max_buf",
        )  # relu
        res = te.compute(
            (m, n, env.BATCH, env.BLOCK_OUT),
            lambda *i: min_buf(*i).astype(env.inp_dtype),
            "min_buf",
        )  # SRAM->DRAM
        # schedule
        s = te.create_schedule(res.op)
        s[a_buf].set_scope(env.acc_scope)  # SRAM
        s[a_buf].pragma(a_buf.op.axis[0], env.dma_copy)  # DRAM->SRAM
        s[max_buf].set_scope(env.acc_scope)  # SRAM
        s[min_buf].set_scope(env.acc_scope)  # SRAM
        s[max_buf].pragma(max_buf.op.axis[0], env.alu)  # compute
        s[min_buf].pragma(min_buf.op.axis[0], env.alu)  # compute
        s[res].pragma(res.op.axis[0], env.dma_copy)  # SRAM->DRAM
        # build
        with vta.build_config():
            mod = vta.build(s, [a, res], "ext_dev", env.target_host)
        if not remote:
            return
        temp = utils.tempdir()
        mod.save(temp.relpath("load_act.o"))
        remote.upload(temp.relpath("load_act.o"))
        f = remote.load_module("load_act.o")
        # verify
        dev = remote.ext_dev(0)
        a_np = np.random.randint(-256, 256, size=(m, n, env.BATCH, env.BLOCK_OUT)).astype(a.dtype)
        res_np = np.clip(a_np, 0, (1 << (env.INP_WIDTH - 1)) - 1).astype(res.dtype)
        a_nd = tvm.nd.array(a_np, dev)
        res_nd = tvm.nd.array(np.zeros((m, n, env.BATCH, env.BLOCK_OUT)).astype(res.dtype), dev)

        if env.TARGET in ["sim", "tsim"]:
            simulator.clear_stats()

        f(a_nd, res_nd)

        np.testing.assert_equal(res_np, res_nd.numpy())

        if env.TARGET in ["sim", "tsim"]:
            sim_stats = simulator.stats()
            print("Relu execution statistics:")
            for k, v in sim_stats.items():
                print("\t{:<16}: {:>16}".format(k, v))
Example #6
def conv_normal(print_ir):
    print("----- CONV2D End-to-End Test-------")
    with vta.build_config():
        s = vta.top.schedule_packed_conv2d([res])
        if print_ir:
            print(vta.lower(s, [data, kernel, bias, res], simple_mode=True))
    cost = verify(s, True)
    gops = (num_ops / cost.mean) / float(10 ** 9)
    print("\tTime cost = %g sec/op, %g GOPS" % (cost.mean, gops))
Example #7
def build_model(model_name, remote, target, ctx, vta_env):
    """Build the inference graph runtime."""
    # Load pre-configured AutoTVM schedules.
    with autotvm.tophub.context(target):
        # Populate the shape and data type dictionary for ResNet input.
        dtype_dict = {'data': 'float32'}
        shape_dict = {'data': (vta_env.BATCH, 3, 224, 224)}

        # Get off-the-shelf gluon model and convert to Relay.
        gluon_model = vision.get_model(model_name, pretrained=True)

        # Start frontend compilation.
        mod, params = relay.frontend.from_mxnet(gluon_model, shape_dict)

        # Update shape and type dictionary.
        shape_dict.update({k: v.shape for k, v in params.items()})
        dtype_dict.update({k: str(v.dtype) for k, v in params.items()})

        # Perform quantization in Relay.
        with relay.quantize.qconfig(global_scale=8.0, skip_conv_layers=[0]):
            relay_prog = relay.quantize.quantize(mod['main'], params=params)

        # Perform graph packing and constant folding for VTA target.
        if target.device_name == 'vta':
            assert vta_env.BLOCK_IN == vta_env.BLOCK_OUT
            relay_prog = graph_pack(relay_prog,
                                    vta_env.BATCH,
                                    vta_env.BLOCK_OUT,
                                    vta_env.WGT_WIDTH,
                                    start_name=START_PACK,
                                    stop_name=STOP_PACK)

        # Compile Relay program with AlterOpLayout disabled.
        with relay.build_config(opt_level=3, disabled_pass={'AlterOpLayout'}):
            if target.device_name == 'vta':
                with vta.build_config():
                    graph, lib, params = relay.build(
                        relay_prog,
                        target=vta_env.target,
                        params=params,
                        target_host=vta_env.target_host)
            else:
                graph, lib, params = relay.build(
                    relay_prog,
                    target=target,
                    params=params,
                    target_host=vta_env.target_host)

        # Send the inference library over to the remote RPC server
        temp = util.tempdir()
        lib.save(temp.relpath('graphlib.o'))
        remote.upload(temp.relpath('graphlib.o'))
        lib = remote.load_module('graphlib.o')

        graph_module = graph_runtime.create(graph, lib, ctx)
        graph_module.set_input(**params)
        return graph_module
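build_model returns a graph runtime module with its parameters already bound. A short usage sketch, assuming the module and vta_env from above; the input name "data" and the ImageNet shape follow shape_dict, while everything else is illustrative:

import numpy as np

# Run a single inference on random data and read back the top-1 class.
image = np.random.uniform(size=(vta_env.BATCH, 3, 224, 224)).astype("float32")
graph_module.set_input("data", image)
graph_module.run()
top1 = graph_module.get_output(0).asnumpy().argmax(axis=1)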
Example #8
def generate_graph(graph_fn, params_fn, device="vta"):
    # Measure build start time
    build_start = time.time()

    # Derive the TVM target
    target = tvm.target.create("llvm -device={}".format(device))

    # Derive the LLVM compiler flags
    # When targeting the Pynq, cross-compile to the ARMv7 ISA
    if env.TARGET == "sim":
        target_host = "llvm"
    elif env.TARGET == "pynq":
        target_host = "llvm -mtriple=armv7-none-linux-gnueabihf -mcpu=cortex-a9 -mattr=+neon"

    # Load the ResNet-18 graph and parameters
    sym = nnvm.graph.load_json(open(graph_fn).read())
    params = nnvm.compiler.load_param_dict(open(params_fn, 'rb').read())

    # Populate the shape and data type dictionary
    shape_dict = {"data": (1, 3, 224, 224)}
    dtype_dict = {"data": 'float32'}
    shape_dict.update({k: v.shape for k, v in params.items()})
    dtype_dict.update({k: str(v.dtype) for k, v in params.items()})

    # Apply NNVM graph optimization passes
    sym = vta.graph.clean_cast(sym)
    sym = vta.graph.clean_conv_fuse(sym)
    if target.device_name == "vta":
        assert env.BLOCK_IN == env.BLOCK_OUT
        sym = vta.graph.pack(sym, shape_dict, env.BATCH, env.BLOCK_OUT)

    # Compile NNVM graph
    with nnvm.compiler.build_config(opt_level=3):
        if target.device_name != "vta":
            graph, lib, params = nnvm.compiler.build(
                sym, target, shape_dict, dtype_dict,
                params=params, target_host=target_host)
        else:
            with vta.build_config():
                graph, lib, params = nnvm.compiler.build(
                    sym, target, shape_dict, dtype_dict,
                    params=params, target_host=target_host)

    # Save the compiled inference graph library
    assert tvm.module.enabled("rpc")
    temp = util.tempdir()
    lib.save(temp.relpath("graphlib.o"))

    # Send the inference library over to the remote RPC server
    remote.upload(temp.relpath("graphlib.o"))
    lib = remote.load_module("graphlib.o")

    # Measure build time
    build_time = time.time() - build_start
    print("ResNet-18 inference graph built in {0:.2f}s!".format(build_time))

    return graph, lib, params
Example #9
    def _run(env, remote):
        n = 6
        x = tvm.placeholder(
            (n, n, env.BATCH, env.BLOCK_OUT),
            name="x",
            dtype=env.acc_dtype)
        x_buf = tvm.compute(
            (n, n, env.BATCH, env.BLOCK_OUT),
            lambda *i: x(*i), "x_buf")
        # insert no-op that won't be optimized away
        y_buf = tvm.compute(
            (n, n, env.BATCH, env.BLOCK_OUT),
            lambda *i: x_buf(*i)>>0, "y_buf")
        y = tvm.compute(
            (n, n, env.BATCH, env.BLOCK_OUT),
            lambda *i: y_buf(*i).astype(env.inp_dtype), "y")
        # schedule
        s = tvm.create_schedule(y.op)
        s[x_buf].set_scope(env.acc_scope)
        s[x_buf].pragma(x_buf.op.axis[0], env.dma_copy)
        s[y_buf].set_scope(env.acc_scope)
        s[y_buf].pragma(y_buf.op.axis[0], env.alu)
        s[y].pragma(y.op.axis[0], env.dma_copy)

        # verification
        with vta.build_config():
            m = vta.build(s, [x, y], "ext_dev", env.target_host)

        if not remote:
            return
        temp = util.tempdir()
        m.save(temp.relpath("load_act.o"))
        remote.upload(temp.relpath("load_act.o"))
        f = remote.load_module("load_act.o")
        # verify
        ctx = remote.ext_dev(0)
        x_np = np.random.randint(
            1, 10, size=(n, n, env.BATCH, env.BLOCK_OUT)).astype(x.dtype)
        y_np = x_np.astype(y.dtype)
        x_nd = tvm.nd.array(x_np, ctx)
        y_nd = tvm.nd.empty(y_np.shape, ctx=ctx, dtype=y_np.dtype)

        if env.TARGET in ["sim", "tsim"]:
            simulator.clear_stats()

        f(x_nd, y_nd)

        np.testing.assert_equal(y_np, y_nd.asnumpy())

        if env.TARGET in ["sim", "tsim"]:
            sim_stats = simulator.stats()
            print("Save load execution statistics:")
            for k, v in sim_stats.items():
                print("\t{:<16}: {:>16}".format(k, v))
Example #10
def conv_normal(print_ir):
    print("----- CONV2D End-to-End Test-------")
    with vta.build_config():
        s = vta.top.schedule_packed_conv2d([res])
        if print_ir:
            print(
                vta.lower(s, [data, kernel, bias, res],
                          simple_mode=True))
    cost = verify(s, True)
    gops = (num_ops / cost.mean) / float(10**9)
    print("\tTime cost = %g sec/op, %g GOPS" % (cost.mean, gops))
Example #11
def _build(func, target, target_host, params):
    """ Helper to build VTA properly.
    """

    from tvm import relay

    if hasattr(target, 'device_name') and target.device_name == "vta":
        with relay.build_config(opt_level=3, disabled_pass={"AlterOpLayout"}):
            import vta
            with vta.build_config():
                return relay.build(func, target, target_host, params)
    # default case
    return relay.build(func, target, target_host, params)
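For context, a sketch of how a helper like _build might be driven; the Relay function func and its params are assumed to exist already, and env comes from vta.get_env() as in the other examples:

import vta

# Build for the VTA target defined by the active VTA configuration.
env = vta.get_env()
graph, lib, params = _build(func, env.target, env.target_host, params)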
Example #12
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--network-arch',
                        type=argparse.FileType('r'),
                        default='network.arch')
    parser.add_argument('--af-params', default='w2l_params_af.bin')
    parser.add_argument('--nfeat', type=int, default=40)
    parser.add_argument('--nlabel', type=int, default=30)
    parser.add_argument('--max-inp-len', type=int, default=741)
    # input len of librispeech: mean=741, median=577, max=3494, std=515. heavily tailed.
    parser.add_argument('--no-params', action='store_true')
    parser.add_argument('--device', choices=('vta', 'vtacpu'), default='vta')
    args = parser.parse_args()

    net = make_net(args.network_arch, args.nfeat, args.nlabel,
                   args.max_inp_len)
    params = make_params(net, args.af_params) if not args.no_params else None

    with relay.build_config(opt_level=3):
        if args.device == 'vta':
            net = vta.graph.clean_cast(net)
            net = vta.graph.clean_conv_fuse(net)
            # net = vta.graph.pack(net, ...)  # ???
            # Keep a handle to the VTA build config so the same context
            # object entered here can be exited after relay.build.
            vta_config = vta.build_config()
            vta_config.__enter__()

        graph, lib, params = relay.build(net,
                                         params=params,
                                         target=f'llvm -device={args.device}',
                                         target_host='llvm')

        if args.device == 'vta':
            vta_config.__exit__(None, None, None)

    lib.export_library('wav2letter2.so')
    with open('wav2letter2.json', 'w') as f_graph_json:
        f_graph_json.write(graph)
    with open('wav2letter2.params', 'wb') as f_params:
        f_params.write(nnvm.compiler.save_param_dict(params))
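Driving __enter__ and __exit__ by hand is fragile. A sketch of the more conventional arrangement used elsewhere in these examples, nesting the VTA config as a with block around relay.build; net, params, and args are the same names as above:

with relay.build_config(opt_level=3):
    if args.device == 'vta':
        # The VTA pass context is entered and exited automatically.
        with vta.build_config():
            graph, lib, params = relay.build(net,
                                             params=params,
                                             target=f'llvm -device={args.device}',
                                             target_host='llvm')
    else:
        graph, lib, params = relay.build(net,
                                         params=params,
                                         target=f'llvm -device={args.device}',
                                         target_host='llvm')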
Example #13
        def verify(s, name=None):
            # Build with the CSE pass disabled as otherwise it would complicate the test
            with vta.build_config(disabled_pass={"tir.CommonSubexprElimTIR"}):
                mod = vta.build(
                    s, [x, w, y],
                    tvm.target.Target("ext_dev", host=env.target_host))
            temp = utils.tempdir()
            mod.save(temp.relpath("gemm.o"))
            remote.upload(temp.relpath("gemm.o"))
            f = remote.load_module("gemm.o")
            # verify
            dev = remote.ext_dev(0)
            x_np = np.random.randint(-128,
                                     128,
                                     size=(o, n, env.BATCH,
                                           env.BLOCK_IN)).astype(x.dtype)
            w_np = np.random.randint(-128,
                                     128,
                                     size=(m, n, env.BLOCK_OUT,
                                           env.BLOCK_IN)).astype(w.dtype)
            y_np = np.zeros((o, m, env.BATCH, env.BLOCK_OUT)).astype(y.dtype)
            x_nd = tvm.nd.array(x_np, dev)
            w_nd = tvm.nd.array(w_np, dev)
            y_nd = tvm.nd.array(y_np, dev)
            y_np = y_np.astype(env.acc_dtype)
            for b in range(o):
                for i in range(m):
                    for j in range(n):
                        y_np[b, i, :] += np.dot(
                            x_np[b, j, :].astype(env.acc_dtype),
                            w_np[i, j].T.astype(env.acc_dtype))
            y_np = np.right_shift(y_np, 8)
            y_np = np.clip(y_np, 0, (1 <<
                                     (env.INP_WIDTH - 1)) - 1).astype(y.dtype)

            if env.TARGET in ["sim", "tsim"]:
                simulator.clear_stats()

            f(x_nd, w_nd, y_nd)

            np.testing.assert_equal(y_np, y_nd.numpy())

            if env.TARGET in ["sim", "tsim"]:
                sim_stats = simulator.stats()
                print("GEMM schedule:{} execution statistics:".format(name))
                for k, v in sim_stats.items():
                    print("\t{:<16}: {:>16}".format(k, v))
Example #14
def _lower(mod, target, params):
    """ Helper to lower VTA properly.
    """
    # pylint: disable=import-outside-toplevel
    from tvm import relay
    from tvm.relay.backend import graph_runtime_codegen

    if hasattr(target, 'device_name') and target.device_name == "vta":
        with relay.build_config(opt_level=3, disabled_pass={"AlterOpLayout"}):
            import vta
            with vta.build_config():
                mod, _ = relay.optimize(mod, target, params)
                grc = graph_runtime_codegen.GraphRuntimeCodegen(None, target)
                grc.codegen(mod["main"])
    # default case
    compiler = relay.vm.VMCompiler()
    if params:
        compiler.set_params(params)
    compiler.lower(mod, target=target)
Example #15
def _lower(func,
           target,
           params):
    """ Helper to lower VTA properly.
    """

    from tvm import relay
    from tvm.relay.backend import graph_runtime_codegen

    if hasattr(target, 'device_name') and target.device_name == "vta":
        with relay.build_config(opt_level=3, disabled_pass={"AlterOpLayout"}):
            import vta
            with vta.build_config():
                mod, _ = relay.optimize(func, target, params)
                grc = graph_runtime_codegen.GraphRuntimeCodegen(None, target)
                return grc.codegen(mod["main"])
    # default case
    mod, _ = relay.optimize(func, target, params)
    grc = graph_runtime_codegen.GraphRuntimeCodegen(None, target)
    return grc.codegen(mod["main"])
Example #16
def compile_mxnet_gulon_resnet(_env, _model):
    """ Compile Model """
    # Generate tvm IR from mxnet gluon model
    # Populate the shape and data type dictionary for ImageNet classifier input
    dtype_dict = {"data": 'float32'}
    shape_dict = {"data": (_env.BATCH, 3, 224, 224)}
    # Get an off-the-shelf Gluon model and convert it to Relay
    gluon_model = vision.get_model(_model, pretrained=True)
    # Start front end compilation
    mod, params = relay.frontend.from_mxnet(gluon_model, shape_dict)
    mod = merge_transform_to_mxnet_model(mod)
    # Update shape and type dictionary
    shape_dict.update({k: v.shape for k, v in params.items()})
    dtype_dict.update({k: str(v.dtype) for k, v in params.items()})

    # Load pre-configured AutoTVM schedules
    with autotvm.tophub.context(_env.target):
        # Perform quantization in Relay
        # Note: We set opt_level to 3 in order to fold batch norm
        with relay.build_config(opt_level=3):
            with relay.quantize.qconfig(global_scale=8.0,
                                        skip_conv_layers=[0]):
                mod = relay.quantize.quantize(mod, params=params)
            # Perform graph packing and constant folding for VTA target
            relay_prog = graph_pack(mod["main"],
                                    _env.BATCH,
                                    _env.BLOCK_IN,
                                    _env.WGT_WIDTH,
                                    start_name=PACK_DICT[_model][0],
                                    stop_name=PACK_DICT[_model][1])

    # Compile Relay program with AlterOpLayout disabled
    with relay.build_config(opt_level=3, disabled_pass={"AlterOpLayout"}):
        with vta.build_config(debug_flag=0):
            graph, lib, params = relay.build(relay_prog,
                                             target=_env.target,
                                             params=params,
                                             target_host=_env.target_host)

    return graph, lib, params
Example #17
def extract_tasks(sym, params, target, target_host):
    # Populate the shape and data type dictionary
    shape_dict = {"data": (1, 3, 224, 224)}
    dtype_dict = {"data": 'float32'}
    shape_dict.update({k: v.shape for k, v in params.items()})
    dtype_dict.update({k: str(v.dtype) for k, v in params.items()})

    # Apply NNVM graph optimization passes
    sym = vta.graph.clean_cast(sym)
    sym = vta.graph.clean_conv_fuse(sym)
    assert env.BLOCK_IN == env.BLOCK_OUT
    sym = vta.graph.pack(sym, shape_dict, env.BATCH, env.BLOCK_OUT)

    with vta.build_config():
        tasks = autotvm.task.extract_from_graph(graph=sym,
                                                shape=shape_dict,
                                                dtype=dtype_dict,
                                                target=target,
                                                params=params,
                                                symbols=(nnvm.sym.conv2d, ),
                                                target_host=target_host)
    return tasks
Example #18
def _lower(mod, target, params, opt_level=3):
    """Helper to lower VTA properly."""
    # pylint: disable=import-outside-toplevel
    import tvm  # for tvm.transform.PassContext below
    from tvm import relay
    from tvm.relay.backend import graph_executor_codegen

    if hasattr(target, "device_name") and target.device_name == "vta":
        import vta

        with vta.build_config(opt_level=opt_level, disabled_pass={"AlterOpLayout"}):
            mod, _ = relay.optimize(mod, target, params)
            grc = graph_executor_codegen.GraphExecutorCodegen(None, target)
            grc.codegen(mod, mod["main"])
            return

    # Alter op layout code has been written expecting that tuning is applied
    # without it, so we disable AlterOpLayout to maintain that behavior.
    with tvm.transform.PassContext(opt_level=opt_level, disabled_pass={"AlterOpLayout"}):
        compiler = relay.vm.VMCompiler()
        if params:
            compiler.set_params(params)
        compiler.lower(mod, target=target)
Example #19
def generate_graph(sym, params, target, target_host):
    # Populate the shape and data type dictionary
    shape_dict = {"data": (1, 3, 224, 224)}
    dtype_dict = {"data": 'float32'}
    shape_dict.update({k: v.shape for k, v in params.items()})
    dtype_dict.update({k: str(v.dtype) for k, v in params.items()})

    # Apply NNVM graph optimization passes
    sym = vta.graph.clean_cast(sym)
    sym = vta.graph.clean_conv_fuse(sym)
    assert env.BLOCK_IN == env.BLOCK_OUT
    sym = vta.graph.pack(sym, shape_dict, env.BATCH, env.BLOCK_OUT)

    # Compile NNVM graph
    with nnvm.compiler.build_config(opt_level=3):
        with vta.build_config():
            graph, lib, params = nnvm.compiler.build(sym,
                                                     target,
                                                     shape_dict,
                                                     dtype_dict,
                                                     params=params,
                                                     target_host=target_host)

    return graph, lib, params
Example #20
def tune_and_evaluate(tuning_opt):

    if env.TARGET != "sim":
        # Get remote from fleet node
        remote = autotvm.measure.request_remote(env.TARGET,
                                                tracker_host,
                                                tracker_port,
                                                timeout=10000)
        # Reconfigure the JIT runtime and FPGA.
        vta.reconfig_runtime(remote)
        vta.program_fpga(remote, bitstream=None)
    else:
        # In simulation mode, host the RPC server locally.
        remote = rpc.LocalSession()

    # Register VTA tuning tasks
    register_vta_tuning_tasks()

    # Perform task extraction on Relay program
    print("Extract tasks...")
    relay_prog, params = compile_network(env, target, network, start_pack,
                                         stop_pack)
    mod = tvm.IRModule.from_expr(relay_prog)
    tasks = autotvm.task.extract_from_program(mod,
                                              params=params,
                                              ops=(tvm.relay.op.nn.conv2d, ),
                                              target=target,
                                              target_host=env.target_host)

    # We should have extracted 10 convolution tasks
    assert len(tasks) == 10
    print("Extracted {} conv2d tasks:".format(len(tasks)))
    for tsk in tasks:
        inp = tsk.args[0][1]
        wgt = tsk.args[1][1]
        batch = inp[0] * inp[4]
        in_filter = inp[1] * inp[5]
        out_filter = wgt[0] * wgt[4]
        height, width = inp[2], inp[3]
        hkernel, wkernel = wgt[2], wgt[3]
        hstride, wstride = tsk.args[2][0], tsk.args[2][1]
        hpad, wpad = tsk.args[3][0], tsk.args[3][1]
        print("({}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {})".format(
            batch, height, width, in_filter, out_filter, hkernel, wkernel,
            hpad, wpad, hstride, wstride))

    # We do not run the tuning in our webpage server since it takes too long.
    # Comment the following line to run it by yourself.
    return

    # run tuning tasks
    print("Tuning...")
    tune_tasks(tasks, **tuning_opt)

    # compile kernels with history best records
    with autotvm.tophub.context(target, extra_files=[log_file]):
        # Compile network
        print("Compile...")
        with relay.build_config(opt_level=3, disabled_pass={"AlterOpLayout"}):
            if target.device_name != "vta":
                graph, lib, params = relay.build(relay_prog,
                                                 target=target,
                                                 params=params,
                                                 target_host=env.target_host)
            else:
                with vta.build_config():
                    graph, lib, params = relay.build(
                        relay_prog,
                        target=target,
                        params=params,
                        target_host=env.target_host)

        # Export library
        print("Upload...")
        temp = util.tempdir()
        lib.save(temp.relpath("graphlib.o"))
        remote.upload(temp.relpath("graphlib.o"))
        lib = remote.load_module("graphlib.o")

        # Generate the graph runtime
        ctx = remote.ext_dev(0) if device == "vta" else remote.cpu(0)
        m = graph_runtime.create(graph, lib, ctx)

        # upload parameters to device
        image = tvm.nd.array(
            (np.random.uniform(size=(1, 3, 224, 224))).astype('float32'))
        m.set_input(**params)
        m.set_input('data', image)

        # evaluate
        print("Evaluate inference time cost...")
        timer = m.module.time_evaluator("run", ctx, number=1, repeat=10)
        tcost = timer()
        prof_res = np.array(tcost.results) * 1000  # convert to millisecond
        print("Mean inference time (std dev): %.2f ms (%.2f ms)" %
              (np.mean(prof_res), np.std(prof_res)))
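tune_and_evaluate receives a tuning_opt dictionary that configures tune_tasks. A hedged sketch of what such a dictionary commonly looks like in the VTA autotuning tutorials; the concrete values and the tracker variables are illustrative assumptions, not taken from the snippet:

tuning_opt = {
    "log_filename": log_file,       # AutoTVM records written here
    "tuner": "random",              # tuner strategy
    "n_trial": 1000,                # measurement trials per task
    "early_stopping": None,
    "measure_option": autotvm.measure_option(
        builder=autotvm.LocalBuilder(),
        runner=autotvm.RPCRunner(
            env.TARGET,
            host=tracker_host,
            port=tracker_port,
            number=5,
            timeout=60,
            check_correctness=True,
        ),
    ),
}
tune_and_evaluate(tuning_opt)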
Example #21
                    check_correctness=True))
    }
    tune_tasks(tasks, **tuning_opt)

    # Compile kernels with history best records
    with autotvm.tophub.context(target, extra_files=[opt.log_filename]): 

        # Compile network
        print("Compiling network with best tuning parameters...")
        with relay.build_config(opt_level=3, disabled_pass={"AlterOpLayout"}):
            if target.device_name != "vta":
                graph, lib, params = relay.build(
                    relay_prog, target=target,
                    params=params, target_host=env.target_host)
            else:
                with vta.build_config():
                    graph, lib, params = relay.build(
                        relay_prog, target=target,
                        params=params, target_host=env.target_host)

        # Export library
        temp = util.tempdir()
        lib.save(temp.relpath("graphlib.o"))
        remote.upload(temp.relpath("graphlib.o"))
        lib = remote.load_module("graphlib.o")

        # If detailed runtime info is needed build with debug runtime
        if opt.debug_profile:
            m = debug_runtime.create(graph, lib, ctx)
        else:
            m = graph_runtime.create(graph, lib, ctx)
Example #22
def generate_graph(graph_fn, params_fn, device="vta"):
    # Measure build start time
    build_start = time.time()

    # Derive the TVM target
    target = tvm.target.create("llvm -device={}".format(device))

    # Derive the LLVM compiler flags
    # When targeting the Pynq, cross-compile to the ARMv7 ISA
    if env.TARGET == "sim":
        target_host = "llvm"
    elif env.TARGET == "pynq":
        target_host = "llvm -mtriple=armv7-none-linux-gnueabihf -mcpu=cortex-a9 -mattr=+neon"

    # Load the ResNet-18 graph and parameters
    sym = nnvm.graph.load_json(open(graph_fn).read())
    params = nnvm.compiler.load_param_dict(open(params_fn, 'rb').read())

    # Populate the shape and data type dictionary
    shape_dict = {"data": (1, 3, 224, 224)}
    dtype_dict = {"data": 'float32'}
    shape_dict.update({k: v.shape for k, v in params.items()})
    dtype_dict.update({k: str(v.dtype) for k, v in params.items()})

    # Apply NNVM graph optimization passes
    sym = vta.graph.clean_cast(sym)
    sym = vta.graph.clean_conv_fuse(sym)
    if target.device_name == "vta":
        assert env.BLOCK_IN == env.BLOCK_OUT
        sym = vta.graph.pack(sym, shape_dict, env.BATCH, env.BLOCK_OUT)

    # Compile NNVM graph
    with nnvm.compiler.build_config(opt_level=3):
        if target.device_name != "vta":
            graph, lib, params = nnvm.compiler.build(sym,
                                                     target,
                                                     shape_dict,
                                                     dtype_dict,
                                                     params=params,
                                                     target_host=target_host)
        else:
            with vta.build_config():
                graph, lib, params = nnvm.compiler.build(
                    sym,
                    target,
                    shape_dict,
                    dtype_dict,
                    params=params,
                    target_host=target_host)

    # Save the compiled inference graph library
    assert tvm.module.enabled("rpc")
    temp = util.tempdir()
    lib.save(temp.relpath("graphlib.o"))

    # Send the inference library over to the remote RPC server
    remote.upload(temp.relpath("graphlib.o"))
    lib = remote.load_module("graphlib.o")

    # Measure build time
    build_time = time.time() - build_start
    print("ResNet-18 inference graph built in {0:.2f}s!".format(build_time))

    return graph, lib, params
Example #23
def run_conv2d(env,
               remote,
               wl,
               target,
               check_correctness=True,
               print_ir=False,
               samples=4):

    # Workload assertions
    assert wl.hpad == wl.wpad

    # Perform packing only if we are targeting the accelerator
    if "arm_cpu" in target.keys:
        data_pack = False
        layout = "NCHW"
        conv2d_fcompute = topi.arm_cpu.conv2d_nchw_spatial_pack
        conv2d_fschedule = topi.arm_cpu.schedule_conv2d_nchw_spatial_pack
    elif "vta" in target.keys:
        data_pack = True
        layout = "NCHW%dn%dc" % (env.BATCH, env.BLOCK_IN)
        conv2d_fcompute = vta.top.conv2d_packed
        conv2d_fschedule = vta.top.schedule_conv2d_packed

    # Derive shapes depending upon packing
    a_shape = (wl.batch, wl.in_filter, wl.height, wl.width)
    w_shape = (wl.out_filter, wl.in_filter, wl.hkernel, wl.wkernel)
    b_shape = (wl.batch, wl.out_filter, 1, 1)
    if data_pack:
        data_shape = (
            wl.batch // env.BATCH,
            wl.in_filter // env.BLOCK_IN,
            wl.height,
            wl.width,
            env.BATCH,
            env.BLOCK_IN,
        )
        kernel_shape = (
            wl.out_filter // env.BLOCK_OUT,
            wl.in_filter // env.BLOCK_IN,
            wl.hkernel,
            wl.wkernel,
            env.BLOCK_OUT,
            env.BLOCK_IN,
        )
        bias_shape = (
            wl.batch // env.BATCH,
            wl.out_filter // env.BLOCK_OUT,
            1,
            1,
            env.BATCH,
            env.BLOCK_OUT,
        )
    else:
        data_shape = a_shape
        kernel_shape = w_shape
        bias_shape = b_shape
    data = te.placeholder(data_shape, name="data", dtype=env.inp_dtype)
    kernel = te.placeholder(kernel_shape, name="kernel", dtype=env.wgt_dtype)
    bias = te.placeholder(bias_shape, name="bias", dtype=env.acc_dtype)
    padding = relay.nn.get_pad_tuple2d((wl.hpad, wl.wpad))

    # Define base computation schedule
    with target:
        if data_pack:
            res = conv2d_fcompute(data, kernel, (wl.hstride, wl.wstride),
                                  padding, (1, 1), layout, env.acc_dtype)
        else:
            res = conv2d_fcompute(data, kernel, (wl.hstride, wl.wstride),
                                  padding, (1, 1), env.acc_dtype)
        res = topi.right_shift(res, 8)
        res = topi.add(res, bias)
        res = my_clip(res, 0, (1 << env.OUT_WIDTH - 1) - 1)
        res = topi.cast(res, env.out_dtype)
        # Derive base schedule
        s = conv2d_fschedule([res])
        if print_ir:
            print(vta.lower(s, [data, kernel, bias, res], simple_mode=True))

    # Derive number of ops
    fout_height = (wl.height + 2 * wl.hpad - wl.hkernel) // wl.hstride + 1
    fout_width = (wl.width + 2 * wl.wpad - wl.wkernel) // wl.wstride + 1
    num_ops = (2 * wl.batch * fout_height * fout_width * wl.hkernel *
               wl.wkernel * wl.out_filter * wl.in_filter)

    # @memoize("vta.tests.test_benchmark_topi.conv2d.verify_nhwc")
    def get_ref_data():
        # derive min max for act, wgt, and bias types (max non inclusive)
        a_min, a_max = 0 - (1 << (env.INP_WIDTH - 1)), (1 <<
                                                        (env.INP_WIDTH - 1))
        w_min, w_max = 0 - (1 << (env.WGT_WIDTH - 1)), (1 <<
                                                        (env.WGT_WIDTH - 1))
        b_min, b_max = 0 - 1 << (env.INP_WIDTH + env.WGT_WIDTH -
                                 2), 1 << (env.INP_WIDTH + env.WGT_WIDTH - 2)
        a_np = np.random.randint(a_min, a_max, size=a_shape).astype(data.dtype)
        w_np = np.random.randint(w_min, w_max,
                                 size=w_shape).astype(kernel.dtype)
        b_np = np.random.randint(b_min, b_max,
                                 size=b_shape).astype(env.acc_dtype)
        r_np = tvm.topi.testing.conv2d_nchw_python(
            a_np.astype(env.acc_dtype),
            w_np.astype(env.acc_dtype),
            (wl.hstride, wl.wstride),
            wl.hpad,
        ).astype(env.acc_dtype)
        return a_np, w_np, b_np, r_np

    # Data in original format
    data_np, kernel_np, bias_np, res_ref = get_ref_data()
    if data_pack:
        data_np = data_np.reshape(
            wl.batch // env.BATCH,
            env.BATCH,
            wl.in_filter // env.BLOCK_IN,
            env.BLOCK_IN,
            wl.height,
            wl.width,
        ).transpose((0, 2, 4, 5, 1, 3))
        kernel_np = kernel_np.reshape(
            wl.out_filter // env.BLOCK_OUT,
            env.BLOCK_OUT,
            wl.in_filter // env.BLOCK_IN,
            env.BLOCK_IN,
            wl.hkernel,
            wl.wkernel,
        ).transpose((0, 2, 4, 5, 1, 3))
        bias_np = bias_np.reshape(wl.batch // env.BATCH,
                                  wl.out_filter // env.BLOCK_OUT, 1, 1,
                                  env.BATCH, env.BLOCK_OUT)

    # Build
    if "vta" in target.keys:
        with vta.build_config(disabled_pass={"tir.CommonSubexprElimTIR"}):
            mod = vta.build(
                s,
                [data, kernel, bias, res],
                target=tvm.target.Target(target, host=env.target_host),
                name="conv2d",
            )
    else:
        mod = tvm.build(
            s,
            [data, kernel, bias, res],
            target=tvm.target.Target(target, host=env.target_host),
            name="conv2d",
        )
    temp = utils.tempdir()
    mod.save(temp.relpath("conv2d.o"))
    remote.upload(temp.relpath("conv2d.o"))
    f = remote.load_module("conv2d.o")
    dev = remote.device(str(target))

    res_np = np.zeros(topi.utils.get_const_tuple(res.shape)).astype(res.dtype)
    data_arr = tvm.nd.array(data_np, dev)
    kernel_arr = tvm.nd.array(kernel_np, dev)
    bias_arr = tvm.nd.array(bias_np, dev)
    res_arr = tvm.nd.array(res_np, dev)
    time_f = f.time_evaluator("conv2d", dev, number=samples)

    # In vta sim mode, collect simulator runtime statistics
    stats = {}
    cost = None
    if env.TARGET in ["sim", "tsim"]:
        # Check if we're in local RPC mode (allows us to rebuild the
        # runtime on the fly when varying the VTA designs)
        local_rpc = int(os.environ.get("VTA_LOCAL_SIM_RPC", "0"))
        if local_rpc:
            if env.TARGET == "sim":
                remote.get_function("vta.simulator.profiler_clear")()
            else:
                remote.get_function("vta.tsim.profiler_clear")()
            cost = time_f(data_arr, kernel_arr, bias_arr, res_arr)
            if env.TARGET == "sim":
                stats = json.loads(
                    remote.get_function("vta.simulator.profiler_status")())
            else:
                stats = json.loads(
                    remote.get_function("vta.tsim.profiler_status")())
        else:
            simulator.clear_stats()
            cost = time_f(data_arr, kernel_arr, bias_arr, res_arr)
            stats = simulator.stats()
    else:
        cost = time_f(data_arr, kernel_arr, bias_arr, res_arr)

    # Check correctness
    correct = False
    if check_correctness:
        res_orig = res_arr.numpy()
        if data_pack:
            res_orig = res_orig.transpose(
                (0, 4, 1, 5, 2, 3)).reshape(wl.batch, wl.out_filter,
                                            fout_height, fout_width)
            bias_np = bias_np.transpose(
                (0, 4, 1, 5, 2, 3)).reshape(wl.batch, wl.out_filter, 1, 1)
        res_ref = res_ref >> env.WGT_WIDTH
        res_ref += bias_np
        res_ref = np.clip(res_ref, 0, (1 << env.OUT_WIDTH - 1) - 1)
        res_ref = res_ref.astype(env.out_dtype)
        correct = np.allclose(res_orig, res_ref)

    gops = (num_ops / cost.mean) / float(10**9)
    status = "PASSED" if correct else "FAILED"
    if "arm_cpu" in target.keys:
        device = "CPU"
    elif "vta" in target.keys:
        device = "VTA"
    print("%s CONV2D TEST %s: Time cost = %g sec/op, %g GOPS" %
          (device, status, cost.mean, gops))

    return correct, cost, stats
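run_conv2d reads the workload description through wl.* attributes. A sketch of a matching workload record; the Workload namedtuple and the ResNet-style layer sizes below are illustrative assumptions, not taken from the source:

from collections import namedtuple

# Field names mirror the wl.* attributes accessed in run_conv2d above.
Workload = namedtuple(
    "Workload",
    ["batch", "height", "width", "in_filter", "out_filter",
     "hkernel", "wkernel", "hpad", "wpad", "hstride", "wstride"],
)

wl = Workload(batch=1, height=56, width=56, in_filter=64, out_filter=64,
              hkernel=3, wkernel=3, hpad=1, wpad=1, hstride=1, wstride=1)
# correct, cost, stats = run_conv2d(env, remote, wl, target)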
Example #24
# VTA compute intrinsics.
print(vta.lower(s, [data, kernel, res], simple_mode=True))

######################################################################
# TVM Compilation and Verification
# --------------------------------
# After specifying the schedule, we can compile it into a TVM function.
# We save the module so we can send it over RPC.
# We run the function and verify it against a numpy implementation to
# ensure correctness.

# This library facilitates 2D convolution testing
from tvm.topi.testing import conv2d_nchw_python

# Compile the TVM module
with vta.build_config(disabled_pass={"tir.CommonSubexprElimTIR"}):
    my_conv = vta.build(s, [data, kernel, res],
                        tvm.target.Target("ext_dev", host=env.target_host),
                        name="my_conv")
temp = utils.tempdir()
my_conv.save(temp.relpath("conv2d.o"))
remote.upload(temp.relpath("conv2d.o"))
f = remote.load_module("conv2d.o")

# Get the remote device context
ctx = remote.ext_dev(0)

# Initialize the data and kernel arrays randomly in the int range
# of [-128, 128) in NCHW layout
data_np = np.random.randint(-128,
                            128,
Example #25
        ),
    }
    tune_tasks(tasks, **tuning_opt)

    # Compile kernels with history best records
    with autotvm.tophub.context(target, extra_files=[opt.log_filename]):

        # Compile network
        print("Compiling network with best tuning parameters...")
        if target.device_name != "vta":
            with tvm.transform.PassContext(opt_level=3, disabled_pass={"AlterOpLayout"}):
                graph, lib, params = relay.build(
                    relay_prog, target=target, params=params, target_host=env.target_host
                )
        else:
            with vta.build_config(opt_level=3, disabled_pass={"AlterOpLayout"}):
                graph, lib, params = relay.build(
                    relay_prog, target=target, params=params, target_host=env.target_host
                )

        # Export library
        temp = util.tempdir()
        lib.save(temp.relpath("graphlib.o"))
        remote.upload(temp.relpath("graphlib.o"))
        lib = remote.load_module("graphlib.o")

        # If detailed runtime info is needed build with debug runtime
        if opt.debug_profile:
            m = debug_runtime.create(graph, lib, ctx)
        else:
            m = graph_runtime.create(graph, lib, ctx)
Example #26
        def check_alu(tvm_op, np_op=None, use_imm=False):
            """Test ALU"""
            m = 8
            n = 8
            imm = np.random.randint(1, 5)
            # compute
            a = tvm.placeholder((m, n, env.BATCH, env.BLOCK_OUT),
                                name="a",
                                dtype=env.acc_dtype)
            a_buf = tvm.compute((m, n, env.BATCH, env.BLOCK_OUT),
                                lambda *i: a(*i), "a_buf")  #DRAM->SRAM
            if use_imm:
                res_buf = tvm.compute((m, n, env.BATCH, env.BLOCK_OUT),
                                      lambda *i: tvm_op(a_buf(*i), imm),
                                      "res_buf")  #compute
            else:
                b = tvm.placeholder((m, n, env.BATCH, env.BLOCK_OUT),
                                    name="b",
                                    dtype=env.acc_dtype)
                b_buf = tvm.compute((m, n, env.BATCH, env.BLOCK_OUT),
                                    lambda *i: b(*i), "b_buf")  #DRAM->SRAM
                res_buf = tvm.compute((m, n, env.BATCH, env.BLOCK_OUT),
                                      lambda *i: tvm_op(a_buf(*i), b_buf(*i)),
                                      "res_buf")  #compute5B
            res = tvm.compute((m, n, env.BATCH, env.BLOCK_OUT),
                              lambda *i: res_buf(*i).astype(env.inp_dtype),
                              "res")  #SRAM->DRAM
            # schedule
            s = tvm.create_schedule(res.op)
            s[a_buf].set_scope(env.acc_scope)  # SRAM
            s[a_buf].pragma(a_buf.op.axis[0], env.dma_copy)  # DRAM->SRAM
            s[res_buf].set_scope(env.acc_scope)  # SRAM
            s[res_buf].pragma(res_buf.op.axis[0], env.alu)  # compute
            s[res].pragma(res.op.axis[0], env.dma_copy)  # SRAM->DRAM
            if not use_imm:
                s[b_buf].set_scope(env.acc_scope)  # SRAM
                s[b_buf].pragma(b_buf.op.axis[0], env.dma_copy)  # DRAM->SRAM

            if not remote:
                return

            # build
            with vta.build_config():
                if use_imm:
                    mod = vta.build(s, [a, res], "ext_dev", env.target_host)
                else:
                    mod = vta.build(s, [a, b, res], "ext_dev", env.target_host)
            temp = util.tempdir()
            mod.save(temp.relpath("load_act.o"))
            remote.upload(temp.relpath("load_act.o"))
            f = remote.load_module("load_act.o")
            # verify
            ctx = remote.ext_dev(0)
            a_np = np.random.randint(-16,
                                     16,
                                     size=(m, n, env.BATCH,
                                           env.BLOCK_OUT)).astype(a.dtype)
            if use_imm:
                res_np = np_op(a_np, imm) if np_op else tvm_op(a_np, imm)
            else:
                b_np = np.random.randint(-16,
                                         16,
                                         size=(m, n, env.BATCH,
                                               env.BLOCK_OUT)).astype(b.dtype)
                res_np = np_op(a_np, b_np) if np_op else tvm_op(a_np, b_np)
            res_np = res_np.astype(res.dtype)
            a_nd = tvm.nd.array(a_np, ctx)
            res_nd = tvm.nd.array(
                np.zeros((m, n, env.BATCH, env.BLOCK_OUT)).astype(res.dtype),
                ctx)

            if env.TARGET == "tsim":
                simulator.tsim_init("libvta_hw")

            if use_imm:
                f(a_nd, res_nd)
            else:
                b_nd = tvm.nd.array(b_np, ctx)
                f(a_nd, b_nd, res_nd)
            np.testing.assert_equal(res_np, res_nd.asnumpy())
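check_alu is parameterized over a TVM expression op, an optional NumPy reference, and an immediate-operand flag. A sketch of typical invocations, assuming the old tvm.* expression API used in this example; the exact op pairs are illustrative:

# Each call pairs a TVM op with the NumPy reference used for verification.
check_alu(lambda x, y: x << y, np.left_shift, use_imm=True)  # shift by immediate
check_alu(tvm.max, np.maximum, use_imm=True)                 # max with immediate
check_alu(tvm.max, np.maximum)                               # element-wise max
check_alu(lambda x, y: x + y, use_imm=True)                  # add immediate
check_alu(lambda x, y: x + y)                                # element-wise add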
Example #27
        relay_prog = mod["main"]

    # Compile Relay program with AlterOpLayout disabled
    if target.device_name != "vta":
        with tvm.transform.PassContext(opt_level=3,
                                       disabled_pass={"AlterOpLayout"}):
            graph, lib, params = relay.build(relay_prog,
                                             target=tvm.target.Target(
                                                 target, host=env.target_host),
                                             params=params)
    else:
        if env.TARGET == "intelfocl":
            # multiple targets to run both on cpu and vta
            target = {"cpu": env.target_vta_cpu, "ext_dev": target}
        with vta.build_config(
                opt_level=3,
                disabled_pass={"AlterOpLayout", "tir.CommonSubexprElimTIR"}):
            graph, lib, params = relay.build(relay_prog,
                                             target=tvm.target.Target(
                                                 target, host=env.target_host),
                                             params=params)

    # Measure Relay build time
    build_time = time.time() - build_start
    print(model + " inference graph built in {0:.2f}s!".format(build_time))

    # Send the inference library over to the remote RPC server
    temp = utils.tempdir()
    lib.export_library(temp.relpath("graphlib.tar"))
    remote.upload(temp.relpath("graphlib.tar"))
    lib = remote.load_module("graphlib.tar")
Example #28
def main(model,
         start_pack,
         stop_pack,
         data_shape=(1, 3, 224, 224),
         dtype='float32'):
    # Make sure that TVM was compiled with RPC=1
    assert tvm.module.enabled("rpc")

    ######################################################################
    # Define the platform and model targets
    # -------------------------------------
    # Execute on CPU vs. VTA, and define the model.

    # Load VTA parameters from the vta/config/vta_config.json file
    env = vta.get_env()

    # Set ``device=arm_cpu`` to run inference on the CPU
    # or ``device=vta`` to run inference on the FPGA.
    device = "vta"
    target = env.target if device == "vta" else env.target_vta_cpu

    # Name of Gluon model to compile
    # The ``start_pack`` and ``stop_pack`` labels indicate where
    # to start and end the graph packing relay pass: in other words
    # where to start and finish offloading to VTA.

    ######################################################################
    # Obtain an execution remote
    # ---------------------------------
    # When target is 'pynq', reconfigure FPGA and runtime.
    # Otherwise, if target is 'sim', execute locally.
    print(f"Target is {env.TARGET}")
    if env.TARGET in ["sim", "tsim"]:
        remote = rpc.LocalSession()
    else:
        print(f"Error, incorrect target for benchmarking: {env.TARGET}")

    # Get execution context from remote
    ctx = remote.ext_dev(0) if device == "vta" else remote.cpu(0)

    ######################################################################
    # Build the inference graph runtime
    # ---------------------------------
    # Grab ResNet-18 model from Gluon model zoo and compile with Relay.
    # The compilation steps are:
    #    1) Front end translation from MxNet into Relay module.
    #    2) Apply 8-bit quantization: here we skip the first conv layer,
    #       and dense layer which will both be executed in fp32 on the CPU.
    #    3) Perform graph packing to alter the data layout for tensorization.
    #    4) Perform constant folding to reduce number of operators (e.g. eliminate
    #       batch norm multiply).
    #    5) Perform relay build to object file.
    #    6) Load the object file onto remote (FPGA device).
    #    7) Generate graph runtime, `m`.

    # Load pre-configured AutoTVM schedules
    with autotvm.tophub.context(target):

        # Populate the shape and data type dictionary for ResNet input
        dtype_dict = {"data": 'float32'}
        shape_dict = {"data": data_shape}

        # Measure build start time
        build_start = time.time()

        # Start front end compilation
        if model == 'resnet':
            mod, params = test_resnet_mxnet(env)
        elif model == 'yolo':
            mod, params = test_yolo_darknet()
        elif model == 'lenet':
            mod, params = lenet()
        elif model == 'mobilenet':
            mod, params = mobilenet()
        else:
            print(f"Error, incorrect model name: {model}")

        ### Need to bind params

        # Update shape and type dictionary
        shape_dict.update({k: v.shape for k, v in params.items()})
        dtype_dict.update({k: str(v.dtype) for k, v in params.items()})
        with relay.quantize.qconfig(global_scale=8.0, skip_conv_layers=[0]):
            relay_prog = relay.quantize.quantize(mod['main'], params=params)

        print(f"Finishing quantizing graph")
        # Perform graph packing and constant folding for VTA target
        if target.device_name == "vta":
            assert env.BLOCK_IN == env.BLOCK_OUT
            relay_prog = graph_pack(relay_prog,
                                    env.BATCH,
                                    env.BLOCK_OUT,
                                    env.WGT_WIDTH,
                                    start_name=start_pack,
                                    stop_name=stop_pack)

        print(f"Finishing packing graph")

        # Compile Relay program with AlterOpLayout disabled
        with relay.build_config(opt_level=3, disabled_pass={"AlterOpLayout"}):
            if target.device_name != "vta":
                graph, lib, params = relay.build(relay_prog,
                                                 target=target,
                                                 params=params,
                                                 target_host=env.target_host)
            else:
                with vta.build_config():
                    graph, lib, params = relay.build(
                        relay_prog,
                        target=target,
                        params=params,
                        target_host=env.target_host)

        # Measure Relay build time
        build_time = time.time() - build_start
        print(model + " inference graph built in {0:.2f}s!".format(build_time))

        # Send the inference library over to the remote RPC server
        temp = util.tempdir()
        lib.save(temp.relpath("graphlib.o"))
        remote.upload(temp.relpath("graphlib.o"))
        lib = remote.load_module("graphlib.o")

        # Graph runtime
        m = graph_runtime.create(graph, lib, ctx)
    # Set the network parameters and inputs
    data = np.random.uniform(size=data_shape).astype(dtype)

    m.set_input(**params)
    m.set_input('data', tvm.nd.array(data.astype(dtype)))

    # Perform inference and gather execution statistics
    # More on: https://docs.tvm.ai/api/python/module.html#tvm.module.Module.time_evaluator
    num = 1  # number of times we run module for a single measurement
    rep = 1  # number of measurements (we derive std dev from this)
    timer = m.module.time_evaluator("run", ctx, number=num, repeat=rep)
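    # A hedged note (not in the original): for steadier measurements one would
    # typically raise these counts, e.g.
    #   timer = m.module.time_evaluator("run", ctx, number=10, repeat=3)
    # which averages 10 runs per measurement and repeats the measurement 3 times.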

    if env.TARGET in ["sim", "tsim"]:
        simulator.clear_stats()
        timer()
        sim_stats = simulator.stats()
        print("\nExecution statistics:")
        for k, v in sim_stats.items():
            # Since we execute the workload many times, we need to normalize stats
            # Note that there is always one warm up run
            # Therefore we divide the overall stats by (num * rep + 1)
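            # (With num = rep = 1 as above, the divisor is 1 * 1 + 1 = 2.)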
            print("\t{:<16}: {:>16}".format(k, v // (num * rep + 1)))
    else:
        tcost = timer()
        std = np.std(tcost.results) * 1000
        mean = tcost.mean * 1000
        print("\nPerformed inference in %.2fms (std = %.2f) for %d samples" %
              (mean, std, env.BATCH))
        print("Average per sample inference time: %.2fms" % (mean / env.BATCH))
Example No. 29
0
                                        round_for_shift=True):
                mod = relay.quantize.quantize(mod, params=params)
            # Perform graph packing and constant folding for VTA target
            mod = graph_pack(mod["main"],
                             env.BATCH,
                             env.BLOCK_OUT,
                             env.WGT_WIDTH,
                             start_name=pack_dict[MODEL_NAME][0],
                             stop_name=pack_dict[MODEL_NAME][1],
                             start_name_idx=pack_dict[MODEL_NAME][2],
                             stop_name_idx=pack_dict[MODEL_NAME][3])
    else:
        mod = mod["main"]

    # Compile Relay program with AlterOpLayout disabled
    with vta.build_config(disabled_pass={"AlterOpLayout"}):
        graph, lib, params = relay.build(mod,
                                         target=target,
                                         params=params,
                                         target_host=env.target_host)

    # Measure Relay build time
    build_time = time.time() - build_start
    print(MODEL_NAME +
          " inference graph built in {0:.2f}s!".format(build_time))

    # Send the inference library over to the remote RPC server
    temp = util.tempdir()
    lib.save(temp.relpath("graphlib.o"))
    remote.upload(temp.relpath("graphlib.o"))
    lib = remote.load_module("graphlib.o")
Example No. 30
0
    def compile_model(self):
        if device == 'vta':
            self.remote = rpc.connect(self.pynq_addr, 9091)
            vta.reconfig_runtime(self.remote)
            vta.program_fpga(self.remote, bitstream=None)
        else:
            self.remote = rpc.LocalSession()

        self.ctx = (self.remote.ext_dev(0)
                    if device == 'vta' else self.remote.cpu(0))

        # Load pre-configured AutoTVM schedules
        with autotvm.tophub.context(target):

            # Populate the shape and data type dictionary for ResNet input
            dtype_dict = {'data': 'float32'}
            shape_dict = {'data': (env.BATCH, 3, 224, 224)}

            gluon_model = (
                vision.resnet18_v1(pretrained=True, ctx=ctx).features
                if args.nonsplit
                else splitnet.resnet18_v1_split(self.id + 1))

            # Measure build start time
            build_start = time.time()

            # Start front end compilation
            mod, params = relay.frontend.from_mxnet(gluon_model, shape_dict)

            # Update shape and type dictionary
            shape_dict.update({k: v.shape for k, v in params.items()})
            dtype_dict.update({k: str(v.dtype) for k, v in params.items()})

            # Perform quantization in Relay
            with relay.quantize.qconfig(global_scale=8.0,
                                        skip_conv_layers=[0]):
                relay_prog = relay.quantize.quantize(mod['main'],
                                                     params=params)

            # Perform graph packing and constant folding for VTA target
            if target.device_name == 'vta':
                assert env.BLOCK_IN == env.BLOCK_OUT
                relay_prog = graph_pack(relay_prog,
                                        env.BATCH,
                                        env.BLOCK_OUT,
                                        env.WGT_WIDTH,
                                        start_name=start_pack,
                                        stop_name=stop_pack)

            # Compile Relay program with AlterOpLayout disabled
            with relay.build_config(opt_level=3,
                                    disabled_pass={'AlterOpLayout'}):
                if target.device_name != 'vta':
                    graph, lib, params = relay.build(
                        relay_prog,
                        target=target,
                        params=params,
                        target_host=env.target_host)
                else:
                    with vta.build_config():
                        graph, lib, params = relay.build(
                            relay_prog,
                            target=target,
                            params=params,
                            target_host=env.target_host)

            self.params = params

            # Measure Relay build time
            build_time = time.time() - build_start
            print(f'inference graph for thread {self.id} built in '
                  f'{build_time:.4f}s!')

            # Send the inference library over to the remote RPC server
            temp = util.tempdir()
            lib.save(temp.relpath('graphlib.o'))
            self.remote.upload(temp.relpath('graphlib.o'))
            lib = self.remote.load_module('graphlib.o')

            # Graph runtime
            self.m = graph_runtime.create(graph, lib, self.ctx)
Example No. 31
0
        def check_alu(tvm_op, np_op=None, use_imm=False, test_name=None):
            """Test ALU"""
            m = 8
            n = 8
            imm = np.random.randint(1, 5)
            # compute
            a = te.placeholder((m, n, env.BATCH, env.BLOCK_OUT), name="a", dtype=env.acc_dtype)
            a_buf = te.compute(
                (m, n, env.BATCH, env.BLOCK_OUT), lambda *i: a(*i), "a_buf"
            )  # DRAM->SRAM
            if use_imm:
                res_buf = te.compute(
                    (m, n, env.BATCH, env.BLOCK_OUT), lambda *i: tvm_op(a_buf(*i), imm), "res_buf"
                )  # compute
            else:
                b = te.placeholder((m, n, env.BATCH, env.BLOCK_OUT), name="b", dtype=env.acc_dtype)
                b_buf = te.compute(
                    (m, n, env.BATCH, env.BLOCK_OUT), lambda *i: b(*i), "b_buf"
                )  # DRAM->SRAM
                res_buf = te.compute(
                    (m, n, env.BATCH, env.BLOCK_OUT),
                    lambda *i: tvm_op(a_buf(*i), b_buf(*i)),
                    "res_buf",
                )  # compute
            res = te.compute(
                (m, n, env.BATCH, env.BLOCK_OUT),
                lambda *i: res_buf(*i).astype(env.inp_dtype),
                "res",
            )  # SRAM->DRAM
            # schedule
            s = te.create_schedule(res.op)
            s[a_buf].set_scope(env.acc_scope)  # SRAM
            s[a_buf].pragma(a_buf.op.axis[0], env.dma_copy)  # DRAM->SRAM
            s[res_buf].set_scope(env.acc_scope)  # SRAM
            s[res_buf].pragma(res_buf.op.axis[0], env.alu)  # compute
            s[res].pragma(res.op.axis[0], env.dma_copy)  # SRAM->DRAM
            if not use_imm:
                s[b_buf].set_scope(env.acc_scope)  # SRAM
                s[b_buf].pragma(b_buf.op.axis[0], env.dma_copy)  # DRAM->SRAM

            if not remote:
                return

            # build
            with vta.build_config():
                if use_imm:
                    mod = vta.build(s, [a, res], "ext_dev", env.target_host)
                else:
                    mod = vta.build(s, [a, b, res], "ext_dev", env.target_host)
            temp = utils.tempdir()
            mod.save(temp.relpath("load_act.o"))
            remote.upload(temp.relpath("load_act.o"))
            f = remote.load_module("load_act.o")
            # verify
            dev = remote.ext_dev(0)
            a_np = np.random.randint(-16, 16, size=(m, n, env.BATCH, env.BLOCK_OUT)).astype(a.dtype)
            if use_imm:
                res_np = np_op(a_np, imm) if np_op else tvm_op(a_np, imm)
            else:
                b_np = np.random.randint(-16, 16, size=(m, n, env.BATCH, env.BLOCK_OUT)).astype(
                    b.dtype
                )
                res_np = np_op(a_np, b_np) if np_op else tvm_op(a_np, b_np)
            res_np = res_np.astype(res.dtype)
            a_nd = tvm.nd.array(a_np, dev)
            res_nd = tvm.nd.array(np.zeros((m, n, env.BATCH, env.BLOCK_OUT)).astype(res.dtype), dev)

            if env.TARGET in ["sim", "tsim"]:
                simulator.clear_stats()

            if use_imm:
                f(a_nd, res_nd)
            else:
                b_nd = tvm.nd.array(b_np, dev)
                f(a_nd, b_nd, res_nd)

            np.testing.assert_equal(res_np, res_nd.numpy())

            if env.TARGET in ["sim", "tsim"]:
                sim_stats = simulator.stats()
                print("ALU {} execution statistics:".format(test_name))
                for k, v in sim_stats.items():
                    print("\t{:<16}: {:>16}".format(k, v))