def _lower(mod, target, params): """ Helper to lower VTA properly. """ # pylint: disable=import-outside-toplevel from tvm import relay from tvm.relay.backend import graph_runtime_codegen if hasattr(target, 'device_name') and target.device_name == "vta": import vta with vta.build_config(opt_level=3, disabled_pass={"AlterOpLayout"}): mod, _ = relay.optimize(mod, target, params) grc = graph_runtime_codegen.GraphRuntimeCodegen(None, target) grc.codegen(mod["main"]) return # default case # Try graph codegen first to extract autotvm tasks. # If failed to compile, then fallback to use VM compiler. # TODO: Currently VM compiler is likely to stack overflow for large models. try: opt_mod, _ = relay.optimize(mod, target, params) grc = graph_runtime_codegen.GraphRuntimeCodegen(None, target) grc.codegen(opt_mod["main"]) except tvm.TVMError: compiler = relay.vm.VMCompiler() if params: compiler.set_params(params) compiler.lower(mod, target=target)
def _lower(mod, target, params): """Helper to lower VTA properly.""" # pylint: disable=import-outside-toplevel from tvm import relay from tvm.relay.backend import graph_executor_codegen if hasattr(target, "device_name") and target.device_name == "vta": import vta with vta.build_config(opt_level=3, disabled_pass={"AlterOpLayout"}): mod, _ = relay.optimize(mod, target, params) grc = graph_executor_codegen.GraphExecutorCodegen(None, target) grc.codegen(mod["main"]) return # default case # Try graph codegen first to extract autotvm tasks. # If failed to compile, then fallback to use VM compiler. # TODO: Currently VM compiler is likely to stack overflow for large models. try: # TODO(jwfromm) Remove this once AlterOpLayout bug that mutates # source module is fixed. Until then, create a clone. mod_clone = deepcopy(mod) opt_mod, _ = relay.optimize(mod_clone, target, params) grc = graph_executor_codegen.GraphExecutorCodegen(None, target) grc.codegen(opt_mod["main"]) except tvm.TVMError as e: print("Get errors with GraphExecutorCodegen for task extraction. " "Fallback to VMCompiler. Error details:\n%s" % str(e)) mod_clone = deepcopy(mod) compiler = relay.vm.VMCompiler() if params: compiler.set_params(params) compiler.lower(mod_clone, target=target)
def _run(env, remote):
    m = 8
    n = 10
    # compute
    a = tvm.placeholder(
        (m, n, env.BATCH, env.BLOCK_OUT), name="a", dtype=env.acc_dtype)
    a_buf = tvm.compute(
        (m, n, env.BATCH, env.BLOCK_OUT), lambda *i: a(*i), "a_buf")  # DRAM->SRAM
    max_buf = tvm.compute(
        (m, n, env.BATCH, env.BLOCK_OUT),
        lambda *i: tvm.max(a_buf(*i), 0), "res_buf")  # relu
    min_buf = tvm.compute(
        (m, n, env.BATCH, env.BLOCK_OUT),
        lambda *i: tvm.min(max_buf(*i), (1 << (env.INP_WIDTH - 1)) - 1),
        "max_buf")  # relu
    res = tvm.compute(
        (m, n, env.BATCH, env.BLOCK_OUT),
        lambda *i: min_buf(*i).astype(env.inp_dtype), "min_buf")  # SRAM->DRAM

    # schedule
    s = tvm.create_schedule(res.op)
    s[a_buf].set_scope(env.acc_scope)  # SRAM
    s[a_buf].pragma(a_buf.op.axis[0], env.dma_copy)  # DRAM->SRAM
    s[max_buf].set_scope(env.acc_scope)  # SRAM
    s[min_buf].set_scope(env.acc_scope)  # SRAM
    s[max_buf].pragma(max_buf.op.axis[0], env.alu)  # compute
    s[min_buf].pragma(min_buf.op.axis[0], env.alu)  # compute
    s[res].pragma(res.op.axis[0], env.dma_copy)  # SRAM->DRAM

    # build
    with vta.build_config():
        mod = vta.build(s, [a, res], "ext_dev", env.target_host)
    if not remote:
        return
    temp = util.tempdir()
    mod.save(temp.relpath("load_act.o"))
    remote.upload(temp.relpath("load_act.o"))
    f = remote.load_module("load_act.o")

    # verify
    ctx = remote.ext_dev(0)
    a_np = np.random.randint(
        -256, 256, size=(m, n, env.BATCH, env.BLOCK_OUT)).astype(a.dtype)
    res_np = np.clip(a_np, 0, (1 << (env.INP_WIDTH - 1)) - 1).astype(res.dtype)
    a_nd = tvm.nd.array(a_np, ctx)
    res_nd = tvm.nd.array(
        np.zeros((m, n, env.BATCH, env.BLOCK_OUT)).astype(res.dtype), ctx)

    if env.TARGET == "tsim":
        simulator.tsim_init("libvta_hw")

    f(a_nd, res_nd)
    np.testing.assert_equal(res_np, res_nd.asnumpy())

    if env.TARGET == "tsim":
        print("Relu test took {} clock cycles".format(simulator.tsim_cycles()))

def _run(env, remote):
    # declare
    n = 21
    m = 20
    pad_before = [0, 1, 0, 0]
    pad_after = [1, 3, 0, 0]
    x = tvm.placeholder(
        (n, m, env.BATCH, env.BLOCK_OUT), name="x", dtype=env.acc_dtype)
    x_buf = topi.nn.pad(x, pad_before, pad_after, name="y")
    # insert no-op that won't be optimized away
    y_buf = tvm.compute(
        (n + pad_before[0] + pad_after[0],
         m + pad_before[1] + pad_after[1],
         env.BATCH,
         env.BLOCK_OUT),
        lambda *i: x_buf(*i) >> 0, "y_buf")
    y = tvm.compute(
        (n + pad_before[0] + pad_after[0],
         m + pad_before[1] + pad_after[1],
         env.BATCH,
         env.BLOCK_OUT),
        lambda *i: y_buf(*i).astype(env.inp_dtype), "y")

    # schedule
    s = tvm.create_schedule(y.op)
    s[x_buf].set_scope(env.acc_scope)
    s[x_buf].pragma(x_buf.op.axis[0], env.dma_copy)
    s[y_buf].set_scope(env.acc_scope)
    s[y_buf].pragma(y_buf.op.axis[0], env.alu)
    s[y].pragma(y.op.axis[0], env.dma_copy)

    # build
    with vta.build_config():
        mod = vta.build(s, [x, y], "ext_dev", env.target_host)
    if not remote:
        return
    temp = util.tempdir()
    mod.save(temp.relpath("padded_load.o"))
    remote.upload(temp.relpath("padded_load.o"))
    f = remote.load_module("padded_load.o")

    # verify
    ctx = remote.ext_dev(0)
    x_np = np.random.randint(
        1, 2, size=(n, m, env.BATCH, env.BLOCK_OUT)).astype(x.dtype)
    y_np = np.zeros(
        (n + pad_before[0] + pad_after[0],
         m + pad_before[1] + pad_after[1],
         env.BATCH,
         env.BLOCK_OUT)).astype(y.dtype)
    y_np[pad_before[0]:pad_before[0] + n,
         pad_before[1]:pad_before[1] + m, :] = x_np
    x_nd = tvm.nd.array(x_np, ctx)
    y_nd = tvm.nd.empty(y_np.shape, ctx=ctx, dtype=y_np.dtype)

    if env.TARGET in ["sim", "tsim"]:
        simulator.clear_stats()

    f(x_nd, y_nd)
    np.testing.assert_equal(y_np, y_nd.asnumpy())

    if env.TARGET in ["sim", "tsim"]:
        sim_stats = simulator.stats()
        print("Padded load execution statistics:")
        for k, v in sim_stats.items():
            print("\t{:<16}: {:>16}".format(k, v))

def _run(env, remote):
    m = 8
    n = 10
    # compute
    a = te.placeholder((m, n, env.BATCH, env.BLOCK_OUT), name="a", dtype=env.acc_dtype)
    a_buf = te.compute(
        (m, n, env.BATCH, env.BLOCK_OUT), lambda *i: a(*i), "a_buf"
    )  # DRAM->SRAM
    max_buf = te.compute(
        (m, n, env.BATCH, env.BLOCK_OUT), lambda *i: tvm.te.max(a_buf(*i), 0), "res_buf"
    )  # relu
    min_buf = te.compute(
        (m, n, env.BATCH, env.BLOCK_OUT),
        lambda *i: tvm.te.min(max_buf(*i), (1 << (env.INP_WIDTH - 1)) - 1),
        "max_buf",
    )  # relu
    res = te.compute(
        (m, n, env.BATCH, env.BLOCK_OUT),
        lambda *i: min_buf(*i).astype(env.inp_dtype),
        "min_buf",
    )  # SRAM->DRAM

    # schedule
    s = te.create_schedule(res.op)
    s[a_buf].set_scope(env.acc_scope)  # SRAM
    s[a_buf].pragma(a_buf.op.axis[0], env.dma_copy)  # DRAM->SRAM
    s[max_buf].set_scope(env.acc_scope)  # SRAM
    s[min_buf].set_scope(env.acc_scope)  # SRAM
    s[max_buf].pragma(max_buf.op.axis[0], env.alu)  # compute
    s[min_buf].pragma(min_buf.op.axis[0], env.alu)  # compute
    s[res].pragma(res.op.axis[0], env.dma_copy)  # SRAM->DRAM

    # build
    with vta.build_config():
        mod = vta.build(s, [a, res], "ext_dev", env.target_host)
    if not remote:
        return
    temp = utils.tempdir()
    mod.save(temp.relpath("load_act.o"))
    remote.upload(temp.relpath("load_act.o"))
    f = remote.load_module("load_act.o")

    # verify
    dev = remote.ext_dev(0)
    a_np = np.random.randint(-256, 256, size=(m, n, env.BATCH, env.BLOCK_OUT)).astype(a.dtype)
    res_np = np.clip(a_np, 0, (1 << (env.INP_WIDTH - 1)) - 1).astype(res.dtype)
    a_nd = tvm.nd.array(a_np, dev)
    res_nd = tvm.nd.array(np.zeros((m, n, env.BATCH, env.BLOCK_OUT)).astype(res.dtype), dev)

    if env.TARGET in ["sim", "tsim"]:
        simulator.clear_stats()

    f(a_nd, res_nd)
    np.testing.assert_equal(res_np, res_nd.numpy())

    if env.TARGET in ["sim", "tsim"]:
        sim_stats = simulator.stats()
        print("Relu execution statistics:")
        for k, v in sim_stats.items():
            print("\t{:<16}: {:>16}".format(k, v))

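# A minimal driver for the `_run(env, remote)` helpers above, following the pattern
# used by VTA's unit tests: `vta.testing.run` supplies the VTA environment and an RPC
# session for the configured target (sim/tsim/hardware). Sketch only; it assumes the
# usual imports (tvm, numpy as np, vta, vta.testing) are already in scope.
def test_relu():
    vta.testing.run(_run)


if __name__ == "__main__":
    test_relu()
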
def conv_normal(print_ir):
    print("----- CONV2D End-to-End Test-------")
    with vta.build_config():
        s = vta.top.schedule_packed_conv2d([res])
        if print_ir:
            print(vta.lower(s, [data, kernel, bias, res], simple_mode=True))
    cost = verify(s, True)
    gops = (num_ops / cost.mean) / float(10 ** 9)
    print("\tTime cost = %g sec/op, %g GOPS" % (cost.mean, gops))

def build_model(model_name, remote, target, ctx, vta_env):
    """Build the inference graph runtime."""
    # Load pre-configured AutoTVM schedules.
    with autotvm.tophub.context(target):
        # Populate the shape and data type dictionary for ResNet input.
        dtype_dict = {'data': 'float32'}
        shape_dict = {'data': (vta_env.BATCH, 3, 224, 224)}

        # Get off-the-shelf gluon model and convert to Relay.
        gluon_model = vision.get_model(model_name, pretrained=True)

        # Start frontend compilation.
        mod, params = relay.frontend.from_mxnet(gluon_model, shape_dict)

        # Update shape and type dictionary.
        shape_dict.update({k: v.shape for k, v in params.items()})
        dtype_dict.update({k: str(v.dtype) for k, v in params.items()})

        # Perform quantization in Relay.
        with relay.quantize.qconfig(global_scale=8.0, skip_conv_layers=[0]):
            relay_prog = relay.quantize.quantize(mod['main'], params=params)

        # Perform graph packing and constant folding for VTA target.
        if target.device_name == 'vta':
            assert vta_env.BLOCK_IN == vta_env.BLOCK_OUT
            relay_prog = graph_pack(relay_prog,
                                    vta_env.BATCH,
                                    vta_env.BLOCK_OUT,
                                    vta_env.WGT_WIDTH,
                                    start_name=START_PACK,
                                    stop_name=STOP_PACK)

        # Compile Relay program with AlterOpLayout disabled.
        with relay.build_config(opt_level=3, disabled_pass={'AlterOpLayout'}):
            if target.device_name == 'vta':
                with vta.build_config():
                    graph, lib, params = relay.build(
                        relay_prog,
                        target=vta_env.target,
                        params=params,
                        target_host=vta_env.target_host)
            else:
                graph, lib, params = relay.build(
                    relay_prog,
                    target=target,
                    params=params,
                    target_host=vta_env.target_host)

        # Send the inference library over to the remote RPC server
        temp = util.tempdir()
        lib.save(temp.relpath('graphlib.o'))
        remote.upload(temp.relpath('graphlib.o'))
        lib = remote.load_module('graphlib.o')

        graph_module = graph_runtime.create(graph, lib, ctx)
        graph_module.set_input(**params)
        return graph_module

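# Sketch of how the module returned by `build_model` might be exercised. Assumptions:
# numpy/tvm are imported as in the surrounding snippets, the model takes an
# ImageNet-sized 'data' input, and 'resnet18_v1' is just an illustrative model name.
m = build_model('resnet18_v1', remote, target, ctx, vta_env)
image = np.random.uniform(size=(vta_env.BATCH, 3, 224, 224)).astype('float32')
m.set_input('data', tvm.nd.array(image))
m.run()
top1 = m.get_output(0).asnumpy().argmax()
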
def generate_graph(graph_fn, params_fn, device="vta"):
    # Measure build start time
    build_start = time.time()

    # Derive the TVM target
    target = tvm.target.create("llvm -device={}".format(device))

    # Derive the LLVM compiler flags
    # When targeting the Pynq, cross-compile to ARMv7 ISA
    if env.TARGET == "sim":
        target_host = "llvm"
    elif env.TARGET == "pynq":
        target_host = "llvm -mtriple=armv7-none-linux-gnueabihf -mcpu=cortex-a9 -mattr=+neon"

    # Load the ResNet-18 graph and parameters
    sym = nnvm.graph.load_json(open(graph_fn).read())
    params = nnvm.compiler.load_param_dict(open(params_fn, 'rb').read())

    # Populate the shape and data type dictionary
    shape_dict = {"data": (1, 3, 224, 224)}
    dtype_dict = {"data": 'float32'}
    shape_dict.update({k: v.shape for k, v in params.items()})
    dtype_dict.update({k: str(v.dtype) for k, v in params.items()})

    # Apply NNVM graph optimization passes
    sym = vta.graph.clean_cast(sym)
    sym = vta.graph.clean_conv_fuse(sym)
    if target.device_name == "vta":
        assert env.BLOCK_IN == env.BLOCK_OUT
        sym = vta.graph.pack(sym, shape_dict, env.BATCH, env.BLOCK_OUT)

    # Compile NNVM graph
    with nnvm.compiler.build_config(opt_level=3):
        if target.device_name != "vta":
            graph, lib, params = nnvm.compiler.build(
                sym, target, shape_dict, dtype_dict,
                params=params, target_host=target_host)
        else:
            with vta.build_config():
                graph, lib, params = nnvm.compiler.build(
                    sym, target, shape_dict, dtype_dict,
                    params=params, target_host=target_host)

    # Save the compiled inference graph library
    assert tvm.module.enabled("rpc")
    temp = util.tempdir()
    lib.save(temp.relpath("graphlib.o"))

    # Send the inference library over to the remote RPC server
    remote.upload(temp.relpath("graphlib.o"))
    lib = remote.load_module("graphlib.o")

    # Measure build time
    build_time = time.time() - build_start
    print("ResNet-18 inference graph built in {0:.2f}s!".format(build_time))

    return graph, lib, params

def _run(env, remote):
    n = 6
    x = tvm.placeholder(
        (n, n, env.BATCH, env.BLOCK_OUT), name="x", dtype=env.acc_dtype)
    x_buf = tvm.compute(
        (n, n, env.BATCH, env.BLOCK_OUT), lambda *i: x(*i), "x_buf")
    # insert no-op that won't be optimized away
    y_buf = tvm.compute(
        (n, n, env.BATCH, env.BLOCK_OUT), lambda *i: x_buf(*i) >> 0, "y_buf")
    y = tvm.compute(
        (n, n, env.BATCH, env.BLOCK_OUT),
        lambda *i: y_buf(*i).astype(env.inp_dtype), "y")

    # schedule
    s = tvm.create_schedule(y.op)
    s[x_buf].set_scope(env.acc_scope)
    s[x_buf].pragma(x_buf.op.axis[0], env.dma_copy)
    s[y_buf].set_scope(env.acc_scope)
    s[y_buf].pragma(y_buf.op.axis[0], env.alu)
    s[y].pragma(y.op.axis[0], env.dma_copy)

    # verification
    with vta.build_config():
        m = vta.build(s, [x, y], "ext_dev", env.target_host)
    if not remote:
        return
    temp = util.tempdir()
    m.save(temp.relpath("load_act.o"))
    remote.upload(temp.relpath("load_act.o"))
    f = remote.load_module("load_act.o")

    # verify
    ctx = remote.ext_dev(0)
    x_np = np.random.randint(
        1, 10, size=(n, n, env.BATCH, env.BLOCK_OUT)).astype(x.dtype)
    y_np = x_np.astype(y.dtype)
    x_nd = tvm.nd.array(x_np, ctx)
    y_nd = tvm.nd.empty(y_np.shape, ctx=ctx, dtype=y_np.dtype)

    if env.TARGET in ["sim", "tsim"]:
        simulator.clear_stats()

    f(x_nd, y_nd)
    np.testing.assert_equal(y_np, y_nd.asnumpy())

    if env.TARGET in ["sim", "tsim"]:
        sim_stats = simulator.stats()
        print("Save load execution statistics:")
        for k, v in sim_stats.items():
            print("\t{:<16}: {:>16}".format(k, v))

def _build(func, target, target_host, params):
    """Helper to build VTA properly."""
    from tvm import relay

    if hasattr(target, 'device_name') and target.device_name == "vta":
        with relay.build_config(opt_level=3, disabled_pass={"AlterOpLayout"}):
            import vta
            with vta.build_config():
                return relay.build(func, target, target_host, params)
    # default case
    return relay.build(func, target, target_host, params)

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--network-arch', type=argparse.FileType('r'),
                        default='network.arch')
    parser.add_argument('--af-params', default='w2l_params_af.bin')
    parser.add_argument('--nfeat', type=int, default=40)
    parser.add_argument('--nlabel', type=int, default=30)
    # input len of librispeech: mean=741, median=577, max=3494, std=515. heavily tailed.
    parser.add_argument('--max-inp-len', type=int, default=741)
    parser.add_argument('--no-params', action='store_true')
    parser.add_argument('--device', choices=('vta', 'vtacpu'), default='vta')
    args = parser.parse_args()

    net = make_net(args.network_arch, args.nfeat, args.nlabel, args.max_inp_len)
    params = make_params(net, args.af_params) if not args.no_params else None

    with relay.build_config(opt_level=3):
        if args.device == 'vta':
            net = vta.graph.clean_cast(net)
            net = vta.graph.clean_conv_fuse(net)
            # net = vta.graph.pack(net, ...)  # ???
            # Enter vta.build_config() as a proper context manager instead of
            # calling __enter__/__exit__ on separate instances.
            with vta.build_config():
                graph, lib, params = relay.build(
                    net, params=params,
                    target=f'llvm -device={args.device}',
                    target_host='llvm')
        else:
            graph, lib, params = relay.build(
                net, params=params,
                target=f'llvm -device={args.device}',
                target_host='llvm')

    lib.export_library('wav2letter2.so')
    with open('wav2letter2.json', 'w') as f_graph_json:
        f_graph_json.write(graph)
    with open('wav2letter2.params', 'wb') as f_params:
        f_params.write(nnvm.compiler.save_param_dict(params))

def verify(s, name=None):
    # Build with the CSE pass disabled as otherwise it would complicate the test
    with vta.build_config(disabled_pass={"tir.CommonSubexprElimTIR"}):
        mod = vta.build(s, [x, w, y], tvm.target.Target("ext_dev", host=env.target_host))
    temp = utils.tempdir()
    mod.save(temp.relpath("gemm.o"))
    remote.upload(temp.relpath("gemm.o"))
    f = remote.load_module("gemm.o")

    # verify
    dev = remote.ext_dev(0)
    x_np = np.random.randint(-128, 128, size=(o, n, env.BATCH, env.BLOCK_IN)).astype(x.dtype)
    w_np = np.random.randint(-128, 128, size=(m, n, env.BLOCK_OUT, env.BLOCK_IN)).astype(w.dtype)
    y_np = np.zeros((o, m, env.BATCH, env.BLOCK_OUT)).astype(y.dtype)
    x_nd = tvm.nd.array(x_np, dev)
    w_nd = tvm.nd.array(w_np, dev)
    y_nd = tvm.nd.array(y_np, dev)
    y_np = y_np.astype(env.acc_dtype)
    for b in range(o):
        for i in range(m):
            for j in range(n):
                y_np[b, i, :] += np.dot(
                    x_np[b, j, :].astype(env.acc_dtype),
                    w_np[i, j].T.astype(env.acc_dtype))
    y_np = np.right_shift(y_np, 8)
    y_np = np.clip(y_np, 0, (1 << (env.INP_WIDTH - 1)) - 1).astype(y.dtype)

    if env.TARGET in ["sim", "tsim"]:
        simulator.clear_stats()

    f(x_nd, w_nd, y_nd)

    np.testing.assert_equal(y_np, y_nd.numpy())

    if env.TARGET in ["sim", "tsim"]:
        sim_stats = simulator.stats()
        print("GEMM schedule:{} execution statistics:".format(name))
        for k, v in sim_stats.items():
            print("\t{:<16}: {:>16}".format(k, v))

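# `verify` above is a closure: it assumes `x`, `w`, `y`, the sizes `o`, `n`, `m`, plus
# `env` and `remote` from the enclosing GEMM test. A condensed sketch of those
# declarations, modeled on VTA's GEMM unit test; the sizes, stage names, and the omitted
# buffer/schedule stages are assumptions, kept consistent with the numpy reference in
# verify (matmul, >> 8, clip, cast to env.inp_dtype):
o, n, m = 4, 1, 4
x = te.placeholder((o, n, env.BATCH, env.BLOCK_IN), name="x", dtype=env.inp_dtype)
w = te.placeholder((m, n, env.BLOCK_OUT, env.BLOCK_IN), name="w", dtype=env.wgt_dtype)
ko = te.reduce_axis((0, n), name="ko")
ki = te.reduce_axis((0, env.BLOCK_IN), name="ki")
y_gem = te.compute(
    (o, m, env.BATCH, env.BLOCK_OUT),
    lambda bo, co, bi, ci: te.sum(
        x[bo, ko, bi, ki].astype(env.acc_dtype) * w[co, ko, ci, ki].astype(env.acc_dtype),
        axis=[ko, ki],
    ),
    name="y_gem",
)
y_shf = te.compute((o, m, env.BATCH, env.BLOCK_OUT), lambda *i: y_gem(*i) >> 8, name="y_shf")
y_max = te.compute((o, m, env.BATCH, env.BLOCK_OUT), lambda *i: tvm.te.max(y_shf(*i), 0), "y_max")
y_min = te.compute(
    (o, m, env.BATCH, env.BLOCK_OUT),
    lambda *i: tvm.te.min(y_max(*i), (1 << (env.INP_WIDTH - 1)) - 1),
    "y_min",
)
y = te.compute(
    (o, m, env.BATCH, env.BLOCK_OUT), lambda *i: y_min(*i).astype(env.inp_dtype), name="y"
)
# A schedule `s` over these stages (with copies into env.inp_scope/wgt_scope/acc_scope and
# env.gemm tensorization, not shown here) is then passed to verify(s, name="...").
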
def _lower(mod, target, params): """ Helper to lower VTA properly. """ # pylint: disable=import-outside-toplevel from tvm import relay from tvm.relay.backend import graph_runtime_codegen if hasattr(target, 'device_name') and target.device_name == "vta": with relay.build_config(opt_level=3, disabled_pass={"AlterOpLayout"}): import vta with vta.build_config(): mod, _ = relay.optimize(mod, target, params) grc = graph_runtime_codegen.GraphRuntimeCodegen(None, target) grc.codegen(mod["main"]) # default case compiler = relay.vm.VMCompiler() if params: compiler.set_params(params) compiler.lower(mod, target=target)
def _lower(func, target, params):
    """Helper to lower VTA properly."""
    from tvm import relay
    from tvm.relay.backend import graph_runtime_codegen

    if hasattr(target, 'device_name') and target.device_name == "vta":
        with relay.build_config(opt_level=3, disabled_pass={"AlterOpLayout"}):
            import vta
            with vta.build_config():
                mod, _ = relay.optimize(func, target, params)
                grc = graph_runtime_codegen.GraphRuntimeCodegen(None, target)
                return grc.codegen(mod["main"])
    # default case
    mod, _ = relay.optimize(func, target, params)
    grc = graph_runtime_codegen.GraphRuntimeCodegen(None, target)
    return grc.codegen(mod["main"])

def compile_mxnet_gulon_resnet(_env, _model):
    """Compile Model"""
    # Generate tvm IR from mxnet gluon model
    # Populate the shape and data type dictionary for ImageNet classifier input
    dtype_dict = {"data": 'float32'}
    shape_dict = {"data": (_env.BATCH, 3, 224, 224)}

    # Get off the shelf gluon model, and convert to relay
    gluon_model = vision.get_model(_model, pretrained=True)

    # Start front end compilation
    mod, params = relay.frontend.from_mxnet(gluon_model, shape_dict)
    mod = merge_transform_to_mxnet_model(mod)

    # Update shape and type dictionary
    shape_dict.update({k: v.shape for k, v in params.items()})
    dtype_dict.update({k: str(v.dtype) for k, v in params.items()})

    # Load pre-configured AutoTVM schedules
    with autotvm.tophub.context(_env.target):
        # Perform quantization in Relay
        # Note: We set opt_level to 3 in order to fold batch norm
        with relay.build_config(opt_level=3):
            with relay.quantize.qconfig(global_scale=8.0, skip_conv_layers=[0]):
                mod = relay.quantize.quantize(mod, params=params)

        # Perform graph packing and constant folding for VTA target
        relay_prog = graph_pack(mod["main"],
                                _env.BATCH,
                                _env.BLOCK_IN,
                                _env.WGT_WIDTH,
                                start_name=PACK_DICT[_model][0],
                                stop_name=PACK_DICT[_model][1])

        # Compile Relay program with AlterOpLayout disabled
        with relay.build_config(opt_level=3, disabled_pass={"AlterOpLayout"}):
            with vta.build_config(debug_flag=0):
                graph, lib, params = relay.build(relay_prog,
                                                 target=_env.target,
                                                 params=params,
                                                 target_host=_env.target_host)

    return graph, lib, params

def extract_tasks(sym, params, target, target_host):
    # Populate the shape and data type dictionary
    shape_dict = {"data": (1, 3, 224, 224)}
    dtype_dict = {"data": 'float32'}
    shape_dict.update({k: v.shape for k, v in params.items()})
    dtype_dict.update({k: str(v.dtype) for k, v in params.items()})

    # Apply NNVM graph optimization passes
    sym = vta.graph.clean_cast(sym)
    sym = vta.graph.clean_conv_fuse(sym)
    assert env.BLOCK_IN == env.BLOCK_OUT
    sym = vta.graph.pack(sym, shape_dict, env.BATCH, env.BLOCK_OUT)

    with vta.build_config():
        tasks = autotvm.task.extract_from_graph(graph=sym,
                                                shape=shape_dict,
                                                dtype=dtype_dict,
                                                target=target,
                                                params=params,
                                                symbols=(nnvm.sym.conv2d,),
                                                target_host=target_host)
    return tasks

def _lower(mod, target, params, opt_level=3):
    """Helper to lower VTA properly."""
    # pylint: disable=import-outside-toplevel
    from tvm import relay
    from tvm.relay.backend import graph_executor_codegen

    if hasattr(target, "device_name") and target.device_name == "vta":
        import vta
        with vta.build_config(opt_level=opt_level, disabled_pass={"AlterOpLayout"}):
            mod, _ = relay.optimize(mod, target, params)
            grc = graph_executor_codegen.GraphExecutorCodegen(None, target)
            grc.codegen(mod, mod["main"])
            return

    # Alter op layout code has been written expecting that tuning is applied
    # without it, so we disable AlterOpLayout to maintain that behavior.
    with tvm.transform.PassContext(opt_level=opt_level, disabled_pass={"AlterOpLayout"}):
        compiler = relay.vm.VMCompiler()
        if params:
            compiler.set_params(params)
        compiler.lower(mod, target=target)

def generate_graph(sym, params, target, target_host):
    # Populate the shape and data type dictionary
    shape_dict = {"data": (1, 3, 224, 224)}
    dtype_dict = {"data": 'float32'}
    shape_dict.update({k: v.shape for k, v in params.items()})
    dtype_dict.update({k: str(v.dtype) for k, v in params.items()})

    # Apply NNVM graph optimization passes
    sym = vta.graph.clean_cast(sym)
    sym = vta.graph.clean_conv_fuse(sym)
    assert env.BLOCK_IN == env.BLOCK_OUT
    sym = vta.graph.pack(sym, shape_dict, env.BATCH, env.BLOCK_OUT)

    # Compile NNVM graph
    with nnvm.compiler.build_config(opt_level=3):
        with vta.build_config():
            graph, lib, params = nnvm.compiler.build(sym,
                                                     target,
                                                     shape_dict,
                                                     dtype_dict,
                                                     params=params,
                                                     target_host=target_host)
    return graph, lib, params

def tune_and_evaluate(tuning_opt):
    if env.TARGET != "sim":
        # Get remote from fleet node
        remote = autotvm.measure.request_remote(env.TARGET,
                                                tracker_host,
                                                tracker_port,
                                                timeout=10000)
        # Reconfigure the JIT runtime and FPGA.
        vta.reconfig_runtime(remote)
        vta.program_fpga(remote, bitstream=None)
    else:
        # In simulation mode, host the RPC server locally.
        remote = rpc.LocalSession()

    # Register VTA tuning tasks
    register_vta_tuning_tasks()

    # Perform task extraction on Relay program
    print("Extract tasks...")
    relay_prog, params = compile_network(env, target, network, start_pack, stop_pack)
    mod = tvm.IRModule.from_expr(relay_prog)
    tasks = autotvm.task.extract_from_program(mod,
                                              params=params,
                                              ops=(tvm.relay.op.nn.conv2d,),
                                              target=target,
                                              target_host=env.target_host)

    # We should have extracted 10 convolution tasks
    assert len(tasks) == 10
    print("Extracted {} conv2d tasks:".format(len(tasks)))
    for tsk in tasks:
        inp = tsk.args[0][1]
        wgt = tsk.args[1][1]
        batch = inp[0] * inp[4]
        in_filter = inp[1] * inp[5]
        out_filter = wgt[0] * wgt[4]
        height, width = inp[2], inp[3]
        hkernel, wkernel = wgt[2], wgt[3]
        hstride, wstride = tsk.args[2][0], tsk.args[2][1]
        hpad, wpad = tsk.args[3][0], tsk.args[3][1]
        print("({}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {})".format(
            batch, height, width, in_filter, out_filter,
            hkernel, wkernel, hpad, wpad, hstride, wstride))

    # We do not run the tuning in our webpage server since it takes too long.
    # Comment the following line to run it by yourself.
    return

    # run tuning tasks
    print("Tuning...")
    tune_tasks(tasks, **tuning_opt)

    # compile kernels with history best records
    with autotvm.tophub.context(target, extra_files=[log_file]):
        # Compile network
        print("Compile...")
        with relay.build_config(opt_level=3, disabled_pass={"AlterOpLayout"}):
            if target.device_name != "vta":
                graph, lib, params = relay.build(relay_prog,
                                                 target=target,
                                                 params=params,
                                                 target_host=env.target_host)
            else:
                with vta.build_config():
                    graph, lib, params = relay.build(relay_prog,
                                                     target=target,
                                                     params=params,
                                                     target_host=env.target_host)

        # Export library
        print("Upload...")
        temp = util.tempdir()
        lib.save(temp.relpath("graphlib.o"))
        remote.upload(temp.relpath("graphlib.o"))
        lib = remote.load_module("graphlib.o")

        # Generate the graph runtime
        ctx = remote.ext_dev(0) if device == "vta" else remote.cpu(0)
        m = graph_runtime.create(graph, lib, ctx)

        # upload parameters to device
        image = tvm.nd.array(
            (np.random.uniform(size=(1, 3, 224, 224))).astype('float32'))
        m.set_input(**params)
        m.set_input('data', image)

        # evaluate
        print("Evaluate inference time cost...")
        timer = m.module.time_evaluator("run", ctx, number=1, repeat=10)
        tcost = timer()
        prof_res = np.array(tcost.results) * 1000  # convert to millisecond
        print("Mean inference time (std dev): %.2f ms (%.2f ms)" %
              (np.mean(prof_res), np.std(prof_res)))

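# The `tuning_opt` dictionary consumed by `tune_and_evaluate` / `tune_tasks` is not shown
# above. A sketch of a typical value, modeled on the VTA autotuning tutorial; the log file
# name is a placeholder, `tracker_host`/`tracker_port` come from the surrounding script,
# and RPCRunner options vary somewhat across TVM versions:
tuning_option = {
    "log_filename": "resnet-18.log",
    "tuner": "random",
    "n_trial": 1000,
    "early_stopping": None,
    "measure_option": autotvm.measure_option(
        builder=autotvm.LocalBuilder(),
        runner=autotvm.RPCRunner(
            env.TARGET,
            host=tracker_host,
            port=tracker_port,
            number=5,
            timeout=60,
            check_correctness=True,
        ),
    ),
}
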
            check_correctness=True))
    }

    tune_tasks(tasks, **tuning_opt)

    # Compile kernels with history best records
    with autotvm.tophub.context(target, extra_files=[opt.log_filename]):
        # Compile network
        print("Compiling network with best tuning parameters...")
        with relay.build_config(opt_level=3, disabled_pass={"AlterOpLayout"}):
            if target.device_name != "vta":
                graph, lib, params = relay.build(relay_prog,
                                                 target=target,
                                                 params=params,
                                                 target_host=env.target_host)
            else:
                with vta.build_config():
                    graph, lib, params = relay.build(relay_prog,
                                                     target=target,
                                                     params=params,
                                                     target_host=env.target_host)

        # Export library
        temp = util.tempdir()
        lib.save(temp.relpath("graphlib.o"))
        remote.upload(temp.relpath("graphlib.o"))
        lib = remote.load_module("graphlib.o")

        # If detailed runtime info is needed build with debug runtime
        if opt.debug_profile:
            m = debug_runtime.create(graph, lib, ctx)
        else:
            m = graph_runtime.create(graph, lib, ctx)

def run_conv2d(env, remote, wl, target, check_correctness=True, print_ir=False, samples=4):
    # Workload assertions
    assert wl.hpad == wl.wpad

    # Perform packing only if we are targeting the accelerator
    if "arm_cpu" in target.keys:
        data_pack = False
        layout = "NCHW"
        conv2d_fcompute = topi.arm_cpu.conv2d_nchw_spatial_pack
        conv2d_fschedule = topi.arm_cpu.schedule_conv2d_nchw_spatial_pack
    elif "vta" in target.keys:
        data_pack = True
        layout = "NCHW%dn%dc" % (env.BATCH, env.BLOCK_IN)
        conv2d_fcompute = vta.top.conv2d_packed
        conv2d_fschedule = vta.top.schedule_conv2d_packed

    # Derive shapes depending upon packing
    a_shape = (wl.batch, wl.in_filter, wl.height, wl.width)
    w_shape = (wl.out_filter, wl.in_filter, wl.hkernel, wl.wkernel)
    b_shape = (wl.batch, wl.out_filter, 1, 1)
    if data_pack:
        data_shape = (
            wl.batch // env.BATCH,
            wl.in_filter // env.BLOCK_IN,
            wl.height,
            wl.width,
            env.BATCH,
            env.BLOCK_IN,
        )
        kernel_shape = (
            wl.out_filter // env.BLOCK_OUT,
            wl.in_filter // env.BLOCK_IN,
            wl.hkernel,
            wl.wkernel,
            env.BLOCK_OUT,
            env.BLOCK_IN,
        )
        bias_shape = (
            wl.batch // env.BATCH,
            wl.out_filter // env.BLOCK_OUT,
            1,
            1,
            env.BATCH,
            env.BLOCK_OUT,
        )
    else:
        data_shape = a_shape
        kernel_shape = w_shape
        bias_shape = b_shape
    data = te.placeholder(data_shape, name="data", dtype=env.inp_dtype)
    kernel = te.placeholder(kernel_shape, name="kernel", dtype=env.wgt_dtype)
    bias = te.placeholder(bias_shape, name="bias", dtype=env.acc_dtype)
    padding = relay.nn.get_pad_tuple2d((wl.hpad, wl.wpad))

    # Define base computation schedule
    with target:
        if data_pack:
            res = conv2d_fcompute(
                data, kernel, (wl.hstride, wl.wstride), padding, (1, 1), layout, env.acc_dtype
            )
        else:
            res = conv2d_fcompute(
                data, kernel, (wl.hstride, wl.wstride), padding, (1, 1), env.acc_dtype
            )
        res = topi.right_shift(res, 8)
        res = topi.add(res, bias)
        res = my_clip(res, 0, (1 << env.OUT_WIDTH - 1) - 1)
        res = topi.cast(res, env.out_dtype)
        # Derive base schedule
        s = conv2d_fschedule([res])
        if print_ir:
            print(vta.lower(s, [data, kernel, bias, res], simple_mode=True))

    # Derive number of ops
    fout_height = (wl.height + 2 * wl.hpad - wl.hkernel) // wl.hstride + 1
    fout_width = (wl.width + 2 * wl.wpad - wl.wkernel) // wl.wstride + 1
    num_ops = (
        2
        * wl.batch
        * fout_height
        * fout_width
        * wl.hkernel
        * wl.wkernel
        * wl.out_filter
        * wl.in_filter
    )

    # @memoize("vta.tests.test_benchmark_topi.conv2d.verify_nhwc")
    def get_ref_data():
        # derive min max for act, wgt, and bias types (max non inclusive)
        a_min, a_max = 0 - (1 << (env.INP_WIDTH - 1)), (1 << (env.INP_WIDTH - 1))
        w_min, w_max = 0 - (1 << (env.WGT_WIDTH - 1)), (1 << (env.WGT_WIDTH - 1))
        b_min, b_max = 0 - 1 << (env.INP_WIDTH + env.WGT_WIDTH - 2), 1 << (env.INP_WIDTH + env.WGT_WIDTH - 2)
        a_np = np.random.randint(a_min, a_max, size=a_shape).astype(data.dtype)
        w_np = np.random.randint(w_min, w_max, size=w_shape).astype(kernel.dtype)
        b_np = np.random.randint(b_min, b_max, size=b_shape).astype(env.acc_dtype)
        r_np = tvm.topi.testing.conv2d_nchw_python(
            a_np.astype(env.acc_dtype),
            w_np.astype(env.acc_dtype),
            (wl.hstride, wl.wstride),
            wl.hpad,
        ).astype(env.acc_dtype)
        return a_np, w_np, b_np, r_np

    # Data in original format
    data_np, kernel_np, bias_np, res_ref = get_ref_data()
    if data_pack:
        data_np = data_np.reshape(
            wl.batch // env.BATCH,
            env.BATCH,
            wl.in_filter // env.BLOCK_IN,
            env.BLOCK_IN,
            wl.height,
            wl.width,
        ).transpose((0, 2, 4, 5, 1, 3))
        kernel_np = kernel_np.reshape(
            wl.out_filter // env.BLOCK_OUT,
            env.BLOCK_OUT,
            wl.in_filter // env.BLOCK_IN,
            env.BLOCK_IN,
            wl.hkernel,
            wl.wkernel,
        ).transpose((0, 2, 4, 5, 1, 3))
        bias_np = bias_np.reshape(
            wl.batch // env.BATCH, wl.out_filter // env.BLOCK_OUT, 1, 1, env.BATCH, env.BLOCK_OUT
        )

    # Build
    if "vta" in target.keys:
        with vta.build_config(disabled_pass={"tir.CommonSubexprElimTIR"}):
            mod = vta.build(
                s,
                [data, kernel, bias, res],
                target=tvm.target.Target(target, host=env.target_host),
                name="conv2d",
            )
    else:
        mod = tvm.build(
            s,
            [data, kernel, bias, res],
            target=tvm.target.Target(target, host=env.target_host),
            name="conv2d",
        )
    temp = utils.tempdir()
    mod.save(temp.relpath("conv2d.o"))
    remote.upload(temp.relpath("conv2d.o"))
    f = remote.load_module("conv2d.o")
    dev = remote.device(str(target))

    res_np = np.zeros(topi.utils.get_const_tuple(res.shape)).astype(res.dtype)
    data_arr = tvm.nd.array(data_np, dev)
    kernel_arr = tvm.nd.array(kernel_np, dev)
    bias_arr = tvm.nd.array(bias_np, dev)
    res_arr = tvm.nd.array(res_np, dev)
    time_f = f.time_evaluator("conv2d", dev, number=samples)

    # In vta sim mode, collect simulator runtime statistics
    stats = {}
    cost = None
    if env.TARGET in ["sim", "tsim"]:
        # Check if we're in local RPC mode (allows us to rebuild the
        # runtime on the fly when varying the VTA designs)
        local_rpc = int(os.environ.get("VTA_LOCAL_SIM_RPC", "0"))
        if local_rpc:
            if env.TARGET == "sim":
                remote.get_function("vta.simulator.profiler_clear")()
            else:
                remote.get_function("vta.tsim.profiler_clear")()
            cost = time_f(data_arr, kernel_arr, bias_arr, res_arr)
            if env.TARGET == "sim":
                stats = json.loads(remote.get_function("vta.simulator.profiler_status")())
            else:
                stats = json.loads(remote.get_function("vta.tsim.profiler_status")())
        else:
            simulator.clear_stats()
            cost = time_f(data_arr, kernel_arr, bias_arr, res_arr)
            stats = simulator.stats()
    else:
        cost = time_f(data_arr, kernel_arr, bias_arr, res_arr)

    # Check correctness
    correct = False
    if check_correctness:
        res_orig = res_arr.numpy()
        if data_pack:
            res_orig = res_orig.transpose((0, 4, 1, 5, 2, 3)).reshape(
                wl.batch, wl.out_filter, fout_height, fout_width
            )
            bias_np = bias_np.transpose((0, 4, 1, 5, 2, 3)).reshape(wl.batch, wl.out_filter, 1, 1)
        res_ref = res_ref >> env.WGT_WIDTH
        res_ref += bias_np
        res_ref = np.clip(res_ref, 0, (1 << env.OUT_WIDTH - 1) - 1)
        res_ref = res_ref.astype(env.out_dtype)
        correct = np.allclose(res_orig, res_ref)

    gops = (num_ops / cost.mean) / float(10**9)
    status = "PASSED" if correct else "FAILED"
    if "arm_cpu" in target.keys:
        device = "CPU"
    elif "vta" in target.keys:
        device = "VTA"
    print("%s CONV2D TEST %s: Time cost = %g sec/op, %g GOPS" % (device, status, cost.mean, gops))

    return correct, cost, stats

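# The `wl` argument to `run_conv2d` is a conv2d workload descriptor. In VTA's conv2d
# benchmark tests it is a namedtuple along these lines; the type name and the example
# workload values below are assumptions, but the field set matches what run_conv2d reads:
from collections import namedtuple

Workload = namedtuple(
    "Conv2DWorkload",
    ["batch", "height", "width", "in_filter", "out_filter",
     "hkernel", "wkernel", "hpad", "wpad", "hstride", "wstride"],
)

# e.g. one ResNet-18-style layer, run on the VTA target of the current environment:
# run_conv2d(env, remote, Workload(1, 56, 56, 64, 64, 3, 3, 1, 1, 1, 1), env.target)
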
# VTA compute intrinsics.
print(vta.lower(s, [data, kernel, res], simple_mode=True))

######################################################################
# TVM Compilation and Verification
# --------------------------------
# After specifying the schedule, we can compile it into a TVM function.
# We save the module so we can send it over RPC.
# We run the function and verify it against a numpy implementation to
# ensure correctness.

# This library facilitates 2D convolution testing
from tvm.topi.testing import conv2d_nchw_python

# Compile the TVM module
with vta.build_config(disabled_pass={"tir.CommonSubexprElimTIR"}):
    my_conv = vta.build(
        s, [data, kernel, res], tvm.target.Target("ext_dev", host=env.target_host), name="my_conv"
    )
temp = utils.tempdir()
my_conv.save(temp.relpath("conv2d.o"))
remote.upload(temp.relpath("conv2d.o"))
f = remote.load_module("conv2d.o")

# Get the remote device context
ctx = remote.ext_dev(0)

# Initialize the data and kernel arrays randomly in the int range
# of (-128, 128] in NCHW layout
data_np = np.random.randint(-128, 128,

        ),
    }

    tune_tasks(tasks, **tuning_opt)

    # Compile kernels with history best records
    with autotvm.tophub.context(target, extra_files=[opt.log_filename]):
        # Compile network
        print("Compiling network with best tuning parameters...")
        if target.device_name != "vta":
            with tvm.transform.PassContext(opt_level=3, disabled_pass={"AlterOpLayout"}):
                graph, lib, params = relay.build(
                    relay_prog, target=target, params=params, target_host=env.target_host
                )
        else:
            with vta.build_config(opt_level=3, disabled_pass={"AlterOpLayout"}):
                graph, lib, params = relay.build(
                    relay_prog, target=target, params=params, target_host=env.target_host
                )

        # Export library
        temp = util.tempdir()
        lib.save(temp.relpath("graphlib.o"))
        remote.upload(temp.relpath("graphlib.o"))
        lib = remote.load_module("graphlib.o")

        # If detailed runtime info is needed build with debug runtime
        if opt.debug_profile:
            m = debug_runtime.create(graph, lib, ctx)
        else:
            m = graph_runtime.create(graph, lib, ctx)

def check_alu(tvm_op, np_op=None, use_imm=False):
    """Test ALU"""
    m = 8
    n = 8
    imm = np.random.randint(1, 5)
    # compute
    a = tvm.placeholder(
        (m, n, env.BATCH, env.BLOCK_OUT), name="a", dtype=env.acc_dtype)
    a_buf = tvm.compute(
        (m, n, env.BATCH, env.BLOCK_OUT), lambda *i: a(*i), "a_buf")  # DRAM->SRAM
    if use_imm:
        res_buf = tvm.compute(
            (m, n, env.BATCH, env.BLOCK_OUT),
            lambda *i: tvm_op(a_buf(*i), imm), "res_buf")  # compute
    else:
        b = tvm.placeholder(
            (m, n, env.BATCH, env.BLOCK_OUT), name="b", dtype=env.acc_dtype)
        b_buf = tvm.compute(
            (m, n, env.BATCH, env.BLOCK_OUT), lambda *i: b(*i), "b_buf")  # DRAM->SRAM
        res_buf = tvm.compute(
            (m, n, env.BATCH, env.BLOCK_OUT),
            lambda *i: tvm_op(a_buf(*i), b_buf(*i)), "res_buf")  # compute
    res = tvm.compute(
        (m, n, env.BATCH, env.BLOCK_OUT),
        lambda *i: res_buf(*i).astype(env.inp_dtype), "res")  # SRAM->DRAM

    # schedule
    s = tvm.create_schedule(res.op)
    s[a_buf].set_scope(env.acc_scope)  # SRAM
    s[a_buf].pragma(a_buf.op.axis[0], env.dma_copy)  # DRAM->SRAM
    s[res_buf].set_scope(env.acc_scope)  # SRAM
    s[res_buf].pragma(res_buf.op.axis[0], env.alu)  # compute
    s[res].pragma(res.op.axis[0], env.dma_copy)  # SRAM->DRAM
    if not use_imm:
        s[b_buf].set_scope(env.acc_scope)  # SRAM
        s[b_buf].pragma(b_buf.op.axis[0], env.dma_copy)  # DRAM->SRAM

    if not remote:
        return

    # build
    with vta.build_config():
        if use_imm:
            mod = vta.build(s, [a, res], "ext_dev", env.target_host)
        else:
            mod = vta.build(s, [a, b, res], "ext_dev", env.target_host)
    temp = util.tempdir()
    mod.save(temp.relpath("load_act.o"))
    remote.upload(temp.relpath("load_act.o"))
    f = remote.load_module("load_act.o")

    # verify
    ctx = remote.ext_dev(0)
    a_np = np.random.randint(
        -16, 16, size=(m, n, env.BATCH, env.BLOCK_OUT)).astype(a.dtype)
    if use_imm:
        res_np = np_op(a_np, imm) if np_op else tvm_op(a_np, imm)
    else:
        b_np = np.random.randint(
            -16, 16, size=(m, n, env.BATCH, env.BLOCK_OUT)).astype(b.dtype)
        res_np = np_op(a_np, b_np) if np_op else tvm_op(a_np, b_np)
    res_np = res_np.astype(res.dtype)
    a_nd = tvm.nd.array(a_np, ctx)
    res_nd = tvm.nd.array(
        np.zeros((m, n, env.BATCH, env.BLOCK_OUT)).astype(res.dtype), ctx)

    if env.TARGET == "tsim":
        simulator.tsim_init("libvta_hw")

    if use_imm:
        f(a_nd, res_nd)
    else:
        b_nd = tvm.nd.array(b_np, ctx)
        f(a_nd, b_nd, res_nd)

    np.testing.assert_equal(res_np, res_nd.asnumpy())

relay_prog = mod["main"] # Compile Relay program with AlterOpLayout disabled if target.device_name != "vta": with tvm.transform.PassContext(opt_level=3, disabled_pass={"AlterOpLayout"}): graph, lib, params = relay.build(relay_prog, target=tvm.target.Target( target, host=env.target_host), params=params) else: if env.TARGET == "intelfocl": # multiple targets to run both on cpu and vta target = {"cpu": env.target_vta_cpu, "ext_dev": target} with vta.build_config( opt_level=3, disabled_pass={"AlterOpLayout", "tir.CommonSubexprElimTIR"}): graph, lib, params = relay.build(relay_prog, target=tvm.target.Target( target, host=env.target_host), params=params) # Measure Relay build time build_time = time.time() - build_start print(model + " inference graph built in {0:.2f}s!".format(build_time)) # Send the inference library over to the remote RPC server temp = utils.tempdir() lib.export_library(temp.relpath("graphlib.tar")) remote.upload(temp.relpath("graphlib.tar")) lib = remote.load_module("graphlib.tar")
def main(model, start_pack, stop_pack, data_shape=(1, 3, 224, 224), dtype='float32'):
    # Make sure that TVM was compiled with RPC=1
    assert tvm.module.enabled("rpc")

    ######################################################################
    # Define the platform and model targets
    # -------------------------------------
    # Execute on CPU vs. VTA, and define the model.

    # Load VTA parameters from the vta/config/vta_config.json file
    env = vta.get_env()

    # Set ``device=arm_cpu`` to run inference on the CPU
    # or ``device=vta`` to run inference on the FPGA.
    device = "vta"
    target = env.target if device == "vta" else env.target_vta_cpu

    # Name of Gluon model to compile
    # The ``start_pack`` and ``stop_pack`` labels indicate where
    # to start and end the graph packing relay pass: in other words
    # where to start and finish offloading to VTA.

    ######################################################################
    # Obtain an execution remote
    # ---------------------------------
    # When target is 'pynq', reconfigure FPGA and runtime.
    # Otherwise, if target is 'sim', execute locally.
    print(f"Target is {env.TARGET}")
    if env.TARGET in ["sim", "tsim"]:
        remote = rpc.LocalSession()
    else:
        print(f"Error, incorrect target for benchmarking: {env.TARGET}")

    # Get execution context from remote
    ctx = remote.ext_dev(0) if device == "vta" else remote.cpu(0)

    ######################################################################
    # Build the inference graph runtime
    # ---------------------------------
    # Grab ResNet-18 model from Gluon model zoo and compile with Relay.
    # The compilation steps are:
    # 1) Front end translation from MxNet into Relay module.
    # 2) Apply 8-bit quantization: here we skip the first conv layer,
    #    and dense layer which will both be executed in fp32 on the CPU.
    # 3) Perform graph packing to alter the data layout for tensorization.
    # 4) Perform constant folding to reduce number of operators (e.g. eliminate
    #    batch norm multiply).
    # 5) Perform relay build to object file.
    # 6) Load the object file onto remote (FPGA device).
    # 7) Generate graph runtime, `m`.
    # Load pre-configured AutoTVM schedules
    with autotvm.tophub.context(target):
        # Populate the shape and data type dictionary for ResNet input
        dtype_dict = {"data": 'float32'}
        shape_dict = {"data": data_shape}

        # Measure build start time
        build_start = time.time()

        # Start front end compilation
        if model == 'resnet':
            mod, params = test_resnet_mxnet(env)
        elif model == 'yolo':
            mod, params = test_yolo_darknet()
        elif model == 'lenet':
            mod, params = lenet()
        elif model == 'mobilenet':
            mod, params = mobilenet()
        else:
            print(f"Error, incorrect model name: {model}")

        ### Need to bind params

        # Update shape and type dictionary
        shape_dict.update({k: v.shape for k, v in params.items()})
        dtype_dict.update({k: str(v.dtype) for k, v in params.items()})

        with relay.quantize.qconfig(global_scale=8.0, skip_conv_layers=[0]):
            relay_prog = relay.quantize.quantize(mod['main'], params=params)
        print(f"Finishing quantizing graph")

        # Perform graph packing and constant folding for VTA target
        if target.device_name == "vta":
            assert env.BLOCK_IN == env.BLOCK_OUT
            relay_prog = graph_pack(relay_prog,
                                    env.BATCH,
                                    env.BLOCK_OUT,
                                    env.WGT_WIDTH,
                                    start_name=start_pack,
                                    stop_name=stop_pack)
        print(f"Finishing packing graph")

        # Compile Relay program with AlterOpLayout disabled
        with relay.build_config(opt_level=3, disabled_pass={"AlterOpLayout"}):
            if target.device_name != "vta":
                graph, lib, params = relay.build(relay_prog,
                                                 target=target,
                                                 params=params,
                                                 target_host=env.target_host)
            else:
                with vta.build_config():
                    graph, lib, params = relay.build(relay_prog,
                                                     target=target,
                                                     params=params,
                                                     target_host=env.target_host)

        # Measure Relay build time
        build_time = time.time() - build_start
        print(model + " inference graph built in {0:.2f}s!".format(build_time))

        # Send the inference library over to the remote RPC server
        temp = util.tempdir()
        lib.save(temp.relpath("graphlib.o"))
        remote.upload(temp.relpath("graphlib.o"))
        lib = remote.load_module("graphlib.o")

        # Graph runtime
        m = graph_runtime.create(graph, lib, ctx)

        # Set the network parameters and inputs
        data = np.random.uniform(size=data_shape).astype(dtype)
        m.set_input(**params)
        m.set_input('data', tvm.nd.array(data.astype(dtype)))

        # Perform inference and gather execution statistics
        # More on: https://docs.tvm.ai/api/python/module.html#tvm.module.Module.time_evaluator
        num = 1  # number of times we run module for a single measurement
        rep = 1  # number of measurements (we derive std dev from this)
        timer = m.module.time_evaluator("run", ctx, number=num, repeat=rep)

        if env.TARGET in ["sim", "tsim"]:
            simulator.clear_stats()
            timer()
            sim_stats = simulator.stats()
            print("\nExecution statistics:")
            for k, v in sim_stats.items():
                # Since we execute the workload many times, we need to normalize stats
                # Note that there is always one warm up run
                # Therefore we divide the overall stats by (num * rep + 1)
                print("\t{:<16}: {:>16}".format(k, v // (num * rep + 1)))
        else:
            tcost = timer()
            std = np.std(tcost.results) * 1000
            mean = tcost.mean * 1000
            print("\nPerformed inference in %.2fms (std = %.2f) for %d samples" %
                  (mean, std, env.BATCH))
            print("Average per sample inference time: %.2fms" % (mean / env.BATCH))

                                 round_for_shift=True):
            mod = relay.quantize.quantize(mod, params=params)

        # Perform graph packing and constant folding for VTA target
        mod = graph_pack(mod["main"],
                         env.BATCH,
                         env.BLOCK_OUT,
                         env.WGT_WIDTH,
                         start_name=pack_dict[MODEL_NAME][0],
                         stop_name=pack_dict[MODEL_NAME][1],
                         start_name_idx=pack_dict[MODEL_NAME][2],
                         stop_name_idx=pack_dict[MODEL_NAME][3])
    else:
        mod = mod["main"]

    # Compile Relay program with AlterOpLayout disabled
    with vta.build_config(disabled_pass={"AlterOpLayout"}):
        graph, lib, params = relay.build(mod,
                                         target=target,
                                         params=params,
                                         target_host=env.target_host)

    # Measure Relay build time
    build_time = time.time() - build_start
    print(MODEL_NAME + " inference graph built in {0:.2f}s!".format(build_time))

    # Send the inference library over to the remote RPC server
    temp = util.tempdir()
    lib.save(temp.relpath("graphlib.o"))
    remote.upload(temp.relpath("graphlib.o"))
    lib = remote.load_module("graphlib.o")

def compile_model(self):
    if device == 'vta':
        self.remote = rpc.connect(self.pynq_addr, 9091)
        vta.reconfig_runtime(self.remote)
        vta.program_fpga(self.remote, bitstream=None)
    else:
        self.remote = rpc.LocalSession()

    self.ctx = self.remote.ext_dev(0) if device == 'vta' else self.remote.cpu(0)

    # Load pre-configured AutoTVM schedules
    with autotvm.tophub.context(target):
        # Populate the shape and data type dictionary for ResNet input
        dtype_dict = {'data': 'float32'}
        shape_dict = {'data': (env.BATCH, 3, 224, 224)}

        gluon_model = vision.resnet18_v1(
            pretrained=True, ctx=ctx
        ).features if args.nonsplit else splitnet.resnet18_v1_split(self.id + 1)

        # Measure build start time
        build_start = time.time()

        # Start front end compilation
        mod, params = relay.frontend.from_mxnet(gluon_model, shape_dict)

        # Update shape and type dictionary
        shape_dict.update({k: v.shape for k, v in params.items()})
        dtype_dict.update({k: str(v.dtype) for k, v in params.items()})

        # Perform quantization in Relay
        with relay.quantize.qconfig(global_scale=8.0, skip_conv_layers=[0]):
            relay_prog = relay.quantize.quantize(mod['main'], params=params)

        # Perform graph packing and constant folding for VTA target
        if target.device_name == 'vta':
            assert env.BLOCK_IN == env.BLOCK_OUT
            relay_prog = graph_pack(relay_prog,
                                    env.BATCH,
                                    env.BLOCK_OUT,
                                    env.WGT_WIDTH,
                                    start_name=start_pack,
                                    stop_name=stop_pack)

        # Compile Relay program with AlterOpLayout disabled
        with relay.build_config(opt_level=3, disabled_pass={'AlterOpLayout'}):
            if target.device_name != 'vta':
                graph, lib, params = relay.build(relay_prog,
                                                 target=target,
                                                 params=params,
                                                 target_host=env.target_host)
            else:
                with vta.build_config():
                    graph, lib, params = relay.build(relay_prog,
                                                     target=target,
                                                     params=params,
                                                     target_host=env.target_host)

        self.params = params

        # Measure Relay build time
        build_time = time.time() - build_start
        print(f'inference graph for thread {self.id} built in {build_time:.4f}s!')

        # Send the inference library over to the remote RPC server
        temp = util.tempdir()
        lib.save(temp.relpath('graphlib.o'))
        self.remote.upload(temp.relpath('graphlib.o'))
        lib = self.remote.load_module('graphlib.o')

        # Graph runtime
        self.m = graph_runtime.create(graph, lib, self.ctx)

def check_alu(tvm_op, np_op=None, use_imm=False, test_name=None):
    """Test ALU"""
    m = 8
    n = 8
    imm = np.random.randint(1, 5)
    # compute
    a = te.placeholder((m, n, env.BATCH, env.BLOCK_OUT), name="a", dtype=env.acc_dtype)
    a_buf = te.compute(
        (m, n, env.BATCH, env.BLOCK_OUT), lambda *i: a(*i), "a_buf"
    )  # DRAM->SRAM
    if use_imm:
        res_buf = te.compute(
            (m, n, env.BATCH, env.BLOCK_OUT), lambda *i: tvm_op(a_buf(*i), imm), "res_buf"
        )  # compute
    else:
        b = te.placeholder((m, n, env.BATCH, env.BLOCK_OUT), name="b", dtype=env.acc_dtype)
        b_buf = te.compute(
            (m, n, env.BATCH, env.BLOCK_OUT), lambda *i: b(*i), "b_buf"
        )  # DRAM->SRAM
        res_buf = te.compute(
            (m, n, env.BATCH, env.BLOCK_OUT),
            lambda *i: tvm_op(a_buf(*i), b_buf(*i)),
            "res_buf",
        )  # compute
    res = te.compute(
        (m, n, env.BATCH, env.BLOCK_OUT),
        lambda *i: res_buf(*i).astype(env.inp_dtype),
        "res",
    )  # SRAM->DRAM

    # schedule
    s = te.create_schedule(res.op)
    s[a_buf].set_scope(env.acc_scope)  # SRAM
    s[a_buf].pragma(a_buf.op.axis[0], env.dma_copy)  # DRAM->SRAM
    s[res_buf].set_scope(env.acc_scope)  # SRAM
    s[res_buf].pragma(res_buf.op.axis[0], env.alu)  # compute
    s[res].pragma(res.op.axis[0], env.dma_copy)  # SRAM->DRAM
    if not use_imm:
        s[b_buf].set_scope(env.acc_scope)  # SRAM
        s[b_buf].pragma(b_buf.op.axis[0], env.dma_copy)  # DRAM->SRAM

    if not remote:
        return

    # build
    with vta.build_config():
        if use_imm:
            mod = vta.build(s, [a, res], "ext_dev", env.target_host)
        else:
            mod = vta.build(s, [a, b, res], "ext_dev", env.target_host)
    temp = utils.tempdir()
    mod.save(temp.relpath("load_act.o"))
    remote.upload(temp.relpath("load_act.o"))
    f = remote.load_module("load_act.o")

    # verify
    dev = remote.ext_dev(0)
    a_np = np.random.randint(-16, 16, size=(m, n, env.BATCH, env.BLOCK_OUT)).astype(a.dtype)
    if use_imm:
        res_np = np_op(a_np, imm) if np_op else tvm_op(a_np, imm)
    else:
        b_np = np.random.randint(-16, 16, size=(m, n, env.BATCH, env.BLOCK_OUT)).astype(b.dtype)
        res_np = np_op(a_np, b_np) if np_op else tvm_op(a_np, b_np)
    res_np = res_np.astype(res.dtype)
    a_nd = tvm.nd.array(a_np, dev)
    res_nd = tvm.nd.array(np.zeros((m, n, env.BATCH, env.BLOCK_OUT)).astype(res.dtype), dev)

    if env.TARGET in ["sim", "tsim"]:
        simulator.clear_stats()

    if use_imm:
        f(a_nd, res_nd)
    else:
        b_nd = tvm.nd.array(b_np, dev)
        f(a_nd, b_nd, res_nd)

    np.testing.assert_equal(res_np, res_nd.numpy())

    if env.TARGET in ["sim", "tsim"]:
        sim_stats = simulator.stats()
        print("ALU {} execution statistics:".format(test_name))
        for k, v in sim_stats.items():
            print("\t{:<16}: {:>16}".format(k, v))

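# In VTA's ALU unit test, `check_alu` is nested inside a `_run(env, remote)` helper like
# the ones earlier in this collection and invoked once per ALU op. Typical calls (a
# sketch; the exact op coverage varies by TVM version):
check_alu(lambda x, y: x << y, np.left_shift, use_imm=True, test_name="SHL")
check_alu(tvm.te.max, np.maximum, use_imm=True, test_name="MAX")
check_alu(tvm.te.max, np.maximum, test_name="MAX")
check_alu(lambda x, y: x + y, use_imm=True, test_name="ADD")
check_alu(lambda x, y: x + y, test_name="ADD")
check_alu(lambda x, y: x >> y, np.right_shift, use_imm=True, test_name="SHR")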