def test_env_scope():
    """Assert that a scoped ``vta.Environment`` overrides ``TARGET`` only inside its block."""
    outer_env = vta.get_env()

    # Start from the outer environment's config dict and change only the target.
    scoped_cfg = outer_env.pkg_config().cfg_dict
    scoped_cfg["TARGET"] = "xyz"

    with vta.Environment(scoped_cfg):
        active_target = vta.get_env().TARGET
        assert active_target == "xyz"

    # After leaving the block, the active target must match the one seen before entering it.
    active_target = vta.get_env().TARGET
    assert active_target == outer_env.TARGET
def run_single(model, target_name, device, config):
    """Run the experiment on a single target.

    Parameters
    ----------
    model
        Forwarded unchanged to ``build_model``.
    target_name
        Forwarded to ``init_vta_env`` to select the VTA environment.
    device
        ``'vta'`` selects the VTA target and the remote's external device;
        any other value selects the VTA CPU target and the remote's CPU.
    config
        Forwarded to ``init_remote`` and ``run_model``.

    Returns
    -------
    The value returned by ``run_model``.
    """
    on_vta = device == 'vta'

    with init_vta_env(target_name):
        vta_env = vta.get_env()
        if on_vta:
            target = vta_env.target
        else:
            target = vta_env.target_vta_cpu

        # Make sure TVM was compiled with RPC support.
        assert tvm.module.enabled('rpc')
        remote = init_remote(vta_env, config)

        # Get execution context from remote
        if on_vta:
            ctx = remote.ext_dev(0)
        else:
            ctx = remote.cpu(0)

        graph_module = build_model(model, remote, target, ctx, vta_env)
        return run_model(graph_module, remote, ctx, vta_env, config)
def convert_to_vta(model_path, image_channel, image_size):
    """Load a PyTorch model from disk and return it as a quantized, graph-packed Relay function.

    Parameters
    ----------
    model_path
        Path handed to ``torch.load``.
    image_channel, image_size
        Used to build the ``[1, image_channel, image_size, image_size]`` input
        shape that the model is traced with.

    Returns
    -------
    The result of ``graph_pack`` applied to the quantized ``"main"`` function.
    """
    # NOTE(review): torch.load unpickles the file; only use it on trusted model files.
    cpu = torch.device('cpu')
    net = torch.load(model_path, map_location=cpu).eval()

    input_shape = [1, image_channel, image_size, image_size]
    example_input = torch.randn(input_shape)
    traced = torch.jit.trace(net, example_input).eval()

    # NOTE(review): ``input_name`` is not defined in this function; it is
    # presumably a module-level name -- confirm it exists.
    shape_list = [(input_name, input_shape)]
    mod, params = relay.frontend.from_pytorch(traced, shape_list)
    print(mod["main"])

    remote = rpc.LocalSession()
    ctx = remote.ext_dev(0)
    # ``ctx``, ``target`` and ``target_host`` are assigned but not read below.
    target = 'vta'
    target_host = 'vta'
    env = vta.get_env()

    # Per-model graph_pack boundaries: [start_name, stop_name, start_name_idx, stop_name_idx]
    pack_dict = {
        "yolov3-tiny": ["nn.max_pool2d", "cast", 8, 237],
    }
    MODEL_NAME = 'yolov3-tiny'
    start_name, stop_name, start_idx, stop_idx = pack_dict[MODEL_NAME]

    with tvm.transform.PassContext(opt_level=2):
        quantize_cfg = relay.quantize.qconfig(
            global_scale=33.0,
            skip_conv_layers=[0],
            store_lowbit_output=True,
            round_for_shift=True,
        )
        with quantize_cfg:
            mod = relay.quantize.quantize(mod, params=params)
        print(mod["main"])

        mod = graph_pack(
            mod["main"],
            env.BATCH,
            env.BLOCK_OUT,
            env.WGT_WIDTH,
            start_name=start_name,
            stop_name=stop_name,
            start_name_idx=start_idx,
            stop_name_idx=stop_idx,
        )
    return mod
args = parser.parse_args()
ctx = mx.cpu()

# Get the dense (classifier) layer: either the output layer of a pretrained
# ResNet-18, or a fresh 1000-unit Dense layer whose weights are loaded from disk.
if args.nonsplit:
    dense = vision.resnet18_v1(pretrained=True, ctx=ctx).output
else:
    dense = gluon.nn.Dense(1000)
    dense.load_parameters('params/dense-1.params', ctx=ctx)

# Get categories for imagenet. The file is opened in a ``with`` block so the
# handle is closed as soon as the labels have been parsed.
with open('image_net_labels.json', 'r') as labels_file:
    categories = np.array(json.load(labels_file))

# The script needs a TVM build with RPC support.
assert tvm.module.enabled('rpc')

# Load VTA parameters from the vta/config/vta_config.json file
env = vta.get_env()

# device, `vta` or `cpu`
device = 'vta'
target = env.target if device == 'vta' else env.target_vta_cpu

# Operator names marking where graph packing starts and stops.
start_pack = 'nn.max_pool2d'
stop_pack = 'nn.global_avg_pool2d'

# perform inference and gather execution statistics
num = 1  # number of times we run module for a single measurement
rep = 1  # number of measurements (we derive std dev from this)

# ip addresses of pynq boards, hardcoded for demo
if args.nonsplit:
    pynqs = ['192.168.2.5']
# --------- # We start by programming the Pynq's FPGA and building its RPC runtime # as we did in the VTA introductory tutorial. from __future__ import absolute_import, print_function import os import tvm import vta import numpy as np from tvm import rpc from tvm.contrib import util from vta.testing import simulator # Load VTA parameters from the vta/config/vta_config.json file env = vta.get_env() # We read the Pynq RPC host IP address and port number from the OS environment host = os.environ.get("VTA_PYNQ_RPC_HOST", "192.168.2.99") port = int(os.environ.get("VTA_PYNQ_RPC_PORT", "9091")) # We configure both the bitstream and the runtime system on the Pynq # to match the VTA configuration specified by the vta_config.json file. if env.TARGET == "pynq": # Make sure that TVM was compiled with RPC=1 assert tvm.module.enabled("rpc") remote = rpc.connect(host, port) # Reconfigure the JIT runtime vta.reconfig_runtime(remote)
def test_env():
    """Assert that the active VTA environment's ``mock.alu`` equals ``"skip_alu"``."""
    assert vta.get_env().mock.alu == "skip_alu"
import os
from os.path import exists
import numpy as np
from mxnet.gluon.model_zoo import vision

import tvm
from tvm import autotvm, relay
from tvm.relay import op, transform

import vta
from vta.top import graph_pack
from vta.top.graphpack import run_opt_pass

# Load VTA parameters from the vta/config/vta_config.json file
ENV = vta.get_env()
# This module only supports a configuration whose target device is "vta".
assert ENV.target.device_name == "vta"

# Dictionary lookup for when to start/end bit packing.
# Each entry is a four-element list; the first two are operator names.
# presumably [start_name, stop_name, start_name_idx, stop_name_idx] as taken
# by graph_pack -- confirm against where PACK_DICT is consumed.
PACK_DICT = {
    "resnet18_v1": ["nn.max_pool2d", "nn.global_avg_pool2d", None, None],
}

# Name of Gluon model to compile
MODEL = "resnet18_v1"
# Fail at import time if the chosen model has no packing entry.
assert MODEL in PACK_DICT


def merge_transform_to_mxnet_model(mod):
    """ Add Image Transform Logic Into Model """
    # presumably per-channel values subtracted from the input image -- confirm
    svalue = np.array([123., 117., 104.])
    # Wrap the values in a Relay constant and cast it to float32.
    sub_data = relay.Constant(tvm.nd.array(svalue)).astype("float32")
def main(model, start_pack, stop_pack, data_shape=(1, 3, 224, 224), dtype='float32'):
    """Compile one of the supported models for VTA, run it, and print statistics.

    Parameters
    ----------
    model : str
        Which network to build: ``'resnet'``, ``'yolo'``, ``'lenet'`` or
        ``'mobilenet'``.
    start_pack, stop_pack : str
        Operator names forwarded to ``graph_pack`` as ``start_name`` and
        ``stop_name``: where offloading to VTA starts and finishes.
    data_shape : tuple of int
        Shape of the random tensor bound to the ``'data'`` input.
    dtype : str
        Element type of that random input tensor.

    Raises
    ------
    ValueError
        If ``env.TARGET`` is not ``"sim"`` or ``"tsim"``, or if ``model`` is not
        one of the supported names. Previously these cases only printed a
        message and then failed later on an undefined variable.
    AssertionError
        If TVM was built without RPC support, or if ``env.BLOCK_IN`` differs
        from ``env.BLOCK_OUT`` when packing for VTA.
    """
    # Make sure that TVM was compiled with RPC=1
    assert tvm.module.enabled("rpc")

    ######################################################################
    # Define the platform and model targets
    # -------------------------------------
    # Execute on CPU vs. VTA, and define the model.

    # Load VTA parameters from the vta/config/vta_config.json file
    env = vta.get_env()

    # Set ``device=arm_cpu`` to run inference on the CPU
    # or ``device=vta`` to run inference on the FPGA.
    device = "vta"
    target = env.target if device == "vta" else env.target_vta_cpu

    # Name of Gluon model to compile
    # The ``start_pack`` and ``stop_pack`` labels indicate where
    # to start and end the graph packing relay pass: in other words
    # where to start and finish offloading to VTA.

    ######################################################################
    # Obtain an execution remote
    # ---------------------------------
    # Only the simulator targets ('sim', 'tsim') are supported here; they
    # execute locally. Any other target is rejected up front.
    print(f"Target is {env.TARGET}")
    if env.TARGET in ["sim", "tsim"]:
        remote = rpc.LocalSession()
    else:
        # Without a remote nothing below can run, so fail with a clear error
        # instead of continuing to a NameError on ``remote``.
        raise ValueError(f"Error, incorrect target for benchmarking: {env.TARGET}")

    # Get execution context from remote
    ctx = remote.ext_dev(0) if device == "vta" else remote.cpu(0)

    ######################################################################
    # Build the inference graph runtime
    # ---------------------------------
    # Build the selected model and compile it with Relay.
    # The compilation steps are:
    #    1) Front end translation into a Relay module.
    #    2) Apply 8-bit quantization: here we skip the first conv layer,
    #       and dense layer which will both be executed in fp32 on the CPU.
    #    3) Perform graph packing to alter the data layout for tensorization.
    #    4) Perform constant folding to reduce number of operators (e.g. eliminate
    #       batch norm multiply).
    #    5) Perform relay build to object file.
    #    6) Load the object file onto remote (FPGA device).
    #    7) Generate graph runtime, `m`.

    # Load pre-configured AutoTVM schedules
    with autotvm.tophub.context(target):

        # Populate the shape and data type dictionary for the model input
        dtype_dict = {"data": 'float32'}
        shape_dict = {"data": data_shape}

        # Measure build start time
        build_start = time.time()

        # Start front end compilation
        if model == 'resnet':
            mod, params = test_resnet_mxnet(env)
        elif model == 'yolo':
            mod, params = test_yolo_darknet()
        elif model == 'lenet':
            mod, params = lenet()
        elif model == 'mobilenet':
            mod, params = mobilenet()
        else:
            # ``mod``/``params`` would be unbound below, so stop here.
            raise ValueError(f"Error, incorrect model name: {model}")

        ### Need to bind params

        # Update shape and type dictionary
        # (these dictionaries are filled in but not read later in this function)
        shape_dict.update({k: v.shape for k, v in params.items()})
        dtype_dict.update({k: str(v.dtype) for k, v in params.items()})

        with relay.quantize.qconfig(global_scale=8.0, skip_conv_layers=[0]):
            relay_prog = relay.quantize.quantize(mod['main'], params=params)

        print("Finishing quantizing graph")

        # Perform graph packing and constant folding for VTA target
        if target.device_name == "vta":
            assert env.BLOCK_IN == env.BLOCK_OUT
            relay_prog = graph_pack(
                relay_prog,
                env.BATCH,
                env.BLOCK_OUT,
                env.WGT_WIDTH,
                start_name=start_pack,
                stop_name=stop_pack)
            print("Finishing packing graph")

        # Compile Relay program with AlterOpLayout disabled
        with relay.build_config(opt_level=3, disabled_pass={"AlterOpLayout"}):
            if target.device_name != "vta":
                graph, lib, params = relay.build(
                    relay_prog, target=target,
                    params=params, target_host=env.target_host)
            else:
                with vta.build_config():
                    graph, lib, params = relay.build(
                        relay_prog, target=target,
                        params=params, target_host=env.target_host)

        # Measure Relay build time
        build_time = time.time() - build_start
        print(model + " inference graph built in {0:.2f}s!".format(build_time))

        # Send the inference library over to the remote RPC server
        temp = util.tempdir()
        lib.save(temp.relpath("graphlib.o"))
        remote.upload(temp.relpath("graphlib.o"))
        lib = remote.load_module("graphlib.o")

        # Graph runtime
        m = graph_runtime.create(graph, lib, ctx)

    # Set the network parameters and inputs (random data of the requested shape/dtype)
    data = np.random.uniform(size=data_shape).astype(dtype)

    m.set_input(**params)
    m.set_input('data', tvm.nd.array(data.astype(dtype)))

    # Perform inference and gather execution statistics
    # More on: https://docs.tvm.ai/api/python/module.html#tvm.module.Module.time_evaluator
    num = 1  # number of times we run module for a single measurement
    rep = 1  # number of measurements (we derive std dev from this)
    timer = m.module.time_evaluator("run", ctx, number=num, repeat=rep)

    if env.TARGET in ["sim", "tsim"]:
        simulator.clear_stats()
        timer()
        sim_stats = simulator.stats()
        print("\nExecution statistics:")
        for k, v in sim_stats.items():
            # Since we execute the workload many times, we need to normalize stats
            # Note that there is always one warm up run
            # Therefore we divide the overall stats by (num * rep + 1)
            print("\t{:<16}: {:>16}".format(k, v // (num * rep + 1)))
    else:
        # Not reachable while only simulator targets are accepted above; kept
        # so wall-clock reporting is ready if other targets are enabled.
        tcost = timer()
        std = np.std(tcost.results) * 1000
        mean = tcost.mean * 1000
        print("\nPerformed inference in %.2fms (std = %.2f) for %d samples" % (mean, std, env.BATCH))
        print("Average per sample inference time: %.2fms" % (mean / env.BATCH))