def compile_and_run_vm(mod, params, data_np, target):
    with tvm.transform.PassContext(opt_level=3):
        vm_exec = relay.vm.compile(mod, target=target, params=params)
    dev = tvm.device(target, 0)
    vm = VirtualMachine(vm_exec, dev)
    vm.set_input("main", **{input_name: data_np})
    return vm.run()
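# A minimal, self-contained usage sketch for compile_and_run_vm (not from the
# source). Assumptions: `input_name` is a module-level variable that
# compile_and_run_vm reads from the enclosing scope, and the single-op Relay
# module below stands in for a real model.
import numpy as np
import tvm
from tvm import relay
from tvm.runtime.vm import VirtualMachine

input_name = "data"  # assumed module-level name consumed by compile_and_run_vm
x = relay.var(input_name, shape=(1, 16), dtype="float32")
toy_mod = tvm.IRModule.from_expr(relay.Function([x], relay.nn.relu(x)))
toy_data = np.random.uniform(-1, 1, size=(1, 16)).astype("float32")

out = compile_and_run_vm(toy_mod, {}, toy_data, "llvm")
print(out.numpy().shape)  # expected: (1, 16)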
def f_timer(
    rt_mod: Union[tvm.runtime.Module, tvm.runtime.vm.Executable],
    dev: tvm.device,
    input_data: Dict[str, NDArray],
) -> None:
    """Run and benchmark the given runtime module, and print out the result.

    Parameters
    ----------
    rt_mod : Union[tvm.runtime.Module, tvm.runtime.vm.Executable]
        The runtime module or VM executable.
    dev : tvm.device
        The device to run the workload on.
    input_data : Dict[str, np.ndarray]
        The input data as a dictionary.
    """
    from tvm.contrib.graph_executor import GraphModule  # pylint:disable=import-outside-toplevel
    from tvm.runtime.vm import VirtualMachine  # pylint:disable=import-outside-toplevel

    try:
        if backend == "vm":
            vm = VirtualMachine(rt_mod, dev)  # pylint: disable=invalid-name
            ftimer = vm.benchmark(
                dev, min_repeat_ms=500, repeat=5, number=1, end_to_end=False, **input_data
            )
        elif backend == "graph":
            mod = GraphModule(rt_mod["default"](dev))
            for input_name, input_value in input_data.items():
                mod.set_input(input_name, input_value)
            ftimer = mod.module.time_evaluator("run", dev, min_repeat_ms=500, repeat=5, number=1)()
        else:
            raise ValueError(f"Backend {backend} not supported in f_timer!")
        results = list(np.array(ftimer.results) * 1000.0)  # type: ignore
        print("Running time in time_evaluator: ", results)
        print("-------------------------------")
        print(f"    Min (ms) : {min(results)}")
        print(f"    Max (ms) : {max(results)}")
        print(f" Median (ms) : {median(results)}")
        print(f"Average (ms) : {sum(results) / len(results)}")
    except Exception as exc:  # pylint: disable=broad-except
        print(f"Run module f_timer via RPC failed, exception: {exc}")
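# A hedged usage sketch (not from the source): f_timer reads `backend`, `np`,
# and `median` from module scope, so those are defined here; the toy softmax
# model and the "llvm" target are illustrative assumptions.
from statistics import median

import numpy as np
import tvm
from tvm import relay

backend = "graph"  # assumed module-level switch consumed by f_timer

x = relay.var("x", shape=(1, 64), dtype="float32")
toy_mod = tvm.IRModule.from_expr(relay.Function([x], relay.nn.softmax(x)))
with tvm.transform.PassContext(opt_level=3):
    toy_lib = relay.build(toy_mod, target="llvm")

f_timer(toy_lib, tvm.cpu(0), {"x": np.random.rand(1, 64).astype("float32")})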
def test_detection_models():
    img = "test_street_small.jpg"
    img_url = (
        "https://raw.githubusercontent.com/dmlc/web-data/"
        "master/gluoncv/detection/street_small.jpg"
    )
    download(img_url, img)
    input_shape = (1, 3, in_size, in_size)
    target = "llvm"
    input_name = "input0"
    shape_list = [(input_name, input_shape)]
    score_threshold = 0.9

    scripted_model = generate_jit_model(1)
    mod, params = relay.frontend.from_pytorch(scripted_model, shape_list)

    with tvm.transform.PassContext(opt_level=3, disabled_pass=["FoldScaleAxis"]):
        vm_exec = relay.vm.compile(mod, target=target, params=params)

    ctx = tvm.cpu()
    vm = VirtualMachine(vm_exec, ctx)
    data = process_image(img)
    pt_res = scripted_model(data)
    data = data.detach().numpy()
    vm.set_input("main", **{input_name: data})
    tvm_res = vm.run()

    # Note: due to accumulated numerical error, we can't directly compare results
    # with the pytorch output. Some boxes might have a tiny difference in score,
    # and the order can become different. We just measure how many valid boxes
    # there are for the input image.
    pt_scores = pt_res[1].detach().numpy().tolist()
    tvm_scores = tvm_res[1].asnumpy().tolist()
    num_pt_valid_scores = num_tvm_valid_scores = 0

    for score in pt_scores:
        if score >= score_threshold:
            num_pt_valid_scores += 1
        else:
            break

    for score in tvm_scores:
        if score >= score_threshold:
            num_tvm_valid_scores += 1
        else:
            break

    assert num_pt_valid_scores == num_tvm_valid_scores, (
        "Output mismatch: Under score threshold {}, Pytorch has {} valid "
        "boxes while TVM has {}.".format(
            score_threshold, num_pt_valid_scores, num_tvm_valid_scores
        )
    )
def get_ref_vm(mod, params, target="cuda"):
    with tvm.transform.PassContext(opt_level=3):
        vm_exec = relay.vm.compile(mod, target=target, params=params)
    code, lib = vm_exec.save()
    dev = tvm.device(target, 0)
    vm_exec = tvm.runtime.vm.Executable.load_exec(code, lib)
    return VirtualMachine(vm_exec, dev), dev
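# A hedged usage sketch (not from the source): get_ref_vm round-trips the
# executable through Executable.save() / load_exec() in memory before wrapping
# it in a VirtualMachine. The toy add-one module and the "llvm" target override
# are assumptions made for illustration.
import numpy as np
import tvm
from tvm import relay
from tvm.runtime.vm import VirtualMachine

x = relay.var("x", shape=(4, 4), dtype="float32")
toy_mod = tvm.IRModule.from_expr(relay.Function([x], x + relay.const(1.0)))

vm, dev = get_ref_vm(toy_mod, {}, target="llvm")
res = vm.invoke("main", tvm.nd.array(np.zeros((4, 4), "float32"), dev))
print(res.numpy())  # expected: a 4x4 array of ones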
def profile_and_build_vm(
    mod,
    params,
    sm,
    split_k_slices=[1],
    tmp_dir="./tmp",
    lib_path="compile.so",
    vmcode_path="vmcode.ro",
    use_fast_math=False,
    use_3xtf32=True,
):
    mod = partition_for_cutlass(mod)
    mod, num_cutlass_partition = tune_cutlass_kernels(
        mod,
        sm,
        split_k_slices=split_k_slices,
        use_3xtf32=use_3xtf32,
        profile_all_alignments=False,
        find_first_valid=True,
        tmp_dir=tmp_dir,
    )
    with tvm.transform.PassContext(opt_level=3):
        vm_exec = relay.vm.compile(mod, target="cuda", params=params)
    vm_exec = build_cutlass_kernels_vm(
        vm_exec, sm, tmp_dir, lib_path, vmcode_path, use_fast_math=use_fast_math
    )
    dev = tvm.device("cuda", 0)
    return VirtualMachine(vm_exec, dev), dev, num_cutlass_partition
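# A hedged usage sketch (not from the source): running a small fp16 dense
# workload through the CUTLASS-tuned VM. Assumptions: an NVIDIA GPU whose
# compute capability matches `sm` (80, i.e. Ampere, below), a CUTLASS-enabled
# TVM build, and illustrative shapes and names.
import numpy as np
import tvm
from tvm import relay

data = relay.var("data", shape=(16, 64), dtype="float16")
weight = relay.var("weight", shape=(32, 64), dtype="float16")
dense_mod = tvm.IRModule.from_expr(
    relay.Function([data, weight], relay.nn.dense(data, weight, out_dtype="float16"))
)
np_weight = np.random.uniform(-1, 1, (32, 64)).astype("float16")

vm, dev, num_cutlass_partition = profile_and_build_vm(dense_mod, {"weight": np_weight}, sm=80)
print("CUTLASS partitions:", num_cutlass_partition)
out = vm.invoke("main", tvm.nd.array(np.zeros((16, 64), "float16"), dev))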
def test_detection_models():
    img = "test_street_small.jpg"
    img_url = (
        "https://raw.githubusercontent.com/dmlc/web-data/"
        "master/gluoncv/detection/street_small.jpg"
    )
    download(img_url, img)
    input_shape = (1, 3, in_size, in_size)
    input_name = "input0"
    shape_list = [(input_name, input_shape)]
    scripted_model = generate_jit_model(1)
    mod, params = relay.frontend.from_pytorch(scripted_model, shape_list)
    data = process_image(img)
    data_np = data.detach().numpy()

    with torch.no_grad():
        pt_res = scripted_model(data)

    for target in ["llvm", "cuda"]:
        with tvm.transform.PassContext(opt_level=3):
            vm_exec = relay.vm.compile(mod, target=target, params=params)

        ctx = tvm.context(target, 0)
        vm = VirtualMachine(vm_exec, ctx)
        vm.set_input("main", **{input_name: data_np})
        tvm_res = vm.run()

        # Bounding boxes
        tvm.testing.assert_allclose(
            pt_res[0].cpu().numpy(), tvm_res[0].asnumpy(), rtol=1e-5, atol=1e-5
        )
        # Scores
        tvm.testing.assert_allclose(
            pt_res[1].cpu().numpy(), tvm_res[1].asnumpy(), rtol=1e-5, atol=1e-5
        )
        # Class ids
        np.testing.assert_equal(pt_res[2].cpu().numpy(), tvm_res[2].asnumpy())

        score_threshold = 0.9
        print("Num boxes:", pt_res[0].cpu().numpy().shape[0])
        print("Num valid boxes:", np.sum(pt_res[1].cpu().numpy() >= score_threshold))
def profile_and_build_vm(
    mod, params, sm, tmp_dir="./tmp", lib_path="compile.so", vmcode_path="vmcode.ro"
):
    mod = partition_for_cutlass(mod)
    mod, num_cutlass_partition = tune_cutlass_kernels(mod, sm, tmp_dir=tmp_dir)
    with tvm.transform.PassContext(opt_level=3):
        vm_exec = relay.vm.compile(mod, target="cuda", params=params)
    vm_exec = build_cutlass_kernels_vm(vm_exec, sm, tmp_dir, lib_path, vmcode_path)
    dev = tvm.device("cuda", 0)
    return VirtualMachine(vm_exec, dev), dev, num_cutlass_partition
def build_simulated_quantize(input_data, scale, zp, dtype, axis=-1):
    sim_q = relay.qnn.op.simulated_quantize(
        input_data,
        scale,
        zp,
        axis=axis,
        out_dtype=dtype,
    )
    mod = tvm.IRModule.from_expr(sim_q)
    with tvm.transform.PassContext(opt_level=3):
        vm_exec = relay.vm.compile(mod, "llvm", params=None)
    vm = VirtualMachine(vm_exec, tvm.cpu(0))
    return vm
# best performance, due to the existence of large dense operators in
# torchvision rcnn models.

# Add "-libs=mkl" to get best performance on x86 target.
# For an x86 machine that supports AVX512, the complete target is
# "llvm -mcpu=skylake-avx512 -libs=mkl"
target = "llvm"

with tvm.transform.PassContext(opt_level=3, disabled_pass=["FoldScaleAxis"]):
    vm_exec = relay.vm.compile(mod, target=target, params=params)

######################################################################
# Inference with Relay VM
# -----------------------
dev = tvm.cpu()
vm = VirtualMachine(vm_exec, dev)
vm.set_input("main", **{input_name: img})
tvm_res = vm.run()

######################################################################
# Get boxes with score larger than 0.9
# ------------------------------------
score_threshold = 0.9
boxes = tvm_res[0].numpy().tolist()
valid_boxes = []
for i, score in enumerate(tvm_res[1].numpy().tolist()):
    if score > score_threshold:
        valid_boxes.append(boxes[i])
    else:
        break
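# Follow-up sketch (not from the source): report the boxes that passed the
# threshold; the [xmin, ymin, xmax, ymax] layout assumed here follows the
# torchvision detection output convention.
print("Get {} valid boxes".format(len(valid_boxes)))
for xmin, ymin, xmax, ymax in valid_boxes:
    print("  box: ({:.1f}, {:.1f}) - ({:.1f}, {:.1f})".format(xmin, ymin, xmax, ymax))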
def run_vm(code, lib):
    vm_exec = tvm.runtime.vm.Executable.load_exec(code, lib)
    vm = VirtualMachine(vm_exec, tvm.gpu(0))
    result = vm.invoke("main", data=i_data)
    return result
def run_vm(vm_exec, input_, ctx=tvm.cpu(0)):
    vm = VirtualMachine(vm_exec, ctx)
    _out = vm.invoke("main", input_)
    return vmobj_to_list(_out)
def run_case(dtype, image, target):
    # Check image
    import os
    import json
    import sys

    STAT_REPEAT = os.environ.get('STAT_REPEAT', '')
    if STAT_REPEAT == '' or STAT_REPEAT == None:
        STAT_REPEAT = 10
    STAT_REPEAT = int(STAT_REPEAT)

    # FGG: set model files via CK env
    CATEG_FILE = '../synset.txt'
    synset = eval(open(os.path.join(CATEG_FILE)).read())

    files = []
    val = {}

    # FGG: set timers
    import time
    timers = {}

    img_orig = cv2.imread(image)
    img = cv2.cvtColor(img_orig, cv2.COLOR_BGR2RGB)

    output_height, output_width, _ = 224, 224, 3
    img = resize_with_aspectratio(img, output_height, output_width, inter_pol=cv2.INTER_AREA)
    img = center_crop(img, output_height, output_width)
    img = np.asarray(img, dtype='float32')

    # normalize image
    means = np.array([123.68, 116.78, 103.94], dtype=np.float32)
    img -= means

    # transpose if needed
    img = img.transpose([2, 0, 1])

    import matplotlib.pyplot as plt
    img1 = img.transpose([1, 2, 0])
    arr_ = np.squeeze(img1)  # you can give axis attribute if you wanna squeeze in specific dimension
    plt.imshow(arr_)
    # plt.show()
    plt.savefig('pre-processed-image.png')

    # Load model
    model_path = os.environ.get('CK_ENV_ONNX_MODEL_ONNX_FILEPATH', '')
    if model_path == '':
        print('Error: environment variable CK_ENV_ONNX_MODEL_ONNX_FILEPATH is not defined')
        exit(1)

    opt = rt.SessionOptions()
    sess = rt.InferenceSession(model_path, opt)

    inputs = [meta.name for meta in sess.get_inputs()]
    outputs = [meta.name for meta in sess.get_outputs()]

    print(inputs)
    print(outputs)

    if os.environ.get('USE_TVM', '') == 'yes':
        import tvm
        from tvm import relay
        import onnx

        del sess

        # Load model via ONNX to be used with TVM
        print('')
        print('ONNX: load model ...')
        print('')

        onnx_model = onnx.load(model_path)

        # Init TVM
        # TBD: add tvm platform selector
        if os.environ.get('USE_CUDA', '') == 'yes':
            # CK TVM package must be built with CUDA enabled
            ctx = tvm.cuda(0)
        else:
            ctx = tvm.cpu(0)
        tvm_ctx = ctx

        build_conf = {'relay.backend.use_auto_scheduler': False}
        opt_lvl = int(os.environ.get('TVM_OPT_LEVEL', 3))
        target = os.environ.get('TVM_TARGET', 'llvm -mcpu=znver2')
        target_host = None
        params = {}

        # New target API
        tvm_target = tvm.target.Target(target, host=target_host)

        input_shape = (1, 3, 224, 224)
        shape_dict = {inputs[0]: input_shape}

        print('')
        print('TVM: import model ...')
        print('')
        # Extra param: opset=12
        mod, params = relay.frontend.from_onnx(onnx_model, shape_dict, freeze_params=True)

        print('')
        print('TVM: transform to static ...')
        print('')
        mod = relay.transform.DynamicToStatic()(mod)

        print('')
        print('TVM: apply extra optimizations ...')
        print('')
        # Padding optimization
        # Adds extra optimizations
        mod = relay.transform.FoldExplicitPadding()(mod)

        print('')
        print('TVM: build model ...')
        print('')

        executor = os.environ.get('MLPERF_TVM_EXECUTOR', 'graph')

        if executor == "graph" or executor == "debug":
            from tvm.contrib import graph_executor

            # Without history
            with tvm.transform.PassContext(opt_level=opt_lvl, config=build_conf):
                graph_module = relay.build(mod, target=tvm_target, params=params)
            lib = graph_module

            print('')
            print('TVM: init graph engine ...')
            print('')

            sess = graph_executor.GraphModule(lib['default'](ctx))

        elif executor == "vm":
            from tvm.runtime.vm import VirtualMachine

            # Without history
            with tvm.transform.PassContext(opt_level=opt_lvl, config=build_conf):
                vm_exec = relay.vm.compile(mod, target=tvm_target, params=params)
            r_exec = vm_exec

            print('')
            print('TVM: init VM ...')
            print('')

            sess = VirtualMachine(r_exec, ctx)

        # For now only graph
        sess.set_input(inputs[0], tvm.nd.array([img]))

        # Run TVM inference
        sess.run()

        # Process TVM outputs
        output = []
        for i in range(sess.get_num_outputs()):
            # Take only the output of batch size for dynamic batches
            if len(output) < (i + 1):
                output.append([])
            output[i].append(sess.get_output(i).asnumpy()[0])
    else:
        inp = {inputs[0]: np.array([img], dtype=np.float32)}
        output = sess.run(outputs, inp)

    top1 = np.argmax(output[1])  # .asnumpy())

    top5 = []
    atop5 = get_top5(output[1][0])  # .asnumpy())

    print('')
    print('Prediction Top1:', top1, synset[top1])

    print('')
    print('Prediction Top5:')
    for p in atop5:
        out = p[1]
        name = synset[out]
        print(' * {} {}'.format(out, name))

    ck_results = {
        'prediction': synset[top1]
    }

    with open('tmp-ck-timer.json', 'w') as ck_results_file:
        json.dump(ck_results, ck_results_file, indent=2, sort_keys=True)

    return