def relay_build_with_tensorrt(
    mod: Module,
    target: Target,
    params: dict,
) -> "Module":
    """Build a Relay IRModule with TensorRT BYOC

    Parameters
    ----------
    mod : IRModule
        The Relay IRModule to build.

    target : Target
        The target to build the module for.

    params : Dict[str, NDArray]
        The parameter dict to build the module with.

    Returns
    -------
    mod : runtime.Module
        The built module.
    """
    from tvm.relay.op.contrib.tensorrt import partition_for_tensorrt

    assert isinstance(target, Target)
    # Partition the graph for TensorRT, then build with the returned config
    # applied so the TRT codegen sees its options.
    mod, config = partition_for_tensorrt(mod, params)
    with tvm.transform.PassContext(opt_level=3, config={"relay.ext.tensorrt.options": config}):
        result = tvm.relay.build_module._build_module_no_factory(mod, "cuda", "llvm", params)
    assert isinstance(result, Module)
    return result
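# A minimal usage sketch of relay_build_with_tensorrt as the `f_build` callback
# of a meta_schedule LocalBuilder, mirroring how it is consumed in
# verify_meta_schedule_with_tensorrt further below; `demo_builder_roundtrip` is
# an illustrative name, not part of the original suite.
def demo_builder_roundtrip(mod, params):
    builder = LocalBuilder(f_build=relay_build_with_tensorrt)
    builder_input = BuilderInput(mod, Target("cuda", host="llvm"), params)
    (builder_result,) = builder.build([builder_input])
    assert builder_result.error_msg is None
    return builder_result.artifact_path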
def test_tensorrt_serialize():
    if skip_codegen_test():
        return
    import mxnet
    from mxnet.gluon.model_zoo.vision import get_model

    block = get_model("resnet18_v1", pretrained=True)
    mod, params = relay.frontend.from_mxnet(
        block, shape={"data": (1, 3, 224, 224)}, dtype="float32"
    )
    # Compile
    mod, config = tensorrt.partition_for_tensorrt(mod, params)
    with tvm.transform.PassContext(opt_level=3, config={"relay.ext.tensorrt.options": config}):
        lib = relay.build(mod, "cuda", params=params)
    # Serialize
    lib.export_library("compiled.so")
    # Deserialize
    loaded_lib = tvm.runtime.load_module("compiled.so")
    # Run
    if skip_runtime_test():
        return
    gen_module = tvm.contrib.graph_runtime.GraphModule(loaded_lib["default"](tvm.gpu(0)))
    i_data = np.random.uniform(0, 1, (1, 3, 224, 224)).astype("float32")
    gen_module.run(data=i_data)
def test_tensorrt_dynamic_batch():
    if skip_codegen_test():
        return
    batches_to_test = [1, 1, 0, 2, 3, 0, 1, 3, 2]
    x_shape = (relay.Any(), 1, 8, 8)
    x_data = np.ones([max(batches_to_test)] + list(x_shape)[1:]).astype("float32")
    result_arr = [{} for _ in range(len(batches_to_test))]
    for use_trt in [True, False]:
        x = relay.var("x", shape=x_shape, dtype="float32")
        out = relay.nn.relu(x)
        f = relay.Function([x], out)
        mod = tvm.IRModule()
        mod["main"] = f
        if use_trt:
            mod, _ = tensorrt.partition_for_tensorrt(mod)
        if not skip_runtime_test():
            with relay.build_config(opt_level=3):
                relay_exec = relay.create_executor("vm", mod=mod, ctx=tvm.cpu(0), target="llvm")
            for i, batch_size in enumerate(batches_to_test):
                result_arr[i][use_trt] = relay_exec.evaluate()(x_data[:batch_size, ...])

    if not skip_runtime_test():
        for i in range(len(batches_to_test)):
            assert_result_dict_holds(result_arr[i])
def test_tensorrt_simple():
    if skip_codegen_test():
        return
    dtype = "float32"
    xshape = (1, 3, 2, 2)
    yshape = (1, 3, 1, 1)
    zshape = (1, 1, 1, 1)
    x = relay.var("x", shape=xshape, dtype=dtype)
    y = relay.var("y", shape=yshape, dtype=dtype)
    z = relay.var("z", shape=zshape, dtype=dtype)
    w = z * (x + y)
    out = relay.nn.relu(w)
    f = relay.Function([x, y, z], out)
    mod = tvm.IRModule()
    mod["main"] = f
    mod, config = tensorrt.partition_for_tensorrt(mod)
    with tvm.transform.PassContext(opt_level=3, config={"relay.ext.tensorrt.options": config}):
        graph, lib, params = relay.build(mod, "cuda")
    if skip_runtime_test():
        return
    mod = graph_runtime.create(graph, lib, ctx=tvm.gpu(0))
    x_data = np.random.uniform(-1, 1, xshape).astype(dtype)
    y_data = np.random.uniform(-1, 1, yshape).astype(dtype)
    z_data = np.random.uniform(-1, 1, zshape).astype(dtype)
    mod.run(x=x_data, y=y_data, z=z_data)
    results = [mod.get_output(i).asnumpy() for i in range(mod.get_num_outputs())]
def test_tensorrt_dynamic_batch_conv():
    if skip_codegen_test():
        return
    batches_to_test = [1, 1, 0, 2, 3, 0, 1, 3, 2]
    x_shape = (relay.Any(), 32, 8, 8)
    x_data = np.ones([max(batches_to_test)] + list(x_shape)[1:]).astype("float32")
    k_shape = (16, 32, 3, 3)
    params = {"kernel": np.random.uniform(-1, 1, k_shape).astype("float32")}
    result_dict = {}
    for use_trt in [True, False]:
        x = relay.var("x", shape=x_shape, dtype="float32")
        kernel = relay.var("kernel", shape=k_shape, dtype="float32")
        out = relay.nn.conv2d(x, kernel, channels=16, kernel_size=(3, 3), groups=1)
        f = relay.Function([x, kernel], out)
        mod = tvm.IRModule()
        mod["main"] = f
        if use_trt:
            # partition_for_tensorrt returns a (module, config) pair; only the
            # module is needed here.
            mod, _ = tensorrt.partition_for_tensorrt(mod, params)
        if not skip_runtime_test():
            with relay.build_config(opt_level=3):
                relay_exec = relay.create_executor("vm", mod=mod, ctx=tvm.cpu(0), target="llvm")
            for i, batch_size in enumerate(batches_to_test):
                result_dict[(i, use_trt)] = relay_exec.evaluate()(
                    x=x_data[:batch_size, ...], **params
                )

    if not skip_runtime_test():
        for i in range(len(batches_to_test)):
            assert_result_matches(result_dict[(i, True)], result_dict[(i, False)])
def test_dynamic_offload():
    """
    This test checks for proper dynamic offloading of relay graphs. An addition between
    the outputs of two conv2d's is performed, one of them having all static args whereas
    the other has an arg with a dynamic shape. The TRT partitioner is expected to leave
    the conv2d with the dynamic arg on TVM while running the other in TRT.
    """
    if skip_codegen_test():
        return

    data_shape = (1, 32, 8, 8)
    k_shape = (1, 32, 3, 3)

    x = relay.var("x", shape=(data_shape[0], data_shape[1], Any(), Any()), dtype="float32")
    y = relay.var("y", shape=data_shape, dtype="float32")
    kernel = relay.var("kernel", shape=k_shape, dtype="float32")

    def get_expected():
        def set_func_attr(func, compile_name, symbol_name):
            func = func.with_attr("Primitive", tvm.tir.IntImm("int32", 1))
            func = func.with_attr("Inline", tvm.tir.IntImm("int32", 1))
            func = func.with_attr("Compiler", compile_name)
            func = func.with_attr("global_symbol", symbol_name)
            return func

        # Create a nested TRT function that matches the expected output
        mod = tvm.IRModule()
        var1 = relay.var("tensorrt_0_i0", shape=data_shape, dtype="float32")
        kernel_trt = relay.var("tensorrt_0_i1", shape=k_shape, dtype="float32")
        out1 = relay.nn.conv2d(var1, kernel_trt, channels=k_shape[0], kernel_size=k_shape[2:4])
        f1 = GlobalVar("tensorrt_0")
        func = relay.Function([var1, kernel_trt], out1)
        func = set_func_attr(func, "tensorrt", "tensorrt_0")
        mod[f1] = func
        mod = relay.transform.InferType()(mod)

        # Create the main function
        out1 = relay.nn.conv2d(x, kernel, channels=k_shape[0], kernel_size=k_shape[2:4])
        out = relay.add(out1, f1(y, kernel))
        f = relay.Function([x, y, kernel], out)
        mod["main"] = f
        mod = relay.transform.InferType()(mod)
        return mod

    # Create the Relay function whose static conv2d will be offloaded to TRT
    out1 = relay.nn.conv2d(x, kernel, channels=k_shape[0], kernel_size=k_shape[2:4])
    out2 = relay.nn.conv2d(y, kernel, channels=k_shape[0], kernel_size=k_shape[2:4])
    out = relay.add(out1, out2)
    f = relay.Function([x, y, kernel], out)

    # Pass the function to TRT compilation
    mod = tvm.IRModule()
    mod["main"] = f
    mod = relay.transform.InferType()(mod)
    mod_trt, config = tensorrt.partition_for_tensorrt(mod, params={})

    # Get the expected relay graph and compare
    mod_exp = get_expected()
    tvm.ir.assert_structural_equal(mod_trt, mod_exp, map_free_vars=True)
def test_tensorrt_serialize_graph_runtime():
    if skip_codegen_test():
        return
    import mxnet as mx
    from mxnet.gluon.model_zoo.vision import get_model

    data_shape = (1, 3, 224, 224)
    data_type = "float32"
    i_data = np.random.uniform(0, 1, data_shape).astype(data_type)
    block = get_model("resnet18_v1", pretrained=True)
    mod, params = relay.frontend.from_mxnet(block, shape={"data": data_shape}, dtype=data_type)
    mod, config = tensorrt.partition_for_tensorrt(mod)
    tmpdir = utils.tempdir()

    def compile_graph(mod, params):
        with tvm.transform.PassContext(opt_level=3, config={"relay.ext.tensorrt.options": config}):
            graph, lib, params = relay.build(mod, params=params, target="cuda")
            params = relay.save_param_dict(params)
        return graph, lib, params

    def run_graph(graph, lib, params):
        mod_ = graph_runtime.create(graph, lib, ctx=tvm.gpu(0))
        mod_.load_params(params)
        mod_.run(data=i_data)
        res = mod_.get_output(0)
        return res

    def save_graph(graph, lib, params):
        # Serialize
        with open(tmpdir.relpath("compiled.json"), "w") as f_graph_json:
            f_graph_json.write(graph)
        with open(tmpdir.relpath("compiled.params"), "wb") as f_params:
            f_params.write(params)
        lib.export_library(tmpdir.relpath("compiled.so"))

    def load_graph():
        # Deserialize
        with open(tmpdir.relpath("compiled.json"), "r") as f_graph_json:
            graph = f_graph_json.read()
        with open(tmpdir.relpath("compiled.params"), "rb") as f_params:
            params = bytearray(f_params.read())
        lib = tvm.runtime.load_module(tmpdir.relpath("compiled.so"))
        return graph, lib, params

    # Test serialization with graph runtime
    graph, lib, graph_params = compile_graph(mod, params)
    save_graph(graph, lib, graph_params)
    loaded_graph, loaded_lib, loaded_params = load_graph()

    if not skip_runtime_test():
        result_dict = dict()
        result_dict["graph"] = run_graph(graph, lib, graph_params)
        result_dict["graph_ref"] = run_graph(loaded_graph, loaded_lib, loaded_params)
        assert_result_dict_holds(result_dict)
def run_and_verify_func(config, target="cuda"):
    """Test a Relay func by compiling, running, and comparing TVM and TRT outputs.

    Parameters
    ----------
    config : Tuple[relay.Function, Dict[str, NDArray], List[str]]
        A tuple containing 1) The function to test, 2) A dictionary of var names to
        input shapes and 3) A list of which vars should be considered params.
    """
    if skip_codegen_test():
        return
    f, input_shapes, is_param = config
    params = {x: np.random.uniform(-1, 1, input_shapes[x]).astype(np.float32) for x in is_param}
    input_dict = {
        k: np.random.uniform(-1, 1, v).astype(np.float32)
        for k, v in input_shapes.items()
        if k not in is_param
    }

    # Run TRT
    mod = tvm.IRModule()
    mod["main"] = f
    mod, config = tensorrt.partition_for_tensorrt(mod, params)
    with tvm.transform.PassContext(opt_level=3, config={"relay.ext.tensorrt.options": config}):
        graph, lib, graph_params = relay.build(mod, target, params=params)
    if skip_runtime_test():
        return
    ctx = tvm.context(target)
    mod = graph_runtime.create(graph, lib, ctx=ctx)
    mod.set_input(**graph_params)
    mod.run(**input_dict)
    results = [mod.get_output(i) for i in range(mod.get_num_outputs())]

    # Run reference
    mod = tvm.IRModule()
    mod["main"] = f
    with tvm.transform.PassContext(opt_level=3):
        graph, lib, graph_params = relay.build(mod, target, params=params)
    mod = graph_runtime.create(graph, lib, ctx=ctx)
    mod.set_input(**graph_params)
    mod.run(**input_dict)
    ref_results = [mod.get_output(i) for i in range(mod.get_num_outputs())]

    assert len(results) == len(ref_results)
    for i in range(len(results)):
        res = results[i].asnumpy()
        ref_res = ref_results[i].asnumpy()
        assert res.shape == ref_res.shape
        tvm.testing.assert_allclose(res, ref_res, rtol=1e-3, atol=1e-3)
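# A minimal usage sketch of run_and_verify_func: the config tuple carries the
# function under test, a dict mapping var names to input shapes, and the subset
# of vars to bind as params (here the conv2d kernel). `demo_conv2d_config` is
# an illustrative name, not part of the original suite.
def demo_conv2d_config():
    x = relay.var("x", shape=(1, 32, 8, 8), dtype="float32")
    kernel = relay.var("kernel", shape=(16, 32, 3, 3), dtype="float32")
    out = relay.nn.conv2d(x, kernel, channels=16, kernel_size=(3, 3))
    f = relay.Function([x, kernel], out)
    return (f, {"x": (1, 32, 8, 8), "kernel": (16, 32, 3, 3)}, ["kernel"])


# run_and_verify_func(demo_conv2d_config())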
def compile_and_run(model, i_data, input_shape, dtype, use_trt=True, num_iteration=1):
    # `model` is the name of a gluon model zoo model, e.g. "resnet18_v1".
    import mxnet as mx
    from mxnet.gluon.model_zoo.vision import get_model

    def check_trt_used(graph):
        import json

        graph = json.loads(graph)
        num_trt_subgraphs = sum(
            [
                1
                for n in graph["nodes"]
                if n.get("attrs", {}).get("func_name", "").startswith("tensorrt_")
            ]
        )
        assert num_trt_subgraphs >= 1

    block = get_model(model, pretrained=True)
    mod, params = relay.frontend.from_mxnet(block, shape={"data": input_shape}, dtype=dtype)

    if use_trt:
        mod, config = tensorrt.partition_for_tensorrt(mod, params)
        with tvm.transform.PassContext(opt_level=3, config={"relay.ext.tensorrt.options": config}):
            graph, lib, params = relay.build(mod, "cuda", params=params)
        check_trt_used(graph)
    else:
        with tvm.transform.PassContext(opt_level=3):
            graph, lib, params = relay.build(mod, "cuda", params=params)

    if skip_runtime_test():
        return
    mod = graph_runtime.create(graph, lib, ctx=tvm.gpu(0))
    mod.set_input(**params)
    # Warmup
    for i in range(10):
        mod.run(data=i_data)
    # Time
    times = []
    for i in range(num_iteration):
        start_time = time.time()
        mod.run(data=i_data)
        res = mod.get_output(0)
        times.append(time.time() - start_time)
    latency = 1000.0 * np.mean(times)
    print(model, latency)
    return res
def test_tensorrt_serialize_vm():
    if skip_codegen_test():
        return
    import mxnet as mx
    from mxnet.gluon.model_zoo.vision import get_model

    data_shape = (1, 3, 224, 224)
    data_type = "float32"
    i_data = np.random.uniform(0, 1, data_shape).astype(data_type)
    block = get_model("resnet18_v1", pretrained=True)
    mod, params = relay.frontend.from_mxnet(block, shape={"data": data_shape}, dtype=data_type)
    mod, config = tensorrt.partition_for_tensorrt(mod)
    tmpdir = utils.tempdir()

    def compile_vm(mod, params):
        with tvm.transform.PassContext(opt_level=3, config={"relay.ext.tensorrt.options": config}):
            vm_exec = relay.vm.compile(mod, target="cuda", params=params)
            code, lib = vm_exec.save()
        return code, lib

    def run_vm(code, lib):
        vm_exec = tvm.runtime.vm.Executable.load_exec(code, lib)
        vm = VirtualMachine(vm_exec, tvm.gpu(0))
        result = vm.invoke("main", data=i_data)
        return result

    def save_vm(code, lib):
        # save and load the code and lib file.
        lib.export_library(tmpdir.relpath("path_lib.so"))
        with open(tmpdir.relpath("path_code.ro"), "wb") as fo:
            fo.write(code)

    def load_vm():
        lib = tvm.runtime.load_module(tmpdir.relpath("path_lib.so"))
        code = bytearray(open(tmpdir.relpath("path_code.ro"), "rb").read())
        return lib, code

    # Test serialization with VM
    code_vm, lib_vm = compile_vm(mod, params)
    save_vm(code_vm, lib_vm)
    loaded_lib_vm, loaded_code_vm = load_vm()

    if not skip_runtime_test():
        result_dict = dict()
        result_dict["vm"] = run_vm(code_vm, lib_vm)
        result_dict["vm_ref"] = run_vm(loaded_code_vm, loaded_lib_vm)
        assert_result_dict_holds(result_dict)
def run_and_verify_func(config, target="cuda"):
    """Test a Relay func by compiling, running, and comparing TVM and TRT outputs.

    Parameters
    ----------
    config : Tuple[relay.Function, Dict[str, NDArray], List[str]]
        A tuple containing 1) The function to test, 2) A dictionary of var names to
        input shapes and 3) A list of which vars should be considered params.
    """
    if skip_codegen_test():
        return
    f, input_shapes, is_param = config
    params = {x: np.random.uniform(-1, 1, input_shapes[x]).astype(np.float32) for x in is_param}
    input_dict = {
        k: np.random.uniform(-1, 1, v).astype(np.float32)
        for k, v in input_shapes.items()
        if k not in is_param
    }
    ctx = tvm.context(target)

    result_dict = dict()
    for mode in ["graph", "vm"]:
        for use_trt in [False, True]:
            mod = tvm.IRModule()
            mod["main"] = f
            result_key = mode + ("_trt" if use_trt else "")
            if use_trt:
                mod, config = tensorrt.partition_for_tensorrt(mod, params)
                with tvm.transform.PassContext(
                    opt_level=3, config={"relay.ext.tensorrt.options": config}
                ):
                    relay_exec = relay.create_executor(mode, mod=mod, ctx=ctx, target=target)
            else:
                with tvm.transform.PassContext(opt_level=3):
                    relay_exec = relay.create_executor(mode, mod=mod, ctx=ctx, target=target)
            if not skip_runtime_test():
                result_dict[result_key] = relay_exec.evaluate()(**input_dict, **params)

    if not skip_runtime_test():
        assert_result_dict_holds(result_dict)
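# The executor-based variant above accepts the same config format; a minimal
# sketch with an empty params list, so every var becomes a runtime input.
# `demo_relu_config` is an illustrative name, not part of the original suite.
def demo_relu_config():
    x = relay.var("x", shape=(1, 3, 8, 8), dtype="float32")
    f = relay.Function([x], relay.nn.relu(x))
    return (f, {"x": (1, 3, 8, 8)}, [])


# run_and_verify_func(demo_relu_config())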
def convert_traced_model_to_vm_trt(
    traced_module: torch.jit.TopLevelTracedModule, np_sample_input: np.ndarray, target: str
) -> tvm.runtime.vm.Executable:
    """
    This function converts a traced pytorch model to VM + TRT.
    """
    input_shape = np_sample_input.shape
    input_name = "input0"
    shape_list = [(input_name, input_shape)]
    mod, params = relay.frontend.from_pytorch(traced_module, shape_list)
    mod, config = tensorrt.partition_for_tensorrt(mod, params, remove_no_mac_subgraphs=True)
    # Apply the partition config during compilation, consistent with the other
    # TRT builds in this file.
    with tvm.transform.PassContext(
        opt_level=3,
        disabled_pass=["FoldScaleAxis"],
        config={"relay.ext.tensorrt.options": config},
    ):
        vm_trt_exec = relay.vm.compile(mod, target=target, params=params)
    return vm_trt_exec
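# A minimal usage sketch of convert_traced_model_to_vm_trt, assuming torch and
# torchvision are installed; the resnet18 model choice and the helper name
# `demo_vm_trt_from_pytorch` are illustrative, not part of the original suite.
def demo_vm_trt_from_pytorch():
    import torch
    import torchvision

    model = torchvision.models.resnet18(pretrained=True).eval()
    np_sample_input = np.random.uniform(0, 1, (1, 3, 224, 224)).astype("float32")
    traced_module = torch.jit.trace(model, torch.from_numpy(np_sample_input)).eval()
    vm_trt_exec = convert_traced_model_to_vm_trt(traced_module, np_sample_input, target="cuda")
    vm = VirtualMachine(vm_trt_exec, tvm.gpu(0))
    return vm.invoke("main", np_sample_input)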
def compile_and_run(mod, params, i_data, mode="vm", use_trt=True):
    assert mode in ["graph", "vm"]

    if use_trt:
        mod, config = tensorrt.partition_for_tensorrt(mod, params)
        check_trt_used(mod)
        with tvm.transform.PassContext(
            opt_level=3, config={"relay.ext.tensorrt.options": config}
        ):
            relay_exec = relay.create_executor(mode, mod=mod, ctx=tvm.gpu(0), target="cuda")
    else:
        with tvm.transform.PassContext(opt_level=3):
            relay_exec = relay.create_executor(mode, mod=mod, ctx=tvm.gpu(0), target="cuda")

    res = relay_exec.evaluate()(i_data, **params) if not skip_runtime_test() else None
    return res
def test_tensorrt_simple():
    if skip_codegen_test():
        return
    dtype = "float32"
    xshape = (1, 3, 2, 2)
    yshape = (1, 3, 1, 1)
    zshape = (1, 1, 1, 1)
    x = relay.var("x", shape=xshape, dtype=dtype)
    y = relay.var("y", shape=yshape, dtype=dtype)
    z = relay.var("z", shape=zshape, dtype=dtype)
    w = z * (x + y)
    out = relay.nn.relu(w)
    f = relay.Function([x, y, z], out)
    x_data = np.random.uniform(-1, 1, xshape).astype(dtype)
    y_data = np.random.uniform(-1, 1, yshape).astype(dtype)
    z_data = np.random.uniform(-1, 1, zshape).astype(dtype)

    result_dict = dict()
    for mode in ["vm", "graph"]:
        for use_trt in [True, False]:
            mod = tvm.IRModule()
            mod["main"] = f
            result_key = mode + ("_trt" if use_trt else "")
            if use_trt:
                mod, config = tensorrt.partition_for_tensorrt(mod)
                with tvm.transform.PassContext(
                    opt_level=3, config={"relay.ext.tensorrt.options": config}
                ):
                    relay_exec = relay.create_executor(mode, mod=mod, ctx=tvm.gpu(0), target="cuda")
            else:
                with tvm.transform.PassContext(opt_level=3):
                    relay_exec = relay.create_executor(mode, mod=mod, ctx=tvm.gpu(0), target="cuda")
            if not skip_runtime_test():
                result_dict[result_key] = relay_exec.evaluate()(x_data, y_data, z_data)

    if not skip_runtime_test():
        assert_result_dict_holds(result_dict)
def test_tensorrt_not_compatible():
    if skip_codegen_test():
        return
    dtype = "float32"
    xshape = (1, 32, 14, 14)
    x_data = np.random.uniform(-1, 1, xshape).astype(dtype)

    x = relay.var("x", shape=xshape, dtype=dtype)
    y = relay.add(x, x)
    z = relay.erf(y)
    out = relay.nn.relu(z)
    f = relay.Function([x], out)
    mod = tvm.IRModule()
    mod["main"] = f
    mod, config = tensorrt.partition_for_tensorrt(mod)
    for mode in ["graph", "vm"]:
        with tvm.transform.PassContext(opt_level=3, config={"relay.ext.tensorrt.options": config}):
            relay_exec = relay.create_executor(mode, mod=mod, ctx=tvm.gpu(0), target="cuda")
            if not skip_runtime_test():
                results = relay_exec.evaluate()(x_data)
def test_tensorrt_not_compatible():
    if skip_codegen_test():
        return
    dtype = "float32"
    xshape = (1, 32, 14, 14)
    x = relay.var("x", shape=xshape, dtype=dtype)
    y = relay.add(x, x)
    z = relay.erf(y)
    out = relay.nn.relu(z)
    f = relay.Function([x], out)
    mod = tvm.IRModule()
    mod["main"] = f
    mod, config = tensorrt.partition_for_tensorrt(mod)
    with tvm.transform.PassContext(opt_level=3, config={"relay.ext.tensorrt.options": config}):
        graph, lib, params = relay.build(mod, "cuda")
    if skip_runtime_test():
        return
    mod = graph_runtime.create(graph, lib, ctx=tvm.gpu(0))
    x_data = np.random.uniform(-1, 1, xshape).astype(dtype)
    mod.run(x=x_data)
    results = [mod.get_output(i).asnumpy() for i in range(mod.get_num_outputs())]
def build_relay_with_tensorrt(
    mod: "IRModule",
    target: "Target",
    params: Dict[str, "NDArray"],
) -> "Module":
    """Build a Relay IRModule with TensorRT BYOC

    Parameters
    ----------
    mod : IRModule
        The Relay IRModule to build.

    target : Target
        The target to build the module for.

    params : Dict[str, NDArray]
        The parameter dict to build the module with.

    Returns
    -------
    mod : runtime.Module
        The built module.
    """
    from tvm.ir.transform import PassContext
    from tvm.relay.build_module import _build_module_no_factory as relay_build
    from tvm.relay.op.contrib import tensorrt
    from tvm.runtime import Module

    mod, config = tensorrt.partition_for_tensorrt(mod, params)
    with PassContext(
        opt_level=3,
        config={"relay.ext.tensorrt.options": config},
    ):
        result = relay_build(mod, target=target, target_host=None, params=params)
    assert isinstance(result, Module)
    return result
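# A minimal sketch of calling build_relay_with_tensorrt directly on a simple
# relu module like the ones used elsewhere in this file; `demo_direct_build`
# is an illustrative name, not part of the original suite.
def demo_direct_build():
    x = relay.var("x", shape=(1, 3, 2, 2), dtype="float32")
    f = relay.Function([x], relay.nn.relu(x))
    mod = tvm.IRModule()
    mod["main"] = f
    return build_relay_with_tensorrt(mod, Target("cuda", host="llvm"), {})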
def test_run(x_data_list, x_shape, new_shape, should_offload_to_trt):
    result_arr = [{} for _ in range(len(x_data_list))]
    for use_trt in [True, False]:
        x = relay.var("x", shape=x_shape, dtype="float32")
        out = relay.reshape(x, new_shape)
        f = relay.Function([x], out)
        mod = tvm.IRModule()
        mod["main"] = f
        if use_trt:
            mod, _ = tensorrt.partition_for_tensorrt(
                mod, params={}, remove_no_mac_subgraphs=False
            )
            assert are_ops_on_trt(mod, op_list=["reshape"]) == should_offload_to_trt
        if not skip_runtime_test():
            with relay.build_config(opt_level=3):
                relay_exec = relay.create_executor("vm", mod=mod, ctx=tvm.cpu(0), target="llvm")
            for i, x_data in enumerate(x_data_list):
                result_arr[i][use_trt] = relay_exec.evaluate()(x_data)

    if not skip_runtime_test():
        for i in range(len(x_data_list)):
            assert_result_dict_holds(result_arr[i])
def verify_meta_schedule_with_tensorrt(
    mod, params, data_shape, use_meta_sched: bool = True, use_trt: bool = True, mode: str = "vm"
):
    if use_meta_sched:
        # With meta_schedule
        dev = "cuda"

        # Build
        if use_trt:
            from tvm.meta_schedule.testing import relay_build_with_tensorrt

            builder = LocalBuilder(f_build=relay_build_with_tensorrt)
        else:

            def relay_build_without_tensorrt(
                mod: Module,
                target: Target,
                params: dict,
            ) -> "Module":
                return tvm.relay.build_module._build_module_no_factory(mod, "cuda", "llvm", params)

            builder = LocalBuilder(f_build=relay_build_without_tensorrt)
        builder_input = BuilderInput(mod, Target(dev, host="llvm"), params)
        (builder_result,) = builder.build([builder_input])
        assert builder_result.error_msg is None
        assert builder_result.artifact_path is not None

        # Run
        evaluator_config = EvaluatorConfig(
            number=5,
            repeat=2,
            min_repeat_ms=0,
            enable_cpu_cache_flush=False,
        )

        runner_input = RunnerInput(
            builder_result.artifact_path, "cuda", [TensorInfo("float32", data_shape)]
        )

        def eval_func(rt_mod, device, evaluator_config, repeated_args):
            rt_mod = tvm.contrib.graph_executor.GraphModule(rt_mod["default"](device))

            evaluator = rt_mod.module.time_evaluator(
                func_name="run",
                dev=device,
                number=evaluator_config.number,
                repeat=evaluator_config.repeat,
                min_repeat_ms=evaluator_config.min_repeat_ms,
                f_preproc="cache_flush_cpu_non_first_arg"
                if evaluator_config.enable_cpu_cache_flush
                else "",
            )
            repeated_costs: List[List[float]] = []
            for args in repeated_args:
                profile_result = evaluator(*args)
                repeated_costs.append(profile_result.results)

            costs = [float(cost) for cost in itertools.chain.from_iterable(repeated_costs)]
            return costs

        runner = LocalRunner(
            evaluator_config=evaluator_config,
            f_run_evaluator=eval_func,
        )

        # Run the module
        (runner_future,) = runner.run([runner_input])
        runner_result = runner_future.result()
        assert runner_result is not None
        assert runner_result.run_secs is not None
        assert runner_result.error_msg is None
        for result in runner_result.run_secs:
            if isinstance(result, FloatImm):
                result = result.value
            assert isinstance(result, float)
            assert result >= 0.0
    else:
        # Without meta_schedule
        if use_trt:
            mod, config = tensorrt.partition_for_tensorrt(mod)
            with tvm.transform.PassContext(
                opt_level=3, config={"relay.ext.tensorrt.options": config}
            ):
                func = relay.create_executor(
                    mode, mod=mod, device=tvm.cuda(0), target="cuda"
                ).evaluate()
        else:
            with tvm.transform.PassContext(opt_level=3):
                func = relay.create_executor(
                    mode, mod=mod, device=tvm.cuda(0), target="cuda", params=params
                ).evaluate()
def test_trt_int8():
    """
    Compile a resnet34 model with TensorRT int8 and compare the cosine distance
    between the output of the original PyTorch model and the TRT int8 TVM output.
    """
    if skip_codegen_test() or skip_runtime_test():
        return

    try:
        from PIL import Image
        from scipy.spatial import distance
    except ImportError:
        print("please install the scipy and pillow python packages")
        return

    try:
        import torch
        import torchvision
        from torchvision import transforms
    except ImportError:
        print("please install the pytorch python package")
        return

    os.environ["TVM_TENSORRT_USE_INT8"] = "1"
    os.environ["TENSORRT_NUM_CALI_INT8"] = "10"

    model_name = "resnet34"
    model = getattr(torchvision.models, model_name)(pretrained=True)
    model = model.eval()

    # We grab the TorchScripted model via tracing
    input_shape = [1, 3, 224, 224]
    input_data = torch.randn(input_shape)
    scripted_model = torch.jit.trace(model, input_data).eval()

    img_url = "https://github.com/dmlc/mxnet.js/blob/main/data/cat.png?raw=true"
    img_path = download_testdata(img_url, "cat.png", module="data")
    img = Image.open(img_path).resize((224, 224))
    my_preprocess = transforms.Compose(
        [
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ]
    )
    img = my_preprocess(img)
    img = np.expand_dims(img, 0)

    input_name = "input0"
    shape_list = [(input_name, img.shape)]
    mod, params = relay.frontend.from_pytorch(scripted_model, shape_list)

    # compile the model
    target = "cuda"
    dev = tvm.cuda()
    mod = partition_for_tensorrt(mod, params)
    with tvm.transform.PassContext(opt_level=3):
        lib = relay.build(mod, target=target, params=params)

    gen_module = tvm.contrib.graph_executor.GraphModule(lib["default"](dev))

    # Run the calibration passes required before the int8 engine is built
    num_cali_int8 = int(os.environ["TENSORRT_NUM_CALI_INT8"])
    if num_cali_int8 != 0:
        print("start calibrating data ... ")
        for i in range(num_cali_int8):
            tvm_data = tvm.nd.array(img)
            gen_module.set_input(input_name, tvm_data)
            gen_module.run(data=tvm_data)
        print("finished calibrating data ... ")

    # get output of tvm model
    print("rebuild engine and test to run ... ")
    tvm_data = tvm.nd.array(img)
    gen_module.set_input(input_name, tvm_data)
    gen_module.run(data=tvm_data)
    out = gen_module.get_output(0)

    # check output of tvm and output of pytorch model are equal
    torch_data = torch.from_numpy(img)
    model = scripted_model.eval()
    torch_output = model(torch_data)
    cosine_distance_res = distance.cosine(out.numpy(), torch_output.detach().cpu().numpy())
    assert cosine_distance_res <= 0.01

    # Evaluate
    print("Evaluate inference time cost...")
    ftimer = gen_module.module.time_evaluator("run", dev, repeat=10, min_repeat_ms=500)
    prof_res = np.array(ftimer().results) * 1e3  # convert to millisecond
    message = "Mean inference time (std dev): %.2f ms (%.2f ms)" % (
        np.mean(prof_res),
        np.std(prof_res),
    )
    print(message)