Example 1
def relay_build_with_tensorrt(
    mod: Module,
    target: Target,
    params: dict,
) -> List[BuilderResult]:
    """Build a Relay IRModule with TensorRT BYOC
    Parameters
    ----------
    mod : IRModule
        The Relay IRModule to build.
    target : Target
        The target to build the module for.
    params : Dict[str, NDArray]
        The parameter dict to build the module with.
    Returns
    -------
    mod : runtime.Module
        The built module.
    """
    from tvm.relay.op.contrib.tensorrt import partition_for_tensorrt

    assert isinstance(target, Target)
    mod, config = partition_for_tensorrt(mod, params)
    with tvm.transform.PassContext(
            opt_level=3, config={"relay.ext.tensorrt.options": config}):
        result = tvm.relay.build_module._build_module_no_factory(
            mod, "cuda", "llvm", params)
    assert isinstance(result, Module)
    return result
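A function with this (mod, target, params) signature is what meta_schedule expects as a custom build routine; Example 19 below passes it to LocalBuilder as f_build. A minimal usage sketch, assuming the meta_schedule classes are importable as in that example and that mod and params are already defined:

from tvm.meta_schedule.builder import BuilderInput, LocalBuilder
from tvm.target import Target

# Use the TensorRT-aware build function as the local builder's build routine.
builder = LocalBuilder(f_build=relay_build_with_tensorrt)
(builder_result,) = builder.build([BuilderInput(mod, Target("cuda", host="llvm"), params)])
assert builder_result.error_msg is None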
Example 2
def test_tensorrt_serialize():
    if skip_codegen_test():
        return
    import mxnet
    from mxnet.gluon.model_zoo.vision import get_model

    block = get_model("resnet18_v1", pretrained=True)
    mod, params = relay.frontend.from_mxnet(block,
                                            shape={"data": (1, 3, 224, 224)},
                                            dtype="float32")
    # Compile
    mod, config = tensorrt.partition_for_tensorrt(mod, params)
    with tvm.transform.PassContext(
            opt_level=3, config={"relay.ext.tensorrt.options": config}):
        lib = relay.build(mod, "cuda", params=params)
    # Serialize
    lib.export_library("compiled.so")
    # Deserialize
    loaded_lib = tvm.runtime.load_module("compiled.so")
    # Run
    if skip_runtime_test():
        return
    gen_module = tvm.contrib.graph_runtime.GraphModule(loaded_lib["default"](
        tvm.gpu(0)))
    i_data = np.random.uniform(0, 1, (1, 3, 224, 224)).astype("float32")
    gen_module.run(data=i_data)
Example 3
def test_tensorrt_dynamic_batch():
    if skip_codegen_test():
        return

    batches_to_test = [1, 1, 0, 2, 3, 0, 1, 3, 2]
    x_shape = (relay.Any(), 1, 8, 8)
    x_data = np.ones([max(batches_to_test)] +
                     list(x_shape)[1:]).astype("float32")
    result_arr = [{} for _ in range(len(batches_to_test))]
    for use_trt in [True, False]:
        x = relay.var("x", shape=x_shape, dtype="float32")
        out = relay.nn.relu(x)
        f = relay.Function([x], out)
        mod = tvm.IRModule()
        mod["main"] = f
        if use_trt:
            mod, _ = tensorrt.partition_for_tensorrt(mod)

        if not skip_runtime_test():
            with relay.build_config(opt_level=3):
                relay_exec = relay.create_executor("vm",
                                                   mod=mod,
                                                   ctx=tvm.cpu(0),
                                                   target="llvm")

            for i, batch_size in enumerate(batches_to_test):
                result_arr[i][use_trt] = relay_exec.evaluate()(
                    x_data[:batch_size, ...])

    if not skip_runtime_test():
        for i in range(len(batches_to_test)):
            assert_result_dict_holds(result_arr[i])
Example 4
def test_tensorrt_simple():
    if skip_codegen_test():
        return
    dtype = "float32"
    xshape = (1, 3, 2, 2)
    yshape = (1, 3, 1, 1)
    zshape = (1, 1, 1, 1)
    x = relay.var("x", shape=(xshape), dtype=dtype)
    y = relay.var("y", shape=(yshape), dtype=dtype)
    z = relay.var("z", shape=(zshape), dtype=dtype)
    w = z * (x + y)
    out = relay.nn.relu(w)
    f = relay.Function([x, y, z], out)

    mod = tvm.IRModule()
    mod["main"] = f
    mod, config = tensorrt.partition_for_tensorrt(mod)
    with tvm.transform.PassContext(opt_level=3, config={"relay.ext.tensorrt.options": config}):
        graph, lib, params = relay.build(mod, "cuda")
    if skip_runtime_test():
        return
    mod = graph_runtime.create(graph, lib, ctx=tvm.gpu(0))
    x_data = np.random.uniform(-1, 1, xshape).astype(dtype)
    y_data = np.random.uniform(-1, 1, yshape).astype(dtype)
    z_data = np.random.uniform(-1, 1, zshape).astype(dtype)
    mod.run(x=x_data, y=y_data, z=z_data)
    results = [mod.get_output(i).asnumpy() for i in range(mod.get_num_outputs())]
Example 5
def test_tensorrt_dynamic_batch_conv():
    if skip_codegen_test():
        return
    batches_to_test = [1, 1, 0, 2, 3, 0, 1, 3, 2]
    x_shape = (relay.Any(), 32, 8, 8)
    x_data = np.ones([max(batches_to_test)] + list(x_shape)[1:]).astype("float32")
    k_shape = (16, 32, 3, 3)
    params = {"kernel": np.random.uniform(-1, 1, k_shape).astype("float32")}
    result_dict = {}
    for use_trt in [True, False]:
        x = relay.var("x", shape=x_shape, dtype="float32")
        kernel = relay.var("kernel", shape=k_shape, dtype="float32")
        out = relay.nn.conv2d(x, kernel, channels=16, kernel_size=(3, 3), groups=1)
        f = relay.Function([x, kernel], out)
        mod = tvm.IRModule()
        mod["main"] = f
        if use_trt:
            mod, _ = tensorrt.partition_for_tensorrt(mod, params)

        if not skip_runtime_test():
            with relay.build_config(opt_level=3):
                relay_exec = relay.create_executor("vm", mod=mod, ctx=tvm.cpu(0), target="llvm")

            for i, batch_size in enumerate(batches_to_test):
                result_dict[(i, use_trt)] = relay_exec.evaluate()(
                    x=x_data[:batch_size, ...], **params
                )

    if not skip_runtime_test():
        for i in range(len(batches_to_test)):
            assert_result_matches(result_dict[(i, True)], result_dict[(i, False)])
Example 6
def test_dynamic_offload():
    """
    This test checks for proper dynamic offloading of Relay graphs. An addition between
    the outputs of two conv2d's is performed, one of them having all static args whereas
    the other has an arg with a dynamic shape. The TRT partitioner is expected to keep
    the conv2d with the dynamic arg in TVM while offloading the other to TRT.
    """

    if skip_codegen_test():
        return

    data_shape = (1, 32, 8, 8)
    k_shape = (1, 32, 3, 3)

    x = relay.var("x", shape=(data_shape[0], data_shape[1], Any(), Any()), dtype="float32")
    y = relay.var("y", shape=(data_shape), dtype="float32")
    kernel = relay.var("kernel", shape=(k_shape), dtype="float32")

    def get_expected():
        def set_func_attr(func, compile_name, symbol_name):
            func = func.with_attr("Primitive", tvm.tir.IntImm("int32", 1))
            func = func.with_attr("Inline", tvm.tir.IntImm("int32", 1))
            func = func.with_attr("Compiler", compile_name)
            func = func.with_attr("global_symbol", symbol_name)
            return func

        # Create a nested TRT function that matches the expected output
        mod = tvm.IRModule()
        var1 = relay.var("tensorrt_0_i0", shape=(data_shape), dtype="float32")
        kernel_trt = relay.var("tensorrt_0_i1", shape=(k_shape), dtype="float32")
        out1 = relay.nn.conv2d(var1, kernel_trt, channels=k_shape[0], kernel_size=k_shape[2:4])
        f1 = GlobalVar("tensorrt_0")
        func = relay.Function([var1, kernel_trt], out1)
        func = set_func_attr(func, "tensorrt", "tensorrt_0")
        mod[f1] = func
        mod = relay.transform.InferType()(mod)

        # Create the main function
        out1 = relay.nn.conv2d(x, kernel, channels=k_shape[0], kernel_size=k_shape[2:4])
        out = relay.add(out1, f1(y, kernel))
        f = relay.Function([x, y, kernel], out)
        mod["main"] = f
        mod = relay.transform.InferType()(mod)
        return mod

    # Create relay function that will be offloaded to TRT
    out1 = relay.nn.conv2d(x, kernel, channels=k_shape[0], kernel_size=k_shape[2:4])
    out2 = relay.nn.conv2d(y, kernel, channels=k_shape[0], kernel_size=k_shape[2:4])
    out = relay.add(out1, out2)
    f = relay.Function([x, y, kernel], out)

    # Pass the function to TRT compilation
    mod = tvm.IRModule()
    mod["main"] = f
    mod = relay.transform.InferType()(mod)
    mod_trt, config = tensorrt.partition_for_tensorrt(mod, params={})

    # Get the expected relay graph and compare
    mod_exp = get_expected()
    tvm.ir.assert_structural_equal(mod_trt, mod_exp, map_free_vars=True)
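In the expected module above, the offloaded subgraph is the global function "tensorrt_0", marked with a "Compiler" attribute of "tensorrt" by set_func_attr. A small illustrative helper, not part of the original test, for listing which subgraphs of a partitioned module were handed to TensorRT:

def list_trt_subgraphs(mod):
    # Subgraphs produced by partition_for_tensorrt are global functions named "tensorrt_<n>".
    return [gv.name_hint for gv in mod.get_global_vars() if gv.name_hint.startswith("tensorrt_")]

print(list_trt_subgraphs(mod_trt))  # expected to contain "tensorrt_0"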
Example 7
def test_tensorrt_serialize_graph_runtime():
    if skip_codegen_test():
        return
    import mxnet as mx
    from mxnet.gluon.model_zoo.vision import get_model

    data_shape = (1, 3, 224, 224)
    data_type = "float32"
    i_data = np.random.uniform(0, 1, data_shape).astype(data_type)
    block = get_model("resnet18_v1", pretrained=True)
    mod, params = relay.frontend.from_mxnet(block,
                                            shape={"data": data_shape},
                                            dtype=data_type)
    mod, config = tensorrt.partition_for_tensorrt(mod)
    tmpdir = utils.tempdir()

    def compile_graph(mod, params):
        with tvm.transform.PassContext(
                opt_level=3, config={"relay.ext.tensorrt.options": config}):
            graph, lib, params = relay.build(mod, params=params, target="cuda")
            params = relay.save_param_dict(params)
        return graph, lib, params

    def run_graph(graph, lib, params):
        mod_ = graph_runtime.create(graph, lib, ctx=tvm.gpu(0))
        mod_.load_params(params)
        mod_.run(data=i_data)
        res = mod_.get_output(0)
        return res

    def save_graph(graph, lib, params):
        # Serialize
        with open(tmpdir.relpath("compiled.json"), "w") as f_graph_json:
            f_graph_json.write(graph)
        with open(tmpdir.relpath("compiled.params"), "wb") as f_params:
            f_params.write(params)
        lib.export_library(tmpdir.relpath("compiled.so"))

    def load_graph():
        # Deserialize
        with open(tmpdir.relpath("compiled.json"), "r") as f_graph_json:
            graph = f_graph_json.read()
        with open(tmpdir.relpath("compiled.params"), "rb") as f_params:
            params = bytearray(f_params.read())
        lib = tvm.runtime.load_module(tmpdir.relpath("compiled.so"))
        return graph, lib, params

    # Test serialization with graph runtime
    graph, lib, graph_params = compile_graph(mod, params)
    save_graph(graph, lib, graph_params)
    loaded_graph, loaded_lib, loaded_params = load_graph()

    if not skip_runtime_test():
        result_dict = dict()
        result_dict["graph"] = run_graph(graph, lib, graph_params)
        result_dict["graph_ref"] = run_graph(loaded_graph, loaded_lib,
                                             loaded_params)
        assert_result_dict_holds(result_dict)
Example 8
def run_and_verify_func(config, target="cuda"):
    """Test a Relay func by compiling, running, and comparing TVM and TRT outputs.

    Parameters
    ----------
    config : Tuple[relay.Function, Dict[str, NDArray], List[str]]
        A tuple containing 1) The function to test, 2) A dictionary of var names to input shapes and
        3) A list of which vars should be considered params.
    """
    if skip_codegen_test():
        return
    f, input_shapes, is_param = config
    params = {
        x: np.random.uniform(-1, 1, input_shapes[x]).astype(np.float32)
        for x in is_param
    }
    input_dict = {
        k: np.random.uniform(-1, 1, v).astype(np.float32)
        for k, v in input_shapes.items() if k not in is_param
    }

    # Run TRT
    mod = tvm.IRModule()
    mod["main"] = f
    mod, config = tensorrt.partition_for_tensorrt(mod, params)
    with tvm.transform.PassContext(
            opt_level=3, config={"relay.ext.tensorrt.options": config}):
        graph, lib, graph_params = relay.build(mod, target, params=params)
    if skip_runtime_test():
        return
    ctx = tvm.context(target)
    mod = graph_runtime.create(graph, lib, ctx=ctx)
    mod.set_input(**graph_params)
    mod.run(**input_dict)
    results = [mod.get_output(i) for i in range(mod.get_num_outputs())]

    # Run reference
    mod = tvm.IRModule()
    mod["main"] = f
    with tvm.transform.PassContext(opt_level=3):
        graph, lib, graph_params = relay.build(mod, target, params=params)
    mod = graph_runtime.create(graph, lib, ctx=ctx)
    mod.set_input(**graph_params)
    mod.run(**input_dict)
    ref_results = [mod.get_output(i) for i in range(mod.get_num_outputs())]

    assert len(results) == len(ref_results)
    for i in range(len(results)):
        res = results[i].asnumpy()
        ref_res = ref_results[i].asnumpy()
        assert res.shape == ref_res.shape
        tvm.testing.assert_allclose(res, ref_res, rtol=1e-3, atol=1e-3)
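A minimal usage sketch for the config tuple described in the docstring, assuming run_and_verify_func is in scope; the function, the var-name-to-shape dictionary, and the list of vars treated as params are packed together:

def get_conv2d_config():
    x = relay.var("x", shape=(1, 3, 8, 8), dtype="float32")
    kernel = relay.var("kernel", shape=(16, 3, 3, 3), dtype="float32")
    out = relay.nn.conv2d(x, kernel, channels=16, kernel_size=(3, 3))
    f = relay.Function([x, kernel], out)
    # (function, var name -> input shape, vars to treat as params)
    return f, {"x": (1, 3, 8, 8), "kernel": (16, 3, 3, 3)}, ["kernel"]

run_and_verify_func(get_conv2d_config())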
Example 9
    def compile_and_run(i_data,
                        input_shape,
                        dtype,
                        use_trt=True,
                        num_iteration=1):
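        # Note: 'model', 'time', and the skip_*() helpers come from the enclosing test's scope.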
        import mxnet as mx
        from mxnet.gluon.model_zoo.vision import get_model

        def check_trt_used(graph):
            import json

            graph = json.loads(graph)
            num_trt_subgraphs = sum([
                1 for n in graph["nodes"] if n.get("attrs", {}).get(
                    "func_name", "").startswith("tensorrt_")
            ])
            assert num_trt_subgraphs >= 1

        block = get_model(model, pretrained=True)
        mod, params = relay.frontend.from_mxnet(block,
                                                shape={"data": input_shape},
                                                dtype=dtype)

        if use_trt:
            mod, config = tensorrt.partition_for_tensorrt(mod, params)
            with tvm.transform.PassContext(
                    opt_level=3, config={"relay.ext.tensorrt.options":
                                         config}):
                graph, lib, params = relay.build(mod, "cuda", params=params)
            check_trt_used(graph)
        else:
            with tvm.transform.PassContext(opt_level=3):
                graph, lib, params = relay.build(mod, "cuda", params=params)

        if skip_runtime_test():
            return
        mod = graph_runtime.create(graph, lib, ctx=tvm.gpu(0))
        mod.set_input(**params)
        # Warmup
        for i in range(10):
            mod.run(data=i_data)
        # Time
        times = []
        for i in range(num_iteration):
            start_time = time.time()
            mod.run(data=i_data)
            res = mod.get_output(0)
            times.append(time.time() - start_time)
        latency = 1000.0 * np.mean(times)
        print(model, latency)
        return res
Example 10
def test_tensorrt_serialize_vm():
    if skip_codegen_test():
        return
    import mxnet as mx
    from mxnet.gluon.model_zoo.vision import get_model

    data_shape = (1, 3, 224, 224)
    data_type = "float32"
    i_data = np.random.uniform(0, 1, data_shape).astype(data_type)
    block = get_model("resnet18_v1", pretrained=True)
    mod, params = relay.frontend.from_mxnet(block,
                                            shape={"data": data_shape},
                                            dtype=data_type)
    mod, config = tensorrt.partition_for_tensorrt(mod)
    tmpdir = utils.tempdir()

    def compile_vm(mod, params):
        with tvm.transform.PassContext(
                opt_level=3, config={"relay.ext.tensorrt.options": config}):
            vm_exec = relay.vm.compile(mod, target="cuda", params=params)
            code, lib = vm_exec.save()
        return code, lib

    def run_vm(code, lib):
        vm_exec = tvm.runtime.vm.Executable.load_exec(code, lib)
        vm = VirtualMachine(vm_exec, tvm.gpu(0))
        result = vm.invoke("main", data=i_data)
        return result

    def save_vm(code, lib):
        # save and load the code and lib file.
        lib.export_library(tmpdir.relpath("path_lib.so"))
        with open(tmpdir.relpath("path_code.ro"), "wb") as fo:
            fo.write(code)

    def load_vm():
        lib = tvm.runtime.load_module(tmpdir.relpath("path_lib.so"))
        code = bytearray(open(tmpdir.relpath("path_code.ro"), "rb").read())
        return lib, code

    # Test serialization with VM
    code_vm, lib_vm = compile_vm(mod, params)
    save_vm(code_vm, lib_vm)
    loaded_lib_vm, loaded_code_vm = load_vm()

    if not skip_runtime_test():
        result_dict = dict()
        result_dict["vm"] = run_vm(code_vm, lib_vm)
        result_dict["vm_ref"] = run_vm(loaded_code_vm, loaded_lib_vm)
        assert_result_dict_holds(result_dict)
Example 11
def run_and_verify_func(config, target="cuda"):
    """Test a Relay func by compiling, running, and comparing TVM and TRT outputs.

    Parameters
    ----------
    config : Tuple[relay.Function, Dict[str, NDArray], List[str]]
        A tuple containing 1) The function to test, 2) A dictionary of var names to input shapes and
        3) A list of which vars should be considered params.
    """
    if skip_codegen_test():
        return
    f, input_shapes, is_param = config
    params = {
        x: np.random.uniform(-1, 1, input_shapes[x]).astype(np.float32)
        for x in is_param
    }
    input_dict = {
        k: np.random.uniform(-1, 1, v).astype(np.float32)
        for k, v in input_shapes.items() if k not in is_param
    }
    ctx = tvm.context(target)

    result_dict = dict()
    for mode in ["graph", "vm"]:
        for use_trt in [False, True]:
            mod = tvm.IRModule()
            mod["main"] = f
            result_key = mode + ("_trt" if use_trt else "")
            if use_trt:
                mod, config = tensorrt.partition_for_tensorrt(mod, params)
                with tvm.transform.PassContext(
                        opt_level=3,
                        config={"relay.ext.tensorrt.options": config}):
                    exec = relay.create_executor(mode,
                                                 mod=mod,
                                                 ctx=ctx,
                                                 target=target)
            else:
                with tvm.transform.PassContext(opt_level=3):
                    exec = relay.create_executor(mode,
                                                 mod=mod,
                                                 ctx=ctx,
                                                 target=target)
            if not skip_runtime_test():
                result_dict[result_key] = exec.evaluate()(**input_dict,
                                                          **params)

    if not skip_runtime_test():
        assert_result_dict_holds(result_dict)
Example 12
    def convert_traced_model_to_vm_trt(
        traced_module: torch.jit.TopLevelTracedModule, np_sample_input: np.ndarray, target: str
    ) -> tvm.runtime.vm.Executable:
        """
        This function converts a traced pytorch model to VM + TRT.
        """
        input_shape = np_sample_input.shape
        input_name = "input0"
        shape_list = [(input_name, input_shape)]
        mod, params = relay.frontend.from_pytorch(traced_module, shape_list)
        mod, config = tensorrt.partition_for_tensorrt(mod, params, remove_no_mac_subgraphs=True)
        with tvm.transform.PassContext(opt_level=3, disabled_pass=["FoldScaleAxis"]):
            vm_trt_exec = relay.vm.compile(mod, target=target, params=params)

        return vm_trt_exec
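The returned executable can then be run with the VM runtime, mirroring run_vm in Example 10. A hedged sketch, assuming traced_module and np_sample_input from the function above:

vm_trt_exec = convert_traced_model_to_vm_trt(traced_module, np_sample_input, target="cuda")
vm = tvm.runtime.vm.VirtualMachine(vm_trt_exec, tvm.cuda(0))
result = vm.invoke("main", tvm.nd.array(np_sample_input))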
Example 13
    def compile_and_run(mod, params, i_data, mode="vm", use_trt=True):
        assert mode in ["graph", "vm"]

        if use_trt:
            mod, config = tensorrt.partition_for_tensorrt(mod, params)
            check_trt_used(mod)
            with tvm.transform.PassContext(
                opt_level=3, config={"relay.ext.tensorrt.options": config}
            ):
                exec = relay.create_executor(mode, mod=mod, ctx=tvm.gpu(0), target="cuda")
        else:
            with tvm.transform.PassContext(opt_level=3):
                exec = relay.create_executor(mode, mod=mod, ctx=tvm.gpu(0), target="cuda")

        res = exec.evaluate()(i_data, **params) if not skip_runtime_test() else None
        return res
Example 14
def test_tensorrt_simple():
    if skip_codegen_test():
        return
    dtype = "float32"
    xshape = (1, 3, 2, 2)
    yshape = (1, 3, 1, 1)
    zshape = (1, 1, 1, 1)
    x = relay.var("x", shape=(xshape), dtype=dtype)
    y = relay.var("y", shape=(yshape), dtype=dtype)
    z = relay.var("z", shape=(zshape), dtype=dtype)
    w = z * (x + y)
    out = relay.nn.relu(w)
    f = relay.Function([x, y, z], out)

    x_data = np.random.uniform(-1, 1, xshape).astype(dtype)
    y_data = np.random.uniform(-1, 1, yshape).astype(dtype)
    z_data = np.random.uniform(-1, 1, zshape).astype(dtype)

    result_dict = dict()
    for mode in ["vm", "graph"]:
        for use_trt in [True, False]:
            mod = tvm.IRModule()
            mod["main"] = f
            result_key = mode + ("_trt" if use_trt else "")
            if use_trt:
                mod, config = tensorrt.partition_for_tensorrt(mod)
                with tvm.transform.PassContext(
                        opt_level=3,
                        config={"relay.ext.tensorrt.options": config}):
                    relay_exec = relay.create_executor(mode,
                                                       mod=mod,
                                                       ctx=tvm.gpu(0),
                                                       target="cuda")
            else:
                with tvm.transform.PassContext(opt_level=3):
                    relay_exec = relay.create_executor(mode,
                                                       mod=mod,
                                                       ctx=tvm.gpu(0),
                                                       target="cuda")
            if not skip_runtime_test():
                result_dict[result_key] = relay_exec.evaluate()(x_data, y_data,
                                                                z_data)

    if not skip_runtime_test():
        assert_result_dict_holds(result_dict)
Example 15
def test_tensorrt_not_compatible():
    if skip_codegen_test():
        return
    dtype = "float32"
    xshape = (1, 32, 14, 14)
    x_data = np.random.uniform(-1, 1, xshape).astype(dtype)

    x = relay.var("x", shape=(xshape), dtype=dtype)
    y = relay.add(x, x)
    z = relay.erf(y)
    out = relay.nn.relu(z)
    f = relay.Function([x], out)
    mod = tvm.IRModule()
    mod["main"] = f
    mod, config = tensorrt.partition_for_tensorrt(mod)
    for mode in ["graph", "vm"]:
        with tvm.transform.PassContext(opt_level=3, config={"relay.ext.tensorrt.options": config}):
            exec = relay.create_executor(mode, mod=mod, ctx=tvm.gpu(0), target="cuda")
            if not skip_runtime_test():
                results = exec.evaluate()(x_data)
Example 16
def test_tensorrt_not_compatible():
    if skip_codegen_test():
        return
    dtype = "float32"
    xshape = (1, 32, 14, 14)
    x = relay.var("x", shape=(xshape), dtype=dtype)
    y = relay.add(x, x)
    z = relay.erf(y)
    out = relay.nn.relu(z)
    f = relay.Function([x], out)
    mod = tvm.IRModule()
    mod["main"] = f
    mod, config = tensorrt.partition_for_tensorrt(mod)
    with tvm.transform.PassContext(opt_level=3, config={"relay.ext.tensorrt.options": config}):
        graph, lib, params = relay.build(mod, "cuda")
    if skip_runtime_test():
        return
    mod = graph_runtime.create(graph, lib, ctx=tvm.gpu(0))
    x_data = np.random.uniform(-1, 1, xshape).astype(dtype)
    mod.run(x=x_data)
    results = [mod.get_output(i).asnumpy() for i in range(mod.get_num_outputs())]
Example 17
def build_relay_with_tensorrt(
    mod: "IRModule",
    target: "Target",
    params: Dict[str, "NDArray"],
) -> "Module":
    """Build a Relay IRModule with TensorRT BYOC

    Parameters
    ----------
    mod : IRModule
        The Relay IRModule to build.

    target : Target
        The target to build the module for.

    params : Dict[str, NDArray]
        The parameter dict to build the module with.

    Returns
    -------
    mod : runtime.Module
        The built module.
    """
    from tvm.ir.transform import PassContext
    from tvm.relay.build_module import _build_module_no_factory as relay_build
    from tvm.relay.op.contrib import tensorrt
    from tvm.runtime import Module

    mod, config = tensorrt.partition_for_tensorrt(mod, params)
    with PassContext(
            opt_level=3,
            config={"relay.ext.tensorrt.options": config},
    ):
        result = relay_build(mod,
                             target=target,
                             target_host=None,
                             params=params)
    assert isinstance(result, Module)
    return result
Example 18
    def test_run(x_data_list, x_shape, new_shape, should_offload_to_trt):
        result_arr = [{} for _ in range(len(x_data_list))]
        for use_trt in [True, False]:
            x = relay.var("x", shape=x_shape, dtype="float32")
            out = relay.reshape(x, new_shape)
            f = relay.Function([x], out)
            mod = tvm.IRModule()
            mod["main"] = f
            if use_trt:
                mod, _ = tensorrt.partition_for_tensorrt(
                    mod, params={}, remove_no_mac_subgraphs=False
                )
                assert are_ops_on_trt(mod, op_list=["reshape"]) == should_offload_to_trt
            if not skip_runtime_test():
                with relay.build_config(opt_level=3):
                    relay_exec = relay.create_executor("vm", mod=mod, ctx=tvm.cpu(0), target="llvm")

                for i, x_data in enumerate(x_data_list):
                    result_arr[i][use_trt] = relay_exec.evaluate()(x_data)

        if not skip_runtime_test():
            for i in range(len(x_data_list)):
                assert_result_dict_holds(result_arr[i])
Example 19
def verify_meta_schedule_with_tensorrt(
    mod, params, data_shape, use_meta_sched: bool = True, use_trt: bool = True, mode: str = "vm"
):
    if use_meta_sched:
        # With meta_schedule
        dev = "cuda"

        # Build
        if use_trt:
            from tvm.meta_schedule.testing import relay_build_with_tensorrt

            builder = LocalBuilder(f_build=relay_build_with_tensorrt)
        else:

            def relay_build_without_tensorrt(
                mod: Module,
                target: Target,
                params: dict,
            ) -> List[BuilderResult]:
                return tvm.relay.build_module._build_module_no_factory(mod, "cuda", "llvm", params)

            builder = LocalBuilder(f_build=relay_build_without_tensorrt)

        builder_input = BuilderInput(mod, Target(dev, host="llvm"), params)

        (builder_result,) = builder.build([builder_input])
        assert builder_result.error_msg is None
        assert builder_result.artifact_path is not None

        # Run
        evaluator_config = EvaluatorConfig(
            number=5,
            repeat=2,
            min_repeat_ms=0,
            enable_cpu_cache_flush=False,
        )

        runner_input = RunnerInput(
            builder_result.artifact_path, "cuda", [TensorInfo("float32", data_shape)]
        )

        def eval_func(rt_mod, device, evaluator_config, repeated_args):
            rt_mod = tvm.contrib.graph_executor.GraphModule(rt_mod["default"](device))

            eval = rt_mod.module.time_evaluator(
                func_name="run",
                dev=device,
                number=evaluator_config.number,
                repeat=evaluator_config.repeat,
                min_repeat_ms=evaluator_config.min_repeat_ms,
                f_preproc="cache_flush_cpu_non_first_arg"
                if evaluator_config.enable_cpu_cache_flush
                else "",
            )
            repeated_costs: List[List[float]] = []
            for args in repeated_args:
                profile_result = eval(*args)
                repeated_costs.append(profile_result.results)

            costs = [float(cost) for cost in itertools.chain.from_iterable(repeated_costs)]
            return costs

        runner = LocalRunner(
            evaluator_config=evaluator_config,
            f_run_evaluator=eval_func,
        )

        # Run the module
        (runner_future,) = runner.run([runner_input])
        runner_result = runner_future.result()
        assert runner_result is not None
        assert runner_result.run_secs is not None
        assert runner_result.error_msg is None

        for result in runner_result.run_secs:
            if isinstance(result, FloatImm):
                result = result.value
            assert isinstance(result, float)
            assert result >= 0.0

    else:
        # Without meta_schedule
        if use_trt:
            mod, config = tensorrt.partition_for_tensorrt(mod)
            with tvm.transform.PassContext(
                opt_level=3, config={"relay.ext.tensorrt.options": config}
            ):
                func = relay.create_executor(
                    mode, mod=mod, device=tvm.cuda(0), target="cuda"
                ).evaluate()
        else:
            with tvm.transform.PassContext(opt_level=3):
                func = relay.create_executor(
                    mode, mod=mod, device=tvm.cuda(0), target="cuda", params=params
                ).evaluate()
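A hedged usage sketch for the helper above, with an illustrative single-op module (the shapes and names here are assumptions, not from the original test):

data_shape = (1, 3, 224, 224)
x = relay.var("x", shape=data_shape, dtype="float32")
# Wrap a simple relu function into an IRModule and verify it through meta_schedule + TensorRT.
mod = tvm.IRModule.from_expr(relay.Function([x], relay.nn.relu(x)))
verify_meta_schedule_with_tensorrt(mod, {}, data_shape)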
Example 20
def test_trt_int8():
    """
    Compile a ResNet-34 model with TensorRT int8 and compare the cosine distance
    between the output of the original PyTorch model and the TVM TensorRT int8 output.
    """
    if skip_codegen_test() or skip_runtime_test():
        return

    try:
        from PIL import Image
        from scipy.spatial import distance
    except:
        print("please install scipy and Image python packages")
        return

    try:
        import torch
        import torchvision
        from torchvision import transforms
    except:
        print("please install pytorch python package")
        return

    os.environ["TVM_TENSORRT_USE_INT8"] = "1"
    os.environ["TENSORRT_NUM_CALI_INT8"] = "10"
    model_name = "resnet34"
    model = getattr(torchvision.models, model_name)(pretrained=True)
    model = model.eval()

    # We grab the TorchScripted model via tracing
    input_shape = [1, 3, 224, 224]
    input_data = torch.randn(input_shape)
    scripted_model = torch.jit.trace(model, input_data).eval()

    img_url = "https://github.com/dmlc/mxnet.js/blob/main/data/cat.png?raw=true"
    img_path = download_testdata(img_url, "cat.png", module="data")
    img = Image.open(img_path).resize((224, 224))
    my_preprocess = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225]),
    ])
    img = my_preprocess(img)
    img = np.expand_dims(img, 0)

    input_name = "input0"
    shape_list = [(input_name, img.shape)]
    mod, params = relay.frontend.from_pytorch(scripted_model, shape_list)

    # compile the model
    target = "cuda"
    dev = tvm.cuda()
    mod = partition_for_tensorrt(mod, params)
    with tvm.transform.PassContext(opt_level=3):
        lib = relay.build(mod, target=target, params=params)

    gen_module = tvm.contrib.graph_executor.GraphModule(lib["default"](dev))

    num_cali_int8 = int(os.environ["TENSORRT_NUM_CALI_INT8"])
    if num_cali_int8 != 0:
        print("start calibrating data ... ")
        for i in range(num_cali_int8):
            tvm_data = tvm.nd.array(img)
            gen_module.set_input(input_name, tvm_data)
            gen_module.run(data=tvm_data)
        print("finished calibrating data ... ")

    # get output of tvm model
    print("rebuild engine and test to run ... ")
    tvm_data = tvm.nd.array(img)
    gen_module.set_input(input_name, tvm_data)
    gen_module.run(data=tvm_data)
    out = gen_module.get_output(0)

    # check output of tvm and output of pytorch model are equal
    torch_data = torch.from_numpy(img)
    model = scripted_model.eval()
    torch_output = model(torch_data)

    cosine_distance_res = distance.cosine(out.numpy(),
                                          torch_output.detach().cpu().numpy())
    assert cosine_distance_res <= 0.01

    # Evaluate
    print("Evaluate inference time cost...")
    ftimer = gen_module.module.time_evaluator("run",
                                              dev,
                                              repeat=10,
                                              min_repeat_ms=500)
    prof_res = np.array(ftimer().results) * 1e3  # convert to millisecond
    message = "Mean inference time (std dev): %.2f ms (%.2f ms)" % (
        np.mean(prof_res),
        np.std(prof_res),
    )
    print(message)