Example #1
    def test_to_trt(self, dynamic_identity_network):
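        # to_trt() should build a TensorRT optimization profile with the same min/opt/max shapes registered on the Polygraphy Profile.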
        builder, network, _ = dynamic_identity_network
        profile = Profile().add("X", (1, 2, 1, 1), (1, 2, 2, 2), (1, 2, 4, 4))

        trt_profile = profile.to_trt(builder, network)
        assert trt_profile.get_shape("X") == ((1, 2, 1, 1), (1, 2, 2, 2), (1, 2, 4, 4))
Example #2
 def test_can_add(self):
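     # add() returns the Profile itself, so calls can be chained, and records the min/opt/max shapes for the input.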
     profile = Profile()
     min, opt, max = (1, 1), (2, 2), (4, 4)
     assert profile.add("input", min=min, opt=opt, max=max) is profile
     shape_tuple = profile["input"]
     assert shape_tuple.min == min
     assert shape_tuple.opt == opt
     assert shape_tuple.max == max
Example #3
    def test_fill_defaults_does_not_overwrite(self, dynamic_identity_network):
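        # fill_defaults() should not overwrite shapes that were set explicitly on the profile.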
        _, network, _ = dynamic_identity_network
        profile = Profile().add("X", (1, 1, 1, 1), (1, 1, 2, 2), (1, 1, 3, 3))

        assert profile.fill_defaults(network) is profile
        assert profile["X"].min == (1, 1, 1, 1)
        assert profile["X"].opt == (1, 1, 2, 2)
        assert profile["X"].max == (1, 1, 3, 3)
Example #4
 def test_multiple_profiles(self, identity_builder_network):
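     # CreateConfig should register one TensorRT optimization profile per Profile supplied.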
     builder, network = identity_builder_network
     profiles = [
         Profile().add("x", (1, 2, 1, 1), (1, 2, 2, 2), (1, 2, 4, 4)),
         Profile().add("x", (1, 2, 4, 4), (1, 2, 8, 8), (1, 2, 16, 16)),
     ]
     loader = CreateConfig(profiles=profiles)
     with loader(builder, network) as config:
         assert config.num_optimization_profiles == 2
Example #5
 def get_dynamic_shape_profiles(self):
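     # Build a single optimization profile for "input_ids" covering sequence lengths from 1 up to the model's maximum.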
     max_sequence_length = GPT2ModelTRTConfig.MAX_SEQUENCE_LENGTH[
         self.network_metadata.variant]
     profile = Profile()
     profile.add(
         "input_ids",
         min=(1, 1),
         opt=(1, max_sequence_length // 2),
         max=(1, max_sequence_length),
     )
     return [profile]
Example #6
 def test_multiple_profiles(self):
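     # Build an engine with two optimization profiles, activate the second one, and run inference across its shape range.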
     model = ONNX_MODELS["dynamic_identity"]
     shapes = [(1, 2, 4, 4), (1, 2, 8, 8), (1, 2, 16, 16)]
     network_loader = NetworkFromOnnxBytes(model.loader)
     profiles = [
         Profile().add("X", (1, 2, 1, 1), (1, 2, 2, 2), (1, 2, 4, 4)),
         Profile().add("X", *shapes),
     ]
     config_loader = CreateConfig(profiles=profiles)
     with TrtRunner(EngineFromNetwork(network_loader,
                                      config_loader)) as runner:
         runner.context.active_optimization_profile = 1
         for shape in shapes:
             model.check_runner(runner, {"X": shape})
Example #7
 def test_multiple_profiles(self):
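     # Use TrtRunner.set_profile() to switch between optimization profiles and run inference across each profile's shape range.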
     model = ONNX_MODELS["dynamic_identity"]
     profile0_shapes = [(1, 2, 1, 1), (1, 2, 2, 2), (1, 2, 4, 4)]
     profile1_shapes = [(1, 2, 4, 4), (1, 2, 8, 8), (1, 2, 16, 16)]
     network_loader = NetworkFromOnnxBytes(model.loader)
     profiles = [
         Profile().add("X", *profile0_shapes),
         Profile().add("X", *profile1_shapes),
     ]
     config_loader = CreateConfig(profiles=profiles)
     with TrtRunner(EngineFromNetwork(network_loader,
                                      config_loader)) as runner:
         for index, shapes in enumerate([profile0_shapes, profile1_shapes]):
             runner.set_profile(index)
             for shape in shapes:
                 model.check_runner(runner, {"X": shape})
Example #8
 def test_multiple_profiles(self):
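     # Activate the second optimization profile using whichever API the installed TensorRT version supports, then run inference across its shape range.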
     model = ONNX_MODELS["dynamic_identity"]
     shapes = [(1, 2, 4, 4), (1, 2, 8, 8), (1, 2, 16, 16)]
     network_loader = NetworkFromOnnxBytes(model.loader)
     profiles = [
         Profile().add("X", (1, 2, 1, 1), (1, 2, 2, 2), (1, 2, 4, 4)),
         Profile().add("X", *shapes),
     ]
     config_loader = CreateConfig(profiles=profiles)
     with TrtRunner(EngineFromNetwork(network_loader,
                                      config_loader)) as runner:
         if misc.version(trt.__version__) < misc.version("7.3"):
             runner.context.active_optimization_profile = 1
         else:
             runner.context.set_optimization_profile_async(
                 1, runner.stream.address())
         for shape in shapes:
             model.check_runner(runner, {"X": shape})
Example #9
    def test_empty_tensor_with_dynamic_input_shape_tensor(self):
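        # Zero-volume outputs driven by the dynamic shape-input tensor "new_shape" should still work across the profile's range.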
        model = ONNX_MODELS["empty_tensor_expand"]
        shapes = [(1, 2, 0, 3, 0), (2, 2, 0, 3, 0), (4, 2, 0, 3, 0)]
        network_loader = NetworkFromOnnxBytes(model.loader)
        profiles = [Profile().add("new_shape", *shapes)]
        config_loader = CreateConfig(profiles=profiles)

        with TrtRunner(EngineFromNetwork(network_loader, config_loader)) as runner:
            for shape in shapes:
                model.check_runner(runner, {"new_shape": shape})
Example #10
 def test_device_view_dynamic_shapes(self, use_view):
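     # Device memory (a DeviceArray, or a DeviceView of it) should be accepted directly as input even when the engine uses dynamic shapes.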
     model = ONNX_MODELS["dynamic_identity"]
     profiles = [
         Profile().add("X", (1, 2, 1, 1), (1, 2, 2, 2), (1, 2, 4, 4)),
     ]
     runner = TrtRunner(EngineFromNetwork(NetworkFromOnnxBytes(model.loader), CreateConfig(profiles=profiles)))
     with runner, cuda.DeviceArray(shape=(1, 2, 3, 3), dtype=np.float32) as arr:
         inp = np.random.random_sample(size=(1, 2, 3, 3)).astype(np.float32)
         arr.copy_from(inp)
         outputs = runner.infer({"X": cuda.DeviceView(arr.ptr, arr.shape, arr.dtype) if use_view else arr})
         assert np.all(outputs["Y"] == inp)
         assert outputs["Y"].shape == (1, 2, 3, 3)
Example #11
    def test_infer_overhead(self, copy_inputs, copy_outputs):
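        # Compare TrtRunner.infer() against driving the execution context manually to check that the runner adds little overhead.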
        inp = np.ones(shape=(1, 2, 1024, 1024), dtype=np.float32)
        dev_inp = cuda.DeviceArray(shape=inp.shape, dtype=inp.dtype).copy_from(inp)

        out = np.zeros(shape=(1, 2, 1024, 1024), dtype=np.float32)  # Using identity model!
        dev_out = cuda.DeviceArray(shape=out.shape, dtype=out.dtype)

        stream = cuda.Stream()

        model = ONNX_MODELS["dynamic_identity"]
        profiles = [
            Profile().add("X", (1, 2, 1024, 1024), (1, 2, 1024, 1024), (1, 2, 1024, 1024)),
        ]
        inp_name = list(model.input_metadata.keys())[0]

        with engine_from_network(
            network_from_onnx_bytes(model.loader), CreateConfig(profiles=profiles)
        ) as engine, engine.create_execution_context() as context, TrtRunner(context) as runner, dev_inp, dev_out:
            # Inference outside the TrtRunner
            def infer():
                if copy_inputs:
                    dev_inp.copy_from(inp, stream=stream)
                context.execute_async_v2(bindings=[dev_inp.ptr, dev_out.ptr], stream_handle=stream.ptr)
                if copy_outputs:
                    dev_out.copy_to(out, stream=stream)
                stream.synchronize()

            native_time = time_func(infer)

            feed_dict = {inp_name: (inp if copy_inputs else dev_inp)}
            runner_time = time_func(
                lambda: runner.infer(feed_dict, check_inputs=False, copy_outputs_to_host=copy_outputs)
            )

        # The overhead should be less than 0.5ms, or the runtime should be within 5%
        print("Absolute difference: {:.5g}".format(runner_time - native_time))
        print("Relative difference: {:.5g}".format(runner_time / native_time))
        assert (runner_time - native_time) < 0.5e-3 or runner_time <= (native_time * 1.05)
Example #12
def main():
    # A Profile maps each input tensor to a range of shapes.
    #
    # TIP: To save lines, calls to `add` can be chained:
    #     profile.add("input0", ...).add("input1", ...)
    #
    #   Of course, you may alternatively write this as:
    #     profile.add("input0", ...)
    #     profile.add("input1", ...)
    #
    profiles = [
        # The low-latency case. For best performance, min == opt == max.
        Profile().add("X",
                      min=(1, 3, 28, 28),
                      opt=(1, 3, 28, 28),
                      max=(1, 3, 28, 28)),
        # The dynamic batching case. We use `4` for the opt batch size since that's our most common case.
        Profile().add("X",
                      min=(1, 3, 28, 28),
                      opt=(4, 3, 28, 28),
                      max=(32, 3, 28, 28)),
        # The offline case. For best performance, min == opt == max.
        Profile().add("X",
                      min=(128, 3, 28, 28),
                      opt=(128, 3, 28, 28),
                      max=(128, 3, 28, 28)),
    ]

    # See examples/api/06_immediate_eval_api for details on immediately evaluated functional loaders like `engine_from_network`.
    engine = engine_from_network(NetworkFromOnnxPath("dynamic_identity.onnx"),
                                 config=CreateConfig(profiles=profiles))

    # We'll save the engine so that we can inspect it with `inspect model`.
    # This should make it easy to see how the engine bindings are laid out.
    save_engine(engine, "dynamic_identity.engine")

    # We'll create, but not activate, three separate runners, each with a separate context.
    #
    # TIP: By providing a context directly, as opposed to via a lazy loader,
    # we can ensure that the runner will *not* take ownership of it.
    #
    low_latency = TrtRunner(engine.create_execution_context())

    # NOTE: The following two lines will cause TensorRT to display errors since profile 0
    # is already in use by the first execution context. We'll suppress them using G_LOGGER.verbosity().
    #
    with G_LOGGER.verbosity(G_LOGGER.CRITICAL):
        dynamic_batching = TrtRunner(engine.create_execution_context())
        offline = TrtRunner(engine.create_execution_context())
        # NOTE: We could update the profile index here (e.g. `context.active_optimization_profile = 2`),
        # but instead, we'll use TrtRunner's `set_profile()` API when we later activate the runner.

    # Finally, we can activate the runners as we need them.
    #
    # NOTE: Since the context and engine are already created, the runner will only need to
    # allocate input and output buffers during activation.

    input_img = np.ones((1, 3, 28, 28), dtype=np.float32)  # An input "image"

    with low_latency:
        outputs = low_latency.infer({"X": input_img})
        assert np.array_equal(outputs["Y"],
                              input_img)  # It's an identity model!

        print("Low latency runner succeeded!")

        # While we're serving requests online, we might decide that we need dynamic batching
        # for a moment.
        #
        # NOTE: We're assuming that activating runners will be cheap here, so we can bring up
        # the dynamic batching runner just-in-time.
        #
        # TIP: If activating the runner is not cheap (e.g. input/output buffers are large),
        # it might be better to keep the runner active the whole time.
        #
        with dynamic_batching:
            # NOTE: The very first time we activate this runner, we need to set
            # the profile index (it's 0 by default). We need to do this *only once*.
            # Alternatively, we could have set the profile index in the context directly (see above).
            #
            dynamic_batching.set_profile(
                1
            )  # Use the second profile, which is intended for dynamic batching.

            # We'll create fake batches by repeating our fake input image.
            small_input_batch = np.repeat(input_img, 4,
                                          axis=0)  # Shape: (4, 3, 28, 28)
            outputs = dynamic_batching.infer({"X": small_input_batch})
            assert np.array_equal(outputs["Y"], small_input_batch)

    # If we need dynamic batching again later, we can activate the runner once more.
    #
    # NOTE: This time, we do *not* need to set the profile.
    #
    with dynamic_batching:
        # NOTE: We can use any shape that's in the range of the profile without
        # additional setup - Polygraphy handles the details behind the scenes!
        #
        large_input_batch = np.repeat(input_img, 16,
                                      axis=0)  # Shape: (16, 3, 28, 28)
        outputs = dynamic_batching.infer({"X": large_input_batch})
        assert np.array_equal(outputs["Y"], large_input_batch)

        print("Dynamic batching runner succeeded!")

    with offline:
        # NOTE: We must set the profile to something other than 0 or 1 since both of those
        # are now in use by the `low_latency` and `dynamic_batching` runners respectively.
        #
        offline.set_profile(
            2
        )  # Use the third profile, which is intended for the offline case.

        large_offline_batch = np.repeat(input_img, 128,
                                        axis=0)  # Shape: (128, 3, 28, 28)
        outputs = offline.infer({"X": large_offline_batch})
        assert np.array_equal(outputs["Y"], large_offline_batch)

        print("Offline runner succeeded!")
Example #13
import sys

# Data Loader
data_loader = DataLoader(
    input_metadata=TensorMetadata().add('tensor-0', None, (4, 1, 28, 28)))

# Loaders
build_onnxrt_session = SessionFromOnnx(
    '/work/gitlab/tensorrt-cookbook-in-chinese/08-Tool/Polygraphy/runExample/model.onnx'
)
parse_network_from_onnx = NetworkFromOnnxPath(
    '/work/gitlab/tensorrt-cookbook-in-chinese/08-Tool/Polygraphy/runExample/model.onnx'
)
profiles = [
    Profile().add('tensor-0',
                  min=[1, 1, 28, 28],
                  opt=[4, 1, 28, 28],
                  max=[16, 1, 28, 28])
]
create_trt_config = CreateTrtConfig(max_workspace_size=1000000000,
                                    profiles=profiles)
build_engine = EngineFromNetwork(parse_network_from_onnx,
                                 config=create_trt_config)
save_engine = SaveEngine(build_engine, path='model-FP32.plan')

# Runners
runners = [
    OnnxrtRunner(build_onnxrt_session),
    TrtRunner(save_engine),
]

# Runner Execution