Example #1
    def test_device_buffer_order_matches_bindings(self):
        model = ONNX_MODELS["reducable"]
        engine = engine_from_network(NetworkFromOnnxBytes(model.loader))
        with engine, TrtRunner(engine) as runner:
            dev_buf_order = list(runner.device_buffers.keys())
            for binding, dev_buf_name in zip(engine, dev_buf_order):
                assert binding == dev_buf_name
Example #2
    def onnx_to_trt(self, output_fpath: str, input_fpath: str,
                    network_metadata: NetworkMetadata):
        """
        Converts an ONNX file to a TRT engine.
        Since TensorRT already supplies converter functions and scripts,
        a default implementation is provided.

        Args:
            output_fpath (str): File location where the generated TRT engine is saved.
            input_fpath (str): File location of the ONNX model to convert.
            network_metadata (NetworkMetadata): Network metadata of the network being converted.

        Returns:
            TRTEngineFile: Newly generated engine.
        """
        result = self.trt_engine_class(output_fpath, network_metadata)
        self.trt_inference_config = CreateConfig(
            fp16=network_metadata.precision.fp16,
            max_workspace_size=result.DEFAULT_TRT_WORKSPACE_MB * 1024 * 1024,
            profiles=result.get_dynamic_shape_profiles(),
            strict_types=result.use_strict_types())

        g_logger_verbosity = (PG_LOGGER.EXTRA_VERBOSE
                              if G_LOGGER.level == G_LOGGER.DEBUG
                              else PG_LOGGER.WARNING)
        with PG_LOGGER.verbosity(g_logger_verbosity):
            network_definition = result.get_network_definition(
                network_from_onnx_path(input_fpath))

            trt_engine = engine_from_network(network_definition,
                                             config=self.trt_inference_config)
            save_engine(trt_engine, output_fpath)

        return result
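
A rough standalone sketch of the same ONNX-to-engine flow using Polygraphy's functional API (the `build_trt_engine` helper and its parameters are illustrative and not part of the original converter class):

from polygraphy.backend.trt import (CreateConfig, engine_from_network,
                                    network_from_onnx_path, save_engine)

def build_trt_engine(input_fpath: str, output_fpath: str, use_fp16: bool = False):
    # Parse the ONNX model into a TensorRT network, build an engine, and serialize it to disk.
    config = CreateConfig(fp16=use_fp16)
    engine = engine_from_network(network_from_onnx_path(input_fpath), config=config)
    save_engine(engine, output_fpath)
    return engine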
Example #3
    def test_context(self):
        model = ONNX_MODELS["identity"]
        engine = engine_from_network(NetworkFromOnnxBytes(model.loader))
        with engine, TrtRunner(engine.create_execution_context) as runner:
            model.check_runner(runner)
            assert not runner.owns_engine
            assert runner.owns_context
Example #4
    def test_calibrator_device_buffers_multiinput(self,
                                                  multi_input_builder_network,
                                                  mode):
        def generate_dev_data(num_batches):
            with cuda.DeviceArray(shape=(1, ), dtype=np.float32) as x:
                for _ in range(num_batches):
                    x.copy_from(np.ones((1, ), dtype=np.float32))
                    xdata = {
                        "array": x,
                        "view": cuda.DeviceView(x.ptr, x.shape, x.dtype),
                        "pointer": x.ptr
                    }[mode]
                    yield {
                        "X0": xdata,
                        "Y0": np.zeros((1, ), dtype=np.float32)
                    }

        builder, network = multi_input_builder_network
        NUM_BATCHES = 2

        calibrator = Calibrator(generate_dev_data(NUM_BATCHES))

        create_config = CreateConfig(int8=True, calibrator=calibrator)
        with engine_from_network((builder, network), create_config):
            assert calibrator.num_batches == NUM_BATCHES
        self.check_calibrator_cleanup(calibrator)
Example #5
def main():
    # In Polygraphy, loaders and runners take ownership of objects only when they create them
    # themselves, i.e. when they are given callables (lazy loaders) rather than the objects directly.
    # In that case, we don't need to worry about object lifetimes.
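    #     For instance (a sketch, not run in this example):
    #         TrtRunner(EngineFromNetwork(NetworkFromOnnxPath("identity.onnx")))
    #     would build and own the engine itself, freeing it when the runner is deactivated.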
    #
    # Since we are immediately evaluating, we take ownership of objects, and are responsible for freeing them.
    builder, network, parser = network_from_onnx_path("identity.onnx")

    # Extend the network with an identity layer.
    prev_output = network.get_output(0)
    network.unmark_output(prev_output)
    output = network.add_identity(prev_output).get_output(0)
    output.name = "output"
    network.mark_output(output)

    # Create a TensorRT IBuilderConfig so that we can build the engine with FP16 enabled.
    config = create_config(builder, network, fp16=True)

    # We can free everything we constructed above once we're done building the engine.
    # NOTE: In TensorRT 8.0, we do *not* need to use a context manager here.
    with builder, network, parser, config:
        engine = engine_from_network((builder, network), config)

    # NOTE: In TensorRT 8.0, we do *not* need to use a context manager to free `engine`.
    with engine, TrtRunner(engine) as runner:
        inp_data = np.ones((1, 1, 2, 2), dtype=np.float32)

        # NOTE: The runner owns the output buffers and is free to reuse them between `infer()` calls.
        # Thus, if you want to store results from multiple inferences, you should use `copy.deepcopy()`.
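        # For example (illustrative only; not needed for this single inference):
        #     saved_outputs = copy.deepcopy(runner.infer(feed_dict={"x": inp_data}))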
        outputs = runner.infer(feed_dict={"x": inp_data})

        assert np.array_equal(outputs["output"], inp_data)  # It's an identity model!

        print("Inference succeeded!")
Example #6
    def test_calibrator_with_path_name_cache(self, identity_builder_network):
        builder, network = identity_builder_network
        data = [{"x": np.ones((1, 1, 2, 2), dtype=np.float32)}]

        with tempfile.NamedTemporaryFile() as cache:
            calibrator = Calibrator(data, cache=cache.name)
            create_config = CreateConfig(int8=True, calibrator=calibrator)
            with engine_from_network((builder, network), create_config):
                check_file_non_empty(cache.name)
        self.check_calibrator_cleanup(calibrator)
Example #7
    def test_calibrator_generator_data(self, identity_builder_network):
        builder, network = identity_builder_network
        NUM_BATCHES = 2

        calibrator = Calibrator(generate_data(NUM_BATCHES))

        create_config = CreateConfig(int8=True, calibrator=calibrator)
        with engine_from_network((builder, network), create_config):
            assert calibrator.num_batches == NUM_BATCHES
        self.check_calibrator_cleanup(calibrator)
Example #8
    def test_multithreaded_runners_from_engine(self):
        model = ONNX_MODELS["identity"]
        engine = engine_from_network(NetworkFromOnnxBytes(model.loader))

        with engine, TrtRunner(engine) as runner0, TrtRunner(engine) as runner1:
            t1 = threading.Thread(target=model.check_runner, args=(runner0,))
            t2 = threading.Thread(target=model.check_runner, args=(runner1,))
            t1.start()
            t2.start()
            t1.join()
            t2.join()
Example #9
    def test_calibrator_caches_without_explicit_cache(
            self, identity_builder_network):
        builder, network = identity_builder_network
        data = [{"x": np.ones((1, 1, 2, 2), dtype=np.float32)}]

        calibrator = Calibrator(data)
        # First, populate the cache
        create_config = CreateConfig(int8=True, calibrator=calibrator)
        with engine_from_network((builder, network), create_config):
            pass

        # Check that the internal cache is populated
        assert calibrator.read_calibration_cache()
        self.check_calibrator_cleanup(calibrator)
Example #10
    def test_calibrator_basic(self, identity_builder_network, BaseClass):
        if mod.version(trt.__version__) < mod.version(
                "7.0") and BaseClass == trt.IInt8LegacyCalibrator:
            pytest.skip("Bug in TRT 6 causes NaNs with legacy calibrator")

        builder, network = identity_builder_network
        NUM_BATCHES = 2

        data = [{"x": np.ones((1, 1, 2, 2), dtype=np.float32)}] * NUM_BATCHES
        calibrator = Calibrator(data, BaseClass=BaseClass)

        create_config = CreateConfig(int8=True, calibrator=calibrator)
        with engine_from_network((builder, network), create_config):
            assert calibrator.num_batches == NUM_BATCHES
        self.check_calibrator_cleanup(calibrator)
Example #11
    def test_calibrator_invalid_input_fails(self, identity_builder_network,
                                            names):
        builder, network = identity_builder_network

        data = [{
            name: np.ones((1, 1, 2, 2), dtype=np.float32)
            for name in names
        }]
        calibrator = Calibrator(data)

        create_config = CreateConfig(int8=True, calibrator=calibrator)

        with pytest.raises(PolygraphyException):
            with engine_from_network((builder, network), create_config):
                pass
Example #12
    def test_calibrator_rechecks_cache_on_reset(self,
                                                identity_builder_network):
        builder, network = identity_builder_network
        data = [{"x": np.ones((1, 1, 2, 2), dtype=np.float32)}]

        with tempfile.NamedTemporaryFile(mode="wb+") as cache:
            calibrator = Calibrator(data, cache=cache.name)
            # First, populate the cache
            create_config = CreateConfig(int8=True, calibrator=calibrator)
            with engine_from_network((builder, network), create_config):
                pass

            # Ensure that now the calibrator will read from the cache when reset
            calibrator.reset()
            assert not calibrator.has_cached_scales
            assert len(calibrator.read_calibration_cache()) == get_file_size(
                cache.name)

        self.check_calibrator_cleanup(calibrator)
Example #13
    def test_infer_overhead(self, copy_inputs, copy_outputs):
        inp = np.ones(shape=(1, 2, 1024, 1024), dtype=np.float32)
        dev_inp = cuda.DeviceArray(shape=inp.shape, dtype=inp.dtype).copy_from(inp)

        out = np.zeros(shape=(1, 2, 1024, 1024), dtype=np.float32)  # Using identity model!
        dev_out = cuda.DeviceArray(shape=out.shape, dtype=out.dtype)

        stream = cuda.Stream()

        model = ONNX_MODELS["dynamic_identity"]
        profiles = [
            Profile().add("X", (1, 2, 1024, 1024), (1, 2, 1024, 1024), (1, 2, 1024, 1024)),
        ]
        inp_name = list(model.input_metadata.keys())[0]

        with engine_from_network(
            network_from_onnx_bytes(model.loader), CreateConfig(profiles=profiles)
        ) as engine, engine.create_execution_context() as context, TrtRunner(context) as runner, dev_inp, dev_out:
            # Inference outside the TrtRunner
            def infer():
                if copy_inputs:
                    dev_inp.copy_from(inp, stream=stream)
                context.execute_async_v2(bindings=[dev_inp.ptr, dev_out.ptr], stream_handle=stream.ptr)
                if copy_outputs:
                    dev_out.copy_to(out, stream=stream)
                stream.synchronize()

            native_time = time_func(infer)

            feed_dict = {inp_name: (inp if copy_inputs else dev_inp)}
            runner_time = time_func(
                lambda: runner.infer(feed_dict, check_inputs=False, copy_outputs_to_host=copy_outputs)
            )

        # The overhead should be less than 0.5ms, or the runtime should be within 5%
        print("Absolute difference: {:.5g}".format(runner_time - native_time))
        print("Relative difference: {:.5g}".format(runner_time / native_time))
        assert (runner_time - native_time) < 0.5e-3 or runner_time <= (native_time * 1.05)
Example #14
    def test_serialize_engine(self, identity_network):
        with engine_from_network(identity_network) as engine:
            serialized_engine = bytes_from_engine(engine)
            assert isinstance(serialized_engine, bytes)
Example #15
    def test_shape_output(self):
        model = ONNX_MODELS["reshape"]
        engine = engine_from_network(NetworkFromOnnxBytes(model.loader))
        with engine, TrtRunner(engine.create_execution_context) as runner:
            model.check_runner(runner)
Example #16
def main():
    # A Profile maps each input tensor to a range of shapes.
    #
    # TIP: To save lines, calls to `add` can be chained:
    #     profile.add("input0", ...).add("input1", ...)
    #
    #   Of course, you may alternatively write this as:
    #     profile.add("input0", ...)
    #     profile.add("input1", ...)
    #
    profiles = [
        # The low-latency case. For best performance, min == opt == max.
        Profile().add("X",
                      min=(1, 3, 28, 28),
                      opt=(1, 3, 28, 28),
                      max=(1, 3, 28, 28)),
        # The dynamic batching case. We use `4` for the opt batch size since that's our most common case.
        Profile().add("X",
                      min=(1, 3, 28, 28),
                      opt=(4, 3, 28, 28),
                      max=(32, 3, 28, 28)),
        # The offline case. For best performance, min == opt == max.
        Profile().add("X",
                      min=(128, 3, 28, 28),
                      opt=(128, 3, 28, 28),
                      max=(128, 3, 28, 28)),
    ]

    # See examples/api/06_immediate_eval_api for details on immediately evaluated functional loaders like `engine_from_network`.
    engine = engine_from_network(NetworkFromOnnxPath("dynamic_identity.onnx"),
                                 config=CreateConfig(profiles=profiles))

    # We'll save the engine so that we can inspect it with `inspect model`.
    # This should make it easy to see how the engine bindings are laid out.
    save_engine(engine, "dynamic_identity.engine")

    # We'll create, but not activate, three separate runners, each with a separate context.
    #
    # TIP: By providing a context directly, as opposed to via a lazy loader,
    # we can ensure that the runner will *not* take ownership of it.
    #
    low_latency = TrtRunner(engine.create_execution_context())

    # NOTE: The following two lines will cause TensorRT to display errors since profile 0
    # is already in use by the first execution context. We'll suppress them using G_LOGGER.verbosity().
    #
    with G_LOGGER.verbosity(G_LOGGER.CRITICAL):
        dynamic_batching = TrtRunner(engine.create_execution_context())
        offline = TrtRunner(engine.create_execution_context())
        # NOTE: We could update the profile index here (e.g. `context.active_optimization_profile = 2`),
        # but instead, we'll use TrtRunner's `set_profile()` API when we later activate the runner.

    # Finally, we can activate the runners as we need them.
    #
    # NOTE: Since the context and engine are already created, the runner will only need to
    # allocate input and output buffers during activation.

    input_img = np.ones((1, 3, 28, 28), dtype=np.float32)  # An input "image"

    with low_latency:
        outputs = low_latency.infer({"X": input_img})
        assert np.array_equal(outputs["Y"],
                              input_img)  # It's an identity model!

        print("Low latency runner succeeded!")

        # While we're serving requests online, we might decide that we need dynamic batching
        # for a moment.
        #
        # NOTE: We're assuming that activating runners will be cheap here, so we can bring up
        # the dynamic batching runner just-in-time.
        #
        # TIP: If activating the runner is not cheap (e.g. input/output buffers are large),
        # it might be better to keep the runner active the whole time.
        #
        with dynamic_batching:
            # NOTE: The very first time we activate this runner, we need to set
            # the profile index (it's 0 by default). We need to do this *only once*.
            # Alternatively, we could have set the profile index in the context directly (see above).
            #
            # Use the second profile, which is intended for dynamic batching.
            dynamic_batching.set_profile(1)

            # We'll create fake batches by repeating our fake input image.
            small_input_batch = np.repeat(input_img, 4,
                                          axis=0)  # Shape: (4, 3, 28, 28)
            outputs = dynamic_batching.infer({"X": small_input_batch})
            assert np.array_equal(outputs["Y"], small_input_batch)

    # If we need dynamic batching again later, we can activate the runner once more.
    #
    # NOTE: This time, we do *not* need to set the profile.
    #
    with dynamic_batching:
        # NOTE: We can use any shape that's in the range of the profile without
        # additional setup - Polygraphy handles the details behind the scenes!
        #
        large_input_batch = np.repeat(input_img, 16,
                                      axis=0)  # Shape: (16, 3, 28, 28)
        outputs = dynamic_batching.infer({"X": large_input_batch})
        assert np.array_equal(outputs["Y"], large_input_batch)

        print("Dynamic batching runner succeeded!")

    with offline:
        # NOTE: We must set the profile to something other than 0 or 1 since both of those
        # are now in use by the `low_latency` and `dynamic_batching` runners respectively.
        #
        # Use the third profile, which is intended for the offline case.
        offline.set_profile(2)

        large_offline_batch = np.repeat(input_img, 128,
                                        axis=0)  # Shape: (128, 3, 28, 28)
        outputs = offline.infer({"X": large_offline_batch})
        assert np.array_equal(outputs["Y"], large_offline_batch)

        print("Offline runner succeeded!")