Example #1
        def allocate_buffers(engine):
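            """
            Allocates a device buffer for each binding and a host buffer for
            each output binding, prepending the engine's max batch size to
            every binding shape.
            """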
            input_buffers = OrderedDict()
            output_buffers = OrderedDict()
            bindings = []
            stream = cuda.Stream()
            G_LOGGER.verbose("Using batch size: " +
                             str(engine.max_batch_size) +
                             " during buffer allocation")
            for binding in engine:
                shape = (engine.max_batch_size, ) + tuple(
                    engine.get_binding_shape(binding))
                dtype = engine.get_binding_dtype(binding)

                device_mem = cuda.DeviceArray(shape=shape,
                                              dtype=trt.nptype(dtype))
                G_LOGGER.extra_verbose("Tensor: "
                                       "{:35} | Allocated: {:}".format(
                                           binding, device_mem))

                if engine.binding_is_input(binding):
                    input_buffers[binding] = TrtLegacyRunner.HostDeviceMem(
                        None, device_mem)
                else:
                    host_mem = np.empty(shape=shape, dtype=trt.nptype(dtype))
                    output_buffers[binding] = TrtLegacyRunner.HostDeviceMem(
                        host_mem, device_mem)
            return input_buffers, output_buffers, stream
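A minimal usage sketch for the helper above, assuming `allocate_buffers` is in scope, a pre-serialized engine file (the path "model.engine" is a placeholder), and the standard TensorRT Python bindings:

    import tensorrt as trt

    # Deserialize an engine from disk, then allocate its I/O buffers.
    TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
    with open("model.engine", "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
        engine = runtime.deserialize_cuda_engine(f.read())

    input_buffers, output_buffers, stream = allocate_buffers(engine)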
Example #2
    def activate_impl(self):
        def make_buffers(engine):
            """
            Creates empty host and device buffers for the specified engine.
            Always uses binding names from Profile 0.
            """
            device_buffers = OrderedDict()
            host_output_buffers = OrderedDict()

            for idx in range(trt_util.get_bindings_per_profile(engine)):
                binding = engine[idx]
                dtype = trt_util.np_dtype_from_trt(
                    engine.get_binding_dtype(binding))
                device_buffers[binding] = cuda.DeviceArray(dtype=dtype)
                if not engine.binding_is_input(binding):
                    host_output_buffers[binding] = np.empty(shape=tuple(),
                                                            dtype=dtype)
            G_LOGGER.extra_verbose(
                "Created device buffers: {:}".format(device_buffers))
            return device_buffers, host_output_buffers

        engine_or_context, owning = util.invoke_if_callable(
            self._engine_or_context)

        if isinstance(engine_or_context, trt.ICudaEngine):
            self.engine = engine_or_context
            self.owns_engine = owning
            self.context = self.engine.create_execution_context()
            self.owns_context = True
            if not self.context:
                G_LOGGER.critical(
                    "Invalid Context. See error log for details.")
        elif isinstance(engine_or_context, trt.IExecutionContext):
            self.engine = None
            self.owns_engine = False
            self.context = engine_or_context
            self.owns_context = owning
        else:
            G_LOGGER.critical(
                "Invalid Engine or Context. Please ensure the engine was built correctly. See error log for details."
            )

        if not owning:
            G_LOGGER.verbose(
                "Object was provided directly instead of via a Callable. This runner will not assume ownership. "
                "Please ensure it is freed.")

        self.device_buffers, self.host_output_buffers = make_buffers(
            self.context.engine)
        self.stream = cuda.Stream()
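A short sketch of the ownership behavior above, assuming Polygraphy's lazy loaders ("model.onnx" is a placeholder path): passing a Callable lets the runner build and own the engine, while passing an already-built object leaves ownership with the caller.

    from polygraphy.backend.trt import EngineFromNetwork, NetworkFromOnnxPath, TrtRunner

    # Passing a Callable loader: the runner builds the engine when activated
    # and assumes ownership, freeing it when deactivated.
    build_engine = EngineFromNetwork(NetworkFromOnnxPath("model.onnx"))
    with TrtRunner(build_engine) as runner:
        ...  # runner.infer(...) calls go here

    # Passing a built engine directly: the runner does not assume ownership,
    # so the caller remains responsible for freeing it.
    engine = build_engine()
    with TrtRunner(engine) as runner:
        ...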
Example #3
    def test_infer_overhead(self, copy_inputs, copy_outputs):
        inp = np.ones(shape=(1, 2, 1024, 1024), dtype=np.float32)
        dev_inp = cuda.DeviceArray(shape=inp.shape, dtype=inp.dtype).copy_from(inp)

        out = np.zeros(shape=(1, 2, 1024, 1024), dtype=np.float32)  # Using identity model!
        dev_out = cuda.DeviceArray(shape=out.shape, dtype=out.dtype)

        stream = cuda.Stream()

        model = ONNX_MODELS["dynamic_identity"]
        profiles = [
            Profile().add("X", (1, 2, 1024, 1024), (1, 2, 1024, 1024), (1, 2, 1024, 1024)),
        ]
        inp_name = list(model.input_metadata.keys())[0]

        with engine_from_network(
            network_from_onnx_bytes(model.loader), CreateConfig(profiles=profiles)
        ) as engine, engine.create_execution_context() as context, TrtRunner(context) as runner, dev_inp, dev_out:
            # Inference outside the TrtRunner
            def infer():
                if copy_inputs:
                    dev_inp.copy_from(inp, stream=stream)
                context.execute_async_v2(bindings=[dev_inp.ptr, dev_out.ptr], stream_handle=stream.ptr)
                if copy_outputs:
                    dev_out.copy_to(out, stream=stream)
                stream.synchronize()

            native_time = time_func(infer)

            feed_dict = {inp_name: (inp if copy_inputs else dev_inp)}
            runner_time = time_func(
                lambda: runner.infer(feed_dict, check_inputs=False, copy_outputs_to_host=copy_outputs)
            )

        # The overhead should be less than 0.5ms, or the runtime should be within 5%
        print("Absolute difference: {:.5g}".format(runner_time - native_time))
        print("Relative difference: {:.5g}".format(runner_time / native_time))
        assert (runner_time - native_time) < 0.5e-3 or runner_time <= (native_time * 1.05)
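The test relies on a time_func helper that is not shown here; a minimal sketch of what such a timing utility might look like (the warm-up and iteration counts are assumptions, not the actual implementation):

    import time

    def time_func(func, warm_up=5, iters=100):
        # Hypothetical timing helper: run a few warm-up calls, then return the
        # average wall-clock time per call of `func`, in seconds.
        for _ in range(warm_up):
            func()
        start = time.time()
        for _ in range(iters):
            func()
        return (time.time() - start) / iters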