def allocate_buffers(engine):
    """
    Allocates a device buffer for every binding in the engine, plus host
    staging buffers for outputs. Input host buffers start as None because
    input data is supplied by the caller at inference time.
    """
    input_buffers = OrderedDict()
    output_buffers = OrderedDict()
    stream = cuda.Stream()
    G_LOGGER.verbose("Using batch size: " + str(engine.max_batch_size) + " during buffer allocation")
    for binding in engine:
        # Implicit-batch engines omit the batch dimension from binding shapes,
        # so prepend max_batch_size to size the allocation.
        shape = (engine.max_batch_size,) + tuple(engine.get_binding_shape(binding))
        dtype = engine.get_binding_dtype(binding)

        device_mem = cuda.DeviceArray(shape=shape, dtype=trt.nptype(dtype))
        G_LOGGER.extra_verbose("Tensor: {:35} | Allocated: {:}".format(binding, device_mem))

        if engine.binding_is_input(binding):
            input_buffers[binding] = TrtLegacyRunner.HostDeviceMem(None, device_mem)
        else:
            host_mem = np.empty(shape=shape, dtype=trt.nptype(dtype))
            output_buffers[binding] = TrtLegacyRunner.HostDeviceMem(host_mem, device_mem)
    return input_buffers, output_buffers, stream
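# A minimal usage sketch for the buffers above, not part of the original
# source. It assumes an implicit-batch engine, that HostDeviceMem exposes
# `host` and `device` attributes (the attribute names are an assumption),
# and uses TensorRT's legacy execute_async() API. `run_legacy` and
# `feed_dict` are hypothetical names for illustration.
def run_legacy(engine, context, feed_dict):
    input_buffers, output_buffers, stream = allocate_buffers(engine)
    # Copy user-supplied inputs to the device.
    for name, buf in input_buffers.items():
        buf.device.copy_from(feed_dict[name], stream=stream)
    # Bindings must be ordered by binding index, so walk the engine itself.
    bindings = []
    for name in engine:
        buf = input_buffers[name] if name in input_buffers else output_buffers[name]
        bindings.append(buf.device.ptr)
    context.execute_async(batch_size=engine.max_batch_size, bindings=bindings, stream_handle=stream.ptr)
    # Copy outputs back to their host staging buffers before returning them.
    for name, buf in output_buffers.items():
        buf.device.copy_to(buf.host, stream=stream)
    stream.synchronize()
    return {name: buf.host for name, buf in output_buffers.items()}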
def activate_impl(self):
    def make_buffers(engine):
        """
        Creates empty host and device buffers for the specified engine.
        Always uses binding names from Profile 0.
        """
        device_buffers = OrderedDict()
        host_output_buffers = OrderedDict()

        for idx in range(trt_util.get_bindings_per_profile(engine)):
            binding = engine[idx]
            dtype = trt_util.np_dtype_from_trt(engine.get_binding_dtype(binding))
            # Buffers start empty; they are resized once actual shapes are
            # known at inference time. Only outputs need host staging buffers.
            device_buffers[binding] = cuda.DeviceArray(dtype=dtype)
            if not engine.binding_is_input(binding):
                host_output_buffers[binding] = np.empty(shape=tuple(), dtype=dtype)

        G_LOGGER.extra_verbose("Created device buffers: {:}".format(device_buffers))
        return device_buffers, host_output_buffers

    engine_or_context, owning = util.invoke_if_callable(self._engine_or_context)

    if isinstance(engine_or_context, trt.ICudaEngine):
        # Given an engine: create (and own) an execution context for it.
        self.engine = engine_or_context
        self.owns_engine = owning
        self.context = self.engine.create_execution_context()
        self.owns_context = True
        if not self.context:
            G_LOGGER.critical("Invalid Context. See error log for details.")
    elif isinstance(engine_or_context, trt.IExecutionContext):
        # Given a context directly: ownership follows how it was provided.
        self.engine = None
        self.owns_engine = False
        self.context = engine_or_context
        self.owns_context = owning
    else:
        G_LOGGER.critical(
            "Invalid Engine or Context. Please ensure the engine was built correctly. See error log for details."
        )

    if not owning:
        G_LOGGER.verbose(
            "Object was provided directly instead of via a Callable. This runner will not assume ownership. "
            "Please ensure it is freed."
        )

    self.device_buffers, self.host_output_buffers = make_buffers(self.context.engine)
    self.stream = cuda.Stream()
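# The ownership rules above are easiest to see from the caller's side. A
# short sketch using Polygraphy's public loaders; "model.onnx" and the input
# name "X" (with its shape) are placeholders:
import numpy as np
from polygraphy.backend.trt import EngineFromNetwork, NetworkFromOnnxPath, TrtRunner

build_engine = EngineFromNetwork(NetworkFromOnnxPath("model.onnx"))

# Passing a callable: activate_impl() invokes it, so the runner owns the
# resulting engine and context and frees them on deactivation.
with TrtRunner(build_engine) as runner:
    outputs = runner.infer(feed_dict={"X": np.ones((1, 1, 2, 2), dtype=np.float32)})

# Passing a live object: the runner only borrows it, and the caller remains
# responsible for freeing it.
engine = build_engine()
with TrtRunner(engine) as runner:
    outputs = runner.infer(feed_dict={"X": np.ones((1, 1, 2, 2), dtype=np.float32)})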
def test_infer_overhead(self, copy_inputs, copy_outputs):
    inp = np.ones(shape=(1, 2, 1024, 1024), dtype=np.float32)
    dev_inp = cuda.DeviceArray(shape=inp.shape, dtype=inp.dtype).copy_from(inp)

    out = np.zeros(shape=(1, 2, 1024, 1024), dtype=np.float32)  # Using identity model!
    dev_out = cuda.DeviceArray(shape=out.shape, dtype=out.dtype)

    stream = cuda.Stream()

    model = ONNX_MODELS["dynamic_identity"]
    profiles = [
        Profile().add("X", (1, 2, 1024, 1024), (1, 2, 1024, 1024), (1, 2, 1024, 1024)),
    ]
    inp_name = list(model.input_metadata.keys())[0]

    with engine_from_network(
        network_from_onnx_bytes(model.loader), CreateConfig(profiles=profiles)
    ) as engine, engine.create_execution_context() as context, TrtRunner(context) as runner, dev_inp, dev_out:
        # Inference outside the TrtRunner
        def infer():
            if copy_inputs:
                dev_inp.copy_from(inp, stream=stream)
            context.execute_async_v2(bindings=[dev_inp.ptr, dev_out.ptr], stream_handle=stream.ptr)
            if copy_outputs:
                dev_out.copy_to(out, stream=stream)
            stream.synchronize()

        native_time = time_func(infer)

        feed_dict = {inp_name: (inp if copy_inputs else dev_inp)}
        runner_time = time_func(
            lambda: runner.infer(feed_dict, check_inputs=False, copy_outputs_to_host=copy_outputs)
        )

    # The overhead should be less than 0.5ms, or the runtime should be within 5%
    print("Absolute difference: {:.5g}".format(runner_time - native_time))
    print("Relative difference: {:.5g}".format(runner_time / native_time))
    assert (runner_time - native_time) < 0.5e-3 or runner_time <= (native_time * 1.05)
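# `time_func` above comes from the test helpers; it is assumed here to return
# mean wall-clock seconds per call. A minimal stand-in with that contract:
import time

def time_func(func, warm_up=10, iters=100):
    for _ in range(warm_up):  # Warm up caches, lazy allocations, etc.
        func()
    start = time.time()
    for _ in range(iters):
        func()
    return (time.time() - start) / float(iters)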