def test_to_trt(self, dynamic_identity_network):
    builder, network, _ = dynamic_identity_network
    profile = Profile().add("X", (1, 2, 1, 1), (1, 2, 2, 2), (1, 2, 4, 4))
    trt_profile = profile.to_trt(builder, network)
    # The original comparison discarded its result; it needs an `assert`.
    # `get_shape` returns the [min, opt, max] shapes as a list.
    assert trt_profile.get_shape("X") == [(1, 2, 1, 1), (1, 2, 2, 2), (1, 2, 4, 4)]
def test_can_add(self):
    profile = Profile()
    min, opt, max = (1, 1), (2, 2), (4, 4)
    assert profile.add("input", min=min, opt=opt, max=max) is profile
    shape_tuple = profile["input"]
    assert shape_tuple.min == min
    assert shape_tuple.opt == opt
    assert shape_tuple.max == max
def test_fill_defaults_does_not_overwrite(self, dynamic_identity_network):
    _, network, _ = dynamic_identity_network
    profile = Profile().add("X", (1, 1, 1, 1), (1, 1, 2, 2), (1, 1, 3, 3))
    # The original line discarded the comparison result; it needs an `assert`.
    assert profile.fill_defaults(network) is profile
    assert profile["X"].min == (1, 1, 1, 1)
    assert profile["X"].opt == (1, 1, 2, 2)
    assert profile["X"].max == (1, 1, 3, 3)
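# The three tests above exercise Polygraphy's `Profile` class. As a quick
# standalone sketch (not part of the test suite above): a `Profile` behaves
# like a dictionary mapping input names to (min, opt, max) shape tuples, and
# `add` returns the profile itself so calls can be chained.
from polygraphy.backend.trt import Profile

sketch = Profile().add("input0", min=(1, 3), opt=(2, 3), max=(4, 3)).add("input1", min=(1,), opt=(2,), max=(4,))
print(sketch["input0"].min, sketch["input0"].opt, sketch["input0"].max)  # (1, 3) (2, 3) (4, 3)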
def test_multiple_profiles(self, identity_builder_network):
    builder, network = identity_builder_network
    profiles = [
        Profile().add("x", (1, 2, 1, 1), (1, 2, 2, 2), (1, 2, 4, 4)),
        Profile().add("x", (1, 2, 4, 4), (1, 2, 8, 8), (1, 2, 16, 16)),
    ]
    loader = CreateConfig(profiles=profiles)
    with loader(builder, network) as config:
        assert config.num_optimization_profiles == 2
def get_dynamic_shape_profiles(self):
    max_sequence_length = GPT2ModelTRTConfig.MAX_SEQUENCE_LENGTH[
        self.network_metadata.variant
    ]
    profile = Profile()
    profile.add(
        "input_ids",
        min=(1, 1),
        opt=(1, max_sequence_length // 2),
        max=(1, max_sequence_length),
    )
    return [profile]
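# A hedged usage sketch for the method above: the returned profile list can be
# passed directly to CreateConfig when building the engine. The ONNX path and
# the `gpt2` instance holding the method are assumptions for illustration only.
from polygraphy.backend.trt import CreateConfig, engine_from_network, network_from_onnx_path

engine = engine_from_network(
    network_from_onnx_path("gpt2.onnx"),  # Hypothetical model path
    config=CreateConfig(profiles=gpt2.get_dynamic_shape_profiles()),  # `gpt2` is a hypothetical instance
)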
def test_multiple_profiles(self):
    model = ONNX_MODELS["dynamic_identity"]
    shapes = [(1, 2, 4, 4), (1, 2, 8, 8), (1, 2, 16, 16)]
    network_loader = NetworkFromOnnxBytes(model.loader)
    profiles = [
        Profile().add("X", (1, 2, 1, 1), (1, 2, 2, 2), (1, 2, 4, 4)),
        Profile().add("X", *shapes),
    ]
    config_loader = CreateConfig(profiles=profiles)
    with TrtRunner(EngineFromNetwork(network_loader, config_loader)) as runner:
        runner.context.active_optimization_profile = 1
        for shape in shapes:
            model.check_runner(runner, {"X": shape})
def test_multiple_profiles(self):
    model = ONNX_MODELS["dynamic_identity"]
    profile0_shapes = [(1, 2, 1, 1), (1, 2, 2, 2), (1, 2, 4, 4)]
    profile1_shapes = [(1, 2, 4, 4), (1, 2, 8, 8), (1, 2, 16, 16)]
    network_loader = NetworkFromOnnxBytes(model.loader)
    profiles = [
        Profile().add("X", *profile0_shapes),
        Profile().add("X", *profile1_shapes),
    ]
    config_loader = CreateConfig(profiles=profiles)
    with TrtRunner(EngineFromNetwork(network_loader, config_loader)) as runner:
        for index, shapes in enumerate([profile0_shapes, profile1_shapes]):
            runner.set_profile(index)
            for shape in shapes:
                model.check_runner(runner, {"X": shape})
def test_multiple_profiles(self):
    model = ONNX_MODELS["dynamic_identity"]
    shapes = [(1, 2, 4, 4), (1, 2, 8, 8), (1, 2, 16, 16)]
    network_loader = NetworkFromOnnxBytes(model.loader)
    profiles = [
        Profile().add("X", (1, 2, 1, 1), (1, 2, 2, 2), (1, 2, 4, 4)),
        Profile().add("X", *shapes),
    ]
    config_loader = CreateConfig(profiles=profiles)
    with TrtRunner(EngineFromNetwork(network_loader, config_loader)) as runner:
        # Older TensorRT versions set the profile via the `active_optimization_profile`
        # attribute; newer versions use `set_optimization_profile_async` with a stream.
        if misc.version(trt.__version__) < misc.version("7.3"):
            runner.context.active_optimization_profile = 1
        else:
            runner.context.set_optimization_profile_async(1, runner.stream.address())
        for shape in shapes:
            model.check_runner(runner, {"X": shape})
def test_empty_tensor_with_dynamic_input_shape_tensor(self):
    model = ONNX_MODELS["empty_tensor_expand"]
    shapes = [(1, 2, 0, 3, 0), (2, 2, 0, 3, 0), (4, 2, 0, 3, 0)]
    network_loader = NetworkFromOnnxBytes(model.loader)
    profiles = [Profile().add("new_shape", *shapes)]
    config_loader = CreateConfig(profiles=profiles)
    with TrtRunner(EngineFromNetwork(network_loader, config_loader)) as runner:
        for shape in shapes:
            model.check_runner(runner, {"new_shape": shape})
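# Worth noting for the test above: "new_shape" is a shape *tensor*, so the
# profile entries constrain the values the tensor may take at runtime, not its
# dimensions. A minimal sketch of the equivalent raw TensorRT call, with the
# tensor name taken from the test and everything else assumed:
import tensorrt as trt

builder = trt.Builder(trt.Logger(trt.Logger.WARNING))
trt_profile = builder.create_optimization_profile()
# `set_shape_input` (rather than `set_shape`) is used for shape-tensor inputs:
trt_profile.set_shape_input("new_shape", (1, 2, 0, 3, 0), (2, 2, 0, 3, 0), (4, 2, 0, 3, 0))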
def test_device_view_dynamic_shapes(self, use_view):
    model = ONNX_MODELS["dynamic_identity"]
    profiles = [
        Profile().add("X", (1, 2, 1, 1), (1, 2, 2, 2), (1, 2, 4, 4)),
    ]
    runner = TrtRunner(EngineFromNetwork(NetworkFromOnnxBytes(model.loader), CreateConfig(profiles=profiles)))
    with runner, cuda.DeviceArray(shape=(1, 2, 3, 3), dtype=np.float32) as arr:
        inp = np.random.random_sample(size=(1, 2, 3, 3)).astype(np.float32)
        arr.copy_from(inp)
        outputs = runner.infer({"X": cuda.DeviceView(arr.ptr, arr.shape, arr.dtype) if use_view else arr})
        assert np.all(outputs["Y"] == inp)
        assert outputs["Y"].shape == (1, 2, 3, 3)
def test_infer_overhead(self, copy_inputs, copy_outputs):
    inp = np.ones(shape=(1, 2, 1024, 1024), dtype=np.float32)
    dev_inp = cuda.DeviceArray(shape=inp.shape, dtype=inp.dtype).copy_from(inp)

    out = np.zeros(shape=(1, 2, 1024, 1024), dtype=np.float32)  # Using identity model!
    dev_out = cuda.DeviceArray(shape=out.shape, dtype=out.dtype)

    stream = cuda.Stream()

    model = ONNX_MODELS["dynamic_identity"]
    profiles = [
        Profile().add("X", (1, 2, 1024, 1024), (1, 2, 1024, 1024), (1, 2, 1024, 1024)),
    ]
    inp_name = list(model.input_metadata.keys())[0]

    with engine_from_network(
        network_from_onnx_bytes(model.loader), CreateConfig(profiles=profiles)
    ) as engine, engine.create_execution_context() as context, TrtRunner(context) as runner, dev_inp, dev_out:
        # Inference outside the TrtRunner
        def infer():
            if copy_inputs:
                dev_inp.copy_from(inp, stream=stream)
            context.execute_async_v2(bindings=[dev_inp.ptr, dev_out.ptr], stream_handle=stream.ptr)
            if copy_outputs:
                dev_out.copy_to(out, stream=stream)
            stream.synchronize()

        native_time = time_func(infer)

        feed_dict = {inp_name: (inp if copy_inputs else dev_inp)}
        runner_time = time_func(
            lambda: runner.infer(feed_dict, check_inputs=False, copy_outputs_to_host=copy_outputs)
        )

        # The overhead should be less than 0.5ms, or the runtime should be within 5%
        print("Absolute difference: {:.5g}".format(runner_time - native_time))
        print("Relative difference: {:.5g}".format(runner_time / native_time))
        assert (runner_time - native_time) < 0.5e-3 or runner_time <= (native_time * 1.05)
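# `time_func` is a benchmarking helper assumed by the test above but not shown
# here; a minimal sketch of what such a helper might look like (the warm-up and
# iteration counts are arbitrary assumptions):
import time

def time_func(func, warm_up=10, iters=100):
    for _ in range(warm_up):
        func()
    start = time.time()
    for _ in range(iters):
        func()
    return (time.time() - start) / iters  # Average seconds per call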
def main():
    # A Profile maps each input tensor to a range of shapes.
    #
    # TIP: To save lines, calls to `add` can be chained:
    #     profile.add("input0", ...).add("input1", ...)
    #
    #     Of course, you may alternatively write this as:
    #     profile.add("input0", ...)
    #     profile.add("input1", ...)
    #
    profiles = [
        # The low-latency case. For best performance, min == opt == max.
        Profile().add("X", min=(1, 3, 28, 28), opt=(1, 3, 28, 28), max=(1, 3, 28, 28)),
        # The dynamic batching case. We use `4` for the opt batch size since that's our most common case.
        Profile().add("X", min=(1, 3, 28, 28), opt=(4, 3, 28, 28), max=(32, 3, 28, 28)),
        # The offline case. For best performance, min == opt == max.
        Profile().add("X", min=(128, 3, 28, 28), opt=(128, 3, 28, 28), max=(128, 3, 28, 28)),
    ]

    # See examples/api/06_immediate_eval_api for details on immediately evaluated functional loaders like `engine_from_network`.
    engine = engine_from_network(NetworkFromOnnxPath("dynamic_identity.onnx"), config=CreateConfig(profiles=profiles))

    # We'll save the engine so that we can inspect it with `inspect model`.
    # This should make it easy to see how the engine bindings are laid out.
    save_engine(engine, "dynamic_identity.engine")

    # We'll create, but not activate, three separate runners, each with a separate context.
    #
    # TIP: By providing a context directly, as opposed to via a lazy loader,
    # we can ensure that the runner will *not* take ownership of it.
    #
    low_latency = TrtRunner(engine.create_execution_context())

    # NOTE: The following two lines will cause TensorRT to display errors since profile 0
    # is already in use by the first execution context. We'll suppress them using G_LOGGER.verbosity().
    #
    with G_LOGGER.verbosity(G_LOGGER.CRITICAL):
        dynamic_batching = TrtRunner(engine.create_execution_context())
        offline = TrtRunner(engine.create_execution_context())
        # NOTE: We could update the profile index here (e.g. `context.active_optimization_profile = 2`),
        # but instead, we'll use TrtRunner's `set_profile()` API when we later activate the runner.

    # Finally, we can activate the runners as we need them.
    #
    # NOTE: Since the context and engine are already created, the runner will only need to
    # allocate input and output buffers during activation.
    input_img = np.ones((1, 3, 28, 28), dtype=np.float32)  # An input "image"

    with low_latency:
        outputs = low_latency.infer({"X": input_img})
        assert np.array_equal(outputs["Y"], input_img)  # It's an identity model!

        print("Low latency runner succeeded!")

        # While we're serving requests online, we might decide that we need dynamic batching
        # for a moment.
        #
        # NOTE: We're assuming that activating runners will be cheap here, so we can bring up
        # the dynamic batching runner just-in-time.
        #
        # TIP: If activating the runner is not cheap (e.g. input/output buffers are large),
        # it might be better to keep the runner active the whole time.
        #
        with dynamic_batching:
            # NOTE: The very first time we activate this runner, we need to set
            # the profile index (it's 0 by default). We need to do this *only once*.
            # Alternatively, we could have set the profile index in the context directly (see above).
            #
            dynamic_batching.set_profile(1)  # Use the second profile, which is intended for dynamic batching.

            # We'll create fake batches by repeating our fake input image.
            small_input_batch = np.repeat(input_img, 4, axis=0)  # Shape: (4, 3, 28, 28)
            outputs = dynamic_batching.infer({"X": small_input_batch})
            assert np.array_equal(outputs["Y"], small_input_batch)

        # If we need dynamic batching again later, we can activate the runner once more.
        #
        # NOTE: This time, we do *not* need to set the profile.
        #
        with dynamic_batching:
            # NOTE: We can use any shape that's in the range of the profile without
            # additional setup - Polygraphy handles the details behind the scenes!
            #
            large_input_batch = np.repeat(input_img, 16, axis=0)  # Shape: (16, 3, 28, 28)
            outputs = dynamic_batching.infer({"X": large_input_batch})
            assert np.array_equal(outputs["Y"], large_input_batch)

        print("Dynamic batching runner succeeded!")

    with offline:
        # NOTE: We must set the profile to something other than 0 or 1 since both of those
        # are now in use by the `low_latency` and `dynamic_batching` runners respectively.
        #
        offline.set_profile(2)  # Use the third profile, which is intended for the offline case.

        large_offline_batch = np.repeat(input_img, 128, axis=0)  # Shape: (128, 3, 28, 28)
        outputs = offline.infer({"X": large_offline_batch})
        assert np.array_equal(outputs["Y"], large_offline_batch)

        print("Offline runner succeeded!")
import sys

from polygraphy.backend.onnxrt import OnnxrtRunner, SessionFromOnnx
from polygraphy.backend.trt import CreateConfig as CreateTrtConfig
from polygraphy.backend.trt import EngineFromNetwork, NetworkFromOnnxPath, Profile, SaveEngine, TrtRunner
from polygraphy.common import TensorMetadata
from polygraphy.comparator import DataLoader

# Data Loader
data_loader = DataLoader(
    input_metadata=TensorMetadata().add('tensor-0', None, (4, 1, 28, 28))
)

# Loaders
build_onnxrt_session = SessionFromOnnx(
    '/work/gitlab/tensorrt-cookbook-in-chinese/08-Tool/Polygraphy/runExample/model.onnx'
)
parse_network_from_onnx = NetworkFromOnnxPath(
    '/work/gitlab/tensorrt-cookbook-in-chinese/08-Tool/Polygraphy/runExample/model.onnx'
)
profiles = [
    Profile().add('tensor-0', min=[1, 1, 28, 28], opt=[4, 1, 28, 28], max=[16, 1, 28, 28])
]
create_trt_config = CreateTrtConfig(max_workspace_size=1000000000, profiles=profiles)
build_engine = EngineFromNetwork(parse_network_from_onnx, config=create_trt_config)
save_engine = SaveEngine(build_engine, path='model-FP32.plan')

# Runners
runners = [
    OnnxrtRunner(build_onnxrt_session),
    TrtRunner(save_engine),
]

# Runner Execution
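# The generated script above breaks off at "# Runner Execution". A sketch of
# how such a script typically continues, using Polygraphy's Comparator; the
# comparison settings here are assumptions, not taken from the original:
from polygraphy.comparator import Comparator, CompareFunc

run_results = Comparator.run(runners, data_loader=data_loader)
success = bool(Comparator.compare_accuracy(run_results, compare_func=CompareFunc.simple()))

# Report pass/fail via the exit code.
sys.exit(0 if success else 1)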