Example #1
    def __call__(self):
        """
        Builds a TensorRT engine.

        Returns:
            trt.ICudaEngine: The engine that was created.
        """
        # If network is a callable, then we own its return value
        ret, owning = misc.try_call(self._network)
        builder, network, parser = misc.unpack_args(ret, num=3)

        with contextlib.ExitStack() as stack:
            provided = "Builder and Network" if parser is None else "Builder, Network, and Parser"
            if owning:
                stack.enter_context(builder)
                stack.enter_context(network)
                if parser is not None:
                    stack.enter_context(parser)
            else:
                G_LOGGER.verbose("{:} were provided directly instead of via a Callable. This loader will not assume ownership. "
                               "Please ensure that they are freed.".format(provided))

            network_log_mode = "full" if G_LOGGER.severity <= G_LOGGER.ULTRA_VERBOSE else "attrs"
            G_LOGGER.super_verbose(lambda: ("Displaying TensorRT Network:\n" + trt_util.str_from_network(network, mode=network_log_mode)))

            config, _ = misc.try_call(self._config, builder, network)
            G_LOGGER.info("Building engine with configuration: {:}".format(trt_util.str_from_config(config)))
            engine = builder.build_engine(network, config)
            if not engine:
                G_LOGGER.critical("Invalid Engine. Please ensure the engine was built correctly")

            if hasattr(config.int8_calibrator, "free"):
                config.int8_calibrator.free()

            return engine
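
A minimal usage sketch for the loader above, assuming it is Polygraphy's EngineFromNetwork (the ONNX path below is a placeholder):

from polygraphy.backend.trt import CreateConfig, EngineFromNetwork, NetworkFromOnnxPath

# Passing loaders (callables) instead of live objects lets the loader own,
# and eventually free, the builder/network/parser it unpacks.
build_engine = EngineFromNetwork(NetworkFromOnnxPath("model.onnx"),
                                 config=CreateConfig(fp16=True))
engine = build_engine()  # invokes the __call__ shown above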
Example #2
def inspect_trt(args):
    from polygraphy.backend.trt import util as trt_util

    if args.model_type == "engine":
        if args.mode != "none":
            G_LOGGER.warning(
                "Displaying layer information for TensorRT engines is not currently supported"
            )

        with tool_util.get_trt_serialized_engine_loader(args)() as engine:
            engine_str = trt_util.str_from_engine(engine)
            G_LOGGER.info("==== TensorRT Engine ====\n{:}".format(engine_str))
    else:
        builder, network, parser = tool_util.get_trt_network_loader(args)()
        with builder, network, parser:
            network_str = trt_util.str_from_network(network, mode=args.mode)
            G_LOGGER.info(
                "==== TensorRT Network ====\n{:}".format(network_str))
Example #3
    def inspect_trt(self, args):
        from polygraphy.backend.trt import util as trt_util

        if self.makers[ModelArgs].model_type == "engine":
            if args.mode != "none":
                G_LOGGER.warning(
                    "Displaying layer information for TensorRT engines is not currently supported"
                )

            with func.invoke(self.makers[TrtLoaderArgs].
                             get_trt_serialized_engine_loader()) as engine:
                engine_str = trt_util.str_from_engine(engine)
                G_LOGGER.info(
                    "==== TensorRT Engine ====\n{:}".format(engine_str))
        else:
            builder, network, parser = func.invoke(
                self.makers[TrtLoaderArgs].get_trt_network_loader())
            with builder, network, parser:
                network_str = trt_util.str_from_network(
                    network, mode=args.mode).strip()
                G_LOGGER.info(
                    "==== TensorRT Network ====\n{:}".format(network_str))
Example #4
    def call_impl(self):
        """
        Returns:
            bytes: The serialized engine that was created.
        """
        # If network is a callable, then we own its return value
        ret, owns_network = util.invoke_if_callable(self._network)
        builder, network, parser = util.unpack_args(ret, num=3)

        if builder is None or network is None:
            G_LOGGER.critical("Expected to recevie a (builder, network) tuple for the `network` parameter, "
                              "but received: ({:}, {:})".format(builder, network))

        with contextlib.ExitStack() as stack:
            if owns_network:
                stack.enter_context(builder)
                stack.enter_context(network)
                if parser is not None:
                    stack.enter_context(parser)
            else:
                provided = "Builder and Network" if parser is None else "Builder, Network, and Parser"
                G_LOGGER.verbose("{:} were provided directly instead of via a Callable. This loader will not assume ownership. "
                                 "Please ensure that they are freed.".format(provided))

            config, owns_config = util.invoke_if_callable(self._config, builder, network)
            if owns_config:
                stack.enter_context(config)
            else:
                G_LOGGER.verbose("Builder configuration was provided directly instead of via a Callable. This loader will not assume "
                                 "ownership. Please ensure it is freed.")

            try:
                config.int8_calibrator.__enter__ # Polygraphy calibrator frees device buffers on exit.
            except AttributeError:
                pass
            else:
                stack.enter_context(config.int8_calibrator)

            network_log_mode = "full" if G_LOGGER.severity <= G_LOGGER.ULTRA_VERBOSE else "attrs"
            G_LOGGER.super_verbose(lambda: ("Displaying TensorRT Network:\n" + trt_util.str_from_network(network, mode=network_log_mode)))

            G_LOGGER.start("Building engine with configuration:\n{:}".format(trt_util.str_from_config(config)))

            try:
                engine_bytes = builder.build_serialized_network(network, config)
            except AttributeError:
                engine = builder.build_engine(network, config)
                if not engine:
                    G_LOGGER.critical("Invalid Engine. Please ensure the engine was built correctly")
                stack.enter_context(engine)
                engine_bytes = engine.serialize()

            if not engine_bytes:
                G_LOGGER.critical("Invalid Engine. Please ensure the engine_bytes was built correctly")

            try:
                timing_cache = config.get_timing_cache()
            except AttributeError:
                if self.timing_cache_path:
                    trt_util.fail_unavailable("save_timing_cache in EngineBytesFromNetwork")
            else:
                if timing_cache and self.timing_cache_path:
                    with timing_cache.serialize() as buffer:
                        util.save_file(buffer, self.timing_cache_path, description="tactic timing cache")

            return engine_bytes
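
A usage sketch, assuming this call_impl belongs to EngineBytesFromNetwork (the class named in the fail_unavailable message above); the save_timing_cache argument is an assumption inferred from self.timing_cache_path:

from polygraphy.backend.trt import CreateConfig, EngineBytesFromNetwork, NetworkFromOnnxPath

serialize_engine = EngineBytesFromNetwork(NetworkFromOnnxPath("model.onnx"),
                                          config=CreateConfig(),
                                          save_timing_cache="./timing.cache")  # assumed parameter
engine_bytes = serialize_engine()  # serialized engine, loadable via trt.Runtime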
Example #5
    def activate_impl(self):
        """
        Vars:
            engine (trt.ICudaEngine):
                    The engine tracked by this runner. The TrtLegacyRunner OWNS the engine it
                    manages, and therefore is responsible for its destruction. Do not free the engine outside of the
                    runner, or it will result in a double free.
            context (trt.IExecutionContext): The context used for inference.
            input_buffers (Dict[str, TrtLegacyRunner.HostDeviceMem]):
                    A mapping of binding names to HostDeviceMem objects for input buffers.
            output_buffers (Dict[str, TrtLegacyRunner.HostDeviceMem]):
                    A mapping of binding names to HostDeviceMem objects for output buffers.
            bindings (List[int]): A list of device pointers for engine bindings.
            stream (cuda.Stream): The CUDA stream that this runner will use for inference.
        """

        # Only initialize GPU after this runner is activated.
        # Allocates all buffers required for an engine, i.e. host/device input_buffers/output_buffers.
        def allocate_buffers(engine):
            input_buffers = OrderedDict()
            output_buffers = OrderedDict()
            bindings = []
            stream = cuda.Stream()
            G_LOGGER.verbose("Using batch size: " +
                             str(engine.max_batch_size) +
                             " during buffer allocation")
            for binding in engine:
                shape = (engine.max_batch_size, ) + tuple(
                    engine.get_binding_shape(binding))
                dtype = engine.get_binding_dtype(binding)

                device_mem = cuda.DeviceBuffer(shape=shape,
                                               dtype=trt.nptype(dtype))
                G_LOGGER.extra_verbose("Tensor: "
                                       "{:40} | Allocated: {:}".format(
                                           binding, device_mem))

                if engine.binding_is_input(binding):
                    input_buffers[binding] = TrtLegacyRunner.HostDeviceMem(
                        None, device_mem)
                else:
                    host_mem = np.empty(shape=shape, dtype=trt.nptype(dtype))
                    output_buffers[binding] = TrtLegacyRunner.HostDeviceMem(
                        host_mem, device_mem)
            return input_buffers, output_buffers, stream

        # Always try reading the engine first, or, failing that, build it.
        if self.load_engine:
            with open(self.load_engine,
                      "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
                G_LOGGER.info("Reading engine from {:}".format(
                    self.load_engine))
                self.engine = runtime.deserialize_cuda_engine(f.read())
        else:
            trt.init_libnvinfer_plugins(TRT_LOGGER, "")
            builder, network, parser, model_batch_size = self.network_loader()
            with builder, network, parser:
                builder.max_batch_size = int(self.max_batch_size
                                             or model_batch_size or 1)

                config = builder.create_builder_config()
                config.max_workspace_size = int(self.max_workspace_size)

                if not self.tf32:
                    with contextlib.suppress(AttributeError):
                        config.clear_flag(trt.BuilderFlag.TF32)
                if self.fp16:
                    # Note: direct assignment replaces all previously set flags;
                    # config.set_flag(trt.BuilderFlag.FP16) (as in Example #6) is additive.
                    config.flags = 1 << int(trt.BuilderFlag.FP16)

                if not network:
                    G_LOGGER.critical("Invalid network")
                G_LOGGER.super_verbose(lambda: trt_util.str_from_network(
                    network) or "Finished logging network")

                if self.layerwise:
                    # In layerwise mode, every layer becomes an output.
                    G_LOGGER.info(
                        "Running in layerwise mode. Marking {:} layers as outputs"
                        .format(network.num_layers))
                    for layer in network:
                        for index in range(layer.num_outputs):
                            out = layer.get_output(index)
                            if not out.is_network_output:
                                network.mark_output(out)

                G_LOGGER.info(
                    "Building engine: max workspace size={:} bytes, max batch size={:}, fp16={:}, "
                    "tf32={:}".format(config.max_workspace_size,
                                      builder.max_batch_size, self.fp16,
                                      self.tf32))
                self.engine = builder.build_engine(network, config)

        if not self.engine:
            G_LOGGER.critical(
                "Invalid Engine. Please ensure the engine was built correctly")

        if self.engine_path:
            with open(self.engine_path, "wb") as f:
                G_LOGGER.info("Writing engine to {:}".format(self.engine_path))
                f.write(self.engine.serialize())

        self.context = self.engine.create_execution_context()
        self.input_buffers, self.output_buffers, self.stream = allocate_buffers(
            self.engine)
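
Both this example and the next rely on TrtLegacyRunner.HostDeviceMem, which is not shown. Conceptually it simply pairs a host array with its device allocation; a minimal stand-in (the real class may differ):

from collections import namedtuple

# host: np.ndarray (or None for inputs that are filled at inference time)
# device: the device-side allocation (cuda.DeviceBuffer / cuda.DeviceArray above)
HostDeviceMem = namedtuple("HostDeviceMem", ["host", "device"])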
Example #6
    def activate_impl(self):
        """
        Vars:
            engine (trt.ICudaEngine):
                    The engine tracked by this runner. The TrtLegacyRunner OWNS the engine it
                    manages, and therefore is responsible for its destruction. Do not free the engine outside of the
                    runner, or it will result in a double free.
            context (trt.IExecutionContext): The context used for inference.
            input_buffers (Dict[str, TrtLegacyRunner.HostDeviceMem]):
                    A mapping of binding names to HostDeviceMem objects for input buffers.
            output_buffers (Dict[str, TrtLegacyRunner.HostDeviceMem]):
                    A mapping of binding names to HostDeviceMem objects for output buffers.
            bindings (List[int]): A list of device pointers for engine bindings.
            stream (cuda.Stream): The CUDA stream that this runner will use for inference.
        """

        # Only initialize GPU after this runner is activated.
        # Allocates all buffers required for an engine, i.e. host/device input_buffers/output_buffers.
        def allocate_buffers(engine):
            input_buffers = OrderedDict()
            output_buffers = OrderedDict()
            stream = cuda.Stream()
            G_LOGGER.verbose("Using batch size: " +
                             str(engine.max_batch_size) +
                             " during buffer allocation")
            for binding in engine:
                shape = (engine.max_batch_size, ) + tuple(
                    engine.get_binding_shape(binding))
                dtype = engine.get_binding_dtype(binding)

                device_mem = cuda.DeviceArray(shape=shape,
                                              dtype=trt.nptype(dtype))
                G_LOGGER.extra_verbose("Tensor: "
                                       "{:35} | Allocated: {:}".format(
                                           binding, device_mem))

                if engine.binding_is_input(binding):
                    input_buffers[binding] = TrtLegacyRunner.HostDeviceMem(
                        None, device_mem)
                else:
                    host_mem = np.empty(shape=shape, dtype=trt.nptype(dtype))
                    output_buffers[binding] = TrtLegacyRunner.HostDeviceMem(
                        host_mem, device_mem)
            return input_buffers, output_buffers, stream

        # Always try reading the engine first, or, failing that, build it.
        if self.load_engine:
            with open(self.load_engine,
                      "rb") as f, trt.Runtime(get_trt_logger()) as runtime:
                G_LOGGER.info("Reading engine from {:}".format(
                    self.load_engine))
                self.engine = runtime.deserialize_cuda_engine(f.read())
        else:
            trt.init_libnvinfer_plugins(get_trt_logger(), "")
            builder, network, parser, model_batch_size = self.network_loader()
            with builder, network, parser, builder.create_builder_config() as config:
                if not network:
                    G_LOGGER.critical("Invalid network")
                G_LOGGER.super_verbose(lambda: trt_util.str_from_network(
                    network) or "Finished logging network")

                builder.max_batch_size = int(self.max_batch_size
                                             or model_batch_size or 1)

                config.max_workspace_size = int(self.max_workspace_size)

                if not self.tf32:
                    with contextlib.suppress(AttributeError):
                        config.clear_flag(trt.BuilderFlag.TF32)
                if self.fp16:
                    config.set_flag(trt.BuilderFlag.FP16)

                if self.int8:
                    config.set_flag(trt.BuilderFlag.INT8)
                    input_metadata = _input_metadata_from_network(network)
                    with contextlib.suppress(
                            AttributeError
                    ):  # Polygraphy calibrator has a reset method
                        self.calibrator.reset(input_metadata)
                    config.int8_calibrator = self.calibrator

                if self.use_dla:
                    config.default_device_type = trt.DeviceType.DLA
                    config.DLA_core = 0

                if self.allow_gpu_fallback:
                    config.set_flag(trt.BuilderFlag.GPU_FALLBACK)

                if self.layerwise:
                    trt_util.mark_layerwise(network)

                G_LOGGER.info(
                    "Building engine: max workspace size={:} bytes, max batch size={:}, fp16={:}, "
                    "tf32={:}, int8={:}".format(config.max_workspace_size,
                                                builder.max_batch_size,
                                                self.fp16, self.tf32,
                                                self.int8))
                self.engine = builder.build_engine(network, config)

        if not self.engine:
            G_LOGGER.critical(
                "Invalid Engine. Please ensure the engine was built correctly")

        if self.engine_path:
            with open(self.engine_path, "wb") as f:
                G_LOGGER.info("Writing engine to {:}".format(self.engine_path))
                f.write(self.engine.serialize())

        self.context = self.engine.create_execution_context()
        self.input_buffers, self.output_buffers, self.stream = allocate_buffers(
            self.engine)
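
For completeness, a rough sketch of how an inference call would consume the buffers allocated here. This assumes polygraphy.cuda's DeviceArray.copy_from/copy_to helpers and Stream.ptr, and feed_dict is a hypothetical mapping of input names to numpy arrays:

# Copy inputs host -> device, execute, then copy outputs device -> host.
for name, buf in runner.input_buffers.items():
    buf.device.copy_from(feed_dict[name], stream=runner.stream)

# Bindings must follow the engine's binding order, so walk the engine itself.
bindings = []
for binding in runner.engine:
    buf = runner.input_buffers.get(binding) or runner.output_buffers[binding]
    bindings.append(buf.device.ptr)

runner.context.execute_async(batch_size=1, bindings=bindings,
                             stream_handle=runner.stream.ptr)
for buf in runner.output_buffers.values():
    buf.device.copy_to(buf.host, stream=runner.stream)
runner.stream.synchronize()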