def create_inference_graph(input_graph_def,
                           outputs,
                           max_batch_size=1,
                           max_workspace_size_bytes=2 << 20,
                           precision_mode=TrtPrecisionMode.FP32,
                           minimum_segment_size=3,
                           is_dynamic_op=False,
                           maximum_cached_engines=1,
                           cached_engine_batches=None,
                           use_calibration=True,
                           input_saved_model_dir=None,
                           input_saved_model_tags=None,
                           output_saved_model_dir=None,
                           session_config=None):
  """Convert a TF graph so TRT-compatible subgraphs run as TRTEngineOps.

  The graph comes either from `input_graph_def` or, when that is None, from
  the SavedModel in `input_saved_model_dir` (whose variables are frozen into
  constants first). The graph is then rewritten by Grappler using a
  TRT-enabled copy of `session_config`.

  Args:
    input_graph_def: GraphDef holding the model to transform. When None, the
      graph is read from the SavedModel at input_saved_model_dir.
    outputs: list of tensors or node names for the model outputs. Only used
      when input_graph_def is not None.
    max_batch_size: max size for the input batch.
    max_workspace_size_bytes: the maximum GPU temporary memory which the TRT
      engine can use at execution time. This corresponds to the
      'workspaceSize' parameter of nvinfer1::IBuilder::setMaxWorkspaceSize().
    precision_mode: one of TrtPrecisionMode.supported_precision_modes().
    minimum_segment_size: the minimum number of nodes required for a subgraph
      to be replaced by TRTEngineOp.
    is_dynamic_op: whether to generate dynamic TRT ops which will build the
      TRT network and engine at run time.
    maximum_cached_engines: max number of cached TRT engines in dynamic TRT
      ops. If the number of cached engines is already at max but none of them
      can serve the input, the TRTEngineOp will fall back to run the TF
      function based on which the TRTEngineOp is created.
    cached_engine_batches: a list of batch sizes used to create cached
      engines, only used when is_dynamic_op is True. The length of the list
      should be <= maximum_cached_engines, and the dynamic TRT op will use
      this list to determine the batch sizes of the cached engines, instead of
      making the decision on the fly. This is useful when we know the most
      common batch size(s) the application is going to generate.
    use_calibration: ignored unless precision_mode is INT8. If True, a
      calibration graph is created to calibrate the missing ranges; it must be
      converted to an inference graph using calib_graph_to_infer_graph() after
      running calibration. If False, quantization nodes are expected for every
      tensor in the graph (excluding those which will be fused), and a missing
      range is an error. Accuracy may be negatively affected if there is a
      mismatch between which tensors TRT quantizes and which tensors were
      trained with fake quantization.
    input_saved_model_dir: directory of the SavedModel containing the input
      graph. Used only when input_graph_def is None.
    input_saved_model_tags: list of tags to load the SavedModel. Defaults to
      [tag_constants.SERVING].
    output_saved_model_dir: if not None, a SavedModel is built from the
      returned GraphDef and saved to this directory. Only valid when the input
      graph is loaded from a SavedModel, i.e. input_saved_model_dir is
      specified and input_graph_def is None.
    session_config: the ConfigProto used to create a Session. It is also the
      template for the TRT-enabled ConfigProto used for conversion. If not
      specified, a default ConfigProto is used.

  Returns:
    A GraphDef transformed from input_graph_def (or from the SavedModel graph
    when input_graph_def is None), where all TRT compatible subgraphs are
    replaced with TRTEngineOps and a TF function is added for each of the
    subgraphs.

    If is_dynamic_op is True, each TRTEngineOp contains a serialized subgraph
    GraphDef which is converted to a TRT engine at execution time and cached
    for future use. A new engine is created whenever none of the cached
    engines match the input shapes. If executing the engine fails, or the
    number of cached engines reaches maximum_cached_engines, the op falls back
    to the corresponding TF function.

    If is_dynamic_op is False, each TRTEngineOp contains a serialized TRT
    engine created from the corresponding subgraph. No more engines are
    created on the fly, and the op falls back to the corresponding TF function
    when it fails to execute the engine.

  Raises:
    ValueError: if the combination of the parameters is invalid.
    RuntimeError: if the TensorRT library version is incompatible.
  """
  # --- TensorRT version compatibility check. ---
  linked_version = get_linked_tensorrt_version()
  runtime_version = get_loaded_tensorrt_version()
  linked_str = ".".join(str(part) for part in linked_version)
  runtime_str = ".".join(str(part) for part in runtime_version)

  # A loaded library with an older major version than the linked one is fatal.
  if runtime_version[0] < linked_version[0]:
    tf_logging.error(
        "TensorRT version mismatch. Tensorflow was compiled against "
        "TensorRT %s but library loaded from environment is TensorRT %s"
        ". Please make sure that correct version of TensorRT "
        "is available in the system and added to ldconfig or LD_LIBRARY_PATH"
        % (linked_str, runtime_str))
    raise RuntimeError("Incompatible TensorRT library version")

  # Any other component-wise difference only warrants a warning.
  if any(got != want for got, want in zip(runtime_version, linked_version)):
    tf_logging.warn(
        "TensorRT mismatch. Compiled against version "
        "%s, but loaded %s. Things may not work" % (linked_str, runtime_str))
  else:
    tf_logging.info("Running against TensorRT version %s" % runtime_str)

  # --- Fill in defaults. ---
  if session_config is None:
    session_config = config_pb2.ConfigProto()
  if input_saved_model_tags is None:
    input_saved_model_tags = [tag_constants.SERVING]

  # --- Build the MetaGraphDef that is handed to Grappler. ---
  meta_graph = None
  if input_graph_def is not None:
    if output_saved_model_dir is not None:
      raise ValueError("output_saved_model_dir cannot be set when "
                       "input_graph_def is set")
    # Create MetaGraphDef from input graph.
    imported_graph = ops.Graph()
    with imported_graph.as_default():
      importer.import_graph_def(input_graph_def, name="")
    meta_graph = saver.export_meta_graph(
        graph_def=imported_graph.as_graph_def(add_shapes=True),
        graph=imported_graph)
    if outputs:
      outputs_def = meta_graph_pb2.CollectionDef()
      node_list = outputs_def.node_list.value
      for out in outputs:
        # Tensors contribute their name; anything else is used as given.
        node_list.append(
            _to_bytes(out.name if isinstance(out, ops.Tensor) else out))
      # TODO(laigd): use another key as the outputs are really not train_op.
      meta_graph.collection_def["train_op"].CopyFrom(outputs_def)
  else:
    # Read from SavedModel and freeze the graph if necessary.
    if input_saved_model_dir is None:
      raise ValueError("input_graph_def and input_saved_model_dir cannot be "
                       "both None")
    with ops.Graph().as_default(), session.Session(
        config=session_config) as sess:
      loader = loader_impl.SavedModelLoader(input_saved_model_dir)
      loaded_meta_graph = loader.load(sess, input_saved_model_tags)

      # Collect the node names of the inputs and outputs of every
      # SignatureDef; the ":<index>" suffix of the tensor name is dropped.
      node_names = set()
      for sig_key in loaded_meta_graph.signature_def:
        signature = loaded_meta_graph.signature_def[sig_key]
        for tensor_infos in (signature.inputs, signature.outputs):
          node_names.update(
              tensor_infos[info_key].name.split(":")[0]
              for info_key in tensor_infos)

      # Freeze the variables in the SavedModel graph and copy the frozen
      # graph over.
      frozen = graph_util.convert_variables_to_constants(
          sess, sess.graph.as_graph_def(add_shapes=True), list(node_names))
      meta_graph = meta_graph_pb2.MetaGraphDef()
      meta_graph.graph_def.CopyFrom(frozen)

      # Copy the collections that are not variables.
      # TODO(laigd): currently we use the collection key to filter out
      # collections that depend on variable ops, but this may miss some
      # other user-defined collections. A better way would be to use
      # CollectionDef::NodeList for the filtering.
      skipped_collections = (
          "variables", "local_variables", "model_variables",
          "trainable_variables", "train_op", "table_initializer")
      for coll_key in loaded_meta_graph.collection_def:
        if coll_key in skipped_collections:
          continue
        meta_graph.collection_def[coll_key].CopyFrom(
            loaded_meta_graph.collection_def[coll_key])

      # Copy other information.
      meta_graph.meta_info_def.CopyFrom(loaded_meta_graph.meta_info_def)
      for sig_key in loaded_meta_graph.signature_def:
        meta_graph.signature_def[sig_key].CopyFrom(
            loaded_meta_graph.signature_def[sig_key])
      # TODO(laigd): maybe add back AssetFileDef.

  # --- Create TRT-enabled ConfigProto. ---
  trt_session_config = config_pb2.ConfigProto()
  trt_session_config.CopyFrom(session_config)
  has_rewrite_options = (
      trt_session_config.HasField("graph_options") and
      trt_session_config.graph_options.HasField("rewrite_options"))
  base_rewriter_config = (
      trt_session_config.graph_options.rewrite_options
      if has_rewrite_options else None)
  trt_rewriter_config = get_tensorrt_rewriter_config(
      base_rewriter_config, max_batch_size, max_workspace_size_bytes,
      precision_mode, minimum_segment_size, is_dynamic_op,
      maximum_cached_engines, cached_engine_batches, use_calibration)
  trt_session_config.graph_options.rewrite_options.CopyFrom(
      trt_rewriter_config)

  # --- Run Grappler. ---
  converted_graph_def = tf_optimizer.OptimizeGraph(
      trt_session_config, meta_graph, graph_id=b"tf_graph")

  if output_saved_model_dir is None:
    return converted_graph_def

  # Write the transformed graphdef as SavedModel.
  model_builder = builder.SavedModelBuilder(output_saved_model_dir)
  with ops.Graph().as_default():
    importer.import_graph_def(converted_graph_def, name="")
    # We don't use TRT here.
    with session.Session(config=session_config) as sess:
      model_builder.add_meta_graph_and_variables(
          sess,
          input_saved_model_tags,
          signature_def_map=meta_graph.signature_def)
  # Ignore other meta graphs from the input SavedModel.
  model_builder.save()
  return converted_graph_def
def __init__(self,
             input_saved_model_dir=None,
             input_saved_model_tags=None,
             input_saved_model_signature_key=None,
             input_graph_def=None,
             nodes_blacklist=None,
             session_config=None,
             max_batch_size=1,
             max_workspace_size_bytes=DEFAULT_TRT_MAX_WORKSPACE_SIZE_BYTES,
             precision_mode=TrtPrecisionMode.FP32,
             minimum_segment_size=3,
             is_dynamic_op=False,
             maximum_cached_engines=1,
             cached_engine_batches=None,
             use_calibration=True,
             use_function_backup=True):
  """Initialize the converter.

  Args:
    input_saved_model_dir: the directory to load the SavedModel which contains
      the input graph to transforms. Used only when input_graph_def is None.
    input_saved_model_tags: list of tags to load the SavedModel.
    input_saved_model_signature_key: the key of the signature to optimize the
      graph for.
    input_graph_def: a GraphDef object containing a model to be transformed.
      If set to None, the graph will be read from the SavedModel loaded from
      input_saved_model_dir.
    nodes_blacklist: list of node names to prevent the converter from
      touching. Only used when input_graph_def is not None.
    session_config: the ConfigProto used to create a Session. It's also used
      as a template to create a TRT-enabled ConfigProto for conversion. If not
      specified, a default ConfigProto will be used.
    max_batch_size: max size for the input batch.
    max_workspace_size_bytes: the maximum GPU temporary memory which the TRT
      engine can use at execution time. This corresponds to the
      'workspaceSize' parameter of nvinfer1::IBuilder::setMaxWorkspaceSize().
    precision_mode: one of TrtPrecisionMode.supported_precision_modes().
    minimum_segment_size: the minimum number of nodes required for a subgraph
      to be replaced by TRTEngineOp.
    is_dynamic_op: whether to generate dynamic TRT ops which will build the
      TRT network and engine at run time.
    maximum_cached_engines: max number of cached TRT engines in dynamic TRT
      ops. If the number of cached engines is already at max but none of them
      can serve the input, the TRTEngineOp will fall back to run the TF
      function based on which the TRTEngineOp is created.
    cached_engine_batches: a list of batch sizes used to create cached
      engines, only used when is_dynamic_op is True. The length of the list
      should be <= maximum_cached_engines, and the dynamic TRT op will use
      this list to determine the batch sizes of the cached engines, instead of
      making the decision on the fly. This is useful when we know the most
      common batch size(s) the application is going to generate.
    use_calibration: this argument is ignored if precision_mode is not INT8.
      If set to True, a calibration graph will be created to calibrate the
      missing ranges. The calibration graph must be converted to an inference
      graph by running calibration with calibrate(). If set to False,
      quantization nodes will be expected for every tensor in the graph
      (excluding those which will be fused). If a range is missing, an error
      will occur. Please note that accuracy may be negatively affected if
      there is a mismatch between which tensors TRT quantizes and which
      tensors were trained with fake quantization.
    use_function_backup: if set to True, it will create a FunctionDef for each
      subgraph that is converted to TRT op, and if TRT ops fail to execute at
      runtime, it'll invoke that function as a fallback.

  Raises:
    ValueError: if the combination of the parameters is invalid.
    TypeError: if cached_engine_batches is non-empty but not a list.
    RuntimeError: if the TensorRT library version is incompatible.
  """
  super(TrtGraphConverter, self).__init__(
      input_saved_model_dir=input_saved_model_dir,
      input_saved_model_tags=input_saved_model_tags,
      input_saved_model_signature_key=input_saved_model_signature_key,
      input_graph_def=input_graph_def,
      nodes_blacklist=nodes_blacklist,
      session_config=session_config)

  # TODO(laigd): move all the validations below to
  # get_tensorrt_rewriter_config().
  # Lazily load the TF-TRT C bindings, so `import tensorflow` doesn't complain
  # even if it cannot find TensorRT library.
  trt_ops.load_trt_ops()
  # pylint: disable=g-import-not-at-top,line-too-long
  from tensorflow.python.compiler.tensorrt.wrap_conversion import get_linked_tensorrt_version
  from tensorflow.python.compiler.tensorrt.wrap_conversion import get_loaded_tensorrt_version
  # pylint: enable=g-import-not-at-top,line-too-long

  # Check compatibility of TensorRT version.
  compiled_version = get_linked_tensorrt_version()
  loaded_version = get_loaded_tensorrt_version()
  tf_logging.info("Linked TensorRT version: %s" % str(compiled_version))
  tf_logging.info("Loaded TensorRT version: %s" % str(loaded_version))

  # Dotted version strings, built once and reused by every message below.
  compiled_str = ".".join(str(x) for x in compiled_version)
  loaded_str = ".".join(str(x) for x in loaded_version)

  # A loaded library whose major version is older than the one TensorFlow was
  # linked against is treated as fatal.
  if loaded_version[0] < compiled_version[0]:
    tf_logging.error(
        "TensorRT version mismatch. Tensorflow was compiled against "
        "TensorRT %s but library loaded from environment is TensorRT %s"
        ". Please make sure that correct version of TensorRT "
        "is available in the system and added to ldconfig or LD_LIBRARY_PATH"
        % (compiled_str, loaded_str))
    raise RuntimeError("Incompatible TensorRT library version")

  # Any other difference between the version components is only a warning.
  if any(loaded != compiled
         for loaded, compiled in zip(loaded_version, compiled_version)):
    tf_logging.warn(
        "TensorRT mismatch. Compiled against version "
        "%s, but loaded %s. Things may not work" % (compiled_str, loaded_str))
  else:
    tf_logging.info("Running against TensorRT version %s" % loaded_str)

  # Check input arguments.
  supported_precision_modes = TrtPrecisionMode.supported_precision_modes()
  if precision_mode not in supported_precision_modes:
    # Note the trailing space: the two literals are concatenated, and without
    # it the sentences run together ("...supported.It should...").
    raise ValueError(("precision mode '{}' is not supported. "
                      "It should be one of {}").format(
                          precision_mode, supported_precision_modes))

  if cached_engine_batches:
    if not isinstance(cached_engine_batches, list):
      raise TypeError("cached_engine_batches should be a list.")
    if len(cached_engine_batches) > maximum_cached_engines:
      raise ValueError("cached_engine_batches should not contain more than "
                       "maximum_cached_engines items.")

  # Calibration is only needed for INT8 when the caller asked for it.
  self._need_calibration = (
      precision_mode == TrtPrecisionMode.INT8 and use_calibration)
  self._use_function_backup = use_function_backup

  # TODO(laigd): consider provide a mechanism to remove the fallback path
  # after calibration is done.
  if self._need_calibration and not use_function_backup:
    raise ValueError(
        "Calibration requires enabling fallback to TF function execution.")

  # TODO(laigd):
  # - Get rid of is_dynamic_op option, it should always be True, and it should
  #   accept N shapes as input.
  # - Verify in int8 mode that maximum_cached_engines and
  #   cached_engine_batches are set appropriately.
  # - If it fails to build the int8 engine it should return error.
  self._max_batch_size = max_batch_size
  self._max_workspace_size_bytes = max_workspace_size_bytes
  self._precision_mode = precision_mode
  self._minimum_segment_size = minimum_segment_size
  self._is_dynamic_op = is_dynamic_op
  self._maximum_cached_engines = maximum_cached_engines
  self._cached_engine_batches = cached_engine_batches
def __init__(self,
             input_saved_model_dir=None,
             input_saved_model_tags=None,
             input_saved_model_signature_key=None,
             input_graph_def=None,
             nodes_blacklist=None,
             session_config=None,
             max_batch_size=1,
             max_workspace_size_bytes=DEFAULT_TRT_MAX_WORKSPACE_SIZE_BYTES,
             precision_mode=TrtPrecisionMode.FP32,
             minimum_segment_size=3,
             is_dynamic_op=False,
             maximum_cached_engines=1,
             cached_engine_batches=None,
             use_calibration=True,
             use_function_backup=True):
  """Initialize the converter.

  Args:
    input_saved_model_dir: the directory to load the SavedModel which contains
      the input graph to transforms. Used only when input_graph_def is None.
    input_saved_model_tags: list of tags to load the SavedModel.
    input_saved_model_signature_key: the key of the signature to optimize the
      graph for.
    input_graph_def: a GraphDef object containing a model to be transformed.
      If set to None, the graph will be read from the SavedModel loaded from
      input_saved_model_dir.
    nodes_blacklist: list of node names to prevent the converter from
      touching. Only used when input_graph_def is not None.
    session_config: the ConfigProto used to create a Session. It's also used
      as a template to create a TRT-enabled ConfigProto for conversion. If not
      specified, a default ConfigProto will be used.
    max_batch_size: max size for the input batch.
    max_workspace_size_bytes: the maximum GPU temporary memory which the TRT
      engine can use at execution time. This corresponds to the
      'workspaceSize' parameter of nvinfer1::IBuilder::setMaxWorkspaceSize().
    precision_mode: one of TrtPrecisionMode.supported_precision_modes().
    minimum_segment_size: the minimum number of nodes required for a subgraph
      to be replaced by TRTEngineOp.
    is_dynamic_op: whether to generate dynamic TRT ops which will build the
      TRT network and engine at run time.
    maximum_cached_engines: max number of cached TRT engines in dynamic TRT
      ops. If the number of cached engines is already at max but none of them
      can serve the input, the TRTEngineOp will fall back to run the TF
      function based on which the TRTEngineOp is created.
    cached_engine_batches: a list of batch sizes used to create cached
      engines, only used when is_dynamic_op is True. The length of the list
      should be <= maximum_cached_engines, and the dynamic TRT op will use
      this list to determine the batch sizes of the cached engines, instead of
      making the decision on the fly. This is useful when we know the most
      common batch size(s) the application is going to generate.
    use_calibration: this argument is ignored if precision_mode is not INT8.
      If set to True, a calibration graph will be created to calibrate the
      missing ranges. The calibration graph must be converted to an inference
      graph using calib_graph_to_infer_graph() after running calibration. If
      set to False, quantization nodes will be expected for every tensor in
      the graph (excluding those which will be fused). If a range is missing,
      an error will occur. Please note that accuracy may be negatively
      affected if there is a mismatch between which tensors TRT quantizes and
      which tensors were trained with fake quantization.
    use_function_backup: if set to True, it will create a FunctionDef for each
      subgraph that is converted to TRT op, and if TRT ops fail to execute at
      runtime, it'll invoke that function as a fallback.

  Raises:
    ValueError: if the combination of the parameters is invalid.
    TypeError: if cached_engine_batches is non-empty but not a list.
    RuntimeError: if the TensorRT library version is incompatible.
  """
  super(TrtGraphConverter, self).__init__(
      input_saved_model_dir=input_saved_model_dir,
      input_saved_model_tags=input_saved_model_tags,
      input_saved_model_signature_key=input_saved_model_signature_key,
      input_graph_def=input_graph_def,
      nodes_blacklist=nodes_blacklist,
      session_config=session_config)

  # TODO(laigd): move all the validations below to
  # get_tensorrt_rewriter_config().
  # Lazily load the TF-TRT C bindings, so `import tensorflow` doesn't complain
  # even if it cannot find TensorRT library.
  trt_ops.load_trt_ops()
  # pylint: disable=g-import-not-at-top,line-too-long
  from tensorflow.python.compiler.tensorrt.wrap_conversion import get_linked_tensorrt_version
  from tensorflow.python.compiler.tensorrt.wrap_conversion import get_loaded_tensorrt_version
  # pylint: enable=g-import-not-at-top,line-too-long

  # Check compatibility of TensorRT version.
  compiled_version = get_linked_tensorrt_version()
  loaded_version = get_loaded_tensorrt_version()

  # Dotted version strings, built once and reused by every message below.
  compiled_str = ".".join(str(x) for x in compiled_version)
  loaded_str = ".".join(str(x) for x in loaded_version)

  # A loaded library whose major version is older than the one TensorFlow was
  # linked against is treated as fatal.
  if loaded_version[0] < compiled_version[0]:
    tf_logging.error(
        "TensorRT version mismatch. Tensorflow was compiled against "
        "TensorRT %s but library loaded from environment is TensorRT %s"
        ". Please make sure that correct version of TensorRT "
        "is available in the system and added to ldconfig or LD_LIBRARY_PATH"
        % (compiled_str, loaded_str))
    raise RuntimeError("Incompatible TensorRT library version")

  # Any other difference between the version components is only a warning.
  if any(loaded != compiled
         for loaded, compiled in zip(loaded_version, compiled_version)):
    tf_logging.warn(
        "TensorRT mismatch. Compiled against version "
        "%s, but loaded %s. Things may not work" % (compiled_str, loaded_str))
  else:
    tf_logging.info("Running against TensorRT version %s" % loaded_str)

  # Check input arguments.
  # The supported modes are fetched by *calling* the accessor once; the result
  # is used both for the membership test and in the error message (formatting
  # the uncalled method would print its repr instead of the modes).
  supported_precision_modes = TrtPrecisionMode.supported_precision_modes()
  if precision_mode not in supported_precision_modes:
    # Note the trailing space: the two literals are concatenated, and without
    # it the sentences run together ("...supported.It should...").
    raise ValueError(("precision mode '{}' is not supported. "
                      "It should be one of {}").format(
                          precision_mode, supported_precision_modes))

  if cached_engine_batches:
    if not isinstance(cached_engine_batches, list):
      raise TypeError("cached_engine_batches should be a list.")
    if len(cached_engine_batches) > maximum_cached_engines:
      raise ValueError("cached_engine_batches should not contain more than "
                       "maximum_cached_engines items.")

  # Calibration is only needed for INT8 when the caller asked for it.
  self._need_calibration = (
      precision_mode == TrtPrecisionMode.INT8 and use_calibration)
  self._use_function_backup = use_function_backup

  # TODO(laigd): consider provide a mechanism to remove the fallback path
  # after calibration is done.
  if self._need_calibration and not use_function_backup:
    raise ValueError(
        "Calibration requires enabling fallback to TF function execution.")

  # TODO(laigd):
  # - Get rid of is_dynamic_op option, it should always be True, and it should
  #   accept N shapes as input.
  # - Verify in int8 mode that maximum_cached_engines and
  #   cached_engine_batches are set appropriately.
  # - If it fails to build the int8 engine it should return error.
  self._max_batch_size = max_batch_size
  self._max_workspace_size_bytes = max_workspace_size_bytes
  self._precision_mode = precision_mode
  self._minimum_segment_size = minimum_segment_size
  self._is_dynamic_op = is_dynamic_op
  self._maximum_cached_engines = maximum_cached_engines
  self._cached_engine_batches = cached_engine_batches