def user(multi_engine, run_graph=execute_graph, run_calibration=execute_calibration):
    """Example function that converts a graph to TFTRT graph."""
    if multi_engine:
        inp_dims = (2, 3, 7, 5)
        orig_graph = get_multi_engine_graph_def()
    else:
        inp_dims = (100, 24, 24, 2)
        orig_graph = get_simple_graph_def()  # use a frozen graph for inference
    dummy_input = np.random.random_sample(inp_dims)
    # Get optimized graph
    trt_graph = trt.create_inference_graph(
        input_graph_def=orig_graph,
        outputs=["output"],
        max_batch_size=inp_dims[0],
        max_workspace_size_bytes=1 << 25,
        precision_mode="FP32",  # TRT engine precision: "FP32", "FP16" or "INT8"
        minimum_segment_size=2,  # minimum number of nodes in an engine
        is_dynamic_op=False,
        maximum_cached_engines=1,
        cached_engine_batches=[])
    o1 = run_graph(orig_graph, dummy_input)
    o2 = run_graph(trt_graph, dummy_input)
    o3 = run_graph(trt_graph, dummy_input)
    assert np.array_equal(o1, o2)
    assert np.array_equal(o3, o2)  # sanity check
    fp16_graph = trt.create_inference_graph(
        input_graph_def=orig_graph,
        outputs=["output"],
        max_batch_size=inp_dims[0],
        max_workspace_size_bytes=1 << 25,
        precision_mode="FP16",  # TRT engine precision: "FP32", "FP16" or "INT8"
        minimum_segment_size=2,  # minimum number of nodes in an engine
        is_dynamic_op=False,
        maximum_cached_engines=1,
        cached_engine_batches=[])
    int8_calib_gdef = trt.create_inference_graph(
        input_graph_def=orig_graph,
        outputs=["output"],
        max_batch_size=inp_dims[0],
        max_workspace_size_bytes=1 << 25,
        precision_mode="INT8",  # TRT engine precision: "FP32", "FP16" or "INT8"
        minimum_segment_size=2,  # minimum number of nodes in an engine
        is_dynamic_op=False,
        maximum_cached_engines=1,
        cached_engine_batches=[])
    o4 = run_graph(fp16_graph, dummy_input)
    _ = run_calibration(int8_calib_gdef, dummy_input)
    int8_graph = trt.calib_graph_to_infer_graph(int8_calib_gdef)
    o5 = run_graph(int8_graph, dummy_input)
    print("Is FP32 == FP16? %s (False is possible)" % np.allclose(o1, o4))
    print("Is FP32 == INT8? %s (False is possible)" % np.allclose(o1, o5))
    print("Pass")
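# The example above assumes two helpers, execute_graph() and
# execute_calibration(). A minimal sketch under those assumptions, modeled on
# the run_graph() helper shown at the end of this section; the "input"/"output"
# tensor names match the toy graphs used here:
import tensorflow as tf

def execute_graph(gdef, dummy_input):
    """Import a GraphDef and run one inference pass."""
    g = tf.Graph()
    with g.as_default():
        inp, out = tf.import_graph_def(
            graph_def=gdef, return_elements=["input", "output"])
        inp, out = inp.outputs[0], out.outputs[0]
    with tf.Session(graph=g) as sess:
        return sess.run(out, {inp: dummy_input})

def execute_calibration(gdef, dummy_input):
    """Run several batches through the INT8 calibration graph."""
    g = tf.Graph()
    with g.as_default():
        inp, out = tf.import_graph_def(
            graph_def=gdef, return_elements=["input", "output"])
        inp, out = inp.outputs[0], out.outputs[0]
    with tf.Session(graph=g) as sess:
        # Mimic a 30-batch calibration set; use real data in practice.
        for _ in range(30):
            val = sess.run(out, {inp: dummy_input})
    return val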
def create_optimized_trt_graph(frozen_path, saving_path, output_node, precision):
    """
    @param frozen_path: location of the original unoptimized frozen_graph.pb
    @param saving_path: where do you want the new .pb to be saved?
    @param output_node: name of the ANN's output node
    @param precision: precision for optimization (e.g. FP16)
    """
    # Make sure the output node name carries the ':0' tensor suffix.
    if len(output_node.split(':0')) < 2:
        output_node = output_node + ':0'
    alloc_space_TensorRT = 2  # GB reserved for the TensorRT workspace
    # Fraction of an 8 GB GPU left for TensorFlow (per-process GPU memory fraction).
    ppgmf = (8 - alloc_space_TensorRT) / 8
    max_workspace_size_bytes = alloc_space_TensorRT * 1000000000
    with gfile.FastGFile(frozen_path, 'rb') as f:
        graph_def = tf.GraphDef()
        graph_def.ParseFromString(f.read())
    trt_graph = trt.create_inference_graph(
        input_graph_def=graph_def,
        outputs=[output_node],
        max_batch_size=32,
        max_workspace_size_bytes=max_workspace_size_bytes,
        minimum_segment_size=1,
        precision_mode=precision)
    path_new_frozen_pb = saving_path + "/newFrozenModel_TRT_" + precision + ".pb"
    with gfile.FastGFile(path_new_frozen_pb, 'wb') as fp:
        fp.write(trt_graph.SerializeToString())
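# A minimal invocation of create_optimized_trt_graph(); the paths and node
# name below are placeholders for illustration only:
create_optimized_trt_graph(
    frozen_path='./models/frozen_graph.pb',  # hypothetical path
    saving_path='./models',                  # hypothetical path
    output_node='softmax_output',            # hypothetical node; ':0' is appended
    precision='FP16')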
def main():
    # Open frozen.pb
    frozen_graph_def = get_frozen_graph('./TENSORFLOW_FROZEN.pb')
    # output node
    output_nodes = ['tower_0/refine_out/BatchNorm/FusedBatchNorm']
    # TensorRT inference graph
    trt_graph = trt.create_inference_graph(
        frozen_graph_def,
        output_nodes,
        max_batch_size=1,
        max_workspace_size_bytes=(2 << 10) << 20,
        precision_mode='FP16')
    print('!!!!!! trt graph created !!!!!!')
    # Write 'TRT_FP16.pb'
    write_graph_to_file('TRT_FP16.pb', trt_graph, './')
    # check how many ops are in the original frozen model
    all_nodes = len([1 for n in frozen_graph_def.node])
    print("numb. of all_nodes in frozen graph:", all_nodes)
    # check how many ops were converted to TensorRT engines
    trt_engine_nodes = len(
        [1 for n in trt_graph.node if str(n.op) == 'TRTEngineOp'])
    print("numb. of trt_engine_nodes in TensorRT graph:", trt_engine_nodes)
    all_nodes = len([1 for n in trt_graph.node])
    print("numb. of all_nodes in TensorRT graph:", all_nodes)
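# The two helpers above, get_frozen_graph() and write_graph_to_file(), are not
# shown in this snippet. A minimal sketch of what they plausibly look like,
# based on the read/write patterns used elsewhere in this section (the bodies
# are assumptions, not the original implementations):
import os
import tensorflow as tf

def get_frozen_graph(pb_path):
    """Read a frozen GraphDef from a .pb file."""
    graph_def = tf.GraphDef()
    with tf.gfile.GFile(pb_path, 'rb') as f:
        graph_def.ParseFromString(f.read())
    return graph_def

def write_graph_to_file(graph_name, graph_def, output_dir):
    """Serialize a GraphDef to <output_dir>/<graph_name>."""
    with tf.gfile.GFile(os.path.join(output_dir, graph_name), 'wb') as f:
        f.write(graph_def.SerializeToString())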
def main():
    if not file_changed(os.path.join(SAVED_MODEL_DIR, FROZEN_GRAPH_NAME)):
        print("Frozen graph not changed, not rebuilding")
        return
    # What model to run from - should be the directory name of an exported
    # trained model. Change me to the directory checkpoint files are saved in.
    frozen_graph_name = os.path.join(SAVED_MODEL_DIR, FROZEN_GRAPH_NAME)
    if not os.path.isfile(frozen_graph_name):
        print("Frozen graph not found, building...")
        build_frozen_graph(
            config=CONFIG_FILE,
            checkpoint=os.path.join(
                SAVED_MODEL_DIR, MODEL_CHECKPOINT_PREFIX + CHECKPOINT_NUMBER),
            score_threshold=0.2,
            batch_size=1)
    else:
        print("Frozen graph found, not rebuilding...")
    # read frozen graph from file
    frozen_graph, input_names, output_names = load_frozen_graph(
        frozen_graph_name)
    trt_graph = trt.create_inference_graph(
        input_graph_def=frozen_graph,
        outputs=output_names,
        max_batch_size=1,
        max_workspace_size_bytes=1 << 25,
        precision_mode='FP16',  # TODO - FP16 or INT8 for Jetson
        minimum_segment_size=50)
    with open(os.path.join(SAVED_MODEL_DIR, TRT_OUTPUT_GRAPH), 'wb') as f:
        f.write(trt_graph.SerializeToString())
def TF_to_TRT():
    with tf.Session(config=tf.ConfigProto(gpu_options=tf.GPUOptions(
            per_process_gpu_memory_fraction=0.3))) as sess:
        saver = tf.train.import_meta_graph("./tensorRT/model.meta")
        saver.restore(sess, "./tensorRT/model")
        your_outputs = ["fcn21/truediv"]
        frozen_graph = tf.graph_util.convert_variables_to_constants(
            sess,
            tf.get_default_graph().as_graph_def(),
            output_node_names=your_outputs)
        with gfile.FastGFile("./tensorRT/frozen_model.pb", 'wb') as f:
            f.write(frozen_graph.SerializeToString())
        print("Frozen model is successfully stored!")
        trt_graph = trt.create_inference_graph(
            input_graph_def=frozen_graph,
            outputs=your_outputs,
            max_batch_size=1,
            max_workspace_size_bytes=930000000,
            precision_mode=PRECISION)
        with gfile.FastGFile("./tensorRT/" + NAME, 'wb') as f:
            f.write(trt_graph.SerializeToString())
        print("TensorRT model is successfully stored!")
        all_nodes = len([1 for n in frozen_graph.node])
        print("numb. of all_nodes in frozen graph:", all_nodes)
        trt_engine_nodes = len(
            [1 for n in trt_graph.node if str(n.op) == 'TRTEngineOp'])
        print("numb. of trt_engine_nodes in TensorRT graph:", trt_engine_nodes)
        all_nodes = len([1 for n in trt_graph.node])
        print("numb. of all_nodes in TensorRT graph:", all_nodes)
    return trt_graph
def get_trt_graph(graph_name, graph_def, precision_mode, output_dir,
                  output_node, batch_size=128, workspace_size=1 << 30):
    """Create and save inference graph using the TensorRT library.

    Args:
        graph_name: string, name of the graph to be used for saving.
        graph_def: GraphDef, the Frozen Graph to be converted.
        precision_mode: string, the precision that TensorRT should convert into.
            Options: FP32, FP16, INT8.
        output_dir: string, the path to where files should be written.
        output_node: string, the name of the output node that will be returned
            during inference.
        batch_size: int, the number of examples that will be predicted at a time.
        workspace_size: long, size in bytes that can be used during conversion.

    Returns:
        GraphDef for the TensorRT inference graph.
    """
    trt_graph = trt.create_inference_graph(
        graph_def, [output_node],
        max_batch_size=batch_size,
        max_workspace_size_bytes=workspace_size,
        precision_mode=precision_mode)
    write_graph_to_file(graph_name, trt_graph, output_dir)
    return trt_graph
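# A hedged usage sketch for get_trt_graph(), reusing the get_frozen_graph()
# helper sketched above; the file and node names are illustrative:
frozen = get_frozen_graph('./resnet_frozen.pb')
trt_graph = get_trt_graph(
    graph_name='resnet_trt_fp16.pb',
    graph_def=frozen,
    precision_mode='FP16',
    output_dir='./optimized',
    output_node='resnet_v1_50/predictions/Reshape_1')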
def get_trt_graph(graph_name, graph_def, precision_mode, output_dir,
                  output_node, batch_size=128, workspace_size=2 << 10):
    """Create and save inference graph using the TensorRT library.

    Args:
        graph_name: string, name of the graph to be used for saving.
        graph_def: GraphDef, the Frozen Graph to be converted.
        precision_mode: string, the precision that TensorRT should convert into.
            Options: FP32, FP16, INT8.
        output_dir: string, the path to where files should be written.
        output_node: string, the name of the output node that will be returned
            during inference.
        batch_size: int, the number of examples that will be predicted at a time.
        workspace_size: int, size in megabytes that can be used during conversion.

    Returns:
        GraphDef for the TensorRT inference graph.
    """
    trt_graph = trt.create_inference_graph(
        graph_def, [output_node],
        max_batch_size=batch_size,
        max_workspace_size_bytes=workspace_size << 20,  # MB -> bytes
        precision_mode=precision_mode)
    write_graph_to_file(graph_name, trt_graph, output_dir)
    return trt_graph
def build_trt_pb(model_name, pb_path, download_dir='data'):
    """Build TRT model from the original TF model, and save the graph
    into a pb file for faster access in the future.

    The code was mostly taken from the following example by NVIDIA:
    https://github.com/NVIDIA-Jetson/tf_trt_models/blob/master/examples/detection/detection.ipynb
    """
    from tf_trt_models.detection import download_detection_model
    from tf_trt_models.detection import build_detection_graph
    from utils.egohands_models import get_egohands_model

    if 'coco' in model_name:
        config_path, checkpoint_path = \
            download_detection_model(model_name, download_dir)
    else:
        config_path, checkpoint_path = \
            get_egohands_model(model_name)
    frozen_graph_def, input_names, output_names = build_detection_graph(
        config_path=config_path, checkpoint=checkpoint_path)
    trt_graph_def = trt.create_inference_graph(
        input_graph_def=frozen_graph_def,
        outputs=output_names,
        max_batch_size=1,
        max_workspace_size_bytes=1 << 26,
        precision_mode='FP16',
        minimum_segment_size=50)
    with open(pb_path, 'wb') as pf:
        pf.write(trt_graph_def.SerializeToString())
def main(argv):
    del argv  # Unused.
    original_saved_model_dir = FLAGS.saved_model_dir.rstrip('/')
    tensorrt_saved_model_dir = '{}_trt'.format(original_saved_model_dir)
    # Converts `SavedModel` to TensorRT inference graph.
    trt.create_inference_graph(
        None,
        None,
        input_saved_model_dir=original_saved_model_dir,
        output_saved_model_dir=tensorrt_saved_model_dir)
    print('Model conversion completed.')
    # Gets the image.
    get_image_response = requests.get(FLAGS.image_url)
    number = FLAGS.number
    saved_model_dirs = [original_saved_model_dir, tensorrt_saved_model_dir]
    latencies = {}
    for saved_model_dir in saved_model_dirs:
        with tf.Graph().as_default():
            with tf.Session() as sess:
                # Loads the saved model.
                loader.load(sess, [tag_constants.SERVING], saved_model_dir)
                print('Model loaded {}'.format(saved_model_dir))

                def _run_inf(session=sess, n=1):
                    """Runs inference repeatedly."""
                    for _ in range(n):
                        session.run(
                            FLAGS.model_outputs,
                            feed_dict={
                                FLAGS.model_input: [get_image_response.content]
                            })

                # Run inference once to perform XLA compile step.
                _run_inf(sess, 1)
                start = time.time()
                _run_inf(sess, number)
                end = time.time()
                latencies[saved_model_dir] = end - start
    print('Time to run {} predictions:'.format(number))
    for saved_model_dir, latency in latencies.items():
        print('* {} seconds for {} runs for {}'.format(
            latency, number, saved_model_dir))
def getINT8CalibGraph(input_file, output_prefix, output,
                      batch_size=128, workspace_size=1 << 30):
    # Build the INT8 calibration graph.
    trt_graph = trt.create_inference_graph(
        getGraph(input_file), output,
        max_batch_size=batch_size,
        max_workspace_size_bytes=workspace_size,
        precision_mode="INT8")  # calibration
    with gfile.FastGFile(output_prefix + '.INT8Calib.pb', 'wb') as f:
        f.write(trt_graph.SerializeToString())
    return trt_graph
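# INT8 is a two-step conversion: build a calibration graph, feed it
# representative data, then finalize it. A sketch of the full flow, using the
# execute_calibration() helper sketched earlier (arguments are illustrative):
calib_graph = getINT8CalibGraph('model.pb', 'model', ['output'])
_ = execute_calibration(calib_graph, calibration_batch)  # calibration_batch: your real data
int8_graph = trt.calib_graph_to_infer_graph(calib_graph)  # final INT8 inference graph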
def getFP16(input_file, output_prefix, output,
            batch_size=128, workspace_size=1 << 30):
    # Get optimized graph
    trt_graph = trt.create_inference_graph(
        getGraph(input_file), output,
        max_batch_size=batch_size,
        max_workspace_size_bytes=workspace_size,
        precision_mode="FP16")
    with gfile.FastGFile(output_prefix + '.FP16.pb', 'wb') as f:
        f.write(trt_graph.SerializeToString())
    return trt_graph
def getFP16(batch_size=128, workspace_size=1 << 30):
    # Get optimized graph
    trt_graph = trt.create_inference_graph(
        getResnet50(), ["resnet_v1_50/predictions/Reshape_1"],
        max_batch_size=batch_size,
        max_workspace_size_bytes=workspace_size,
        precision_mode="FP16")
    with gfile.FastGFile("resnetV150_TRTFP16.pb", 'wb') as f:
        f.write(trt_graph.SerializeToString())
    return trt_graph
def optimize_ocr(self, input_graph_def):
    output_graph_def = trt.create_inference_graph(
        input_graph_def=input_graph_def,
        outputs=['predicted_chars', 'predicted_scores'],
        max_batch_size=1,
        # max_workspace_size_bytes=(2 << 10) << 20,
        precision_mode=self.precision)
    return output_graph_def
def getINT8CalibGraph(batch_size=128, workspace_size=1 << 30):
    trt_graph = trt.create_inference_graph(
        getResnet50(), ["resnet_v1_50/predictions/Reshape_1"],
        max_batch_size=batch_size,
        max_workspace_size_bytes=workspace_size,
        precision_mode="INT8")  # calibration
    with gfile.FastGFile("resnetV150_TRTINT8Calib.pb", 'wb') as f:
        f.write(trt_graph.SerializeToString())
    return trt_graph
def getFP16(batch_size, workspace_size, network, output_nodes):
    # Get optimized graph
    trt_graph = trt.create_inference_graph(
        getResnet50(network), [output_nodes],
        max_batch_size=batch_size,
        max_workspace_size_bytes=workspace_size,
        precision_mode="FP16")
    with gfile.FastGFile("resnetV250_TRTFP16_chest.pb", 'wb') as f:
        f.write(trt_graph.SerializeToString())
    return trt_graph
def getINT8CalibGraph(batch_size, workspace_size, network, output_nodes):
    trt_graph = trt.create_inference_graph(
        getResnet50(network), [output_nodes],
        max_batch_size=batch_size,
        max_workspace_size_bytes=workspace_size,
        precision_mode="INT8")  # calibration
    with gfile.FastGFile("resnetV250_TRTINT8Calib_chest.pb", 'wb') as f:
        f.write(trt_graph.SerializeToString())
    return trt_graph
def get_frozen_tftrt_model(bert_config, shape, use_one_hot_embeddings, init_checkpoint):
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    output_node_names = ['unstack']

    with tf.Session(config=tf_config) as tf_sess:
        input_ids = tf.placeholder(tf.int32, shape, 'input_ids')
        input_mask = tf.placeholder(tf.int32, shape, 'input_mask')
        segment_ids = tf.placeholder(tf.int32, shape, 'segment_ids')
        (start_logits, end_logits) = create_model(
            bert_config=bert_config,
            is_training=False,
            input_ids=input_ids,
            input_mask=input_mask,
            segment_ids=segment_ids,
            use_one_hot_embeddings=use_one_hot_embeddings)

        tvars = tf.trainable_variables()
        (assignment_map,
         initialized_variable_names) = modeling.get_assignment_map_from_checkpoint(
             tvars, init_checkpoint)
        tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
        tf_sess.run(tf.global_variables_initializer())
        print("LOADED!")
        tf.logging.info("**** Trainable Variables ****")
        for var in tvars:
            if var.name in initialized_variable_names:
                init_string = ", *INIT_FROM_CKPT*"
            else:
                init_string = ", *NOT_INIT_FROM_CKPT*"
            tf.logging.info("  name = %s, shape = %s%s", var.name, var.shape,
                            init_string)

        frozen_graph = tf.graph_util.convert_variables_to_constants(
            tf_sess, tf_sess.graph.as_graph_def(), output_node_names)
        num_nodes = len(frozen_graph.node)
        print('Converting graph using TensorFlow-TensorRT...')
        import tensorflow.contrib.tensorrt as trt
        frozen_graph = trt.create_inference_graph(
            input_graph_def=frozen_graph,
            outputs=output_node_names,
            max_batch_size=FLAGS.predict_batch_size,
            max_workspace_size_bytes=(4096 << 20) - 1000,
            precision_mode="FP16" if FLAGS.use_fp16 else "FP32",
            minimum_segment_size=4,
            is_dynamic_op=True,
            maximum_cached_engines=1000)
        print('Total node count before and after TF-TRT conversion:',
              num_nodes, '->', len(frozen_graph.node))
        print('TRT node count:',
              len([1 for n in frozen_graph.node if str(n.op) == 'TRTEngineOp']))
        with tf.gfile.GFile("frozen_modelTRT.pb", "wb") as f:
            f.write(frozen_graph.SerializeToString())
    return frozen_graph
def read_pb_graph(model):
    with gfile.FastGFile(model, 'rb') as f:
        graph_def = tf.GraphDef()
        graph_def.ParseFromString(f.read())
    return graph_def


def optimize_model(args):
    checkpoint = tf.train.get_checkpoint_state(args.model_dir)
    input_checkpoint = checkpoint.model_checkpoint_path
    absolute_model_dir = "/".join(input_checkpoint.split('/')[:-1])
    with tf.Session(config=tf.ConfigProto(gpu_options=tf.GPUOptions(
            per_process_gpu_memory_fraction=0.5))) as sess:
        saver = tf.train.import_meta_graph(input_checkpoint + '.meta',
                                           clear_devices=True)
        saver.restore(sess, input_checkpoint)
        your_outputs = [args.output_tensors]
        frozen_graph = tf.graph_util.convert_variables_to_constants(
            sess,
            tf.get_default_graph().as_graph_def(),
            output_node_names=your_outputs)
        with gfile.FastGFile('model/model.pb', 'wb') as f:
            f.write(frozen_graph.SerializeToString())
        print("Frozen model is successfully stored!")

        trt_graph = trt.create_inference_graph(
            input_graph_def=frozen_graph,
            outputs=your_outputs,
            max_batch_size=2,
            max_workspace_size_bytes=2 * (10 ** 9),
            precision_mode="FP32")
        with gfile.FastGFile('model/tensorrt_model.pb', 'wb') as f:
            f.write(trt_graph.SerializeToString())
        print("tensorRT model is successfully stored!")

        all_nodes = len([1 for n in frozen_graph.node])
        print("no. of nodes in frozen model:", all_nodes)
        tensorrt_engine_nodes = len(
            [1 for n in trt_graph.node if str(n.op) == 'TRTEngineOp'])
        print("no. of TRT engine nodes in trt model graph:", tensorrt_engine_nodes)
        all_nodes = len([1 for n in trt_graph.node])
        print("no. of nodes in trt model:", all_nodes)

    # Reload the optimized graph and look up the I/O tensors as a sanity check.
    MODEL_PATH = 'model/tensorrt_model.pb'
    graph = tf.Graph()
    with graph.as_default():
        with tf.Session(config=tf.ConfigProto(gpu_options=tf.GPUOptions(
                per_process_gpu_memory_fraction=0.5))) as sess:
            trt_graph = read_pb_graph(MODEL_PATH)
            tf.import_graph_def(trt_graph, name='')
            input = sess.graph.get_tensor_by_name(args.input_tensor + ':0')
            output = sess.graph.get_tensor_by_name(args.output_tensors + ':0')


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_dir", type=str, default="",
                        help="Model folder to export")
    parser.add_argument("--input_tensor", type=str,
                        help="name of input tensor")
    parser.add_argument("--output_tensors", type=str,
                        help="name of output tensors")
    args = parser.parse_args()
    optimize_model(args)
def _GetTrtGraph(self, gdef, precision_mode, is_dynamic_op):
    """Return trt converted graph."""
    return trt.create_inference_graph(
        input_graph_def=gdef,
        outputs=[OUTPUT_NAME],
        max_batch_size=self._input.shape[0],
        max_workspace_size_bytes=1 << 25,
        precision_mode=precision_mode,
        minimum_segment_size=2,
        is_dynamic_op=is_dynamic_op)
def load_frozen_model(self):
    if self.model != self.prev_model:
        self.prev_model = self.model
        rospy.loginfo("load a new frozen model {}".format(self.model))
        detection_graph = tf.Graph()
        try:
            # Try to load a previously converted TensorRT graph from disk.
            trt_graph = tf.GraphDef()
            with tf.gfile.GFile(
                    os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                 self.model + "/trt.pb"), "rb") as f:
                serialized_trt_graph = f.read()
                trt_graph.ParseFromString(serialized_trt_graph)
            rospy.loginfo("loading graph from file")
        except:
            # Fall back to converting the frozen graph from scratch.
            od_graph_def = tf.GraphDef()
            with tf.gfile.GFile(
                    os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                 self.model + "/frozen_inference_graph.pb"),
                    'rb') as fid:
                serialized_graph = fid.read()
                od_graph_def.ParseFromString(serialized_graph)
            trt_graph = trt.create_inference_graph(
                input_graph_def=od_graph_def,
                outputs=[
                    "detection_boxes:0", "detection_scores:0",
                    "detection_classes:0", "num_detections:0"
                ],
                max_batch_size=1,
                max_workspace_size_bytes=1 << 25,
                precision_mode="FP32",
                is_dynamic_op=False,
                minimum_segment_size=50)
            rospy.loginfo("loading graph from scratch")
            # Cache the converted graph so the next start-up can skip conversion.
            with open(
                    os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                 self.model + "/trt.pb"), "wb") as f:
                f.write(trt_graph.SerializeToString())
        with detection_graph.as_default():
            rospy.loginfo("finish generating tensorrt engine")
            tf.import_graph_def(trt_graph, name='')
        rospy.loginfo("model is loaded!")
        return detection_graph, None, None
    else:
        rospy.loginfo("keep the previous model")
        return self.detection_graph, None, None
def user(run_graph=execute_graph, run_calibration=execute_calibration):
    """Example function that converts a graph to TFTRT graph."""
    inp_dims = (100, 24, 24, 2)
    dummy_input = np.random.random_sample(inp_dims)
    orig_graph = get_simple_graph_def()  # use a frozen graph for inference
    # Get optimized graph
    trt_graph = trt.create_inference_graph(
        input_graph_def=orig_graph,
        outputs=["output"],
        max_batch_size=inp_dims[0],
        max_workspace_size_bytes=1 << 25,
        precision_mode="FP32",  # TRT engine precision: "FP32", "FP16" or "INT8"
        minimum_segment_size=2)  # minimum number of nodes in an engine
    o1 = run_graph(orig_graph, dummy_input)
    o2 = run_graph(trt_graph, dummy_input)
    o3 = run_graph(trt_graph, dummy_input)
    assert np.array_equal(o1, o2)
    assert np.array_equal(o3, o2)  # sanity check
    fp16_graph = trt.create_inference_graph(
        input_graph_def=orig_graph,
        outputs=["output"],
        max_batch_size=inp_dims[0],
        max_workspace_size_bytes=1 << 25,
        precision_mode="FP16",  # TRT engine precision: "FP32", "FP16" or "INT8"
        minimum_segment_size=2)  # minimum number of nodes in an engine
    int8_calib_gdef = trt.create_inference_graph(
        input_graph_def=orig_graph,
        outputs=["output"],
        max_batch_size=inp_dims[0],
        max_workspace_size_bytes=1 << 25,
        precision_mode="INT8",  # TRT engine precision: "FP32", "FP16" or "INT8"
        minimum_segment_size=2)  # minimum number of nodes in an engine
    o4 = run_graph(fp16_graph, dummy_input)
    _ = run_calibration(int8_calib_gdef, dummy_input)
    int8_graph = trt.calib_graph_to_infer_graph(int8_calib_gdef)
    o5 = run_graph(int8_graph, dummy_input)
    assert np.allclose(o1, o4)
    assert np.allclose(o1, o5)
    print("Pass")
def get_trt(self, grf, precision):
    re = trt.create_inference_graph(
        grf,
        [
            self.out_nd_str[0], self.out_nd_str[1],
            self.out_nd_str[2], self.out_nd_str[3]
        ],
        max_batch_size=20,
        max_workspace_size_bytes=2 << 10 << 20,
        precision_mode=precision,
        minimum_segment_size=10)
    return re
def convert_with_tensorrt(args):
    """Function triggered by 'convert tensorrt' command.

    Args:
        args: A namespace parsed from command line.
    """
    # Import here instead of at top, because this will crash if TensorRT is
    # not installed.
    from tensorflow.contrib import tensorrt  # pylint: disable=g-import-not-at-top

    tensorrt.create_inference_graph(
        None,
        None,
        max_batch_size=args.max_batch_size,
        max_workspace_size_bytes=args.max_workspace_size_bytes,
        precision_mode=args.precision_mode,
        minimum_segment_size=args.minimum_segment_size,
        is_dynamic_op=args.is_dynamic_op,
        input_saved_model_dir=args.dir,
        input_saved_model_tags=args.tag_set.split(','),
        output_saved_model_dir=args.output_dir)
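# This function appears to be the handler behind TensorFlow's
# `saved_model_cli convert ... tensorrt` subcommand. Driving it directly from
# Python might look like the sketch below; the directory paths are
# illustrative and the Namespace stands in for the CLI's parsed arguments:
from argparse import Namespace

convert_with_tensorrt(Namespace(
    dir='/tmp/my_saved_model',             # hypothetical input SavedModel dir
    output_dir='/tmp/my_saved_model_trt',  # hypothetical output dir
    tag_set='serve',
    max_batch_size=1,
    max_workspace_size_bytes=1 << 25,
    precision_mode='FP32',
    minimum_segment_size=3,
    is_dynamic_op=False))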
def getFP32(input_graph, out_tensor, precision, batch_size, workspace_size):
    graph_prefix = input_graph.split('.pb')[0]
    output_graph = graph_prefix + "_tftrt_" + precision + ".pb"
    # print("output graph is ", output_graph)
    # Get optimized graph
    tftrt_graph = trt.create_inference_graph(
        getFrozenGraph(input_graph), [out_tensor],
        max_batch_size=batch_size,
        max_workspace_size_bytes=workspace_size,
        precision_mode=precision)
    with gfile.FastGFile(output_graph, 'wb') as f:
        f.write(tftrt_graph.SerializeToString())
def optimize_rcnn(self, input_graph_def):
    trt_graph = trt.create_inference_graph(
        input_graph_def=input_graph_def,
        outputs=[
            'detection_boxes', 'detection_scores',
            'detection_classes', 'num_detections'
        ],
        max_batch_size=1,
        # max_workspace_size_bytes=(2 << 10) << 20,
        precision_mode=self.precision)
    return trt_graph
def createOptimizedGraph(model, session, tf):
    outputs = [out.op.name for out in model.outputs]
    frozenGraph = _freeze_session(session, tf, output_names=outputs)
    # print("Model is now frozen...")
    # Possible precision modes: FP32, FP16, INT8
    optimizedGraph = tensorrt.create_inference_graph(
        frozenGraph,
        [out.op.name for out in model.outputs],
        max_batch_size=MachineSpecificSettings.OPTIMIZED_GRAPH_MAX_BATCH,
        precision_mode='FP32')
    return optimizedGraph, outputs
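# _freeze_session() is not shown in this snippet. A plausible sketch based on
# the freezing pattern used elsewhere in this section (the `tf` module is
# passed in explicitly, matching the call above; the body is an assumption):
def _freeze_session(session, tf, output_names=None):
    """Convert a live session's variables to constants for inference."""
    graph = session.graph
    with graph.as_default():
        frozen_graph = tf.graph_util.convert_variables_to_constants(
            session, graph.as_graph_def(), output_names or [])
    return frozen_graph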
def get_trt_graph(self, mode):
    """Return trt converted graph."""
    if mode in ["FP32", "FP16", "INT8"]:
        return trt.create_inference_graph(
            input_graph_def=self._original_graph,
            outputs=["output"],
            max_batch_size=self._input.shape[0],
            max_workspace_size_bytes=1 << 25,
            precision_mode=mode,  # TRT engine precision: "FP32", "FP16" or "INT8"
            minimum_segment_size=2)  # minimum number of nodes in an engine
    return None
def getRTGraph():
    # Unfortunately it needs a post-Feb/2019 version of TensorRT to optimize
    # Convolution2DTranspose, i.e., where we spend most of the time...
    g, model = getFrozenGraph()
    output = [x.op.name for x in model.outputs]
    newG = trt.create_inference_graph(
        input_graph_def=g,
        outputs=output,
        max_batch_size=1,
        max_workspace_size_bytes=400000000,
        precision_mode='FP16')
    return newG, model
def optimizeModel(model_location):
    """Creates an optimized TensorRT model from a frozen inference graph."""
    graph_def, model, category_index, image_tensor, tensor_dict = \
        loadModelDataFromDir(model_location)
    trt_model = trt.create_inference_graph(
        graph_def,
        tensor_dict,
        max_batch_size=2,
        precision_mode="FP16",
        minimum_segment_size=50)
    return trt_model
def get_trt_graph(filename, batch_size, workspace_size, precision, output_pb):
    print('Starting to optimize graph')
    # Get optimized graph
    trt_graph = trt.create_inference_graph(
        get_GraphDef(filename),
        ["resnet_v1_50/predictions/Reshape_1"],
        max_batch_size=batch_size,
        max_workspace_size_bytes=workspace_size,
        precision_mode=precision)
    with gfile.FastGFile(output_pb, 'wb') as f:
        f.write(trt_graph.SerializeToString())
    return trt_graph
def getFP16(batch_size=64, workspace_size=1 << 30):
    trt_graph = trt.create_inference_graph(
        getGraph(), ["classes"],
        max_batch_size=batch_size,
        max_workspace_size_bytes=workspace_size,
        precision_mode="FP16")
    with gfile.FastGFile(fp16_file, 'wb') as f:
        f.write(trt_graph.SerializeToString())
    if FLAGS.logging_enabled:
        writer = tf.summary.FileWriter(FLAGS.log_dir + "16")
        writer.add_graph(trt_graph)
    return trt_graph
def __init__(self, graph, batch_size, precision):
    tftrt_graph = tftrt.create_inference_graph(
        graph.frozen,
        outputs=graph.y_name,
        max_batch_size=batch_size,
        max_workspace_size_bytes=1 << 25,
        precision_mode=precision,
        minimum_segment_size=2)
    opt_graph = copy.deepcopy(graph)
    opt_graph.frozen = tftrt_graph
    super(TftrtEngine, self).__init__(opt_graph)
    self.batch_size = batch_size
def testIncOpPlugin(self):
    inp_dims = (5, 24, 24, 2)
    dummy_input = numpy.ones(inp_dims).astype(numpy.float32)
    orig_graph = self._get_plugin_graph_def()  # graph with plugin node
    # Trigger conversion. Plugin nodes have been registered during import, so
    # the converter will be able to create the corresponding plugin layer
    # during conversion.
    trt_graph = tensorrt.create_inference_graph(
        input_graph_def=orig_graph,
        outputs=["output"],
        max_batch_size=inp_dims[0],
        max_workspace_size_bytes=1 << 25,
        precision_mode="FP32",
        minimum_segment_size=2)
    o2 = self._run_graph(trt_graph, dummy_input)
    self.assertEqual(35, o2.reshape([-1])[0])
    # Run over real calibration data here; we are mimicking a calibration set
    # of 30 different batches. Use as much calibration data as you want.
    for _ in range(30):
        val = sess.run(out, {inp: dumm_inp})
    return val


if "__main__" in __name__:
    inp_dims = (100, 24, 24, 2)
    dummy_input = np.random.random_sample(inp_dims)
    orig_graph = get_simple_graph_def()  # use a frozen graph for inference
    # Get optimized graph
    trt_graph = trt.create_inference_graph(
        input_graph_def=orig_graph,
        outputs=["output"],
        max_batch_size=inp_dims[0],
        max_workspace_size_bytes=1 << 25,
        precision_mode="FP32",  # TRT engine precision: "FP32", "FP16" or "INT8"
        minimum_segment_size=2)  # minimum number of nodes in an engine
    o1 = run_graph(orig_graph, dummy_input)
    o2 = run_graph(trt_graph, dummy_input)
    o3 = run_graph(trt_graph, dummy_input)
    assert np.array_equal(o1, o2)
    assert np.array_equal(o3, o2)  # sanity check
    fp16_graph = trt.create_inference_graph(
        input_graph_def=orig_graph,
        outputs=["output"],
        max_batch_size=inp_dims[0],
        max_workspace_size_bytes=1 << 25,
        precision_mode="FP16",  # TRT engine precision: "FP32", "FP16" or "INT8"
        minimum_segment_size=2)  # minimum number of nodes in an engine
    return g.as_graph_def()


def run_graph(gdef, dumm_inp):
    gpu_options = cpb2.GPUOptions(per_process_gpu_memory_fraction=0.50)
    ops.reset_default_graph()
    g = ops.Graph()
    with g.as_default():
        inp, out = importer.import_graph_def(
            graph_def=gdef, return_elements=["input", "output"])
        inp = inp.outputs[0]
        out = out.outputs[0]
    with csess.Session(
            config=cpb2.ConfigProto(gpu_options=gpu_options), graph=g) as sess:
        val = sess.run(out, {inp: dumm_inp})
    return val


if "__main__" in __name__:
    inp_dims = (100, 24, 24, 2)
    dummy_input = np.random.random_sample(inp_dims)
    gdef = get_simple_graph_def()
    # Get optimized graph
    trt_graph = trt.create_inference_graph(gdef, ["output"], inp_dims[0])
    o1 = run_graph(gdef, dummy_input)
    o2 = run_graph(trt_graph, dummy_input)
    o3 = run_graph(trt_graph, dummy_input)
    assert np.array_equal(o1, o2)
    assert np.array_equal(o3, o2)  # sanity check
    print("Pass")