def get_trt_converter(graph_def, precision_mode, output_node,
                      batch_size=128, workspace_size=2 << 10):
    """Create a TrtGraphConverter object to use later.

    Args:
        graph_def: GraphDef, the frozen graph to be converted.
        precision_mode: string, the precision that TensorRT should convert
            into. Options: FP32, FP16, INT8.
        output_node: list of strings, the names of the output nodes that will
            be returned during inference.
        batch_size: int, the number of examples that will be predicted at a time.
        workspace_size: int, size in megabytes that can be used during conversion.

    Returns:
        A TrtGraphConverter object.
    """
    return trt.TrtGraphConverter(
        input_graph_def=graph_def,
        nodes_blacklist=output_node,
        max_batch_size=batch_size,
        # max_workspace_size_bytes=workspace_size << 20,
        precision_mode=precision_mode)
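# A minimal usage sketch for get_trt_converter above. The frozen-graph path
# and the output-node name are hypothetical placeholders, and `trt` is assumed
# to be tensorflow.python.compiler.tensorrt.trt_convert as in the snippets here.
import tensorflow.compat.v1 as tf
from tensorflow.python.compiler.tensorrt import trt_convert as trt

def example_get_trt_converter_usage():
    graph_def = tf.GraphDef()
    with tf.gfile.GFile('model/frozen_graph.pb', 'rb') as f:  # hypothetical path
        graph_def.ParseFromString(f.read())
    # nodes_blacklist expects a list, so output_node is passed as a list here.
    converter = get_trt_converter(graph_def,
                                  precision_mode='FP16',
                                  output_node=['logits'])  # hypothetical node name
    return converter.convert()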
def _ConvertGraphV1(self,
                    output_saved_model_dir=None,
                    need_calibration=False,
                    max_batch_size=1,
                    minimum_segment_size=3,
                    is_dynamic_op=False,
                    maximum_cached_engines=1,
                    device=None):
    """Helper method to convert a GraphDef or SavedModel using TF-TRT."""
    input_saved_model_dir = None
    if output_saved_model_dir:
        input_saved_model_dir = self.mkdtemp()
        self._WriteInputSavedModelForV1(input_saved_model_dir, device)

    # Calibration requires dynamic_op.
    if need_calibration:
        is_dynamic_op = True

    # For dynamic_op, the converter requires the unused max_batch_size=None.
    if is_dynamic_op:
        max_batch_size = None

    converter = trt_convert.TrtGraphConverter(
        input_saved_model_dir=input_saved_model_dir,
        input_saved_model_signature_key=_SAVED_MODEL_SIGNATURE_KEY,
        input_graph_def=None
        if input_saved_model_dir else self._GetGraphDefForV1(device),
        nodes_denylist=None if input_saved_model_dir else ["output"],
        max_batch_size=max_batch_size,
        max_workspace_size_bytes=TrtConvertTest._TRT_MAX_WORKSPACE_SIZE_BYTES,
        precision_mode=(trt_convert.TrtPrecisionMode.INT8
                        if need_calibration else
                        trt_convert.TrtPrecisionMode.FP32),
        minimum_segment_size=minimum_segment_size,
        is_dynamic_op=is_dynamic_op,
        maximum_cached_engines=maximum_cached_engines)
    output_graph_def = converter.convert()

    if need_calibration:

        class CalibrationData(object):

            def __init__(self):
                self._data = 0

            def next(self):
                self._data += 1
                return {
                    "input1:0": [[[self._data]]],
                    "input2:0": [[[self._data]]]
                }

        output_graph_def = converter.calibrate(
            fetch_names=["output:0"],
            num_runs=10,
            feed_dict_fn=CalibrationData().next)

    if output_saved_model_dir is not None:
        converter.save(output_saved_model_dir=output_saved_model_dir)
    return output_graph_def
def __init__(self, frozen_path, gpu_mem_fraction=0.5):
    self.GPU_MEM_FRACTION = gpu_mem_fraction
    self.outputs = ['policy_head/Softmax', 'value_head/Tanh']

    with gfile.FastGFile(frozen_path, 'rb') as f:
        frozen_graph = tf.GraphDef()
        frozen_graph.ParseFromString(f.read())

    trt_converter = trt.TrtGraphConverter(input_graph_def=frozen_graph,
                                          nodes_blacklist=self.outputs,
                                          is_dynamic_op=True,
                                          precision_mode='INT8')
    trt_graph = trt_converter.convert()
    # TODO: trt_converter.calibrate

    tf.reset_default_graph()
    self.graph = tf.Graph()
    with self.graph.as_default():
        tf.import_graph_def(trt_graph, name='')

    self.sess = tf.Session(graph=self.graph, config=self._get_gpu_config())
    self.state = self.sess.graph.get_tensor_by_name('board_state_input:0')
    self.policy = self.sess.graph.get_tensor_by_name('policy_head/Softmax:0')
    self.value = self.sess.graph.get_tensor_by_name('value_head/Tanh:0')
def convert2trt(tf_savedmodel_dir: str, trt_savedmodel_dir: str):
    converter = trt.TrtGraphConverter(input_saved_model_dir=tf_savedmodel_dir,
                                      max_workspace_size_bytes=(2 << 20),  # 2 MiB
                                      precision_mode='FP16',
                                      maximum_cached_engines=1)
    converter.convert()
    converter.save(trt_savedmodel_dir)
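# Hypothetical invocation of convert2trt; the directory names are placeholders
# rather than paths from any particular project.
if __name__ == '__main__':
    convert2trt('saved_model_fp32', 'saved_model_trt_fp16')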
def get_frozen_tftrt_model(bert_config, shape, num_labels,
                           use_one_hot_embeddings, init_checkpoint):
    tf_config = tf.ConfigProto()
    output_node_names = [
        'loss/cls_loss', 'loss/cls_per_example_loss', 'loss/cls_logits',
        'loss/cls_probabilities'
    ]

    with tf.Session(config=tf_config) as tf_sess:
        input_ids = tf.placeholder(tf.int32, shape, 'input_ids')
        input_mask = tf.placeholder(tf.int32, shape, 'input_mask')
        segment_ids = tf.placeholder(tf.int32, shape, 'segment_ids')
        label_ids = tf.placeholder(tf.int32, (None), 'label_ids')

        create_model(bert_config, False, input_ids, input_mask, segment_ids,
                     label_ids, num_labels, use_one_hot_embeddings)

        tvars = tf.trainable_variables()
        (assignment_map, initialized_variable_names
        ) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint)
        tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
        tf_sess.run(tf.global_variables_initializer())
        print("LOADED!")

        tf.logging.info("**** Trainable Variables ****")
        for var in tvars:
            if var.name in initialized_variable_names:
                init_string = ", *INIT_FROM_CKPT*"
            else:
                init_string = ", *NOT_INIT_FROM_CKPT*"
            tf.logging.info("  name = %s, shape = %s%s", var.name, var.shape,
                            init_string)

        frozen_graph = tf.graph_util.convert_variables_to_constants(
            tf_sess, tf_sess.graph.as_graph_def(), output_node_names)
        num_nodes = len(frozen_graph.node)
        print('Converting graph using TensorFlow-TensorRT...')

        from tensorflow.python.compiler.tensorrt import trt_convert as trt
        converter = trt.TrtGraphConverter(
            input_graph_def=frozen_graph,
            nodes_blacklist=output_node_names,
            max_workspace_size_bytes=(4096 << 20) - 1000,
            precision_mode="FP16" if FLAGS.use_fp16 else "FP32",
            minimum_segment_size=4,
            is_dynamic_op=True,
            maximum_cached_engines=1000)
        frozen_graph = converter.convert()

        print('Total node count before and after TF-TRT conversion:',
              num_nodes, '->', len(frozen_graph.node))
        print('TRT node count:',
              len([1 for n in frozen_graph.node if str(n.op) == 'TRTEngineOp']))

        with tf.gfile.GFile("frozen_modelTRT.pb", "wb") as f:
            f.write(frozen_graph.SerializeToString())

    return frozen_graph
def convert(self, model: Model, dataloader_fn) -> Model:
    # https://docs.nvidia.com/deeplearning/frameworks/tf-trt-user-guide/index.html
    # converting graph_def is not supported in TF2
    from tensorflow.python.compiler.tensorrt import trt_convert  # pytype: disable=import-error

    assert isinstance(model.handle, tf.compat.v1.GraphDef)

    session_config = create_session_config(allow_growth=True)
    output_node_names = [
        spec.name.split(":")[0] for spec in model.outputs.values()
    ]
    converter = trt_convert.TrtGraphConverter(
        input_graph_def=model.handle,
        session_config=session_config,
        nodes_blacklist=output_node_names,
        is_dynamic_op=self._is_dynamic_op,
        precision_mode=self._precision.value,
        max_workspace_size_bytes=self._max_workspace_size,
        maximum_cached_engines=self._maximum_cached_engines,
        max_batch_size=self._max_batch_size,
        minimum_segment_size=self._minimum_segment_size,
    )
    graph_def = converter.convert()

    return model._replace(handle=graph_def)
def save_tftrt():
    converter = trt.TrtGraphConverter(
        input_saved_model_dir=input_saved_model_dir,
        max_workspace_size_bytes=(1 << 32),  # 4 GiB workspace
        precision_mode='FP16',
        maximum_cached_engines=100)
    converter.convert()
    converter.save(output_saved_model_dir)
def trt_frozen_graph_and_tensors(model_name,
                                 frozen_graph_filepath=FROZEN_GRAPH_FILEPATH,
                                 precision_mode='FP16'):
    """Loads a TensorFlow frozen graph and changes its precision mode.

    You can use either FP32 or FP16. FP32 keeps the original precision and
    skips the TensorRT conversion; FP16 runs the graph through TF-TRT.

    Args:
        model_name (str): The name of your model, e.g.
            resnet_manual_highres_center_only_f1_2_f2_4
        frozen_graph_filepath (str): Path to where the frozen graph was saved.
        precision_mode (str): Either 'FP32' or 'FP16'.

    Returns:
        (tuple): tuple containing:
            frozen_graph (tf.Graph): Graph containing the (possibly
                TRT-converted) model.
            x (tf.Tensor): Tensor containing the x data.
            y (tf.Tensor): Tensor containing the y data.
    """
    from tensorflow.python.compiler.tensorrt import trt_convert as trt

    if precision_mode not in ['FP32', 'FP16']:
        raise ValueError("precision_mode must be 'FP32' or 'FP16'")

    print('OPENING FROZEN GRAPH FOR MODEL {}'.format(model_name))
    with open(frozen_graph_filepath, 'rb') as f:
        frozen_graph_gd = tf.compat.v1.GraphDef()
        frozen_graph_gd.ParseFromString(f.read())

    if precision_mode == 'FP16':
        converter = trt.TrtGraphConverter(
            input_graph_def=frozen_graph_gd,
            nodes_blacklist=['local_dense/truediv'],
            precision_mode=precision_mode,
            use_calibration=True,
            is_dynamic_op=True)
        del frozen_graph_gd
        print('Converting to {}'.format(precision_mode))
        frozen_graph = converter.convert()
        print('Conversion finished')

    config = tf.compat.v1.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.compat.v1.Session(graph=tf.Graph(), config=config) as sess:
        if precision_mode == 'FP32':
            frozen_graph = frozen_graph_gd
        tf.import_graph_def(frozen_graph)
        input_node = 'import/input1_1'
        output_node = 'import/local_dense/truediv'
        frozen_graph = sess.graph
        x = frozen_graph.get_tensor_by_name(input_node + ':0')
        y = frozen_graph.get_tensor_by_name(output_node + ':0')
        return frozen_graph, x, y
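# A hedged driver for trt_frozen_graph_and_tensors above: the model name and
# the input shape are illustrative assumptions, not values from the original
# project.
import numpy as np
import tensorflow as tf

def example_run_trt_graph():
    graph, x, y = trt_frozen_graph_and_tensors('resnet_demo',  # hypothetical name
                                               precision_mode='FP16')
    with tf.compat.v1.Session(graph=graph) as sess:
        batch = np.zeros((1, 224, 224, 3), dtype=np.float32)  # assumed input shape
        return sess.run(y, feed_dict={x: batch})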
def _GetGraphDef(self, use_trt, max_batch_size, model_dir):
    """Get the frozen mnist GraphDef.

    Args:
        use_trt: whether use TF-TRT to convert the graph.
        max_batch_size: the max batch size to apply during TF-TRT conversion.
        model_dir: the model directory to load the checkpoints.

    Returns:
        The frozen mnist GraphDef.
    """
    graph = ops.Graph()
    with self.session(graph=graph) as sess:
        with graph.device('/GPU:0'):
            x = array_ops.placeholder(
                shape=(None, 28, 28, 1), dtype=dtypes.float32,
                name=INPUT_NODE_NAME)
            self._BuildGraph(x)
        # Load weights
        mnist_saver = saver.Saver()
        checkpoint_file = latest_checkpoint(model_dir)
        if checkpoint_file is None:
            raise ValueError(
                'latest_checkpoint returned None. Check if '
                'model_dir={} is the right directory.'.format(model_dir))
        mnist_saver.restore(sess, checkpoint_file)
        # Freeze
        graph_def = graph_util.convert_variables_to_constants(
            sess, sess.graph_def, output_node_names=[OUTPUT_NODE_NAME])
    # Convert with TF-TRT
    if use_trt:
        logging.info('Number of nodes before TF-TRT conversion: %d',
                     len(graph_def.node))
        converter = trt_convert.TrtGraphConverter(
            input_graph_def=graph_def,
            nodes_blacklist=[OUTPUT_NODE_NAME],
            max_batch_size=max_batch_size,
            precision_mode='INT8',
            # There is a 2GB GPU memory limit for each test, so we set
            # max_workspace_size_bytes to 256MB to leave enough room for TF
            # runtime to allocate GPU memory.
            max_workspace_size_bytes=1 << 28,
            minimum_segment_size=2,
            use_calibration=False,
            use_function_backup=False)
        graph_def = converter.convert()
        logging.info('Number of nodes after TF-TRT conversion: %d',
                     len(graph_def.node))
        num_engines = len(
            [1 for n in graph_def.node if str(n.op) == 'TRTEngineOp'])
        self.assertEqual(1, num_engines)
    return graph_def
def _ConvertGraph(self,
                  input_saved_model_dir=None,
                  output_saved_model_dir=None,
                  need_calibration=False,
                  max_batch_size=1,
                  minimum_segment_size=3,
                  is_dynamic_op=False,
                  maximum_cached_engines=1,
                  use_function_backup=False):
    """Helper method to convert a GraphDef or SavedModel using TF-TRT."""
    converter = trt_convert.TrtGraphConverter(
        input_saved_model_dir=input_saved_model_dir,
        input_saved_model_signature_key="mypredict",
        input_graph_def=None
        if input_saved_model_dir else self._GetGraphDef(),
        nodes_blacklist=["output"],
        session_config=self._GetConfigProto(),
        max_batch_size=max_batch_size,
        max_workspace_size_bytes=TrtConvertTest._TRT_MAX_WORKSPACE_SIZE_BYTES,
        precision_mode=(trt_convert.TrtPrecisionMode.INT8
                        if need_calibration else
                        trt_convert.TrtPrecisionMode.FP32),
        minimum_segment_size=minimum_segment_size,
        is_dynamic_op=is_dynamic_op,
        maximum_cached_engines=maximum_cached_engines,
        use_function_backup=use_function_backup)
    conversion_result = converter.convert()

    if context.executing_eagerly():
        output_graph_def = conversion_result.graph.as_graph_def()
    else:
        output_graph_def = conversion_result

    if need_calibration:

        class CalibrationData(object):

            def __init__(self):
                self._data = 0

            def next(self):
                self._data += 1
                return {"input:0": [[[self._data]]]}

        output_graph_def = converter.calibrate(
            fetch_names=["output:0"],
            num_runs=10,
            feed_dict_fn=CalibrationData().next)

    if output_saved_model_dir is not None:
        converter.save(output_saved_model_dir=output_saved_model_dir)
    return output_graph_def
def optim_graph(graph, blacklist_names, precision_mode, mss, mce):
    '''Returns the TRT converted graph given the input parameters.'''
    with tf.compat.v1.Session() as sess:
        converter = trt_convert.TrtGraphConverter(
            input_graph_def=graph,
            nodes_blacklist=blacklist_names,
            precision_mode=precision_mode,
            max_batch_size=1,
            max_workspace_size_bytes=int(5e8),
            minimum_segment_size=mss,
            maximum_cached_engines=mce,
            use_calibration=False)
        new_g = converter.convert()
    return new_g
def _CreateConverter(self, saved_model_dir, session_config, conversion_params):
    """Return a TrtGraphConverter."""
    converter = trt_convert.TrtGraphConverter(
        input_saved_model_dir=saved_model_dir,
        session_config=session_config,
        max_batch_size=conversion_params.max_batch_size,
        max_workspace_size_bytes=conversion_params.max_workspace_size_bytes,
        precision_mode=conversion_params.precision_mode,
        minimum_segment_size=conversion_params.minimum_segment_size,
        is_dynamic_op=conversion_params.is_dynamic_op,
        maximum_cached_engines=conversion_params.maximum_cached_engines,
        use_calibration=conversion_params.use_calibration,
        use_function_backup=conversion_params.use_function_backup)
    return converter
def _create_converter(self, trt_convert_params: trt.TrtConversionParams):
    conversion_nodes_denylist = self.output_tensor_names
    return trt.TrtGraphConverter(
        input_saved_model_dir=self._saved_model_dir,
        input_saved_model_tags=self._saved_model_tags,
        input_saved_model_signature_key=self._saved_model_signature_key,
        nodes_denylist=conversion_nodes_denylist,
        max_batch_size=trt_convert_params.max_batch_size,
        max_workspace_size_bytes=trt_convert_params.max_workspace_size_bytes,
        precision_mode=trt_convert_params.precision_mode,
        minimum_segment_size=trt_convert_params.minimum_segment_size,
        is_dynamic_op=trt_convert_params.is_dynamic_op,
        maximum_cached_engines=trt_convert_params.maximum_cached_engines,
        use_calibration=trt_convert_params.use_calibration,
    )
def optimizeGraph(graph_def, output_nodes, user_trt_args=None):
    try:
        from tensorflow.python.compiler.tensorrt import trt_convert as trt
        tensor_rt_args = {
            'input_graph_def': graph_def,
            'nodes_blacklist': output_nodes,
            'precision_mode': trt.TrtPrecisionMode.FP16,
            'is_dynamic_op': True,
            'maximum_cached_engines': 10,
            'minimum_segment_size': 6,
            'max_batch_size': 4
        }
        if user_trt_args:
            tensor_rt_args.update(user_trt_args)
        converter = trt.TrtGraphConverter(**tensor_rt_args)
        return converter.convert()
    except Exception:
        print("WARNING: Unable to optimize graph.")
        return graph_def
def _CreateConverter(self, gdef, session_config, conversion_params):
    """Return a TrtGraphConverter."""
    params = self._GetParamsCached()
    converter = trt_convert.TrtGraphConverter(
        input_graph_def=gdef,
        nodes_blacklist=params.input_names + params.output_names,
        session_config=session_config,
        max_batch_size=conversion_params.max_batch_size,
        max_workspace_size_bytes=conversion_params.max_workspace_size_bytes,
        precision_mode=conversion_params.precision_mode,
        minimum_segment_size=conversion_params.minimum_segment_size,
        is_dynamic_op=conversion_params.is_dynamic_op,
        maximum_cached_engines=conversion_params.maximum_cached_engines,
        cached_engine_batches=conversion_params.cached_engine_batches,
        use_calibration=conversion_params.use_calibration)
    return converter
def GenerateModelV1(tf_saved_model_dir, tftrt_saved_model_dir):
    """Generate and convert a model using TFv1 API."""

    def SimpleModel():
        """Define model with a TF graph."""

        def GraphFn():
            input1 = array_ops.placeholder(
                dtype=dtypes.float32, shape=[None, 1, 1], name="input1")
            input2 = array_ops.placeholder(
                dtype=dtypes.float32, shape=[None, 1, 1], name="input2")
            var = variables.Variable([[[1.0]]], dtype=dtypes.float32, name="v1")
            out = GetGraph(input1, input2, var)
            return g, var, input1, input2, out

        g = ops.Graph()
        with g.as_default():
            return GraphFn()

    g, var, input1, input2, out = SimpleModel()
    signature_def = signature_def_utils.build_signature_def(
        inputs={
            "input1": utils.build_tensor_info(input1),
            "input2": utils.build_tensor_info(input2)
        },
        outputs={"output": utils.build_tensor_info(out)},
        method_name=signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY)
    saved_model_builder = builder.SavedModelBuilder(tf_saved_model_dir)
    with Session(graph=g) as sess:
        sess.run(var.initializer)
        saved_model_builder.add_meta_graph_and_variables(
            sess, [tag_constants.SERVING],
            signature_def_map={
                signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY:
                    signature_def
            })
    saved_model_builder.save()

    # Convert TF model to TensorRT
    converter = trt_convert.TrtGraphConverter(
        input_saved_model_dir=tf_saved_model_dir, is_dynamic_op=True)
    converter.convert()
    converter.save(tftrt_saved_model_dir)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--path', help='path to pb file.', required=True)
    parser.add_argument('--output', help='Output dir.', default='model')
    args = parser.parse_args()

    model_dir = args.output
    if not tf.gfile.Exists(model_dir):
        tf.gfile.MkDir(model_dir)
    if not tf.gfile.Exists(args.path):
        print('Error: pb file does not exist!')
        return

    tf.reset_default_graph()
    graph = tf.Graph()
    graph_def = None
    with tf.gfile.GFile(args.path, 'rb') as f:
        graph_def = tf.GraphDef.FromString(f.read())

    output_names = ['SemanticPredictions']
    converter = trt_convert.TrtGraphConverter(
        input_graph_def=graph_def,
        nodes_blacklist=output_names,  # output nodes
        max_batch_size=1,
        # is_dynamic_op=False,
        is_dynamic_op=True,
        max_workspace_size_bytes=1 << 25,
        precision_mode=trt_convert.TrtPrecisionMode.FP16,
        minimum_segment_size=50)
    trt_graph = converter.convert()

    trt_engine_opts = len(
        [1 for n in trt_graph.node if str(n.op) == 'TRTEngineOp'])
    print("trt_engine_opts = {}".format(trt_engine_opts))

    base_name = os.path.splitext(os.path.basename(args.path))[0]
    save_model_file_name = base_name + '_dynamic_fp16.pb'
    with open(os.path.join(model_dir, save_model_file_name), 'wb') as f:
        f.write(trt_graph.SerializeToString())
def to_tf_trt(savedmodel_dir: str, output_dir: str, precision: str,
              feed_dict_fn: Callable, num_runs: int,
              output_tensor_names: List[str], compress: bool):
    """Export a TensorFlow SavedModel to TF-TRT.

    :param savedmodel_dir: (str) Input directory containing a TensorFlow SavedModel.
    :param output_dir: (str) Output directory for storage of the generated
        TF-TRT exported model.
    :param precision: (str) Desired precision of the network (FP32, FP16 or INT8).
    :param feed_dict_fn: (Callable) Input tensors for INT8 calibration. Model specific.
    :param num_runs: (int) Number of calibration runs.
    :param output_tensor_names: (List) Names of the output tensors for graph
        conversion. Model specific.
    :param compress: (bool) Compress output.
    """
    if savedmodel_dir is None or not os.path.exists(savedmodel_dir):
        raise FileNotFoundError('savedmodel_dir not found: {}'.format(savedmodel_dir))

    if os.path.exists(output_dir):
        print('[*] Output dir \'{}\' is not empty. Cleaning up ...'.format(output_dir))
        shutil.rmtree(output_dir)

    print('[*] Converting model...')
    converter = trt.TrtGraphConverter(input_saved_model_dir=savedmodel_dir,
                                      precision_mode=precision)
    converter.convert()

    if precision == 'INT8':
        print('[*] Running INT8 calibration ...')
        converter.calibrate(fetch_names=output_tensor_names,
                            num_runs=num_runs,
                            feed_dict_fn=feed_dict_fn)

    converter.save(output_dir)
    print('[*] Done! TF-TRT saved_model stored in: `%s`' % output_dir)

    if compress:
        _compress('tftrt_saved_model', output_dir)
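# Illustrative call of to_tf_trt above. The calibration feed function, tensor
# names, and directories are assumptions made for this sketch; a real model
# would supply its own.
import numpy as np

def _example_feed_dict_fn():
    # One synthetic calibration batch; 'input:0' is a hypothetical tensor name.
    return {'input:0': np.random.random_sample((1, 224, 224, 3)).astype(np.float32)}

def _example_to_tf_trt():
    to_tf_trt(savedmodel_dir='saved_model',            # hypothetical directory
              output_dir='tftrt_saved_model',
              precision='FP16',
              feed_dict_fn=_example_feed_dict_fn,      # only used for INT8
              num_runs=1,
              output_tensor_names=['logits:0'],        # hypothetical tensor name
              compress=False)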
def optimizeGraph(graph_def, output_nodes, user_trt_args=None):
    if not tf.test.is_gpu_available(cuda_only=True) or os.getenv("OPENEM_NOTRT") == "1":
        print("No GPU available to optimize for")
        return graph_def
    try:
        from tensorflow.python.compiler.tensorrt import trt_convert as trt
        tensor_rt_args = {
            'input_graph_def': graph_def,
            'nodes_blacklist': output_nodes,
            'precision_mode': trt.TrtPrecisionMode.FP16,
            'is_dynamic_op': True,
            'maximum_cached_engines': 10,
            'minimum_segment_size': 6,
            'max_batch_size': 4
        }
        if user_trt_args:
            tensor_rt_args.update(user_trt_args)
        converter = trt.TrtGraphConverter(**tensor_rt_args)
        return converter.convert()
    except Exception:
        print("WARNING: Unable to optimize graph.")
        return graph_def
def _GetGraphDef(self, use_trt, max_batch_size, model_dir):
    """Gets the frozen mnist GraphDef.

    Args:
        use_trt: whether use TF-TRT to convert the graph.
        max_batch_size: the max batch size to apply during TF-TRT conversion.
        model_dir: the model directory to load the checkpoints.

    Returns:
        The frozen mnist GraphDef.
    """
    graph = ops.Graph()
    with self.session(graph=graph) as sess:
        with graph.device('/GPU:0'):
            x = array_ops.placeholder(
                shape=(None, 28, 28, 1), dtype=dtypes.float32,
                name=INPUT_NODE_NAME)
            self._BuildGraph(x)
        self._LoadWeights(model_dir, sess)
        # Freeze
        graph_def = graph_util.convert_variables_to_constants(
            sess, sess.graph_def, output_node_names=[OUTPUT_NODE_NAME])
    # Convert with TF-TRT
    if use_trt:
        logging.info('Number of nodes before TF-TRT conversion: %d',
                     len(graph_def.node))
        converter = trt_convert.TrtGraphConverter(
            input_graph_def=graph_def,
            nodes_denylist=[OUTPUT_NODE_NAME],
            max_batch_size=max_batch_size,
            precision_mode='INT8',
            max_workspace_size_bytes=(
                trt_convert.DEFAULT_TRT_MAX_WORKSPACE_SIZE_BYTES),
            minimum_segment_size=2,
            use_calibration=False)
        graph_def = converter.convert()
        logging.info('Number of nodes after TF-TRT conversion: %d',
                     len(graph_def.node))
        num_engines = len(
            [1 for n in graph_def.node if str(n.op) == 'TRTEngineOp'])
        self.assertEqual(1, num_engines)
    return graph_def
def get_trt_graph_from_calib_cxg(graph_name, graph_def, data, input_node,
                                 output_node, output_dir):
    """Convert a TensorRT graph used for calibration to an inference graph."""
    converter = trt.TrtGraphConverter(
        input_graph_def=graph_def,
        nodes_blacklist=output_node,
        max_batch_size=4,
        # max_workspace_size_bytes=workspace_size << 20,
        precision_mode='INT8')
    converter.convert()

    def input_fn():
        iterator = get_iterator(data)
        return {input_node: iterator.get_next()}

    trt_graph = converter.calibrate(fetch_names=output_node,
                                    num_runs=1,
                                    input_map_fn=input_fn)
    write_graph_to_file(graph_name, trt_graph, output_dir)
    return trt_graph
def freeze_graph(model_path, use_trt=False, trt_max_batch_size=8,
                 trt_precision='fp32'):
    output_names = ['policy_output', 'value_output']
    n = DualNetwork(model_path)
    out_graph = tf.graph_util.convert_variables_to_constants(
        n.sess, n.sess.graph.as_graph_def(), output_names)

    # eval is always fp32, so let's store an eval copy before we trt.
    metadata = make_model_metadata({
        'engine': 'tf',
        'use_trt': False,
    })
    minigo_model.write_graph_def(out_graph, metadata,
                                 model_path + '.evalfp32minigo')

    if use_trt:
        from tensorflow.python.compiler.tensorrt import trt_convert as trt
        converter = trt.TrtGraphConverter(
            input_graph_def=out_graph,
            nodes_blacklist=output_names,
            max_batch_size=trt_max_batch_size,
            max_workspace_size_bytes=1 << 29,
            precision_mode=trt_precision)
        out_graph = converter.convert()

    metadata = make_model_metadata({
        'engine': 'tf',
        'use_trt': bool(use_trt),
    })

    # double buffer model write
    minigo_model.write_graph_def(out_graph, metadata,
                                 model_path + '.stagedmodel')
    minigo_model.write_graph_def(out_graph, metadata, model_path + '.minigo')
def _CreateConverter(self, run_params, saved_model_dir, conversion_params):
    """Returns a TrtGraphConverter."""
    if run_params.is_v2:
        converter_v2 = trt_convert.TrtGraphConverterV2(
            input_saved_model_dir=saved_model_dir,
            use_dynamic_shape=run_params.dynamic_shape,
            dynamic_shape_profile_strategy=self._profile_strategy,
            **conversion_params._asdict())
        if self._disable_non_trt_optimizers:
            converter_v2._test_only_disable_non_trt_optimizers = True  # pylint: disable=protected-access
        return converter_v2

    converter_v1 = trt_convert.TrtGraphConverter(
        input_saved_model_dir=saved_model_dir,
        max_batch_size=self.GetMaxBatchSize(run_params),
        max_workspace_size_bytes=conversion_params.max_workspace_size_bytes,
        precision_mode=conversion_params.precision_mode,
        minimum_segment_size=conversion_params.minimum_segment_size,
        is_dynamic_op=run_params.dynamic_engine,
        maximum_cached_engines=conversion_params.maximum_cached_engines,
        use_calibration=conversion_params.use_calibration)
    if self._disable_non_trt_optimizers:
        converter_v1._test_only_disable_non_trt_optimizers = True  # pylint: disable=protected-access
    return converter_v1
def load_graph_and_convert(model_name, frozen_graph_filepath, precision_mode,
                           seed, f1, f2):
    """Load a full-precision (FP32) frozen graph and return it as a
    half-precision (FP16) frozen graph.

    @params:
        model_name (string): The "base" name of your model, such as "resnet",
            so it can be found in the directory.
        frozen_graph_filepath (string): The path to the directory containing
            your frozen graph.
        precision_mode (string): The precision you are converting your model
            to; only 'FP16' is used so far.
    @return:
        The original frozen_graph converted into a different precision mode.
    """
    print('OPENING FROZEN GRAPH FOR MODEL {}.'.format(model_name))
    frozen_graph_filepath = frozen_graph_filepath + '{}_{}/flex_random_seed_{}_'.format(
        f1, f2, seed) + model_name + '_frozen_graph.pb'
    with open(frozen_graph_filepath, 'rb') as f:
        frozen_graph_gd = tf.GraphDef()
        frozen_graph_gd.ParseFromString(f.read())

    print('BEGINNING THE CONVERSION TO TRT {}'.format(precision_mode))
    converter = trt.TrtGraphConverter(input_graph_def=frozen_graph_gd,
                                      nodes_blacklist=['local_dense/truediv'],
                                      precision_mode=precision_mode,
                                      use_calibration=True,
                                      is_dynamic_op=True)
    try:
        frozen_graph = converter.convert()
        print('CONVERSION FINISHED WITH SUCCESS.')
    except Exception:
        errorMessage = 'Exception caught on file: '
        errorMessage += os.path.abspath(sys.argv[0]) + '\n'
        tracebackMessage = traceback.format_exc()
        with open('./Logs/exceptions.txt', 'w') as text_file:
            text_file.write(errorMessage + tracebackMessage + '\n')
        print(errorMessage)
        return None  # conversion failed; nothing to return
    return frozen_graph
def get_frozen_graph(model,
                     model_dir=None,
                     use_trt=False,
                     engine_dir=None,
                     use_dynamic_op=False,
                     precision='FP32',
                     batch_size=8,
                     minimum_segment_size=2,
                     calib_files=None,
                     num_calib_inputs=None,
                     use_synthetic=False,
                     cache=False,
                     default_models_dir='./data',
                     max_workspace_size=(1 << 32)):
    """Retrieves a frozen GraphDef from model definitions in classification.py
    and applies TF-TRT.

    model: str, the model name (see NETS table in classification.py)
    use_trt: bool, if true, use TensorRT
    precision: str, floating point precision (FP32, FP16, or INT8)
    batch_size: int, batch size for TensorRT optimizations
    returns: tensorflow.GraphDef, the TensorRT compatible frozen graph
    """
    num_nodes = {}
    times = {}
    graph_sizes = {}

    # Load from pb file if frozen graph was already created and cached
    if cache:
        # Graph must match the model, TRT mode, precision, and batch size
        prebuilt_graph_path = "graphs/frozen_graph_%s_%d_%s_%d.pb" % (
            model, int(use_trt), precision, batch_size)
        if os.path.isfile(prebuilt_graph_path):
            print('Loading cached frozen graph from \'%s\'' % prebuilt_graph_path)
            start_time = time.time()
            with tf.gfile.GFile(prebuilt_graph_path, "rb") as f:
                frozen_graph = tf.GraphDef()
                frozen_graph.ParseFromString(f.read())
            times['loading_frozen_graph'] = time.time() - start_time
            num_nodes['loaded_frozen_graph'] = len(frozen_graph.node)
            num_nodes['trt_only'] = len(
                [1 for n in frozen_graph.node if str(n.op) == 'TRTEngineOp'])
            graph_sizes['loaded_frozen_graph'] = len(
                frozen_graph.SerializeToString())
            return frozen_graph, num_nodes, times, graph_sizes

    # Build graph and load weights
    frozen_graph = build_classification_graph(model, model_dir,
                                              default_models_dir)
    num_nodes['native_tf'] = len(frozen_graph.node)
    graph_sizes['native_tf'] = len(frozen_graph.SerializeToString())

    # Convert to TensorRT graph
    if use_trt:
        start_time = time.time()
        converter = trt.TrtGraphConverter(
            input_graph_def=frozen_graph,
            nodes_blacklist=['logits', 'classes'],
            max_batch_size=batch_size,
            max_workspace_size_bytes=max_workspace_size,
            precision_mode=precision.upper(),
            minimum_segment_size=minimum_segment_size,
            is_dynamic_op=use_dynamic_op)
        frozen_graph = converter.convert()
        times['trt_conversion'] = time.time() - start_time
        num_nodes['tftrt_total'] = len(frozen_graph.node)
        num_nodes['trt_only'] = len(
            [1 for n in frozen_graph.node if str(n.op) == 'TRTEngineOp'])
        graph_sizes['trt'] = len(frozen_graph.SerializeToString())

        if engine_dir:
            segment_number = 0
            for node in frozen_graph.node:
                if node.op == "TRTEngineOp":
                    engine = node.attr["serialized_segment"].s
                    engine_path = engine_dir + '/{}_{}_{}_segment{}.trtengine'.format(
                        model, precision, batch_size, segment_number)
                    segment_number += 1
                    with open(engine_path, "wb") as f:
                        f.write(engine)

        if precision == 'INT8':
            calib_graph = frozen_graph
            graph_sizes['calib'] = len(calib_graph.SerializeToString())

            def input_map_fn():
                features, _ = input_fn(model, calib_files, batch_size,
                                       use_synthetic)
                return {'input:0': features}

            # INT8 calibration step
            print('Calibrating INT8...')
            start_time = time.time()
            frozen_graph = converter.calibrate(
                fetch_names=['logits', 'classes'],
                num_runs=num_calib_inputs // batch_size,
                input_map_fn=input_map_fn)
            times['trt_calibration'] = time.time() - start_time
            # This is already set but overwriting it here to ensure the right size
            graph_sizes['trt'] = len(frozen_graph.SerializeToString())
            del calib_graph
            print('INT8 graph created.')

    # Cache graph to avoid long conversions each time
    if cache:
        if not os.path.exists(os.path.dirname(prebuilt_graph_path)):
            os.makedirs(os.path.dirname(prebuilt_graph_path))
        start_time = time.time()
        with tf.gfile.GFile(prebuilt_graph_path, "wb") as f:
            f.write(frozen_graph.SerializeToString())
        times['saving_frozen_graph'] = time.time() - start_time

    return frozen_graph, num_nodes, times, graph_sizes
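# A possible call of get_frozen_graph above; the model name and flag values
# are placeholders chosen for illustration, not values from the original repo.
def _example_get_frozen_graph():
    frozen_graph, num_nodes, times, graph_sizes = get_frozen_graph(
        model='resnet_v1_50',  # hypothetical entry in the NETS table
        use_trt=True,
        precision='FP16',
        batch_size=8,
        cache=False)
    print('TRT engine nodes:', num_nodes.get('trt_only'))
    return frozen_graph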
def MakeExtractor(sess, config, import_scope=None):
    """Creates a function to extract features from an image.

    Args:
        sess: TensorFlow session to use.
        config: DelfConfig proto containing the model configuration.
        import_scope: Optional scope to use for model.

    Returns:
        Function that receives an image and returns features.
    """
    # tf.saved_model.loader.load(
    #     sess, [tf.saved_model.tag_constants.SERVING],
    #     config.model_path, import_scope=import_scope)
    with tf.gfile.GFile('./static_model/delf_both_white/frozen_graph.pb',
                        'rb') as f:
        frozen_graph = tf.GraphDef()
        frozen_graph.ParseFromString(f.read())

    # Now you can create a TensorRT inference graph from your frozen graph:
    converter = trt.TrtGraphConverter(
        input_graph_def=frozen_graph,
        nodes_blacklist=[
            'global_feature', 'boxes', 'features', 'scales', 'scores'
        ],  # output nodes
        precision_mode="FP32",
        maximum_cached_engines=100,
        is_dynamic_op=True)
    trt_graph = converter.convert()

    # Import the TensorRT graph into a new graph and run:
    output_node = tf.import_graph_def(
        trt_graph,
        return_elements=[
            'global_feature', 'boxes', 'features', 'scales', 'scores'
        ])

    import_scope_prefix = import_scope + '/' if import_scope is not None else ''
    input_image = sess.graph.get_tensor_by_name(
        '%sinput_image:0' % import_scope_prefix)
    input_score_threshold = sess.graph.get_tensor_by_name(
        '%sinput_abs_thres:0' % import_scope_prefix)
    input_image_scales = sess.graph.get_tensor_by_name(
        '%sinput_scales:0' % import_scope_prefix)
    input_max_feature_num = sess.graph.get_tensor_by_name(
        '%sinput_max_feature_num:0' % import_scope_prefix)
    global_feature = sess.graph.get_tensor_by_name(
        '%sglobal_feature:0' % import_scope_prefix)
    boxes = sess.graph.get_tensor_by_name('%sboxes:0' % import_scope_prefix)
    raw_descriptors = sess.graph.get_tensor_by_name(
        '%sfeatures:0' % import_scope_prefix)
    feature_scales = sess.graph.get_tensor_by_name(
        '%sscales:0' % import_scope_prefix)
    attention_with_extra_dim = sess.graph.get_tensor_by_name(
        '%sscores:0' % import_scope_prefix)
    attention = tf.reshape(attention_with_extra_dim,
                           [tf.shape(attention_with_extra_dim)[0]])

    locations, descriptors = feature_extractor.DelfFeaturePostProcessing(
        boxes, raw_descriptors, config)

    def ExtractorFn(image):
        """Receives an image and returns DELF features.

        If image is too small, returns empty set of features.

        Args:
            image: Uint8 array with shape (height, width, 3) containing the
                RGB image.

        Returns:
            Tuple (locations, descriptors, feature_scales, attention)
        """
        resized_image, scale_factor = ResizeImage(image, config)

        # If the image is too small, returns empty features.
        if resized_image.shape[0] < _MIN_HEIGHT or resized_image.shape[
                1] < _MIN_WIDTH:
            return np.array([]), np.array([]), np.array([]), np.array([])

        (global_feature_out, locations_out, descriptors_out,
         feature_scales_out, attention_out) = sess.run(
             [global_feature, locations, descriptors, feature_scales,
              attention],
             feed_dict={
                 input_image: resized_image,
                 input_score_threshold:
                     config.delf_local_config.score_threshold,
                 input_image_scales: list(config.image_scales),
                 input_max_feature_num:
                     config.delf_local_config.max_feature_num
             })
        rescaled_locations_out = locations_out / scale_factor

        return (global_feature_out, rescaled_locations_out, descriptors_out,
                feature_scales_out, attention_out)

    return ExtractorFn
def evaluate(config, evaluation_set='val', plot_confusionMatrix=False):
    # --------------------------------------------------------------------
    # init network
    # --------------------------------------------------------------------
    tf.compat.v1.reset_default_graph()

    # define input placeholders
    input_placeholder = {}
    input_placeholder.update(
        {'is_training': tf.compat.v1.placeholder(dtype=tf.bool, shape=())})

    if config.ARCHITECTURE == 'semantic_segmentation':
        batch_size = config.BATCH_SIZE * config.TIMESEQUENCE_LENGTH
        # Search for available GPUs: the result is a list of device ids like
        # `['/gpu:0', '/gpu:1']`
        devices = get_available_gpus()
        print("found devices: ", devices)
        num_GPU = len(devices)
        if num_GPU == 0:
            num_GPU = 1  # CPU support!
        # min 1 sample should be applied on a GPU
        if config.BATCH_SIZE < num_GPU:
            num_GPU = config.BATCH_SIZE

        image_placeholder = []
        label_placeholder = []
        for iter in range(num_GPU):
            if iter == (num_GPU - 1):
                batch_size_local = batch_size - (num_GPU - 1) * (batch_size // num_GPU)
            else:
                batch_size_local = batch_size // num_GPU
            print('batch_size /gpu:{} : {}'.format(iter, num_GPU))
            image_placeholder.append(
                tf.compat.v1.placeholder(
                    dtype=tf.float32,
                    shape=(batch_size_local,
                           config.DATASET_TRAIN.INPUT_SIZE[0],
                           config.DATASET_TRAIN.INPUT_SIZE[1],
                           config.DATASET_TRAIN.NUM_CHANNELS)))
            label_placeholder.append(
                tf.compat.v1.placeholder(
                    dtype=tf.float32,
                    shape=(batch_size_local,
                           config.DATASET_TRAIN.INPUT_SIZE[0],
                           config.DATASET_TRAIN.INPUT_SIZE[1], 1)))
        input_placeholder.update({'image_batch': image_placeholder})
        input_placeholder.update({'label_batch': label_placeholder})
    else:
        print('[ERROR] network architecture does not exist!!! Please check your spelling!')
        raise NotImplementedError

    # load network architecture
    if config.ARCHITECTURE == 'semantic_segmentation':
        model = get_model(config.MODEL)
        net = model(
            {
                'data': input_placeholder['image_batch'],
                'is_training': input_placeholder['is_training']
            },
            is_training=input_placeholder['is_training'],
            evaluation=tf.logical_not(input_placeholder['is_training']),
            # is_inference=True,
            num_classes=config.DATASET_TRAIN.NUM_CLASSES,
            filter_scale=config.FILTER_SCALE,
            timeSequence=config.TIMESEQUENCE_LENGTH,
            variant=config.MODEL_VARIANT)
    else:
        print('[ERROR] network architecture does not exist!!! Please check your spelling!')
        raise NotImplementedError

    # --------------------------------------------------------------------
    # determine evaluation metric
    # --------------------------------------------------------------------
    if config.ARCHITECTURE == 'semantic_segmentation':
        list_raw_gt = []
        list_pred_flattern_mIoU = []
        for iter_gpu in range(len(input_placeholder['image_batch'])):
            with tf.device('/gpu:%d' % iter_gpu):
                if config.MODEL in ('SegNet_BN', 'SegNet_BN_encoder',
                                    'SegNet_BN_decoder', 'SegNet_BN_encoderDecoder'):
                    raw_output = net.layers['output'][iter_gpu]
                    raw_output_up = tf.argmax(raw_output, axis=3,
                                              output_type=tf.int32)
                    raw_pred_mIoU = tf.expand_dims(raw_output_up, dim=3)
                else:  # ICNet
                    ori_shape = config.DATASET_TRAIN.INPUT_SIZE  # ??
                    raw_output = net.layers['output'][iter_gpu]
                    raw_output_up = tf.compat.v1.image.resize_bilinear(
                        raw_output, size=ori_shape[:2], align_corners=True)
                    raw_output_up = tf.argmax(raw_output_up, axis=3,
                                              output_type=tf.int32)
                    raw_pred_mIoU = tf.expand_dims(raw_output_up, dim=3)

                # determine mIoU
                if config.USAGE_TIMESEQUENCES:
                    # evaluate only last image of time sequence
                    pred_of_interest = np.array(
                        range(config.BATCH_SIZE), dtype=np.int32
                    ) * config.TIMESEQUENCE_LENGTH + config.TIMESEQUENCE_LENGTH - 1
                    pred_flatten_mIoU = tf.reshape(
                        tf.gather(raw_pred_mIoU, pred_of_interest), [-1])
                    raw_gt = tf.reshape(
                        tf.gather(input_placeholder['label_batch'][iter_gpu],
                                  pred_of_interest), [-1])
                else:
                    # evaluate all images of batch size
                    pred_flatten_mIoU = tf.reshape(raw_pred_mIoU, [-1])
                    raw_gt = tf.reshape(
                        input_placeholder['label_batch'][iter_gpu], [-1])
                list_raw_gt.append(raw_gt)
                list_pred_flattern_mIoU.append(pred_flatten_mIoU)

        # combine output of different GPUs
        with tf.device('/gpu:%d' % 0):
            all_raw_gt = tf.reshape(tf.concat(list_raw_gt, -1), [-1])
            all_pred_flatten_mIoU = tf.reshape(
                tf.concat(list_pred_flattern_mIoU, -1), [-1])
            indices_mIoU = tf.squeeze(
                tf.where(tf.less_equal(raw_gt,
                                       config.DATASET_TRAIN.NUM_CLASSES - 1)), 1)
            gt_mIoU = tf.cast(tf.gather(raw_gt, indices_mIoU), tf.int32)
            pred_mIoU = tf.gather(pred_flatten_mIoU, indices_mIoU)
            mIoU, update_op = tf.contrib.metrics.streaming_mean_iou(
                pred_mIoU, gt_mIoU, num_classes=config.DATASET_VAL.NUM_CLASSES)

            # create colored image
            pred_color = decode_labels(pred_flatten_mIoU,
                                       config.DATASET_VAL.INPUT_SIZE[0:2],
                                       config.DATASET_VAL.NUM_CLASSES)

        # determine confusion matrix
        if plot_confusionMatrix:
            # Create an accumulator variable to hold the counts
            confusion = tf.Variable(
                tf.zeros([config.DATASET_VAL.NUM_CLASSES,
                          config.DATASET_VAL.NUM_CLASSES], dtype=tf.int64),
                name='confusion',
                collections=[tf.compat.v1.GraphKeys.LOCAL_VARIABLES])
            # Compute a per-batch confusion
            batch_confusion = tf.math.confusion_matrix(
                tf.reshape(gt_mIoU, [-1]),
                tf.reshape(pred_mIoU, [-1]),
                num_classes=config.DATASET_VAL.NUM_CLASSES,
                name='batch_confusion')
            # Create the update op for doing a "+=" accumulation on the batch
            confusion_update = confusion.assign(
                confusion + tf.cast(batch_confusion, dtype=tf.int64))

    # -----------------------------------------
    # init session
    # -----------------------------------------
    # Set up tf session and initialize variables.
    sessConfig = tf.compat.v1.ConfigProto()
    sessConfig.gpu_options.allow_growth = True
    # use only a fraction of gpu memory, otherwise the TensorRT-test has not
    # enough free GPU memory for execution
    sessConfig.gpu_options.per_process_gpu_memory_fraction = 0.5
    sess = tf.compat.v1.Session(config=sessConfig)
    init = tf.compat.v1.global_variables_initializer()
    local_init = tf.compat.v1.local_variables_initializer()
    sess.run(init)
    sess.run(local_init)

    # load checkpoint file
    print(config.EVALUATION.MODELPATH)
    ckpt = tf.compat.v1.train.get_checkpoint_state(config.EVALUATION.MODELPATH)
    if ckpt and ckpt.model_checkpoint_path:
        loader = tf.compat.v1.train.Saver(
            var_list=tf.compat.v1.global_variables())
        load(loader, sess, ckpt.model_checkpoint_path)
    else:
        print('No checkpoint file found.')

    # `sess.graph` provides access to the graph used in a `tf.Session`.
    writer = tf.compat.v1.summary.FileWriter("/tmp/tensorflow_graph",
                                             tf.compat.v1.get_default_graph())

    # --------------------------------------------------------------------
    # Evaluate - Iterate over training steps.
    # --------------------------------------------------------------------
    # evaluate training or validation set
    if evaluation_set == "val":
        imagereader_val = ImageReader(config.IMAGEREADER.VAL, config.DATASET_VAL,
                                      config.BATCH_SIZE, config.TIMESEQUENCE_LENGTH)
    elif evaluation_set == "train":
        imagereader_val = ImageReader(config.IMAGEREADER.VAL, config.DATASET_TRAIN,
                                      config.BATCH_SIZE, config.TIMESEQUENCE_LENGTH)
    elif evaluation_set == "test":
        imagereader_val = ImageReader(config.IMAGEREADER.VAL, config.DATASET_TEST,
                                      config.BATCH_SIZE, config.TIMESEQUENCE_LENGTH)
    elif evaluation_set == "all":
        imagereader_val = ImageReader(config.IMAGEREADER.VAL, config.DATASET_ALL,
                                      config.BATCH_SIZE, config.TIMESEQUENCE_LENGTH)
    else:
        print("Dataset {} does not exist!".format(evaluation_set))

    filename_memory = ""
    filename_count = 0
    average_inference_time = 0

    # --------------------------------------
    # perform evaluation - semantic segmentation
    # --------------------------------------
    if config.ARCHITECTURE == 'semantic_segmentation':
        if config.TIMESEQUENCES_SLIDINGWINDOW:
            # use time sequences
            for step in trange(
                    int(imagereader_val._dataset_amount -
                        config.BATCH_SIZE * config.TIMESEQUENCE_LENGTH + 1),
                    desc='inference', leave=True):
                # start_time = time.time()
                training_batch = imagereader_val.getNextMinibatch()
                feed_dict = {input_placeholder['is_training']: False}
                for iter_GPU in range(len(input_placeholder['image_batch'])):
                    num_GPU = len(input_placeholder['image_batch'])
                    batch_size = training_batch['blob_data'].shape[0]
                    batch_size_local = batch_size // num_GPU
                    if iter_GPU == (num_GPU - 1):
                        batch_size_act = batch_size - (num_GPU - 1) * (batch_size // num_GPU)
                    else:
                        batch_size_act = batch_size // num_GPU
                    feed_dict.update({
                        input_placeholder['image_batch'][iter_GPU]:
                            training_batch['blob_data']
                            [iter_GPU * batch_size_local:iter_GPU * batch_size_local +
                             batch_size_act, :, :, :],
                        input_placeholder['label_batch'][iter_GPU]:
                            training_batch['blob_label']
                            [iter_GPU * batch_size_local:iter_GPU * batch_size_local +
                             batch_size_act, :, :, :]
                    })

                start_time = time.time()
                if plot_confusionMatrix:
                    sess.run([update_op, confusion_update], feed_dict=feed_dict)
                else:
                    sess.run([update_op], feed_dict=feed_dict)
                duration = time.time() - start_time
                average_inference_time += duration

                # save image
                prediction = sess.run([pred_color], feed_dict=feed_dict)
                predi = np.array(prediction[0][0, :, :, :]).astype(dtype=np.uint8)
                img = cv2.imread(imagereader_val._dataset[step][0])
                if imagereader_val._configDataset.USE_IMAGE_ROI:
                    img = img[imagereader_val._configDataset.IMAGE_ROI_MIN_X:
                              imagereader_val._configDataset.IMAGE_ROI_MAX_X,
                              imagereader_val._configDataset.IMAGE_ROI_MIN_Y:
                              imagereader_val._configDataset.IMAGE_ROI_MAX_Y, :]
                cv2.addWeighted(predi, config.INFERENCE.OVERLAPPING_IMAGE, img,
                                1 - config.INFERENCE.OVERLAPPING_IMAGE, 0, img)
                buff = imagereader_val._dataset[step][-1]
                buff = buff.split('/')
                filename = buff[-1].split('.')[0]
                if filename_memory == buff[-1].split('.')[0]:
                    filename_count += 1
                    filename = buff[-1].split('.')[0] + "_" + str(filename_count) + ".png"
                else:
                    filename_memory = buff[-1].split('.')[0]
                    filename = buff[-1].split('.')[0] + "_0.png"
                    filename_count = 0
                cv2.imwrite(config.INFERENCE.SAVEDIR_IMAGES + filename,
                            predi[:, :, (2, 1, 0)])
                cv2.imwrite(config.INFERENCE.SAVEDIR_IMAGES + filename,
                            img[:, :, (2, 1, 0)])

            # determine average time
            average_inference_time = average_inference_time / int(
                imagereader_val._dataset_amount -
                config.BATCH_SIZE * config.TIMESEQUENCE_LENGTH + 1)
        else:
            # do not use time sequences (normal evaluation)
            # TODO parameters for flickering evaluation
            flickering_sum = 0
            flickering_img_size = 0
            flickering_sum1 = 0
            flickering_img_size1 = 0
            for step in trange(
                    int(imagereader_val._dataset_amount /
                        (config.BATCH_SIZE * config.TIMESEQUENCE_LENGTH)),
                    desc='inference', leave=True):
                training_batch = imagereader_val.getNextMinibatch()
                feed_dict = {input_placeholder['is_training']: False}
                for iter_GPU in range(len(input_placeholder['image_batch'])):
                    num_GPU = len(input_placeholder['image_batch'])
                    batch_size = training_batch['blob_data'].shape[0]
                    batch_size_local = batch_size // num_GPU
                    if iter_GPU == (num_GPU - 1):
                        batch_size_act = batch_size - (num_GPU - 1) * (batch_size // num_GPU)
                    else:
                        batch_size_act = batch_size // num_GPU
                    # for depth images, if result should be shown in camera image
                    if True:
                        feed_dict.update({
                            input_placeholder['image_batch'][iter_GPU]:
                                training_batch['blob_data']
                                [iter_GPU * batch_size_local:iter_GPU * batch_size_local +
                                 batch_size_act, :, :, :],
                            input_placeholder['label_batch'][iter_GPU]:
                                training_batch['blob_label']
                                [iter_GPU * batch_size_local:iter_GPU * batch_size_local +
                                 batch_size_act, :, :, :]
                        })
                    else:
                        training_batch['blob_data'][:, :, :, 1] = \
                            training_batch['blob_data'][:, :, :, -1]
                        training_batch['blob_data'][:, :, :, 2] = \
                            training_batch['blob_data'][:, :, :, -1]
                        feed_dict.update({
                            input_placeholder['image_batch'][iter_GPU]:
                                training_batch['blob_data']
                                [iter_GPU * batch_size_local:iter_GPU * batch_size_local +
                                 batch_size_act, :, :, 1:4],
                            input_placeholder['label_batch'][iter_GPU]:
                                training_batch['blob_label']
                                [iter_GPU * batch_size_local:iter_GPU * batch_size_local +
                                 batch_size_act, :, :, :]
                        })

                start_time = time.time()
                if plot_confusionMatrix:
                    sess.run([update_op, confusion_update], feed_dict=feed_dict)
                else:
                    sess.run([update_op], feed_dict=feed_dict)
                duration = time.time() - start_time
                # skip the first 50 images
                if step >= 50:
                    average_inference_time += duration

                # --------------------------
                # save image
                # --------------------------
                prediction = sess.run([pred_color, raw_output_up],
                                      feed_dict=feed_dict)
                predi = np.array(prediction[0][0, :, :, :]).astype(dtype=np.uint8)
                predi_id = np.array(prediction[1][-1, ...]).astype(dtype=np.uint8)
                data_index = (step * config.TIMESEQUENCE_LENGTH +
                              config.TIMESEQUENCE_LENGTH - 1)
                buff = imagereader_val._dataset[data_index][-1]
                buff = buff.split('/')
                filename = buff[-1].split('.')[0]
                if filename_memory == buff[-1].split('.')[0]:
                    filename_count += 1
                    filename = buff[-1].split('.')[0] + "_" + \
                        str(filename_count).zfill(6) + ".png"
                else:
                    filename_memory = buff[-1].split('.')[0]
                    filename = buff[-1].split('.')[0] + "_000000.png"
                    filename_count = 0
                img = cv2.imread(imagereader_val._dataset[data_index][0])
                if imagereader_val._configDataset.USE_IMAGE_ROI:
                    img = img[imagereader_val._configDataset.IMAGE_ROI_MIN_X:
                              imagereader_val._configDataset.IMAGE_ROI_MAX_X,
                              imagereader_val._configDataset.IMAGE_ROI_MIN_Y:
                              imagereader_val._configDataset.IMAGE_ROI_MAX_Y, :]
                # label images for video
                cv2.addWeighted(predi, config.INFERENCE.OVERLAPPING_IMAGE, img,
                                1 - config.INFERENCE.OVERLAPPING_IMAGE, 0, img)
                cv2.imwrite(config.INFERENCE.SAVEDIR_IMAGES + 'pred_' + filename,
                            predi[:, :, (2, 1, 0)])
                cv2.imwrite(config.INFERENCE.SAVEDIR_IMAGES + 'overlay_' + filename,
                            img[:, :, (2, 1, 0)])

                # --------------------------
                # flickering evaluation
                # --------------------------
                # to measure flickering between non-ground-truth classes, multiply
                # it with the predicted result (add 1, since True * class 0 = 0!)
                diff_img = np.array(
                    (predi_id != training_batch['blob_label'][-1, :, :, 0]),
                    np.float32) * np.array(
                        training_batch['blob_label'][-1, :, :, 0] + 1, np.float32)
                diff_img1 = np.array(predi_id, np.float32)
                if step > 0:
                    # skip step = 0, since there is no reference image
                    flickering = np.sum((diff_img != prediction_old))
                    flickering_sum += flickering
                    flickering_img_size += predi_id.shape[0] * predi_id.shape[1]
                    flickering1 = np.sum((diff_img1 != prediction_old1))
                    flickering_sum1 += flickering1
                    flickering_img_size1 += predi_id.shape[0] * predi_id.shape[1]
                prediction_old = diff_img
                prediction_old1 = diff_img1

            # determine average time
            average_inference_time = average_inference_time / int(
                imagereader_val._dataset_amount /
                (config.BATCH_SIZE * config.TIMESEQUENCE_LENGTH) - 50)

        mIoU_value = sess.run(mIoU)
        print('flickering_sum: ', flickering_sum)
        print('FP: ', float(flickering_sum) / float(flickering_img_size))
        print('--------------')
        print('flickering_sum1: ', flickering_sum1)
        print('FIP: ', float(flickering_sum1) / float(flickering_img_size1))
        print('--------------')
        print(float(flickering_sum) / float(flickering_img_size),
              float(flickering_sum1) / float(flickering_img_size1))
        print('--------------')

        if plot_confusionMatrix:
            confusion_matrix = sess.run(confusion)
            # print Accuracy:
            np.set_printoptions(linewidth=np.inf)  # 150)
            acc_value = float(np.sum(np.diag(confusion_matrix))) / float(
                np.sum(confusion_matrix))
    else:
        print('[ERROR] network architecture does not exist!!! Please check your spelling!')
        raise NotImplementedError

    # --------------------------------------------
    # create optimized pb-model
    # --------------------------------------------
    if config.FREEZEINFERENCEGRAPH.MODE:
        output_nodes = config.FREEZEINFERENCEGRAPH.OUTPUT_NODE_NAMES.split(',')
        input_nodes = config.FREEZEINFERENCEGRAPH.INPUT_NODE_NAMES.split(',')
        frozen_graph_def = tf.compat.v1.graph_util.convert_variables_to_constants(
            sess, tf.compat.v1.get_default_graph().as_graph_def(), output_nodes)

        # Write frozen graph def to pb file for use in C++
        output_graph_path = config.EVALUATION.MODELPATH + '/model.pb'
        with tf.io.gfile.GFile(output_graph_path, "wb") as f:
            f.write(frozen_graph_def.SerializeToString())

        transforms = [
            'remove_nodes(op=Identity)', 'merge_duplicate_nodes',
            'strip_unused_nodes', 'fold_constants(ignore_errors=true)',
            'fold_batch_norms', 'remove_device'
        ]
        optimized_graph_def = TransformGraph(frozen_graph_def, input_nodes,
                                             output_nodes, transforms)
        print('inference model saved in ', output_graph_path)

    # ------------------------------------
    # apply TensorRT
    # ------------------------------------
    if config.FREEZEINFERENCEGRAPH.MODE:
        config_TensorRT = tf.ConfigProto()
        # config_TensorRT.gpu_options.per_process_gpu_memory_fraction = 0.5
        config_TensorRT.gpu_options.allow_growth = True
        converter = trt.TrtGraphConverter(input_graph_def=optimized_graph_def,
                                          session_config=config_TensorRT,
                                          max_workspace_size_bytes=4000000000,
                                          nodes_blacklist=output_nodes,
                                          is_dynamic_op=False,
                                          precision_mode='FP16')
        converted_graph_def = converter.convert()

        # Write TensorRT graph def to pb file for use in C++
        output_graph_TensorRT_path = config.EVALUATION.MODELPATH + '/tensorrt_model.pb'
        with tf.io.gfile.GFile(output_graph_TensorRT_path, "wb") as f:
            f.write(converted_graph_def.SerializeToString())
        print('TensorRT-model saved in ', output_graph_TensorRT_path)

    # --------------------------------------------
    # close session
    # --------------------------------------------
    writer.close()
    sess.close()
    tf.compat.v1.reset_default_graph()

    # ------------------------------------------
    # define inference-function for model.pb
    # ------------------------------------------
    def determineInferenceTime(path2frozenmodel):
        # We load the protobuf file from the disk and parse it to retrieve the
        # unserialized graph_def
        with tf.gfile.GFile(path2frozenmodel, "rb") as f:
            graph_def = tf.GraphDef()
            graph_def.ParseFromString(f.read())

        for node in graph_def.node:
            if node.op == 'RefSwitch':
                node.op = 'Switch'
                for index in range(len(node.input)):
                    if 'moving_' in node.input[index]:
                        node.input[index] = node.input[index] + '/read'
            elif node.op == 'AssignSub':
                node.op = 'Sub'
                if 'use_locking' in node.attr:
                    del node.attr['use_locking']
            elif node.op == 'AssignAdd':
                node.op = 'Add'
                if 'use_locking' in node.attr:
                    del node.attr['use_locking']
            elif node.op == 'AssignMovingAvg':
                node.op = 'MovingAvg'
                if 'use_locking' in node.attr:
                    del node.attr['use_locking']

        # Then, we import the graph_def into a new Graph and return it
        with tf.Graph().as_default() as graph:
            # The name var will prefix every op/node in your graph.
            # Since we load everything in a new graph, this is not needed.
            tf.import_graph_def(graph_def, name="")

        # print output node names
        if config.FREEZEINFERENCEGRAPH.PRINT_OUTPUT_NODE_NAMES:
            for op in graph.get_operations():
                print(str(op.name))

        ##### init network
        if config.ARCHITECTURE == 'semantic_segmentation':
            image_tensor = graph.get_tensor_by_name('Placeholder_1:0')
            output_tensor = graph.get_tensor_by_name('ArgMax:0')

        # init session
        # Note: we don't need to initialize/restore anything. There are no
        # Variables in this graph, only hardcoded constants.
        sess_inf = tf.compat.v1.Session(graph=graph)

        average_inference_time_frozenModel = 0.0
        for step in trange(
                int(imagereader_val._dataset_amount /
                    (config.BATCH_SIZE * config.TIMESEQUENCE_LENGTH)),
                desc='inference', leave=True):
            # get images from dataset
            training_batch = imagereader_val.getNextMinibatch()
            if config.ARCHITECTURE == 'semantic_segmentation':
                feed_dict = {image_tensor: training_batch['blob_data']}

            # apply inference
            start_time = time.time()
            if config.ARCHITECTURE == 'semantic_segmentation':
                sess_inf.run(output_tensor, feed_dict=feed_dict)
            duration_inf = time.time() - start_time
            # skip the first 50 images
            if step >= 50:
                average_inference_time_frozenModel += duration_inf

        sess_inf.close()
        average_inference_time_frozenModel = average_inference_time_frozenModel / int(
            imagereader_val._dataset_amount /
            (config.BATCH_SIZE * config.TIMESEQUENCE_LENGTH) - 50)
        return average_inference_time_frozenModel

    # ------------------------------------------
    # test model - optimized model
    # ------------------------------------------
    if config.FREEZEINFERENCEGRAPH.MODE:
        ### apply optimized model (model.pb)
        path2frozenmodel_opt = config.EVALUATION.MODELPATH + '/model.pb'
        average_inference_time_opt = determineInferenceTime(path2frozenmodel_opt)
        ### apply TensorRT model (tensorrt_model.pb)
        path2frozenmodel_tensorrt = config.EVALUATION.MODELPATH + '/tensorrt_model.pb'
        average_inference_time_tensorrt = determineInferenceTime(
            path2frozenmodel_tensorrt)
        print('average time optimized model: {:.2f} ms'.format(
            average_inference_time_opt * 1000.0))
        print('average time TensorRT: {:.2f} ms'.format(
            average_inference_time_tensorrt * 1000.0))

    # --------------------------------------------------------------------
    # Show results
    # --------------------------------------------------------------------
    print('average time: {:.2f} ms'.format(average_inference_time * 1000.0))
    if plot_confusionMatrix and config.ARCHITECTURE == 'semantic_segmentation':
        # determine class-wise IoU
        buff = 0.0
        print('-------------------------------------------------------------')
        for iter in range(config.DATASET_VAL.NUM_CLASSES):
            if np.sum(confusion_matrix[iter, :]) == 0:  # avoid division by zero
                IoU = 0.0
            else:
                IoU = 100.0 * confusion_matrix[iter, iter] / (
                    np.sum(confusion_matrix[iter, :]) +
                    np.sum(confusion_matrix[:, iter]) -
                    confusion_matrix[iter, iter])
            buff = buff + IoU
            print('{}: {}'.format(config.DATASET_VAL.CLASSES[iter], IoU))
        print('-------------------------------------------------------------')

    print('dataset: {} - {}'.format(config.DATASET_NAME, config.DATASET_WEATHER))
    print('Accuracy: {}'.format(acc_value))
    print('mIoU: {}'.format(mIoU_value))
    print('average time: {:.2f} ms'.format(average_inference_time * 1000.0))
    print('-------------------------------------------------------------')
    if plot_confusionMatrix and config.ARCHITECTURE == 'semantic_segmentation':
        print('-------------------------------------------------------------')
        print(confusion_matrix)
        print('-------------------------------------------------------------')
        # plot_confusion_matrix(confusion_matrix, config.DATASET_VAL.CLASSES)

    return mIoU_value
def inference(run, iterations, ckpt_path, inference_input_file,
              inference_output_file, hparams, num_workers=1, jobid=0,
              scope=None):
    """Perform translation."""
    if hparams.inference_indices:
        assert num_workers == 1

    model_creator = get_model_creator(hparams)
    infer_model = model_helper.create_infer_model(model_creator, hparams, scope)
    sess, loaded_infer_model = start_sess_and_load_model(infer_model, ckpt_path,
                                                         hparams)

    # FIXME (bryce): Set to False to disable inference from frozen graph and
    # run fast again
    if True:
        frozen_graph = None
        with infer_model.graph.as_default():
            output_node_names = ['hash_table_Lookup_1/LookupTableFindV2']
            other_node_names = [
                'MakeIterator', 'IteratorToStringHandle', 'init_all_tables',
                'NoOp', 'dynamic_seq2seq/decoder/NoOp'
            ]
            frozen_graph = tf.graph_util.convert_variables_to_constants(
                sess,
                tf.get_default_graph().as_graph_def(),
                output_node_names=output_node_names + other_node_names)

            from tensorflow.python.compiler.tensorrt import trt_convert as trt
            converter = trt.TrtGraphConverter(
                input_graph_def=frozen_graph,
                nodes_blacklist=output_node_names,
                is_dynamic_op=True,
                max_batch_size=hparams.infer_batch_size,
                max_beam_size=hparams.beam_width,
                max_src_seq_len=hparams.src_max_len)
            frozen_graph = converter.convert()

        with tf.Graph().as_default():
            tf.graph_util.import_graph_def(frozen_graph, name="")
            sess = tf.Session(graph=tf.get_default_graph(),
                              config=utils.get_config_proto(
                                  num_intra_threads=hparams.num_intra_threads,
                                  num_inter_threads=hparams.num_inter_threads))
            iterator = iterator_utils.BatchedInput(
                initializer=tf.get_default_graph().get_operation_by_name(
                    infer_model.iterator.initializer.name),
                source=tf.get_default_graph().get_tensor_by_name(
                    infer_model.iterator.source.name),
                target_input=None,
                target_output=None,
                source_sequence_length=tf.get_default_graph().get_tensor_by_name(
                    infer_model.iterator.source_sequence_length.name),
                target_sequence_length=None)
            infer_model = model_helper.InferModel(
                graph=tf.get_default_graph(),
                model=infer_model.model,
                src_placeholder=tf.get_default_graph().get_tensor_by_name(
                    infer_model.src_placeholder.name),
                batch_size_placeholder=tf.get_default_graph().get_tensor_by_name(
                    infer_model.batch_size_placeholder.name),
                iterator=iterator)

    if num_workers == 1:
        single_worker_inference(run, iterations, sess, infer_model,
                                loaded_infer_model, inference_input_file,
                                inference_output_file, hparams)
    else:
        multi_worker_inference(sess, infer_model, loaded_infer_model,
                               inference_input_file, inference_output_file,
                               hparams, num_workers=num_workers, jobid=jobid)
    sess.close()
import os
import shutil
import timeit

import tensorflow as tf
from tensorflow.python.compiler.tensorrt import trt_convert as trt

org_savedmodel_dir = "output_savedmodel_dir"
tensorrt_savedmodel_dir = "converted_savedmodel_dir"

if not os.path.exists(org_savedmodel_dir):
    raise FileNotFoundError("notfound")
if os.path.exists(tensorrt_savedmodel_dir):
    shutil.rmtree(tensorrt_savedmodel_dir)
os.mkdir(tensorrt_savedmodel_dir)

converter = trt.TrtGraphConverter(input_saved_model_dir=org_savedmodel_dir)
converter.convert()
converter.save(tensorrt_savedmodel_dir)

# Evaluation for Original SavedModel ###
with tf.Session() as sess:
    meta_graph = tf.saved_model.loader.load(sess, [tf.saved_model.SERVING],
                                            org_savedmodel_dir)
    model_signature = meta_graph.signature_def['serving_default']
    input_signature = model_signature.inputs
    output_signature = model_signature.outputs

    start = timeit.default_timer()
    feed_dict = {
        sess.graph.get_tensor_by_name(input_signature['myInput'].name):
            mnist.test.images[:10]
import timeit
import os
import shutil

import tensorflow as tf
from tensorflow.python.compiler.tensorrt import trt_convert as trt

org_savedmodel_dir = os.path.join(os.getcwd(), "savedmodel_dir_latest")
tensorrt_savedmodel_dir = os.path.join(os.getcwd(), "tensorrt_savedmodel_dir")

if not os.path.exists(org_savedmodel_dir):
    raise FileNotFoundError("notfound")
if os.path.exists(tensorrt_savedmodel_dir):
    shutil.rmtree(tensorrt_savedmodel_dir)
os.mkdir(tensorrt_savedmodel_dir)

converter = trt.TrtGraphConverter(
    input_saved_model_dir=org_savedmodel_dir,
    input_saved_model_signature_key="predict_images",
    # precision_mode=trt.TrtPrecisionMode.INT8,
    # use_calibration=False,
)
converter.convert()
converter.save(tensorrt_savedmodel_dir)
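# A minimal, hypothetical smoke test of the converted SavedModel above,
# assuming a TF1.x runtime: load it back and run the "predict_images"
# signature once. The input shape is an assumption and must match the model.
import numpy as np

with tf.Session(graph=tf.Graph()) as sess:
    meta_graph = tf.saved_model.loader.load(
        sess, [tf.saved_model.tag_constants.SERVING], tensorrt_savedmodel_dir)
    sig = meta_graph.signature_def['predict_images']
    input_name = list(sig.inputs.values())[0].name
    output_name = list(sig.outputs.values())[0].name
    result = sess.run(output_name,
                      feed_dict={input_name: np.zeros((1, 784), np.float32)})
    print('output shape:', result.shape)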