def torch2trt_dynamic(module,
                      inputs,
                      input_names=None,
                      output_names=None,
                      log_level=trt.Logger.ERROR,
                      max_batch_size=1,
                      fp16_mode=False,
                      max_workspace_size=0,
                      opt_shape_param=None,
                      strict_type_constraints=False,
                      keep_network=True,
                      int8_mode=False,
                      int8_calib_dataset=None,
                      int8_calib_algorithm=DEFAULT_CALIBRATION_ALGORITHM):
    # INT8 takes priority over FP16 when both are requested
    if int8_mode and fp16_mode:
        fp16_mode = False

    inputs_in = inputs

    # copy inputs to avoid modifications to source data
    inputs = [tensor.clone() for tensor in inputs]

    logger = trt.Logger(log_level)
    builder = trt.Builder(logger)

    EXPLICIT_BATCH = 1 << (int)(
        trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
    network = builder.create_network(EXPLICIT_BATCH)

    with ShapeConverter(), ConversionContext(network) as ctx:
        if isinstance(inputs, list):
            inputs = tuple(inputs)
        if not isinstance(inputs, tuple):
            inputs = (inputs, )
        ctx.add_inputs(inputs, input_names, opt_shape_param)

        outputs = module(*inputs)

        if not isinstance(outputs, tuple) and not isinstance(outputs, list):
            outputs = (outputs, )
        ctx.mark_outputs(outputs, output_names)

    torch.cuda.empty_cache()

    builder.max_workspace_size = max_workspace_size
    builder.max_batch_size = max_batch_size
    builder.strict_type_constraints = strict_type_constraints

    config = builder.create_builder_config()
    config.max_workspace_size = max_workspace_size
    profile = builder.create_optimization_profile()

    if input_names is None:
        input_names = ['input_%d' % i for i in range(len(inputs))]
    for input_index, input_tensor in enumerate(inputs):
        if opt_shape_param is not None:
            min_shape = tuple(opt_shape_param[input_index][0][:])
            opt_shape = tuple(opt_shape_param[input_index][1][:])
            max_shape = tuple(opt_shape_param[input_index][2][:])
        else:
            opt_shape = tuple(input_tensor.shape)
            min_shape = opt_shape
            max_shape = opt_shape
        profile.set_shape(input_names[input_index], min_shape, opt_shape,
                          max_shape)
    config.add_optimization_profile(profile)

    if fp16_mode:
        builder.fp16_mode = fp16_mode
        config.set_flag(trt.BuilderFlag.FP16)

    if int8_mode:
        # default to use input tensors for calibration
        if int8_calib_dataset is None:
            int8_calib_dataset = TensorBatchDataset(inputs_in)
        config.set_flag(trt.BuilderFlag.INT8)
        config.int8_calibrator = DatasetCalibrator(
            input_names,
            profile,
            inputs_in,
            int8_calib_dataset,
            batch_size=opt_shape[0],
            algorithm=int8_calib_algorithm)
        config.set_calibration_profile(profile)
        builder.int8_mode = int8_mode
        builder.int8_calibrator = config.int8_calibrator

    engine = builder.build_engine(network, config)

    module_trt = TRTModule(engine, ctx.input_names, ctx.output_names)

    if keep_network:
        module_trt.network = network

    return module_trt
def from_onnx(
        onnx_path: Union[Path, str],
        save_path: Union[Path, str],
        inputs: List[IOShape],
        outputs: List[IOShape],
        int8_calibrator=None,
        create_model_config: bool = True,
        override: bool = False,
):
    """Takes an ONNX file and creates a TensorRT engine to run inference with.

    From https://github.com/layerism/TensorRT-Inference-Server-Tutorial

    FIXME: bug exists: TRT 6.x.x does not support opset 10 used in ResNet50 (ONNX).
    """
    import tensorrt as trt

    # convert to Path objects up front so .with_suffix() below works on strings too
    onnx_path = Path(onnx_path)
    assert onnx_path.exists()
    save_path = Path(save_path)

    if save_path.with_suffix('.plan').exists():
        if not override:  # file exists yet override flag is not set
            logger.info('Use cached model')
            return True

    # get arch name
    arch_name = parse_path(save_path)['architecture']

    # trt serving model repository is different from others:
    # `<model-name>/<framework>-tensorrt/<version>/model.plan`
    save_path = save_path.with_suffix('')
    save_path.mkdir(parents=True, exist_ok=True)

    # Save TRT engine
    trt_logger = trt.Logger(trt.Logger.WARNING)
    with trt.Builder(trt_logger) as builder:
        with builder.create_network(
                1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)) as network:
            with trt.OnnxParser(network, trt_logger) as parser:
                builder.max_workspace_size = GiB(1)  # 1GB
                builder.max_batch_size = 1
                if int8_calibrator is not None:
                    builder.int8_mode = True
                    builder.int8_calibrator = int8_calibrator

                print('Loading ONNX file from path {}...'.format(onnx_path))
                with open(onnx_path, 'rb') as model:
                    parser.parse(model.read())
                engine = builder.build_cuda_engine(network)
                with open(save_path / 'model.plan', 'wb') as f:
                    f.write(engine.serialize())

    # create model configuration file
    if create_model_config:
        TRTConverter.generate_trt_config(
            save_path.parent,
            arch_name=arch_name,
            inputs=inputs,
            outputs=outputs
        )
    return True
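
# A minimal usage sketch for from_onnx. The IOShape constructor arguments
# (name/shape/dtype) are assumptions for illustration; check the repo's
# IOShape definition for the real signature.
inputs = [IOShape(shape=[1, 3, 224, 224], dtype='float32', name='input')]
outputs = [IOShape(shape=[1, 1000], dtype='float32', name='output')]
from_onnx(
    onnx_path='resnet50.onnx',
    save_path='model-repo/resnet50/pytorch-tensorrt/1/resnet50',
    inputs=inputs,
    outputs=outputs,
)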
def FFDNet(
    clip: vs.VideoNode,
    sigma: float = 5.0,
    use_cuda_graph: bool = False,
    logger: trt.Logger = trt.Logger(trt.Logger.WARNING)
) -> vs.VideoNode:

    assert clip.format.id == vs.RGBS

    width, height = clip.width, clip.height
    sigma /= 255

    runtime = trt.Runtime(logger)

    with open(f"ffdnet_{width}_{height}.engine", "rb") as f:
        engine = runtime.deserialize_cuda_engine(f.read())

    execution_context = engine.create_execution_context()

    input_size = execution_context.get_strides(0)[0] * 4
    input_shape = execution_context.get_binding_shape(0)
    sigma_size = execution_context.get_strides(1)[0] * 4
    sigma_shape = execution_context.get_binding_shape(1)
    output_size = execution_context.get_strides(2)[0] * 4
    output_shape = execution_context.get_binding_shape(2)

    h_sigma = checkError(
        cuda.cuMemHostAlloc(sigma_size, cuda.CU_MEMHOSTALLOC_WRITECOMBINED))
    h_sigma = UniqueResource(h_sigma, cuda.cuMemFreeHost, h_sigma)
    h_sigma_pointer = ctypes.cast(ctypes.c_void_p(h_sigma.obj),
                                  ctypes.POINTER(ctypes.c_float))
    h_sigma_array = np.ctypeslib.as_array(
        h_sigma_pointer, shape=(sigma_size // 4, )).reshape(sigma_shape)

    d_sigma = checkError(cuda.cuMemAlloc(sigma_size))
    d_sigma = UniqueResource(d_sigma, cuda.cuMemFree, d_sigma)

    h_input = checkError(
        cuda.cuMemHostAlloc(input_size, cuda.CU_MEMHOSTALLOC_WRITECOMBINED))
    h_input = UniqueResource(h_input, cuda.cuMemFreeHost, h_input)
    h_input_pointer = ctypes.cast(ctypes.c_void_p(h_input.obj),
                                  ctypes.POINTER(ctypes.c_float))
    h_input_array = np.ctypeslib.as_array(
        h_input_pointer, shape=(input_size // 4, )).reshape(input_shape)

    d_input = checkError(cuda.cuMemAlloc(input_size))
    d_input = UniqueResource(d_input, cuda.cuMemFree, d_input)

    d_output = checkError(cuda.cuMemAlloc(output_size))
    d_output = UniqueResource(d_output, cuda.cuMemFree, d_output)

    h_output = checkError(cuda.cuMemAllocHost(output_size))
    h_output = UniqueResource(h_output, cuda.cuMemFreeHost, h_output)
    h_output_pointer = ctypes.cast(ctypes.c_void_p(h_output.obj),
                                   ctypes.POINTER(ctypes.c_float))
    h_output_array = np.ctypeslib.as_array(
        h_output_pointer, shape=(output_size // 4, )).reshape(output_shape)

    stream = checkError(
        cuda.cuStreamCreate(cuda.CUstream_flags.CU_STREAM_NON_BLOCKING.value))
    stream = UniqueResource(stream, cuda.cuStreamDestroy, stream)

    h_sigma_array[...] = sigma
    checkError(
        cuda.cuMemcpyHtoDAsync(d_sigma.obj, h_sigma.obj, sigma_size,
                               stream.obj))

    def execute():
        checkError(
            cuda.cuMemcpyHtoDAsync(d_input.obj, h_input.obj, input_size,
                                   stream.obj))
        execution_context.execute_async_v2(
            [d_input.obj, d_sigma.obj, d_output.obj], stream_handle=stream.obj)
        checkError(
            cuda.cuMemcpyDtoHAsync(h_output.obj, d_output.obj, output_size,
                                   stream.obj))

    if use_cuda_graph:
        checkError(
            cuda.cuStreamBeginCapture(
                stream.obj,
                cuda.CUstreamCaptureMode.CU_STREAM_CAPTURE_MODE_RELAXED))
        execute()
        graph = checkError(cuda.cuStreamEndCapture(stream.obj))
        graphexec, error_node = checkError(
            cuda.cuGraphInstantiate(graph, logBuffer=b"", bufferSize=0))
        graphexec = UniqueResource(graphexec, cuda.cuGraphExecDestroy,
                                   graphexec)
        checkError(cuda.cuGraphDestroy(graph))

    def inference_core(n, f):
        for i in range(3):
            h_input_array[0, i, :, :] = np.asarray(
                _get_array(f, plane=i, read=True))

        if use_cuda_graph:
            checkError(cuda.cuGraphLaunch(graphexec.obj, stream.obj))
        else:
            execute()

        fout = f.copy()
        fout.get_write_array(0)  # triggers COW

        checkError(cuda.cuStreamSynchronize(stream.obj))

        for i in range(3):
            np.asarray(_get_array(fout, plane=i,
                                  read=False))[...] = h_output_array[0, i, :, :]

        return fout

    return core.std.ModifyFrame(clip, clips=[clip], selector=inference_core)
def run(args):
    onnx_filename = run_onnx_util.onnx_model_file(args.test_dir,
                                                  args.model_file)
    input_names, output_names = run_onnx_util.onnx_input_output_names(
        onnx_filename)
    test_data_dir = os.path.join(args.test_dir, 'test_data_set_0')
    inputs, outputs = run_onnx_util.load_test_data(
        test_data_dir, input_names, output_names)

    with open(onnx_filename, 'rb') as f:
        onnx_proto = f.read()

    if args.debug:
        logger = tensorrt.Logger(tensorrt.Logger.Severity.INFO)
    else:
        logger = tensorrt.Logger()

    builder = tensorrt.Builder(logger)
    if args.fp16_mode:
        builder.fp16_mode = True
    # TODO(hamaji): Infer batch_size from inputs.
    builder.max_batch_size = args.batch_size
    network = builder.create_network()
    parser = tensorrt.OnnxParser(network, logger)
    if not parser.parse(onnx_proto):
        for i in range(parser.num_errors):
            sys.stderr.write('ONNX import failure: %s\n' %
                             parser.get_error(i))
        raise RuntimeError('ONNX import failed')
    engine = builder.build_cuda_engine(network)

    context = engine.create_execution_context()

    assert len(inputs) + len(outputs) == engine.num_bindings
    for i, (_, input) in enumerate(inputs):
        assert args.batch_size == input.shape[0]
        assert input.shape[1:] == engine.get_binding_shape(i)
    for i, (_, output) in enumerate(outputs):
        assert args.batch_size == output.shape[0]
        i += len(inputs)
        assert output.shape[1:] == engine.get_binding_shape(i)

    inputs = [v for n, v in inputs]
    outputs = [v for n, v in outputs]
    gpu_inputs = to_gpu(inputs)
    gpu_outputs = []
    for output in outputs:
        gpu_outputs.append(cupy.zeros_like(cupy.array(output)))
    bindings = [a.data.ptr for a in gpu_inputs]
    bindings += [a.data.ptr for a in gpu_outputs]

    context.execute(args.batch_size, bindings)

    actual_outputs = to_cpu(gpu_outputs)

    for i, (name, expected, actual) in enumerate(
            zip(output_names, outputs, actual_outputs)):
        np.testing.assert_allclose(expected, actual,
                                   rtol=args.rtol, atol=args.atol), name
        print('%s: OK' % name)
    print('ALL OK')

    def compute():
        context.execute(args.batch_size, bindings)
        cupy.cuda.device.Device().synchronize()

    return run_onnx_util.run_benchmark(compute, args.iterations)
import tensorrt as trt

TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
EXPLICIT_BATCH = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)

onnx_file = 'mobilenet.onnx'
trt_file = 'mobilenet.trt'
batch_size = 1

# Takes an ONNX file and creates a TensorRT engine to run inference with
with trt.Builder(TRT_LOGGER) as builder, builder.create_network(
        EXPLICIT_BATCH) as network, trt.OnnxParser(network,
                                                   TRT_LOGGER) as parser:
    builder.max_workspace_size = 1 << 28  # 256MiB
    builder.max_batch_size = batch_size
    builder.fp16_mode = True  # set to False for FP32 mode

    # Parse model file
    with open(onnx_file, 'rb') as model:
        print('Beginning ONNX file parsing')
        if not parser.parse(model.read()):
            print('ERROR: Failed to parse the ONNX file.')
            for error in range(parser.num_errors):
                print(parser.get_error(error))
    print('Completed parsing of ONNX file')

    engine = builder.build_cuda_engine(network)
    print("Completed creating Engine")
    with open(trt_file, "wb") as f:
        f.write(engine.serialize())
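
# A follow-up sketch: deserialize the engine saved above and run one
# inference with pycuda. The MobileNet I/O shapes (1, 3, 224, 224) ->
# (1, 1000) are assumptions for illustration.
import numpy as np
import pycuda.autoinit  # creates and manages a CUDA context
import pycuda.driver as cuda

runtime = trt.Runtime(TRT_LOGGER)
with open(trt_file, 'rb') as f:
    engine = runtime.deserialize_cuda_engine(f.read())
context = engine.create_execution_context()

h_input = np.random.rand(1, 3, 224, 224).astype(np.float32)
h_output = np.empty((1, 1000), dtype=np.float32)
d_input = cuda.mem_alloc(h_input.nbytes)
d_output = cuda.mem_alloc(h_output.nbytes)
stream = cuda.Stream()

cuda.memcpy_htod_async(d_input, np.ascontiguousarray(h_input), stream)
context.execute_async_v2([int(d_input), int(d_output)],
                         stream_handle=stream.handle)
cuda.memcpy_dtoh_async(h_output, d_output, stream)
stream.synchronize()
print(h_output.argmax())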
# tensorrt-lib

import os
import tensorrt as trt
from calibrator import Calibrator

# add verbose
TRT_LOGGER = trt.Logger(trt.Logger.VERBOSE)  # ** engine visualization **
f_layer = open('log.txt', 'w')

# create tensorrt-engine
# fixed and dynamic
def get_engine(max_batch_size=1, onnx_file_path="", engine_file_path="", \
               fp32_mode=False, fp16_mode=False, int8_mode=False, calibration_stream=None,
               calibration_table_path="", save_engine=False, strategy=None):
    """Attempts to load a serialized engine if available,
       otherwise builds a new TensorRT engine and saves it."""

    def build_engine(max_batch_size, save_engine):
        """Takes an ONNX file and creates a TensorRT engine to run inference with"""
        # 1 << NetworkDefinitionCreationFlag.EXPLICIT_BATCH
        with trt.Builder(TRT_LOGGER) as builder, \
                builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)) as network, \
                trt.OnnxParser(network, TRT_LOGGER) as parser:

            # parse onnx model file
            if not os.path.exists(onnx_file_path):
                quit('ONNX file {} not found'.format(onnx_file_path))
            print('Loading ONNX file from path {}...'.format(onnx_file_path))
            with open(onnx_file_path, 'rb') as model:
import tensorrt as trt

# Input parameters specific to the trained model
uff_file_name = 'saved_model.uff'  # Name of uff file that defines the trained model
input_node_name = 'input/IteratorGetNext'  # Input node (best to name it with tf.name.scope)
input_node_dims = (1, 1, 4096)  # Input dimensions to trained model

# Input parameter for inference
batch_size = 128  # Batch size to optimize to. This should be used for inference
workspace_size = 1073741824  # 1 GB, for example
use_fp16 = True  # Do you want to use the float16 type
output_file_name = 'saved_model.plan'  # Name of output file

# Make the plan file
builder = trt.Builder(trt.Logger(trt.Logger.INFO))
network = builder.create_network()
parser = trt.UffParser()
parser.register_input(input_node_name, input_node_dims)
parser.parse(uff_file_name, network)
builder.max_batch_size = batch_size
builder.max_workspace_size = workspace_size
builder.fp16_mode = use_fp16
engine = builder.build_cuda_engine(network)
with open(output_file_name, 'wb') as f:
    f.write(engine.serialize())
def __init__(self, weights='yolov5s.pt', device=torch.device('cpu'), dnn=False, data=None, fp16=False):
    # Usage:
    #   PyTorch:               weights = *.pt
    #   TorchScript:                     *.torchscript
    #   ONNX Runtime:                    *.onnx
    #   ONNX OpenCV DNN:                 *.onnx with --dnn
    #   OpenVINO:                        *.xml
    #   CoreML:                          *.mlmodel
    #   TensorRT:                        *.engine
    #   TensorFlow SavedModel:           *_saved_model
    #   TensorFlow GraphDef:             *.pb
    #   TensorFlow Lite:                 *.tflite
    #   TensorFlow Edge TPU:             *_edgetpu.tflite
    from models.experimental import attempt_download, attempt_load  # scoped to avoid circular import

    super().__init__()
    w = str(weights[0] if isinstance(weights, list) else weights)
    pt, jit, onnx, xml, engine, coreml, saved_model, pb, tflite, edgetpu, tfjs = self.model_type(w)  # get backend
    stride, names = 32, [f'class{i}' for i in range(1000)]  # assign defaults
    w = attempt_download(w)  # download if not local
    fp16 &= (pt or jit or onnx or engine) and device.type != 'cpu'  # FP16
    if data:  # data.yaml path (optional)
        with open(data, errors='ignore') as f:
            names = yaml.safe_load(f)['names']  # class names

    if pt:  # PyTorch
        model = attempt_load(weights if isinstance(weights, list) else w, map_location=device)
        stride = max(int(model.stride.max()), 32)  # model stride
        names = model.module.names if hasattr(model, 'module') else model.names  # get class names
        model.half() if fp16 else model.float()
        self.model = model  # explicitly assign for to(), cpu(), cuda(), half()
    elif jit:  # TorchScript
        LOGGER.info(f'Loading {w} for TorchScript inference...')
        extra_files = {'config.txt': ''}  # model metadata
        model = torch.jit.load(w, _extra_files=extra_files)
        model.half() if fp16 else model.float()
        if extra_files['config.txt']:
            d = json.loads(extra_files['config.txt'])  # extra_files dict
            stride, names = int(d['stride']), d['names']
    elif dnn:  # ONNX OpenCV DNN
        LOGGER.info(f'Loading {w} for ONNX OpenCV DNN inference...')
        check_requirements(('opencv-python>=4.5.4', ))
        net = cv2.dnn.readNetFromONNX(w)
    elif onnx:  # ONNX Runtime
        LOGGER.info(f'Loading {w} for ONNX Runtime inference...')
        cuda = torch.cuda.is_available()
        check_requirements(('onnx', 'onnxruntime-gpu' if cuda else 'onnxruntime'))
        import onnxruntime
        providers = ['CUDAExecutionProvider', 'CPUExecutionProvider'] if cuda else ['CPUExecutionProvider']
        session = onnxruntime.InferenceSession(w, providers=providers)
        meta = session.get_modelmeta().custom_metadata_map  # metadata
        if 'stride' in meta:
            stride, names = int(meta['stride']), eval(meta['names'])
    elif xml:  # OpenVINO
        LOGGER.info(f'Loading {w} for OpenVINO inference...')
        check_requirements(('openvino-dev', ))  # requires openvino-dev: https://pypi.org/project/openvino-dev/
        import openvino.inference_engine as ie
        core = ie.IECore()
        if not Path(w).is_file():  # if not *.xml
            w = next(Path(w).glob('*.xml'))  # get *.xml file from *_openvino_model dir
        network = core.read_network(model=w, weights=Path(w).with_suffix('.bin'))  # *.xml, *.bin paths
        executable_network = core.load_network(network, device_name='CPU', num_requests=1)
    elif engine:  # TensorRT
        LOGGER.info(f'Loading {w} for TensorRT inference...')
        import tensorrt as trt  # https://developer.nvidia.com/nvidia-tensorrt-download
        check_version(trt.__version__, '7.0.0', hard=True)  # require tensorrt>=7.0.0
        Binding = namedtuple('Binding', ('name', 'dtype', 'shape', 'data', 'ptr'))
        logger = trt.Logger(trt.Logger.INFO)
        with open(w, 'rb') as f, trt.Runtime(logger) as runtime:
            model = runtime.deserialize_cuda_engine(f.read())
        bindings = OrderedDict()
        fp16 = False  # default updated below
        for index in range(model.num_bindings):
            name = model.get_binding_name(index)
            dtype = trt.nptype(model.get_binding_dtype(index))
            shape = tuple(model.get_binding_shape(index))
            data = torch.from_numpy(np.empty(shape, dtype=np.dtype(dtype))).to(device)
            bindings[name] = Binding(name, dtype, shape, data, int(data.data_ptr()))
            if model.binding_is_input(index) and dtype == np.float16:
                fp16 = True
        binding_addrs = OrderedDict((n, d.ptr) for n, d in bindings.items())
        context = model.create_execution_context()
        batch_size = bindings['images'].shape[0]
    elif coreml:  # CoreML
        LOGGER.info(f'Loading {w} for CoreML inference...')
        import coremltools as ct
        model = ct.models.MLModel(w)
    else:  # TensorFlow (SavedModel, GraphDef, Lite, Edge TPU)
        if saved_model:  # SavedModel
            LOGGER.info(f'Loading {w} for TensorFlow SavedModel inference...')
            import tensorflow as tf
            keras = False  # assume TF1 saved_model
            model = tf.keras.models.load_model(w) if keras else tf.saved_model.load(w)
        elif pb:  # GraphDef https://www.tensorflow.org/guide/migrate#a_graphpb_or_graphpbtxt
            LOGGER.info(f'Loading {w} for TensorFlow GraphDef inference...')
            import tensorflow as tf

            def wrap_frozen_graph(gd, inputs, outputs):
                x = tf.compat.v1.wrap_function(lambda: tf.compat.v1.import_graph_def(gd, name=""), [])  # wrapped
                ge = x.graph.as_graph_element
                return x.prune(tf.nest.map_structure(ge, inputs), tf.nest.map_structure(ge, outputs))

            gd = tf.Graph().as_graph_def()  # graph_def
            with open(w, 'rb') as f:
                gd.ParseFromString(f.read())
            frozen_func = wrap_frozen_graph(gd, inputs="x:0", outputs="Identity:0")
        elif tflite or edgetpu:  # https://www.tensorflow.org/lite/guide/python#install_tensorflow_lite_for_python
            try:  # https://coral.ai/docs/edgetpu/tflite-python/#update-existing-tf-lite-code-for-the-edge-tpu
                from tflite_runtime.interpreter import Interpreter, load_delegate
            except ImportError:
                import tensorflow as tf
                Interpreter, load_delegate = tf.lite.Interpreter, tf.lite.experimental.load_delegate
            if edgetpu:  # Edge TPU https://coral.ai/software/#edgetpu-runtime
                LOGGER.info(f'Loading {w} for TensorFlow Lite Edge TPU inference...')
                delegate = {
                    'Linux': 'libedgetpu.so.1',
                    'Darwin': 'libedgetpu.1.dylib',
                    'Windows': 'edgetpu.dll'}[platform.system()]
                interpreter = Interpreter(model_path=w, experimental_delegates=[load_delegate(delegate)])
            else:  # Lite
                LOGGER.info(f'Loading {w} for TensorFlow Lite inference...')
                interpreter = Interpreter(model_path=w)  # load TFLite model
            interpreter.allocate_tensors()  # allocate
            input_details = interpreter.get_input_details()  # inputs
            output_details = interpreter.get_output_details()  # outputs
        elif tfjs:
            raise Exception('ERROR: YOLOv5 TF.js inference is not supported')
    self.__dict__.update(locals())  # assign all variables to self
# limitations under the License.
#

import ctypes
import numpy as np
from cuda import cudart
import tensorrt as trt

np.random.seed(97)
nIn, cIn, hIn, wIn = 4, 3, 128, 128
data = np.random.rand(nIn * cIn * hIn * wIn).astype(np.float32).reshape(nIn, cIn, hIn, wIn)

np.set_printoptions(precision=8, linewidth=200, suppress=True)
cudart.cudaDeviceSynchronize()

logger = trt.Logger(trt.Logger.ERROR)
builder = trt.Builder(logger)
network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
profile0 = builder.create_optimization_profile()
profile1 = builder.create_optimization_profile()
config = builder.create_builder_config()
config.max_workspace_size = 1 << 30

inputT0 = network.add_input('inputT0', trt.float32, [-1, cIn, hIn, wIn])
layer = network.add_unary(inputT0, trt.UnaryOperation.NEG)
network.mark_output(layer.get_output(0))

profile0.set_shape(inputT0.name, (1, cIn, hIn, wIn), (nIn, cIn, hIn, wIn), (nIn * 2, cIn, hIn, wIn))
profile1.set_shape(inputT0.name, (1, cIn, hIn, wIn), (nIn, cIn, hIn, wIn), (nIn * 2, cIn, hIn, wIn))
config.add_optimization_profile(profile0)
config.add_optimization_profile(profile1)
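
# Continuation sketch (assumed, not part of the original snippet): build the
# engine and switch between the two optimization profiles on one context.
engineString = builder.build_serialized_network(network, config)
engine = trt.Runtime(logger).deserialize_cuda_engine(engineString)
context = engine.create_execution_context()

# With two profiles the engine exposes two sets of bindings; profile i owns
# bindings [i * nBindingPerProfile, (i + 1) * nBindingPerProfile).
nBindingPerProfile = engine.num_bindings // engine.num_optimization_profiles

_, stream = cudart.cudaStreamCreate()
context.set_optimization_profile_async(1, stream)  # select profile 1
context.set_binding_shape(nBindingPerProfile + 0, [nIn, cIn, hIn, wIn])
cudart.cudaStreamSynchronize(stream)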
def torch2trt(module,
              inputs,
              input_names=None,
              output_names=None,
              log_level=trt.Logger.ERROR,
              max_batch_size=1,
              fp16_mode=False,
              max_workspace_size=1 << 25,
              strict_type_constraints=False,
              keep_network=True,
              int8_mode=False,
              int8_calib_dataset=None,
              int8_calib_algorithm=DEFAULT_CALIBRATION_ALGORITHM,
              int8_calib_batch_size=1,
              use_onnx=False,
              **kwargs):

    # capture arguments to provide to context
    kwargs.update(locals())
    kwargs.pop('kwargs')

    inputs_in = inputs

    # copy inputs to avoid modifications to source data
    inputs = [tensor.clone() for tensor in inputs]  # only run single entry

    logger = trt.Logger(log_level)
    builder = trt.Builder(logger)

    if isinstance(inputs, list):
        inputs = tuple(inputs)
    if not isinstance(inputs, tuple):
        inputs = (inputs, )

    # run once to get num outputs
    outputs = module(*inputs)
    if not isinstance(outputs, tuple) and not isinstance(outputs, list):
        outputs = (outputs, )

    if input_names is None:
        input_names = default_input_names(len(inputs))
    if output_names is None:
        output_names = default_output_names(len(outputs))

    if use_onnx:
        f = io.BytesIO()
        torch.onnx.export(module,
                          inputs,
                          f,
                          input_names=input_names,
                          output_names=output_names)
        f.seek(0)
        onnx_bytes = f.read()
        network = builder.create_network(
            1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
        parser = trt.OnnxParser(network, logger)
        parser.parse(onnx_bytes)
    else:
        network = builder.create_network()
        with ConversionContext(network, torch2trt_kwargs=kwargs) as ctx:
            ctx.add_inputs(inputs, input_names)
            outputs = module(*inputs)
            if not isinstance(outputs, tuple) and not isinstance(
                    outputs, list):
                outputs = (outputs, )
            ctx.mark_outputs(outputs, output_names)

    builder.max_workspace_size = max_workspace_size
    builder.fp16_mode = fp16_mode
    builder.max_batch_size = max_batch_size
    builder.strict_type_constraints = strict_type_constraints

    if int8_mode:
        # default to use input tensors for calibration
        if int8_calib_dataset is None:
            int8_calib_dataset = TensorBatchDataset(inputs_in)

        builder.int8_mode = True

        # @TODO(jwelsh): Should we set batch_size=max_batch_size?
        #                Need to investigate memory consumption
        builder.int8_calibrator = DatasetCalibrator(
            inputs,
            int8_calib_dataset,
            batch_size=int8_calib_batch_size,
            algorithm=int8_calib_algorithm)

    engine = builder.build_cuda_engine(network)

    module_trt = TRTModule(engine, input_names, output_names)

    if keep_network:
        module_trt.network = network

    return module_trt
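
# A minimal usage sketch for torch2trt; the model and input below are
# illustrative examples, not part of the function above.
import torch
from torchvision.models import resnet18

model = resnet18(pretrained=True).eval().cuda()
x = torch.randn(1, 3, 224, 224).cuda()

model_trt = torch2trt(model, [x], fp16_mode=True, max_batch_size=1)
with torch.no_grad():
    y = model(x)
    y_trt = model_trt(x)
print(torch.max(torch.abs(y - y_trt)))  # compare TRT output against PyTorch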
dataPath = os.path.dirname(os.path.realpath(__file__)) + "/../../00-MNISTData/"
sys.path.append(dataPath)

np.random.seed(97)
nTrainbatchSize = 128
pbFile = "./model-NCHW.pb"
caffePrototxtFile = "./model.prototxt"
caffeModelFile = "./model.caffemodel"
trtFile = "./model-NCHW.plan"
inputImage = dataPath + '8.png'

np.set_printoptions(precision=4, linewidth=200, suppress=True)
cudart.cudaDeviceSynchronize()

# Load the Caffe model in TensorRT and create an engine --------------------
logger = trt.Logger(trt.Logger.VERBOSE)
if os.path.isfile(trtFile):
    with open(trtFile, 'rb') as f:
        engine = trt.Runtime(logger).deserialize_cuda_engine(f.read())
    if engine == None:
        print("Failed loading engine!")
        exit()
    print("Succeeded loading engine!")
else:
    builder = trt.Builder(logger)
    network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
    config = builder.create_builder_config()
    config.max_workspace_size = 3 << 30
    parser = trt.CaffeParser()
    with open(caffePrototxtFile, 'rb') as f0, open(caffeModelFile, 'rb') as f1:
        net = parser.parse_buffer(f0.read(), f1.read(), network, trt.float32)
return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device) def __repr__(self): return self.__str__() anchors = [] fpn_fm_shape = [ math.ceil(cfg.img_size / stride) for stride in (8, 16, 32, 64, 128) ] for i, size in enumerate(fpn_fm_shape): anchors += make_anchors(cfg, size, size, cfg.scales[i]) # prepare engine with open(cfg.weight, 'rb') as f, trt.Runtime(trt.Logger(trt.Logger.WARNING)) as runtime: engine = runtime.deserialize_cuda_engine(f.read()) inputs, outputs, bindings = [], [], [] stream = cuda.Stream() for binding in engine: size = trt.volume( engine.get_binding_shape(binding)) * engine.max_batch_size dtype = trt.nptype(engine.get_binding_dtype(binding)) # Allocate host and device buffers host_mem = cuda.pagelocked_empty(size, dtype) device_mem = cuda.mem_alloc(host_mem.nbytes) # Append the device buffer to device bindings. bindings.append(int(device_mem))
def load_engine(engine_path):
    TRT_LOGGER = trt.Logger(trt.Logger.WARNING)  # or trt.Logger.INFO
    with open(engine_path, 'rb') as f, trt.Runtime(TRT_LOGGER) as runtime:
        return runtime.deserialize_cuda_engine(f.read())
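
# Illustrative usage of load_engine; the engine path is an example.
engine = load_engine('model.engine')
context = engine.create_execution_context()
print(engine.num_bindings)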
import numpy as np
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit
import utilities as ut

USE_FP16 = True
target_dtype = np.float16 if USE_FP16 else np.float32

# input_batch = ut.random_image(100).numpy()
img_path = "data/test7.jpg"
# input_batch = ut.npz_loader(img_path).numpy()
input_batch = ut.load_image(img_path).unsqueeze(0).numpy()
print(input_batch.shape)
input_batch = np.ascontiguousarray(input_batch, dtype=target_dtype)
print(input_batch)

f = open("inference_models/edsr.trt", "rb")
runtime = trt.Runtime(trt.Logger(trt.Logger.WARNING))
engine = runtime.deserialize_cuda_engine(f.read())
context = engine.create_execution_context()

# need to set input and output precisions to FP16 to fully enable it
output = np.empty([1, 3, 480, 480], dtype=target_dtype)

# allocate device memory
d_input = cuda.mem_alloc(1 * input_batch.nbytes)
d_output = cuda.mem_alloc(1 * output.nbytes)

bindings = [int(d_input), int(d_output)]

stream = cuda.Stream()
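
# Sketch of the inference step that typically follows the allocation above;
# it reuses the names (context, bindings, stream, d_input, d_output, output)
# defined in the preceding snippet.
def predict(batch):
    cuda.memcpy_htod_async(d_input, batch, stream)    # host -> device
    context.execute_async_v2(bindings, stream_handle=stream.handle)
    cuda.memcpy_dtoh_async(output, d_output, stream)  # device -> host
    stream.synchronize()
    return output

result = predict(input_batch)
print(result.shape)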
def export_engine(model, im, file, train, half, simplify, workspace=4, verbose=False, prefix=colorstr('TensorRT:')):
    # YOLOv5 TensorRT export https://developer.nvidia.com/tensorrt
    try:
        assert im.device.type != 'cpu', 'export running on CPU but must be on GPU, i.e. `python export.py --device 0`'
        try:
            import tensorrt as trt
        except Exception:
            if platform.system() == 'Linux':
                check_requirements(('nvidia-tensorrt', ),
                                   cmds=('-U --index-url https://pypi.ngc.nvidia.com', ))
            import tensorrt as trt

        if trt.__version__[0] == '7':  # TensorRT 7 handling https://github.com/ultralytics/yolov5/issues/6012
            grid = model.model[-1].anchor_grid
            model.model[-1].anchor_grid = [a[..., :1, :1, :] for a in grid]
            export_onnx(model, im, file, 12, train, False, simplify)  # opset 12
            model.model[-1].anchor_grid = grid
        else:  # TensorRT >= 8
            check_version(trt.__version__, '8.0.0', hard=True)  # require tensorrt>=8.0.0
            export_onnx(model, im, file, 13, train, False, simplify)  # opset 13
        onnx = file.with_suffix('.onnx')

        LOGGER.info(f'\n{prefix} starting export with TensorRT {trt.__version__}...')
        assert onnx.exists(), f'failed to export ONNX file: {onnx}'
        f = file.with_suffix('.engine')  # TensorRT engine file
        logger = trt.Logger(trt.Logger.INFO)
        if verbose:
            logger.min_severity = trt.Logger.Severity.VERBOSE

        builder = trt.Builder(logger)
        config = builder.create_builder_config()
        config.max_workspace_size = workspace * 1 << 30
        # config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, workspace << 30)  # fix TRT 8.4 deprecation notice

        flag = (1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
        network = builder.create_network(flag)
        parser = trt.OnnxParser(network, logger)
        if not parser.parse_from_file(str(onnx)):
            raise RuntimeError(f'failed to load ONNX file: {onnx}')

        inputs = [network.get_input(i) for i in range(network.num_inputs)]
        outputs = [network.get_output(i) for i in range(network.num_outputs)]
        LOGGER.info(f'{prefix} Network Description:')
        for inp in inputs:
            LOGGER.info(f'{prefix}\tinput "{inp.name}" with shape {inp.shape} and dtype {inp.dtype}')
        for out in outputs:
            LOGGER.info(f'{prefix}\toutput "{out.name}" with shape {out.shape} and dtype {out.dtype}')

        LOGGER.info(f'{prefix} building FP{16 if builder.platform_has_fast_fp16 and half else 32} engine in {f}')
        if builder.platform_has_fast_fp16 and half:
            config.set_flag(trt.BuilderFlag.FP16)
        with builder.build_engine(network, config) as engine, open(f, 'wb') as t:
            t.write(engine.serialize())
        LOGGER.info(f'{prefix} export success, saved as {f} ({file_size(f):.1f} MB)')
        return f
    except Exception as e:
        LOGGER.info(f'\n{prefix} export failure: {e}')
class InferenceBackend:
    # initialize TensorRT
    TRT_LOGGER = trt.Logger(trt.Logger.ERROR)
    trt.init_libnvinfer_plugins(TRT_LOGGER, '')

    def __init__(self, model, batch_size):
        self.model = model
        self.batch_size = batch_size

        # load plugin if the model requires one
        if self.model.PLUGIN_PATH is not None:
            try:
                ctypes.cdll.LoadLibrary(self.model.PLUGIN_PATH)
            except OSError as err:
                raise RuntimeError('Plugin not found') from err

        # load trt engine or build one if not found
        if not self.model.ENGINE_PATH.exists():
            self.engine = self.model.build_engine(InferenceBackend.TRT_LOGGER,
                                                  self.batch_size)
        else:
            runtime = trt.Runtime(InferenceBackend.TRT_LOGGER)
            with open(self.model.ENGINE_PATH, 'rb') as engine_file:
                buf = engine_file.read()
                self.engine = runtime.deserialize_cuda_engine(buf)
        if self.engine is None:
            raise RuntimeError('Unable to load the engine file')
        if self.engine.has_implicit_batch_dimension:
            assert self.batch_size <= self.engine.max_batch_size

        # allocate buffers
        self.bindings = []
        self.outputs = []
        for binding in self.engine:
            shape = self.engine.get_binding_shape(binding)
            size = trt.volume(shape)
            if self.engine.has_implicit_batch_dimension:
                size *= self.batch_size
            dtype = trt.nptype(self.engine.get_binding_dtype(binding))
            # allocate host and device buffers
            host_mem = cuda.pagelocked_empty(size, dtype)
            device_mem = cuda.mem_alloc(host_mem.nbytes)
            # append the device buffer to device bindings
            self.bindings.append(int(device_mem))
            if self.engine.binding_is_input(binding):
                if not self.engine.has_implicit_batch_dimension:
                    assert self.batch_size == shape[0]
                self.input = HostDeviceMem(host_mem, device_mem)
            else:
                self.outputs.append(HostDeviceMem(host_mem, device_mem))

        self.context = self.engine.create_execution_context()
        self.stream = cuda.Stream()

    def infer(self):
        self.infer_async()
        return self.synchronize()

    def infer_async(self):
        cuda.memcpy_htod_async(self.input.device, self.input.host, self.stream)
        if self.engine.has_implicit_batch_dimension:
            self.context.execute_async(batch_size=self.batch_size,
                                       bindings=self.bindings,
                                       stream_handle=self.stream.handle)
        else:
            self.context.execute_async_v2(bindings=self.bindings,
                                          stream_handle=self.stream.handle)
        for out in self.outputs:
            cuda.memcpy_dtoh_async(out.host, out.device, self.stream)

    def synchronize(self):
        self.stream.synchronize()
        return [out.host for out in self.outputs]
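
# Hypothetical usage sketch; SomeModel stands in for a class that provides
# PLUGIN_PATH, ENGINE_PATH and build_engine() as InferenceBackend expects,
# and `preprocessed` is an input array prepared elsewhere.
backend = InferenceBackend(SomeModel(), batch_size=1)
np.copyto(backend.input.host, preprocessed.ravel())  # fill the input buffer
out_arrays = backend.infer()  # list of flat host output arrays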
import os
import tensorrt as trt
import logging
import uff
import numpy as np
import pycuda.driver as cuda
import pycuda.autoinit
import sys

logging.basicConfig(
    level=logging.DEBUG,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S")
logger = logging.getLogger(__name__)

TRT_LOGGER = trt.Logger(trt.Logger.ERROR)  # global trt logger setting


class TensorrtBuilder:

    @staticmethod
    def _item_to_list(item):
        if not isinstance(item, list):
            if item:
                item = [item]
        return item

    @staticmethod
    def _GiB(val):
        return val * 1 << 30

    @staticmethod
def main():
    """
    Launches text to speech (inference).
    Inference is executed on a single GPU.
    """
    parser = argparse.ArgumentParser(
        description='PyTorch Tacotron 2 Inference')
    parser = parse_args(parser)
    args, unknown_args = parser.parse_known_args()

    DLLogger.init(backends=[JSONStreamBackend(Verbosity.DEFAULT, args.log_file),
                            StdOutBackend(Verbosity.VERBOSE)])
    for k, v in vars(args).items():
        DLLogger.log(step="PARAMETER", data={k: v})
    DLLogger.log(step="PARAMETER", data={'model_name': 'Tacotron2_PyT'})

    measurements_all = {"pre_processing": [],
                        "tacotron2_encoder_time": [],
                        "tacotron2_decoder_time": [],
                        "tacotron2_postnet_time": [],
                        "tacotron2_latency": [],
                        "waveglow_latency": [],
                        "latency": [],
                        "type_conversion": [],
                        "data_transfer": [],
                        "storage": [],
                        "tacotron2_items_per_sec": [],
                        "waveglow_items_per_sec": [],
                        "num_mels_per_audio": [],
                        "throughput": []}

    print("args:", args, unknown_args)

    torch.cuda.init()

    TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
    encoder = load_engine(args.encoder, TRT_LOGGER)
    decoder_iter = load_engine(args.decoder, TRT_LOGGER)
    postnet = load_engine(args.postnet, TRT_LOGGER)
    waveglow = load_engine(args.waveglow, TRT_LOGGER)

    if args.waveglow_ckpt != "":
        # setup denoiser using WaveGlow PyTorch checkpoint
        waveglow_ckpt = load_and_setup_model('WaveGlow', parser,
                                             args.waveglow_ckpt,
                                             fp16_run=args.fp16,
                                             cpu_run=False,
                                             forward_is_infer=True)
        denoiser = Denoiser(waveglow_ckpt).cuda()
        # after initialization, we don't need WaveGlow PyTorch checkpoint
        # anymore - deleting
        del waveglow_ckpt
        torch.cuda.empty_cache()

    # create TRT contexts for each engine
    encoder_context = encoder.create_execution_context()
    decoder_context = decoder_iter.create_execution_context()
    postnet_context = postnet.create_execution_context()
    waveglow_context = waveglow.create_execution_context()

    texts = ["The forms of printed letters should be beautiful, and that their arrangement on the page should be reasonable and a help to the shapeliness of the letters themselves. The forms of printed letters should be beautiful, and that their arrangement on the page should be reasonable and a help to the shapeliness of the letters themselves."]
    texts = [texts[0][:args.input_length]]
    texts = texts * args.batch_size

    warmup_iters = 3

    for iter in range(args.num_iters):

        measurements = {}

        with MeasureTime(measurements, "pre_processing"):
            sequences_padded, input_lengths = prepare_input_sequence(texts)
            sequences_padded = sequences_padded.to(torch.int32)
            input_lengths = input_lengths.to(torch.int32)

        with torch.no_grad():
            with MeasureTime(measurements, "latency"):
                with MeasureTime(measurements, "tacotron2_latency"):
                    mel, mel_lengths = infer_tacotron2_trt(encoder, decoder_iter, postnet,
                                                           encoder_context, decoder_context, postnet_context,
                                                           sequences_padded, input_lengths,
                                                           measurements, args.fp16, True)

                with MeasureTime(measurements, "waveglow_latency"):
                    audios = infer_waveglow_trt(waveglow, waveglow_context,
                                                mel, measurements, args.fp16)

        num_mels = mel.size(0) * mel.size(2)
        num_samples = audios.size(0) * audios.size(1)

        with MeasureTime(measurements, "type_conversion"):
            audios = audios.float()

        with MeasureTime(measurements, "data_transfer"):
            audios = audios.cpu()

        with MeasureTime(measurements, "storage"):
            audios = audios.numpy()
            for i, audio in enumerate(audios):
                audio_path = "audio_" + str(i) + ".wav"
                write(audio_path, args.sampling_rate,
                      audio[:mel_lengths[i] * args.stft_hop_length])

        measurements['tacotron2_items_per_sec'] = num_mels / measurements['tacotron2_latency']
        measurements['waveglow_items_per_sec'] = num_samples / measurements['waveglow_latency']
        measurements['num_mels_per_audio'] = mel.size(2)
        measurements['throughput'] = num_samples / measurements['latency']

        if iter >= warmup_iters:
            for k, v in measurements.items():
                if k in measurements_all.keys():
                    measurements_all[k].append(v)
                    DLLogger.log(step=(iter - warmup_iters), data={k: v})

    DLLogger.flush()

    print_stats(measurements_all)
import os
import numpy as np
from time import time
from cuda import cudart
import tensorrt as trt

trtFile = "./model.plan"

nIn, cIn, hIn, wIn = 1, 1, 28, 28
np.random.seed(97)
data = np.random.rand(nIn, cIn, hIn, wIn).astype(np.float32) * 2 - 1

np.set_printoptions(precision=3, linewidth=200, suppress=True)
cudart.cudaDeviceSynchronize()

logger = trt.Logger(trt.Logger.INFO)
builder = trt.Builder(logger)
network = builder.create_network(
    1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
profile = builder.create_optimization_profile()
config = builder.create_builder_config()
config.max_workspace_size = 6 << 30
config.set_tactic_sources(1 << int(trt.TacticSource.CUBLAS)
                          | 1 << int(trt.TacticSource.CUBLAS_LT)
                          | 1 << int(trt.TacticSource.CUDNN))
#config.set_tactic_sources(1 << int(trt.TacticSource.CUBLAS) | 1 << int(trt.TacticSource.CUBLAS_LT))

inputTensor = network.add_input('inputT0', trt.float32, [-1, 1, 28, 28])
profile.set_shape(inputTensor.name, [1, cIn, hIn, wIn],
                  [nIn, cIn, hIn, wIn], [nIn * 2, cIn, hIn, wIn])
config.add_optimization_profile(profile)
def torch2trt(module,
              inputs,
              input_names=None,
              output_names=None,
              log_level=trt.Logger.ERROR,
              max_batch_size=1,
              fp16_mode=False,
              max_workspace_size=0,
              strict_type_constraints=False,
              keep_network=True,
              int8_mode=False,
              int8_calib_dataset=None,
              int8_calib_algorithm=DEFAULT_CALIBRATION_ALGORITHM):

    inputs_in = inputs

    # copy inputs to avoid modifications to source data
    inputs = [tensor.clone()[0:1] for tensor in inputs]  # only run single entry

    logger = trt.Logger(log_level)
    builder = trt.Builder(logger)
    network = builder.create_network()

    with ConversionContext(network) as ctx:
        if isinstance(inputs, list):
            inputs = tuple(inputs)
        if not isinstance(inputs, tuple):
            inputs = (inputs, )
        ctx.add_inputs(inputs, input_names)

        outputs = module(*inputs)

        if not isinstance(outputs, tuple) and not isinstance(outputs, list):
            outputs = (outputs, )
        ctx.mark_outputs(outputs, output_names)

    builder.max_workspace_size = max_workspace_size
    builder.fp16_mode = fp16_mode
    builder.max_batch_size = max_batch_size
    builder.strict_type_constraints = strict_type_constraints

    if int8_mode:
        # default to use input tensors for calibration
        if int8_calib_dataset is None:
            int8_calib_dataset = TensorBatchDataset(inputs_in)

        builder.int8_mode = True

        # @TODO(jwelsh): Should we set batch_size=max_batch_size?
        #                Need to investigate memory consumption
        builder.int8_calibrator = DatasetCalibrator(
            inputs,
            int8_calib_dataset,
            batch_size=1,
            algorithm=int8_calib_algorithm)

    engine = builder.build_cuda_engine(network)

    module_trt = TRTModule(engine, ctx.input_names, ctx.output_names)

    if keep_network:
        module_trt.network = network

    return module_trt
def timing_engine(engine_file_path, batch_size, num_input_channels, height,
                  width, timing_loops=100):
    logger = tensorrt.Logger(tensorrt.Logger.ERROR)

    with open(engine_file_path, 'rb') as fin, tensorrt.Runtime(logger) as runtime:
        engine = runtime.deserialize_cuda_engine(fin.read())
    assert engine is not None, 'deserialize engine failed!'
    assert batch_size <= engine.max_batch_size
    print('Engine info:')
    print('\tmax batch size: ', engine.max_batch_size)
    print('\tmax workspace_size: ', engine.max_workspace_size)
    print('\tdevice memory_size: ', engine.device_memory_size)

    inputs, outputs, bindings, stream = allocate_buffers(engine, batch_size)
    input_data = numpy.random.rand(batch_size, num_input_channels, height,
                                   width).astype(dtype=numpy.float32, order='C')
    inputs[0].host = input_data

    print('Start timing......')
    with engine.create_execution_context() as context:
        # warm up
        for i in range(10):
            [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
            context.execute_async(batch_size=batch_size, bindings=bindings,
                                  stream_handle=stream.handle)
            [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
            stream.synchronize()

        time_start = time.time()
        for i in range(timing_loops):
            [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
            context.execute_async(batch_size=batch_size, bindings=bindings,
                                  stream_handle=stream.handle)
            [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
            stream.synchronize()
        time_end = time.time()

        print('Total time elapsed: %.04f ms.\n%.04f ms for each image (%.02f FPS)\n%.04f ms for each batch'
              % ((time_end - time_start) * 1000,
                 (time_end - time_start) * 1000 / batch_size / timing_loops,
                 batch_size * timing_loops / (time_end - time_start),
                 (time_end - time_start) * 1000 / timing_loops))
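
# Illustrative call; the engine path and input dimensions are examples.
timing_engine('resnet50.engine', batch_size=8, num_input_channels=3,
              height=224, width=224, timing_loops=100)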
def loadEngine2TensorRT(filepath):
    G_LOGGER = trt.Logger(trt.Logger.WARNING)
    # Deserialize the engine
    with open(filepath, "rb") as f, trt.Runtime(G_LOGGER) as runtime:
        engine = runtime.deserialize_cuda_engine(f.read())
        return engine
def benchmark(
    width: int,
    height: int,
    iter: int = 5,
    use_cuda_graph: bool = False,
    logger: trt.Logger = trt.Logger(trt.Logger.VERBOSE)
) -> None:

    cuda_context = init_cuda()

    runtime = trt.Runtime(logger)

    with open(f"ffdnet_{width}_{height}.engine", "rb") as f:
        engine = runtime.deserialize_cuda_engine(f.read())

    execution_context = engine.create_execution_context()

    _bindings = _get_bindings(execution_context, engine.num_bindings)
    bindings = [binding.obj for binding in _bindings]

    stream = checkError(
        cuda.cuStreamCreate(cuda.CUstream_flags.CU_STREAM_NON_BLOCKING.value))
    stream = UniqueResource(stream, cuda.cuStreamDestroy, stream)

    start = checkError(
        cuda.cuEventCreate(cuda.CUevent_flags.CU_EVENT_DEFAULT.value))
    start = UniqueResource(start, cuda.cuEventDestroy, start)
    end = checkError(
        cuda.cuEventCreate(cuda.CUevent_flags.CU_EVENT_DEFAULT.value))
    end = UniqueResource(end, cuda.cuEventDestroy, end)

    def execute():
        execution_context.execute_async_v2(bindings, stream_handle=stream.obj)

    if use_cuda_graph:
        checkError(
            cuda.cuStreamBeginCapture(
                stream.obj,
                cuda.CUstreamCaptureMode.CU_STREAM_CAPTURE_MODE_RELAXED))
        execute()
        graph = checkError(cuda.cuStreamEndCapture(stream.obj))
        graphexec, error_node = checkError(
            cuda.cuGraphInstantiate(graph, logBuffer=b"", bufferSize=0))
        graphexec = UniqueResource(graphexec, cuda.cuGraphExecDestroy,
                                   graphexec)
        checkError(
            cuda.cuGraphDebugDotPrint(
                graph, b"ffdnet.dot",
                cuda.CUgraphDebugDot_flags.CU_GRAPH_DEBUG_DOT_FLAGS_VERBOSE.value))
        checkError(cuda.cuGraphDestroy(graph))

    for _ in range(iter):
        checkError(cuda.cuEventRecord(start.obj, stream.obj))

        if use_cuda_graph:
            checkError(cuda.cuGraphLaunch(graphexec.obj, stream.obj))
        else:
            execute()

        checkError(cuda.cuEventRecord(end.obj, stream.obj))
        checkError(cuda.cuEventSynchronize(end.obj))
        duration = checkError(cuda.cuEventElapsedTime(start.obj, end.obj))
        print(f"duration: {duration} ms")
def run(shape, scalar):
    testCase = "<shape=%s,scalar=%f>" % (shape, scalar)
    trtFile = "./model-Dim%s.plan" % str(len(shape))
    print("Test %s" % testCase)
    logger = trt.Logger(trt.Logger.ERROR)
    trt.init_libnvinfer_plugins(logger, '')
    ctypes.cdll.LoadLibrary(soFile)
    if os.path.isfile(trtFile):
        with open(trtFile, 'rb') as f:
            engineStr = f.read()
            engine = trt.Runtime(logger).deserialize_cuda_engine(engineStr)
        if engine == None:
            print("Failed loading engine!")
            exit()
        print("Succeeded loading engine!")
    else:
        builder = trt.Builder(logger)
        network = builder.create_network(
            1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
        profile = builder.create_optimization_profile()
        config = builder.create_builder_config()
        config.max_workspace_size = 6 << 30

        inputT0 = network.add_input('inputT0', trt.float32, [-1 for i in shape])
        profile.set_shape(inputT0.name, [1 for i in shape], [8 for i in shape],
                          [32 for i in shape])
        config.add_optimization_profile(profile)

        pluginLayer = network.add_plugin_v2([inputT0], getAddScalarPlugin(scalar))
        network.mark_output(pluginLayer.get_output(0))

        engineString = builder.build_serialized_network(network, config)
        if engineString == None:
            print("Failed building engine!")
            return
        print("Succeeded building engine!")
        with open(trtFile, 'wb') as f:
            f.write(engineString)
        engine = trt.Runtime(logger).deserialize_cuda_engine(engineString)

    context = engine.create_execution_context()
    context.set_binding_shape(0, shape)
    #print("Binding all? %s" % (["No", "Yes"][int(context.all_binding_shapes_specified)]))

    nInput = np.sum(
        [engine.binding_is_input(i) for i in range(engine.num_bindings)])
    nOutput = engine.num_bindings - nInput
    #for i in range(engine.num_bindings):
    #    print("Bind[%2d]:i[%d]->" % (i, i) if engine.binding_is_input(i) else "Bind[%2d]:o[%d]->" % (i, i - nInput),
    #          engine.get_binding_dtype(i), engine.get_binding_shape(i), context.get_binding_shape(i), engine.get_binding_name(i))

    bufferH = []
    bufferH.append(np.arange(np.prod(shape), dtype=np.float32).reshape(shape))
    for i in range(nOutput):
        bufferH.append(
            np.empty(context.get_binding_shape(nInput + i),
                     dtype=trt.nptype(engine.get_binding_dtype(nInput + i))))
    bufferD = []
    for i in range(engine.num_bindings):
        bufferD.append(cudart.cudaMalloc(bufferH[i].nbytes)[1])

    for i in range(nInput):
        cudart.cudaMemcpy(
            bufferD[i],
            np.ascontiguousarray(bufferH[i].reshape(-1)).ctypes.data,
            bufferH[i].nbytes, cudart.cudaMemcpyKind.cudaMemcpyHostToDevice)

    context.execute_v2(bufferD)

    for i in range(nOutput):
        cudart.cudaMemcpy(bufferH[nInput + i].ctypes.data, bufferD[nInput + i],
                          bufferH[nInput + i].nbytes,
                          cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost)

    outputCPU = addScalarCPU(bufferH[:nInput], scalar)
    '''
    for i in range(nInput):
        printArrayInfo(bufferH[i])
    for i in range(nOutput):
        printArrayInfo(bufferH[nInput+i])
    for i in range(nOutput):
        printArrayInfo(outputCPU[i])
    '''
    check(bufferH[nInput:][0], outputCPU[0], True)

    for buffer in bufferD:
        cudart.cudaFree(buffer)

    print("Test %s finish!\n" % testCase)
import os
import sys
import glob
import numpy as np
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit
#from PIL import ImageDraw
from data_processing import PreprocessYOLO, PostprocessYOLO
from utils import read_truths_args, multi_bbox_ious
import common
import time

t2 = time.time()

TRT_LOGGER = trt.Logger()

try:
    data_dir = os.environ['TESTDATADIR']
except KeyError:
    data_dir = '/tmp/dataset-nctu/clothes/clothes_test'


def get_engine(engine_file_path="clothes.trt"):
    print("Reading engine from file {}".format(engine_file_path))
    with open(engine_file_path, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
        return runtime.deserialize_cuda_engine(f.read())


input_HW = (416, 416)
output_shapes = [(1, 255, 13, 13), (1, 255, 26, 26), (1, 255, 52, 52)]
# tensorrt-lib

import os
import tensorrt as trt
import pycuda.autoinit
import pycuda.driver as cuda
from calibrator import Calibrator
from torch.autograd import Variable
import torch
import numpy as np
import time

# add verbose
TRT_LOGGER = trt.Logger(trt.Logger.VERBOSE)  # ** engine visualization **

# create tensorrt-engine
# fixed and dynamic
def get_engine(max_batch_size=1, onnx_file_path="", engine_file_path="", \
               fp16_mode=False, int8_mode=False, calibration_stream=None,
               calibration_table_path="", save_engine=False):
    """Attempts to load a serialized engine if available,
       otherwise builds a new TensorRT engine and saves it."""

    def build_engine(max_batch_size, save_engine):
        """Takes an ONNX file and creates a TensorRT engine to run inference with"""
        with trt.Builder(TRT_LOGGER) as builder, \
                builder.create_network(1) as network, \
                trt.OnnxParser(network, TRT_LOGGER) as parser:

            # parse onnx model file
            if not os.path.exists(onnx_file_path):
                quit('ONNX file {} not found'.format(onnx_file_path))
            print('Loading ONNX file from path {}...'.format(onnx_file_path))
            with open(onnx_file_path, 'rb') as model:
import tensorrt as trt
import uff

G_LOGGER = trt.Logger(trt.Logger.INFO)
trt.init_libnvinfer_plugins(G_LOGGER, '')

model_file = './mask_rcnn_nucleus_0080.uff'
output_nodes = ['mrcnn_detection', "mrcnn_mask/Sigmoid"]
trt_output_nodes = output_nodes
INPUT_NODE = "input_image"
INPUT_SIZE = [3, 1024, 1024]

with trt.Builder(G_LOGGER) as builder, builder.create_network() as network, trt.UffParser() as parser:
    parser.register_input(INPUT_NODE, INPUT_SIZE)
    parser.register_output(output_nodes[0])
    parser.register_output(output_nodes[1])
    parser.parse(model_file, network)
    builder.max_batch_size = 1
    builder.max_workspace_size = 1 << 28  # 256MiB
    engine = builder.build_cuda_engine(network)
    for binding in engine:
        print(engine.get_binding_shape(binding))
    with open("nucleus.engine", "wb") as f:
        f.write(engine.serialize())
def build_engine(onnx_path, cfg_file_path, model_name, category_num, do_int8,
                 dla_core, verbose=False):
    """Build a TensorRT engine from ONNX using the older API."""
    net_w, net_h = get_input_wh(cfg_file_path)

    print('Loading the ONNX file...')
    onnx_data = load_onnx(onnx_path)
    if onnx_data is None:
        return None

    TRT_LOGGER = trt.Logger(trt.Logger.VERBOSE) if verbose else trt.Logger()
    EXPLICIT_BATCH = [] if trt.__version__[0] < '7' else \
        [1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)]
    with trt.Builder(TRT_LOGGER) as builder, builder.create_network(
            *EXPLICIT_BATCH) as network, trt.OnnxParser(network, TRT_LOGGER) as parser:
        if do_int8 and not builder.platform_has_fast_int8:
            raise RuntimeError('INT8 not supported on this platform')
        if not parser.parse(onnx_data):
            print('ERROR: Failed to parse the ONNX file.')
            for error in range(parser.num_errors):
                print(parser.get_error(error))
            return None
        network = set_net_batch(network, MAX_BATCH_SIZE)

        print('Adding yolo_layer plugins...')
        network = add_yolo_plugins(network, cfg_file_path, model_name,
                                   category_num, TRT_LOGGER)

        print('Building an engine.  This would take a while...')
        print('(Use "--verbose" or "-v" to enable verbose logging.)')
        if trt.__version__[0] < '7':  # older API: build_cuda_engine()
            if dla_core >= 0:
                raise RuntimeError('DLA core not supported by old API')
            builder.max_batch_size = MAX_BATCH_SIZE
            builder.max_workspace_size = 1 << 30
            builder.fp16_mode = True  # alternative: builder.platform_has_fast_fp16
            if do_int8:
                from calibrator import YOLOEntropyCalibrator
                builder.int8_mode = True
                builder.int8_calibrator = YOLOEntropyCalibrator(
                    'calib_images', (net_h, net_w), 'calib_%s.bin' % model_name)
            engine = builder.build_cuda_engine(network)
        else:  # new API: build_engine() with builder config
            builder.max_batch_size = MAX_BATCH_SIZE
            config = builder.create_builder_config()
            config.max_workspace_size = 1 << 30
            config.set_flag(trt.BuilderFlag.GPU_FALLBACK)
            config.set_flag(trt.BuilderFlag.FP16)
            profile = builder.create_optimization_profile()
            profile.set_shape(
                '000_net',                          # input tensor name
                (MAX_BATCH_SIZE, 3, net_h, net_w),  # min shape
                (MAX_BATCH_SIZE, 3, net_h, net_w),  # opt shape
                (MAX_BATCH_SIZE, 3, net_h, net_w))  # max shape
            config.add_optimization_profile(profile)
            if do_int8:
                from calibrator import YOLOEntropyCalibrator
                config.set_flag(trt.BuilderFlag.INT8)
                config.int8_calibrator = YOLOEntropyCalibrator(
                    'calib_images', (net_h, net_w), 'calib_%s.bin' % model_name)
                config.set_calibration_profile(profile)
            if dla_core >= 0:
                config.default_device_type = trt.DeviceType.DLA
                config.DLA_core = dla_core
                config.set_flag(trt.BuilderFlag.STRICT_TYPES)
                print('Using DLA core %d.' % dla_core)
            engine = builder.build_engine(network, config)

        if engine is not None:
            print('Completed creating engine.')
        return engine
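
# Illustrative follow-up: build the engine and serialize it to disk.
# The file names and arguments here are examples only.
engine = build_engine('yolov4-416.onnx', 'yolov4-416.cfg', 'yolov4-416',
                      category_num=80, do_int8=False, dla_core=-1)
if engine is not None:
    with open('yolov4-416.trt', 'wb') as f:
        f.write(engine.serialize())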
import torch
import tensorrt as trt

from vgg16_397923af_trt import populate_network


def build_engine(weights):
    # The flag implies the input batch is explicit; the input shape is (N, C, H, W).
    flag = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
    with trt.Builder(TRT_LOGGER) as builder, builder.create_network(flag) as network:
        # Populate the network using weights from the PyTorch model.
        populate_network(network, weights)
        builder.max_workspace_size = 4 * 1 << 30
        config = builder.create_builder_config()
        return builder.build_engine(network, config)


vgg16_path = './vgg16-397923af.pth'
vgg16_weights = torch.load(vgg16_path, map_location='cpu')

TRT_LOGGER = trt.Logger(trt.Logger.VERBOSE)

# Do inference with TensorRT: build an engine, then serialize it to disk.
with build_engine(vgg16_weights) as engine:
    output_engine = 'vgg16-397923af_fp32.engine'
    print("===> Save %s\n" % output_engine)
    with open(output_engine, "wb") as f:
        f.write(engine.serialize())
def run():
    logger = trt.Logger(trt.Logger.ERROR)  # specify the Logger; available levels: VERBOSE, INFO, WARNING, ERROR, INTERNAL_ERROR
    if os.path.isfile(trtFile):  # if a .plan file exists, read it directly
        with open(trtFile, 'rb') as f:
            engineString = f.read()
        if engineString == None:
            print("Failed getting serialized engine!")
            return
        print("Succeeded getting serialized engine!")
        engine = trt.Runtime(logger).deserialize_cuda_engine(engineString)
        if engine == None:
            print("Failed building engine!")
            return
        print("Succeeded building engine!")
    else:  # no .plan file, build the engine from scratch
        builder = trt.Builder(logger)  # network meta information; Builder/Network/BuilderConfig/Profile related
        builder.max_batch_size = 3
        builder.max_workspace_size = 1 << 30
        network = builder.create_network()

        inputTensor = network.add_input('inputT0', trt.float32, [4, 5])  # declare the input tensor
        identityLayer = network.add_identity(inputTensor)  # identity transform
        network.mark_output(identityLayer.get_output(0))  # mark the output tensor

        engine = builder.build_cuda_engine(network)
        if engine == None:
            print("Failed building engine!")
            return
        print("Succeeded building engine!")
        with open(trtFile, 'wb') as f:  # serialize the network and save it as a .plan file
            f.write(engine.serialize())
            print("Succeeded saving .plan file!")

    context = engine.create_execution_context()  # create the context (similar to a process on the GPU)

    nInput = np.sum([engine.binding_is_input(i) for i in range(engine.num_bindings)])  # query engine binding info
    nOutput = engine.num_bindings - nInput
    for i in range(nInput):
        print("Bind[%2d]:i[%2d]->" % (i, i), engine.get_binding_dtype(i),
              engine.get_binding_shape(i), context.get_binding_shape(i),
              engine.get_binding_name(i))
    for i in range(nInput, nInput + nOutput):
        print("Bind[%2d]:o[%2d]->" % (i, i - nInput), engine.get_binding_dtype(i),
              engine.get_binding_shape(i), context.get_binding_shape(i),
              engine.get_binding_name(i))

    data = np.arange(3 * 4 * 5, dtype=np.float32).reshape(3, 4, 5)  # prepare data and host/device buffers
    bufferH = []
    bufferH.append(np.ascontiguousarray(data.reshape(-1)))
    for i in range(nInput, nInput + nOutput):
        bufferH.append(np.empty((3, ) + tuple(context.get_binding_shape(i)),
                                dtype=trt.nptype(engine.get_binding_dtype(i))))
    bufferD = []
    for i in range(nInput + nOutput):
        bufferD.append(cudart.cudaMalloc(bufferH[i].nbytes)[1])

    for i in range(nInput):  # first copy host data to the device
        cudart.cudaMemcpy(bufferD[i], bufferH[i].ctypes.data, bufferH[i].nbytes,
                          cudart.cudaMemcpyKind.cudaMemcpyHostToDevice)

    context.execute(3, bufferD)  # run inference

    for i in range(nInput, nInput + nOutput):  # copy results from the device back to the host
        cudart.cudaMemcpy(bufferH[i].ctypes.data, bufferD[i], bufferH[i].nbytes,
                          cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost)

    for i in range(nInput + nOutput):
        print(engine.get_binding_name(i))
        print(bufferH[i].reshape((3, ) + tuple(context.get_binding_shape(i))))

    for b in bufferD:  # free device memory
        cudart.cudaFree(b)