# Run the FCOS TensorRT engines listed below on one random half-precision
# CUDA tensor of shape (1, 3, 800, 1216): one untimed execution whose output
# is copied back to the host, followed by nRound back-to-back executions.
#
# The engine files must already exist on disk; they are produced with trtexec.
# Example invocation kept from the original script:
#   trtexec --verbose --onnx=resnet50.dynamic_shape.onnx \
#       --saveEngine=resnet50.dynamic_shape.trt \
#       --optShapes=input:1x3x1080x1920 \
#       --minShapes=input:1x3x1080x1920 \
#       --maxShapes=input:1x3x1080x1920
# NOTE(review): that command names resnet50 and a 1x3x1080x1920 input, which
# does not match the fcos101 engines / 1x3x800x1216 input used here -- adjust
# the model name and shapes before running it.
input_data = torch.randn(1, 3, 800, 1216, dtype=torch.float16, device='cuda')
nRound = 10

from trt_lite import TrtLite
import numpy as np
import os

for engine_file_path in ['fcos101.trt', 'fcos101_fp16.trt']:
    if not os.path.exists(engine_file_path):
        print('Engine file', engine_file_path,
              "doesn't exist. Please run trtexec and re-run this script.")
        exit(1)

    print('====', engine_file_path, '===')
    trt = TrtLite(engine_file_path=engine_file_path)
    trt.print_info()

    # Binding index -> concrete shape for this run.
    i2shape = {0: (1, 3, 800, 1216)}
    io_info = trt.get_io_info(i2shape)
    d_buffers = trt.allocate_io_buffers(i2shape, True)
    # Host-side placeholder shaped like binding 1; it is rebound to the real
    # result right after the first execution below.
    output_data_trt = np.zeros(io_info[1][2], dtype=np.float32)

    # Swap the allocated buffer for binding 0 with the random tensor itself,
    # so the pointer handed to the engine is the tensor's own data pointer.
    d_buffers[0] = input_data

    # One untimed run; its result (binding 1) is copied back to the host.
    trt.execute([t.data_ptr() for t in d_buffers], i2shape)
    output_data_trt = d_buffers[1].cpu().numpy()

    # Synchronize before starting the clock and again after the last launch so
    # the timed region brackets all nRound executions. Nothing in this block
    # reads t0 back yet.
    torch.cuda.synchronize()
    t0 = time.time()
    for _ in range(nRound):
        trt.execute([t.data_ptr() for t in d_buffers], i2shape)
    torch.cuda.synchronize()
def run_engine_dynamic(save_and_load=False):
    """Build the dynamic-shape engine, run it once on a 1x1x5x5 ramp, print the result.

    The input holds the values 0..24 as float32, laid out in shape
    (1, 1, 5, 5). The host output array is allocated with that same shape
    and filled from buffer 1 after execution.

    Args:
        save_and_load: When true, the freshly built engine is first written
            to "out.trt" and then re-created from that file before running.
    """
    input_shape = (1, 1, 5, 5)

    # Host arrays: a zeroed result buffer and an ascending ramp of equal size.
    output_data = np.zeros(input_shape, dtype=np.float32)
    input_data = np.arange(output_data.size, dtype=np.float32).reshape(input_shape)

    trt = TrtLite(build_engine_dynamic)
    if save_and_load:
        engine_path = "out.trt"
        trt.save_to_file(engine_path)
        trt = TrtLite(engine_file_path=engine_path)
    trt.print_info()

    # Binding index -> concrete shape for this invocation.
    i2shape = {0: input_shape}
    d_buffers = trt.allocate_io_buffers(i2shape, True)

    # Upload buffer 0, execute, download buffer 1.
    cuda.memcpy_htod(d_buffers[0], input_data)
    trt.execute(d_buffers, i2shape)
    cuda.memcpy_dtoh(output_data, d_buffers[1])

    print(output_data)
def run_engine_dynamic(input_data, *, engine_file_path="edvr.trt", n_round=10,
                       output_path='out.npz'):
    """Run a serialized engine on ``input_data``, time it, and save the outputs.

    The engine is loaded from ``engine_file_path`` (it is not built here; the
    original code hinted it was produced via ``TrtLite(build_engine_dynamic)``
    followed by ``save_to_file``). After one untimed execution the engine is
    run ``n_round`` more times and the mean time per run is printed. Every
    non-input buffer is then copied back to the host and stored in an ``.npz``
    archive keyed by the first field of its ``io_info`` entry.

    Args:
        input_data: Values assigned into host buffer 0; must be assignable
            into that buffer via ``h_buffers[0][:] = input_data``.
        engine_file_path: Path of the serialized engine to load.
        n_round: Number of timed executions; must be at least 1.
        output_path: Destination passed to ``np.savez``.

    Returns:
        None. Returns early, without running anything, when the engine
        reports no I/O info.

    Raises:
        ValueError: If ``n_round`` is smaller than 1.
    """
    # Validate first so a bad value fails before any engine is loaded and the
    # average below can never divide by zero.
    if n_round < 1:
        raise ValueError("n_round must be at least 1")

    trt = TrtLite(engine_file_path=engine_file_path)
    trt.print_info()

    io_info = trt.get_io_info({})
    if io_info is None:
        return
    print(io_info)

    # Matching host and device buffer lists, indexed like io_info.
    h_buffers = trt.allocate_io_buffers({}, False)
    d_buffers = trt.allocate_io_buffers({}, True)
    h_buffers[0][:] = input_data

    # info[1] is used as the "is input" flag: truthy entries are uploaded
    # here, falsy ones are downloaded after execution.
    for i, info in enumerate(io_info):
        if info[1]:
            cuda.memcpy_htod(d_buffers[i], h_buffers[i])

    # One untimed execution before the measured loop.
    trt.execute(d_buffers, {})

    # Synchronize on both sides of the loop so the interval covers all
    # launched work; perf_counter is a monotonic high-resolution clock.
    cuda.Context.synchronize()
    t0 = time.perf_counter()
    for _ in range(n_round):
        trt.execute(d_buffers, {})
    cuda.Context.synchronize()
    print('Prediction time: ', (time.perf_counter() - t0) / n_round)

    for i, info in enumerate(io_info):
        if not info[1]:
            cuda.memcpy_dtoh(h_buffers[i], d_buffers[i])

    # info[0] serves as the archive key for each output buffer.
    name2tensor = {
        info[0]: h_buffers[i]
        for i, info in enumerate(io_info)
        if not info[1]
    }
    np.savez(output_path, **name2tensor)
import pycuda.driver as cuda


class PyTorchTensorHolder(cuda.PointerHolderBase):
    """Expose a PyTorch tensor's data pointer through PyCUDA's pointer-holder API.

    The base class is referenced through the ``cuda`` alias imported above;
    the name ``pycuda`` itself is not bound by that import statement.
    """

    def __init__(self, tensor):
        super().__init__()
        # Storing the tensor keeps it referenced for as long as the holder
        # exists.
        self.tensor = tensor

    def get_pointer(self):
        """Return the address reported by the wrapped tensor's ``data_ptr()``."""
        return self.tensor.data_ptr()


tensorrt.init_libnvinfer_plugins(None, "")

# Run each panoptic_fcn engine that exists on disk once; missing files are
# reported and skipped.
for engine_file_path in ['panoptic_fcn.trt', 'panoptic_fcn_fp16.trt']:
    if not os.path.exists(engine_file_path):
        print('bad!')
    else:
        print('==' + engine_file_path + '==')
        trt = TrtLite(engine_file_path=engine_file_path)
        trt.print_info()

        # Binding index -> concrete shape; h and w come from outside this block.
        i2shape = {0: (1, 3, h, w)}
        io_info = trt.get_io_info(i2shape)
        d_buffers = trt.allocate_io_buffers(i2shape, True)

        # Host arrays sized from the shape field (index 2) of bindings 1-3.
        scores_out = np.zeros(io_info[1][2], dtype=np.float32)
        pred_inst_out = np.zeros(io_info[2][2], dtype=np.int32)
        classes_out = np.zeros(io_info[3][2], dtype=np.float32)

        # NOTE(review): the input upload is disabled, so the engine executes
        # on whatever allocate_io_buffers left in buffer 0. The original
        # (commented-out) copy was:
        #   cuda.memcpy_dtod(d_buffers[0], PyTorchTensorHolder(image),
        #                    image.nelement() * image.element_size())
        # TODO: restore it once `image` is available here.
        trt.execute(d_buffers, i2shape)