def predict(self, img_path: str, is_output_polygon=False, runtime='torch'):
    '''
    Run prediction on the given image. Takes an image path and reads it with
    OpenCV, which is relatively slow.
    :param img_path: path to the input image
    :param is_output_polygon: whether to return polygons instead of boxes
    :param runtime: one of ['trt', 'torch', 'both']
    :return:
    '''
    assert os.path.exists(img_path), 'file does not exist'
    img = cv2.imread(img_path, 1 if self.img_mode != 'GRAY' else 0)
    if self.img_mode == 'RGB':
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    h, w = img.shape[:2]
    # Resize the long side and pad the short side, which eases batched
    # inference and speeds things up
    # img = resize_image(img, self.imageW)
    # To get close to the performance of dynamic-shape input, keep two
    # engines and pick one dynamically based on the aspect ratio
    input_h, input_w = (self.imageW + self.imageH) * 4 // 9, (self.imageW + self.imageH) * 4 // 9
    self.engine = self.engine_1x1
    self.context = self.context_1x1
    if h / w > 1.2:
        input_h, input_w = self.imageH, self.imageW
        self.engine = self.engine_2x1
        self.context = self.context_2x1
    img = cv2.resize(img, (input_w, input_h))
    # img, useful_h, useful_w = resize_pad(img, long_size=short_size)
    # img = cv2.copyMakeBorder(img, 0, self.imageH - useful_h, 0, self.imageW - useful_w,
    #                          borderType=cv2.BORDER_CONSTANT, value=(255, 255, 255))

    # Convert the image from (h, w, c) to (1, c, h, w)
    tensor = self.transform(img)
    tensor = tensor.unsqueeze_(0)

    if runtime in ('torch', 'both'):  # the original `runtime == 'torch' or 'both'` was always truthy
        tensor = tensor.to(self.device)
        batch = {'shape': [(h, w)]}
        with torch.no_grad():
            if 'cuda' in str(self.device):
                torch.cuda.synchronize(self.device)
            start = time.time()
            torch_outputs = self.model(tensor)
            preds = torch_outputs
            # preds = torch_outputs[:, :, :useful_h, :useful_w]
            if 'cuda' in str(self.device):
                torch.cuda.synchronize(self.device)
            box_list, score_list = self.post_process(
                batch, preds, is_output_polygon=is_output_polygon)
            box_list, score_list = box_list[0], score_list[0]
            if len(box_list) > 0:
                if is_output_polygon:
                    idx = [x.sum() > 0 for x in box_list]
                    box_list = [box_list[i] for i, v in enumerate(idx) if v]
                    score_list = [score_list[i] for i, v in enumerate(idx) if v]
                else:
                    idx = box_list.reshape(box_list.shape[0], -1).sum(axis=1) > 0  # drop all-zero boxes
                    box_list, score_list = box_list[idx], score_list[idx]
            else:
                box_list, score_list = [], []
            t = time.time() - start

    if runtime in ('trt', 'both'):
        if 'cuda' in str(self.device):
            torch.cuda.synchronize(self.device)
        batch = {'shape': [(h, w)]}
        # Notice: here we only allocate device memory, for speed
        inputs, outputs, bindings, stream = common.allocate_buffers(self.engine)  # Speed test: cpu(0.976s) vs gpu(0.719s)
        # ==> Set the host input to the data.
        # common.do_inference will copy the input to the GPU before executing.
        inputs[0].host = tensor.cpu().numpy()  # for torch.Tensor
        # ==> Or set the device input directly; in that mode common.do_inference
        # should no longer copy inputs.host to inputs.device.
        # c_type_pointer = ctypes.c_void_p(int(inputs[0].device))
        # x.cpu().numpy().copy_to_external(c_type_pointer)
        start = time.time()
        trt_outputs = common.do_inference(self.context, bindings=bindings, inputs=inputs,
                                          outputs=outputs, stream=stream,
                                          batch_size=self.batch_size)
        preds = torch.as_tensor(trt_outputs[0], dtype=torch.float32, device=torch.device('cpu'))
        preds = preds.view(-1, 2, input_h, input_w)
        # preds = preds[:, :, :useful_h, :useful_w]
        box_list, score_list = self.post_process(
            batch, preds, is_output_polygon=is_output_polygon)
        t = time.time() - start
        box_list, score_list = box_list[0], score_list[0]
        if len(box_list) > 0:
            if is_output_polygon:
                idx = [x.sum() > 0 for x in box_list]
                box_list = [box_list[i] for i, v in enumerate(idx) if v]
                score_list = [score_list[i] for i, v in enumerate(idx) if v]
            else:
                idx = box_list.reshape(box_list.shape[0], -1).sum(axis=1) > 0  # drop all-zero boxes
                box_list, score_list = box_list[idx], score_list[idx]
        else:
            box_list, score_list = [], []

    if runtime == 'both':
        print("====================== Check output between tensorRT and torch =====================================")
        for i, name in enumerate(self.output_names):
            try:
                np.testing.assert_allclose(torch_outputs[i].cpu().detach().numpy().reshape(-1),
                                           trt_outputs[i], rtol=1e-03, atol=2e-04)
            except AssertionError as e:
                print("output {} mismatch {}".format(name, e))
                continue
            print("output {} match\n".format(name))

    if runtime not in ['trt', 'torch', 'both']:
        raise KeyError("runtime supports only ['torch', 'trt', 'both']!")
    return preds[0, 0, :, :].detach().cpu().numpy(), box_list, score_list, t
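# ---------------------------------------------------------------------------
# The deploy/common.py helpers used above (allocate_buffers / do_inference)
# are not shown in this file. Below is a minimal sketch of what they are
# assumed to look like, following the standard NVIDIA TensorRT Python sample
# pattern: page-locked host buffers plus device buffers, then async
# host-to-device copy, execute, device-to-host copy. Treat it as illustrative,
# not the repo's actual implementation; the `_sketch` names are hypothetical.

import pycuda.autoinit  # noqa: F401 -- creates a CUDA context on import
import pycuda.driver as cuda
import tensorrt as trt


class HostDeviceMem(object):
    def __init__(self, host_mem, device_mem):
        self.host = host_mem      # page-locked numpy buffer on the host
        self.device = device_mem  # raw device allocation


def allocate_buffers_sketch(engine):
    inputs, outputs, bindings = [], [], []
    stream = cuda.Stream()
    for binding in engine:  # iterating an engine yields binding names
        size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        bindings.append(int(device_mem))
        target = inputs if engine.binding_is_input(binding) else outputs
        target.append(HostDeviceMem(host_mem, device_mem))
    return inputs, outputs, bindings, stream


def do_inference_sketch(context, bindings, inputs, outputs, stream, batch_size=1):
    for inp in inputs:
        cuda.memcpy_htod_async(inp.device, inp.host, stream)  # host -> device
    context.execute_async(batch_size=batch_size, bindings=bindings,
                          stream_handle=stream.handle)
    for out in outputs:
        cuda.memcpy_dtoh_async(out.host, out.device, stream)  # device -> host
    stream.synchronize()  # wait for all async work on the stream
    return [out.host for out in outputs]
# ---------------------------------------------------------------------------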
def run_on_tensorrt():
    from deploy import common
    config_file = '../configs/solov2/solov2_light_448_r34_fpn_8gpu_3x.py'
    onnx_file = 'weights/SOLOv2_light_R34.onnx'
    input_names = ['input']
    # output_names = ['C0', 'C1', 'C2', 'C3', 'C4']
    # output_names = ['cate_pred_0', 'cate_pred_1', 'cate_pred_2', 'cate_pred_3', 'cate_pred_4',
    #                 'kernel_pred_0', 'kernel_pred_1', 'kernel_pred_2', 'kernel_pred_3', 'kernel_pred_4',
    #                 'seg_pred']  # Origin
    output_names = ['cate_pred', 'kernel_pred', 'seg_pred']  # after adding permute & concat
    if isinstance(config_file, str):
        cfg = mmcv.Config.fromfile(config_file)
    elif not isinstance(config_file, mmcv.Config):
        raise TypeError('config must be a filename or Config object, '
                        'but got {}'.format(type(config_file)))

    # 1. Preprocess
    # demo input img size 427x640 --> resized 448x671 --> padded 448x672
    img = 'images/demo.jpg'
    # build the data pipeline
    test_pipeline = [LoadImage()] + cfg.data.test.pipeline[1:]
    test_pipeline = Compose(test_pipeline)
    # prepare data
    data = dict(img=img)
    data = test_pipeline(data)

    # 2. Run inference on TensorRT
    print("Load onnx model from {}.".format(onnx_file))
    image_shape = data['img_meta'][0].data['pad_shape']
    input_shapes = ((1, image_shape[2], image_shape[0], image_shape[1]),)  # explicit shape
    # input_shapes = ((1, 3, 448, 448), (1, 3, 608, 608), (1, 3, 768, 768))  # dynamic shape
    # shape_matrix = [
    #     [1, 40, 40, 80],
    #     [1, 36, 36, 80],
    #     [1, 24, 24, 80],
    #     [1, 16, 16, 80],
    #     [1, 12, 12, 80],
    #     [1, 128, 40, 40],
    #     [1, 128, 36, 36],
    #     [1, 128, 24, 24],
    #     [1, 128, 16, 16],
    #     [1, 128, 12, 12],
    #     [1, 128, image_shape[0]//4, image_shape[1]//4]
    # ]
    shape_matrix = [
        [3872, 80],
        [3872, 128],
        [1, 128, image_shape[0] // 4, image_shape[1] // 4]
    ]
    with common.get_engine(onnx_file, onnx_file.replace(".onnx", ".engine"),
                           input_shapes=input_shapes, force_rebuild=False) \
            as engine, engine.create_execution_context() as context:
        # Notice: here we only allocate device memory, for speed
        # DYNAMIC shape
        # context.active_optimization_profile = 0
        # [context.set_binding_shape(x, tuple(y)) for x, y in enumerate(shape_matrix)]
        # inputs, outputs, bindings, stream = common.allocate_buffersV2(engine, context)
        # EXPLICIT shape
        inputs, outputs, bindings, stream = common.allocate_buffers(engine)  # Speed test: cpu(0.976s) vs gpu(0.719s)
        # ==> Set the host input to the data.
        # common.do_inference will copy the input to the GPU before executing.
        inputs[0].host = data['img'][0].unsqueeze(0).cpu().numpy()  # for torch.Tensor
        # ==> Or set the device input directly; in that mode common.do_inference
        # should no longer copy inputs.host to inputs.device.
        # c_type_pointer = ctypes.c_void_p(int(inputs[0].device))
        # x.cpu().numpy().copy_to_external(c_type_pointer)
        tic = cv2.getTickCount()
        trt_outputs = common.do_inferenceV2(context, bindings=bindings, inputs=inputs,
                                            outputs=outputs, stream=stream, batch_size=1,
                                            h_=image_shape[0], w_=image_shape[1])
        print('-----> tensorRT inference time: {}ms'.format(
            (cv2.getTickCount() - tic) * 1000 / cv2.getTickFrequency()))

    # 3. Get segmentation
    # Reuse the PyTorch head for seg-map decoding, matrix NMS, and post-processing
    from mmdet.models.anchor_heads.solov2_head import SOLOv2Head
    solov2_head = SOLOv2Head(num_classes=81,
                             in_channels=256,
                             num_grids=[40, 36, 24, 16, 12],
                             strides=[8, 8, 16, 32, 32],
                             ins_out_channels=128,
                             loss_ins=cfg.model.bbox_head.loss_ins,
                             loss_cate=cfg.model.bbox_head.loss_cate)
    # TODO: TensorRT output order differs from PyTorch?
    # Origin
    # ids = [8, 9, 7, 6, 5, 3, 4, 2, 1, 0, 10]
    # ids = [9, 8, 7, 5, 6, 4, 3, 2, 0, 1, 10]
    # After adding permute & concat
    ids = [1, 0, 2]
    cate_preds = [torch.from_numpy(trt_outputs[x]).reshape(y)
                  for x, y in zip(ids[:1], shape_matrix[:1])]
    kernel_preds = [torch.from_numpy(trt_outputs[x]).reshape(y)
                    for x, y in zip(ids[1:2], shape_matrix[1:2])]
    seg_pred = torch.from_numpy(trt_outputs[2]).reshape(shape_matrix[2])
    result = solov2_head.get_seg(cate_preds, kernel_preds, seg_pred,
                                 [data['img_meta'][0].data], cfg.test_cfg, rescale=True)
    show_result_ins(img, result, get_classes('coco'), score_thr=0.25,
                    out_file="images/demo_out_trt_solov2.jpg")
    print('Script done!')
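# The TODO above questions the TensorRT output order. Binding order follows
# the built engine, not the output_names list passed at ONNX export time, so
# the safest way to derive `ids` is to inspect the bindings by name. A small
# sketch (the function name is hypothetical; the binding-index APIs are the
# TensorRT 7.x Python API assumed by the rest of this repo):

def print_bindings(engine):
    # Print index, direction, name, and shape of every engine binding so the
    # TRT-output-to-PyTorch-output mapping can be read off directly.
    for i in range(engine.num_bindings):
        kind = 'input' if engine.binding_is_input(i) else 'output'
        print(i, kind, engine.get_binding_name(i), engine.get_binding_shape(i))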
def check(model, device, input_tensor, name, check_onnx=True, check_trt=True):
    '''
    Check the converted onnx model on onnxruntime and tensorrt
    '''
    x = input_tensor
    x = x.to(device)  # Tensor.to() is not in-place; the original `x.to(device)` discarded the result

    # ======================= Run pytorch model =========================================
    tic = time.time()
    with torch.no_grad():
        torch_output = model(x)
        # print(torch_output.shape)
        torch_output = to_list([torch_output])
    print(time.time() - tic)
    print(name)
    # print(model)
    # torch.save(model.state_dict(), 'weights/{}.pth'.format(name))

    # ======================= Run onnx on onnxruntime =========================================
    # export onnx model
    convert2onnx(model, x, 'weights/{}.onnx'.format(name))
    if check_onnx:
        print("Load onnx model from {}.".format('weights/{}.onnx'.format(name)))
        import onnxruntime as rt
        sess = rt.InferenceSession('weights/{}.onnx'.format(name))
        # check input and output
        for in_blob in sess.get_inputs():
            if in_blob.name not in input_names:
                print("Input blob name does not match that in the model")
            else:
                print("Input {}, shape {} and type {}".format(in_blob.name, in_blob.shape, in_blob.type))
        for out_blob in sess.get_outputs():
            if out_blob.name not in output_names:
                print("Output blob name does not match that in the model")
            else:
                print("Output {}, shape {} and type {}".format(out_blob.name, out_blob.shape, out_blob.type))
        onnx_output = sess.run(output_names, {input_names[0]: x.cpu().numpy()})
        print("========================== Check output between onnxruntime and torch! ================================")
        for i, out in enumerate(onnx_output):
            try:
                np.testing.assert_allclose(torch_output[i].cpu().detach().numpy(), out,
                                           rtol=1e-03, atol=2e-04)
            except AssertionError as e:
                print("output {} mismatch {}".format(output_names[i], e))
                continue
            print("output {} match\n".format(output_names[i]))

    # ======================= Run onnx on tensorrt backend =========================================
    if check_trt:
        # use API V2 here
        print("Load onnx model from {}.".format('weights/{}.onnx'.format(name)))
        with get_engine('weights/{}.onnx'.format(name), 'weights/{}.engine'.format(name)) as engine, \
                engine.create_execution_context() as context:
            # Notice: here we only allocate device memory, for speed
            inputs, outputs, bindings, stream = common.allocate_buffers(engine)  # Speed test: cpu(0.976s) vs gpu(0.719s)
            # ==> Set the host input to the data.
            # common.do_inference will copy the input to the GPU before executing.
            inputs[0].host = x.cpu().numpy()  # for torch.Tensor
            # ==> Or set the device input directly; in that mode common.do_inference
            # should no longer copy inputs.host to inputs.device.
            # c_type_pointer = ctypes.c_void_p(int(inputs[0].device))
            # x.cpu().numpy().copy_to_external(c_type_pointer)
            trt_outputs = common.do_inferenceV2(context, bindings=bindings, inputs=inputs,
                                                outputs=outputs, stream=stream,
                                                batch_size=1, h_=224, w_=224)
            print("====================== Check output between tensorRT and torch =====================================")
            for i, trt_output in enumerate(trt_outputs):
                try:
                    np.testing.assert_allclose(torch_output[i].cpu().detach().numpy().reshape(-1),
                                               trt_output, rtol=1e-03, atol=2e-04)
                except AssertionError as e:
                    print("output {} mismatch {}".format(output_names[i], e))
                    continue
                print("output {} match\n".format(output_names[i]))
    print("script done")
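# convert2onnx() called above is defined elsewhere in this repo. A minimal
# sketch of what it is assumed to wrap, a plain torch.onnx.export using the
# module-level input_names/output_names, is given below; the `_sketch` name
# is hypothetical and the real helper may differ:

def convert2onnx_sketch(model, dummy_input, onnx_path, opset_version=11):
    # Export in eval mode so dropout/batchnorm behave deterministically.
    model.eval()
    torch.onnx.export(model, dummy_input, onnx_path,
                      input_names=input_names, output_names=output_names,
                      opset_version=opset_version, verbose=False)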
torch_model = TestOps()
torch_out = torch.onnx._export(torch_model, image, 'weights/test.onnx',
                               verbose=True, opset_version=11)

from deploy import common
with common.get_engine('weights/test.onnx', 'weights/test.engine',
                       input_shapes=((1, 128, 40, 40),), force_rebuild=True) \
        as engine, engine.create_execution_context() as context:
    # DYNAMIC shape
    # context.active_optimization_profile = 0
    # [context.set_binding_shape(x, tuple(y)) for x, y in enumerate(shape_matrix)]
    # inputs, outputs, bindings, stream = common.allocate_buffersV2(engine, context)
    # EXPLICIT shape
    inputs, outputs, bindings, stream = common.allocate_buffers(engine)
    # common.do_inference will copy the input to the GPU before executing.
    inputs[0].host = image.cpu().numpy()  # for torch.Tensor
    trt_outputs = common.do_inferenceV2(context, bindings=bindings, inputs=inputs,
                                        outputs=outputs, stream=stream,
                                        batch_size=1, h_=40, w_=40, c=128)
print('Done!')
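# The commented-out DYNAMIC-shape branch above only works if the engine was
# built with an optimization profile. A sketch of that build-time setup,
# which common.get_engine presumably does internally when input_shapes holds
# several shapes (TensorRT 7.x API; the function name and the min/opt/max
# values are illustrative):

def add_dynamic_profile_sketch(builder, config, input_name='input'):
    # One profile covering the min / opt / max input shapes used elsewhere
    # in this repo; the context must then select it at inference time via
    # active_optimization_profile and set_binding_shape.
    profile = builder.create_optimization_profile()
    profile.set_shape(input_name,
                      (1, 3, 448, 448),   # min
                      (1, 3, 608, 608),   # opt
                      (1, 3, 768, 768))   # max
    config.add_optimization_profile(profile)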
def check(args, dummy_input, check_onnx=True, check_trt=True):
    '''
    Check the converted onnx model on onnxruntime and tensorrt
    '''
    # ======================= Run pytorch model =========================================
    # build the model from a config file and a checkpoint file
    model = init_detector(args.config, args.checkpoint, device='cuda:0')
    if hasattr(model, 'forward_dummy'):
        # model.forward = model.extract_feat
        model.forward = model.forward_dummy
    else:
        raise NotImplementedError(
            'ONNX exporting is not currently supported with {}'.format(
                model.__class__.__name__))
    with torch.no_grad():
        torch_output = model(dummy_input)
        torch_output = to_list(torch_output)

    # ======================= Run onnx on onnxruntime =========================================
    if check_onnx:
        import onnxruntime as rt
        print("Load onnx model from {}.".format(args.out))
        sess = rt.InferenceSession(args.out)
        # check input and output
        for in_blob in sess.get_inputs():
            if in_blob.name not in input_names:
                print("Input blob name does not match that in the model")
            else:
                print("Input {}, shape {} and type {}".format(in_blob.name, in_blob.shape, in_blob.type))
        for out_blob in sess.get_outputs():
            if out_blob.name not in output_names:
                print("Output blob name does not match that in the model")
            else:
                print("Output {}, shape {} and type {}".format(out_blob.name, out_blob.shape, out_blob.type))
        onnx_output = sess.run(output_names, {input_names[0]: dummy_input.cpu().numpy()})
        print("onnxruntime")
        for i, out in enumerate(onnx_output):
            try:
                np.testing.assert_allclose(torch_output[i].cpu().detach().numpy(), out,
                                           rtol=1e-03, atol=2e-04)
            except AssertionError as e:
                print("output {} mismatch {}".format(output_names[i], e))
                continue
            print("output {} match\n".format(output_names[i]))

    # ======================= Run onnx on tensorrt =========================================
    if check_trt:
        input_shapes = ((1, 3, args.shape[0], args.shape[1]),)  # explicit shape
        # input_shapes = ((1, 3, 448, 448), (1, 3, 608, 608), (1, 3, 768, 768))  # dynamic shape
        # shape_matrix = [
        #     [1, 3, args.shape[0], args.shape[1]],
        #     [1, 40, 40, 80],
        #     [1, 36, 36, 80],
        #     [1, 24, 24, 80],
        #     [1, 16, 16, 80],
        #     [1, 12, 12, 80],
        #     [1, 128, 40, 40],
        #     [1, 128, 36, 36],
        #     [1, 128, 24, 24],
        #     [1, 128, 16, 16],
        #     [1, 128, 12, 12],
        #     [1, 128, args.shape[0] // 4, args.shape[1] // 4]
        # ]
        # shape_matrix = [
        #     [1, 3, args.shape[0], args.shape[1]],
        #     [3872, 80],
        #     [3872, 128],
        #     [1, 128, args.shape[0] // 4, args.shape[1] // 4]
        # ]
        with common.get_engine(args.out, args.out.replace(".onnx", ".engine"),
                               input_shapes=input_shapes, force_rebuild=False) \
                as engine, engine.create_execution_context() as context:
            # Notice: here we only allocate device memory, for speed
            # DYNAMIC shape
            # context.active_optimization_profile = 0
            # [context.set_binding_shape(x, tuple(y)) for x, y in enumerate(shape_matrix)]
            # inputs, outputs, bindings, stream = common.allocate_buffersV2(engine, context)
            # EXPLICIT shape
            inputs, outputs, bindings, stream = common.allocate_buffers(engine)
            # common.do_inference will copy the input to the GPU before executing.
            inputs[0].host = dummy_input.cpu().numpy()  # for torch.Tensor
            # ==> Or set the device input directly; in that mode common.do_inference
            # should no longer copy inputs.host to inputs.device.
            # c_type_pointer = ctypes.c_void_p(int(inputs[0].device))
            # x.cpu().numpy().copy_to_external(c_type_pointer)
            trt_outputs = common.do_inferenceV2(context, bindings=bindings, inputs=inputs,
                                                outputs=outputs, stream=stream, batch_size=1,
                                                h_=args.shape[0], w_=args.shape[1])
            print("tensorrt")
            # TODO: TensorRT output order differs from PyTorch?
            # Origin
            # ids = [8, 9, 7, 6, 5, 3, 4, 2, 1, 0, 10]
            # After adding permute & concat
            ids = [1, 0, 2]
            for i, (trt_output, id) in enumerate(zip(trt_outputs, ids)):
                try:
                    np.testing.assert_allclose(torch_output[id].cpu().detach().numpy().reshape(-1),
                                               trt_output, rtol=1e-03, atol=2e-04)
                except AssertionError as e:
                    print("output {} mismatch {}".format(output_names[id], e))
                    continue
                print("output {} match\n".format(output_names[id]))
    print("script done")
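# A usage sketch for check() above. The attribute names are inferred from the
# body (args.config, args.checkpoint, args.out, args.shape); all values below
# are illustrative placeholders, not the repo's real defaults, and the
# checkpoint path in particular is hypothetical.
if __name__ == '__main__':
    from argparse import Namespace
    args = Namespace(config='../configs/solov2/solov2_light_448_r34_fpn_8gpu_3x.py',
                     checkpoint='weights/SOLOv2_light_R34.pth',  # hypothetical path
                     out='weights/SOLOv2_light_R34.onnx',
                     shape=[448, 672])
    dummy_input = torch.randn(1, 3, args.shape[0], args.shape[1], device='cuda:0')
    check(args, dummy_input, check_onnx=True, check_trt=True)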