Example #1
    def predict(self, img_path: str, is_output_polygon=False, runtime='torch'):
        '''
        Run prediction on an image given by its file path. The image is read
        with OpenCV, which is relatively slow.
        :param img_path: path to the image file
        :param is_output_polygon: whether to return polygons instead of boxes
        :param runtime: one of ['torch', 'trt', 'both']
        :return: prediction map, box list, score list, elapsed time
        '''
        assert os.path.exists(img_path), 'file does not exist'
        img = cv2.imread(img_path, 1 if self.img_mode != 'GRAY' else 0)
        if self.img_mode == 'RGB':
            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        h, w = img.shape[:2]

        # Resize along the long side and pad the short side, which makes batched inference easier and faster.
        # img = resize_image(img, self.imageW)
        # To approximate the performance of dynamic-size inputs as closely as possible, two engines are kept here and one is selected per image.
        input_h = input_w = (self.imageW + self.imageH) * 4 // 9
        self.engine = self.engine_1x1
        self.context = self.context_1x1
        if h / w > 1.2:
            input_h, input_w = self.imageH, self.imageW
            self.engine = self.engine_2x1
            self.context = self.context_2x1

        img = cv2.resize(img, (input_w, input_h))
        # img, useful_h, useful_w = resize_pad(img, long_size=short_size)
        # img = cv2.copyMakeBorder(img, 0, self.imageH - useful_h, 0, self.imageW - useful_w, borderType=cv2.BORDER_CONSTANT, value=(255, 255, 255))
        # Convert the image from (h, w, c) to (1, c, h, w)
        tensor = self.transform(img)
        tensor = tensor.unsqueeze_(0)

        if runtime in ('torch', 'both'):
            tensor = tensor.to(self.device)
            batch = {'shape': [(h, w)]}
            with torch.no_grad():
                if 'cuda' in str(self.device):
                    torch.cuda.synchronize(self.device)
                start = time.time()
                torch_outputs = self.model(tensor)
                preds = torch_outputs
                # preds = torch_outputs[:, :, :useful_h, :useful_w]

                if 'cuda' in str(self.device):
                    torch.cuda.synchronize(self.device)
                box_list, score_list = self.post_process(
                    batch, preds, is_output_polygon=is_output_polygon)
                box_list, score_list = box_list[0], score_list[0]
                if len(box_list) > 0:
                    if is_output_polygon:
                        idx = [x.sum() > 0 for x in box_list]
                        box_list = [
                            box_list[i] for i, v in enumerate(idx) if v
                        ]
                        score_list = [
                            score_list[i] for i, v in enumerate(idx) if v
                        ]
                    else:
                        idx = box_list.reshape(box_list.shape[0],
                                               -1).sum(axis=1) > 0  # drop all-zero boxes
                        box_list, score_list = box_list[idx], score_list[idx]
                else:
                    box_list, score_list = [], []
                t = time.time() - start

        if runtime == 'trt' or runtime == 'both':
            if 'cuda' in str(self.device):
                torch.cuda.synchronize(self.device)
            batch = {'shape': [(h, w)]}
            # Notice: Here we only allocate device memory for speedup
            inputs, outputs, bindings, stream = common.allocate_buffers(
                self.engine)

            # Speed test: cpu(0.976s) vs gpu(0.719s)
            # ==> Set host input to the data.
            # The common.do_inference function will copy the input to the GPU before executing.
            inputs[0].host = tensor.cpu().numpy()  # for torch.Tensor
            # ==> Or set device input to the data.
            # in this mode, common.do_inference function should not copy inputs.host to inputs.device anymore.
            # c_type_pointer = ctypes.c_void_p(int(inputs[0].device))
            # x.cpu().numpy().copy_to_external(c_type_pointer)
            start = time.time()
            trt_outputs = common.do_inference(self.context,
                                              bindings=bindings,
                                              inputs=inputs,
                                              outputs=outputs,
                                              stream=stream,
                                              batch_size=self.batch_size)
            preds = torch.as_tensor(trt_outputs[0],
                                    dtype=torch.float32,
                                    device=torch.device('cpu'))
            preds = preds.view(-1, 2, input_h, input_w)
            # preds = preds[:, :, :useful_h, :useful_w]
            box_list, score_list = self.post_process(
                batch, preds, is_output_polygon=is_output_polygon)
            t = time.time() - start
            box_list, score_list = box_list[0], score_list[0]
            if len(box_list) > 0:
                if is_output_polygon:
                    idx = [x.sum() > 0 for x in box_list]
                    box_list = [box_list[i] for i, v in enumerate(idx) if v]
                    score_list = [
                        score_list[i] for i, v in enumerate(idx) if v
                    ]
                else:
                    idx = box_list.reshape(box_list.shape[0],
                                           -1).sum(axis=1) > 0  # drop all-zero boxes
                    box_list, score_list = box_list[idx], score_list[idx]
            else:
                box_list, score_list = [], []
        if runtime == 'both':
            print(
                "====================== Check output between tensorRT and torch ====================================="
            )
            for i, name in enumerate(self.output_names):
                try:
                    np.testing.assert_allclose(
                        torch_outputs[i].cpu().detach().numpy().reshape(-1),
                        trt_outputs[i],
                        rtol=1e-03,
                        atol=2e-04)
                except AssertionError as e:
                    print("output {} mismatch {}".format(
                        self.output_names[i], e))
                    continue
                print("output {} match\n".format(self.output_names[i]))

        if runtime not in ['trt', 'torch', 'both']:
            raise KeyError("runtime must be one of ['torch', 'trt', 'both']")

        return preds[0, 0, :, :].detach().cpu().numpy(), box_list, score_list, t
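
Example #1 relies on helpers from deploy.common (allocate_buffers, do_inference) that are not shown. The following is a minimal sketch of what such helpers usually look like, modelled on NVIDIA's TensorRT Python sample common.py; it assumes pycuda and the TensorRT 6/7-era implicit-batch API, and the HostDeviceMem name and exact signatures are assumptions rather than the repository's actual code.

import pycuda.autoinit  # noqa: F401  creates a CUDA context on import
import pycuda.driver as cuda
import tensorrt as trt


class HostDeviceMem:
    """Pairs a page-locked host buffer with its device buffer."""

    def __init__(self, host_mem, device_mem):
        self.host = host_mem
        self.device = device_mem


def allocate_buffers(engine):
    """Allocate a host/device buffer pair for every binding of the engine."""
    inputs, outputs, bindings = [], [], []
    stream = cuda.Stream()
    for binding in engine:  # iterates over binding names
        size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        bindings.append(int(device_mem))
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
    return inputs, outputs, bindings, stream


def do_inference(context, bindings, inputs, outputs, stream, batch_size=1):
    """Copy inputs host->device, execute the engine, copy outputs device->host."""
    for inp in inputs:
        cuda.memcpy_htod_async(inp.device, inp.host, stream)
    context.execute_async(batch_size=batch_size, bindings=bindings,
                          stream_handle=stream.handle)
    for out in outputs:
        cuda.memcpy_dtoh_async(out.host, out.device, stream)
    stream.synchronize()
    # Outputs come back as flat numpy arrays, hence the preds.view(...) reshape above.
    return [out.host for out in outputs]
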
Example #2
def run_on_tensorrt():
    from deploy import common

    config_file = '../configs/solov2/solov2_light_448_r34_fpn_8gpu_3x.py'
    onnx_file = 'weights/SOLOv2_light_R34.onnx'
    input_names = ['input']
    # output_names = ['C0', 'C1', 'C2', 'C3', 'C4']
    # output_names = ['cate_pred_0', 'cate_pred_1', 'cate_pred_2', 'cate_pred_3', 'cate_pred_4',
    #                 'kernel_pred_0', 'kernel_pred_1', 'kernel_pred_2', 'kernel_pred_3', 'kernel_pred_4',
    #                 'seg_pred']  # Origin
    output_names = ['cate_pred', 'kernel_pred', 'seg_pred']  # with permute & concat added
    if isinstance(config_file, str):
        cfg = mmcv.Config.fromfile(config_file)
    elif not isinstance(config_file, mmcv.Config):
        raise TypeError('config must be a filename or Config object, '
                        'but got {}'.format(type(config_file)))

    # 1. Preprocess
    # input demo img size 427x640 --> resized 448x671 --> pad 448x672
    img = 'images/demo.jpg'
    # build the data pipeline
    test_pipeline = [LoadImage()] + cfg.data.test.pipeline[1:]
    test_pipeline = Compose(test_pipeline)
    # prepare data
    data = dict(img=img)
    data = test_pipeline(data)

    # 2. Run inference on trt
    print("Load onnx model from {}.".format(onnx_file))
    image_shape = data['img_meta'][0].data['pad_shape']
    input_shapes = ((1, image_shape[2], image_shape[0], image_shape[1]),)  # explicit shape
    # input_shapes = ((1, 3, 448, 448), (1, 3, 608, 608), (1, 3, 768, 768))  # dynamic shape
    # shape_matrix = [
    # [1, 40, 40, 80],
    # [1, 36, 36, 80],
    # [1, 24, 24, 80],
    # [1, 16, 16, 80],
    # [1, 12, 12, 80],
    # [1, 128, 40, 40],
    # [1, 128, 36, 36],
    # [1, 128, 24, 24],
    # [1, 128, 16, 16],
    # [1, 128, 12, 12],
    # [1, 128, image_shape[0]//4, image_shape[1]//4]
    # ]
    shape_matrix = [
        [3872, 80],
        [3872, 128],
        [1, 128, image_shape[0] // 4, image_shape[1] // 4]
    ]
    with common.get_engine(onnx_file, onnx_file.replace(".onnx", ".engine"),
                           input_shapes=input_shapes, force_rebuild=False) \
            as engine, engine.create_execution_context() as context:
        # Notice: Here we only allocate device memory for speedup
        # DYNAMIC shape
        # context.active_optimization_profile = 0
        # [context.set_binding_shape(x, tuple(y)) for x, y in enumerate(shape_matrix)]
        # inputs, outputs, bindings, stream = common.allocate_buffersV2(engine, context)
        # EXPLICIT shape
        inputs, outputs, bindings, stream = common.allocate_buffers(engine)

        # Speed test: cpu(0.976s) vs gpu(0.719s)
        # ==> Set host input to the data.
        # The common.do_inference function will copy the input to the GPU before executing.
        inputs[0].host = data['img'][0].unsqueeze(0).cpu().numpy()  # for torch.Tensor
        # ==> Or set device input to the data.
        # in this mode, common.do_inference function should not copy inputs.host to inputs.device anymore.
        # c_type_pointer = ctypes.c_void_p(int(inputs[0].device))
        # x.cpu().numpy().copy_to_external(c_type_pointer)
        tic = cv2.getTickCount()
        trt_outputs = common.do_inferenceV2(context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream,
                                            batch_size=1, h_=image_shape[0], w_=image_shape[1])
        print('-----> tensorRT inference time: {}ms'.format((cv2.getTickCount() - tic) * 1000 / cv2.getTickFrequency()))

    # 3. Get seg
    # Reuse the PyTorch head to build the segmentation maps, run matrix NMS, and do the post-processing
    from mmdet.models.anchor_heads.solov2_head import SOLOv2Head
    solov2_head = SOLOv2Head(num_classes=81,
                             in_channels=256,
                             num_grids=[40, 36, 24, 16, 12],
                             strides=[8, 8, 16, 32, 32],
                             ins_out_channels=128,
                             loss_ins=cfg.model.bbox_head.loss_ins,
                             loss_cate=cfg.model.bbox_head.loss_cate)
    # TODO: tensorrt output order is different from pytorch?
    # Origin
    # ids = [8, 9, 7, 6, 5, 3, 4, 2, 1, 0, 10]
    # ids = [9, 8, 7, 5, 6, 4, 3, 2, 0, 1, 10]  # TODO: tensorrt output order is different from pytorch?
    # Add permute & concate
    ids = [1, 0, 2]

    cate_preds = [torch.from_numpy(trt_outputs[x]).reshape(y) for x, y in zip(ids[:1], shape_matrix[:1])]
    kernel_preds = [torch.from_numpy(trt_outputs[x]).reshape(y) for x, y in zip(ids[1:2], shape_matrix[1:2])]
    seg_pred = torch.from_numpy(trt_outputs[2]).reshape(shape_matrix[2])
    result = solov2_head.get_seg(cate_preds, kernel_preds, seg_pred, [data['img_meta'][0].data], cfg.test_cfg, rescale=True)
    show_result_ins(img, result, get_classes('coco'), score_thr=0.25, out_file="images/demo_out_trt_solov2.jpg")
    print('Script done!')
def check(model, device, input_tensor, name, check_onnx=True, check_trt=True):
    ''' Check the Converted onnx model on onnxruntime and tensorrt '''
    x = input_tensor
    x = x.to(device)
    # ======================= Run pytorch model =========================================
    tic = time.time()
    with torch.no_grad():
        torch_output = model(x)
        # print(torch_output.shape)
        torch_output = to_list([torch_output])
    print(time.time() - tic)
    print(name)
    # print(model)
    # torch.save(model.state_dict(), 'weights/{}.pth'.format(name))

    # ======================= Run onnx on onnxruntime =========================================
    # export onnx model
    convert2onnx(model, x, 'weights/{}.onnx'.format(name))
    if check_onnx:
        print("Load onnx model from {}.".format('weights/{}.onnx'.format(name)))
        import onnxruntime as rt
        sess = rt.InferenceSession('weights/{}.onnx'.format(name))

        # check input and output
        for in_blob in sess.get_inputs():
            if in_blob.name not in input_names:
                print("Input blob name does not match that in the model")
            else:
                print("Input {}, shape {} and type {}".format(in_blob.name, in_blob.shape, in_blob.type))
        for out_blob in sess.get_outputs():
            if out_blob.name not in output_names:
                print("Output blob name does not match that in the model")
            else:
                print("Output {}, shape {} and type {}".format(out_blob.name, out_blob.shape, out_blob.type))

        onnx_output = sess.run(output_names, {input_names[0]: x.cpu().numpy()})

        print("========================== Check output between onnxruntime and torch! ================================")
        for i, out in enumerate(onnx_output):
            try:
                np.testing.assert_allclose(torch_output[i].cpu().detach().numpy(), out, rtol=1e-03, atol=2e-04)
            except AssertionError as e:
                print("output {} mismatch {}".format(output_names[i], e))
                continue
            print("output {} match\n".format(output_names[i]))

    # ======================= Run onnx on tensorrt backend =========================================
    if check_trt:
        # use API V2 here
        print("Load onnx model from {}.".format('weights/{}.onnx'.format(name)))

        with get_engine('weights/{}.onnx'.format(name), 'weights/{}.engine'.format(name)) as engine, engine.create_execution_context() as context:
            # Notice: Here we only allocate device memory for speedup
            inputs, outputs, bindings, stream = common.allocate_buffers(engine)

            # Speed test: cpu(0.976s) vs gpu(0.719s)
            # ==> Set host input to the data.
            # The common.do_inference function will copy the input to the GPU before executing.
            inputs[0].host = x.cpu().numpy()  # for torch.Tensor
            # ==> Or set device input to the data.
            # in this mode, common.do_inference function should not copy inputs.host to inputs.device anymore.
            # c_type_pointer = ctypes.c_void_p(int(inputs[0].device))
            # x.cpu().numpy().copy_to_external(c_type_pointer)
            trt_outputs = common.do_inferenceV2(context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream, batch_size=1, h_=224, w_=224)

        print("====================== Check output between tensorRT and torch =====================================")
        for i, trt_output in enumerate(trt_outputs):
            try:
                np.testing.assert_allclose(torch_output[i].cpu().detach().numpy().reshape(-1), trt_output, rtol=1e-03, atol=2e-04)
            except AssertionError as e:
                print("output {} mismatch {}".format(output_names[i], e))
                continue
            print("output {} match\n".format(output_names[i]))

        print("script done")
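
Both examples above open an engine with common.get_engine, which either deserializes a cached .engine file or builds one from the ONNX model. A rough sketch of such a helper is shown below, again following NVIDIA's TensorRT sample code; it assumes the TensorRT 6/7 builder API (build_cuda_engine, max_workspace_size) and covers only the fixed-shape case, so the dynamic-shape optimization-profile path hinted at in the commented code is omitted.

import os

import tensorrt as trt

TRT_LOGGER = trt.Logger(trt.Logger.WARNING)


def get_engine(onnx_file_path, engine_file_path, input_shapes=None, force_rebuild=False):
    """Return a deserialized engine if a cached one exists, otherwise build it from ONNX."""
    if os.path.exists(engine_file_path) and not force_rebuild:
        with open(engine_file_path, 'rb') as f, trt.Runtime(TRT_LOGGER) as runtime:
            return runtime.deserialize_cuda_engine(f.read())

    explicit_batch = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
    with trt.Builder(TRT_LOGGER) as builder, \
            builder.create_network(explicit_batch) as network, \
            trt.OnnxParser(network, TRT_LOGGER) as parser:
        builder.max_workspace_size = 1 << 30  # 1 GiB
        builder.max_batch_size = 1
        with open(onnx_file_path, 'rb') as model:
            if not parser.parse(model.read()):
                for i in range(parser.num_errors):
                    print(parser.get_error(i))
                raise RuntimeError('Failed to parse {}'.format(onnx_file_path))
        # Fixed-shape build: pin the first network input to the requested shape.
        if input_shapes:
            network.get_input(0).shape = input_shapes[0]
        engine = builder.build_cuda_engine(network)
        with open(engine_file_path, 'wb') as f:
            f.write(engine.serialize())
        return engine
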
Example #4
torch_model = TestOps()
torch_out = torch.onnx._export(torch_model,
                               image,
                               'weights/test.onnx',
                               verbose=True,
                               opset_version=11)
from deploy import common

with common.get_engine('weights/test.onnx', 'weights/test.engine', input_shapes=((1, 128, 40, 40),), force_rebuild=True) \
        as engine, engine.create_execution_context() as context:
    # DYNAMIC shape
    # context.active_optimization_profile = 0
    # [context.set_binding_shape(x, tuple(y)) for x, y in enumerate(shape_matrix)]
    # inputs, outputs, bindings, stream = common.allocate_buffersV2(engine, context)

    # EXPLICIT shape
    inputs, outputs, bindings, stream = common.allocate_buffers(engine)

    # The common.do_inference function will copy the input to the GPU before executing.
    inputs[0].host = image.cpu().numpy()  # for torch.Tensor
    trt_outputs = common.do_inferenceV2(context,
                                        bindings=bindings,
                                        inputs=inputs,
                                        outputs=outputs,
                                        stream=stream,
                                        batch_size=1,
                                        h_=40,
                                        w_=40,
                                        c=128)
print('Done!')
def check(args, dummy_input, check_onnx=True, check_trt=True):
    ''' Check the Converted onnx model on onnxruntime and tensorrt '''
    # ======================= Run pytorch model =========================================
    # build the model from a config file and a checkpoint file
    model = init_detector(args.config, args.checkpoint, device='cuda:0')

    if hasattr(model, 'forward_dummy'):
        # model.forward = model.extract_feat
        model.forward = model.forward_dummy
    else:
        raise NotImplementedError(
            'ONNX exporting is not currently supported with {}'.
            format(model.__class__.__name__))

    with torch.no_grad():
        torch_output = model(dummy_input)
        torch_output = to_list(torch_output)

    # ======================= Run onnx on onnxruntime =========================================
    if check_onnx:
        import onnxruntime as rt
        print("Load onnx model from {}.".format(args.out))
        sess = rt.InferenceSession(args.out)

        # check input and output
        for in_blob in sess.get_inputs():
            if in_blob.name not in input_names:
                print("Input blob name does not match that in the model")
            else:
                print("Input {}, shape {} and type {}".format(in_blob.name, in_blob.shape, in_blob.type))
        for out_blob in sess.get_outputs():
            if out_blob.name not in output_names:
                print("Output blob name does not match that in the model")
            else:
                print("Output {}, shape {} and type {}".format(out_blob.name, out_blob.shape, out_blob.type))

        onnx_output = sess.run(output_names, {input_names[0]: dummy_input.cpu().numpy()})

        print("onnxruntime")
        for i, out in enumerate(onnx_output):
            try:
                np.testing.assert_allclose(torch_output[i].cpu().detach().numpy(), out, rtol=1e-03, atol=2e-04)
            except AssertionError as e:
                print("output {} mismatch {}".format(output_names[i], e))
                continue
            print("output {} match\n".format(output_names[i]))

    # ======================= Run onnx on tensorrt =========================================
    if check_trt:
        input_shapes = ((1, 3, args.shape[0], args.shape[1]),)  # explicit shape
        # input_shapes = ((1, 3, 448, 448), (1, 3, 608, 608), (1, 3, 768, 768))  # dynamic shape
        # shape_matrix = [
        #     [1, 3, args.shape[0], args.shape[1]],
        #     [1, 40, 40, 80],
        #     [1, 36, 36, 80],
        #     [1, 24, 24, 80],
        #     [1, 16, 16, 80],
        #     [1, 12, 12, 80],
        #     [1, 128, 40, 40],
        #     [1, 128, 36, 36],
        #     [1, 128, 24, 24],
        #     [1, 128, 16, 16],
        #     [1, 128, 12, 12],
        #     [1, 128, args.shape[0] // 4, args.shape[1] // 4]
        # ]
        # shape_matrix = [
        #     [1, 3, args.shape[0], args.shape[1]],
        #     [3872, 80],
        #     [3872, 128],
        #     [1, 128, args.shape[0] // 4, args.shape[1] // 4]
        # ]
        with common.get_engine(args.out, args.out.replace(".onnx", ".engine"), input_shapes=input_shapes, force_rebuild=False) \
                as engine, engine.create_execution_context() as context:
            # Notice: Here we only allocate device memory for speedup

            # DYNAMIC shape
            # context.active_optimization_profile = 0
            # [context.set_binding_shape(x, tuple(y)) for x, y in enumerate(shape_matrix)]
            # inputs, outputs, bindings, stream = common.allocate_buffersV2(engine, context)

            # EXPLICIT shape
            inputs, outputs, bindings, stream = common.allocate_buffers(engine)

            # The common.do_inference function will copy the input to the GPU before executing.
            inputs[0].host = dummy_input.cpu().numpy()  # for torch.Tensor
            # ==> Or set device input to the data.
            # in this mode, common.do_inference function should not copy inputs.host to inputs.device anymore.
            # c_type_pointer = ctypes.c_void_p(int(inputs[0].device))
            # x.cpu().numpy().copy_to_external(c_type_pointer)
            trt_outputs = common.do_inferenceV2(context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream,
                                                batch_size=1, h_=args.shape[0], w_=args.shape[1])
        print("tensorrt")
        # TODO: tensorrt output order is different from pytorch?
        # Origin
        # ids = [8, 9, 7, 6, 5, 3, 4, 2, 1, 0, 10]
        # Add permute & concate
        ids = [1, 0, 2]
        for i, (trt_output, id) in enumerate(zip(trt_outputs, ids)):
            try:
                np.testing.assert_allclose(torch_output[id].cpu().detach().numpy().reshape(-1), trt_output, rtol=1e-03, atol=2e-04)
            except AssertionError as e:
                print("output {} mismatch {}".format(output_names[id], e))
                continue
            print("output {} match\n".format(output_names[id]))

    print("script done")
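
The explicit-batch examples call common.do_inferenceV2 instead of do_inference. A minimal sketch follows: the only real difference from the implicit-batch version is that execution goes through execute_async_v2, which takes the batch size from the bindings themselves. The h_, w_ and c keyword arguments seen in the calls above are accepted for compatibility but left unused here; how the original helper uses them (for example, to reshape outputs or set dynamic binding shapes) is an assumption not reproduced in this sketch.

import pycuda.driver as cuda


def do_inferenceV2(context, bindings, inputs, outputs, stream, batch_size=1,
                   h_=None, w_=None, c=None):
    """Explicit-batch inference: copy in, execute with execute_async_v2, copy out."""
    for inp in inputs:
        cuda.memcpy_htod_async(inp.device, inp.host, stream)
    context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
    for out in outputs:
        cuda.memcpy_dtoh_async(out.host, out.device, stream)
    stream.synchronize()
    return [out.host for out in outputs]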