Example #1
 def _test_onnx_importer(self,
                         model_name,
                         data_input_index,
                         opset_version=onnx.defs.onnx_opset_version()):
     model_dir = _download_onnx_model(model_name, opset_version)
     model_def = onnx.load(os.path.join(model_dir, 'model.onnx'))
     input_blob_dims = [
         int(x.dim_value) for x in
         model_def.graph.input[data_input_index].type.tensor_type.shape.dim
     ]
     op_inputs = [x.name for x in model_def.graph.input]
     op_outputs = [x.name for x in model_def.graph.output]
     print("{}".format(op_inputs))
     data = np.random.randn(*input_blob_dims).astype(np.float32)
     Y_c2 = c2.run_model(model_def, {op_inputs[data_input_index]: data})
     op = convert_onnx_model_to_trt_op(model_def, verbosity=3)
     device_option = core.DeviceOption(caffe2_pb2.CUDA, 0)
     op.device_option.CopyFrom(device_option)
     Y_trt = None
     ws = Workspace()
     with core.DeviceScope(device_option):
         ws.FeedBlob(op_inputs[data_input_index], data)
         ws.RunOperatorsOnce([op])
         output_values = [ws.FetchBlob(name) for name in op_outputs]
         Y_trt = namedtupledict('Outputs', op_outputs)(*output_values)
     np.testing.assert_allclose(Y_c2, Y_trt, rtol=1e-3)
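
For reference, a sketch of the imports this test (and the similar snippets below) appears to rely on. The module paths follow the Caffe2/ONNX source tree but are a best guess, not a verbatim header; `_download_onnx_model` is a local test helper, not a library import.

import os
import numpy as np
import onnx
import caffe2.python.onnx.backend as c2                      # provides run_model
from caffe2.proto import caffe2_pb2
from caffe2.python import core
from caffe2.python.onnx.workspace import Workspace
from caffe2.python.trt.transform import convert_onnx_model_to_trt_op
from onnx.backend.base import namedtupledict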
Example #2
 def _test_relu_graph(self, X, batch_size, trt_max_batch_size):
     node_def = make_node("Relu", ["X"], ["Y"])
     Y_c2 = c2.run_node(node_def, {"X": X})
     graph_def = make_graph(
         [node_def],
         name="test",
         inputs=[
             make_tensor_value_info("X", onnx.TensorProto.FLOAT,
                                    [batch_size, 1, 3, 2])
         ],
         outputs=[
             make_tensor_value_info("Y", onnx.TensorProto.FLOAT,
                                    [batch_size, 1, 3, 2])
         ])
     model_def = make_model(graph_def, producer_name='relu-test')
     op_outputs = [x.name for x in model_def.graph.output]
     op = convert_onnx_model_to_trt_op(model_def,
                                       max_batch_size=trt_max_batch_size)
     device_option = core.DeviceOption(caffe2_pb2.CUDA, 0)
     op.device_option.CopyFrom(device_option)
     Y_trt = None
     ws = Workspace()
     with core.DeviceScope(device_option):
         ws.FeedBlob("X", X)
         ws.RunOperatorsOnce([op])
         output_values = [ws.FetchBlob(name) for name in op_outputs]
         Y_trt = namedtupledict('Outputs', op_outputs)(*output_values)
     np.testing.assert_almost_equal(Y_c2, Y_trt)
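
The graph-construction helpers used above are part of the public onnx.helper module. A minimal, self-contained sketch that builds and validates the same Relu model:

import onnx
from onnx.helper import make_node, make_graph, make_model, make_tensor_value_info

node = make_node("Relu", ["X"], ["Y"])
graph = make_graph(
    [node],
    name="relu",
    inputs=[make_tensor_value_info("X", onnx.TensorProto.FLOAT, [1, 1, 3, 2])],
    outputs=[make_tensor_value_info("Y", onnx.TensorProto.FLOAT, [1, 1, 3, 2])])
model = make_model(graph, producer_name="relu-test")
onnx.checker.check_model(model)  # raises if the hand-built model is malformed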
Example #3
def c2_native_run_net(init_net, predict_net, inputs):
    ws = Workspace()
    if init_net:
        ws.RunNetOnce(init_net)

    if isinstance(inputs, dict):
        for key, value in inputs.items():
            ws.FeedBlob(key, value, predict_net.device_option)
    else:
        uninitialized = [
            input_name for input_name in predict_net.external_input
            if not ws.HasBlob(input_name)
        ]
        if len(uninitialized) == len(inputs):
            for key, value in zip(uninitialized, inputs):
                ws.FeedBlob(key, value, predict_net.device_option)
        else:
            # If everything is already initialized,
            # feed the given inputs into the first len(inputs) external_input blobs.
            assert (len(inputs) <= len(predict_net.external_input))
            for i in range(len(inputs)):
                ws.FeedBlob(predict_net.external_input[i], inputs[i],
                            predict_net.device_option)

    ws.RunNetOnce(predict_net)

    output_names = predict_net.external_output
    output_values = [ws.FetchBlob(name) for name in output_names]
    return ws, namedtupledict('Outputs', output_names)(*output_values)
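
A minimal usage sketch for c2_native_run_net; the Relu net below is illustrative and assumes only the standard caffe2.python.core module:

import numpy as np
from caffe2.python import core

# Build a trivial predict net: Y = Relu(X), with X fed from outside.
net = core.Net("relu_example")
net.AddExternalInput("X")
y = net.Relu(["X"], ["Y"])
net.AddExternalOutput(y)

X = np.random.randn(2, 3).astype(np.float32)
ws, outputs = c2_native_run_net(None, net.Proto(), {"X": X})
print(outputs.Y)  # same shape as X, with negatives clamped to zero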
Example #4
def benchmark_caffe2_model(init_net, predict_net, warmup_iters=3, main_iters=10, layer_details=True):
    '''
        Run the benchmark net on the target model.
        Return the execution time per iteration, in milliseconds.
    '''
    ws = Workspace()
    if init_net:
        ws.RunNetOnce(init_net)
    ws.CreateNet(predict_net)
    results = ws.BenchmarkNet(predict_net.name, warmup_iters, main_iters, layer_details)
    del ws
    return results[0]
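
A matching sketch for benchmark_caffe2_model. BenchmarkNet needs the input blobs to already exist, so an illustrative init net materializes X first:

from caffe2.python import core

# The init net fills the input blob; the predict net is a single Relu.
init_net = core.Net("init")
init_net.GaussianFill([], ["X"], shape=[32, 128], mean=0.0, std=1.0)

predict_net = core.Net("relu_bench")
predict_net.Relu(["X"], ["Y"])

ms_per_iter = benchmark_caffe2_model(init_net.Proto(), predict_net.Proto())
print("{:.3f} ms per iteration".format(ms_per_iter))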
Example #5
    def run_node(cls,
                 node,
                 inputs,
                 device='CPU',
                 opset_version=_known_opset_version):
        super(Caffe2Backend, cls).run_node(node, inputs, device)

        device_option = get_device_option(Device(device))
        with Workspace(), core.DeviceScope(device_option):  # temporary!
            if isinstance(inputs, dict):
                for key, value in inputs.items():
                    workspace.FeedBlob(key, value)
            else:
                assert len(node.input) == len(
                    inputs), "{}: expected {} but got {}".format(
                        node.op_type, len(node.input), len(inputs))
                for key, value in zip(node.input, inputs):
                    workspace.FeedBlob(key, value)

            cls._inplace_rewrite([node])
            init_ops, ops, _ = cls._onnx_node_to_caffe2_op(
                None, None, node, opset_version or cls._known_opset_version)
            ops = init_ops + ops
            for op in ops:
                op.device_option.CopyFrom(device_option)
            workspace.RunOperatorsOnce(ops)
            output_values = [workspace.FetchBlob(name) for name in node.output]
            return namedtupledict('Outputs', node.output)(*output_values)
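
For context, the ONNX backend API also exposes run_node at module level, so a single node can be exercised directly. A sketch, assuming the backend is importable as caffe2.python.onnx.backend:

import numpy as np
from onnx.helper import make_node
import caffe2.python.onnx.backend as backend

# Run one Relu node end to end on CPU.
node = make_node("Relu", ["X"], ["Y"])
X = np.random.randn(3, 2).astype(np.float32)
outputs = backend.run_node(node, [X])
assert (outputs.Y >= 0).all()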
Example #6
    def run_node(cls, node, inputs, device='CPU', opset_version=_known_opset_version, outputs_info=None):
        super(Caffe2Backend, cls).run_node(node, inputs, device=device, outputs_info=outputs_info)

        device_option = get_device_option(Device(device))
        with Workspace(), core.DeviceScope(device_option):  # temporary!
            if isinstance(inputs, dict):
                for key, value in inputs.items():
                    workspace.FeedBlob(key, value)
            else:
                assert len(node.input) == len(inputs), "{}: expected {} but got {}".format(
                    node.op_type, len(node.input), len(inputs))
                for key, value in zip(node.input, inputs):
                    workspace.FeedBlob(key, value)

            ops = []
            cbackend = C.Caffe2Backend()
            ops_str = cbackend.convert_node(node.SerializeToString(), opset_version)
            for s in ops_str[0] + ops_str[1]:
                op = caffe2_pb2.OperatorDef()
                op.ParseFromString(s)
                op.device_option.CopyFrom(device_option)
                ops.append(op)
            # For testing
            if "ONNX_CAFFE2_DEBUG" in os.environ:
                init_ops, ops2, _ = cls._onnx_node_to_caffe2_op(
                    None, None, node, opset_version or cls._known_opset_version)
                ops2 = init_ops + ops2
                for op in ops2:
                    op.device_option.CopyFrom(device_option)
                print("\nC++:\n{}\nPython:\n{}".format(ops, ops2))
            workspace.RunOperatorsOnce(ops)
            output_values = [workspace.FetchBlob(name) for name in node.output]
            return namedtupledict('Outputs', node.output)(*output_values)
Example #7
    def prepare(cls, model, device='CPU', **kwargs):
        '''
        For the ONNX Caffe2Backend, we require that init_graph does not
        initialize the actual input of the predict_graph. For example, if
        "img" is the input blob for the predict_net, "img" must not be
        initialized in init_graph or in the initializer of the predict_graph.
        We don't check for this, since there is no way to know which blob is
        the input of the predict_graph.
        '''
        super(Caffe2Backend, cls).prepare(model, device, **kwargs)

        opset_version = None
        for imp in model.opset_import:
            if not imp.HasField("domain") or imp.domain == "":
                opset_version = imp.version
                if imp.version > cls._known_opset_version:
                    warnings.warn(
                        "This version of onnx-caffe2 targets ONNX operator set version {}, but the model we are trying to import uses version {}.  We will try to import it anyway, but if the model uses operators which had BC-breaking changes in the intervening versions, import will fail."
                        .format(cls._known_opset_version, imp.version))
            else:
                warnings.warn("Unrecognized operator set {}".format(
                    imp.domain))
        if opset_version is None:
            if model.ir_version >= 0x00000003:
                raise RuntimeError(
                    "Model with IR version >= 3 did not specify ONNX operator set version (onnx-caffe2 requires it)"
                )
            else:
                opset_version = 1

        ws = Workspace()
        device_option = get_device_option(Device(device))

        # Directly load initializer data into blobs in workspace
        cls._direct_initialize_parameters(
            model.graph.initializer,
            ws,
            device_option,
        )

        initialized = {init.name for init in model.graph.initializer}

        cls._direct_initialize_inputs(
            model.graph.input,
            initialized,
            ws,
            device_option,
        )

        uninitialized = [
            value_info.name for value_info in model.graph.input
            if value_info.name not in initialized
        ]

        init_net, predict_net = cls._onnx_model_to_caffe2_net(
            model, device, opset_version, False)

        retval = Caffe2Rep(init_net, predict_net, ws, uninitialized)
        return retval
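
In practice prepare is reached through the module-level ONNX backend API rather than called directly. A minimal sketch (the model path and input shape are illustrative):

import numpy as np
import onnx
import caffe2.python.onnx.backend as backend

model = onnx.load("model.onnx")  # illustrative path
rep = backend.prepare(model, device="CPU")  # a Caffe2Rep, as returned above
# Positional inputs are matched against the graph's uninitialized inputs.
outputs = rep.run([np.random.randn(1, 3, 224, 224).astype(np.float32)])
print(outputs[0].shape)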
Example #8
 def test_resnet50(self):
     input_blob_dims = (1, 3, 224, 224)
     model_dir = _download_onnx_model('resnet50')
     model_def = onnx.load(os.path.join(model_dir, 'model.onnx'))
     op_inputs = [x.name for x in model_def.graph.input]
     op_outputs = [x.name for x in model_def.graph.output]
     n, c, h, w = input_blob_dims
     data = np.random.randn(n, c, h, w).astype(np.float32)
     Y_c2 = c2.run_model(model_def, {op_inputs[0]: data})
     op = convert_onnx_model_to_trt_op(model_def)
     device_option = core.DeviceOption(caffe2_pb2.CUDA, 0)
     op.device_option.CopyFrom(device_option)
     Y_trt = None
     ws = Workspace()
     with core.DeviceScope(device_option):
         ws.FeedBlob(op_inputs[0], data)
         ws.RunOperatorsOnce([op])
         output_values = [ws.FetchBlob(name) for name in op_outputs]
         Y_trt = namedtupledict('Outputs', op_outputs)(*output_values)
     np.testing.assert_allclose(Y_c2, Y_trt, rtol=1e-3)
Example #9
def c2_native_run_net(init_net, predict_net, inputs, debug_arg=None):
    ws = Workspace()
    if init_net:
        ws.RunNetOnce(init_net)

    if isinstance(inputs, dict):
        for key, value in inputs.items():
            ws.FeedBlob(key, value, predict_net.device_option)
    else:
        uninitialized = [
            input_name for input_name in predict_net.external_input
            if not ws.HasBlob(input_name)
        ]
        if len(uninitialized) == len(inputs):
            for key, value in zip(uninitialized, inputs):
                ws.FeedBlob(key, value, predict_net.device_option)
        else:
            # If everything is already initialized,
            # feed the given inputs into the first len(inputs) external_input blobs.
            # Extra logging below helps debug sporadic sandcastle failures.
            if len(inputs) > len(predict_net.external_input):
                print("c2_native_run_net assert. len(inputs)=", len(inputs),
                      "len(predict_net.external_input)=",
                      len(predict_net.external_input))
                print("debug_arg: ", debug_arg)
                print("predict_net ", type(predict_net), ":", predict_net)
                print("inputs ", type(inputs), ":", inputs)
            assert (len(inputs) <= len(predict_net.external_input))
            for i in range(len(inputs)):
                ws.FeedBlob(predict_net.external_input[i], inputs[i],
                            predict_net.device_option)

    ws.RunNetOnce(predict_net)

    output_names = predict_net.external_output
    output_values = [ws.FetchBlob(name) for name in output_names]
    return ws, namedtupledict('Outputs', output_names)(*output_values)
Example #10
    def run_node(cls,
                 node,
                 inputs,
                 device='CPU',
                 opset_version=_known_opset_version,
                 outputs_info=None):
        super(Caffe2Backend, cls).run_node(node,
                                           inputs,
                                           device=device,
                                           outputs_info=outputs_info,
                                           opset_version=opset_version)

        value_infos = []
        device_option = get_device_option(Device(device))
        ws = Workspace()
        with core.DeviceScope(device_option):  # temporary!
            if isinstance(inputs, dict):
                for key, value in inputs.items():
                    ws.FeedBlob(key, value)
                    value_infos.append(
                        onnx.helper.make_tensor_value_info(
                            name=key,
                            elem_type=onnx.mapping.NP_TYPE_TO_TENSOR_TYPE[
                                value.dtype],
                            shape=value.shape).SerializeToString())
            else:
                assert len(node.input) == len(
                    inputs), "{}: expected {} but got {}".format(
                        node.op_type, len(node.input), len(inputs))
                for key, value in zip(node.input, inputs):
                    ws.FeedBlob(key, value)
                    value_infos.append(
                        onnx.helper.make_tensor_value_info(
                            name=key,
                            elem_type=onnx.mapping.NP_TYPE_TO_TENSOR_TYPE[
                                value.dtype],
                            shape=value.shape).SerializeToString())

            ops = []
            cbackend = C.Caffe2Backend(cls._dummy_name)
            ops_str = cbackend.convert_node(node.SerializeToString(),
                                            value_infos, opset_version)
            for s in ops_str[0] + ops_str[1]:
                op = caffe2_pb2.OperatorDef()
                op.ParseFromString(s)
                op.device_option.CopyFrom(device_option)
                ops.append(op)
            ws.RunOperatorsOnce(ops)
            output_values = [ws.FetchBlob(name) for name in node.output]
            return namedtupledict('Outputs', node.output)(*output_values)
Example #11
def c2_native_run_op(op_def, inputs):
    ws = Workspace()
    if isinstance(inputs, dict):
        for key, value in inputs.items():
            ws.FeedBlob(key, value, op_def.device_option)
    else:
        assert (len(op_def.input) == len(inputs))
        for key, value in zip(op_def.input, inputs):
            ws.FeedBlob(key, value, op_def.device_option)

    ws.RunOperatorOnce(op_def)

    output_names = op_def.output
    output_values = [ws.FetchBlob(name) for name in output_names]
    return ws, namedtupledict('Outputs', output_names)(*output_values)
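
A minimal usage sketch for c2_native_run_op, running a single illustrative operator:

import numpy as np
from caffe2.python import core

op_def = core.CreateOperator("Relu", ["X"], ["Y"])
X = np.random.randn(4, 4).astype(np.float32)
ws, outputs = c2_native_run_op(op_def, [X])
assert (outputs.Y >= 0).all()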
Example #12
    def prepare(cls, model, device='CPU', raw_values_dict=None, **kwargs):
        '''
        For the ONNX Caffe2Backend, we require that init_graph does not
        initialize the actual input of the predict_graph. For example, if
        "img" is the input blob for the predict_net, "img" must not be
        initialized in init_graph or in the initializer of the predict_graph.
        We don't check for this, since there is no way to know which blob is
        the input of the predict_graph.
        '''
        if not kwargs.pop('no_check_UNSAFE', False):
            super(Caffe2Backend, cls).prepare(model, device, **kwargs)
        opset_version = None
        for imp in model.opset_import:
            if not imp.HasField("domain") or imp.domain == "":
                opset_version = imp.version
                if imp.version > cls._known_opset_version:
                    warnings.warn(
                        "This version of onnx-caffe2 targets ONNX operator set version {}, but the model we are trying to import uses version {}.  We will try to import it anyway, but if the model uses operators which had BC-breaking changes in the intervening versions, import will fail."
                        .format(cls._known_opset_version, imp.version))
            else:
                warnings.warn("Unrecognized operator set {}".format(
                    imp.domain))
        if opset_version is None:
            if model.ir_version >= 0x00000003:
                raise RuntimeError(
                    "Model with IR version >= 3 did not specify ONNX operator set version (onnx-caffe2 requires it)"
                )
            else:
                opset_version = 1

        model = onnx.shape_inference.infer_shapes(model)

        # Check whether we have RNN related ops
        pred_model = cls.optimize_onnx(model, predict=True)
        rnn_nodes = []
        for node in pred_model.graph.node:
            if node.op_type in {'LSTM', 'GRU', 'RNN'}:
                rnn_nodes.append(node)

        # Build the C++ backend
        # TODO: build a predictor that supports GPU
        #       And for RNN nets, we need to avoid adding init_net
        use_cpp_backend = device == 'CPU' and not rnn_nodes
        # use python backend for now
        use_cpp_backend = False
        if use_cpp_backend:
            c2_rnn_ops = []
            if rnn_nodes:
                init_model = cls.optimize_onnx(model, init=True)
                for node in rnn_nodes:
                    c2ops = cls._onnx_node_to_caffe2_op(
                        init_model, pred_model, node, opset_version)
                    init_ops = [x.SerializeToString() for x in c2ops.init_ops]
                    ops = [x.SerializeToString() for x in c2ops.ops]
                    external_inputs = c2ops.interface_blobs
                    c2_rnn_ops.append(
                        C.Caffe2Ops(init_ops, ops, external_inputs))
                del init_model

            cbackend = C.Caffe2Backend(cls._dummy_name)
            if raw_values_dict:
                cls._external_value_resolution_pass(model, raw_values_dict)
            rep = cbackend.prepare(model.SerializeToString(), device,
                                   c2_rnn_ops)
            # For testing
            # Dump the net descriptions to file for comparison with the Python ones
            if "ONNX_CAFFE2_DEBUG" in os.environ:
                pred_net_str = rep.pred_net()
                pn = caffe2_pb2.NetDef()
                pn.ParseFromString(pred_net_str)
                init_net_str = rep.init_net()
                inn = caffe2_pb2.NetDef()
                inn.ParseFromString(init_net_str)
                with open("cpp.txt", "w") as f:
                    f.write("pred_net: \n{}".format(pn))

            rep_wrapper = Caffe2CppRep(rep)
            return rep_wrapper
        else:
            ws = Workspace()
            device_option = get_device_option(Device(device))

            init_net, predict_net = cls._onnx_model_to_caffe2_net(
                model, device, opset_version, False)

            if raw_values_dict:
                cls._external_value_resolution_pass(model, raw_values_dict)

            # Directly load initializer data into blobs in workspace
            cls._direct_initialize_parameters(
                model.graph.initializer,
                ws,
                device_option,
            )

            initialized = {init.name for init in model.graph.initializer}

            cls._direct_initialize_inputs(
                model.graph.input,
                initialized,
                ws,
                device_option,
            )

            uninitialized = [
                value_info.name for value_info in model.graph.input
                if value_info.name not in initialized
            ]

            if "ONNX_CAFFE2_DEBUG" in os.environ:
                with open("python.txt", "w") as f:
                    f.write("pred_net: \n{}".format(predict_net))
            retval = Caffe2Rep(init_net, predict_net, ws, uninitialized)
            return retval
Example #13
 def load(cls, db_path, db_type, *args, **kwargs):
     ws = Workspace()
     with ws._ctx:
         net = prepare_prediction_net(db_path, db_type)
         # TODO: reconstruct pem if so the predictor can be saved back
     return cls(pem=None, ws=ws, predict_net=net)
Example #14
        def op_func(*inputs, **args):
            ws = Workspace()
            schema = OpSchema.get(op_type)
            input_prefix = 'input_'
            output_prefix = 'output_'

            def get_name_list(prefix, num, max_num):
                return [prefix + str(x) for x in range(min(num, max_num))]

            input_names, output_names = [], []
            input_names = get_name_list(
                input_prefix, len(inputs), schema.max_input
            )
            # Verify that the number of inputs is within the range
            # allowed by the schema.
            num_input = len(input_names)
            if num_input > schema.max_input or num_input < \
               schema.min_input or not schema.num_inputs_allowed(num_input):
                raise ValueError(
                    "Functional C2: Number of inputs not in "
                    "range: {} - {} or not allowed.".format(
                        schema.min_input, schema.max_input)
                )

            if 'num_output' in args:
                num_output = args['num_output']
                if num_output > schema.max_output or \
                   num_output < schema.min_output or \
                   not schema.num_outputs_allowed(num_output) or \
                   not schema.num_inputs_outputs_allowed(num_input,
                                                         num_output):
                    raise ValueError(
                        "Functional C2: Number of outputs not in "
                        "range: {} - {} or not allowed.".format(
                            schema.min_output, schema.max_output)
                    )
                output_names = get_name_list(
                    output_prefix, num_output, schema.max_output
                )
                args.pop('num_output')
            calculated = schema.CalculateOutput(num_input)
            if not output_names and calculated != -1:
                output_names = get_name_list(
                    output_prefix, calculated, schema.max_output
                )

            if not output_names:
                max_output = schema.max_output
                # For an op whose max_output is inf and whose schema defines
                # no outputs, the caller must pass num_output explicitly.
                if schema.inf == max_output:
                    raise ValueError(
                        "For operators with max_output == inf, "
                        "the caller should pass num_output explicitly."
                    )
                output_names = get_name_list(
                    output_prefix, max_output, max_output
                )

            op = core.CreateOperator(
                op_type, input_names, output_names, **args
            )
            device_option = args.get('device_option', core.DeviceOption(caffe2_pb2.CPU))
            with core.DeviceScope(device_option):
                for i, input_blob in enumerate(inputs):
                    ws.FeedBlob(input_names[i], input_blob)
                # Run the operator once and fetch its outputs.
                ws.RunOperatorOnce(op)
                output_values = [ws.FetchBlob(x) for x in output_names]
                return namedtupledict('output', output_names)(*output_values)
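
In the Caffe2 tree this factory backs the Functional object in caffe2.python.functional, so a single op can be run eagerly. A hedged sketch; the module path and attribute protocol are assumed from the source tree:

import numpy as np
from caffe2.python.functional import Functional

X = np.random.randn(2, 3).astype(np.float32)
result = Functional.Relu(X)  # attribute lookup builds op_func for "Relu"
print(result.output_0)       # outputs are named output_0, output_1, ...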
Example #15
    def prepare(cls, model, device='CPU', raw_values_dict=None, **kwargs):
        '''
        For the ONNX Caffe2Backend, we require that init_graph does not
        initialize the actual input of the predict_graph. For example, if
        "img" is the input blob for the predict_net, "img" must not be
        initialized in init_graph or in the initializer of the predict_graph.
        We don't check for this, since there is no way to know which blob is
        the input of the predict_graph.
        '''
        if not kwargs.pop('no_check_UNSAFE', False):
            super(Caffe2Backend, cls).prepare(model, device, **kwargs)
        opset_version = None
        for imp in model.opset_import:
            if not imp.HasField("domain") or imp.domain == "":
                opset_version = imp.version
                if imp.version > cls._known_opset_version:
                    warnings.warn(
                        "This version of onnx-caffe2 targets ONNX operator set version {}, but the model we are trying to import uses version {}.  We will try to import it anyway, but if the model uses operators which had BC-breaking changes in the intervening versions, import will fail."
                        .format(cls._known_opset_version, imp.version))
            else:
                warnings.warn("Unrecognized operator set {}".format(
                    imp.domain))
        if opset_version is None:
            if model.ir_version >= 0x00000003:
                raise RuntimeError(
                    "Model with IR version >= 3 did not specify ONNX operator set version (onnx-caffe2 requires it)"
                )
            else:
                opset_version = 1

        # Prior to the onnx-1.8.0 update, errors raised by the ONNX shape
        # inference call were silently suppressed. A try-except block is kept
        # around infer_shapes to preserve that behavior.
        try:
            model = onnx.shape_inference.infer_shapes(model)
        except RuntimeError:
            warnings.warn(
                "ShapeInferenceWarning: Inferred shape and existing shape differ in rank"
            )

        ws = Workspace()
        device_option = get_device_option(Device(device))

        init_net, predict_net = cls._onnx_model_to_caffe2_net(
            model, device, opset_version, False)

        if raw_values_dict:
            cls._external_value_resolution_pass(model, raw_values_dict)

        # Directly load initializer data into blobs in workspace
        cls._direct_initialize_parameters(
            model.graph.initializer,
            ws,
            device_option,
        )

        initialized = {init.name for init in model.graph.initializer}

        cls._direct_initialize_inputs(
            model.graph.input,
            initialized,
            ws,
            device_option,
        )

        uninitialized = [
            value_info.name for value_info in model.graph.input
            if value_info.name not in initialized
        ]

        retval = Caffe2Rep(init_net, predict_net, ws, uninitialized)
        return retval
Example #16
    def test_resnet50_core(self):
        N = 2
        warmup = 20
        repeat = 100
        print("Batch size: {}, repeat inference {} times, warmup {} times".
              format(N, repeat, warmup))
        init_net, pred_net, _ = self._get_c2_model('resnet50')
        self._add_head_tail(pred_net, 'real_data', 'real_softmax')
        input_blob_dims = (N, 3, 224, 224)
        input_name = "real_data"

        device_option = core.DeviceOption(caffe2_pb2.CUDA, 0)
        init_net.device_option.CopyFrom(device_option)
        pred_net.device_option.CopyFrom(device_option)
        for op in pred_net.op:
            op.device_option.CopyFrom(device_option)
            op.engine = 'CUDNN'
        net_outputs = pred_net.external_output
        Y_c2 = None
        data = np.random.randn(*input_blob_dims).astype(np.float32)
        c2_time = 1
        ws = Workspace()
        with core.DeviceScope(device_option):
            ws.FeedBlob(input_name, data)
            ws.RunNetOnce(init_net)
            ws.CreateNet(pred_net)
            for _ in range(warmup):
                ws.RunNet(pred_net.name)
            start = time.time()
            for _ in range(repeat):
                ws.RunNet(pred_net.name)
            end = time.time()
            c2_time = end - start
            output_values = [ws.FetchBlob(name) for name in net_outputs]
            Y_c2 = namedtupledict('Outputs', net_outputs)(*output_values)
        ws.ResetWorkspace()

        # Cut the graph
        init_net_cut, pred_net_cut = transform_caffe2_net(
            init_net, pred_net, {input_name: input_blob_dims})
        del init_net, pred_net
        #print_net(pred_net_cut)

        Y_trt = None
        input_name = pred_net_cut.external_input[0]
        print("C2 runtime: {}s".format(c2_time))
        ws = Workspace()
        with core.DeviceScope(device_option):
            ws.FeedBlob(input_name, data)
            ws.RunNetOnce(init_net_cut)
            ws.CreateNet(pred_net_cut)
            for _ in range(warmup):
                ws.RunNet(pred_net_cut.name)
            start = time.time()
            for _ in range(repeat):
                ws.RunNet(pred_net_cut.name)
            end = time.time()
            trt_time = end - start
            print("TRT runtime: {}s, improvement: {}%".format(
                trt_time, (c2_time - trt_time) / c2_time * 100))
            output_values = [ws.FetchBlob(name) for name in net_outputs]
            Y_trt = namedtupledict('Outputs', net_outputs)(*output_values)
        np.testing.assert_allclose(Y_c2, Y_trt, rtol=1e-3)