def _test_onnx_importer(self, model_name, data_input_index,
                        opset_version=onnx.defs.onnx_opset_version()):
    model_dir = _download_onnx_model(model_name, opset_version)
    model_def = onnx.load(os.path.join(model_dir, 'model.onnx'))
    input_blob_dims = [
        int(x.dim_value) for x in
        model_def.graph.input[data_input_index].type.tensor_type.shape.dim
    ]
    op_inputs = [x.name for x in model_def.graph.input]
    op_outputs = [x.name for x in model_def.graph.output]
    print("{}".format(op_inputs))
    data = np.random.randn(*input_blob_dims).astype(np.float32)
    Y_c2 = c2.run_model(model_def, {op_inputs[data_input_index]: data})
    op = convert_onnx_model_to_trt_op(model_def, verbosity=3)
    device_option = core.DeviceOption(caffe2_pb2.CUDA, 0)
    op.device_option.CopyFrom(device_option)
    Y_trt = None
    ws = Workspace()
    with core.DeviceScope(device_option):
        ws.FeedBlob(op_inputs[data_input_index], data)
        ws.RunOperatorsOnce([op])
        output_values = [ws.FetchBlob(name) for name in op_outputs]
        Y_trt = namedtupledict('Outputs', op_outputs)(*output_values)
    np.testing.assert_allclose(Y_c2, Y_trt, rtol=1e-3)
def _test_relu_graph(self, X, batch_size, trt_max_batch_size):
    node_def = make_node("Relu", ["X"], ["Y"])
    Y_c2 = c2.run_node(node_def, {"X": X})
    graph_def = make_graph(
        [node_def],
        name="test",
        inputs=[
            make_tensor_value_info("X", onnx.TensorProto.FLOAT,
                                   [batch_size, 1, 3, 2])
        ],
        outputs=[
            make_tensor_value_info("Y", onnx.TensorProto.FLOAT,
                                   [batch_size, 1, 3, 2])
        ])
    model_def = make_model(graph_def, producer_name='relu-test')
    op_outputs = [x.name for x in model_def.graph.output]
    op = convert_onnx_model_to_trt_op(model_def,
                                      max_batch_size=trt_max_batch_size)
    device_option = core.DeviceOption(caffe2_pb2.CUDA, 0)
    op.device_option.CopyFrom(device_option)
    Y_trt = None
    ws = Workspace()
    with core.DeviceScope(device_option):
        ws.FeedBlob("X", X)
        ws.RunOperatorsOnce([op])
        output_values = [ws.FetchBlob(name) for name in op_outputs]
        Y_trt = namedtupledict('Outputs', op_outputs)(*output_values)
    np.testing.assert_almost_equal(Y_c2, Y_trt)
def c2_native_run_net(init_net, predict_net, inputs):
    ws = Workspace()
    if init_net:
        ws.RunNetOnce(init_net)
    if isinstance(inputs, dict):
        for key, value in inputs.items():
            ws.FeedBlob(key, value, predict_net.device_option)
    else:
        uninitialized = [
            input_name for input_name in predict_net.external_input
            if not ws.HasBlob(input_name)
        ]
        if len(uninitialized) == len(inputs):
            for key, value in zip(uninitialized, inputs):
                ws.FeedBlob(key, value, predict_net.device_option)
        else:
            # Everything else is already initialized, so feed the given
            # inputs into the first len(inputs) external_input slots.
            assert len(inputs) <= len(predict_net.external_input)
            for i in range(len(inputs)):
                ws.FeedBlob(predict_net.external_input[i], inputs[i],
                            predict_net.device_option)
    ws.RunNetOnce(predict_net)
    output_names = predict_net.external_output
    output_values = [ws.FetchBlob(name) for name in output_names]
    return ws, namedtupledict('Outputs', output_names)(*output_values)
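# A minimal sketch of driving c2_native_run_net with a tiny Relu net. This
# is a hypothetical usage example, not taken from the original file; it
# assumes only the standard caffe2.python.core net-building API.
import numpy as np
from caffe2.python import core

net = core.Net("relu_net")
X = net.AddExternalInput("X")
net.AddExternalOutput(net.Relu(X, "Y"))

x = np.random.randn(2, 3).astype(np.float32)
ws, outputs = c2_native_run_net(init_net=None, predict_net=net.Proto(),
                                inputs={"X": x})
print(outputs.Y)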
def benchmark_caffe2_model(init_net, predict_net, warmup_iters=3,
                           main_iters=10, layer_details=True):
    '''
    Run the benchmark net on the target model.
    Return the execution time per iteration, in milliseconds.
    '''
    ws = Workspace()
    if init_net:
        ws.RunNetOnce(init_net)
    ws.CreateNet(predict_net)
    results = ws.BenchmarkNet(predict_net.name, warmup_iters, main_iters,
                              layer_details)
    del ws
    return results[0]
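# A hypothetical benchmark of a small net with the helper above. The
# GaussianFill op in the init net creates the input blob so that CreateNet
# can resolve it; shapes and iteration counts are illustrative assumptions.
from caffe2.python import core

init_net = core.Net("bench_init")
init_net.GaussianFill([], ["data"], shape=[8, 64], mean=0.0, std=1.0)
pred_net = core.Net("bench_pred")
pred_net.Relu(["data"], ["out"])

ms_per_iter = benchmark_caffe2_model(init_net.Proto(), pred_net.Proto(),
                                     warmup_iters=5, main_iters=20)
print("avg latency: {} ms".format(ms_per_iter))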
def run_node(cls, node, inputs, device='CPU',
             opset_version=_known_opset_version):
    super(Caffe2Backend, cls).run_node(node, inputs, device)
    device_option = get_device_option(Device(device))
    with Workspace(), core.DeviceScope(device_option):  # temporary!
        if isinstance(inputs, dict):
            for key, value in inputs.items():
                workspace.FeedBlob(key, value)
        else:
            assert len(node.input) == len(inputs), \
                "{}: expected {} but got {}".format(
                    node.op_type, len(node.input), len(inputs))
            for key, value in zip(node.input, inputs):
                workspace.FeedBlob(key, value)
        cls._inplace_rewrite([node])
        init_ops, ops, _ = cls._onnx_node_to_caffe2_op(
            None, None, node, opset_version or cls._known_opset_version)
        ops = init_ops + ops
        for op in ops:
            op.device_option.CopyFrom(device_option)
        workspace.RunOperatorsOnce(ops)
        output_values = [workspace.FetchBlob(name) for name in node.output]
        return namedtupledict('Outputs', node.output)(*output_values)
def run_node(cls, node, inputs, device='CPU',
             opset_version=_known_opset_version, outputs_info=None):
    super(Caffe2Backend, cls).run_node(node, inputs, device=device,
                                       outputs_info=outputs_info)
    device_option = get_device_option(Device(device))
    with Workspace(), core.DeviceScope(device_option):  # temporary!
        if isinstance(inputs, dict):
            for key, value in inputs.items():
                workspace.FeedBlob(key, value)
        else:
            assert len(node.input) == len(inputs), \
                "{}: expected {} but got {}".format(
                    node.op_type, len(node.input), len(inputs))
            for key, value in zip(node.input, inputs):
                workspace.FeedBlob(key, value)
        ops = []
        cbackend = C.Caffe2Backend()
        ops_str = cbackend.convert_node(node.SerializeToString(),
                                        opset_version)
        for s in ops_str[0] + ops_str[1]:
            op = caffe2_pb2.OperatorDef()
            op.ParseFromString(s)
            op.device_option.CopyFrom(device_option)
            ops.append(op)
        # For testing: compare the C++ conversion against the Python one
        if "ONNX_CAFFE2_DEBUG" in os.environ:
            init_ops, ops2, _ = cls._onnx_node_to_caffe2_op(
                None, None, node, opset_version or cls._known_opset_version)
            ops2 = init_ops + ops2
            for op in ops2:
                op.device_option.CopyFrom(device_option)
            print("\nC++:\n{}\nPython:\n{}".format(ops, ops2))
        workspace.RunOperatorsOnce(ops)
        output_values = [workspace.FetchBlob(name) for name in node.output]
        return namedtupledict('Outputs', node.output)(*output_values)
def prepare(cls, model, device='CPU', **kwargs):
    '''
    For the ONNX Caffe2Backend, we require that init_graph does not
    initialize the actual input of the predict_graph. For example, if
    "img" is the input blob for the predict_net, we require that "img" is
    not initialized in init_graph or in the initializer of the
    predict_graph. We don't check for this, since there is no way to know
    which blob is the input of the predict_graph.
    '''
    super(Caffe2Backend, cls).prepare(model, device, **kwargs)
    opset_version = None
    for imp in model.opset_import:
        if not imp.HasField("domain") or imp.domain == "":
            opset_version = imp.version
            if imp.version > cls._known_opset_version:
                warnings.warn(
                    "This version of onnx-caffe2 targets ONNX operator set "
                    "version {}, but the model we are trying to import uses "
                    "version {}. We will try to import it anyway, but if the "
                    "model uses operators which had BC-breaking changes in "
                    "the intervening versions, import will fail.".format(
                        cls._known_opset_version, imp.version))
        else:
            warnings.warn("Unrecognized operator set {}".format(imp.domain))
    if opset_version is None:
        if model.ir_version >= 0x00000003:
            raise RuntimeError(
                "Model with IR version >= 3 did not specify ONNX operator "
                "set version (onnx-caffe2 requires it)")
        else:
            opset_version = 1
    ws = Workspace()
    device_option = get_device_option(Device(device))
    # Directly load initializer data into blobs in the workspace
    cls._direct_initialize_parameters(
        model.graph.initializer,
        ws,
        device_option,
    )
    initialized = {init.name for init in model.graph.initializer}
    cls._direct_initialize_inputs(
        model.graph.input,
        initialized,
        ws,
        device_option,
    )
    uninitialized = [
        value_info.name for value_info in model.graph.input
        if value_info.name not in initialized
    ]
    init_net, predict_net = cls._onnx_model_to_caffe2_net(
        model, device, opset_version, False)
    retval = Caffe2Rep(init_net, predict_net, ws, uninitialized)
    return retval
def test_resnet50(self):
    input_blob_dims = (1, 3, 224, 224)
    model_dir = _download_onnx_model('resnet50')
    model_def = onnx.load(os.path.join(model_dir, 'model.onnx'))
    op_inputs = [x.name for x in model_def.graph.input]
    op_outputs = [x.name for x in model_def.graph.output]
    n, c, h, w = input_blob_dims
    data = np.random.randn(n, c, h, w).astype(np.float32)
    Y_c2 = c2.run_model(model_def, {op_inputs[0]: data})
    op = convert_onnx_model_to_trt_op(model_def)
    device_option = core.DeviceOption(caffe2_pb2.CUDA, 0)
    op.device_option.CopyFrom(device_option)
    Y_trt = None
    ws = Workspace()
    with core.DeviceScope(device_option):
        ws.FeedBlob(op_inputs[0], data)
        ws.RunOperatorsOnce([op])
        output_values = [ws.FetchBlob(name) for name in op_outputs]
        Y_trt = namedtupledict('Outputs', op_outputs)(*output_values)
    np.testing.assert_allclose(Y_c2, Y_trt, rtol=1e-3)
def c2_native_run_net(init_net, predict_net, inputs, debug_arg=None):
    ws = Workspace()
    if init_net:
        ws.RunNetOnce(init_net)
    if isinstance(inputs, dict):
        for key, value in inputs.items():
            ws.FeedBlob(key, value, predict_net.device_option)
    else:
        uninitialized = [
            input_name for input_name in predict_net.external_input
            if not ws.HasBlob(input_name)
        ]
        if len(uninitialized) == len(inputs):
            for key, value in zip(uninitialized, inputs):
                ws.FeedBlob(key, value, predict_net.device_option)
        else:
            # Everything else is already initialized, so feed the given
            # inputs into the first len(inputs) external_input slots.
            # Extra logging to help debug sporadic sandcastle failures.
            if len(inputs) > len(predict_net.external_input):
                print("c2_native_run_net assert. len(inputs)=", len(inputs),
                      "len(predict_net.external_input)=",
                      len(predict_net.external_input))
                print("debug_arg: ", debug_arg)
                print("predict_net ", type(predict_net), ":", predict_net)
                print("inputs ", type(inputs), ":", inputs)
            assert len(inputs) <= len(predict_net.external_input)
            for i in range(len(inputs)):
                ws.FeedBlob(predict_net.external_input[i], inputs[i],
                            predict_net.device_option)
    ws.RunNetOnce(predict_net)
    output_names = predict_net.external_output
    output_values = [ws.FetchBlob(name) for name in output_names]
    return ws, namedtupledict('Outputs', output_names)(*output_values)
def run_node(cls, node, inputs, device='CPU',
             opset_version=_known_opset_version, outputs_info=None):
    super(Caffe2Backend, cls).run_node(node, inputs, device=device,
                                       outputs_info=outputs_info,
                                       opset_version=opset_version)
    value_infos = []
    device_option = get_device_option(Device(device))
    ws = Workspace()
    with core.DeviceScope(device_option):  # temporary!
        if isinstance(inputs, dict):
            for key, value in inputs.items():
                ws.FeedBlob(key, value)
                value_infos.append(
                    onnx.helper.make_tensor_value_info(
                        name=key,
                        elem_type=onnx.mapping.NP_TYPE_TO_TENSOR_TYPE[
                            value.dtype],
                        shape=value.shape).SerializeToString())
        else:
            assert len(node.input) == len(inputs), \
                "{}: expected {} but got {}".format(
                    node.op_type, len(node.input), len(inputs))
            for key, value in zip(node.input, inputs):
                ws.FeedBlob(key, value)
                value_infos.append(
                    onnx.helper.make_tensor_value_info(
                        name=key,
                        elem_type=onnx.mapping.NP_TYPE_TO_TENSOR_TYPE[
                            value.dtype],
                        shape=value.shape).SerializeToString())
        ops = []
        cbackend = C.Caffe2Backend(cls._dummy_name)
        ops_str = cbackend.convert_node(node.SerializeToString(),
                                        value_infos, opset_version)
        for s in ops_str[0] + ops_str[1]:
            op = caffe2_pb2.OperatorDef()
            op.ParseFromString(s)
            op.device_option.CopyFrom(device_option)
            ops.append(op)
        ws.RunOperatorsOnce(ops)
        output_values = [ws.FetchBlob(name) for name in node.output]
        return namedtupledict('Outputs', node.output)(*output_values)
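# A hedged sketch of calling run_node directly, in the style of the ONNX
# backend test utilities. Constructing the node via onnx.helper is the
# standard API; the Relu node itself is just an illustrative choice.
import numpy as np
from onnx.helper import make_node

node = make_node("Relu", ["X"], ["Y"])
x = np.random.randn(3, 2).astype(np.float32)
outputs = Caffe2Backend.run_node(node, [x])
np.testing.assert_array_equal(outputs.Y, np.maximum(x, 0))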
def c2_native_run_op(op_def, inputs):
    ws = Workspace()
    if isinstance(inputs, dict):
        for key, value in inputs.items():
            ws.FeedBlob(key, value, op_def.device_option)
    else:
        assert len(op_def.input) == len(inputs)
        for key, value in zip(op_def.input, inputs):
            ws.FeedBlob(key, value, op_def.device_option)
    ws.RunOperatorOnce(op_def)
    output_names = op_def.output
    output_values = [ws.FetchBlob(name) for name in output_names]
    return ws, namedtupledict('Outputs', output_names)(*output_values)
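# Hypothetical usage of c2_native_run_op, exercising the positional-inputs
# branch with a single Relu OperatorDef built through the standard
# core.CreateOperator API:
import numpy as np
from caffe2.python import core

op_def = core.CreateOperator("Relu", ["X"], ["Y"])
x = np.random.randn(4, 4).astype(np.float32)
ws, outputs = c2_native_run_op(op_def, [x])
np.testing.assert_array_equal(outputs.Y, np.maximum(x, 0))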
def prepare(cls, model, device='CPU', raw_values_dict=None, **kwargs):
    '''
    For the ONNX Caffe2Backend, we require that init_graph does not
    initialize the actual input of the predict_graph. For example, if
    "img" is the input blob for the predict_net, we require that "img" is
    not initialized in init_graph or in the initializer of the
    predict_graph. We don't check for this, since there is no way to know
    which blob is the input of the predict_graph.
    '''
    if not kwargs.pop('no_check_UNSAFE', False):
        super(Caffe2Backend, cls).prepare(model, device, **kwargs)
    opset_version = None
    for imp in model.opset_import:
        if not imp.HasField("domain") or imp.domain == "":
            opset_version = imp.version
            if imp.version > cls._known_opset_version:
                warnings.warn(
                    "This version of onnx-caffe2 targets ONNX operator set "
                    "version {}, but the model we are trying to import uses "
                    "version {}. We will try to import it anyway, but if the "
                    "model uses operators which had BC-breaking changes in "
                    "the intervening versions, import will fail.".format(
                        cls._known_opset_version, imp.version))
        else:
            warnings.warn("Unrecognized operator set {}".format(imp.domain))
    if opset_version is None:
        if model.ir_version >= 0x00000003:
            raise RuntimeError(
                "Model with IR version >= 3 did not specify ONNX operator "
                "set version (onnx-caffe2 requires it)")
        else:
            opset_version = 1

    model = onnx.shape_inference.infer_shapes(model)

    # Check whether we have RNN-related ops
    pred_model = cls.optimize_onnx(model, predict=True)
    rnn_nodes = []
    for node in pred_model.graph.node:
        if node.op_type in {'LSTM', 'GRU', 'RNN'}:
            rnn_nodes.append(node)

    # Build the C++ backend
    # TODO: build a predictor that supports GPU
    # And for RNN nets, we need to avoid adding init_net
    use_cpp_backend = device == 'CPU' and not rnn_nodes
    # use python backend for now
    use_cpp_backend = False
    if use_cpp_backend:
        c2_rnn_ops = []
        if rnn_nodes:
            init_model = cls.optimize_onnx(model, init=True)
            for node in rnn_nodes:
                c2ops = cls._onnx_node_to_caffe2_op(
                    init_model, pred_model, node, opset_version)
                init_ops = [x.SerializeToString() for x in c2ops.init_ops]
                ops = [x.SerializeToString() for x in c2ops.ops]
                external_inputs = c2ops.interface_blobs
                c2_rnn_ops.append(
                    C.Caffe2Ops(init_ops, ops, external_inputs))
            del init_model

        cbackend = C.Caffe2Backend(cls._dummy_name)
        if raw_values_dict:
            cls._external_value_resolution_pass(model, raw_values_dict)
        rep = cbackend.prepare(model.SerializeToString(), device, c2_rnn_ops)
        # For testing:
        # dump the net descriptions to file for comparison with the Python ones
        if "ONNX_CAFFE2_DEBUG" in os.environ:
            pred_net_str = rep.pred_net()
            pn = caffe2_pb2.NetDef()
            pn.ParseFromString(pred_net_str)
            init_net_str = rep.init_net()
            inn = caffe2_pb2.NetDef()
            inn.ParseFromString(init_net_str)
            with open("cpp.txt", "w") as f:
                f.write("pred_net: \n{}".format(pn))
        rep_wrapper = Caffe2CppRep(rep)
        return rep_wrapper
    else:
        ws = Workspace()
        device_option = get_device_option(Device(device))

        init_net, predict_net = cls._onnx_model_to_caffe2_net(
            model, device, opset_version, False)

        if raw_values_dict:
            cls._external_value_resolution_pass(model, raw_values_dict)

        # Directly load initializer data into blobs in the workspace
        cls._direct_initialize_parameters(
            model.graph.initializer,
            ws,
            device_option,
        )

        initialized = {init.name for init in model.graph.initializer}

        cls._direct_initialize_inputs(
            model.graph.input,
            initialized,
            ws,
            device_option,
        )

        uninitialized = [
            value_info.name for value_info in model.graph.input
            if value_info.name not in initialized
        ]

        if "ONNX_CAFFE2_DEBUG" in os.environ:
            with open("python.txt", "w") as f:
                f.write("pred_net: \n{}".format(predict_net))
        retval = Caffe2Rep(init_net, predict_net, ws, uninitialized)
        return retval
def load(cls, db_path, db_type, *args, **kwargs):
    ws = Workspace()
    with ws._ctx:
        net = prepare_prediction_net(db_path, db_type)
    # TODO: reconstruct pem so that the predictor can be saved back
    return cls(pem=None, ws=ws, predict_net=net)
def op_func(*inputs, **args):
    ws = Workspace()
    schema = OpSchema.get(op_type)
    input_prefix = 'input_'
    output_prefix = 'output_'

    def get_name_list(prefix, num, max_num):
        return [prefix + str(x) for x in range(min(num, max_num))]

    input_names, output_names = [], []
    input_names = get_name_list(input_prefix, len(inputs), schema.max_input)
    # Verify that the number of inputs is within the range allowed by
    # the schema.
    num_input = len(input_names)
    if num_input > schema.max_input or num_input < schema.min_input or \
            not schema.num_inputs_allowed(num_input):
        raise ValueError(
            "Functional C2: Number of inputs not in range: {} - {} "
            "or not allowed.".format(schema.min_input, schema.max_input))

    if 'num_output' in args:
        num_output = args['num_output']
        if num_output > schema.max_output or \
                num_output < schema.min_output or \
                not schema.num_outputs_allowed(num_output) or \
                not schema.num_inputs_outputs_allowed(num_input, num_output):
            raise ValueError(
                "Functional C2: Number of outputs not in range: {} - {} "
                "or not allowed.".format(schema.min_output,
                                         schema.max_output))
        output_names = get_name_list(output_prefix, num_output,
                                     schema.max_output)
        args.pop('num_output')
    calculated = schema.CalculateOutput(num_input)
    if not output_names and calculated != -1:
        output_names = get_name_list(output_prefix, calculated,
                                     schema.max_output)

    if not output_names:
        max_output = schema.max_output
        # For an op with max_output == inf and no output defined in the
        # schema, the user should pass num_output explicitly.
        if schema.inf == max_output:
            raise ValueError(
                "For operators with max_output == inf, the user should "
                "pass num_output explicitly.")
        output_names = get_name_list(output_prefix, max_output, max_output)

    op = core.CreateOperator(op_type, input_names, output_names, **args)
    device_option = args.get('device_option',
                             core.DeviceOption(caffe2_pb2.CPU))
    with core.DeviceScope(device_option):
        for i, input_blob in enumerate(inputs):
            ws.FeedBlob(input_names[i], input_blob)
        # Run the operator
        ws.RunOperatorOnce(op)
        output_values = [ws.FetchBlob(x) for x in output_names]
        return namedtupledict('output', output_names)(*output_values)
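# op_func is a closure over op_type, presumably returned by a factory such
# as caffe2.python.functional.Functional (an assumption based on the naming
# conventions here, not stated in this snippet). Under that assumption,
# usage would look roughly like this:
import numpy as np
from caffe2.python.functional import Functional

x = np.random.randn(2, 3).astype(np.float32)
result = Functional.Relu(x)
print(result.output_0)  # outputs are auto-named output_0, output_1, ...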
def prepare(cls, model, device='CPU', raw_values_dict=None, **kwargs):
    '''
    For the ONNX Caffe2Backend, we require that init_graph does not
    initialize the actual input of the predict_graph. For example, if
    "img" is the input blob for the predict_net, we require that "img" is
    not initialized in init_graph or in the initializer of the
    predict_graph. We don't check for this, since there is no way to know
    which blob is the input of the predict_graph.
    '''
    if not kwargs.pop('no_check_UNSAFE', False):
        super(Caffe2Backend, cls).prepare(model, device, **kwargs)
    opset_version = None
    for imp in model.opset_import:
        if not imp.HasField("domain") or imp.domain == "":
            opset_version = imp.version
            if imp.version > cls._known_opset_version:
                warnings.warn(
                    "This version of onnx-caffe2 targets ONNX operator set "
                    "version {}, but the model we are trying to import uses "
                    "version {}. We will try to import it anyway, but if the "
                    "model uses operators which had BC-breaking changes in "
                    "the intervening versions, import will fail.".format(
                        cls._known_opset_version, imp.version))
        else:
            warnings.warn("Unrecognized operator set {}".format(imp.domain))
    if opset_version is None:
        if model.ir_version >= 0x00000003:
            raise RuntimeError(
                "Model with IR version >= 3 did not specify ONNX operator "
                "set version (onnx-caffe2 requires it)")
        else:
            opset_version = 1

    # Prior to the onnx version update to onnx-1.8.0, errors raised by the
    # onnx shape inference call were being suppressed. Hence a try-except
    # block is added around the infer_shapes call to absorb these failures
    # and preserve the previous behavior.
    try:
        model = onnx.shape_inference.infer_shapes(model)
    except RuntimeError:
        warnings.warn(
            "ShapeInferenceWarning: Inferred shape and existing shape "
            "differ in rank")

    ws = Workspace()
    device_option = get_device_option(Device(device))

    init_net, predict_net = cls._onnx_model_to_caffe2_net(
        model, device, opset_version, False)

    if raw_values_dict:
        cls._external_value_resolution_pass(model, raw_values_dict)

    # Directly load initializer data into blobs in the workspace
    cls._direct_initialize_parameters(
        model.graph.initializer,
        ws,
        device_option,
    )

    initialized = {init.name for init in model.graph.initializer}

    cls._direct_initialize_inputs(
        model.graph.input,
        initialized,
        ws,
        device_option,
    )

    uninitialized = [
        value_info.name for value_info in model.graph.input
        if value_info.name not in initialized
    ]

    retval = Caffe2Rep(init_net, predict_net, ws, uninitialized)
    return retval
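# A minimal sketch of the standard ONNX backend flow built on prepare().
# The model path and input shape are placeholders, and rep.run() follows
# the generic onnx.backend.base.BackendRep interface:
import numpy as np
import onnx

model = onnx.load("model.onnx")
rep = Caffe2Backend.prepare(model, device="CPU")
data = np.random.randn(1, 3, 224, 224).astype(np.float32)
outputs = rep.run([data])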
def test_resnet50_core(self):
    N = 2
    warmup = 20
    repeat = 100
    print("Batch size: {}, repeat inference {} times, warmup {} times".
          format(N, repeat, warmup))
    init_net, pred_net, _ = self._get_c2_model('resnet50')
    self._add_head_tail(pred_net, 'real_data', 'real_softmax')
    input_blob_dims = (N, 3, 224, 224)
    input_name = "real_data"

    device_option = core.DeviceOption(caffe2_pb2.CUDA, 0)
    init_net.device_option.CopyFrom(device_option)
    pred_net.device_option.CopyFrom(device_option)
    for op in pred_net.op:
        op.device_option.CopyFrom(device_option)
        op.engine = 'CUDNN'
    net_outputs = pred_net.external_output
    Y_c2 = None
    data = np.random.randn(*input_blob_dims).astype(np.float32)
    c2_time = 1
    ws = Workspace()
    with core.DeviceScope(device_option):
        ws.FeedBlob(input_name, data)
        ws.RunNetOnce(init_net)
        ws.CreateNet(pred_net)
        for _ in range(warmup):
            ws.RunNet(pred_net.name)
        start = time.time()
        for _ in range(repeat):
            ws.RunNet(pred_net.name)
        end = time.time()
        c2_time = end - start
        output_values = [ws.FetchBlob(name) for name in net_outputs]
        Y_c2 = namedtupledict('Outputs', net_outputs)(*output_values)
    ws.ResetWorkspace()

    # Cut the graph
    init_net_cut, pred_net_cut = transform_caffe2_net(
        init_net, pred_net, {input_name: input_blob_dims})
    del init_net, pred_net
    #print_net(pred_net_cut)

    Y_trt = None
    input_name = pred_net_cut.external_input[0]
    print("C2 runtime: {}s".format(c2_time))
    ws = Workspace()
    with core.DeviceScope(device_option):
        ws.FeedBlob(input_name, data)
        ws.RunNetOnce(init_net_cut)
        ws.CreateNet(pred_net_cut)
        for _ in range(warmup):
            ws.RunNet(pred_net_cut.name)
        start = time.time()
        for _ in range(repeat):
            ws.RunNet(pred_net_cut.name)
        end = time.time()
        trt_time = end - start
        print("TRT runtime: {}s, improvement: {}%".format(
            trt_time, (c2_time - trt_time) / c2_time * 100))
        output_values = [ws.FetchBlob(name) for name in net_outputs]
        Y_trt = namedtupledict('Outputs', net_outputs)(*output_values)
    np.testing.assert_allclose(Y_c2, Y_trt, rtol=1e-3)