def run_node(cls, node, inputs, device='CPU', opset_version=_known_opset_version, outputs_info=None):
    """Run a single ONNX node through Caffe2 and return its outputs.

    The node is converted to Caffe2 operator(s) via the C++ backend,
    executed once in a fresh Workspace, and the resulting blobs are
    fetched back.

    Args:
        node: onnx NodeProto to execute.
        inputs: either a dict mapping input names to numpy arrays, or a
            sequence of numpy arrays positionally matching ``node.input``.
        device: device string understood by ``Device`` (default ``'CPU'``).
        opset_version: ONNX opset to convert against.
        outputs_info: forwarded to the base-class hook unchanged.

    Returns:
        namedtupledict keyed by ``node.output`` with the fetched values.
    """
    # Base-class hook (validation/bookkeeping in onnx.backend.base).
    super(Caffe2Backend, cls).run_node(
        node, inputs, device=device, outputs_info=outputs_info,
        opset_version=opset_version)

    value_infos = []
    device_option = get_device_option(Device(device))
    ws = Workspace()

    def _feed(name, tensor):
        # Feed the tensor into the workspace and record its serialized
        # ONNX ValueInfoProto so the converter knows its dtype/shape.
        ws.FeedBlob(name, tensor)
        value_infos.append(onnx.helper.make_tensor_value_info(
            name=name,
            elem_type=onnx.mapping.NP_TYPE_TO_TENSOR_TYPE[tensor.dtype],
            shape=tensor.shape).SerializeToString())

    with core.DeviceScope(device_option):  # temporary!
        if isinstance(inputs, dict):
            for key, value in inputs.items():
                _feed(key, value)
        else:
            # Positional inputs must line up 1:1 with the node's declared
            # input names.
            assert len(node.input) == len(inputs), \
                "{}: expected {} but got {}".format(
                    node.op_type, len(node.input), len(inputs))
            for key, value in zip(node.input, inputs):
                _feed(key, value)

        cbackend = C.Caffe2Backend(cls._dummy_name)
        # convert_node returns two lists of serialized OperatorDefs
        # (init ops, then run ops); execute them in that order.
        ops_str = cbackend.convert_node(
            node.SerializeToString(), value_infos, opset_version)
        ops = []
        for s in ops_str[0] + ops_str[1]:
            op = caffe2_pb2.OperatorDef()
            op.ParseFromString(s)
            op.device_option.CopyFrom(device_option)
            ops.append(op)
        ws.RunOperatorsOnce(ops)

        output_values = [ws.FetchBlob(name) for name in node.output]
        return namedtupledict('Outputs', node.output)(*output_values)
def op_func(*inputs, **args):
    """Execute the Caffe2 operator ``op_type`` (closed over) functionally.

    Inputs are fed as blobs named ``input_0..``, outputs are fetched from
    blobs named ``output_0..``. The number of outputs is taken from, in
    order of preference: the ``num_output`` keyword, the schema's
    ``CalculateOutput``, or the schema's ``max_output``.

    Args:
        *inputs: numpy arrays fed positionally as operator inputs.
        **args: operator arguments; ``num_output`` (int) and
            ``device_option`` are consumed specially.

    Returns:
        namedtupledict of the fetched output blobs.

    Raises:
        ValueError: if the input/output counts violate the op schema, or
            if the schema has unbounded outputs and ``num_output`` was
            not supplied.
    """
    ws = Workspace()
    schema = OpSchema.get(op_type)
    input_prefix = 'input_'
    output_prefix = 'output_'

    def get_name_list(prefix, num, max_num):
        # At most max_num sequential blob names: prefix0, prefix1, ...
        return [prefix + str(x) for x in range(min(num, max_num))]

    input_names = get_name_list(input_prefix, len(inputs), schema.max_input)
    # Verify the number of inputs is in the schema's allowed range.
    num_input = len(input_names)
    if (num_input > schema.max_input or num_input < schema.min_input
            or not schema.num_inputs_allowed(num_input)):
        raise ValueError(
            "Functional C2: Number of inputs not in range: "
            "{} - {} or not allowed.".format(
                schema.min_input, schema.max_input))

    output_names = []
    if 'num_output' in args:
        num_output = args.pop('num_output')
        if (num_output > schema.max_output
                or num_output < schema.min_output
                or not schema.num_outputs_allowed(num_output)
                or not schema.num_inputs_outputs_allowed(num_input,
                                                         num_output)):
            raise ValueError(
                "Functional C2: Number of output not in range: "
                "{} - {} or not allowed".format(
                    schema.min_output, schema.max_output))
        output_names = get_name_list(
            output_prefix, num_output, schema.max_output)

    # Fall back to the schema's own output-count calculation.
    calculated = schema.CalculateOutput(num_input)
    if not output_names and calculated != -1:
        output_names = get_name_list(
            output_prefix, calculated, schema.max_output)

    if not output_names:
        max_output = schema.max_output
        # For an op with max_output == inf and no output count derivable
        # from the schema, the user must pass num_output explicitly.
        if schema.inf == max_output:
            raise ValueError(
                "For operators with max_output == inf, "
                "user should pass num_output explicitly.")
        output_names = get_name_list(output_prefix, max_output, max_output)

    op = core.CreateOperator(op_type, input_names, output_names, **args)
    device_option = args.get('device_option',
                             core.DeviceOption(caffe2_pb2.CPU))
    with core.DeviceScope(device_option):
        for i, input_blob in enumerate(inputs):
            ws.FeedBlob(input_names[i], input_blob)
        ws.RunOperatorOnce(op)
        output_values = [ws.FetchBlob(x) for x in output_names]
        return namedtupledict('output', output_names)(*output_values)
def test_resnet50_core(self):
    """Benchmark ResNet50 under plain Caffe2/CUDA vs. the TensorRT-cut
    net, and check that both produce numerically close outputs."""
    N = 2  # batch size
    warmup = 20   # untimed iterations before measurement
    repeat = 100  # timed iterations
    print("Batch size: {}, repeat inference {} times, warmup {} times".
          format(N, repeat, warmup))
    init_net, pred_net, _ = self._get_c2_model('resnet50')
    self._add_head_tail(pred_net, 'real_data', 'real_softmax')
    input_blob_dims = (N, 3, 224, 224)  # NCHW, standard ImageNet size
    input_name = "real_data"

    # Pin every op of the predict net to GPU 0 with the CUDNN engine.
    device_option = core.DeviceOption(caffe2_pb2.CUDA, 0)
    init_net.device_option.CopyFrom(device_option)
    pred_net.device_option.CopyFrom(device_option)
    for op in pred_net.op:
        op.device_option.CopyFrom(device_option)
        op.engine = 'CUDNN'
    net_outputs = pred_net.external_output
    Y_c2 = None
    data = np.random.randn(*input_blob_dims).astype(np.float32)
    c2_time = 1
    ws = Workspace()
    with core.DeviceScope(device_option):
        # --- Baseline: run the unmodified net and time it. ---
        ws.FeedBlob(input_name, data)
        ws.RunNetOnce(init_net)
        ws.CreateNet(pred_net)
        for _ in range(warmup):
            ws.RunNet(pred_net.name)
        start = time.time()
        for _ in range(repeat):
            ws.RunNet(pred_net.name)
        end = time.time()
        c2_time = end - start
        output_values = [ws.FetchBlob(name) for name in net_outputs]
        Y_c2 = namedtupledict('Outputs', net_outputs)(*output_values)
    # Clear all blobs so the TRT run starts from a clean workspace.
    ws.ResetWorkspace()

    # Cut the graph
    # transform_caffe2_net replaces supported subgraphs with TensorRT ops;
    # it needs the input shape to build the engine.
    init_net_cut, pred_net_cut = transform_caffe2_net(
        init_net, pred_net, {input_name: input_blob_dims})
    del init_net, pred_net
    #print_net(pred_net_cut)
    Y_trt = None
    # The cut net may rename the external input; use its actual first input.
    input_name = pred_net_cut.external_input[0]
    print("C2 runtime: {}s".format(c2_time))
    ws = Workspace()
    with core.DeviceScope(device_option):
        # --- Candidate: run the TensorRT-transformed net and time it. ---
        ws.FeedBlob(input_name, data)
        ws.RunNetOnce(init_net_cut)
        ws.CreateNet(pred_net_cut)
        for _ in range(warmup):
            ws.RunNet(pred_net_cut.name)
        start = time.time()
        for _ in range(repeat):
            ws.RunNet(pred_net_cut.name)
        end = time.time()
        trt_time = end - start
        print("TRT runtime: {}s, improvement: {}%".format(
            trt_time, (c2_time - trt_time) / c2_time * 100))
        output_values = [ws.FetchBlob(name) for name in net_outputs]
        Y_trt = namedtupledict('Outputs', net_outputs)(*output_values)
    # Loose tolerance: TRT kernels are not bit-identical to CUDNN.
    np.testing.assert_allclose(Y_c2, Y_trt, rtol=1e-3)