def c2_native_run_net(init_net, predict_net, inputs):
    """Run ``predict_net`` in a fresh workspace and fetch its outputs.

    ``init_net`` is run once first when it is truthy.  ``inputs`` is either
    a dict mapping blob names to values or a sequence of values.  A sequence
    is matched against the external inputs that are not yet present in the
    workspace when the counts agree; otherwise it is matched positionally
    against the leading entries of ``predict_net.external_input``.

    Returns a ``(workspace, outputs)`` pair, where ``outputs`` is a
    namedtupledict keyed by ``predict_net.external_output``.
    """
    ws = Workspace()
    if init_net:
        ws.RunNetOnce(init_net)

    if isinstance(inputs, dict):
        feeds = list(inputs.items())
    else:
        missing = [
            blob for blob in predict_net.external_input
            if not ws.HasBlob(blob)
        ]
        if len(missing) == len(inputs):
            feeds = list(zip(missing, inputs))
        else:
            # Counts differ (e.g. everything is already initialized), so
            # the first len(inputs) external inputs are overwritten instead.
            assert len(inputs) <= len(predict_net.external_input)
            feeds = [
                (predict_net.external_input[pos], value)
                for pos, value in enumerate(inputs)
            ]

    for blob, value in feeds:
        ws.FeedBlob(blob, value, predict_net.device_option)

    ws.RunNetOnce(predict_net)

    names = predict_net.external_output
    fetched = [ws.FetchBlob(blob) for blob in names]
    return ws, namedtupledict('Outputs', names)(*fetched)
def run(self, inputs, **kwargs):
    """Feed ``inputs`` into the workspace, run the nets, fetch the outputs.

    ``inputs`` may be a dict of blob name to value, a list/tuple holding one
    value per entry of ``self.uninitialized``, or a single value that is fed
    to ``self.uninitialized[0]``.  The nets are created, and the init net is
    run, only while ``self.nets_created`` / ``self.ran_init_net`` are false.

    Raises RuntimeError when a list/tuple has the wrong number of values.
    Returns a namedtupledict keyed by ``self.predict_net.external_output``.
    """
    super(Caffe2Rep, self).run(inputs, **kwargs)

    with core.DeviceScope(self.predict_net.device_option):
        if isinstance(inputs, dict):
            with core.NameScope(self._name_scope):
                for blob, value in inputs.items():
                    self.workspace.FeedBlob(blob, value)
        elif isinstance(inputs, (list, tuple)):
            if len(self.uninitialized) != len(inputs):
                message = ('Expected {} values for uninitialized '
                           'graph inputs ({}), but got {}.')
                raise RuntimeError(message.format(
                    len(self.uninitialized),
                    ', '.join(self.uninitialized),
                    len(inputs)))
            # The name scope is already baked into the protobuf names.
            for blob, value in zip(self.uninitialized, inputs):
                self.workspace.FeedBlob(blob, value)
        else:
            # A bare value is treated as the single uninitialized input.
            self.workspace.FeedBlob(self.uninitialized[0], inputs)

        if not self.nets_created:
            self.workspace.CreateNet(self.init_net)
            self.workspace.CreateNet(self.predict_net)
            self.nets_created = True
        if not self.ran_init_net:
            self.workspace.RunNet(self.init_net.name)
            self.ran_init_net = True
        self.workspace.RunNet(self.predict_net.name)

    fetched = [
        self.workspace.FetchBlob(blob)
        for blob in self.predict_net.external_output
    ]
    result_type = namedtupledict('Outputs', self.predict_net.external_output)
    return result_type(*fetched)
def run_node(cls, node, inputs, device='CPU', opset_version=_known_opset_version, outputs_info=None):
    """Convert a single ONNX node to Caffe2 operators and execute them.

    ``inputs`` is either a dict of blob name to value or a sequence with one
    value per entry of ``node.input``.  The operators come from the C++
    converter; when the ``ONNX_CAFFE2_DEBUG`` environment variable is set,
    the Python conversion is produced as well and both are printed.

    Returns a namedtupledict keyed by ``node.output``.
    """
    super(Caffe2Backend, cls).run_node(
        node, inputs, device=device,
        outputs_info=outputs_info, opset_version=opset_version)

    device_option = get_device_option(Device(device))
    ws = Workspace()
    with core.DeviceScope(device_option):  # temporary!
        if isinstance(inputs, dict):
            feeds = inputs.items()
        else:
            assert len(node.input) == len(inputs), "{}: expected {} but got {}".format(
                node.op_type, len(node.input), len(inputs))
            feeds = zip(node.input, inputs)
        for blob, value in feeds:
            ws.FeedBlob(blob, value)

        converter = C.Caffe2Backend(cls._dummy_name)
        serialized = converter.convert_node(node.SerializeToString(), opset_version)
        ops = []
        for payload in serialized[0] + serialized[1]:
            parsed = caffe2_pb2.OperatorDef()
            parsed.ParseFromString(payload)
            parsed.device_option.CopyFrom(device_option)
            ops.append(parsed)

        # For testing: also show what the Python converter would emit.
        if "ONNX_CAFFE2_DEBUG" in os.environ:
            py_init_ops, py_ops, _ = cls._onnx_node_to_caffe2_op(
                None, None, node, opset_version or cls._known_opset_version)
            ops2 = py_init_ops + py_ops
            for py_op in ops2:
                py_op.device_option.CopyFrom(device_option)
            print("\nC++:\n{}\nPython:\n{}".format(ops, ops2))

        ws.RunOperatorsOnce(ops)
        fetched = [ws.FetchBlob(blob) for blob in node.output]
        return namedtupledict('Outputs', node.output)(*fetched)
def c2_native_run_op(op_def, inputs):
    """Run one operator in a fresh workspace and fetch its outputs.

    ``inputs`` is either a dict of blob name to value or a sequence with one
    value per entry of ``op_def.input``.  Every blob is fed with
    ``op_def.device_option``.

    Returns a ``(workspace, outputs)`` pair, where ``outputs`` is a
    namedtupledict keyed by ``op_def.output``.
    """
    ws = Workspace()

    if isinstance(inputs, dict):
        feeds = inputs.items()
    else:
        assert len(op_def.input) == len(inputs)
        feeds = zip(op_def.input, inputs)
    for blob, value in feeds:
        ws.FeedBlob(blob, value, op_def.device_option)

    ws.RunOperatorOnce(op_def)

    names = op_def.output
    fetched = [ws.FetchBlob(blob) for blob in names]
    return ws, namedtupledict('Outputs', names)(*fetched)
def run(self, inputs):
    """Run the wrapped core object and name its outputs.

    A dict is forwarded unchanged.  A list/tuple must hold one value per
    uninitialized input and is converted to a name-to-value dict.  Anything
    else is treated as a single input and forwarded wrapped in a list.

    Raises RuntimeError when a list/tuple has the wrong number of values.
    Returns a namedtupledict keyed by the external output names.
    """
    if isinstance(inputs, dict):
        payload = inputs
    elif isinstance(inputs, (list, tuple)):
        if len(inputs) != len(self.__uninitialized_inputs):
            message = ('Expected {} values for uninitialized '
                       'graph inputs ({}), but got {}.')
            raise RuntimeError(message.format(
                len(self.__uninitialized_inputs),
                ', '.join(self.__uninitialized_inputs),
                len(inputs)))
        payload = dict(zip(self.__uninitialized_inputs, inputs))
    else:
        # single input
        payload = [inputs]

    results = self.__core.run(payload)
    return namedtupledict('Outputs', self.__external_outputs)(*results)
def _test_onnx_importer(self, model_name, data_input_index=0):
    """Check that the TRT op built from an ONNX model matches the c2 backend.

    The model is downloaded and loaded, random float32 data shaped like the
    graph input at ``data_input_index`` is generated, and the outputs of
    ``c2.run_model`` are compared (rtol=1e-3) with the outputs of the
    converted TRT operator run on CUDA device 0.
    """
    model_path = os.path.join(_download_onnx_model(model_name), 'model.onnx')
    model_def = onnx.load(model_path)

    data_input = model_def.graph.input[data_input_index]
    dims = [int(d.dim_value) for d in data_input.type.tensor_type.shape.dim]
    op_inputs = [value_info.name for value_info in model_def.graph.input]
    op_outputs = [value_info.name for value_info in model_def.graph.output]
    print("{}".format(op_inputs))

    data = np.random.randn(*dims).astype(np.float32)
    Y_c2 = c2.run_model(model_def, {op_inputs[data_input_index]: data})

    trt_op = convert_onnx_model_to_trt_op(model_def, verbosity=3)
    device_option = core.DeviceOption(caffe2_pb2.CUDA, 0)
    trt_op.device_option.CopyFrom(device_option)

    ws = Workspace()
    with core.DeviceScope(device_option):
        ws.FeedBlob(op_inputs[data_input_index], data)
        ws.RunOperatorsOnce([trt_op])
        fetched = [ws.FetchBlob(blob) for blob in op_outputs]
        Y_trt = namedtupledict('Outputs', op_outputs)(*fetched)

    np.testing.assert_allclose(Y_c2, Y_trt, rtol=1e-3)
def _test_relu_graph(self, X, batch_size, trt_max_batch_size):
    """Compare a one-node Relu graph run through c2 and through a TRT op.

    The reference result comes from ``c2.run_node``.  The same node is then
    wrapped in a model whose input and output are declared as FLOAT tensors
    of shape ``[batch_size, 1, 3, 2]``, converted to a TRT operator with
    ``max_batch_size=trt_max_batch_size`` and run on CUDA device 0.
    """
    relu = make_node("Relu", ["X"], ["Y"])
    Y_c2 = c2.run_node(relu, {"X": X})

    graph_inputs = [make_tensor_value_info(
        "X", onnx.TensorProto.FLOAT, [batch_size, 1, 3, 2])]
    graph_outputs = [make_tensor_value_info(
        "Y", onnx.TensorProto.FLOAT, [batch_size, 1, 3, 2])]
    graph_def = make_graph(
        [relu], name="test", inputs=graph_inputs, outputs=graph_outputs)
    model_def = make_model(graph_def, producer_name='relu-test')
    op_outputs = [value_info.name for value_info in model_def.graph.output]

    trt_op = convert_onnx_model_to_trt_op(
        model_def, max_batch_size=trt_max_batch_size)
    device_option = core.DeviceOption(caffe2_pb2.CUDA, 0)
    trt_op.device_option.CopyFrom(device_option)

    ws = Workspace()
    with core.DeviceScope(device_option):
        ws.FeedBlob("X", X)
        ws.RunOperatorsOnce([trt_op])
        fetched = [ws.FetchBlob(blob) for blob in op_outputs]
        Y_trt = namedtupledict('Outputs', op_outputs)(*fetched)

    np.testing.assert_almost_equal(Y_c2, Y_trt)
def run_node(cls, node, inputs, device='CPU', outputs_info=None, **kwargs):
    """ Run ONNX node.

    :param node: ONNX NodeProto object.
    :param inputs: Inputs, either a dict of input name to value or a
        sequence with one value per node input.
    :param device: Device run on. It is also stored in
        ``common.sys_config.device``.
    :param outputs_info: None.
    :param kwargs: Other args.
    :return: Outputs, as a namedtupledict keyed by the node's output names.
    """
    super(TensorflowBackend, cls).run_node(node, inputs, device)
    common.sys_config.device = device

    node = OnnxNode(node)
    if isinstance(inputs, dict):
        feed_dict_raw = inputs
    else:
        assert len(node.inputs) == len(inputs)
        feed_dict_raw = dict(zip(node.inputs, inputs))

    # TODO: is constant the best way for feeding inputs?
    input_dict = {
        name: tf.constant(value) for name, value in feed_dict_raw.items()
    }

    module = TFModule(node, cls)
    output_vals = module(**input_dict)
    # Tensors are converted to numpy values; other results pass through.
    output_vals = [
        val.numpy() if isinstance(val, tf.Tensor) else val
        for val in output_vals
    ]
    return namedtupledict('Outputs', node.outputs)(*output_vals)
def native_run_graph(graph_def, inputs, initializer, init_func=None):
    """Run ``graph_def`` in an anonymous workspace and collect the results.

    The graph's ``optimization_level`` argument is forced to 0 first.  Each
    name in ``initializer`` is registered as a variable, ``init_func`` (if
    given) is called to feed their values, and ``inputs`` (name to blob) is
    fed before the graph is created and run.

    Returns ``(workspace, outputs, initializer)``: ``outputs`` is a
    namedtupledict keyed by ``graph_def.output``, and ``initializer`` holds
    the fetched initializer tensors converted by ``numpy_helper.from_array``.
    """
    # De-Optimization
    for arg in graph_def.arg:
        if arg.name == 'optimization_level':
            arg.i = 0

    # Create an anonymous workspace
    ws = _workspace.Workspace()

    with ws.as_default():
        # Register all the initializer before feeding them
        for var_name in initializer:
            _Tensor(name=var_name).Variable()

        # Feed the given values if necessary
        if init_func:
            init_func()

        # Feed the external inputs
        for blob_name, blob in inputs.items():
            _workspace.FeedTensor(blob_name, blob)

        # Create and Run the graph
        graph_name = _workspace.CreateGraph(graph_def)
        _workspace.RunGraph(graph_name, return_outputs=False)

        # Fetch the outputs
        output_names = graph_def.output
        fetched_outputs = [
            _workspace.FetchTensor(out_name) for out_name in output_names
        ]

        # Fetch the initializer
        fetched_initializer = [
            numpy_helper.from_array(
                _workspace.FetchTensor(var_name), name=var_name)
            for var_name in initializer
        ]

    # Return the outputs
    outputs = namedtupledict('Outputs', output_names)(*fetched_outputs)
    return ws, outputs, fetched_initializer
def run_node(cls, node, inputs, device='CPU', opset_version=_known_opset_version):
    """Convert a single ONNX node with the Python converter and execute it.

    ``inputs`` is either a dict of blob name to value or a sequence with one
    value per entry of ``node.input``.  Feeding, execution and fetching all
    go through the module-level ``workspace`` API inside a ``Workspace()``
    context and the device scope for ``device``.

    Returns a namedtupledict keyed by ``node.output``.
    """
    super(Caffe2Backend, cls).run_node(node, inputs, device)

    device_option = get_device_option(Device(device))
    with Workspace(), core.DeviceScope(device_option):  # temporary!
        if isinstance(inputs, dict):
            feeds = inputs.items()
        else:
            assert len(node.input) == len(inputs), "{}: expected {} but got {}".format(
                node.op_type, len(node.input), len(inputs))
            feeds = zip(node.input, inputs)
        for blob, value in feeds:
            workspace.FeedBlob(blob, value)

        cls._inplace_rewrite([node])
        init_ops, node_ops, _ = cls._onnx_node_to_caffe2_op(
            None, None, node, opset_version or cls._known_opset_version)
        all_ops = init_ops + node_ops
        for op in all_ops:
            op.device_option.CopyFrom(device_option)

        workspace.RunOperatorsOnce(all_ops)
        fetched = [workspace.FetchBlob(blob) for blob in node.output]
        return namedtupledict('Outputs', node.output)(*fetched)
def _test_onnx_importer(self, model_name, data_input_index, opset_version=onnx.defs.onnx_opset_version()):
    """Check that the TRT op built from an ONNX model matches the c2 backend.

    The model for ``opset_version`` is downloaded and loaded, random float32
    data shaped like the graph input at ``data_input_index`` is generated,
    and the outputs of ``c2.run_model`` are compared (rtol=1e-3) with the
    outputs of the converted TRT operator run on CUDA device 0.
    """
    model_dir = _download_onnx_model(model_name, opset_version)
    model_def = onnx.load(os.path.join(model_dir, 'model.onnx'))

    data_input = model_def.graph.input[data_input_index]
    dims = [int(d.dim_value) for d in data_input.type.tensor_type.shape.dim]
    op_inputs = [value_info.name for value_info in model_def.graph.input]
    op_outputs = [value_info.name for value_info in model_def.graph.output]
    print("{}".format(op_inputs))

    data = np.random.randn(*dims).astype(np.float32)
    Y_c2 = c2.run_model(model_def, {op_inputs[data_input_index]: data})

    trt_op = convert_onnx_model_to_trt_op(model_def, verbosity=3)
    device_option = core.DeviceOption(caffe2_pb2.CUDA, 0)
    trt_op.device_option.CopyFrom(device_option)

    ws = Workspace()
    with core.DeviceScope(device_option):
        ws.FeedBlob(op_inputs[data_input_index], data)
        if opset_version >= 5:
            # Some newer models from ONNX Zoo come with pre-set "data_0" input
            ws.FeedBlob("data_0", data)
        ws.RunOperatorsOnce([trt_op])
        fetched = [ws.FetchBlob(blob) for blob in op_outputs]
        Y_trt = namedtupledict('Outputs', op_outputs)(*fetched)

    np.testing.assert_allclose(Y_c2, Y_trt, rtol=1e-3)
def c2_native_run_net(init_net, predict_net, inputs, debug_arg=None):
    """Run ``predict_net`` in a fresh workspace and fetch its outputs.

    ``init_net`` is run once first when it is truthy.  ``inputs`` is either
    a dict mapping blob names to values or a sequence of values.  A sequence
    is matched against the external inputs that are not yet present in the
    workspace when the counts agree; otherwise it is matched positionally
    against the leading entries of ``predict_net.external_input``.  When a
    sequence is longer than the external input list, diagnostic details
    (including ``debug_arg``) are printed before the assertion fails.

    Returns a ``(workspace, outputs)`` pair, where ``outputs`` is a
    namedtupledict keyed by ``predict_net.external_output``.
    """
    ws = Workspace()
    if init_net:
        ws.RunNetOnce(init_net)

    def feed(blob, value):
        ws.FeedBlob(blob, value, predict_net.device_option)

    if isinstance(inputs, dict):
        for blob, value in inputs.items():
            feed(blob, value)
    else:
        pending = [
            blob for blob in predict_net.external_input
            if not ws.HasBlob(blob)
        ]
        if len(pending) == len(inputs):
            for blob, value in zip(pending, inputs):
                feed(blob, value)
        else:
            # Counts differ (e.g. everything is already initialized), so
            # the first len(inputs) external inputs are overwritten instead.
            # Extra logging to help debug sporadic sandcastle failures.
            if len(inputs) > len(predict_net.external_input):
                print("c2_native_run_net assert. len(inputs)=", len(inputs),
                      "len(predict_net.external_input)=",
                      len(predict_net.external_input))
                print("debug_arg: ", debug_arg)
                print("predict_net ", type(predict_net), ":", predict_net)
                print("inputs ", type(inputs), ":", inputs)
            assert len(inputs) <= len(predict_net.external_input)
            for pos, value in enumerate(inputs):
                feed(predict_net.external_input[pos], value)

    ws.RunNetOnce(predict_net)

    names = predict_net.external_output
    fetched = [ws.FetchBlob(blob) for blob in names]
    return ws, namedtupledict('Outputs', names)(*fetched)
def run_node(cls, node, inputs, device='CPU', outputs_info=None, **kwargs):
    """ Run ONNX node.

    The node is converted and evaluated inside its own ``tf.Graph`` with a
    ``tf.compat.v1.Session``.

    :param node: ONNX NodeProto object.
    :param inputs: Inputs, either a dict of input name to value or a
        sequence with one value per node input.
    :param device: Device run on.
    :param outputs_info: None.
    :param kwargs: Other args.
    :return: Outputs, as a namedtupledict keyed by the node's output names.
    """
    super(TensorflowBackend, cls).run_node(node, inputs, device)
    node_graph = tf.Graph()
    with node_graph.as_default():
        node = OnnxNode(node)
        device_option = get_device_option(Device(device))

        if isinstance(inputs, dict):
            feed_dict_raw = inputs
        else:
            assert len(node.inputs) == len(inputs)
            feed_dict_raw = dict(zip(node.inputs, inputs))

        # TODO: is constant the best way for feeding inputs?
        input_dict = {
            name: tf.constant(value)
            for name, value in feed_dict_raw.items()
        }
        ops = cls._onnx_node_to_tensorflow_op(node, input_dict)

        with tf.compat.v1.Session() as sess:
            with tf.device(device_option):
                sess.run(tf.compat.v1.global_variables_initializer())
                output_vals = sess.run(ops)

    return namedtupledict('Outputs', node.outputs)(*output_vals)
def run_node(cls, node, inputs, uninit=None):
    """
    Run the keras model converted from the onnx node with the given inputs.
    Used for unit tests.

    :param node: onnx node
    :param inputs: input values, in the order of the node's input names
    :param uninit: indices of the inputs that are replaced by a keras
        ``Input`` placeholder and fed to the model at predict time; all
        other inputs are handed to the converter as raw values.
        Defaults to ``[0]``.
    :return: namedtupledict keyed by the node's output names, holding the
        model prediction as its single value
    """
    # Avoid a shared mutable default argument.
    if uninit is None:
        uninit = [0]

    super(KerasBackend, cls).run_node(node, inputs)
    node = OnnxNode(node)

    # Collected by the converter for inputs it has to add on its own.
    cls.extra_input = []
    cls.extra_input_array = []

    input_dict = {
        node.inputs[i]: value for i, value in enumerate(inputs)
    }

    input_tensor = []
    input_array = []
    for i in uninit:
        value = inputs[i]
        input_array.append(value)
        placeholder = Input(batch_shape=list(value.shape),
                            name=node.inputs[i],
                            dtype=str(value.dtype))
        input_tensor.append(placeholder)
        input_dict[node.inputs[i]] = placeholder

    out = cls._onnx_node_to_keras_op(node, input_dict)[0]
    model = Model(inputs=input_tensor + cls.extra_input, outputs=out)

    input_array += cls.extra_input_array
    # A single array is passed bare rather than as a one-element list.
    if len(input_array) == 1:
        input_array = input_array[0]
    res = model.predict(input_array)
    return namedtupledict('Outputs', node.outputs)(res)
def run(self, inputs, **kwargs):
    """ Run TensorflowRep.

    :param inputs: Given inputs: a dict of input name to value, a
        list/tuple with one value per entry of ``self.inputs``, or a single
        value for ``self.inputs[0]``.
    :param kwargs: Other args.
    :return: Outputs, as a namedtupledict keyed by ``self.outputs``.
    :raises RuntimeError: if a list/tuple has the wrong number of values.
    """
    super(TensorflowRep, self).run(inputs, **kwargs)

    # TODO: handle name scope if necessary
    with self.graph.as_default(), tf.Session() as sess:
        if isinstance(inputs, dict):
            named_values = inputs
        elif isinstance(inputs, (list, tuple)):
            if len(self.inputs) != len(inputs):
                message = ('Expected {} values for uninitialized '
                           'graph inputs ({}), but got {}.')
                raise RuntimeError(message.format(
                    len(self.inputs), ', '.join(self.inputs), len(inputs)))
            named_values = dict(zip(self.inputs, inputs))
        else:
            # single input
            named_values = {self.inputs[0]: inputs}

        # Re-key the values by the graph tensors they are fed to.
        feed_dict = {}
        for key in self.inputs:
            feed_dict[self.tensor_dict[key]] = named_values[key]

        sess.run(tf.global_variables_initializer())
        fetches = [self.tensor_dict[name] for name in self.outputs]
        results = sess.run(fetches, feed_dict=feed_dict)
        return namedtupledict('Outputs', self.outputs)(*results)
def run_node(cls, node, inputs, device='CPU'):
    """Convert a single ONNX node to TensorFlow ops and evaluate them.

    :param node: ONNX NodeProto object.
    :param inputs: Inputs, either a dict of input name to value or a
        sequence with one value per node input.
    :param device: Device run on.
    :return: Outputs, as a namedtupledict keyed by the node's output names.
    """
    super(TensorflowBackend, cls).run_node(node, inputs, device)
    node = OnnxNode(node)
    device_option = get_device_option(Device(device))

    if isinstance(inputs, dict):
        feed_dict_raw = inputs
    else:
        assert len(node.inputs) == len(inputs)
        feed_dict_raw = dict(zip(node.inputs, inputs))

    # TODO: is constant the best way for feeding inputs?
    input_dict = {
        name: tf.constant(value) for name, value in feed_dict_raw.items()
    }
    ops = cls._onnx_node_to_tensorflow_op(node, input_dict)

    with tf.Session() as sess:
        with tf.device(device_option):
            sess.run(tf.global_variables_initializer())
            output_vals = sess.run(ops)
    return namedtupledict('Outputs', node.outputs)(*output_vals)
def run_node(cls, node, inputs, device='CPU', opset_version=_known_opset_version, outputs_info=None):
    """Convert a single ONNX node with the C++ converter and execute it.

    ``inputs`` is either a dict of blob name to value or a sequence with one
    value per entry of ``node.input``.  For every fed value a serialized
    ONNX tensor value info (name, element type, shape) is passed to the
    converter together with the serialized node.

    Returns a namedtupledict keyed by ``node.output``.
    """
    super(Caffe2Backend, cls).run_node(
        node, inputs, device=device,
        outputs_info=outputs_info, opset_version=opset_version)

    device_option = get_device_option(Device(device))
    ws = Workspace()
    with core.DeviceScope(device_option):  # temporary!
        if isinstance(inputs, dict):
            feeds = inputs.items()
        else:
            assert len(node.input) == len(inputs), "{}: expected {} but got {}".format(
                node.op_type, len(node.input), len(inputs))
            feeds = zip(node.input, inputs)

        value_infos = []
        for blob, value in feeds:
            ws.FeedBlob(blob, value)
            info = onnx.helper.make_tensor_value_info(
                name=blob,
                elem_type=onnx.mapping.NP_TYPE_TO_TENSOR_TYPE[value.dtype],
                shape=value.shape)
            value_infos.append(info.SerializeToString())

        converter = C.Caffe2Backend(cls._dummy_name)
        serialized = converter.convert_node(
            node.SerializeToString(), value_infos, opset_version)
        ops = []
        for payload in serialized[0] + serialized[1]:
            parsed = caffe2_pb2.OperatorDef()
            parsed.ParseFromString(payload)
            parsed.device_option.CopyFrom(device_option)
            ops.append(parsed)

        ws.RunOperatorsOnce(ops)
        fetched = [ws.FetchBlob(blob) for blob in node.output]
        return namedtupledict('Outputs', node.output)(*fetched)
def run(self, inputs, **kwargs):
    """Feed ``inputs`` into the workspace, run the nets, fetch the outputs.

    ``inputs`` may be a dict of blob name to value, a list/tuple holding one
    value per entry of ``self.uninitialized``, or a single value that is fed
    to ``self.uninitialized[0]``.  The nets are created, and the init net is
    run, only while ``self.nets_created`` / ``self.ran_init_net`` are false.
    Each output is fetched with ``FetchBlob``; if that raises, the blob is
    fetched with ``FetchInt8Blob`` instead.

    Raises RuntimeError when a list/tuple has the wrong number of values.
    Returns a namedtupledict keyed by ``self.predict_net.external_output``.
    """
    super(Caffe2Rep, self).run(inputs, **kwargs)

    with core.DeviceScope(self.predict_net.device_option):
        if isinstance(inputs, dict):
            with core.NameScope(self._name_scope):
                for blob, value in inputs.items():
                    self.workspace.FeedBlob(blob, value)
        elif isinstance(inputs, (list, tuple)):
            if len(self.uninitialized) != len(inputs):
                message = ('Expected {} values for uninitialized '
                           'graph inputs ({}), but got {}.')
                raise RuntimeError(message.format(
                    len(self.uninitialized),
                    ', '.join(self.uninitialized),
                    len(inputs)))
            # The name scope is already baked into the protobuf names.
            for blob, value in zip(self.uninitialized, inputs):
                self.workspace.FeedBlob(blob, value)
        else:
            # A bare value is treated as the single uninitialized input.
            self.workspace.FeedBlob(self.uninitialized[0], inputs)

        if not self.nets_created:
            self.workspace.CreateNet(self.init_net)
            self.workspace.CreateNet(self.predict_net)
            self.nets_created = True
        if not self.ran_init_net:
            self.workspace.RunNet(self.init_net.name)
            self.ran_init_net = True
        self.workspace.RunNet(self.predict_net.name)

    fetched = []
    for blob in self.predict_net.external_output:
        try:
            value = self.workspace.FetchBlob(blob)
        except Exception:
            # Deliberate best-effort fallback to the Int8 fetcher.
            value = self.workspace.FetchInt8Blob(blob)
        fetched.append(value)

    result_type = namedtupledict('Outputs', self.predict_net.external_output)
    return result_type(*fetched)
def test_resnet50_core(self):
    """Compare resnet50 outputs before and after the onnxifi transform.

    The net is run on the CPU device option in the "onnxifi_test" workspace
    for the reference result, the workspace is reset and re-filled with the
    weights, the net is transformed with ``onnxifi_caffe2_net`` and run
    again.  Both runs are timed and their outputs compared (rtol=1e-3).
    """
    N = 1
    repeat = 1
    print("Batch size: {}, repeat inference {} times".format(N, repeat))

    init_net, pred_net, _ = self._get_c2_model('resnet50')
    self._add_head_tail(pred_net, 'real_data', 'real_softmax')
    input_blob_dims = (N, 3, 224, 224)
    input_name = "real_data"

    device_option = core.DeviceOption(caffe2_pb2.CPU, 0)
    init_net.device_option.CopyFrom(device_option)
    pred_net.device_option.CopyFrom(device_option)
    for op in pred_net.op:
        op.device_option.CopyFrom(device_option)
    net_outputs = pred_net.external_output
    data = np.random.randn(*input_blob_dims).astype(np.float32)

    # Reference run with the untransformed net.
    workspace.SwitchWorkspace("onnxifi_test", True)
    with core.DeviceScope(device_option):
        workspace.FeedBlob(input_name, data)
        workspace.RunNetOnce(init_net)
        workspace.CreateNet(pred_net)
        tic = time.time()
        for _ in range(repeat):
            workspace.RunNet(pred_net.name)
        toc = time.time()
        c2_time = toc - tic
        fetched = [workspace.FetchBlob(blob) for blob in net_outputs]
        Y_c2 = namedtupledict('Outputs', net_outputs)(*fetched)
    workspace.ResetWorkspace()

    # Fill the workspace with the weights
    with core.DeviceScope(device_option):
        workspace.RunNetOnce(init_net)

    # Cut the graph; conversion time runs until the cut net is created.
    tic = time.time()
    pred_net_cut = onnxifi_caffe2_net(pred_net,
                                      {input_name: input_blob_dims},
                                      infer_shapes=True)
    del init_net, pred_net

    input_name = pred_net_cut.external_input[0]
    print("C2 runtime: {}s".format(c2_time))
    with core.DeviceScope(device_option):
        workspace.FeedBlob(input_name, data)
        workspace.CreateNet(pred_net_cut)
        toc = time.time()
        print("Conversion time: {:.2f}s".format(toc - tic))

        tic = time.time()
        for _ in range(repeat):
            workspace.RunNet(pred_net_cut.name)
        toc = time.time()
        trt_time = toc - tic
        print("Onnxifi runtime: {}s, improvement: {}%".format(
            trt_time, (c2_time - trt_time) / c2_time * 100))
        fetched = [workspace.FetchBlob(blob) for blob in net_outputs]
        Y_trt = namedtupledict('Outputs', net_outputs)(*fetched)

    np.testing.assert_allclose(Y_c2, Y_trt, rtol=1e-3)
def test_resnet50_core(self):
    """Compare resnet50 outputs before and after the TRT transform.

    The net is run on CUDA device 0 with the CUDNN engine in the "gpu_test"
    workspace for the reference result, the workspace is reset and
    re-filled with the weights, the net is transformed with
    ``transform_caffe2_net`` and run again.  Both runs are warmed up, timed
    and their outputs compared (rtol=1e-3).
    """
    N = 2
    warmup = 20
    repeat = 100
    print("Batch size: {}, repeat inference {} times, warmup {} times".format(N, repeat, warmup))

    init_net, pred_net, _ = self._get_c2_model('resnet50')
    self._add_head_tail(pred_net, 'real_data', 'real_softmax')
    input_blob_dims = (N, 3, 224, 224)
    input_name = "real_data"

    device_option = core.DeviceOption(caffe2_pb2.CUDA, 0)
    init_net.device_option.CopyFrom(device_option)
    pred_net.device_option.CopyFrom(device_option)
    for op in pred_net.op:
        op.device_option.CopyFrom(device_option)
        op.engine = 'CUDNN'
    net_outputs = pred_net.external_output
    data = np.random.randn(*input_blob_dims).astype(np.float32)

    # Reference run with the untransformed net.
    workspace.SwitchWorkspace("gpu_test", True)
    with core.DeviceScope(device_option):
        workspace.FeedBlob(input_name, data)
        workspace.RunNetOnce(init_net)
        workspace.CreateNet(pred_net)
        for _ in range(warmup):
            workspace.RunNet(pred_net.name)
        tic = time.time()
        for _ in range(repeat):
            workspace.RunNet(pred_net.name)
        toc = time.time()
        c2_time = toc - tic
        fetched = [workspace.FetchBlob(blob) for blob in net_outputs]
        Y_c2 = namedtupledict('Outputs', net_outputs)(*fetched)
    workspace.ResetWorkspace()

    # Fill the workspace with the weights
    with core.DeviceScope(device_option):
        workspace.RunNetOnce(init_net)

    # Cut the graph; conversion time runs until the cut net is created.
    tic = time.time()
    pred_net_cut = transform_caffe2_net(pred_net,
                                        {input_name: input_blob_dims},
                                        build_serializable_op=True)
    del init_net, pred_net

    input_name = pred_net_cut.external_input[0]
    print("C2 runtime: {}s".format(c2_time))
    with core.DeviceScope(device_option):
        workspace.FeedBlob(input_name, data)
        workspace.CreateNet(pred_net_cut)
        toc = time.time()
        print("Conversion time: {:.2f}s".format(toc - tic))

        for _ in range(warmup):
            workspace.RunNet(pred_net_cut.name)
        tic = time.time()
        for _ in range(repeat):
            workspace.RunNet(pred_net_cut.name)
        toc = time.time()
        trt_time = toc - tic
        print("TRT runtime: {}s, improvement: {}%".format(
            trt_time, (c2_time - trt_time) / c2_time * 100))
        fetched = [workspace.FetchBlob(blob) for blob in net_outputs]
        Y_trt = namedtupledict('Outputs', net_outputs)(*fetched)

    np.testing.assert_allclose(Y_c2, Y_trt, rtol=1e-3)
def instantiate(cls, node, **kwargs):
    """Fill in pooling attribute defaults and infer the output tensor.

    The first input tensor must have rank 3, 4 or 5; every axis after the
    first two is treated as a spatial axis.  Missing attributes are
    defaulted in ``node.attrs`` (mutated in place): ``ceil_mode`` to 0,
    ``strides`` and ``dilations`` to all ones, ``storage_order`` to 0, and
    ``pads`` according to ``auto_pad``.  Any ``auto_pad`` value other than
    'SAME_UPPER', 'SAME_LOWER', 'VALID', 'NOTSET' or None leaves ``pads``
    unset.

    Output size per spatial axis:
      SAME_*: ceil(input / stride)
      VALID:  ceil((input - window + 1) / stride)
      else:   floor_or_ceil((input + pad_begin + pad_end - window) / stride + 1)
    where window = (kernel - 1) * dilation + 1, and ceil is used in the last
    case when ``ceil_mode`` is not 0.

    :param node: node providing ``input_tensor``, ``attrs``, ``outputs``
        and ``valid_var_name``.
    :param kwargs: ``device`` is read; when it is a QumicoDevice whose
        options contain ``QumicoDeviceType.OpenMP``, ``cls.OpenMP`` is set.
    :return: ``cls`` instance built with the input tensors, a ones-filled
        output tensor of the inferred shape and input dtype, and the attrs.
    :raises ValueError: if the input rank is not 3, 4 or 5.
    """
    input_data1 = node.input_tensor[0]
    attrs = node.attrs

    # ceil_mode is defined from MaxPool-10 on; its default is 0.
    if attrs.get('ceil_mode') is None:
        attrs['ceil_mode'] = 0

    if input_data1.ndim not in (3, 4, 5):
        raise ValueError(
            'unsupported input rank {}; expected 3, 4 or 5'.format(
                input_data1.ndim))
    n_spatial = input_data1.ndim - 2

    if attrs.get('strides') is None:
        attrs['strides'] = (1, ) * n_spatial
    # dilations is defined from MaxPool-10 on.
    if attrs.get('dilations') is None:
        attrs['dilations'] = (1, ) * n_spatial

    def window(axis):
        # Extent covered by the dilated kernel along one spatial axis.
        return (attrs['kernel_shape'][axis] - 1) * attrs['dilations'][axis] + 1

    auto_pad = attrs.get('auto_pad')
    same_padding = auto_pad in ('SAME_UPPER', 'SAME_LOWER')

    if attrs.get('pads') is None:
        if same_padding:
            # total = (output - 1) * stride + window - input, per axis
            totals = []
            for i in range(n_spatial):
                size = input_data1.shape[2 + i]
                stride = attrs['strides'][i]
                totals.append(
                    (math.ceil(size / stride) - 1) * stride
                    + window(i) - size)
            smaller = tuple(math.floor(total / 2) for total in totals)
            larger = tuple(math.ceil(total / 2) for total in totals)
            # SAME_UPPER puts the extra padding at the end, SAME_LOWER at
            # the beginning; layout is (begin..., end...).
            if auto_pad == 'SAME_UPPER':
                attrs['pads'] = smaller + larger
            else:
                attrs['pads'] = larger + smaller
        elif auto_pad in ('VALID', 'NOTSET') or auto_pad is None:
            attrs['pads'] = (0, ) * (2 * n_spatial)

    if attrs.get('storage_order') is None:
        attrs['storage_order'] = 0

    # The two leading axes are copied through unchanged.
    tmp_shape = list(input_data1.shape[:2])
    for i in range(n_spatial):
        axis = i - n_spatial  # same axis, indexed from the end
        size = input_data1.shape[axis]
        stride = attrs['strides'][axis]
        if same_padding:
            out_size = math.ceil(size / stride)
        elif auto_pad == 'VALID':
            out_size = math.ceil((size - window(axis) + 1) / stride)
        else:
            # Explicit padding (auto_pad is NOTSET or absent).
            padded = (size + attrs['pads'][i] + attrs['pads'][axis]
                      - window(axis))
            rounding = math.floor if attrs['ceil_mode'] == 0 else math.ceil
            out_size = rounding(padded / stride + 1)
        tmp_shape.append(out_size)

    outputs_shape = tuple(tmp_shape)
    outputs_dtype = input_data1.dtype
    outputs_dict = {
        node.valid_var_name(node.outputs[0]):
        np.ones(shape=outputs_shape, dtype=outputs_dtype)
    }
    output_tensor = namedtupledict('output_tensor',
                                   outputs_dict.keys())(**outputs_dict)

    device = kwargs.get('device')
    if (isinstance(device, QumicoDevice)
            and QumicoDeviceType.OpenMP in device.options):
        cls.OpenMP = True

    return cls(node,
               input_tensor=node.input_tensor,
               output_tensor=output_tensor,
               attrs=attrs)
def test_resnet50_core(self):
    """Time ResNet-50 on CUDA before and after ``transform_caffe2_net``.

    The network is first executed unchanged (``warmup`` untimed runs, then
    ``repeat`` timed runs).  It is then passed through
    ``transform_caffe2_net`` and benchmarked again the same way (the
    printed labels call this second run "TRT").  Both wall-clock timings
    are printed, and the outputs of the two runs must agree within a
    relative tolerance of 1e-3.
    """
    batch = 2
    n_warmup = 20
    n_timed = 100
    print("Batch size: {}, repeat inference {} times, warmup {} times".
          format(batch, n_timed, n_warmup))

    init_net, pred_net, _ = self._get_c2_model('resnet50')
    self._add_head_tail(pred_net, 'real_data', 'real_softmax')
    blob_dims = (batch, 3, 224, 224)
    feed_name = "real_data"

    # Pin both nets, and every operator of the predict net, to CUDA device 0.
    gpu0 = core.DeviceOption(caffe2_pb2.CUDA, 0)
    init_net.device_option.CopyFrom(gpu0)
    pred_net.device_option.CopyFrom(gpu0)
    for op in pred_net.op:
        op.device_option.CopyFrom(gpu0)
        op.engine = 'CUDNN'

    out_names = pred_net.external_output
    data = np.random.randn(*blob_dims).astype(np.float32)

    # --- Reference run on the unmodified nets ---------------------------
    bench_ws = Workspace()
    with core.DeviceScope(gpu0):
        bench_ws.FeedBlob(feed_name, data)
        bench_ws.RunNetOnce(init_net)
        bench_ws.CreateNet(pred_net)
        for _ in range(n_warmup):
            bench_ws.RunNet(pred_net.name)
        tic = time.time()
        for _ in range(n_timed):
            bench_ws.RunNet(pred_net.name)
        c2_time = time.time() - tic
        fetched = [bench_ws.FetchBlob(blob) for blob in out_names]
        Y_c2 = namedtupledict('Outputs', out_names)(*fetched)
    bench_ws.ResetWorkspace()

    # --- Cut the graph and repeat the measurement -----------------------
    init_net_cut, pred_net_cut = transform_caffe2_net(
        init_net, pred_net, {feed_name: blob_dims})
    # The original nets are no longer needed once the cut nets exist.
    del init_net, pred_net

    feed_name = pred_net_cut.external_input[0]
    print("C2 runtime: {}s".format(c2_time))
    # Rebinding the same name releases the first workspace at this point.
    bench_ws = Workspace()
    with core.DeviceScope(gpu0):
        bench_ws.FeedBlob(feed_name, data)
        bench_ws.RunNetOnce(init_net_cut)
        bench_ws.CreateNet(pred_net_cut)
        for _ in range(n_warmup):
            bench_ws.RunNet(pred_net_cut.name)
        tic = time.time()
        for _ in range(n_timed):
            bench_ws.RunNet(pred_net_cut.name)
        trt_time = time.time() - tic
        print("TRT runtime: {}s, improvement: {}%".format(
            trt_time, (c2_time - trt_time) / c2_time * 100))
        fetched = [bench_ws.FetchBlob(blob) for blob in out_names]
        Y_trt = namedtupledict('Outputs', out_names)(*fetched)

    np.testing.assert_allclose(Y_c2, Y_trt, rtol=1e-3)
def instantiate(cls, node, **kwargs):
    """Create the operator instance for ``node`` with an inferred output.

    The first input tensor must be 3-D, 4-D or 5-D; axes 2 and onward are
    treated as spatial axes.  ``node.attrs`` is updated IN PLACE:

    * ``kernel_shape`` is always overwritten with the full spatial extent
      of the input, i.e. the window spans the whole input.
    * ``strides`` defaults to all ones; ``storage_order`` and
      ``count_include_pad`` default to 0.
    * ``pads`` (all begin values followed by all end values), when absent,
      is derived from ``auto_pad``: SAME_UPPER puts the extra padding at
      the end, SAME_LOWER at the beginning, and VALID / NOTSET / missing
      yields all zeros.

    Args:
        node: operator node exposing ``input_tensor``, ``attrs``,
            ``outputs`` and ``valid_var_name``.
        **kwargs: accepted for interface compatibility; unused here.

    Returns:
        ``cls(node, input_tensor=..., output_tensor=..., attrs=...)`` where
        ``output_tensor`` holds one ``np.ones`` placeholder of the inferred
        shape and the input's dtype.

    Raises:
        ValueError: if the input is not 3-D, 4-D or 5-D, or if ``pads`` is
            missing and ``auto_pad`` is not a recognised value.
    """
    input_data1 = node.input_tensor[0]
    attrs = node.attrs

    ndim = input_data1.ndim
    if ndim not in (3, 4, 5):
        raise ValueError(
            'expected a 3-D, 4-D or 5-D input tensor, got ndim={}'.format(
                ndim))

    n_spatial = ndim - 2
    in_spatial = tuple(input_data1.shape[2:])

    attrs['kernel_shape'] = in_spatial
    # `is None` (not `== None`) so array-valued attributes are handled.
    if attrs.get('strides') is None:
        attrs['strides'] = (1, ) * n_spatial
    kernel = attrs['kernel_shape']
    strides = attrs['strides']

    # pad_shape[i] = (output_spatial_shape[i] - 1) * strides_spatial_shape[i]
    #                + kernel_spatial_shape[i] - input_spatial_shape[i]
    auto_pad = attrs.get('auto_pad')
    is_same = auto_pad in ('SAME_UPPER', 'SAME_LOWER')
    if attrs.get('pads') is None:
        if is_same:
            half_pads = [
                ((math.ceil(in_spatial[i] / strides[i]) - 1) * strides[i] +
                 kernel[i] - in_spatial[i]) / 2 for i in range(n_spatial)
            ]
            smaller = tuple(math.floor(h) for h in half_pads)
            larger = tuple(math.ceil(h) for h in half_pads)
            # An odd total pad puts the extra element at the end for
            # SAME_UPPER and at the beginning for SAME_LOWER.
            if auto_pad == 'SAME_UPPER':
                attrs['pads'] = smaller + larger
            else:
                attrs['pads'] = larger + smaller
        elif auto_pad in ('VALID', 'NOTSET') or auto_pad is None:
            attrs['pads'] = (0, ) * (2 * n_spatial)
        else:
            # Previously this fell through and failed later with an
            # unrelated KeyError/TypeError on attrs['pads'].
            raise ValueError(
                'cannot derive pads: unsupported auto_pad {!r}'.format(
                    auto_pad))

    if attrs.get('storage_order') is None:
        attrs['storage_order'] = 0
    if attrs.get('count_include_pad') is None:
        attrs['count_include_pad'] = 0

    # SAME:   out[i] = ceil(in[i] / stride[i])
    # VALID:  out[i] = ceil((in[i] - kernel[i] + 1) / stride[i])
    # NOTSET: out[i] = floor((in[i] + pad_shape[i] - kernel[i]) / stride[i] + 1)
    tmp_shape = [input_data1.shape[d] for d in range(ndim - n_spatial)]
    for i in range(n_spatial):
        # Negative index counted from the end: selects the same spatial
        # axis and, for pads, the matching "end" entry.
        k = i - n_spatial
        size = input_data1.shape[k]
        if is_same:
            out_dim = math.ceil(size / strides[k])
        elif auto_pad == 'VALID':
            out_dim = math.ceil((size - kernel[k] + 1) / strides[k])
        else:
            pads = attrs['pads']
            out_dim = math.floor(
                (size + pads[i] + pads[k] - kernel[k]) / strides[k] + 1)
        tmp_shape.append(out_dim)

    outputs_shape = tuple(tmp_shape)
    outputs_dtype = input_data1.dtype
    outputs_dict = {
        node.valid_var_name(node.outputs[0]):
        np.ones(shape=outputs_shape, dtype=outputs_dtype)
    }
    output_tensor = namedtupledict("output_tensor",
                                   outputs_dict.keys())(**outputs_dict)
    return cls(node,
               input_tensor=node.input_tensor,
               output_tensor=output_tensor,
               attrs=attrs)
def run(self,
        inputs,  # type: Any
        **kwargs  # type: Any
        ):
    # type: (...) -> Tuple[Any, ...]
    """Run the CoreML model on ``inputs`` and return ONNX-shaped outputs.

    Unless ``self.disable_rank5_mapping`` is set, inputs are adapted
    first: rank-4 and rank-2 arrays get a new leading axis, and rank-3
    arrays whose shape differs from the model spec's multi-array shape are
    reshaped when their element counts line up with it.  After
    prediction, each output is reshaped to entry ``[2]`` of
    ``self.onnx_outputs_info[name]`` (again only when rank-5 mapping is
    enabled) and cast to the NumPy dtype mapped from entry ``[1]``.

    Args:
        inputs: sequence of array-like inputs, in the order of
            ``self.input_names``.  The caller's sequence is not modified.
        **kwargs: forwarded to the parent class's ``run``.

    Returns:
        A ``namedtupledict`` named ``Outputs`` keyed by
        ``self.output_names``.
    """
    super(CoreMLRep, self).run(inputs, **kwargs)

    # Shallow copy: the loop below replaces elements by index, which must
    # not leak into the caller's list (and would fail on a tuple).
    inputs_ = list(inputs)
    if not self.disable_rank5_mapping:
        spec = None  # fetched lazily, at most once
        for i, input_ in enumerate(inputs_):
            shape = input_.shape
            if len(shape) == 4 or len(shape) == 2:
                inputs_[i] = input_[np.newaxis, :]
            elif len(shape) == 3:
                if spec is None:
                    spec = self.model.get_spec()
                spec_shape = [
                    int(k) for k in
                    spec.description.input[i].type.multiArrayType.shape
                ]
                # NOTE(review): indexes the first three entries, so the
                # spec shape is assumed to have at least 3 dims - confirm.
                prod = spec_shape[0] * spec_shape[1] * spec_shape[2]
                onnx_shape = list(shape)
                if onnx_shape != spec_shape:
                    if onnx_shape[2] == prod:
                        inputs_[i] = np.reshape(
                            input_,
                            [onnx_shape[0], onnx_shape[1]] + spec_shape)
                    elif onnx_shape[1] * onnx_shape[2] == prod:
                        inputs_[i] = np.reshape(
                            input_, [1, onnx_shape[0]] + spec_shape)

    input_dict = dict(zip(self.input_names, map(np.array, inputs_)))
    _set_dtypes(input_dict, self.model)  # type: ignore
    prediction = self.model.predict(input_dict, self.useCPUOnly)
    output_values = [prediction[name] for name in self.output_names]

    if not self.disable_rank5_mapping:
        for i, output_ in enumerate(output_values):
            # Reshape the CoreML output to match ONNX's output shape.
            try:
                output_values[i] = np.reshape(
                    output_, self.onnx_outputs_info[
                        self.output_names[i]][2])  # type: ignore
            except (RuntimeError, ValueError):
                # np.reshape signals an incompatible shape with
                # ValueError; report it and keep the CoreML shape.
                print(
                    "Output '%s' shape incompatible between CoreML (%s) and onnx (%s)"
                    % (self.output_names[i], output_.shape,
                       self.onnx_outputs_info[self.output_names[i]]))

    # Cast each output to the dtype ONNX expects for it.
    for i, output_ in enumerate(output_values):
        output_type = self.onnx_outputs_info[self.output_names[i]][1]
        expected_dtype = TENSOR_TYPE_TO_NP_TYPE[output_type]
        if expected_dtype != output_.dtype:
            output_values[i] = output_.astype(expected_dtype)

    result = namedtupledict('Outputs', self.output_names)(
        *output_values)  # type: Tuple[Any, ...]
    return result