def onnx_upsample(nodes, layer_name, input_node, output_shape=None, resize_scale_factors=2):
    attrs = {
        "coordinate_transformation_mode": 'asymmetric',
        "mode": 'nearest',
        "nearest_mode": 'floor',
    }
    # layer_name must not collide with the name of the original Resize node
    scales = np.array([1.0, 1.0, resize_scale_factors, resize_scale_factors]).astype(np.float32)
    scale_name = layer_name + ".scale"
    roi_name = layer_name + ".roi"
    scale = gs.Constant(scale_name, scales)
    roi = gs.Constant(roi_name, np.asarray([0, 0, 0, 0], np.float32))
    inputs = [input_node, roi, scale]
    output_node = gs.Variable(layer_name, dtype=np.float32, shape=output_shape)
    node = gs.Node(op="Resize", inputs=inputs, outputs=[output_node], attrs=attrs)
    nodes.append(node)
    return output_node
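# A minimal usage sketch for onnx_upsample (assumes `import numpy as np`,
# `import onnx`, and `import onnx_graphsurgeon as gs`; the shapes and file
# name below are illustrative, not from the original source):
nodes = []
x = gs.Variable("x", dtype=np.float32, shape=(1, 3, 32, 32))
y = onnx_upsample(nodes, "upsample_out", x, output_shape=(1, 3, 64, 64))
graph = gs.Graph(nodes=nodes, inputs=[x], outputs=[y])
onnx.save(gs.export_onnx(graph), "upsample_demo.onnx")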
def add_fc(self):
    """
    Add FC layer
    """
    logging.info("Adding FC layer")

    # fetch some attrs from old fc1000; note MatMul doesn't have bias
    old_fc_op = [_n for _n in self.graph.nodes if _n.name == "fc1000"][0]
    old_fc_kernel = old_fc_op.inputs[1]
    fc_kernel_weights = old_fc_kernel.values[:, 1:]

    # instantiate fc weight
    # NOTE: expects KM weight, if transpose is not set (default not set)
    fc_weight = gs.Constant("fc_replaced_weight", values=fc_kernel_weights)

    # find input to fc to be added
    squeeze_replaced_op = [_n for _n in self.graph.nodes if _n.name == "squeeze_replaced"][0]
    squeeze_replaced_out = squeeze_replaced_op.outputs[0]

    # reshape input
    reshape_shape = np.array([-1, fc_kernel_weights.shape[0]], dtype=np.int64)
    fc_reshape_shape = gs.Constant("fc_reshape_shape", values=reshape_shape)

    # add FC: Reshape => MatMul
    fc_reshape_out = self.graph.Reshape("fc_reshape_input", squeeze_replaced_out, fc_reshape_shape)
    fc_out = self.graph.MatMul("fc_replaced", fc_reshape_out, fc_weight)
def onnx_slice(nodes, layer_name, input_node, output_shape,
               start=(0, 0, 0, 0), shape=(2, 2, 3, 3), stride=(1, 1, 1, 1)):
    """
    Equivalent of, e.g.:
        x = torch.randn([8, 8])
        x[:, 2:4]
        onnx_slice(nodes, "slice", x, (8, 2), start=(0, 2), shape=(8, 4), stride=(1, 1))
    Note that `shape` holds the exclusive end indices (the "ends" input of
    Slice), not the output shape.
    """
    inputs = [input_node]
    inputs.extend([
        gs.Constant(layer_name + '_constant_start', np.asarray(start, np.int32)),
        gs.Constant(layer_name + '_constant_shape', np.asarray(shape, np.int32)),
        gs.Constant(layer_name + '_constant_axis', np.arange(0, len(start)).astype(np.int32)),
        gs.Constant(layer_name + '_constant_stride', np.asarray(stride, np.int32)),
    ])
    output_node = gs.Variable(layer_name, np.float32, output_shape)
    node = gs.Node(op="Slice", inputs=inputs, outputs=[output_node])
    nodes.append(node)
    return output_node
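# A minimal usage sketch for onnx_slice (assumes numpy/onnx/onnx_graphsurgeon
# imports; names and shapes are illustrative):
nodes = []
x = gs.Variable("x", dtype=np.float32, shape=(8, 8))
y = onnx_slice(nodes, "slice", x, (8, 2), start=(0, 2), shape=(8, 4), stride=(1, 1))
graph = gs.Graph(nodes=nodes, inputs=[x], outputs=[y])
onnx.save(gs.export_onnx(graph), "slice_demo.onnx")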
def fold_upsample_inputs(upsample, graph, opset=11):
    """
    In-place transformation of the graph. The upsample subgraph is collapsed
    into a single upsample node with its input and a constant scale-factor tensor.

    Args:
        upsample: upsample node in the original graph.
        graph: graph object.
    """
    if opset == 9:
        # Gather the scale factor from the Mul op in the upsample input subgraph
        scale_factor = upsample.i(1).i(1).i(0).i(0).i(0).i(0).i(0).i(0).i(1).attrs['value'].values
        # Create the new scales tensor
        scales = np.array([1.0, 1.0, scale_factor, scale_factor], dtype=np.float32)
        scale_tensor = gs.Constant(name=upsample.inputs[-1].name, values=scales)
        # Change the node's last input to the new constant scales tensor.
        upsample.inputs[-1] = scale_tensor
    else:
        # In opset 11, the upsample layer is exported as Resize. We transform this
        # Resize layer into an Upsample layer and collapse its input subgraph.
        sizes_tensor_name = upsample.inputs[3].name
        # Create the new scales tensor
        scale_factor = upsample.i(3).i(1).i().i().i().i().i(0).i(1).attrs['value'].values
        scales = np.array([1.0, 1.0, scale_factor, scale_factor], dtype=np.float32)
        scale_tensor = gs.Constant(name=sizes_tensor_name, values=scales)
        # Rename the Resize op to Upsample and wire the data and scales as its inputs.
        input_tensor = upsample.inputs[0]
        upsample.inputs = [input_tensor, scale_tensor]
        upsample.op = 'Upsample'
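# A hedged driver sketch for fold_upsample_inputs: walk the graph and collapse
# every Resize/Upsample subgraph (the op names and file names here are
# assumptions about the exporting model, not from the original source):
graph = gs.import_onnx(onnx.load("model.onnx"))
for node in graph.nodes:
    if node.op in ("Resize", "Upsample"):
        fold_upsample_inputs(node, graph, opset=11)
graph.cleanup().toposort()
onnx.save(gs.export_onnx(graph), "model_folded.onnx")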
def modeify_model2(cls, input_file="model.onnx", output_file="add.onnx"):
    """Rework the Resize implementation."""
    graph = gs.import_onnx(onnx.load(input_file))

    first_add = [node for node in graph.nodes if node.op == "LeakyRelu"][0]  # find the LeakyRelu node
    # first_add = [node for node in graph.nodes if node.name == "LeakyRelu_2"][0]  # find the LeakyRelu node
    # first_add.inputs = [inp for inp in first_add.inputs]  # find its inputs
    # first_add.outputs = [inp for inp in first_add.outputs]  # find its outputs
    # Must be executed: clear() removes the output links, but it also leaves
    # LeakyRelu without an output, so a new output has to be created below.
    first_add.outputs.clear()
    # graph.nodes.remove(first_add)  # remove the whole node

    second_add = [node for node in graph.nodes if node.op == "MaxPool"][0]
    # second_add = [node for node in graph.nodes if node.name == "MaxPool_32"][0]
    # Must be executed: clear() removes the input links; the input is reassigned below.
    second_add.inputs.clear()

    # Redefine the LeakyRelu layer
    attrs = {"alpha": 0.1}
    lrelu = gs.Variable("new_lrelu", np.float32)
    node = gs.Node(op="LeakyRelu", inputs=first_add.inputs, outputs=[lrelu], attrs=attrs)
    graph.nodes.append(node)

    # Redefine the Resize layer (implements upsample)
    attrs = {
        "coordinate_transformation_mode": 'asymmetric',
        "mode": 'nearest',
        "nearest_mode": 'floor',
    }
    layer_name = "new_resize"  # must not collide with the original Resize node's name
    scales = np.array([1.0, 1.0, 2, 2]).astype(np.float32)
    scale_name = layer_name + ".scale"
    roi_name = layer_name + ".roi"
    scale = gs.Constant(scale_name, scales)
    roi = gs.Constant(roi_name, np.asarray([0, 0, 0, 0], np.float32))
    # inputs = first_add.outputs
    inputs = [lrelu]
    inputs.append(roi)
    inputs.append(scale)
    resize = gs.Variable(layer_name, dtype=np.float32)
    node = gs.Node(op="Resize", inputs=inputs, outputs=[resize], attrs=attrs)
    graph.nodes.append(node)

    # Re-point the next layer's input to the new node
    second_add.inputs = [resize]

    # 5. Remove unused nodes/tensors, and topologically sort the graph
    graph.cleanup().toposort()
    onnx.save(gs.export_onnx(graph), output_file)
def modify(input: str, output: str, downsample_ratio: float = 0.25) -> None:
    print(f'\nonnx load: {input}')
    graph = gs.import_onnx(onnx.load(input))
    _print_graph(graph)

    # update node Resize_3: scales
    resize_3 = [n for n in graph.nodes if n.name == 'Resize_3'][0]
    print()
    print(resize_3)
    scales = gs.Constant(
        '388',
        np.asarray([1, 1, downsample_ratio, downsample_ratio], dtype=np.float32))
    resize_3.inputs = [i if i.name != '388' else scales for i in resize_3.inputs]
    print()
    print(resize_3)

    # remove input downsample_ratio
    graph.inputs = [i for i in graph.inputs if i.name != 'downsample_ratio']

    # remove node Concat_2
    concat_2 = [n for n in graph.nodes if n.name == 'Concat_2'][0]
    concat_2.outputs.clear()

    # remove unused nodes/tensors
    graph.cleanup()

    onnx.save(gs.export_onnx(graph), output)
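# `_print_graph` is referenced above but not defined in this excerpt; a
# minimal sketch of such a helper (an assumption, not the original code):
def _print_graph(graph):
    print(f'inputs:  {[str(t) for t in graph.inputs]}')
    print(f'outputs: {[str(t) for t in graph.outputs]}')
    print(f'nodes:   {len(graph.nodes)}')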
def extract_anchors_tensor(split):
    # This will find the anchors that have been hardcoded somewhere within the ONNX graph.
    # The function will return a gs.Constant that can be directly used as an input to the NMS plugin.
    # The anchor tensor shape will be [1, num_anchors, 4]. Note that '1' is kept as the first dim,
    # regardless of batch size, as it's not necessary to replicate the anchors for all images in the batch.
    # The anchors are available (one per coordinate) hardcoded as constants within certain box decoder nodes.
    # Each of these four constants has shape [1, num_anchors], so some numpy operations are used to expand
    # the dims and concatenate them as needed.
    # These constants can be found by starting from the Box Net's Split operation, and, for each coordinate,
    # walking down the graph until either an Add or Mul node is found. The second input of these nodes will
    # be the anchor data required.
    def get_anchor_np(output_idx, op):
        node = self.graph.find_descendant_by_op(split.o(0, output_idx), op)
        assert node
        val = np.squeeze(node.inputs[1].values)
        return np.expand_dims(val.flatten(), axis=(0, 2))

    anchors_y = get_anchor_np(0, "Add")
    anchors_x = get_anchor_np(1, "Add")
    anchors_h = get_anchor_np(2, "Mul")
    anchors_w = get_anchor_np(3, "Mul")

    anchors = np.concatenate([anchors_y, anchors_x, anchors_h, anchors_w], axis=2)
    return gs.Constant(name="nms/anchors:0", values=anchors)
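# `find_descendant_by_op` above is a custom helper attached to the graph, not
# part of stock onnx-graphsurgeon; a minimal sketch of what it might look like
# (an assumption based on its usage: follow the first output edge until a node
# with the given op is reached):
@gs.Graph.register()
def find_descendant_by_op(self, node, op, depth=100):
    for _ in range(depth):
        if node is None or node.op == op:
            return node
        # Step to the first consumer of the first output, if any.
        node = node.o() if node.outputs and node.outputs[0].outputs else None
    return None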
def onnx_reshape(nodes, layer_name, input_node, output_shape, value):
    inputs = [input_node]
    inputs.append(gs.Constant(layer_name + '_constant', np.asarray(value, np.int64)))
    output_node = gs.Variable(layer_name, np.float32, output_shape)
    node = gs.Node(op="Reshape", inputs=inputs, outputs=[output_node])
    nodes.append(node)
    return output_node
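# A minimal usage sketch for onnx_reshape (illustrative names and shapes;
# assumes the same numpy/onnx/onnx_graphsurgeon imports as above):
nodes = []
x = gs.Variable("x", dtype=np.float32, shape=(1, 3, 4, 4))
y = onnx_reshape(nodes, "reshape", x, (1, 48), value=(1, -1))
graph = gs.Graph(nodes=nodes, inputs=[x], outputs=[y])
onnx.save(gs.export_onnx(graph), "reshape_demo.onnx")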
def change_node(self):
    print("change_node")
    node = self.graph.nodes[self.idx_node]
    # Keep only the first and last inputs of the node (deleting by index while
    # iterating skips elements as the list shrinks, so use a slice instead).
    del node.inputs[1:-1]
    # Disconnect the outputs of the nodes being folded away, i.e. the
    # rdd_nodes nodes immediately preceding idx_node, so cleanup can drop them.
    for removed_node in self.graph.nodes[self.idx_node - self.rdd_nodes:self.idx_node]:
        removed_node.outputs.clear()
    pads_folded_tensor = gs.Constant(name=node.name,
                                     values=np.array(self.trt_pad_values))
    node.inputs[1] = pads_folded_tensor
    # node.attrs = self.attrs
    return self.graph
def fold_pad_inputs(node, graph):
    # Gather the amount of padding in each dimension from the pytorch graph.
    pad_values_pyt = node.i(1).i(0).i(0).i(0).i(0).i(0).i(0).i(0).attrs['value'].values

    # Assumes a 4D input tensor: 4 dimensions with 2-sided padding each.
    onnx_pad_values = [0] * 4 * 2

    j = 3
    for i in range(0, len(pad_values_pyt), 2):
        onnx_pad_values[j] = pad_values_pyt[i]
        onnx_pad_values[j + 4] = pad_values_pyt[i + 1]
        j -= 1

    # Change the existing pad tensor to the new onnx_pad values tensor
    pads_folded_tensor = gs.Constant(name=node.inputs[1].name, values=np.array(onnx_pad_values))
    node.inputs[1] = pads_folded_tensor
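# A hedged driver sketch for fold_pad_inputs: apply it to every Pad node whose
# pads input comes from the expected constant subgraph (the try/except guard
# is an assumption for nodes whose input subgraph differs):
graph = gs.import_onnx(onnx.load("model.onnx"))
for node in graph.nodes:
    if node.op == "Pad":
        try:
            fold_pad_inputs(node, graph)
        except (IndexError, AttributeError):
            pass  # pads input did not match the expected constant subgraph
graph.cleanup().toposort()
onnx.save(gs.export_onnx(graph), "model_pads_folded.onnx")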
def run(nM, nK, nN):
    tensor0 = gs.Variable("tensor0", np.float32, [nM, 1])
    constant1xK = gs.Constant("constant1xK", np.ascontiguousarray(np.random.rand(1, nK).reshape(1, nK).astype(np.float32) * 2 - 1))
    constantKxN = gs.Constant("constantKxN", np.ascontiguousarray(np.random.rand(nK, nN).reshape(nK, nN).astype(np.float32) * 2 - 1))
    constantN = gs.Constant("constantN", np.ascontiguousarray(np.random.rand(nN).astype(np.float32) * 2 - 1))
    constantNxK = gs.Constant("constantNxK", np.ascontiguousarray(np.random.rand(nN, nK).reshape(nN, nK).astype(np.float32) * 2 - 1))
    constantK = gs.Constant("constantK", np.ascontiguousarray(np.random.rand(nK).astype(np.float32) * 2 - 1))
    constantM1 = gs.Constant("constantM1", np.ascontiguousarray(np.array([-1], dtype=np.int64)))

    graphNodeList = []

    tensor1 = gs.Variable("tensor1", np.float32, None)
    node1 = gs.Node("MatMul", "MMU1", inputs=[tensor0, constant1xK], outputs=[tensor1])
    graphNodeList.append(node1)

    tensorLoop = tensor1
    for i in range(nLoop):
        tensor2 = gs.Variable("tensor%d-1" % i, np.float32, None)
        node2 = gs.Node("MatMul", "MMU-" + str(i), inputs=[tensorLoop, constantKxN], outputs=[tensor2])
        graphNodeList.append(node2)

        tensor3 = gs.Variable("tensor%d-2" % i, dtype=np.float32, shape=None)
        node3 = gs.Node("Add", "AddU-" + str(i), inputs=[tensor2, constantN], outputs=[tensor3])
        graphNodeList.append(node3)

        tensor4 = gs.Variable("tensor%d-3" % i, dtype=np.float32, shape=None)
        node4 = gs.Node("Relu", "ReLUU-" + str(i), inputs=[tensor3], outputs=[tensor4])
        graphNodeList.append(node4)

        tensor5 = gs.Variable("tensor%d-4" % i, dtype=np.float32, shape=None)
        node5 = gs.Node("MatMul", "MMD-" + str(i), inputs=[tensor4, constantNxK], outputs=[tensor5])
        graphNodeList.append(node5)

        tensor6 = gs.Variable("tensor%d-5" % i, dtype=np.float32, shape=None)
        node6 = gs.Node("Add", "AddD-" + str(i), inputs=[tensor5, constantK], outputs=[tensor6])
        graphNodeList.append(node6)

        tensor7 = gs.Variable("tensor%d-6" % i, dtype=np.float32, shape=None)
        node7 = gs.Node("Relu", "ReLUD-" + str(i), inputs=[tensor6], outputs=[tensor7])
        graphNodeList.append(node7)

        tensorLoop = tensor7

    tensor8 = gs.Variable("tensor8", dtype=np.float32, shape=None)
    node8 = gs.Node("ReduceSum", "Reduce", inputs=[tensorLoop, constantM1], outputs=[tensor8], attrs=OrderedDict([('keepdims', 0)]))
    graphNodeList.append(node8)

    graph = gs.Graph(nodes=graphNodeList, inputs=[tensor0], outputs=[tensor8], opset=13)

    onnxFile = "model-%d-%d-%d.onnx" % (nM, nK, nN)
    onnx.save(gs.export_onnx(graph.cleanup().toposort()), onnxFile)
    print("Succeeded building %s!" % (onnxFile))

    os.system("trtexec --onnx=%s --useCudaGraph --noDataTransfers --fp16" % onnxFile)
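# The function above relies on module-level state defined outside this
# excerpt; a plausible preamble and driver call (the value of nLoop and the
# matrix sizes are assumptions, not from the original source):
import os
from collections import OrderedDict

import numpy as np
import onnx
import onnx_graphsurgeon as gs

nLoop = 10

run(32, 256, 256)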
def make_constant_linear():
    DTYPE = np.float32
    SHAPE = (4, 4)

    graph = gs.Graph()

    X0 = graph.constant(gs.Constant("const", values=np.ones(SHAPE, dtype=DTYPE)))
    # Explicitly clear shape to trigger the failure condition in reduce
    X0.shape = None
    X1 = graph.identity(X0)
    X2 = graph.identity(X1)
    X2.dtype = DTYPE
    X2.shape = SHAPE

    graph.outputs = [X2]

    save(graph, "reducable_with_const.onnx")
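# `save` above (like the `constant`/`identity` layer helpers) comes from the
# surrounding test utilities and is not shown here; a minimal sketch of what
# `save` presumably does:
def save(graph, path):
    onnx.save(gs.export_onnx(graph), path)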
def fold_pad_inputs(node, graph):
    # Gather the amount of padding in each dimension from the pytorch graph.
    pad_values_pyt = node.i(1).i(0).i(0).i(0).i(0).i(0).i(0).i(0).attrs['value'].values

    # Assumes a 4D input tensor, with 2-sided padding for each dimension.
    onnx_pad_values = [0] * 4 * 2

    j = 3
    # Loop over the pytorch pad values with a stride of 2
    for i in range(0, len(pad_values_pyt), 2):
        # Assign the corresponding onnx_pad_values entries
        onnx_pad_values[j] = pad_values_pyt[i]
        onnx_pad_values[j + 4] = pad_values_pyt[i + 1]
        # Update the index
        j -= 1

    # Change the existing pad tensor to the new onnx_pad values tensor
    pads_folded_tensor = gs.Constant(name=node.inputs[1].name, values=np.array(onnx_pad_values))
    # Re-point the node's pads input to the new tensor
    node.inputs[1] = pads_folded_tensor
def add_conv(self):
    """
    Add Conv layer
    """
    logging.info("Adding Conv layer, instead of FC")

    # fetch some attrs from old fc1000; note MatMul doesn't have bias
    old_fc_op = [_n for _n in self.graph.nodes if _n.name == "fc1000"][0]
    old_fc_kernel = old_fc_op.inputs[1]

    # instantiate fc weight and attrs
    # NOTE: ONNX uses MCkHkW format
    fc_kernel_weights = old_fc_kernel.values.transpose()[1:, :].reshape(1000, 2048, 1, 1)
    fc_weight = gs.Constant("fc_replaced_weight", values=fc_kernel_weights)
    attrs = {"kernel_shape": [1, 1]}

    # find input to fc to be added
    squeeze_replaced_op = [_n for _n in self.graph.nodes if _n.name == "squeeze_replaced"][0]
    squeeze_replaced_out = squeeze_replaced_op.outputs[0]

    # add FC: Conv
    fc_out = self.graph.Conv("fc_replaced", squeeze_replaced_out, fc_weight, attrs)
def test_const_inp_but_non_foldable_nested_graph(self):
    cond = gs.Constant("cond", values=np.array(True))
    X = gs.Variable("X", dtype=np.float32, shape=(1,))
    graph = Graph(inputs=[X])

    then_graph = Graph(name="Then")
    then_graph.outputs = [then_graph.add(X, X)]

    else_graph = Graph(name="Else")
    else_graph.outputs = [else_graph.add(X, else_graph.add(X, X))]

    # Even though if_op looks foldable because it has all constant inputs,
    # it's not, since its subgraphs depend on variables in the outer scope.
    graph.outputs = [graph.if_op(cond, then_graph, else_graph)]

    # This should not raise because the `If` node should be excluded from
    # constant folding.
    graph.fold_constants(error_ok=False).cleanup()

    assert graph.nodes[0].op == "If"
    assert len(then_graph.nodes) == 1
    assert len(else_graph.nodes) == 2
def test_with_nested_graph(self):
    # np.bool_ rather than np.bool: the np.bool alias is removed in NumPy >= 1.24
    cond = gs.Variable("cond", dtype=np.bool_, shape=(1,))
    X = gs.Variable("X", dtype=np.float32, shape=(1,))
    Y = gs.Constant("Y", values=np.ones((1,), dtype=np.float32))

    graph = Graph(inputs=[X, cond])

    then_graph = Graph(name="Then")
    then_graph.outputs = [then_graph.add(Y, Y)]

    else_graph = Graph(name="Else")
    else_graph.outputs = [else_graph.add(X, else_graph.add(Y, Y))]

    graph.outputs = [graph.if_op(cond, then_graph, else_graph)]

    graph.fold_constants()
    graph.cleanup()

    assert len(then_graph.nodes) == 0
    assert np.all(then_graph.outputs[0].values == (Y.values * 2))

    assert len(else_graph.nodes) == 1
    assert isinstance(else_graph.nodes[0].inputs[1], Constant)
    assert np.all(else_graph.nodes[0].inputs[1].values == (Y.values * 2))
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import onnx_graphsurgeon as gs
import numpy as np
import onnx

X = gs.Variable(name="X", dtype=np.float32, shape=(1, 3, 224, 224))
# Since W is a Constant, it will automatically be exported as an initializer
W = gs.Constant(name="W", values=np.ones(shape=(5, 3, 3, 3), dtype=np.float32))

Y = gs.Variable(name="Y", dtype=np.float32, shape=(1, 5, 222, 222))

node = gs.Node(op="Conv", inputs=[X, W], outputs=[Y])

# Note that initializers do not necessarily have to be graph inputs
graph = gs.Graph(nodes=[node], inputs=[X], outputs=[Y])
onnx.save(gs.export_onnx(graph), "test_conv.onnx")
inputTensor.name = 'inputTensor'
graph.inputs = [inputTensor]
node.inputs[0] = constantData

for i in range(1, 24, 2):
    graph.outputs.append(node.o(i).o().o().outputs[0])  # Transpose
    continue

graph.cleanup()
onnx.save(gs.export_onnx(graph), onnxFile0)
'''

graph = gs.import_onnx(onnx.load(onnxFile0))

wiliConstant0 = gs.Constant("wiliConstant0", np.ascontiguousarray(np.array([0], dtype=np.int64)))
wiliConstant1 = gs.Constant("wiliConstant1", np.ascontiguousarray(np.array([1], dtype=np.int64)))
wiliConstant3 = gs.Constant("wiliConstant3", np.ascontiguousarray(np.array([3], dtype=np.int64)))

nSlice = 0
graph.outputs = []
for node in graph.nodes:
    if node.op == 'Slice' and node.name == 'Slice_74':
        table512x256 = node.inputs[0].values[0]
        for i in range(1, 24, 2):
            factor256x256 = node.o(i).inputs[1].values
            tansposeNode = node.o(i).o().o()
            newTable = np.matmul(table512x256,
    verbose=True,
    operator_export_type=torch.onnx.OperatorExportTypes.ONNX_FALLTHROUGH,
    do_constant_folding=False)

import onnx_graphsurgeon as gs
import onnx
import numpy as np

graph = gs.import_onnx(onnx.load(src_onnx))

for node in graph.nodes:
    if node.op == 'Resize':  # actually not used in this sample
        node_concat = node.i(2, 0)
        node_concat.i(0, 0).attrs['value'] = gs.Constant(
            '',
            np.concatenate((node_concat.i(0, 0).attrs['value'].values,
                            node_concat.i(1, 0).attrs['value'].values)))
        node.inputs[2] = node_concat.inputs[0]
        node_concat.outputs.clear()

    if node.op == 'Clip':
        node_cast0 = node.i(1, 0)
        node_cast1 = node.i(2, 0)
        # change data type to fp32
        node_cast0.i(0, 0).attrs['value'] = gs.Constant('', np.asarray([-1.0], dtype=np.float32))
        node_cast1.i(0, 0).attrs['value'] = gs.Constant('', np.asarray([1.0], dtype=np.float32))
        # skip cast
        node.inputs = [
            node.inputs[0],
            node_cast0.inputs[0],
            node_cast1.inputs[0]
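# The next snippet calls graph.gemm/relu/add/mul, which only work if matching
# helpers were registered on gs.Graph earlier in the file; a sketch of
# plausible registrations (the exact originals are not shown in this excerpt):
@gs.Graph.register()
def add(self, a, b):
    return self.layer(op="Add", inputs=[a, b], outputs=["add_out_gs"])

@gs.Graph.register()
def mul(self, a, b):
    return self.layer(op="Mul", inputs=[a, b], outputs=["mul_out_gs"])

@gs.Graph.register()
def gemm(self, a, b, trans_a=False, trans_b=False):
    attrs = {"transA": int(trans_a), "transB": int(trans_b)}
    return self.layer(op="Gemm", inputs=[a, b], outputs=["gemm_out_gs"], attrs=attrs)

@gs.Graph.register()
def relu(self, a):
    return self.layer(op="Relu", inputs=[a], outputs=["relu_out_gs"])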
##########################################################################################################
# The functions registered above greatly simplify the process of building the graph itself.
graph = gs.Graph(opset=11)

# Generates a graph which computes:
# output = ReLU((A * X^T) + B) (.) C + D
X = gs.Variable(name="X", shape=(64, 64), dtype=np.float32)
graph.inputs = [X]

# axt = (A * X^T)
# Note that we can use NumPy arrays directly (e.g. Tensor A),
# instead of Constants. These will automatically be converted to Constants.
A = np.ones(shape=(64, 64), dtype=np.float32)
axt = graph.gemm(A, X, trans_b=True)

# dense = ReLU(axt + B)
B = np.ones((64, 64), dtype=np.float32) * 0.5
dense = graph.relu(*graph.add(*axt, B))

# output = dense (.) C + D
# If a Tensor instance is provided (e.g. Tensor C), it will not be modified at all.
# If you prefer to set the exact names of tensors in the graph, you should
# construct tensors manually instead of passing strings or NumPy arrays.
C = gs.Constant(name="C", values=np.ones(shape=(64, 64), dtype=np.float32))
D = np.ones(shape=(64, 64), dtype=np.float32)
graph.outputs = graph.add(*graph.mul(*dense, C), D)

onnx.save(gs.export_onnx(graph), "model.onnx")
def fuse_br2b_br2c(self):
    """
    Match and replace br2b+br2c with the fused plugin.
    This fusion is for Conv-Conv-Add-ReLU
    """
    logging.info("Fusing ops in br2b_br2c path")

    op_names_lists_to_be_fused = [
        ["res2b_branch2b", "res2b_branch2b_relu", "res2b_branch2c", "res2b", "res2b_relu"],
        ["res2c_branch2b", "res2c_branch2b_relu", "res2c_branch2c", "res2c", "res2c_relu"],
    ]

    for _idx, op_names_list in enumerate(op_names_lists_to_be_fused):
        # setup plugin info
        plugin_name = "RES2_BR2B_BR2C_{}".format(_idx + 1)

        # prep fusion: constants and attributes
        op_dict = dict()
        [op_dict.update({_n.name: _n}) for _n in self.graph.nodes if _n.name in op_names_list]
        op_list = [op_dict[_n] for _n in op_names_list]
        assert len(op_names_list) == len(op_list), "Need to capture all op objects in op_names_list"

        scale64 = gs.Constant("scale64", values=np.ones((64), dtype=np.float32))
        scale256 = gs.Constant("scale256", values=np.ones((256), dtype=np.float32))

        from_shortcut = op_list[3].i(0, 0).outputs[0]\
            if op_list[3].i(1, 0).name == op_names_list[2] else\
            op_list[3].i(1, 0).outputs[0]

        # build array with dynamic ranges required for the fusion plugin
        # NOTE: order matters
        dyn_list = [
            self.dyn_range_map[from_shortcut.name],
            self.dyn_range_map[op_list[0].inputs[0].name],
            self.dyn_range_map[op_list[1].outputs[0].name],
            self.dyn_range_map[op_list[2].outputs[0].name],
            self.dyn_range_map[op_list[4].outputs[0].name],
        ]
        dynamic_ranges = np.array(dyn_list, dtype=np.float32)
        dyn_const = gs.Constant("{}_dynamic_ranges".format(plugin_name), values=dynamic_ranges)

        # this becomes attributes to ONNX node that fusion plugin uses
        # NOTE: order does not matter
        plugin_field_dict = {
            "c_br2b_w": op_list[0].inputs[1],
            "s_br2b_s": scale64,
            "s_br2b_b": op_list[0].inputs[2],
            "c_br2c_w": op_list[2].inputs[1],
            "s_br2c_s": scale256,
            "s_br2c_b": op_list[2].inputs[2],
            "dynamic_ranges": dyn_const,
        }

        attrs = {
            "plugin_version": "2",
            "plugin_namespace": "",
        }
        attrs.update(plugin_field_dict)

        # get plugin input/output
        plugin_inp = [from_shortcut, op_list[0].inputs[0]]
        plugin_out = [op_list[-1].outputs[0]]

        # replace ops with plugin
        self.graph.RES2PLUGIN("RnRes2Br2bBr2c_TRT", plugin_name, plugin_inp, plugin_out, attrs)

        # graph cleanup
        self.cleanup_graph()

        # done
        logging.info("Plugin {} successful".format(plugin_name))
def preprocess_onnx(self, model):
    """
    Manipulate the original ONNX file with graphsurgeon: insert the
    InstanceNormalization 3D and PixelShuffle plugins, and export the new ONNX graph.
    """
    graph = gs.import_onnx(model)

    if self.use_instnorm3d_plugin:
        for node in graph.nodes:
            # Replace InstanceNormalization with INSTNORM3D_TRT plugin node
            if node.op == "InstanceNormalization":
                node.op = "INSTNORM3D_TRT"
                node.attrs["scales"] = node.inputs[1]
                node.attrs["bias"] = node.inputs[2]
                node.attrs["plugin_version"] = "1"
                node.attrs["plugin_namespace"] = ""
                node.attrs["relu"] = 0
                node.attrs["alpha"] = 0.0
                scales = node.attrs["scales"].values
                biases = node.attrs["bias"].values
                assert len(scales) == len(biases), "Scales and biases do not have the same length!"
                del node.inputs[2]
                del node.inputs[1]

        # Set leaky-relu node attributes to INSTNORM3D plugin and remove leaky-relu nodes.
        nodes = [node for node in graph.nodes if node.op == "INSTNORM3D_TRT"]
        for node in nodes:
            leaky_relu_node = node.o()
            attrs = leaky_relu_node.attrs
            node.attrs["relu"] = 1
            node.attrs["alpha"] = attrs["alpha"]
            node.outputs = leaky_relu_node.outputs
            leaky_relu_node.outputs.clear()

    if self.use_conv3d1x1x1k4_plugin:
        nodes = [node for node in graph.nodes if node.op == "INSTNORM3D_TRT"]
        last_layer_node = nodes[-1].o()
        last_layer_node.op = "CONV3D1X1X1K4_TRT"
        weights = last_layer_node.inputs[1]
        weights_shape = weights.values.shape
        weights_c = weights_shape[1]
        weights_k = weights_shape[0]
        assert weights_shape == (4, 32, 1, 1, 1), \
            "The plugin only supports 1x1x1 convolution with c == 32 and k == 4"
        last_layer_node.attrs["inputChannels"] = weights_c
        last_layer_node.attrs["weights"] = weights
        last_layer_node.attrs["plugin_version"] = "1"
        last_layer_node.attrs["plugin_namespace"] = ""
        del last_layer_node.inputs[1]

        # add the identity layer, since the last layer is quantized
        identity_out = gs.Variable("output", dtype=np.float32)
        identity = gs.Node(op="Identity", inputs=last_layer_node.outputs, outputs=[identity_out])
        graph.nodes.append(identity)
        graph.outputs.append(identity_out)
        last_layer_node.outputs[0].name = "conv3d1x1x1k4_out"

    # Convert Deconv to Conv + PixelShuffle
    if self.use_conv_for_deconv:
        added_nodes = []
        input_d = graph.inputs[0].shape[2]
        input_h = graph.inputs[0].shape[3]
        input_w = graph.inputs[0].shape[4]
        # We start the conversion from the lowest dimension
        current_d = input_d // 32
        current_h = input_h // 32
        current_w = input_w // 32
        for (node_idx, node) in enumerate(graph.nodes):
            if node.op == "ConvTranspose":
                name = node.name
                node.op = "Conv"
                assert node.attrs["kernel_shape"] == [2, 2, 2], \
                    "The conversion only makes sense for 2x2x2 deconv"
                node.attrs["kernel_shape"] = [1, 1, 1]
                assert node.attrs["strides"] == [2, 2, 2], \
                    "The conversion only makes sense for stride=2x2x2 deconv"
                node.attrs["strides"] = [1, 1, 1]
                # Transpose weights from cktrs to (ktrs)c111 or (trsk)c111
                assert len(node.inputs) == 2, "Bias not handled in deconv->conv conversion"
                weights = node.inputs[1]
                weights_shape = weights.values.shape
                weights_c = weights_shape[0]
                weights_k = weights_shape[1]
                assert weights_shape[2:] == (2, 2, 2), \
                    "The conversion only makes sense for 2x2x2 deconv"
                weights_transpose_axes = (1, 2, 3, 4, 0) if self.pixel_shuffle_cdwh else (2, 3, 4, 1, 0)
                weights.values = weights.values.transpose(weights_transpose_axes).reshape(
                    weights_k * 8, weights_c, 1, 1, 1)
                deconv_output = node.outputs[0]
                concat_node = graph.nodes[node_idx + 1]
                assert concat_node.op == "Concat", "Cannot find the right Concat node"
                if self.enable_pixelshuffle3d_plugin:
                    # Insert PixelShuffle
                    pixel_shuffle_output = gs.Variable(name + "_pixelshuffle_plugin_out")
                    pixel_shuffle_node = gs.Node(
                        "PIXELSHUFFLE3D_TRT", name + "_pixelshuffle_plugin",
                        {}, [deconv_output], [pixel_shuffle_output])
                    pixel_shuffle_node.op = "PIXELSHUFFLE3D_TRT"
                    pixel_shuffle_node.attrs["R"] = 2
                    pixel_shuffle_node.attrs["S"] = 2
                    pixel_shuffle_node.attrs["T"] = 2
                    pixel_shuffle_node.attrs["plugin_version"] = "1"
                    pixel_shuffle_node.attrs["plugin_namespace"] = ""
                    assert concat_node.inputs[0] is deconv_output, "Wrong concat order"
                    if self.enable_pixelshuffle3d_plugin_concat_fuse:
                        pixel_shuffle_node.outputs = concat_node.outputs
                        pixel_shuffle_node.inputs.append(concat_node.inputs[1])
                        concat_node.outputs.clear()
                    else:
                        concat_node.inputs[0] = pixel_shuffle_output
                    added_nodes.extend([pixel_shuffle_node])
                else:
                    reshape1_shape = [0, weights_k, 2, 2, 2, current_d, current_h, current_w] \
                        if self.pixel_shuffle_cdwh else \
                        [0, 2, 2, 2, weights_k, current_d, current_h, current_w]
                    shuffle_axes = [0, 1, 5, 2, 6, 3, 7, 4] if self.pixel_shuffle_cdwh else \
                        [0, 4, 5, 1, 6, 2, 7, 3]
                    current_d *= 2
                    current_h *= 2
                    current_w *= 2
                    reshape2_shape = [0, weights_k, current_d, current_h, current_w]
                    reshape1_shape_const = gs.Constant(
                        name + "_pixelshuffle_reshape1_shape",
                        np.array(reshape1_shape, dtype=np.int32))
                    reshape2_shape_const = gs.Constant(
                        name + "_pixelshuffle_reshape2_shape",
                        np.array(reshape2_shape, dtype=np.int32))
                    reshape1_output = gs.Variable(name + "_pixelshuffle_reshape1_out")
                    shuffle_output = gs.Variable(name + "_pixelshuffle_shuffle_out")
                    reshape2_output = gs.Variable(name + "_pixelshuffle_reshape2_out")
                    reshape1_node = gs.Node(
                        "Reshape", name + "_pixelshuffle_reshape1", {},
                        [deconv_output, reshape1_shape_const], [reshape1_output])
                    shuffle_node = gs.Node(
                        "Transpose", name + "_pixelshuffle_transpose",
                        {"perm": shuffle_axes}, [reshape1_output], [shuffle_output])
                    reshape2_node = gs.Node(
                        "Reshape", name + "_pixelshuffle_reshape2", {},
                        [shuffle_output, reshape2_shape_const], [reshape2_output])
                    assert concat_node.inputs[0] is deconv_output, "Wrong concat order"
                    concat_node.inputs[0] = reshape2_output
                    added_nodes.extend([reshape1_node, shuffle_node, reshape2_node])
        graph.nodes.extend(added_nodes)

    # Remove the four unnecessary outputs.
    graph.outputs = [output for output in graph.outputs if output.name == "output"]

    # Remove dead nodes.
    graph.cleanup().toposort()

    # Add names to the layers after the graph is toposorted.
    uniq_num = 0
    for node in graph.nodes:
        if not node.name or node.name.isdigit():
            node.name = 'gs_{}_{}'.format(str(node.op), uniq_num)
            node.attrs['name'] = node.name
            uniq_num += 1
        for out_idx, out_tensor in enumerate(node.outputs):
            # str() is required here: concatenating the bare int out_idx would raise a TypeError
            postfix = "_" + str(out_idx) if len(node.outputs) > 1 else ""
            if not out_tensor.name or out_tensor.name.isdigit():
                out_tensor.name = node.name + "__output" + postfix

    return gs.export_onnx(graph)
import os

import numpy as np
import onnx
import onnx_graphsurgeon as gs
import tensorrt as trt

nLoop = 10
nC = 32
onnxFile0 = "model-0.onnx"
onnxFile1 = "model-1.onnx"

tensor0 = gs.Variable(name="tensor-0", dtype=np.float32, shape=['B', 1, 16, 16])

constant32x1 = gs.Constant(
    "constant32x1",
    np.ascontiguousarray(np.random.rand(nC, 1, 3, 3).reshape(nC, 1, 3, 3).astype(np.float32) * 2 - 1))
constant32x32 = gs.Constant(
    "constant32x32",
    np.ascontiguousarray(np.random.rand(nC, nC, 3, 3).reshape(nC, nC, 3, 3).astype(np.float32) * 2 - 1))
constant32 = gs.Constant(
    "constant32",
    np.ascontiguousarray(np.random.rand(1, nC, 1, 1).reshape(1, nC, 1, 1).astype(np.float32) * 2 - 1))
constant32t = gs.Constant(
    "constant32t",
    np.ascontiguousarray(
tensor0 = gs.Variable(name="tensor0", dtype=np.float32, shape=['B', 3, 64, 64])

# Three genuinely useful tensors
tensor1 = gs.Variable(name="tensor1", dtype=np.float32, shape=['B', 3, 64, 64])
tensor2 = gs.Variable(name="tensor2", dtype=np.float32, shape=['B', 3, 64, 64])
tensor3 = gs.Variable(name="tensor3", dtype=np.float32, shape=['B', 3, 64, 64])

# A fake input tensor
tensor4 = gs.Variable(name="tensor4", dtype=np.float32, shape=['B', 1, 64, 64])
# A fake output tensor
tensor5 = gs.Variable(name="tensor5", dtype=np.float32, shape=['B', 1, 64, 64])
# Two useless tensors
tensor6 = gs.Variable(name="tensor6", dtype=np.float32, shape=['B', 1, 64, 64])
tensor7 = gs.Variable(name="tensor7", dtype=np.float32, shape=None)
# An intermediate-result tensor
tensor8 = gs.Variable(name="tensor8", dtype=np.float32, shape=None)

constant0 = gs.Constant(name="w", values=np.ones(shape=[1, 1, 1, 1], dtype=np.float32))

node0 = gs.Node(name="myAdd0", op="Add", inputs=[constant0, constant0], outputs=[tensor7])
node1 = gs.Node(name="myAdd1", op="Add", inputs=[tensor7, constant0], outputs=[tensor8])
node2 = gs.Node(name="myAdd2", op="Add", inputs=[tensor0, tensor8], outputs=[tensor1])  # a useful node
node3 = gs.Node(name="myAdd3", op="Add",
#
from collections import OrderedDict

import numpy as np
import onnx
import onnx_graphsurgeon as gs

tensor0 = gs.Variable(name="tensor0", dtype=np.float32, shape=['B', 3, 64, 64])  # define tensors (Variables)
tensor1 = gs.Variable(name="tensor1", dtype=np.float32, shape=['B', 1, 64, 64])
tensor2 = gs.Variable(name="tensor2", dtype=np.float32, shape=None)  # shape or dtype may be left unknown
tensor3 = gs.Variable(name="tensor3", dtype=np.float32, shape=None)

constant0 = gs.Constant(name="constant0", values=np.ones(shape=[1, 3, 3, 3], dtype=np.float32))  # define tensors (Constants)
constant1 = gs.Constant(name="constant1", values=np.ones(shape=[1], dtype=np.float32))

node0 = gs.Node(name="myConv", op="Conv", inputs=[tensor0, constant0], outputs=[tensor1])  # define a node, using tensors as inputs and outputs
node0.attrs = OrderedDict([
    ['dilations', [1, 1]],
    ['kernel_shape', [3, 3]],
    ['pads', [1, 1, 1, 1]],
    ['strides', [1, 1]],
])  # the node's attributes
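# The excerpt stops before the graph is assembled; a plausible continuation in
# the spirit of the snippet (the Add node, output choice, and file name are
# assumptions, not from the original source):
node1 = gs.Node(name="myAdd", op="Add", inputs=[tensor1, constant1], outputs=[tensor2])
graph = gs.Graph(nodes=[node0, node1], inputs=[tensor0], outputs=[tensor2])
graph.cleanup().toposort()
onnx.save(gs.export_onnx(graph), "model-01.onnx")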
        node_concat = node.i(2, 0)
        values = []
        for i in range(len(node_concat.inputs)):
            c = node_concat.i(i, 0)
            # print(c)
            while c.op != 'Constant':
                c = c.i(0, 0)
            values.append(c.attrs['value'].values)
        # The following is the unreliable approach (it unreliably assumes that parent 0 is a Constant):
        # node_concat.i(0, 0).attrs['value'] = gs.Constant('', np.concatenate(values))
        # node.inputs[2] = node_concat.inputs[0]
        # The following is the more reliable approach:
        node_constant = gs.Node(op="Constant", name=node_concat.name,
                                attrs={'value': gs.Constant('', np.concatenate(values))})
        node_constant.outputs = node_concat.outputs[:]
        graph.nodes.append(node_constant)
        node_concat.outputs.clear()

    if node.op == 'Unsqueeze' and node.i(0, 0).op == 'Constant' and node.i(0, 0).attrs['value'].dtype == np.float64:
        node.i(0, 0).attrs['value'] = gs.Constant('', np.asarray([node.i(0, 0).attrs['value'].values], dtype=np.float32))

    if node.op == 'Clip':
        node_cast0 = node.i(1, 0)
        node_cast1 = node.i(2, 0)
        # change data type to fp32
        node_cast0.i(0, 0).attrs['value'] = gs.Constant('', np.asarray([-1.0], dtype=np.float32))
        node_cast1.i(0, 0).attrs['value'] = gs.Constant('', np.asarray([1.0], dtype=np.float32))
        # skip cast
def fuse_res2_mega(self):
    """
    Search and replace all the res2 layers with the res2 megakernel plugin.
    This fusion is for mega fusion of the entire res2a_*
    """
    logging.info("Fusing ops in res2_mega")

    op_names_list = [
        "res2a_branch1",
        "res2a_branch2a", "res2a_branch2a_relu",
        "res2a_branch2b", "res2a_branch2b_relu",
        "res2a_branch2c", "res2a", "res2a_relu",
        "res2b_branch2a", "res2b_branch2a_relu",
        "res2b_branch2b", "res2b_branch2b_relu",
        "res2b_branch2c", "res2b", "res2b_relu",
        "res2c_branch2a", "res2c_branch2a_relu",
        "res2c_branch2b", "res2c_branch2b_relu",
        "res2c_branch2c", "res2c", "res2c_relu",
    ]

    # setup plugin info
    plugin_name = "RES2_FULL_FUSION"

    # prep fusion: constants and attributes
    op_dict = dict()
    [op_dict.update({_n.name: _n}) for _n in self.graph.nodes if _n.name in op_names_list]
    op_list = [op_dict[_n] for _n in op_names_list]
    assert len(op_names_list) == len(op_list), "Need to capture all op objects in op_names_list"

    plugin_inp = [op_list[0].inputs[0]]
    plugin_out = [op_list[-1].outputs[0]]

    scale64 = gs.Constant("scale64", values=np.ones((64), dtype=np.float32))
    scale256 = gs.Constant("scale256", values=np.ones((256), dtype=np.float32))
    rescale = gs.Constant("rescale", values=np.ones((256), dtype=np.float32))

    # build array with dynamic ranges required for the fusion plugin
    # NOTE: order matters
    dyn_list = [
        self.dyn_range_map[plugin_inp[0].name],
        self.dyn_range_map[op_list[0].outputs[0].name],
        self.dyn_range_map[op_list[2].outputs[0].name],
        self.dyn_range_map[op_list[4].outputs[0].name],
        self.dyn_range_map[op_list[5].outputs[0].name],
        self.dyn_range_map[op_list[7].outputs[0].name],
        self.dyn_range_map[op_list[9].outputs[0].name],
        self.dyn_range_map[op_list[11].outputs[0].name],
        self.dyn_range_map[op_list[12].outputs[0].name],
        self.dyn_range_map[op_list[14].outputs[0].name],
        self.dyn_range_map[op_list[16].outputs[0].name],
        self.dyn_range_map[op_list[18].outputs[0].name],
        self.dyn_range_map[op_list[19].outputs[0].name],
        self.dyn_range_map[op_list[21].outputs[0].name],
    ]
    dynamic_ranges = np.array(dyn_list, dtype=np.float32)
    dyn_const = gs.Constant("{}_dynamic_ranges".format(plugin_name), values=dynamic_ranges)

    # this becomes attributes to ONNX node that fusion plugin uses
    # NOTE: order does not matter
    plugin_field_dict = {
        "c_res2a_br1_w": op_list[0].inputs[1],
        "s_res2a_br1_s": scale256,
        "s_res2a_br1_b": op_list[0].inputs[2],
        "c_res2a_br2a_w": op_list[1].inputs[1],
        "s_res2a_br2a_s": scale64,
        "s_res2a_br2a_b": op_list[1].inputs[2],
        "c_res2a_br2b_w": op_list[3].inputs[1],
        "s_res2a_br2b_s": scale64,
        "s_res2a_br2b_b": op_list[3].inputs[2],
        "c_res2a_br2c_w": op_list[5].inputs[1],
        "s_res2a_br2c_s": scale256,
        "s_res2a_br2c_b": op_list[5].inputs[2],
        "c_res2b_br2a_w": op_list[8].inputs[1],
        "s_res2b_br2a_s": scale64,
        "s_res2b_br2a_b": op_list[8].inputs[2],
        "c_res2b_br2b_w": op_list[10].inputs[1],
        "s_res2b_br2b_s": scale64,
        "s_res2b_br2b_b": op_list[10].inputs[2],
        "c_res2b_br2c_w": op_list[12].inputs[1],
        "s_res2b_br2c_s": scale256,
        "s_res2b_br2c_b": op_list[12].inputs[2],
        "c_res2c_br2a_w": op_list[15].inputs[1],
        "s_res2c_br2a_s": scale64,
        "s_res2c_br2a_b": op_list[15].inputs[2],
        "c_res2c_br2b_w": op_list[17].inputs[1],
        "s_res2c_br2b_s": scale64,
        "s_res2c_br2b_b": op_list[17].inputs[2],
        "c_res2c_br2c_w": op_list[19].inputs[1],
        "s_res2c_br2c_s": scale256,
        "s_res2c_br2c_b": op_list[19].inputs[2],
        "r_res2a_br2c_r": rescale,
        "r_res2b_br2c_r": rescale,
        "r_res2c_br2c_r": rescale,
        "dynamic_ranges": dyn_const,
    }

    attrs = {
        "plugin_version": "1",
        "plugin_namespace": "",
    }
    attrs.update(plugin_field_dict)
    # replace ops with plugin
    self.graph.RES2PLUGIN("RnRes2FullFusion_TRT", plugin_name, plugin_inp, plugin_out, attrs)

    # graph cleanup
    self.cleanup_graph()

    # done
    logging.info("Plugin {} successful".format(plugin_name))
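# The next snippet assumes tensor0/tensor1/tensor2 were defined earlier; a
# minimal sketch of the missing definitions (the shapes are assumptions):
import numpy as np
import onnx
import onnx_graphsurgeon as gs

tensor0 = gs.Variable(name="tensor0", dtype=np.float32, shape=['B', 3, 64, 64])
tensor1 = gs.Variable(name="tensor1", dtype=np.float32, shape=None)
tensor2 = gs.Variable(name="tensor2", dtype=np.float32, shape=None)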
node0 = gs.Node(name="myIdentity0", op="Identity", inputs=[tensor0], outputs=[tensor1])
node1 = gs.Node(name="myIdentity1", op="Identity", inputs=[tensor1], outputs=[tensor2])

graph = gs.Graph(nodes=[node0, node1], inputs=[tensor0], outputs=[tensor2])
graph.cleanup().toposort()
onnx.save(gs.export_onnx(graph), "model-02-01.onnx")

for node in graph.nodes:
    if node.op == 'Identity' and node.name == 'myIdentity0':  # walk the graph to find where the node needs to be inserted
        constant0 = gs.Constant(name="constant0", values=np.ones(shape=[1, 1, 1, 1], dtype=np.float32))  # build the new node and tensor
        tensor3 = gs.Variable(name="tensor3", dtype=np.float32, shape=None)
        newNode = gs.Node(name="myAdd", op="Add", inputs=[node.outputs[0], constant0], outputs=[tensor3])

        graph.nodes.append(newNode)  # remember to add the new node to the graph
        index = node.o().inputs.index(node.outputs[0])  # carefully locate the matching input tensor of the next node
        node.o().inputs[index] = tensor3  # replace it with the new tensor

graph.cleanup().toposort()
onnx.save(gs.export_onnx(graph), "model-02-02.onnx")
# See the License for the specific language governing permissions and
# limitations under the License.
#

import onnx_graphsurgeon as gs
import numpy as np
import onnx

# Computes outputs = input + ((a + b) + d)
shape = (1, 3)

# Inputs
input = gs.Variable("input", shape=shape, dtype=np.float32)

# Intermediate tensors
a = gs.Constant("a", values=np.ones(shape=shape, dtype=np.float32))
b = gs.Constant("b", values=np.ones(shape=shape, dtype=np.float32))
c = gs.Variable("c")
d = gs.Constant("d", values=np.ones(shape=shape, dtype=np.float32))
e = gs.Variable("e")

# Outputs
output = gs.Variable("output", shape=shape, dtype=np.float32)

nodes = [
    # c = (a + b)
    gs.Node("Add", inputs=[a, b], outputs=[c]),
    # e = (c + d)
    gs.Node("Add", inputs=[c, d], outputs=[e]),
    # output = input + e
    gs.Node("Add", inputs=[input, e], outputs=[output]),
def fuse_br1_br2c(self):
    """
    Match and replace br1+br2c with the fused plugin.
    This fusion is for Conv(shortcut)-Add-ReLU
    """
    logging.info("Fusing ops in br1_br2c path")

    op_names_list = ["res2a_branch1", "res2a_branch2c", "res2a", "res2a_relu"]

    # setup plugin info
    plugin_name = "RES2_BR1_BR2C_1"

    # prep fusion: constants and attributes
    op_dict = dict()
    [op_dict.update({_n.name: _n}) for _n in self.graph.nodes if _n.name in op_names_list]
    op_list = [op_dict[_n] for _n in op_names_list]
    assert len(op_names_list) == len(op_list), "Need to capture all op objects in op_names_list"

    scale = gs.Constant("scale", values=np.ones((256), dtype=np.float32))

    # build array with dynamic ranges required for the fusion plugin
    # NOTE: order matters
    dyn_list = [
        self.dyn_range_map[op_list[0].inputs[0].name],
        self.dyn_range_map[op_list[0].outputs[0].name],
        self.dyn_range_map[op_list[1].inputs[0].name],
        self.dyn_range_map[op_list[1].outputs[0].name],
        self.dyn_range_map[op_list[3].outputs[0].name],
    ]
    dynamic_ranges = np.array(dyn_list, dtype=np.float32)
    dyn_const = gs.Constant("{}_dynamic_ranges".format(plugin_name), values=dynamic_ranges)

    # this becomes attributes to ONNX node that fusion plugin uses
    # NOTE: order does not matter
    plugin_field_dict = {
        "c_br1_w": op_list[0].inputs[1],
        "s_br1_s": scale,
        "s_br1_b": op_list[0].inputs[2],
        "c_br2c_w": op_list[1].inputs[1],
        "s_br2c_s": scale,
        "s_br2c_b": op_list[1].inputs[2],
        "dynamic_ranges": dyn_const,
    }

    attrs = {
        "plugin_version": "2",
        "plugin_namespace": "",
    }
    attrs.update(plugin_field_dict)

    # get plugin input/output
    plugin_inp = [op_list[0].inputs[0], op_list[1].inputs[0]]
    plugin_out = [op_list[-1].outputs[0]]

    # replace ops with plugin
    self.graph.RES2PLUGIN("RnRes2Br1Br2c_TRT", plugin_name, plugin_inp, plugin_out, attrs)

    # graph cleanup
    self.cleanup_graph()

    # done
    logging.info("Plugin {} successful".format(plugin_name))
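# `RES2PLUGIN` used by the fuse_* methods above is a function registered on
# gs.Graph elsewhere in the file; a minimal sketch of how such a registration
# might look (the exact original implementation is not shown in this excerpt):
@gs.Graph.register()
def RES2PLUGIN(self, plugin_op, plugin_name, plugin_in, plugin_out, attrs):
    # Disconnect the output tensors from their current producers so that the
    # plugin node becomes their sole producer; cleanup then drops the old ops.
    for _o in plugin_out:
        _o.inputs.clear()
    return self.layer(op=plugin_op, name=plugin_name,
                      inputs=plugin_in, outputs=plugin_out, attrs=attrs)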