def __init__(self, model, dummy_input):
    model = distiller.make_non_parallel_copy(model)
    with torch.onnx.set_training(model, False):
        trace, _ = jit.get_trace_graph(model, dummy_input.cuda())

        # Let ONNX do the heavy lifting: fusing the convolution nodes; fusing the nodes
        # composing a GEMM operation; etc.
        torch.onnx._optimize_trace(trace, False)

        graph = trace.graph()
        self.ops = {}
        self.params = {}
        self.edges = []
        self.temp = {}

        in_out = list(graph.inputs()) + list(graph.outputs())
        for param in in_out:
            self.__add_param(param)

        for node in graph.nodes():
            new_op = self.__create_op(node)

            # Operators with the same name create very confusing graphs (ResNet, for example),
            # so we "unroll" them.
            # Sometimes operations of different types have the same name, so we differentiate
            # using both name and type (this happens, for example, when an operator is called
            # via some functional API and not via a module).
            same = [op for op in self.ops.values()
                    if op['orig-name'] + op['type'] == new_op['orig-name'] + new_op['type']]
            if len(same) > 0:
                new_op['name'] += "." + str(len(same))

            new_op['name'] = onnx_name_2_pytorch_name(new_op['name'], new_op['type'])
            assert len(new_op['name']) > 0
            self.ops[new_op['name']] = new_op

            for input_ in node.inputs():
                self.__add_input(new_op, input_)
                self.edges.append(SummaryGraph.Edge(input_.uniqueName(), new_op['name']))

            for output in node.outputs():
                self.__add_output(new_op, output)
                self.edges.append(SummaryGraph.Edge(new_op['name'], output.uniqueName()))

            new_op['attrs'] = {attr_name: node[attr_name] for attr_name in node.attributeNames()}

        self.add_macs_attr()
        self.add_footprint_attr()
        self.add_arithmetic_intensity_attr()

    del model
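For reference, `SummaryGraph.Edge` above is just a named (src, dst) pair. A minimal sketch of how such an edge type could be declared (the field names here are an assumption, not taken from the source):

import collections

# Hedged sketch: a lightweight directed edge. src/dst hold either an op's
# display name or a tensor's unique name, so the edge list interleaves ops
# and data nodes.
Edge = collections.namedtuple('Edge', ['src', 'dst'])

e = Edge('input.1', 'conv1')
print(e.src, '->', e.dst)  # input.1 -> conv1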
def __init__(self, model, dummy_input):
    with torch.onnx.set_training(model, False):
        trace, _ = jit.get_trace_graph(model, dummy_input)

        # Let ONNX do the heavy lifting: fusing the convolution nodes; fusing the nodes
        # composing a GEMM operation; etc.
        torch.onnx._optimize_trace(trace, False)

        graph = trace.graph()
        self.ops = []
        self.params = {}
        self.edges = []
        self.temp = {}

        in_out = list(graph.inputs()) + list(graph.outputs())
        for param in in_out:
            self.__add_param(param)

        for node in graph.nodes():
            op = {}
            op['name'] = node.scopeName()
            op['orig-name'] = node.scopeName()
            # NOTE: str.lstrip takes a *set* of characters, not a prefix. This happens to
            # turn 'onnx::Conv' into 'Conv', but it is fragile; splitting on '::' would be safer.
            op['type'] = node.kind().lstrip('::onnx')
            op['inputs'] = []
            op['outputs'] = []
            op['params'] = []

            # In-place operators create very confusing graphs (ResNet, for example),
            # so we "unroll" them
            same = [layer for layer in self.ops if layer['orig-name'] == op['orig-name']]
            if len(same) > 0:
                op['name'] += "." + str(len(same))

            op['name'] = onnx_name_2_pytorch_name(op['name'])
            op['name'] += ("\n" + op['type'])
            self.ops.append(op)

            for input_ in node.inputs():
                self.__add_input(op, input_)
                self.edges.append((input_.uniqueName(), op['name']))

            for output in node.outputs():
                self.__add_output(op, output)
                self.edges.append((op['name'], output.uniqueName()))

            op['attrs'] = {attr_name: node[attr_name] for attr_name in node.attributeNames()}

        self.add_macs_attr()
        self.add_footprint_attr()
        self.add_arithmetic_intensity_attr()
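The "unrolling" above can be demonstrated in isolation. A self-contained sketch of the idea (the helper `unroll_name` is illustrative, not part of the source):

def unroll_name(name, seen_names):
    # Count earlier ops that share this scope name (with or without a suffix)
    # and disambiguate the new one with a ".<count>" suffix.
    same = [n for n in seen_names if n == name or n.startswith(name + '.')]
    return name if not same else name + '.' + str(len(same))

seen = []
for scope in ['layer1/relu', 'layer1/relu', 'layer1/relu']:
    seen.append(unroll_name(scope, seen))
print(seen)  # ['layer1/relu', 'layer1/relu.1', 'layer1/relu.2']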
def __init__(self, model, dummy_input, apply_scope_name_workarounds=True):
    self._src_model = model
    model_clone = distiller.make_non_parallel_copy(model)

    # Switch all instances of torch.nn.ModuleList in the model to our DistillerModuleList.
    # See documentation of the _DistillerModuleList class for details on why this is done.
    model_clone, converted_module_names_map = _to_distiller_modulelist(model_clone)

    with torch.onnx.set_training(model_clone, False):
        device = distiller.model_device(model_clone)
        dummy_input = distiller.convert_tensors_recursively_to(dummy_input, device=device)
        trace, _ = jit.get_trace_graph(model_clone, dummy_input, _force_outplace=True)

        # As of PyTorch 1.1.0, ONNX trace optimization has two issues that result in incorrect
        # scope names of nodes in the trace graph. These can make it impossible, in some cases,
        # to derive the connectivity of the model using the original module names. So we try to
        # detect these cases and apply workarounds.

        # Issue #1:
        # Gemm ops (aka "Linear" / "addmm" / "FC") get the scope name of the last non-Gemm node
        # that came before them. Note that if the node prior to the Gemm node isn't the result
        # of a dedicated module call, then this issue doesn't occur. For simplicity we just
        # track all Gemms.
        # TODO: This should be fixed in PyTorch 1.2.0, revisit when it's released
        aten_addmm_nodes_scope_names = []
        onnx_gemm_count = 0

        # Issue #2:
        # Dropout ops are removed by ONNX trace optimization. However, the op BEFORE the
        # original dropout op gets the scope name of the dropout op.
        pre_dropout_nodes_scope_names = OrderedDict()

        prev_non_dropout_op = None
        for node in trace.graph().nodes():
            kind = node.kind()
            if 'aten' not in kind:
                continue
            if kind == 'aten::dropout':
                if prev_non_dropout_op:
                    pre_dropout_nodes_scope_names[node.scopeName()] = prev_non_dropout_op.scopeName()
            else:
                prev_non_dropout_op = node
                if kind == 'aten::addmm':
                    aten_addmm_nodes_scope_names.append(node.scopeName())

        # Let ONNX do the heavy lifting: fusing the convolution nodes; fusing the nodes
        # composing a GEMM operation; etc.
        torch.onnx._optimize_trace(trace, torch.onnx.OperatorExportTypes.ONNX)

        graph = trace.graph()
        self.ops = OrderedDict()
        self.module_ops_map = defaultdict(list)
        self.params = OrderedDict()
        self.edges = []
        self.temp = OrderedDict()

        in_out = list(graph.inputs()) + list(graph.outputs())
        for param in in_out:
            self.__add_param(param)

        for node in graph.nodes():
            new_op = self.__create_op(node)

            if apply_scope_name_workarounds:
                # Here we apply the workaround to the Gemm nodes scope name issue mentioned above
                if new_op['type'] == 'Gemm':
                    new_op['orig-name'] = aten_addmm_nodes_scope_names[onnx_gemm_count]
                    new_op['name'] = new_op['orig-name']
                    onnx_gemm_count += 1

                # Here we apply the workaround to the issue of the dropout op's scope name
                # overriding the previous op's scope name
                if new_op['name'] in pre_dropout_nodes_scope_names:
                    new_op['orig-name'] = pre_dropout_nodes_scope_names[new_op['name']]
                    new_op['name'] = new_op['orig-name']

            # Convert the graph node's scope name to a PyTorch module name
            module_name = onnx_name_2_pytorch_name(new_op['orig-name'])

            # Get the name from before the conversion to DistillerModuleList
            module_name = converted_module_names_map[module_name]

            if len(module_name) == 0:
                # Special case where the module name is an empty string - this happens
                # when the op is called from the "top-level" of the model
                new_op['name'] = 'top_level_op'
            else:
                new_op['name'] = module_name

            # Save the calling module name in the op dict. Denormalize it so it can
            # be directly matched with the actual model
            module_name = distiller.denormalize_module_name(self._src_model, module_name)
            new_op['module-name'] = module_name

            # The node's scope name in the graph corresponds to the module from which the op
            # was called. This means that when ops are invoked from the same module via
            # functional calls or direct operations on tensors, these ops will have the SAME
            # MODULE NAME associated with them. For example:
            #   t = t1 + t2
            #   t = F.relu(t)
            # In this case the add operation and the ReLU operation will have the same name,
            # which is derived from the module they're contained in.
            #
            # Another case where different ops will have the same module name is when a module
            # is reused:
            #   out = self.conv1(x)
            #   out = self.relu(out)    <=== First use of self.relu
            #   out = self.conv2(out)
            #   out = self.relu(out)    <=== Second use of self.relu
            # In this case the graph will have 2 distinct ReLU nodes, with the same scope name.
            #
            # Operators with the same name create very confusing graphs (in ResNet, for example),
            # so we "unroll" them.
            same_module_cnt = len(self.module_ops_map[module_name])
            if same_module_cnt:
                new_op['name'] += "__" + str(same_module_cnt)
            self.module_ops_map[module_name].append(new_op['name'])

            # Finally we register the new op in the ops collection
            msglogger.debug("new sgraph node - Scope name: {} ; Type: {} ; Display name {}".format(
                new_op['orig-name'], new_op['type'], new_op['name']))
            self.ops[new_op['name']] = new_op

            for input_ in node.inputs():
                self.__add_input(new_op, input_)
                self.edges.append(SummaryGraph.Edge(input_.uniqueName(), new_op['name']))

            for output in node.outputs():
                self.__add_output(new_op, output)
                self.edges.append(SummaryGraph.Edge(new_op['name'], output.uniqueName()))

            new_op['attrs'] = OrderedDict([(attr_name, node[attr_name])
                                           for attr_name in node.attributeNames()])

        self.__merge_pad_avgpool()
        self.add_macs_attr()
        self.add_footprint_attr()
        self.add_arithmetic_intensity_attr()

    del model_clone
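The dropout workaround is easiest to see on stand-in node objects. A self-contained sketch (FakeNode is purely illustrative; the real nodes come from the JIT trace):

from collections import OrderedDict

class FakeNode:
    # Illustrative stand-in for a JIT trace node; only kind() and scopeName() matter here.
    def __init__(self, kind, scope):
        self._kind, self._scope = kind, scope
    def kind(self):
        return self._kind
    def scopeName(self):
        return self._scope

nodes = [FakeNode('aten::relu', 'Net/ReLU[relu]'),
         FakeNode('aten::dropout', 'Net/Dropout[drop]')]

pre_dropout_nodes_scope_names = OrderedDict()
prev_non_dropout_op = None
for node in nodes:
    if node.kind() == 'aten::dropout':
        if prev_non_dropout_op:
            # After trace optimization removes the dropout, the preceding op will
            # carry the dropout's scope name; remember how to map it back.
            pre_dropout_nodes_scope_names[node.scopeName()] = prev_non_dropout_op.scopeName()
    else:
        prev_non_dropout_op = node

print(pre_dropout_nodes_scope_names)
# OrderedDict([('Net/Dropout[drop]', 'Net/ReLU[relu]')])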
def __init__(self, model, dummy_input):
    self._src_model = model
    model_clone = distiller.make_non_parallel_copy(model)
    with torch.onnx.set_training(model_clone, False):
        device = next(model_clone.parameters()).device
        dummy_input = distiller.convert_tensors_recursively_to(dummy_input, device=device)
        trace, _ = jit.get_trace_graph(model_clone, dummy_input, _force_outplace=True)

        # Let ONNX do the heavy lifting: fusing the convolution nodes; fusing the nodes
        # composing a GEMM operation; etc.
        torch.onnx._optimize_trace(trace, torch.onnx.OperatorExportTypes.ONNX)

        graph = trace.graph()
        self.ops = OrderedDict()
        self.params = OrderedDict()
        self.edges = []
        self.temp = OrderedDict()

        in_out = list(graph.inputs()) + list(graph.outputs())
        for param in in_out:
            self.__add_param(param)

        for node in graph.nodes():
            new_op = self.__create_op(node)

            # Operators with the same name create very confusing graphs (ResNet, for example),
            # so we "unroll" them.
            # Sometimes operations of different types have the same name, so we differentiate
            # using both name and type (this happens, for example, when an operator is called
            # via some functional API and not via a module).
            same = [op for op in self.ops.values()
                    if op["orig-name"] + op["type"] == new_op["orig-name"] + new_op["type"]]
            if len(same) > 0:
                new_op["name"] += "." + str(len(same))

            new_op["name"] = onnx_name_2_pytorch_name(new_op["name"], new_op["type"])
            assert len(new_op["name"]) > 0

            if new_op["name"] in self.ops:
                # This is a patch.
                # ONNX names integrate the node type, while we don't (design bug).
                # This means that while parsing the ONNX graph we might find two nodes
                # with the "same" name. This patch increments the instance name, but
                # this may break in the future.
                new_op["name"] = increment_instance(new_op["name"])
            self.ops[new_op["name"]] = new_op

            for input_ in node.inputs():
                self.__add_input(new_op, input_)
                self.edges.append(SummaryGraph.Edge(input_.uniqueName(), new_op["name"]))

            for output in node.outputs():
                self.__add_output(new_op, output)
                self.edges.append(SummaryGraph.Edge(new_op["name"], output.uniqueName()))

            new_op["attrs"] = OrderedDict([(attr_name, node[attr_name])
                                           for attr_name in node.attributeNames()])

        self.add_macs_attr()
        self.add_footprint_attr()
        self.add_arithmetic_intensity_attr()

    del model_clone
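`increment_instance` itself isn't shown in this version. A plausible sketch, assuming instance names carry an optional trailing ".<index>" suffix (this is a guess at the helper's contract, not the actual implementation):

def increment_instance(name):
    # Bump a trailing numeric suffix, or start one at 1 if absent.
    head, _, tail = name.rpartition('.')
    if tail.isdigit():
        return '%s.%d' % (head, int(tail) + 1)
    return name + '.1'

print(increment_instance('layer1.relu'))    # layer1.relu.1
print(increment_instance('layer1.relu.1'))  # layer1.relu.2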
import torch
import torch.nn as nn
import torch.jit as jit


class SimpleNet(nn.Module):
    def __init__(self):
        super(SimpleNet, self).__init__()
        self.conv1 = nn.Conv2d(3, 10, kernel_size=3)

    def forward(self, x):
        x = self.conv1(x)
        return x


net = SimpleNet()
var = torch.rand(1, 3, 224, 224)
trace, out = jit.get_trace_graph(net, var)
print("trace:\n{}".format(trace))
print(out.size())
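Continuing the toy script above, the traced graph can be walked node by node, which is exactly what the SummaryGraph constructors do (this relies on the same pre-1.4 `jit.get_trace_graph` API used above):

graph = trace.graph()
for node in graph.nodes():
    # kind() is the ATen/ONNX op type, e.g. 'aten::_convolution';
    # scopeName() encodes the calling module, e.g. 'SimpleNet/Conv2d[conv1]'
    print(node.kind(), node.scopeName())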
def __init__(self, model, dummy_input, apply_scope_name_workarounds=True):
    self._src_model = model
    self._named_modules = OrderedDict(model.named_modules())
    self._adj_map = None
    self._layers_topological_order = None
    self._top_level_ops = set()
    model_clone = utils.make_non_parallel_copy(model)

    # Switch all instances of torch.nn.ModuleList in the model to our CACPModuleList.
    # See documentation of the _ModuleList class for details on why this is done.
    model_clone, converted_module_names_map = _to_modulelist(model_clone)

    with torch.onnx.set_training(model_clone, False):
        device = utils.model_device(model_clone)
        dummy_input = utils.convert_tensors_recursively_to(dummy_input, device=device)
        self.dummy_input = dummy_input
        trace, _ = jit.get_trace_graph(model_clone, dummy_input, _force_outplace=True)

        # As of PyTorch 1.3.0, ONNX trace optimization has an issue that results in incorrect
        # scope names of nodes in the trace graph. These can make it impossible, in some cases,
        # to derive the connectivity of the model using the original module names. So we try
        # to detect these cases and apply workarounds.
        # The issue:
        # Dropout ops are removed by ONNX trace optimization. However, the op BEFORE the
        # original dropout op gets the scope name of the dropout op.
        pre_dropout_nodes_scope_names = OrderedDict()

        prev_non_dropout_op = None
        for node in trace.graph().nodes():
            kind = node.kind()
            if 'aten' not in kind:
                continue
            if kind == 'aten::dropout':
                if prev_non_dropout_op:
                    pre_dropout_nodes_scope_names[node.scopeName()] = prev_non_dropout_op.scopeName()
            else:
                prev_non_dropout_op = node

        # Let ONNX do the heavy lifting: fusing the convolution nodes; fusing the nodes
        # composing a GEMM operation; etc.
        torch.onnx._optimize_trace(trace, torch.onnx.OperatorExportTypes.ONNX)

        graph = trace.graph()
        self.ops = OrderedDict()
        self.module_ops_map = defaultdict(list)
        self.params = OrderedDict()
        self.edges = []
        self.temp = OrderedDict()

        in_out = list(graph.inputs()) + list(graph.outputs())
        for param in in_out:
            self.__add_param(param)

        for node in graph.nodes():
            new_op = self.__create_op(node)

            if apply_scope_name_workarounds:
                # Here we apply the workaround to the issue of the dropout op's scope name
                # overriding the previous op's scope name
                if new_op['name'] in pre_dropout_nodes_scope_names:
                    new_op['orig-name'] = pre_dropout_nodes_scope_names[new_op['name']]
                    new_op['name'] = new_op['orig-name']

            # Convert the graph node's scope name to a PyTorch module name
            module_name = onnx_name_2_pytorch_name(new_op['orig-name'])

            # Get the name from before the conversion to CACPModuleList
            module_name = converted_module_names_map[module_name]

            if len(module_name) == 0:
                # Special case where the module name is an empty string - this happens
                # when the op is called from the "top-level" of the model
                new_op['name'] = 'top_level_op'
            else:
                new_op['name'] = module_name

            # Save the calling module name in the op dict. Denormalize it so it can
            # be directly matched with the actual model
            module_name = utils.denormalize_module_name(self._src_model, module_name)
            new_op['module-name'] = module_name

            # The node's scope name in the graph corresponds to the module from which the op
            # was called. This means that when ops are invoked from the same module via
            # functional calls or direct operations on tensors, these ops will have the SAME
            # MODULE NAME associated with them. For example:
            #   t = t1 + t2
            #   t = F.relu(t)
            # In this case the add operation and the ReLU operation will have the same name,
            # which is derived from the module they're contained in.
            #
            # Another case where different ops will have the same module name is when a module
            # is reused:
            #   out = self.conv1(x)
            #   out = self.relu(out)    <=== First use of self.relu
            #   out = self.conv2(out)
            #   out = self.relu(out)    <=== Second use of self.relu
            # In this case the graph will have 2 distinct ReLU nodes, with the same scope name.
            #
            # Operators with the same name create very confusing graphs (in ResNet, for example),
            # so we "unroll" them.
            same_module_cnt = len(self.module_ops_map[module_name])
            if same_module_cnt:
                # TODO: Was this meant to be applied only to 'top_level_ops'? Also, it's not
                # applied to the first module that had the same name
                new_op['name'] += "_%s_%d" % (new_op['type'], same_module_cnt)
            self.module_ops_map[module_name].append(new_op['name'])

            # Finally we register the new op in the ops collection
            self.ops[new_op['name']] = new_op

            for input_ in node.inputs():
                self.__add_input(new_op, input_)
                self.edges.append(SummaryGraph.Edge(input_.debugName(), new_op['name']))

            for output in node.outputs():
                self.__add_output(new_op, output)
                self.edges.append(SummaryGraph.Edge(new_op['name'], output.debugName()))

            new_op['attrs'] = OrderedDict([(attr_name, node[attr_name])
                                           for attr_name in node.attributeNames()])

        self.__merge_pad_avgpool()
        self.add_macs_attr()
        self.add_footprint_attr()
        self.add_arithmetic_intensity_attr()

        del trace
        del graph
        del model_clone
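`onnx_name_2_pytorch_name` converts trace scope names into dotted module paths. A minimal sketch of the idea, assuming scope names of the form 'ResNet/Sequential[layer1]/BasicBlock[0]/Conv2d[conv1]' (the real helper may handle more cases):

import re

def onnx_name_2_pytorch_name(name):
    # Keep only the bracketed instance names and join them with dots:
    # 'ResNet/Sequential[layer1]/BasicBlock[0]/Conv2d[conv1]' -> 'layer1.0.conv1'
    return '.'.join(part[1:-1] for part in re.findall(r'\[.*?\]', name))

print(onnx_name_2_pytorch_name('ResNet/Sequential[layer1]/BasicBlock[0]/Conv2d[conv1]'))
# layer1.0.conv1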
def __init__(self, model, dummy_input):
    self._src_model = model
    model_clone = distiller.make_non_parallel_copy(model)
    with torch.onnx.set_training(model_clone, False):
        device = next(model_clone.parameters()).device
        dummy_input = distiller.convert_tensors_recursively_to(dummy_input, device=device)
        trace, _ = jit.get_trace_graph(model_clone, dummy_input, _force_outplace=True)

        # ONNX trace optimization has issues with Gemm ops (aka "Linear" / "addmm" / "FC"),
        # where Gemm nodes get the scope name of the last non-Gemm node that came before them.
        # This can make it impossible, in some cases, to derive the connectivity of the model
        # using the original module names. So we save the scope names for these nodes from the
        # un-optimized trace.
        aten_addmm_nodes_scope_names = [n.scopeName() for n in trace.graph().nodes()
                                        if n.kind() == 'aten::addmm']
        onnx_gemm_count = 0

        # Let ONNX do the heavy lifting: fusing the convolution nodes; fusing the nodes
        # composing a GEMM operation; etc.
        torch.onnx._optimize_trace(trace, torch.onnx.OperatorExportTypes.ONNX)

        graph = trace.graph()
        self.ops = OrderedDict()
        self.module_ops_map = defaultdict(list)
        self.params = OrderedDict()
        self.edges = []
        self.temp = OrderedDict()

        in_out = list(graph.inputs()) + list(graph.outputs())
        for param in in_out:
            self.__add_param(param)

        for node in graph.nodes():
            new_op = self.__create_op(node)

            # Here we apply the workaround to the Gemm nodes scope name issue mentioned above
            if new_op['type'] == 'Gemm':
                new_op['orig-name'] = aten_addmm_nodes_scope_names[onnx_gemm_count]
                new_op['name'] = new_op['orig-name']
                onnx_gemm_count += 1

            # Convert the graph node's scope name to a PyTorch module name
            module_name = onnx_name_2_pytorch_name(new_op['orig-name'])
            new_op['module-name'] = module_name

            if len(module_name) == 0:
                # Special case where the module name is an empty string - this happens
                # when the op is called from the "top-level" of the model
                new_op['name'] = 'top_level_op'
            else:
                new_op['name'] = module_name

            # The node's scope name in the graph corresponds to the module from which the op
            # was called. This means that when ops are invoked from the same module via
            # functional calls or direct operations on tensors, these ops will have the SAME
            # MODULE NAME associated with them. For example:
            #   t = t1 + t2
            #   t = F.relu(t)
            # In this case the add operation and the ReLU operation will have the same name,
            # which is derived from the module they're contained in.
            #
            # Another case where different ops will have the same module name is when a module
            # is reused:
            #   out = self.conv1(x)
            #   out = self.relu(out)    <=== First use of self.relu
            #   out = self.conv2(out)
            #   out = self.relu(out)    <=== Second use of self.relu
            # In this case the graph will have 2 distinct ReLU nodes, with the same scope name.
            #
            # Operators with the same name create very confusing graphs (in ResNet, for example),
            # so we "unroll" them.
            same_module_cnt = len(self.module_ops_map[module_name])
            if same_module_cnt:
                new_op['name'] += "__" + str(same_module_cnt)
            self.module_ops_map[module_name].append(new_op['name'])

            # Finally we register the new op in the ops collection
            msglogger.debug("new sgraph node - Scope name: {} ; Type: {} ; Display name {}".format(
                new_op['orig-name'], new_op['type'], new_op['name']))
            self.ops[new_op['name']] = new_op

            for input_ in node.inputs():
                self.__add_input(new_op, input_)
                self.edges.append(SummaryGraph.Edge(input_.uniqueName(), new_op['name']))

            for output in node.outputs():
                self.__add_output(new_op, output)
                self.edges.append(SummaryGraph.Edge(new_op['name'], output.uniqueName()))

            new_op['attrs'] = OrderedDict([(attr_name, node[attr_name])
                                           for attr_name in node.attributeNames()])

        self.__merge_pad_avgpool()
        self.add_macs_attr()
        self.add_footprint_attr()
        self.add_arithmetic_intensity_attr()

    del model_clone
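A hedged usage sketch: build the graph and read the attributes computed by the final add_*_attr() calls. The 'MACs' attribute key and the torchvision model are assumptions for illustration:

import torch
import torchvision

model = torchvision.models.resnet18()
sg = SummaryGraph(model, torch.randn(1, 3, 224, 224))
for name, op in sg.ops.items():
    # op['attrs'] holds the ONNX node attributes plus the derived compute stats
    print(name, op['type'], op['attrs'].get('MACs', 0))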