def successors_f(self, node_name, successors_types, done_list=None, logging=None):
    """Returns a list of <op>'s successors, if they match the <successors_types> criteria.

    Traverse the graph, starting at node <node_name>, and search for successor
    nodes that have one of the node types listed in <successors_types>.
    If none is found, then return an empty list.

    <node_name> and the returned list of successors are strings (node names).
    """
    node_name = distiller.normalize_module_name(node_name)
    node = self.find_op(node_name)
    node_is_an_op = True
    if node is None:
        node_is_an_op = False
        node = self.find_param(node_name)
        if node is None:
            msglogger.warning("successors_f: Could not find node {}".format(node_name))
            return []

    if done_list is None:
        done_list = []
    done_list.append(node_name)

    if not isinstance(successors_types, list):
        successors_types = [successors_types]

    if node_is_an_op:
        # We check if we found the type of node we're looking for,
        # and that this is not the first node in our search.
        if node['type'] in successors_types and len(done_list) > 1:
            return [distiller.denormalize_module_name(self._src_model, node_name)]

        # This is an operation node
        succs = [edge.dst for edge in self.edges if (edge.src == node_name and
                                                     edge.dst not in done_list)]
    else:
        # This is a data node
        succs = [edge.dst for edge in self.edges if (edge.src == node_name and
                                                     edge.dst not in done_list)]

    ret = []
    for successor in succs:
        ret += self.successors_f(successor, successors_types, done_list, logging)

    return [distiller.denormalize_module_name(self._src_model, node) for node in ret]
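# A minimal usage sketch of successors_f (assumptions: distiller's SummaryGraph and create_model
# are importable, and 'layer1.0.conv1' is a module of the chosen architecture; names are illustrative):
def _example_successors_f_usage():
    import torch
    import distiller
    from distiller.models import create_model

    model = create_model(False, 'cifar10', 'resnet20_cifar', parallel=False)
    sgraph = distiller.SummaryGraph(model, torch.randn(1, 3, 32, 32))
    # Find the BatchNormalization op(s) that consume the output of a convolution
    return sgraph.successors_f('layer1.0.conv1', ['BatchNormalization'])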
def create_thinning_recipe_channels(sgraph, model, zeros_mask_dict):
    """Create a recipe for removing channels from Convolution layers.

    The 4D weights of the model parameters (i.e. the convolution parameters) are
    examined one by one, to determine which have channels that are all zeros.
    For each weights tensor that has at least one zero-channel, we create a
    "thinning recipe".
    The thinning recipe contains meta-instructions of how the model
    should be changed in order to remove the channels.
    """
    msglogger.info("Invoking create_thinning_recipe_channels")

    thinning_recipe = ThinningRecipe(modules={}, parameters={})
    layers = {mod_name: m for mod_name, m in model.named_modules()}

    # Traverse all of the model's parameters, search for zero-channels, and
    # create a thinning recipe that describes the required changes to the model.
    for param_name, param in model.named_parameters():
        # We are only interested in 4D weights (of Convolution layers)
        if param.dim() != 4:
            continue

        num_channels = param.size(1)
        nonzero_channels = find_nonzero_channels(param, param_name)

        # If there are no zero-channels in this tensor, continue to the next tensor
        if num_channels <= len(nonzero_channels):
            continue

        # We are removing channels, so update the number of incoming channels (IFMs)
        # in the convolutional layer
        layer_name = param_name_2_layer_name(param_name)
        assert isinstance(layers[layer_name], torch.nn.modules.Conv2d)
        append_module_directive(thinning_recipe, layer_name, key='in_channels', val=len(nonzero_channels))

        # Select only the non-zero channels
        indices = nonzero_channels.data.squeeze()
        append_param_directive(thinning_recipe, param_name, (1, indices))

        # Find all instances of Convolution layers that immediately precede this layer
        predecessors = sgraph.predecessors_f(normalize_module_name(layer_name), ['Conv'])
        # Convert the layers names to PyTorch's convoluted naming scheme (when DataParallel is used)
        predecessors = [denormalize_module_name(model, predecessor) for predecessor in predecessors]
        for predecessor in predecessors:
            # For each of the convolutional layers that precede, we have to reduce the number of output channels.
            append_module_directive(thinning_recipe, predecessor, key='out_channels', val=len(nonzero_channels))

            # Now remove the corresponding filters from the weights tensor of the predecessor conv
            append_param_directive(thinning_recipe, predecessor + '.weight', (0, indices))

        # Now handle the BatchNormalization layer that precedes the convolution
        bn_layers = sgraph.predecessors_f(normalize_module_name(layer_name), ['BatchNormalization'])
        if len(bn_layers) > 0:
            assert len(bn_layers) == 1
            # Thinning of the BN layer that precedes the convolution
            bn_layer_name = denormalize_module_name(model, bn_layers[0])
            bn_thinning(thinning_recipe, layers, bn_layer_name,
                        len_thin_features=len(nonzero_channels), thin_features=indices)
    return thinning_recipe
def predecessors_f(self, node_name, predecessors_types, done_list=None, logging=None):
    """Returns a list of <op>'s predecessors, if they match the <predecessors_types> criteria."""
    node_name = distiller.normalize_module_name(node_name)
    node = self.find_op(node_name)
    node_is_an_op = True
    if node is None:
        node_is_an_op = False
        node = self.find_param(node_name)
        if node is None:
            msglogger.warning("predecessors_f: Could not find node {}".format(node_name))
            return []

    if done_list is None:
        done_list = []
    done_list.append(node_name)

    if not isinstance(predecessors_types, list):
        predecessors_types = [predecessors_types]

    if node_is_an_op:
        # We check if we found the type of node we're looking for,
        # and that this is not the first node in our search.
        if node['type'] in predecessors_types and len(done_list) > 1:
            return [distiller.denormalize_module_name(self._src_model, node_name)]

        # This is an operation node
        preds = [edge.src for edge in self.edges if (edge.dst == node_name and
                                                     edge.src not in done_list)]
    else:
        # This is a data node
        preds = [edge.src for edge in self.edges if (edge.dst == node_name and
                                                     edge.src not in done_list)]

    ret = []
    for predecessor in preds:
        ret += self.predecessors_f(predecessor, predecessors_types, done_list, logging)

    return [distiller.denormalize_module_name(self._src_model, node) for node in ret]
def successors(self, node, depth, done_list=None, denorm_names=True):
    """Returns a list of <op>'s successors"""
    if done_list is None:
        done_list = []

    node_name = node['name'] if isinstance(node, dict) else node
    succs = [edge.dst for edge in self.edges if (edge.src == node_name and
                                                 edge.dst not in done_list)]
    done_list += succs

    if depth == 1:
        ret = succs
    else:
        ret = []
        for successor in succs:
            ret += self.successors(successor, depth - 1, done_list, denorm_names)

    if denorm_names:
        ret = [distiller.denormalize_module_name(self._src_model, x) for x in ret]
    return ret
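# A minimal sketch of the depth semantics, assuming an already-built SummaryGraph `sgraph`
# for a model that contains a module named 'conv1' (name is illustrative):
def _example_successors_depth(sgraph):
    # depth=1 returns the output tensors produced by 'conv1' (the immediate edge targets);
    # depth=2 skips over those tensors and returns the ops that consume them.
    out_tensors = sgraph.successors('conv1', 1)
    next_ops = sgraph.successors('conv1', 2)
    return out_tensors, next_ops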
def append_module_directive(model, thinning_recipe, module_name, key, val):
    msglogger.debug("\t[recipe] setting {}.{} = {}".format(module_name, key, val))
    module_name = denormalize_module_name(model, module_name)
    mod_directive = thinning_recipe.modules.get(module_name, {})
    mod_directive[key] = val
    thinning_recipe.modules[module_name] = mod_directive
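# A small sketch of the data structure this helper builds, assuming ThinningRecipe is the
# namedtuple used throughout this module (modules={}, parameters={}); the module name and
# values below are illustrative:
def _example_module_directive():
    from collections import namedtuple
    ThinningRecipe = namedtuple('ThinningRecipe', ['modules', 'parameters'])
    recipe = ThinningRecipe(modules={}, parameters={})
    # Two directives on the same module accumulate into a single per-module dict
    recipe.modules.setdefault('features.conv1', {})['in_channels'] = 12
    recipe.modules.setdefault('features.conv1', {})['out_channels'] = 24
    # recipe.modules == {'features.conv1': {'in_channels': 12, 'out_channels': 24}}
    return recipe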
def successors(self, node, depth, done_list=None):
    """Returns a list of <op>'s successors"""
    if done_list is None:
        done_list = []

    if isinstance(node, dict):
        # This is an operation node
        succs = [edge.dst for edge in self.edges if (edge.src == node["name"] and
                                                     edge.dst not in done_list)]
        done_list += succs
    else:
        # This is a data node
        succs = [edge.dst for edge in self.edges if (edge.src == node and
                                                     edge.dst not in done_list)]
        done_list += succs

    if depth == 1:
        ret = succs
    else:
        ret = []
        for successor in succs:
            ret += self.successors(successor, depth - 1, done_list)

    return [distiller.denormalize_module_name(self._src_model, x) for x in ret]
def predecessors(self, op, depth, done_list=None):
    """Returns a list of <op>'s predecessors"""
    if done_list is None:
        done_list = []

    if isinstance(op, dict):
        preds = [edge.src for edge in self.edges if (edge.dst == op["name"] and
                                                     edge.src not in done_list)]
        done_list += preds
    else:
        preds = [edge.src for edge in self.edges if (edge.dst == op and
                                                     edge.src not in done_list)]
        done_list += preds

    if depth == 1:
        ret = preds
    else:
        ret = []
        for predecessor in preds:
            ret += self.predecessors(predecessor, depth - 1, done_list)

    return [distiller.denormalize_module_name(self._src_model, x) for x in ret]
def named_params_layers(self):
    for param_name, param in self._src_model.named_parameters():
        # Remove the extension of param_name, and then normalize it
        # to create a normalized layer name
        normalized_layer_name = distiller.normalize_module_name(
            '.'.join(param_name.split('.')[:-1]))
        sgraph_layer_name = distiller.denormalize_module_name(
            self._src_model, normalized_layer_name)
        yield sgraph_layer_name, param_name, param
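# A minimal usage sketch, assuming `sgraph` is an already-built SummaryGraph: each parameter is
# yielded together with the (de-normalized) name of the layer that owns it (names are illustrative).
def _example_named_params_layers(sgraph):
    for layer_name, param_name, param in sgraph.named_params_layers():
        # e.g. ('features.conv1', 'features.conv1.weight', <Parameter ...>)
        print(layer_name, param_name, tuple(param.shape))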
def name_test(dataset, arch):
    model = create_model(False, dataset, arch, parallel=False)
    modelp = create_model(False, dataset, arch, parallel=True)
    assert model is not None and modelp is not None

    mod_names = [mod_name for mod_name, _ in model.named_modules()]
    mod_names_p = [mod_name for mod_name, _ in modelp.named_modules()]
    assert mod_names is not None and mod_names_p is not None
    assert len(mod_names) + 1 == len(mod_names_p)

    for i in range(len(mod_names) - 1):
        assert mod_names[i + 1] == normalize_module_name(mod_names_p[i + 2])
        logging.debug("{} {} {}".format(mod_names_p[i + 2],
                                        mod_names[i + 1],
                                        normalize_module_name(mod_names_p[i + 2])))
        assert mod_names_p[i + 2] == denormalize_module_name(modelp, mod_names[i + 1])
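# A short sketch of the naming convention the test above relies on: DataParallel prepends
# "module." to sub-module names, and normalization strips it so that graph node names can be
# matched against the plain (non-parallel) model. The example names are illustrative.
def _example_name_normalization():
    import distiller
    assert distiller.normalize_module_name('module.features.0') == 'features.0'
    assert distiller.normalize_module_name('features.0') == 'features.0'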
def adjacency_map(self, dedicated_modules_only=False):
    """Returns a mapping from each op in the graph to its immediate predecessors and successors.

    The keys in the generated mapping are op names, and the values are instances of AdjacentsEntry.

    The op names are "de-normalized", meaning they can be used directly with the underlying
    model's named_modules(), for example.

    Args:
        dedicated_modules_only (bool): If set, the generated mapping will not include any ops
          that can't be associated with a dedicated module within the underlying model.
          Examples of this will be functional calls, such as "F.relu()", and tensor operations,
          such as "t3 = t1 + t2".
    """
    adj_map = OrderedDict()

    for op_name, op in self.ops.items():
        def dedicated_module_check(n):
            module_name = self.ops[distiller.normalize_module_name(n)]['module-name']
            return len(self.module_ops_map[module_name]) == 1 or not dedicated_modules_only

        if not dedicated_module_check(op_name):
            continue

        entry = AdjacentsEntry()
        # Find the immediate preceding and succeeding modules. Depth of 1 gets us the
        # input and output tensors, depth of 2 gets the actual modules
        entry.predecessors = [n for n in self.predecessors(op, 2) if dedicated_module_check(n)]
        entry.successors = [n for n in self.successors(op, 2) if dedicated_module_check(n)]

        adj_map[distiller.denormalize_module_name(self._src_model, op_name)] = entry
    return adj_map
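# A minimal usage sketch, assuming `sgraph` is an already-built SummaryGraph and 'layer1.0.conv1'
# is a module in the underlying model (illustrative name):
def _example_adjacency_map(sgraph):
    adj_map = sgraph.adjacency_map(dedicated_modules_only=True)
    entry = adj_map['layer1.0.conv1']
    # Each entry lists the ops feeding this conv and the ops consuming its output
    return entry.predecessors, entry.successors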
def append_module_directive(model, thinning_recipe, module_name, key, val):
    module_name = denormalize_module_name(model, module_name)
    mod_directive = thinning_recipe.modules.get(module_name, {})
    mod_directive[key] = val
    thinning_recipe.modules[module_name] = mod_directive
def create_thinning_recipe_filters(sgraph, model, zeros_mask_dict):
    """Create a recipe for removing filters from Convolution layers.

    The 4D weights of the model parameters (i.e. the convolution parameters) are
    examined one by one, to determine which have filters that are all zeros.
    For each weights tensor that has at least one zero-filter, we create a
    "thinning recipe".
    The thinning recipe contains meta-instructions of how the model
    should be changed in order to remove the filters.
    """
    msglogger.info("Invoking create_thinning_recipe_filters")

    thinning_recipe = ThinningRecipe(modules={}, parameters={})
    layers = {mod_name: m for mod_name, m in model.named_modules()}

    for param_name, param in model.named_parameters():
        # We are only interested in 4D weights
        if param.dim() != 4:
            continue

        # Find the number of zero-valued filters in this weights tensor
        filter_view = param.view(param.size(0), -1)
        num_filters = filter_view.size()[0]
        nonzero_filters = torch.nonzero(filter_view.abs().sum(dim=1))
        num_nnz_filters = nonzero_filters.nelement()
        if num_nnz_filters == 0:
            raise ValueError("Trying to set zero filters for parameter %s is not allowed" % param_name)

        # If there are no zero-filters in this tensor, continue to the next tensor
        if num_filters <= num_nnz_filters:
            msglogger.debug("Skipping {} shape={}".format(param_name_2_layer_name(param_name), param.shape))
            continue

        msglogger.info("In tensor %s found %d/%d zero filters", param_name,
                       num_filters - num_nnz_filters, num_filters)

        # We are removing filters, so update the number of outgoing channels (OFMs)
        # in the convolutional layer
        layer_name = param_name_2_layer_name(param_name)
        assert isinstance(layers[layer_name], torch.nn.modules.Conv2d)
        append_module_directive(model, thinning_recipe, layer_name, key='out_channels', val=num_nnz_filters)

        # Select only the non-zero filters
        indices = nonzero_filters.data.squeeze()
        append_param_directive(thinning_recipe, param_name, (0, indices))

        if layers[layer_name].bias is not None:
            # This convolution has bias coefficients
            append_param_directive(thinning_recipe, layer_name + '.bias', (0, indices))

        # Find all instances of Convolution or FC (GEMM) layers that immediately follow this layer
        successors = sgraph.successors_f(normalize_module_name(layer_name), ['Conv', 'Gemm'])
        # Convert the layers names to PyTorch's convoluted naming scheme (when DataParallel is used)
        successors = [denormalize_module_name(model, successor) for successor in successors]
        for successor in successors:
            if isinstance(layers[successor], torch.nn.modules.Conv2d):
                # For each of the convolutional layers that follow, we have to reduce the number of input channels.
                append_module_directive(model, thinning_recipe, successor, key='in_channels', val=num_nnz_filters)
                msglogger.debug("[recipe] {}: setting in_channels = {}".format(successor, num_nnz_filters))

                # Now remove channels from the weights tensor of the successor conv
                append_param_directive(thinning_recipe,
                                       denormalize_module_name(model, successor) + '.weight',
                                       (1, indices))

            elif isinstance(layers[successor], torch.nn.modules.Linear):
                # If a Linear (Fully-Connected) layer follows, we need to update its in_features member
                fm_size = layers[successor].in_features // layers[layer_name].out_channels
                in_features = fm_size * num_nnz_filters
                append_module_directive(model, thinning_recipe, successor, key='in_features', val=in_features)
                msglogger.debug("[recipe] {}: setting in_features = {}".format(successor, in_features))

                # Now remove channels from the weights tensor of the successor FC layer:
                # This is a bit tricky:
                fm_height = fm_width = int(math.sqrt(fm_size))
                view_4D = (layers[successor].out_features, layers[layer_name].out_channels, fm_height, fm_width)
                view_2D = (layers[successor].out_features, in_features)
                append_param_directive(thinning_recipe,
                                       denormalize_module_name(model, successor) + '.weight',
                                       (1, indices, view_4D, view_2D))

        # Now handle the BatchNormalization layer that follows the convolution
        bn_layers = sgraph.successors_f(normalize_module_name(layer_name), ['BatchNormalization'])
        if len(bn_layers) > 0:
            assert len(bn_layers) == 1
            # Thinning of the BN layer that follows the convolution
            bn_layer_name = denormalize_module_name(model, bn_layers[0])
            bn_thinning(thinning_recipe, layers, bn_layer_name,
                        len_thin_features=num_nnz_filters, thin_features=indices)
    return thinning_recipe
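# A worked numeric sketch of the "tricky" Linear-layer case above (all shapes are illustrative):
# suppose the last conv has out_channels=64, each feature map reaching the FC layer is 4x4, and
# the FC layer therefore has in_features = 64 * 4 * 4 = 1024. If only 48 filters remain non-zero:
def _example_fc_thinning_views():
    out_channels, fm_height, fm_width = 64, 4, 4
    out_features, num_nnz_filters = 10, 48
    fm_size = fm_height * fm_width                  # 16 == in_features // out_channels
    in_features = fm_size * num_nnz_filters         # 768
    # The FC weight (10, 1024) is first viewed as (10, 64, 4, 4) so that whole channels can be
    # selected along dim 1, then viewed back as a 2D matrix of shape (10, 768).
    view_4D = (out_features, out_channels, fm_height, fm_width)
    view_2D = (out_features, in_features)
    return view_4D, view_2D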
def __init__(self, model, dummy_input, apply_scope_name_workarounds=True):
    self._src_model = model
    model_clone = distiller.make_non_parallel_copy(model)

    # Switch all instances of torch.nn.ModuleList in the model to our DistillerModuleList
    # See documentation of _DistillerModuleList class for details on why this is done
    model_clone, converted_module_names_map = _to_distiller_modulelist(model_clone)

    with torch.onnx.set_training(model_clone, False):
        device = distiller.model_device(model_clone)
        dummy_input = distiller.convert_tensors_recursively_to(dummy_input, device=device)
        trace, _ = jit.get_trace_graph(model_clone, dummy_input, _force_outplace=True)

        # As of PyTorch 1.1.0, ONNX trace optimization has two issues that result in incorrect
        # scope names of nodes in the trace graph. These can make it impossible, in some cases,
        # to derive the connectivity of the model using the original module names. So we try to
        # detect these cases and apply workarounds.

        # Issue #1:
        #   Gemm ops (aka "Linear" / "addmm" / "FC") get the scope name of the last non-Gemm node
        #   that came before them.
        #   Note that if the node prior to the Gemm node isn't the result of a dedicated module
        #   call, then this issue doesn't occur. For simplicity we just track all Gemms.
        # TODO: This should be fixed in PyTorch 1.2.0, revisit when it's released
        aten_addmm_nodes_scope_names = []
        onnx_gemm_count = 0

        # Issue #2:
        #   Dropout ops are removed by ONNX trace optimization. However, the op BEFORE the
        #   original dropout op gets the scope name of the dropout op
        pre_dropout_nodes_scope_names = OrderedDict()

        prev_non_dropout_op = None
        for node in trace.graph().nodes():
            kind = node.kind()
            if 'aten' not in kind:
                continue
            if kind == 'aten::dropout':
                if prev_non_dropout_op:
                    pre_dropout_nodes_scope_names[node.scopeName()] = prev_non_dropout_op.scopeName()
            else:
                prev_non_dropout_op = node
                if kind == 'aten::addmm':
                    aten_addmm_nodes_scope_names.append(node.scopeName())

        # Let ONNX do the heavy lifting: fusing the convolution nodes; fusing the nodes
        # composing a GEMM operation; etc.
        torch.onnx._optimize_trace(trace, torch.onnx.OperatorExportTypes.ONNX)

        graph = trace.graph()
        self.ops = OrderedDict()
        self.module_ops_map = defaultdict(list)
        self.params = OrderedDict()
        self.edges = []
        self.temp = OrderedDict()

        in_out = list(graph.inputs()) + list(graph.outputs())
        for param in in_out:
            self.__add_param(param)

        for node in graph.nodes():
            new_op = self.__create_op(node)

            if apply_scope_name_workarounds:
                # Here we apply the workaround to the Gemm nodes scope name issue mentioned above
                if new_op['type'] == 'Gemm':
                    new_op['orig-name'] = aten_addmm_nodes_scope_names[onnx_gemm_count]
                    new_op['name'] = new_op['orig-name']
                    onnx_gemm_count += 1

                # Here we apply the workaround to the issue of dropout op scope name overriding
                # previous op's scope name
                if new_op['name'] in pre_dropout_nodes_scope_names:
                    new_op['orig-name'] = pre_dropout_nodes_scope_names[new_op['name']]
                    new_op['name'] = new_op['orig-name']

            # Convert the graph node's scope name to a PyTorch module name
            module_name = onnx_name_2_pytorch_name(new_op['orig-name'])

            # Get name from before conversion to DistillerModuleList
            module_name = converted_module_names_map[module_name]

            if len(module_name) == 0:
                # Special case where the module name is an empty string - this happens
                # when the op is called from the "top-level" of the model
                new_op['name'] = 'top_level_op'
            else:
                new_op['name'] = module_name

            # Save the calling module name in the op dict. Denormalize it so it can
            # be directly matched with the actual model
            module_name = distiller.denormalize_module_name(self._src_model, module_name)
            new_op['module-name'] = module_name

            # The node's scope name in the graph corresponds to the module from which the op was called.
            # This means that when ops are invoked from the same module via functional calls or direct
            # operations on tensors, these ops will have the SAME MODEL NAME associated with them.
            # For example:
            #   t = t1 + t2
            #   t = F.relu(t)
            # In this case the add operation and the ReLU operation will have the same name, which is
            # derived from the module they're contained in.
            #
            # Another case where different ops will have the same module name is when a module is reused:
            #   out = self.conv1(x)
            #   out = self.relu(out)    <=== First use of self.relu
            #   out = self.conv2(out)
            #   out = self.relu(out)    <=== Second use of self.relu
            # In this case the graph will have 2 distinct ReLU nodes, with the same scope name.
            #
            # Operators with the same name create very confusing graphs (in ResNet, for example),
            # so we "unroll" them.
            same_module_cnt = len(self.module_ops_map[module_name])
            if same_module_cnt:
                new_op['name'] += "__" + str(same_module_cnt)
            self.module_ops_map[module_name].append(new_op['name'])

            # Finally we register the new op in the ops collection
            msglogger.debug("new sgraph node - Scope name: {} ; Type: {} ; Display name {}".format(
                new_op['orig-name'], new_op['type'], new_op['name']))
            self.ops[new_op['name']] = new_op

            for input_ in node.inputs():
                self.__add_input(new_op, input_)
                self.edges.append(SummaryGraph.Edge(input_.uniqueName(), new_op['name']))

            for output in node.outputs():
                self.__add_output(new_op, output)
                self.edges.append(SummaryGraph.Edge(new_op['name'], output.uniqueName()))

            new_op['attrs'] = OrderedDict([(attr_name, node[attr_name])
                                           for attr_name in node.attributeNames()])

    self.__merge_pad_avgpool()
    self.add_macs_attr()
    self.add_footprint_attr()
    self.add_arithmetic_intensity_attr()
    del model_clone
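# A short sketch of the op-name "unrolling" described above, assuming a toy model that reuses a
# single ReLU module twice (illustrative; the exact op names depend on the traced graph):
def _example_reused_module_unrolling():
    import torch.nn as nn

    class TwoConvOneRelu(nn.Module):
        def __init__(self):
            super().__init__()
            self.conv1 = nn.Conv2d(3, 8, 3, padding=1)
            self.conv2 = nn.Conv2d(8, 8, 3, padding=1)
            self.relu = nn.ReLU()

        def forward(self, x):
            # self.relu is invoked twice, so the trace contains two ReLU nodes with the same
            # scope name; SummaryGraph renames the second occurrence to 'relu__1'.
            return self.relu(self.conv2(self.relu(self.conv1(x))))

    return TwoConvOneRelu()

# sgraph = SummaryGraph(_example_reused_module_unrolling(), torch.randn(1, 3, 32, 32))
# would then contain both 'relu' and 'relu__1' in sgraph.ops (keyed by the unrolled names).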
def op_meta(n):
    return OpSimpleMetadata(distiller.denormalize_module_name(self._src_model, n),
                            self.ops[n]['type'])
def create_thinning_recipe_channels(sgraph, model, zeros_mask_dict):
    """Create a recipe for removing channels from Convolution layers.

    The 4D weights of the model parameters (i.e. the convolution parameters) are
    examined one by one, to determine which have channels that are all zeros.
    For each weights tensor that has at least one zero-channel, we create a
    "thinning recipe".
    The thinning recipe contains meta-instructions of how the model
    should be changed in order to remove the channels.
    """
    msglogger.info("Invoking create_thinning_recipe_channels")

    thinning_recipe = ThinningRecipe(modules={}, parameters={})
    layers = {mod_name: m for mod_name, m in model.named_modules()}

    # Traverse all of the model's parameters, search for zero-channels, and
    # create a thinning recipe that describes the required changes to the model.
    for param_name, param in model.named_parameters():
        # We are only interested in 4D weights (of Convolution layers)
        if param.dim() != 4:
            continue

        num_channels = param.size(1)
        nonzero_channels = find_nonzero_channels(param, param_name)
        num_nnz_channels = nonzero_channels.nelement()
        if num_nnz_channels == 0:
            raise ValueError("Trying to set zero channels for parameter %s is not allowed" % param_name)

        # If there are no zero-channels in this tensor, continue to the next tensor
        if num_channels <= num_nnz_channels:
            continue

        # We are removing channels, so update the number of incoming channels (IFMs)
        # in the convolutional layer
        layer_name = param_name_2_layer_name(param_name)
        assert isinstance(layers[layer_name], torch.nn.modules.Conv2d)
        append_module_directive(model, thinning_recipe, layer_name, key='in_channels', val=num_nnz_channels)

        # Select only the non-zero channels
        indices = nonzero_channels.data.squeeze()
        append_param_directive(thinning_recipe, param_name, (1, indices))

        # Find all instances of Convolution layers that immediately precede this layer
        predecessors = sgraph.predecessors_f(normalize_module_name(layer_name), ['Conv'])
        # Normalize the predecessor names; they are de-normalized again below, when the directives
        # are created (to match PyTorch's convoluted naming scheme when DataParallel is used)
        predecessors = [normalize_module_name(predecessor) for predecessor in predecessors]
        if len(predecessors) == 0:
            msglogger.info("Could not find predecessors for name={} normal={} {}".format(
                layer_name, normalize_module_name(layer_name),
                denormalize_module_name(model, layer_name)))
        for predecessor in predecessors:
            # For each of the convolutional layers that precede, we have to reduce the number of output channels.
            append_module_directive(model, thinning_recipe, predecessor, key='out_channels', val=num_nnz_channels)

            # Now remove the corresponding filters from the weights tensor of the predecessor conv
            append_param_directive(thinning_recipe,
                                   denormalize_module_name(model, predecessor) + '.weight',
                                   (0, indices))

            if layers[denormalize_module_name(model, predecessor)].bias is not None:
                # This convolution has bias coefficients
                append_param_directive(thinning_recipe,
                                       denormalize_module_name(model, predecessor) + '.bias',
                                       (0, indices))

        # Now handle the BatchNormalization layer that precedes the convolution
        bn_layers = sgraph.predecessors_f(normalize_module_name(layer_name), ['BatchNormalization'])
        if len(bn_layers) > 0:
            # if len(bn_layers) != 1:
            #     raise RuntimeError("{} should have exactly one BN predecessor, but has {}".format(
            #         layer_name, len(bn_layers)))
            for bn_layer in bn_layers:
                # Thinning of the BN layer that precedes the convolution
                bn_layer_name = denormalize_module_name(model, bn_layer)
                msglogger.debug("[recipe] {}: predecessor BN module = {}".format(layer_name, bn_layer_name))
                append_bn_thinning_directive(thinning_recipe, layers, bn_layer_name,
                                             len_thin_features=num_nnz_channels, thin_features=indices)
    msglogger.debug(thinning_recipe)
    return thinning_recipe
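# A minimal usage sketch for the recipe builder above, assuming a model whose convolution weights
# already have entire input channels zeroed out, and an ImageNet-style 3x224x224 dummy input
# (illustrative). Note that zeros_mask_dict is not consulted while building the recipe here, so an
# empty dict suffices for this illustration:
def _example_create_channels_recipe(model):
    import torch
    import distiller
    sgraph = distiller.SummaryGraph(model, torch.randn(1, 3, 224, 224))
    recipe = create_thinning_recipe_channels(sgraph, model, zeros_mask_dict={})
    # recipe.modules maps layer names to attribute changes (e.g. {'in_channels': 12}),
    # and recipe.parameters maps parameter names to (dim, indices) selections.
    return recipe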
def create_thinning_recipe_filters(sgraph, model, zeros_mask_dict):
    """Create a recipe for removing filters from Convolution layers.

    The 4D weights of the model parameters (i.e. the convolution parameters) are
    examined one by one, to determine which have filters that are all zeros.
    For each weights tensor that has at least one zero-filter, we create a
    "thinning recipe".
    The thinning recipe contains meta-instructions of how the model
    should be changed in order to remove the filters.
    """
    msglogger.info("Invoking create_thinning_recipe_filters")
    msglogger.debug(sgraph.ops.keys())

    thinning_recipe = ThinningRecipe(modules={}, parameters={})
    layers = {mod_name: m for mod_name, m in model.named_modules()}

    # log 2018-09-19 CKH:
    # If a 1x1 conv feeds a depthwise 3x3 conv and both are filter-pruned, then during thinning the
    # dwconv3x3 would be thinned twice: once according to its predecessor's recipe and once according
    # to its own. But a depthwise 3x3 effectively has a single channel dimension (e.g. a 32x32
    # dwconv3x3 has a parameter of shape [32, 1, 3, 3]), so it cannot be thinned twice: when thinning
    # according to the predecessor, removing a filter removes the input and the output channel at the
    # same time, because there is only a single connection between them. Therefore, in this case we do
    # not thin the dwconv3x3 itself; thinning the 1x1 conv is enough to achieve pruning of the 3x3 filters.
    for param_name, param in model.named_parameters():
        # We are only interested in 4D weights
        if param.dim() != 4:
            continue

        # Find the number of zero-valued filters in this weights tensor
        filter_view = param.view(param.size(0), -1)
        num_filters = filter_view.size()[0]
        nonzero_filters = torch.nonzero(filter_view.abs().sum(dim=1))
        num_nnz_filters = nonzero_filters.nelement()
        if num_nnz_filters == 0:
            raise ValueError("Trying to set zero filters for parameter %s is not allowed" % param_name)

        # If there are no zero-filters in this tensor, continue to the next tensor
        if num_filters <= num_nnz_filters:
            msglogger.debug("Skipping {} shape={}".format(param_name_2_layer_name(param_name), param.shape))
            continue

        msglogger.info("In tensor %s found %d/%d zero filters", param_name,
                       num_filters - num_nnz_filters, num_filters)

        # We are removing filters, so update the number of outgoing channels (OFMs)
        # in the convolutional layer
        layer_name = param_name_2_layer_name(param_name)
        assert isinstance(layers[layer_name], torch.nn.modules.Conv2d)
        # Change the layer's out_channels in the architecture
        append_module_directive(model, thinning_recipe, layer_name, key='out_channels', val=num_nnz_filters)

        # Select only the non-zero filters
        indices = nonzero_filters.data.squeeze()
        # Change the dimensions of the parameter tensor
        append_param_directive(thinning_recipe, param_name, (0, indices))

        if layers[layer_name].bias is not None:
            # This convolution has bias coefficients
            append_param_directive(thinning_recipe, layer_name + '.bias', (0, indices))

        # Find all instances of Convolution or FC (GEMM) layers that immediately follow this layer
        msglogger.debug("{} => {}".format(layer_name, normalize_module_name(layer_name)))
        # Add the type name before calling successors_f  2018-09-19 CKH
        norm_module_name = normalize_module_name(layer_name)
        # We could also use isinstance(layers[successor], torch.nn.modules.Conv2d)
        # to determine the module type  2018-09-19 CKH
        if isinstance(layers[norm_module_name], torch.nn.modules.Conv2d):
            norm_module_name += '.Conv'
        successors = sgraph.successors_f(norm_module_name, ['Conv', 'Gemm'])
        # Convert the layers names to PyTorch's convoluted naming scheme (when DataParallel is used)
        successors = ['.'.join(succs.split('.')[0:-1]) for succs in successors]
        successors = [denormalize_module_name(model, successor) for successor in successors]
        for successor in successors:
            if isinstance(layers[successor], torch.nn.modules.Conv2d):
                # If the successor is a depthwise conv, we also need to change its out_channels
                # (architecture only), and change the in_channels of the nodes that follow it
                # (both architecture and parameter dimensions). For now we assume that a depthwise
                # conv is never immediately followed by another depthwise conv.  2018-09-19 CKH
                successor_norm_module_name = normalize_module_name(successor)
                if isinstance(layers[successor_norm_module_name], torch.nn.modules.Conv2d):
                    successor_norm_module_name += '.Conv'
                if layers[successor].groups == layers[successor].in_channels:
                    append_module_directive(model, thinning_recipe, successor, key='in_channels', val=num_nnz_filters)
                    append_module_directive(model, thinning_recipe, successor, key='out_channels', val=num_nnz_filters)
                    layers[successor].groups = num_nnz_filters
                    msglogger.debug("[recipe] {}: setting in_channels = {}".format(successor, num_nnz_filters))

                    # Now remove channels from the weights tensor of the successor conv
                    append_param_directive(thinning_recipe,
                                           denormalize_module_name(model, successor) + '.weight',
                                           (0, indices))

                    # For a depthwise 3x3 that follows a 1x1 conv: once the 1x1's output channels are
                    # pruned, we must (1) prune the dwconv's input channels, (2) prune its output
                    # channels and (3) prune its bias. All three use the same mask as the 1x1 conv,
                    # i.e. whenever the 1x1's output channels change, the depthwise 3x3 that follows
                    # must change with them.  2018-09-20 CKH
                    if layers[successor].bias is not None:
                        # This convolution has bias coefficients
                        append_param_directive(thinning_recipe, successor + '.bias', (0, indices))

                    successors2 = sgraph.successors_f(successor_norm_module_name, ['Conv', 'Gemm'])
                    # Convert the layers names to PyTorch's convoluted naming scheme (when DataParallel is used)
                    successors2 = ['.'.join(succs.split('.')[0:-1]) for succs in successors2]
                    successors2 = [denormalize_module_name(model, successor2) for successor2 in successors2]
                    for successor2 in successors2:
                        if isinstance(layers[successor2], torch.nn.modules.Conv2d):
                            append_module_directive(model, thinning_recipe, successor2,
                                                    key='in_channels', val=num_nnz_filters)
                            msglogger.debug("[recipe] {}: setting in_channels = {}".format(
                                successor2, num_nnz_filters))
                            append_param_directive(thinning_recipe,
                                                   denormalize_module_name(model, successor2) + '.weight',
                                                   (1, indices))
                else:
                    # For each of the convolutional layers that follow, we have to reduce the number of input channels.
                    append_module_directive(model, thinning_recipe, successor, key='in_channels', val=num_nnz_filters)
                    msglogger.debug("[recipe] {}: setting in_channels = {}".format(successor, num_nnz_filters))

                    # Now remove channels from the weights tensor of the successor conv
                    append_param_directive(thinning_recipe,
                                           denormalize_module_name(model, successor) + '.weight',
                                           (1, indices))

            elif isinstance(layers[successor], torch.nn.modules.Linear):
                # If a Linear (Fully-Connected) layer follows, we need to update its in_features member
                fm_size = layers[successor].in_features // layers[layer_name].out_channels
                in_features = fm_size * num_nnz_filters
                append_module_directive(model, thinning_recipe, successor, key='in_features', val=in_features)
                msglogger.debug("[recipe] {}: setting in_features = {}".format(successor, in_features))

                # Now remove channels from the weights tensor of the successor FC layer:
                # This is a bit tricky:
                fm_height = fm_width = int(math.sqrt(fm_size))
                view_4D = (layers[successor].out_features, layers[layer_name].out_channels, fm_height, fm_width)
                view_2D = (layers[successor].out_features, in_features)
                append_param_directive(thinning_recipe,
                                       denormalize_module_name(model, successor) + '.weight',
                                       (1, indices, view_4D, view_2D))

        # Now handle the BatchNormalization layer that follows the convolution
        bn_layers = sgraph.successors_f(normalize_module_name(layer_name), ['BatchNormalization'])
        if len(bn_layers) > 0:
            assert len(bn_layers) == 1
            # Thinning of the BN layer that follows the convolution
            bn_layer_name = denormalize_module_name(model, bn_layers[0])
            bn_thinning(thinning_recipe, layers, bn_layer_name,
                        len_thin_features=num_nnz_filters, thin_features=indices)
    return thinning_recipe
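# A small sketch of why the depthwise case above shares one mask between both channel dimensions
# (shapes are illustrative): a depthwise Conv2d with groups == in_channels has a weight of shape
# [out_channels, 1, k, k], so selecting filters along dim 0 removes the matching input and output
# channel at the same time.
def _example_depthwise_weight_shapes():
    import torch.nn as nn
    pointwise = nn.Conv2d(16, 32, kernel_size=1)
    depthwise = nn.Conv2d(32, 32, kernel_size=3, padding=1, groups=32)
    assert tuple(pointwise.weight.shape) == (32, 16, 1, 1)
    assert tuple(depthwise.weight.shape) == (32, 1, 3, 3)
    # Pruning output channels of `pointwise` therefore forces pruning `depthwise` along dim 0
    # with the same indices (and its bias), rather than thinning it twice.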