def test_ranked_channel_pruning():
    model, zeros_mask_dict = common.setup_test("resnet20_cifar", "cifar10", parallel=False)

    # Test that we can access the weights tensor of the first convolution in layer 1
    conv1_p = distiller.model_find_param(model, "layer1.0.conv1.weight")
    assert conv1_p is not None

    # Test that there are no zero-channels
    assert distiller.sparsity_ch(conv1_p) == 0.0

    # Create a channel-ranking pruner
    pruner = distiller.pruning.L1RankedStructureParameterPruner(
        "channel_pruner",
        group_type="Channels",
        desired_sparsity=0.1,
        weights="layer1.0.conv1.weight")
    pruner.set_param_mask(conv1_p, "layer1.0.conv1.weight", zeros_mask_dict, meta=None)

    conv1 = common.find_module_by_name(model, "layer1.0.conv1")
    assert conv1 is not None

    # Test that the mask has the correct fraction of channels pruned.
    # We asked for 10%, but there are only 16 channels, so we have to settle for 1/16 channels
    logger.info("layer1.0.conv1 = {}".format(conv1))
    expected_pruning = int(0.1 * conv1.in_channels) / conv1.in_channels
    assert distiller.sparsity_ch(zeros_mask_dict["layer1.0.conv1.weight"].mask) == expected_pruning

    # Use the mask to prune
    assert distiller.sparsity_ch(conv1_p) == 0
    zeros_mask_dict["layer1.0.conv1.weight"].apply_mask(conv1_p)
    assert distiller.sparsity_ch(conv1_p) == expected_pruning

    # Remove channels (and filters)
    conv0 = common.find_module_by_name(model, "conv1")
    assert conv0 is not None
    assert conv0.out_channels == 16
    assert conv1.in_channels == 16

    # Test thinning
    input_shape = tuple(distiller.apputils.classification_get_input_shape("cifar10"))
    distiller.remove_channels(model, zeros_mask_dict, input_shape, optimizer=None)
    assert conv0.out_channels == 15
    assert conv1.in_channels == 15
def rank_and_prune_channels(fraction_to_prune, param, param_name=None,
                            zeros_mask_dict=None, model=None, binary_map=None,
                            magnitude_fn=distiller.norms.l1_norm, noise=0.0,
                            group_size=1, rounding_fn=math.floor):
    if binary_map is None:
        bottomk_channels, channel_mags = distiller.norms.rank_channels(
            param, group_size, magnitude_fn, fraction_to_prune, rounding_fn, noise)
        if bottomk_channels is None:
            # Empty list means that fraction_to_prune is too low to prune anything
            return

        threshold = bottomk_channels[-1]
        binary_map = channel_mags.gt(threshold).type(param.data.type())

    if zeros_mask_dict is not None:
        mask, _ = distiller.thresholding.expand_binary_map(param, 'Channels', binary_map)
        zeros_mask_dict[param_name].mask = mask
        msglogger.info("%sRankedStructureParameterPruner - param: %s pruned=%.3f goal=%.3f (%d/%d)",
                       magnitude_fn, param_name,
                       distiller.sparsity_ch(zeros_mask_dict[param_name].mask),
                       fraction_to_prune,
                       binary_map.sum().item(), param.size(1))
    return binary_map
def rank_prune_channels(self, fraction_to_prune, param, param_name, zeros_mask_dict):
    bottomk_channels, channel_mags = self.rank_channels(fraction_to_prune, param)
    if bottomk_channels is None:
        # Empty list means that fraction_to_prune is too low to prune anything
        return

    num_filters = param.size(0)
    num_channels = param.size(1)

    threshold = bottomk_channels[-1]
    binary_map = channel_mags.gt(threshold).type(param.data.type())
    # Broadcast the per-channel binary map to the full shape of the weights tensor
    a = binary_map.expand(num_filters, num_channels)
    c = a.unsqueeze(-1)
    d = c.expand(num_filters, num_channels, param.size(2) * param.size(3)).contiguous()
    zeros_mask_dict[param_name].mask = d.view(num_filters, num_channels, param.size(2), param.size(3))

    msglogger.info("L1RankedStructureParameterPruner - param: %s pruned=%.3f goal=%.3f (%d/%d)",
                   param_name,
                   distiller.sparsity_ch(zeros_mask_dict[param_name].mask),
                   fraction_to_prune,
                   len(bottomk_channels), num_channels)
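# A minimal, self-contained sketch (illustrative only, not Distiller code) of the
# expand/unsqueeze/view sequence used above to broadcast a per-channel binary map
# into a full 4-D weight mask. The tensor sizes (4 filters, 3 input channels, 2x2
# kernels) are arbitrary and chosen only for readability.
import torch

num_filters, num_channels, k = 4, 3, 2
binary_map = torch.tensor([1., 0., 1.])  # keep channels 0 and 2, prune channel 1

a = binary_map.expand(num_filters, num_channels)              # (4, 3)
d = a.unsqueeze(-1).expand(num_filters, num_channels, k * k)  # (4, 3, 4)
mask = d.contiguous().view(num_filters, num_channels, k, k)   # (4, 3, 2, 2)

assert torch.all(mask[:, 1] == 0) and torch.all(mask[:, 0] == 1)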
def rank_and_prune_channels(fraction_to_prune, param, param_name=None,
                            zeros_mask_dict=None, model=None, binary_map=None,
                            magnitude_fn=l1_magnitude, noise=0.0,
                            group_size=1, rounding_fn=math.floor):
    if binary_map is None:
        bottomk_channels, channel_mags = LpRankedStructureParameterPruner.rank_channels(
            magnitude_fn, fraction_to_prune, param, group_size, rounding_fn, noise)
        if bottomk_channels is None:
            # Empty list means that fraction_to_prune is too low to prune anything
            return

        threshold = bottomk_channels[-1]
        binary_map = channel_mags.gt(threshold).type(param.data.type())

    threshold_type = 'L1' if magnitude_fn == l1_magnitude else 'L2'
    if zeros_mask_dict is not None:
        zeros_mask_dict[param_name].mask = LpRankedStructureParameterPruner.ch_binary_map_to_mask(binary_map, param)
        msglogger.info("%sRankedStructureParameterPruner - param: %s pruned=%.3f goal=%.3f (%d/%d)",
                       threshold_type, param_name,
                       distiller.sparsity_ch(zeros_mask_dict[param_name].mask),
                       fraction_to_prune,
                       binary_map.sum().item(), param.size(1))
    return binary_map
def rank_and_prune_channels(fraction_to_prune, param, param_name=None,
                            zeros_mask_dict=None, model=None, binary_map=None,
                            magnitude_fn=l1_magnitude):
    def rank_channels(fraction_to_prune, param):
        num_filters = param.size(0)
        num_channels = param.size(1)
        kernel_size = param.size(2) * param.size(3)

        # First, reshape the weights tensor so that each kernel in the original
        # tensor becomes a row in the 2D tensor.
        view_2d = param.view(-1, kernel_size)
        # Next, compute the magnitude of each kernel
        kernel_mags = magnitude_fn(view_2d, dim=1)
        # Now group by channels
        k_sums_mat = kernel_mags.view(num_filters, num_channels).t()
        channel_mags = k_sums_mat.mean(dim=1)
        k = int(fraction_to_prune * channel_mags.size(0))
        if k == 0:
            msglogger.info("Too few channels (%d) - can't prune %.1f%% channels",
                           num_channels, 100 * fraction_to_prune)
            return None, None

        bottomk, _ = torch.topk(channel_mags, k, largest=False, sorted=True)
        return bottomk, channel_mags

    def binary_map_to_mask(binary_map, param):
        num_filters = param.size(0)
        num_channels = param.size(1)
        a = binary_map.expand(num_filters, num_channels)
        c = a.unsqueeze(-1)
        d = c.expand(num_filters, num_channels, param.size(2) * param.size(3)).contiguous()
        return d.view(num_filters, num_channels, param.size(2), param.size(3))

    if binary_map is None:
        bottomk_channels, channel_mags = rank_channels(fraction_to_prune, param)
        if bottomk_channels is None:
            # Empty list means that fraction_to_prune is too low to prune anything
            return

        threshold = bottomk_channels[-1]
        binary_map = channel_mags.gt(threshold).type(param.data.type())

    threshold_type = 'L1' if magnitude_fn == l1_magnitude else 'L2'
    if zeros_mask_dict is not None:
        zeros_mask_dict[param_name].mask = binary_map_to_mask(binary_map, param)
        msglogger.info("%sRankedStructureParameterPruner - param: %s pruned=%.3f goal=%.3f (%d/%d)",
                       threshold_type, param_name,
                       distiller.sparsity_ch(zeros_mask_dict[param_name].mask),
                       fraction_to_prune,
                       binary_map.sum().item(), param.size(1))
    return binary_map
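# A standalone sketch (assumed tensor shapes, not Distiller code) of the ranking logic
# implemented by rank_channels above: per-kernel L1 magnitudes are grouped by input
# channel, averaged across filters, and the weakest fraction of channels is selected
# with torch.topk.
import torch

param = torch.randn(16, 8, 3, 3)              # (num_filters, num_channels, k, k)
num_filters, num_channels = param.size(0), param.size(1)
fraction_to_prune = 0.25

view_2d = param.view(-1, 3 * 3)               # one row per kernel
kernel_mags = view_2d.abs().sum(dim=1)        # L1 magnitude of each kernel
channel_mags = kernel_mags.view(num_filters, num_channels).t().mean(dim=1)

k = int(fraction_to_prune * num_channels)     # here: 2 of 8 channels
bottomk, channel_idx = torch.topk(channel_mags, k, largest=False, sorted=True)
print("channels to prune:", channel_idx.tolist(), "threshold:", bottomk[-1].item())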
def weights_sparsity_summary(model, return_total_sparsity=False, param_dims=[2, 4]):
    df = pd.DataFrame(columns=['Name', 'Shape', 'NNZ (dense)', 'NNZ (sparse)',
                               'Cols (%)', 'Rows (%)', 'Ch (%)', '2D (%)', '3D (%)',
                               'Fine (%)', 'Std', 'Mean', 'Abs-Mean'])
    pd.set_option('precision', 2)
    params_size = 0
    sparse_params_size = 0
    summary_param_types = ['weight', 'bias']
    for name, param in model.state_dict().items():
        # Extract just the actual parameter's name, which in this context we treat as its "type"
        curr_param_type = name.split('.')[-1]
        if param.dim() in param_dims and curr_param_type in summary_param_types:
            _density = distiller.density(param)
            params_size += torch.numel(param)
            sparse_params_size += param.numel() * _density
            df.loc[len(df.index)] = [
                name,
                distiller.size_to_str(param.size()),
                torch.numel(param),
                int(_density * param.numel()),
                distiller.sparsity_cols(param) * 100,
                distiller.sparsity_rows(param) * 100,
                distiller.sparsity_ch(param) * 100,
                distiller.sparsity_2D(param) * 100,
                distiller.sparsity_3D(param) * 100,
                (1 - _density) * 100,
                param.std().item(),
                param.mean().item(),
                param.abs().mean().item()
            ]

    total_sparsity = (1 - sparse_params_size / params_size) * 100
    df.loc[len(df.index)] = [
        'Total sparsity:', '-',
        params_size, int(sparse_params_size),
        0, 0, 0, 0, 0,
        total_sparsity,
        0, 0, 0
    ]

    if return_total_sparsity:
        return df, total_sparsity
    return df
def weights_sparsity_summary(model, return_total_sparsity=False, param_dims=[2, 4]):
    df = pd.DataFrame(columns=['Name', 'Shape', 'NNZ (dense)', 'NNZ (sparse)',
                               'Cols (%)', 'Rows (%)', 'Ch (%)', '2D (%)', '3D (%)',
                               'Fine (%)', 'Std', 'Mean', 'Abs-Mean'])
    pd.set_option('precision', 2)
    params_size = 0
    sparse_params_size = 0
    for name, param in model.state_dict().items():
        if (param.dim() in param_dims) and any(type in name for type in ['weight', 'bias']):
            _density = distiller.density(param)
            params_size += torch.numel(param)
            sparse_params_size += param.numel() * _density
            df.loc[len(df.index)] = [
                name,
                distiller.size_to_str(param.size()),
                torch.numel(param),
                int(_density * param.numel()),
                distiller.sparsity_cols(param) * 100,
                distiller.sparsity_rows(param) * 100,
                distiller.sparsity_ch(param) * 100,
                distiller.sparsity_2D(param) * 100,
                distiller.sparsity_3D(param) * 100,
                (1 - _density) * 100,
                param.std().item(),
                param.mean().item(),
                param.abs().mean().item()
            ]

    total_sparsity = (1 - sparse_params_size / params_size) * 100
    df.loc[len(df.index)] = [
        'Total sparsity:', '-',
        params_size, int(sparse_params_size),
        0, 0, 0, 0, 0,
        total_sparsity,
        0, 0, 0
    ]

    if return_total_sparsity:
        return df, total_sparsity
    return df
def rank_and_prune_channels(fraction_to_prune, param, param_name=None,
                            zeros_mask_dict=None, model=None, binary_map=None,
                            magnitude_fn=distiller.norms.l1_norm,
                            group_size=1, rounding_fn=math.floor, noise=0):
    assert binary_map is None
    if binary_map is None:
        # Distinguish convolution weights (4-D) from fully-connected weights (2-D)
        op_type = 'conv' if param.dim() == 4 else 'fc'

        bottomk_channels, channel_mags = distiller.norms.rank_channels(
            param, group_size, magnitude_fn, fraction_to_prune, rounding_fn, noise)

        # Todo: this little piece of code can be refactored
        if bottomk_channels is None:
            # Empty list means that fraction_to_prune is too low to prune anything
            return

        threshold = bottomk_channels[-1]
        binary_map = channel_mags.gt(threshold)

        # These are the indices of channels we want to keep
        indices = binary_map.nonzero().squeeze()
        if len(indices.shape) == 0:
            indices = indices.expand(1)

        # Find the module representing this layer
        distiller.assign_layer_fq_names(model)
        layer_name = _param_name_2_layer_name(param_name)
        conv = distiller.find_module_by_fq_name(model, layer_name)
        try:
            Y = model.intermediate_fms['output_fms'][layer_name]
            X = model.intermediate_fms['input_fms'][layer_name]
        except AttributeError:
            raise ValueError("To use FMReconstructionChannelPruner you must first collect input statistics")

        # We need to remove the chosen weights channels.  Because we are using
        # min(MSE) to compute the weights, we need to start by removing feature-map
        # channels from the input.  Then we perform the MSE regression to generate
        # a smaller weights tensor.
        if op_type == 'fc':
            X = X[:, binary_map]
        elif conv.kernel_size == (1, 1):
            X = X[:, binary_map, :]
            X = X.transpose(1, 2)
            X = X.contiguous().view(-1, X.size(2))
        else:
            # X is (batch, c*k^2, num_pts); we want (batch, c, k^2, num_pts)
            X = X.view(X.size(0), -1, np.prod(conv.kernel_size), X.size(2))
            X = X[:, binary_map, :, :]
            X = X.view(X.size(0), -1, X.size(3))
            X = X.transpose(1, 2)
            X = X.contiguous().view(-1, X.size(2))

        # Approximate the weights given input-FMs and output-FMs
        new_w = _least_square_sklearn(X, Y)
        new_w = torch.from_numpy(new_w)  # shape: (num_filters, num_non_masked_channels * k^2)
        cnt_retained_channels = binary_map.sum()

        if op_type == 'conv':
            # Expand the weights back to their original size
            new_w = new_w.contiguous().view(param.size(0), cnt_retained_channels, param.size(2), param.size(3))

            # Copy the weights that we learned from minimizing the feature-maps least squares error,
            # to our actual weights tensor.
            param.detach()[:, indices, :, :] = new_w.type(param.type())
        else:
            param.detach()[:, indices] = new_w.type(param.type())

    if zeros_mask_dict is not None:
        binary_map = binary_map.type(param.type())
        if op_type == 'conv':
            zeros_mask_dict[param_name].mask, _ = distiller.thresholding.expand_binary_map(param, 'Channels', binary_map)
            msglogger.info("FMReconstructionChannelPruner - param: %s pruned=%.3f goal=%.3f (%d/%d)",
                           param_name,
                           distiller.sparsity_ch(zeros_mask_dict[param_name].mask),
                           fraction_to_prune,
                           binary_map.sum().item(), param.size(1))
        else:
            msglogger.error("fc sparsity = %.2f" % (1 - binary_map.sum().item() / binary_map.size(0)))
            zeros_mask_dict[param_name].mask = binary_map.expand(param.size(0), param.size(1))
            msglogger.info("FMReconstructionChannelPruner - param: %s pruned=%.3f goal=%.3f (%d/%d)",
                           param_name,
                           distiller.sparsity_cols(zeros_mask_dict[param_name].mask),
                           fraction_to_prune,
                           binary_map.sum().item(), param.size(1))
    return binary_map
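# _least_square_sklearn is referenced above but not shown. A plausible minimal
# implementation (an assumption, not necessarily Distiller's exact helper) fits the
# reduced input feature-maps X to the original output feature-maps Y by ordinary
# least squares with no intercept, returning the coefficient matrix as the new
# weights with shape (num_filters, num_retained_channels * k^2):
from sklearn.linear_model import LinearRegression

def _least_square_sklearn(X, Y):
    # Solve min_W ||X @ W.T - Y||^2 over W
    reg = LinearRegression(fit_intercept=False)
    reg.fit(X, Y)
    return reg.coef_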
def weights_sparsity_summary(model, return_total_sparsity=False, param_dims=[2, 4]): df = pd.DataFrame(columns=[ "Name", "Shape", "NNZ (dense)", "NNZ (sparse)", "Cols (%)", "Rows (%)", "Ch (%)", "2D (%)", "3D (%)", "Fine (%)", "Std", "Mean", "Abs-Mean", ]) pd.set_option("precision", 2) params_size = 0 sparse_params_size = 0 for name, param in model.state_dict().items(): # Extract just the actual parameter's name, which in this context we treat as its "type" if param.dim() in param_dims and any(type in name for type in ["weight", "bias"]): _density = distiller.density(param) params_size += torch.numel(param) sparse_params_size += param.numel() * _density df.loc[len(df.index)] = [ name, distiller.size_to_str(param.size()), torch.numel(param), int(_density * param.numel()), distiller.sparsity_cols(param) * 100, distiller.sparsity_rows(param) * 100, distiller.sparsity_ch(param) * 100, distiller.sparsity_2D(param) * 100, distiller.sparsity_3D(param) * 100, (1 - _density) * 100, param.std().item(), param.mean().item(), param.abs().mean().item(), ] total_sparsity = (1 - sparse_params_size / params_size) * 100 df.loc[len(df.index)] = [ "Total sparsity:", "-", params_size, int(sparse_params_size), 0, 0, 0, 0, 0, total_sparsity, 0, 0, 0, ] if return_total_sparsity: return df, total_sparsity return df