import logging
import math

import numpy as np
import pandas as pd
import torch

import distiller

# Module-level logger, as used throughout distiller
msglogger = logging.getLogger()


def weights_sparsity_summary(model, return_total_sparsity=False, param_dims=[2, 4]):
    df = pd.DataFrame(columns=['Name', 'Shape', 'NNZ (dense)', 'NNZ (sparse)',
                               'Cols (%)', 'Rows (%)', 'Ch (%)', '2D (%)', '3D (%)',
                               'Fine (%)', 'Std', 'Mean', 'Abs-Mean'])
    pd.set_option('display.precision', 2)
    params_size = 0
    sparse_params_size = 0
    summary_param_types = ['weight', 'bias']
    for name, param in model.state_dict().items():
        # Extract just the actual parameter's name, which in this context we treat as its "type"
        curr_param_type = name.split('.')[-1]
        if param.dim() in param_dims and curr_param_type in summary_param_types:
            _density = distiller.density(param)
            params_size += torch.numel(param)
            sparse_params_size += param.numel() * _density
            df.loc[len(df.index)] = ([
                name,
                distiller.size_to_str(param.size()),
                torch.numel(param),
                int(_density * param.numel()),
                distiller.sparsity_cols(param) * 100,
                distiller.sparsity_rows(param) * 100,
                distiller.sparsity_ch(param) * 100,
                distiller.sparsity_2D(param) * 100,
                distiller.sparsity_3D(param) * 100,
                (1 - _density) * 100,
                param.std().item(),
                param.mean().item(),
                param.abs().mean().item()])

    total_sparsity = (1 - sparse_params_size / params_size) * 100
    df.loc[len(df.index)] = ([
        'Total sparsity:', '-',
        params_size, int(sparse_params_size),
        0, 0, 0, 0, 0,
        total_sparsity,
        0, 0, 0])

    if return_total_sparsity:
        return df, total_sparsity
    return df
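# A minimal usage sketch (not part of distiller): build a tiny all-conv model,
# crudely zero out small-magnitude weights, and print the per-layer summary.
# The two-layer model and the 0.05 threshold are illustrative assumptions.
def _demo_weights_sparsity_summary():
    import torch.nn as nn
    model = nn.Sequential(nn.Conv2d(3, 8, 3), nn.Conv2d(8, 16, 3))
    with torch.no_grad():
        for p in model.parameters():
            if p.dim() == 4:
                p.mul_((p.abs() > 0.05).float())  # crude magnitude pruning
    df, total_sparsity = weights_sparsity_summary(model, return_total_sparsity=True)
    print(df.to_string())
    print('Total sparsity: %.2f%%' % total_sparsity)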
def test_sparsity():
    zeros = torch.zeros(2, 3, 5, 6)
    print(distiller.sparsity(zeros))
    assert distiller.sparsity(zeros) == 1.0
    assert distiller.sparsity_3D(zeros) == 1.0
    assert distiller.density_3D(zeros) == 0.0

    ones = torch.ones(12, 43, 4, 6)
    assert distiller.sparsity(ones) == 0.0

    x = torch.tensor([[1., 2., 0, 4., 0],
                      [1., 2., 0, 4., 0]])
    assert distiller.density(x) == 0.6
    assert distiller.density_cols(x, transposed=False) == 0.6
    assert distiller.sparsity_rows(x, transposed=False) == 0

    x = torch.tensor([[0., 0., 0],
                      [1., 4., 0],
                      [1., 2., 0],
                      [0., 0., 0]])
    assert distiller.density(x) == 4 / 12
    assert distiller.sparsity_rows(x, transposed=False) == 0.5
    assert common.almost_equal(distiller.sparsity_cols(x, transposed=False), 1 / 3)
    assert common.almost_equal(distiller.sparsity_rows(x), 1 / 3)
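# `common` above is a helper module from distiller's test suite. For standalone
# runs, the shim below is a minimal stand-in consistent with how almost_equal()
# is used in test_sparsity(); the 1e-7 tolerance is an assumption, not the real
# implementation's value.
try:
    import common
except ImportError:
    class common:
        @staticmethod
        def almost_equal(a, b, max_diff=1e-7):
            # True when a and b differ by no more than max_diff
            return abs(a - b) <= max_diff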
def rank_and_prune_channels(fraction_to_prune, param, param_name=None,
                            zeros_mask_dict=None, model=None, binary_map=None,
                            magnitude_fn=distiller.norms.l1_norm,
                            group_size=1, rounding_fn=math.floor, noise=0):
    assert binary_map is None
    if binary_map is None:
        # The original snippet used op_type before defining it; infer it from the
        # parameter's dimensionality (4-D weights => convolution, 2-D => fully-connected)
        op_type = 'conv' if param.dim() == 4 else 'fc'

        bottomk_channels, channel_mags = distiller.norms.rank_channels(
            param, group_size, magnitude_fn, fraction_to_prune, rounding_fn, noise)

        # Todo: this little piece of code can be refactored
        if bottomk_channels is None:
            # Empty list means that fraction_to_prune is too low to prune anything
            return

        threshold = bottomk_channels[-1]
        binary_map = channel_mags.gt(threshold)

        # These are the indices of channels we want to keep
        indices = binary_map.nonzero().squeeze()
        if len(indices.shape) == 0:
            indices = indices.expand(1)

        # Find the module representing this layer
        distiller.assign_layer_fq_names(model)
        layer_name = _param_name_2_layer_name(param_name)
        conv = distiller.find_module_by_fq_name(model, layer_name)
        try:
            Y = model.intermediate_fms['output_fms'][layer_name]
            X = model.intermediate_fms['input_fms'][layer_name]
        except AttributeError:
            raise ValueError("To use FMReconstructionChannelPruner you must first "
                             "collect input statistics")

        # We need to remove the chosen weights channels. Because we are using
        # min(MSE) to compute the weights, we need to start by removing feature-map
        # channels from the input. Then we perform the MSE regression to generate
        # a smaller weights tensor.
        if op_type == 'fc':
            X = X[:, binary_map]
        elif conv.kernel_size == (1, 1):
            X = X[:, binary_map, :]
            X = X.transpose(1, 2)
            X = X.contiguous().view(-1, X.size(2))
        else:
            # X is (batch, c*k^2, num_pts); reshape to (batch, c, k^2, num_pts)
            # so that whole channels can be masked out
            X = X.view(X.size(0), -1, np.prod(conv.kernel_size), X.size(2))
            X = X[:, binary_map, :, :]
            X = X.view(X.size(0), -1, X.size(3))
            X = X.transpose(1, 2)
            X = X.contiguous().view(-1, X.size(2))

        # Approximate the weights given input-FMs and output-FMs
        new_w = _least_square_sklearn(X, Y)
        new_w = torch.from_numpy(new_w)  # shape: (num_filters, num_non_masked_channels * k^2)
        cnt_retained_channels = binary_map.sum()

        if op_type == 'conv':
            # Expand the weights back to their original size
            new_w = new_w.contiguous().view(param.size(0), cnt_retained_channels,
                                            param.size(2), param.size(3))

            # Copy the weights that we learned from minimizing the feature-maps
            # least-squares error to our actual weights tensor
            param.detach()[:, indices, :, :] = new_w.type(param.type())
        else:
            param.detach()[:, indices] = new_w.type(param.type())

    if zeros_mask_dict is not None:
        binary_map = binary_map.type(param.type())
        if op_type == 'conv':
            zeros_mask_dict[param_name].mask, _ = distiller.thresholding.expand_binary_map(
                param, 'Channels', binary_map)
            msglogger.info("FMReconstructionChannelPruner - param: %s pruned=%.3f goal=%.3f (%d/%d)",
                           param_name,
                           distiller.sparsity_ch(zeros_mask_dict[param_name].mask),
                           fraction_to_prune, binary_map.sum().item(), param.size(1))
        else:
            msglogger.error("fc sparsity = %.2f" % (1 - binary_map.sum().item() / binary_map.size(0)))
            zeros_mask_dict[param_name].mask = binary_map.expand(param.size(0), param.size(1))
            msglogger.info("FMReconstructionChannelPruner - param: %s pruned=%.3f goal=%.3f (%d/%d)",
                           param_name,
                           distiller.sparsity_cols(zeros_mask_dict[param_name].mask),
                           fraction_to_prune, binary_map.sum().item(), param.size(1))
    return binary_map
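# The two module-private helpers called above are not included in this snippet.
# The sketches below are assumptions consistent with how they are used, not
# verbatim distiller code.

def _param_name_2_layer_name(param_name):
    # Assumes parameters follow the '<layer fq-name>.weight' naming convention,
    # so stripping the '.weight' suffix yields the layer's fully-qualified name
    return param_name[:-len('.weight')]


def _least_square_sklearn(X, Y):
    # Solve min_W ||X @ W.T - Y||^2 with no intercept; reg.coef_ has shape
    # (num_output_channels, num_retained_channels * k^2), which matches the
    # reshape performed in rank_and_prune_channels()
    from sklearn.linear_model import LinearRegression
    reg = LinearRegression(fit_intercept=False)
    reg.fit(X, Y)
    return reg.coef_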