Example #1
import logging

import common              # helper module from Distiller's tests/ directory
import distiller
import distiller.apputils
import distiller.pruning

logger = logging.getLogger()


def test_ranked_channel_pruning():
    model, zeros_mask_dict = common.setup_test("resnet20_cifar",
                                               "cifar10",
                                               parallel=False)

    # Test that we can access the weights tensor of the first convolution in layer 1
    conv1_p = distiller.model_find_param(model, "layer1.0.conv1.weight")
    assert conv1_p is not None

    # Test that there are no zero-channels
    assert distiller.sparsity_ch(conv1_p) == 0.0

    # Create a channel-ranking pruner
    pruner = distiller.pruning.L1RankedStructureParameterPruner(
        "channel_pruner",
        group_type="Channels",
        desired_sparsity=0.1,
        weights="layer1.0.conv1.weight")
    pruner.set_param_mask(conv1_p,
                          "layer1.0.conv1.weight",
                          zeros_mask_dict,
                          meta=None)

    conv1 = common.find_module_by_name(model, "layer1.0.conv1")
    assert conv1 is not None

    # Test that the mask has the correct fraction of channels pruned.
    # We asked for 10%, but there are only 16 channels, so we have to settle for 1/16 of the channels
    logger.info("layer1.0.conv1 = {}".format(conv1))
    expected_pruning = int(0.1 * conv1.in_channels) / conv1.in_channels
    assert distiller.sparsity_ch(
        zeros_mask_dict["layer1.0.conv1.weight"].mask) == expected_pruning

    # Use the mask to prune
    assert distiller.sparsity_ch(conv1_p) == 0
    zeros_mask_dict["layer1.0.conv1.weight"].apply_mask(conv1_p)
    assert distiller.sparsity_ch(conv1_p) == expected_pruning

    # Remove channels (and filters)
    conv0 = common.find_module_by_name(model, "conv1")
    assert conv0 is not None
    assert conv0.out_channels == 16
    assert conv1.in_channels == 16

    # Test thinning
    input_shape = tuple(
        distiller.apputils.classification_get_input_shape("cifar10"))
    distiller.remove_channels(model,
                              zeros_mask_dict,
                              input_shape,
                              optimizer=None)
    assert conv0.out_channels == 15
    assert conv1.in_channels == 15
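
A quick arithmetic check of the rounding described above (a standalone sketch, not part of the original test): 10% of 16 channels rounds down to exactly one channel.

in_channels = 16
expected_pruning = int(0.1 * in_channels) / in_channels   # int(1.6) == 1
assert expected_pruning == 1 / 16 == 0.0625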
Example #2
    def rank_and_prune_channels(fraction_to_prune,
                                param,
                                param_name=None,
                                zeros_mask_dict=None,
                                model=None,
                                binary_map=None,
                                magnitude_fn=distiller.norms.l1_norm,
                                noise=0.0,
                                group_size=1,
                                rounding_fn=math.floor):
        if binary_map is None:
            bottomk_channels, channel_mags = distiller.norms.rank_channels(
                param, group_size, magnitude_fn, fraction_to_prune,
                rounding_fn, noise)
            if bottomk_channels is None:
                # Empty list means that fraction_to_prune is too low to prune anything
                return
            threshold = bottomk_channels[-1]
            binary_map = channel_mags.gt(threshold).type(param.data.type())

        if zeros_mask_dict is not None:
            mask, _ = distiller.thresholding.expand_binary_map(
                param, 'Channels', binary_map)
            zeros_mask_dict[param_name].mask = mask
            msglogger.info(
                "%sRankedStructureParameterPruner - param: %s pruned=%.3f goal=%.3f (%d/%d)",
                magnitude_fn, param_name,
                distiller.sparsity_ch(zeros_mask_dict[param_name].mask),
                fraction_to_prune,
                binary_map.sum().item(), param.size(1))
        return binary_map
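
The threshold-and-compare step above is the heart of the ranking scheme. A minimal standalone sketch in plain PyTorch (toy magnitudes, no Distiller required):

import torch

channel_mags = torch.tensor([0.3, 0.1, 0.7, 0.2])        # one L1 magnitude per input channel
bottomk, _ = torch.topk(channel_mags, 1, largest=False)  # the single weakest channel
threshold = bottomk[-1]                                  # == 0.1
binary_map = channel_mags.gt(threshold).float()          # tensor([1., 0., 1., 1.])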
Example #3
    def rank_prune_channels(self, fraction_to_prune, param, param_name,
                            zeros_mask_dict):
        bottomk_channels, channel_mags = self.rank_channels(
            fraction_to_prune, param)
        if bottomk_channels is None:
            # Empty list means that fraction_to_prune is too low to prune anything
            return

        num_filters = param.size(0)
        num_channels = param.size(1)

        threshold = bottomk_channels[-1]
        binary_map = channel_mags.gt(threshold).type(param.data.type())
        a = binary_map.expand(num_filters, num_channels)
        c = a.unsqueeze(-1)
        d = c.expand(num_filters, num_channels,
                     param.size(2) * param.size(3)).contiguous()
        zeros_mask_dict[param_name].mask = d.view(num_filters, num_channels,
                                                  param.size(2), param.size(3))

        msglogger.info(
            "L1RankedStructureParameterPruner - param: %s pruned=%.3f goal=%.3f (%d/%d)",
            param_name,
            distiller.sparsity_ch(zeros_mask_dict[param_name].mask),
            fraction_to_prune, len(bottomk_channels), num_channels)
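
The expand/unsqueeze sequence above broadcasts the per-channel binary map into a full 4-D mask. Its shapes can be traced on a toy tensor (a standalone sketch):

import torch

binary_map = torch.tensor([1., 0., 1.])            # keep channels 0 and 2
num_filters, num_channels, kh, kw = 4, 3, 2, 2     # toy 4x3x2x2 convolution
a = binary_map.expand(num_filters, num_channels)   # (4, 3): map repeated per filter
d = a.unsqueeze(-1).expand(num_filters, num_channels, kh * kw).contiguous()
mask = d.view(num_filters, num_channels, kh, kw)   # (4, 3, 2, 2): channel 1 zeroed everywhere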
Example #4
    def rank_and_prune_channels(fraction_to_prune,
                                param,
                                param_name=None,
                                zeros_mask_dict=None,
                                model=None,
                                binary_map=None,
                                magnitude_fn=l1_magnitude,
                                noise=0.0,
                                group_size=1,
                                rounding_fn=math.floor):

        if binary_map is None:
            bottomk_channels, channel_mags = LpRankedStructureParameterPruner.rank_channels(
                magnitude_fn, fraction_to_prune, param, group_size,
                rounding_fn, noise)
            if bottomk_channels is None:
                # Empty list means that fraction_to_prune is too low to prune anything
                return
            threshold = bottomk_channels[-1]
            binary_map = channel_mags.gt(threshold).type(param.data.type())

        threshold_type = 'L1' if magnitude_fn == l1_magnitude else 'L2'
        if zeros_mask_dict is not None:
            mask = LpRankedStructureParameterPruner.ch_binary_map_to_mask(
                binary_map, param)
            zeros_mask_dict[param_name].mask = mask
            msglogger.info(
                "%sRankedStructureParameterPruner - param: %s pruned=%.3f goal=%.3f (%d/%d)",
                threshold_type, param_name,
                distiller.sparsity_ch(zeros_mask_dict[param_name].mask),
                fraction_to_prune,
                binary_map.sum().item(), param.size(1))
        return binary_map
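
l1_magnitude is referenced but not defined in this excerpt. In the Distiller source these helpers are thin wrappers over torch.norm; a plausible sketch (verify against your Distiller version):

from functools import partial
import torch

l1_magnitude = partial(torch.norm, p=1)   # sum of absolute values along dim
l2_magnitude = partial(torch.norm, p=2)   # Euclidean norm along dim
# e.g. l1_magnitude(view_2d, dim=1) yields one magnitude per row (per kernel)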
Example #5
    def rank_and_prune_channels(fraction_to_prune, param, param_name=None,
                                zeros_mask_dict=None, model=None, binary_map=None, magnitude_fn=l1_magnitude):
        def rank_channels(fraction_to_prune, param):
            num_filters = param.size(0)
            num_channels = param.size(1)
            kernel_size = param.size(2) * param.size(3)

            # First, reshape the weights tensor such that each channel (kernel) in the original
            # tensor, is now a row in the 2D tensor.
            view_2d = param.view(-1, kernel_size)
            # Next, compute the sums of each kernel
            kernel_mags = magnitude_fn(view_2d, dim=1)
            # Now group by channels
            k_sums_mat = kernel_mags.view(num_filters, num_channels).t()
            channel_mags = k_sums_mat.mean(dim=1)
            k = int(fraction_to_prune * channel_mags.size(0))
            if k == 0:
                msglogger.info("Too few channels (%d)- can't prune %.1f%% channels",
                               num_channels, 100*fraction_to_prune)
                return None, None

            bottomk, _ = torch.topk(channel_mags, k, largest=False, sorted=True)
            return bottomk, channel_mags

        def binary_map_to_mask(binary_map, param):
            num_filters = param.size(0)
            num_channels = param.size(1)
            a = binary_map.expand(num_filters, num_channels)
            c = a.unsqueeze(-1)
            d = c.expand(num_filters, num_channels, param.size(2) * param.size(3)).contiguous()
            return d.view(num_filters, num_channels, param.size(2), param.size(3))

        if binary_map is None:
            bottomk_channels, channel_mags = rank_channels(fraction_to_prune, param)
            if bottomk_channels is None:
                # Empty list means that fraction_to_prune is too low to prune anything
                return
            threshold = bottomk_channels[-1]
            binary_map = channel_mags.gt(threshold).type(param.data.type())

        threshold_type = 'L1' if magnitude_fn == l1_magnitude else 'L2'
        if zeros_mask_dict is not None:
            zeros_mask_dict[param_name].mask = binary_map_to_mask(binary_map, param)
            msglogger.info("%sRankedStructureParameterPruner - param: %s pruned=%.3f goal=%.3f (%d/%d)",
                           threshold_type, param_name,
                           distiller.sparsity_ch(zeros_mask_dict[param_name].mask),
                           fraction_to_prune, binary_map.sum().item(), param.size(1))
        return binary_map
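
The reshape-and-average pipeline inside rank_channels can be verified on a toy tensor (a standalone sketch):

import torch

param = torch.arange(24.).view(2, 3, 2, 2)   # 2 filters, 3 channels, 2x2 kernels
view_2d = param.view(-1, 4)                  # (6, 4): one row per kernel
kernel_mags = view_2d.abs().sum(dim=1)       # L1 magnitude of each kernel
k_sums_mat = kernel_mags.view(2, 3).t()      # (3, 2): one row per input channel
channel_mags = k_sums_mat.mean(dim=1)        # tensor([30., 46., 62.])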
Example #6
def weights_sparsity_summary(model,
                             return_total_sparsity=False,
                             param_dims=[2, 4]):

    df = pd.DataFrame(columns=[
        'Name', 'Shape', 'NNZ (dense)', 'NNZ (sparse)', 'Cols (%)', 'Rows (%)',
        'Ch (%)', '2D (%)', '3D (%)', 'Fine (%)', 'Std', 'Mean', 'Abs-Mean'
    ])
    pd.set_option('display.precision', 2)  # full option name; bare 'precision' is rejected by newer pandas
    params_size = 0
    sparse_params_size = 0
    summary_param_types = ['weight', 'bias']
    for name, param in model.state_dict().items():
        # Extract just the actual parameter's name, which in this context we treat as its "type"
        curr_param_type = name.split('.')[-1]
        if param.dim() in param_dims and curr_param_type in summary_param_types:
            _density = distiller.density(param)
            params_size += torch.numel(param)
            sparse_params_size += param.numel() * _density
            df.loc[len(df.index)] = ([
                name,
                distiller.size_to_str(param.size()),
                torch.numel(param),
                int(_density * param.numel()),
                distiller.sparsity_cols(param) * 100,
                distiller.sparsity_rows(param) * 100,
                distiller.sparsity_ch(param) * 100,
                distiller.sparsity_2D(param) * 100,
                distiller.sparsity_3D(param) * 100, (1 - _density) * 100,
                param.std().item(),
                param.mean().item(),
                param.abs().mean().item()
            ])

    total_sparsity = (1 - sparse_params_size / params_size) * 100

    df.loc[len(df.index)] = ([
        'Total sparsity:', '-', params_size,
        int(sparse_params_size), 0, 0, 0, 0, 0, total_sparsity, 0, 0, 0
    ])

    if return_total_sparsity:
        return df, total_sparsity
    return df
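
Typical usage of this summary (a sketch; assumes Distiller is importable and works with any torch.nn.Module):

import torch.nn as nn

model = nn.Sequential(nn.Conv2d(3, 16, 3), nn.Linear(16, 10))
df, total = weights_sparsity_summary(model, return_total_sparsity=True)
print(df.to_string())                    # per-layer sparsity table
print("Total sparsity: %.2f%%" % total)  # ~0 for a freshly initialized model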
Example #7
def weights_sparsity_summary(model, return_total_sparsity=False, param_dims=[2,4]):

    df = pd.DataFrame(columns=['Name', 'Shape', 'NNZ (dense)', 'NNZ (sparse)',
                               'Cols (%)','Rows (%)', 'Ch (%)', '2D (%)', '3D (%)',
                               'Fine (%)', 'Std', 'Mean', 'Abs-Mean'])
    pd.set_option('display.precision', 2)  # full option name; bare 'precision' is rejected by newer pandas
    params_size = 0
    sparse_params_size = 0
    for name, param in model.state_dict().items():
        if (param.dim() in param_dims) and any(ptype in name for ptype in ['weight', 'bias']):
            _density = distiller.density(param)
            params_size += torch.numel(param)
            sparse_params_size += param.numel() * _density
            df.loc[len(df.index)] = ([
                name,
                distiller.size_to_str(param.size()),
                torch.numel(param),
                int(_density * param.numel()),
                distiller.sparsity_cols(param)*100,
                distiller.sparsity_rows(param)*100,
                distiller.sparsity_ch(param)*100,
                distiller.sparsity_2D(param)*100,
                distiller.sparsity_3D(param)*100,
                (1-_density)*100,
                param.std().item(),
                param.mean().item(),
                param.abs().mean().item()
            ])

    total_sparsity = (1 - sparse_params_size/params_size)*100

    df.loc[len(df.index)] = ([
        'Total sparsity:',
        '-',
        params_size,
        int(sparse_params_size),
        0, 0, 0, 0, 0,
        total_sparsity,
        0, 0, 0])

    if return_total_sparsity:
        return df, total_sparsity
    return df
Example #8
    def rank_and_prune_channels(fraction_to_prune, param, param_name=None,
                                zeros_mask_dict=None, model=None, binary_map=None, 
                                magnitude_fn=distiller.norms.l1_norm, group_size=1, rounding_fn=math.floor,
                                noise=0):
        assert binary_map is None
        if binary_map is None:
            # 4-D weights belong to a convolution, 2-D weights to a fully-connected layer
            op_type = 'conv' if param.dim() == 4 else 'fc'
            bottomk_channels, channel_mags = distiller.norms.rank_channels(param, group_size, magnitude_fn,
                                                                           fraction_to_prune, rounding_fn, noise)

            # TODO: this little piece of code can be refactored
            if bottomk_channels is None:
                # Empty list means that fraction_to_prune is too low to prune anything
                return

            threshold = bottomk_channels[-1]
            binary_map = channel_mags.gt(threshold)

            # These are the indices of channels we want to keep
            indices = binary_map.nonzero().squeeze()
            if len(indices.shape) == 0:
                indices = indices.expand(1)

            # Find the module representing this layer
            distiller.assign_layer_fq_names(model)
            layer_name = _param_name_2_layer_name(param_name)
            conv = distiller.find_module_by_fq_name(model, layer_name)
            try:
                Y = model.intermediate_fms['output_fms'][layer_name]
                X = model.intermediate_fms['input_fms'][layer_name]
            except AttributeError:
                raise ValueError("To use FMReconstructionChannelPruner you must first collect input statistics")

            # We need to remove the chosen weights channels.  Because we are using 
            # min(MSE) to compute the weights, we need to start by removing feature-map 
            # channels from the input.  Then we perform the MSE regression to generate
            # a smaller weights tensor.
            if op_type == 'fc':
                X = X[:, binary_map]
            elif conv.kernel_size == (1, 1):
                X = X[:, binary_map, :]
                X = X.transpose(1, 2)
                X = X.contiguous().view(-1, X.size(2))
            else:
                # X is (batch, ck^2, num_pts)
                # we want:   (batch, c, k^2, num_pts)
                X = X.view(X.size(0), -1, np.prod(conv.kernel_size), X.size(2))
                X = X[:, binary_map, :, :]
                X = X.view(X.size(0), -1, X.size(3))
                X = X.transpose(1, 2)
                X = X.contiguous().view(-1, X.size(2))

            # Approximate the weights given input-FMs and output-FMs
            new_w = _least_square_sklearn(X, Y)
            new_w = torch.from_numpy(new_w) # shape: (num_filters, num_non_masked_channels * k^2)
            cnt_retained_channels = binary_map.sum()

            if op_type == 'conv':
                # Expand the weights back to their original size,
                new_w = new_w.contiguous().view(param.size(0), cnt_retained_channels, param.size(2), param.size(3))

                # Copy the weights that we learned from minimizing the feature-maps least squares error,
                # to our actual weights tensor.
                param.detach()[:, indices, :, :] = new_w.type(param.type())
            else:
                param.detach()[:, indices] = new_w.type(param.type())

        if zeros_mask_dict is not None:
            binary_map = binary_map.type(param.type())
            if op_type == 'conv':
                zeros_mask_dict[param_name].mask, _ = distiller.thresholding.expand_binary_map(param,
                                                                                               'Channels', binary_map)
                msglogger.info("FMReconstructionChannelPruner - param: %s pruned=%.3f goal=%.3f (%d/%d)",
                               param_name,
                               distiller.sparsity_ch(zeros_mask_dict[param_name].mask),
                               fraction_to_prune, binary_map.sum().item(), param.size(1))
            else:
                msglogger.error("fc sparsity = %.2f" % (1 - binary_map.sum().item() / binary_map.size(0)))
                zeros_mask_dict[param_name].mask = binary_map.expand(param.size(0), param.size(1))
                msglogger.info("FMReconstructionChannelPruner - param: %s pruned=%.3f goal=%.3f (%d/%d)",
                               param_name,
                               distiller.sparsity_cols(zeros_mask_dict[param_name].mask),
                               fraction_to_prune, binary_map.sum().item(), param.size(1))
        return binary_map
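
_least_square_sklearn is used above but not shown. A plausible implementation (an assumption: ordinary least squares via scikit-learn, matching the shape comment above):

from sklearn.linear_model import LinearRegression

def _least_square_sklearn(X, Y):
    # Hypothetical stand-in for the helper referenced above: fit W such that
    # X @ W.T approximates Y (no intercept) and return W as a numpy array.
    reg = LinearRegression(fit_intercept=False)
    reg.fit(X, Y)
    return reg.coef_   # shape: (num_filters, num_retained_channels * k^2)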
Example #9
def weights_sparsity_summary(model,
                             return_total_sparsity=False,
                             param_dims=[2, 4]):
    df = pd.DataFrame(columns=[
        "Name",
        "Shape",
        "NNZ (dense)",
        "NNZ (sparse)",
        "Cols (%)",
        "Rows (%)",
        "Ch (%)",
        "2D (%)",
        "3D (%)",
        "Fine (%)",
        "Std",
        "Mean",
        "Abs-Mean",
    ])
    pd.set_option("precision", 2)
    params_size = 0
    sparse_params_size = 0
    for name, param in model.state_dict().items():
        # Keep only weight/bias parameters with the requested dimensionality
        if param.dim() in param_dims and any(ptype in name
                                             for ptype in ["weight", "bias"]):
            _density = distiller.density(param)
            params_size += torch.numel(param)
            sparse_params_size += param.numel() * _density
            df.loc[len(df.index)] = [
                name,
                distiller.size_to_str(param.size()),
                torch.numel(param),
                int(_density * param.numel()),
                distiller.sparsity_cols(param) * 100,
                distiller.sparsity_rows(param) * 100,
                distiller.sparsity_ch(param) * 100,
                distiller.sparsity_2D(param) * 100,
                distiller.sparsity_3D(param) * 100,
                (1 - _density) * 100,
                param.std().item(),
                param.mean().item(),
                param.abs().mean().item(),
            ]

    total_sparsity = (1 - sparse_params_size / params_size) * 100

    df.loc[len(df.index)] = [
        "Total sparsity:",
        "-",
        params_size,
        int(sparse_params_size),
        0,
        0,
        0,
        0,
        0,
        total_sparsity,
        0,
        0,
        0,
    ]

    if return_total_sparsity:
        return df, total_sparsity
    return df