Example #1
    def __init__(self, model, stat_name, classes):
        """
        Args:
            model - the model we are monitoring.
            stat_name - name for the statistics being collected.
                You can access a module's activation statistics by referring to module.<stat_name>
                For example:
                    print(module.sparsity)
            classes - a list of class types for which we collect activation statistics.
                Passing an empty list or None will collect statistics for all class types.
        """
        super(ActivationStatsCollector, self).__init__()
        self.model = model
        self.stat_name = stat_name
        self.classes = classes
        self.fwd_hook_handles = []

        # The layer names are mangled, because torch.Modules don't have names and we need to invent
        # a unique, human-readable name per layer.
        distiller.utils.assign_layer_fq_names(model)

        # Currently this is internal, and its only purpose is to enable skipping collection
        # for wrapped modules inside post-training quantization wrapper classes.
        # When doing PTQ, the outputs of these wrapped modules are actually intermediate results
        # which are not relevant for tracking.
        self._dont_collect_list = [module.wrapped_module.distiller_name for module in model.modules() if
                                   is_post_train_quant_wrapper(module)]
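
The collector above works by attaching forward hooks and exposing the collected statistic as an attribute on each module. As a rough, standalone illustration of that pattern (not Distiller's implementation; the sparsity statistic and the filtering helper below are assumptions), a minimal hook-based sketch looks like this:

import torch

def attach_sparsity_stat(model, stat_name='sparsity', classes=(torch.nn.ReLU,)):
    """Register forward hooks that store a per-module activation statistic."""
    handles = []
    for module in model.modules():
        if list(module.children()):                      # only leaf modules
            continue
        if classes and not isinstance(module, tuple(classes)):
            continue

        def hook(mod, inputs, output):
            # Fraction of zero activations in this forward pass.
            setattr(mod, stat_name, float((output == 0).float().mean()))

        handles.append(module.register_forward_hook(hook))
    return handles

model = torch.nn.Sequential(torch.nn.Linear(8, 8), torch.nn.ReLU())
handles = attach_sparsity_stat(model)
model(torch.randn(4, 8))
print(model[1].sparsity)       # mirrors the print(module.sparsity) usage in the docstring
for h in handles:
    h.remove()
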
def init_linear_quant_params(quantizer,
                             original_model,
                             eval_fn,
                             dummy_input,
                             init_mode,
                             init_method='Powell',
                             search_clipping=False,
                             run_device='cpu'):
    """
    Initializes all linear quantization parameters of the model.
    Args:
        quantizer (PostTrainLinearQuantizer): the quantizer, **after** calling `prepare_model`.
        original_model (nn.Module): the original, pre-quantized model.
        init_mode (ClipMode or callable or str or dict): See `init_layer_linear_quant_params`.
          If init_mode is a dict, it holds a per-layer configuration,
          i.e. init_mode = Dict[layer_name: str, init_mode_layer: ClipMode or callable or str].
        eval_fn: evaluation function for the model. Assumed to have a signature of the form
          `eval_fn(model) -> float`. This is the function to be minimized by the optimization algorithm.
          Note - unlike in `init_layer_linear_quant_params`, this argument is required here.
        dummy_input: dummy sample input to the model
        init_method: See `init_layer_linear_quant_params`.
        search_clipping (bool): if set, optimize the clipping values; otherwise, optimize the scale factor.
    """
    non_parallel_model = _make_non_parallel_copy(original_model).to(
        device=run_device if callable(init_mode) else 'cpu')
    layers_topological_order = SummaryGraph(
        non_parallel_model, dummy_input).layers_topological_order()
    q_named_modules = OrderedDict(quantizer.model.named_modules())
    for module_name in layers_topological_order:
        # check to see if it was quantized:
        q_module = q_named_modules[distiller.denormalize_module_name(
            quantizer.model, module_name)]
        if not is_post_train_quant_wrapper(q_module, False):
            continue
        module_init_mode = init_mode[module_name] if isinstance(
            init_mode, dict) else init_mode
        msglogger.debug('Initializing layer \'%s\' using %s mode' %
                        (module_name, module_init_mode))
        init_layer_linear_quant_params(quantizer,
                                       non_parallel_model,
                                       module_name,
                                       module_init_mode,
                                       init_method=init_method,
                                       eval_fn=eval_fn,
                                       search_clipping=search_clipping,
                                       run_device=run_device)
    if non_parallel_model is not original_model:
        del non_parallel_model

    quantizer._post_prepare_model()
    quantizer.model.eval()
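
A hedged usage sketch for init_linear_quant_params, assuming a prepared quantizer and the FP32 model already exist (quantizer, original_model, ClipMode, the layer names and the choice of eval_fn below are all assumptions):

import torch

dummy_input = torch.randn(1, 3, 224, 224)      # input shape is an assumption

def eval_fn(model):
    # Documented signature: eval_fn(model) -> float; the optimizer minimizes this value.
    # Here: L2 distance to the FP32 model's output on one batch (an illustrative metric).
    with torch.no_grad():
        return torch.nn.functional.mse_loss(model(dummy_input),
                                            original_model(dummy_input)).item()

# Per-layer configuration, as described in the docstring:
# Dict[layer_name, ClipMode or callable or str]. Layer names are hypothetical.
init_mode = {'conv1': ClipMode.AVG, 'fc': 'L2'}

init_linear_quant_params(quantizer,            # PostTrainLinearQuantizer, after prepare_model
                         original_model,       # the original FP32 model
                         eval_fn,
                         dummy_input,
                         init_mode,
                         init_method='Powell',
                         search_clipping=False)
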
Example #3
    def _should_collect(self, module):
        if module.distiller_name in self._dont_collect_list:
            return False
        # In general, we only collect stats for "leaf" modules.
        # We make an exception for models that were quantized with 'PostTrainLinearQuantizer'. In these
        # models, the quantized modules are actually wrappers of the original FP32 modules, so they are
        # NOT leaf modules - but we still want to track them.
        if distiller.has_children(module) and not is_post_train_quant_wrapper(module):
            return False
        if isinstance(module, torch.nn.Identity):
            return False

        register_all_class_types = not self.classes
        if register_all_class_types or isinstance(module, tuple(self.classes)):
            return True

        return False
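
A standalone illustration of the filtering rules above (not Distiller code; a plain FP32 model has no quantization wrappers, so only leaf modules of the requested classes pass, and Identity modules are always skipped):

import torch

model = torch.nn.Sequential(torch.nn.Conv2d(3, 8, 3),
                            torch.nn.ReLU(),
                            torch.nn.Identity())
classes = [torch.nn.ReLU]

for name, module in model.named_modules():
    is_leaf = not list(module.children())                # stand-in for distiller.has_children()
    wanted = not classes or isinstance(module, tuple(classes))
    skipped = isinstance(module, torch.nn.Identity)
    if is_leaf and wanted and not skipped:
        print('collect:', name)                          # -> collect: 1  (the ReLU)
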
def validate_quantization_settings(quantized_model, search_clipping):
    if search_clipping:
        return
    for n, m in quantized_model.named_modules():
        if not is_post_train_quant_wrapper(m, False):
            continue

        err_msg = 'Detected asymmetric quantization of {}. ' \
                  'Switch to symmetric quantization or enable search_clipping.'
        if not isinstance(m, RangeLinearEmbeddingWrapper):
            if m.output_quant_settings.num_bits and \
                    is_linear_quant_mode_asymmetric(m.mode.activations) and \
                    not m.clip_half_range:
                raise ValueError(
                    err_msg.format('activations without fused ReLU'))
        if isinstance(m, (RangeLinearEmbeddingWrapper,
                          RangeLinearQuantParamLayerWrapper)):
            if is_linear_quant_mode_asymmetric(m.mode.weights):
                raise ValueError(err_msg.format('weights'))
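
A hedged usage sketch: run this check right after preparing the quantizer and before starting the search (the quantizer construction below, including the mode value and dummy_input, is an assumption):

from copy import deepcopy

quantizer = PostTrainLinearQuantizer(deepcopy(original_model),
                                     mode=LinearQuantMode.ASYMMETRIC_UNSIGNED)
quantizer.prepare_model(dummy_input)

# With search_clipping=False this raises ValueError because the mode above is asymmetric;
# passing search_clipping=True (or using a symmetric mode) lets it return silently.
validate_quantization_settings(quantizer.model, search_clipping=False)
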
def init_layer_linear_quant_params(quantizer,
                                   original_model,
                                   layer_name,
                                   init_mode=ClipMode.NONE,
                                   init_method='Powell',
                                   eval_fn=None,
                                   search_clipping=False,
                                   run_device='cpu'):
    """
    Initializes a layer's linear quant parameters.
    This is done to set the initial guess for `scipy.optimize.minimize`.
    Args:
        quantizer (PostTrainLinearQuantizer): the quantizer, **after** calling `prepare_model`.
        original_model (nn.Module): the original, pre-quantized model.
        layer_name (str): the name of the layer.
        init_mode (ClipMode or callable or str): the initialization mode.
          If ClipMode, the initialization will be according to the respective ClipMode.
          If callable - init_mode will be treated as a loss function between the activations pre and post-quantization,
            and the initialization process will attempt to find the minimum of that loss function.
            E.g. if l1_loss has been passed, the initialization vector will be
              scale, zero_point = argmin_{s, zp} (l1_loss(layer(input), q_layer(input; s, zp)))
          If str - the mode will be chosen from a list of options. The options are:
            [NONE, AVG, LAPLACE, GAUSS, L1, L2, L3].
          Defaults to ClipMode.NONE.
        init_method (str or callable): applicable only when init_mode is 'L1'/'L2'/'L3' or a callable.
          Chooses the minimization method for finding the local argmin_{s, zp}.
          Defaults to 'Powell'.
        eval_fn: evaluation function for the model. Assumed to have a signature of the form
          `eval_fn(model) -> float`. This is the function to be minimized by the optimization algorithm.
          Applicable only when init_mode is 'L1'/'L2'/'L3' or a callable.
        search_clipping (bool): if set, optimize the clipping values; otherwise, optimize the scale factor.
    """
    denorm_layer_name = distiller.denormalize_module_name(
        quantizer.model, layer_name)
    msglogger.info(denorm_layer_name)
    if isinstance(init_mode, str):
        init_mode = _init_mode_from_str(init_mode)
    layer = dict(original_model.named_modules())[layer_name]
    local_args, local_kwargs = quantizer.modules_processed_args[
        denorm_layer_name]
    if isinstance(init_mode, ClipMode):
        local_kwargs['clip_acts'] = init_mode
    # Note: 'default_repalcement_fn' is the attribute's actual (misspelled) name in Distiller.
    replace_fn = quantizer.replacement_factory.get(
        type(layer), quantizer.default_repalcement_fn)
    quantized_layer = replace_fn(deepcopy(layer), *local_args,
                                 **local_kwargs).eval()
    if not is_post_train_quant_wrapper(quantized_layer, False):
        # the module wasn't quantized, nothing to do here
        return

    if callable(init_mode):
        input_for_layer = get_input_for_layer(original_model, layer_name,
                                              eval_fn)
        quantized_layer = optimize_for_layer(
            layer.to(device=run_device),
            quantized_layer.to(device=run_device),
            init_mode,
            input_for_layer,
            init_method,
            search_clipping=search_clipping)
        del input_for_layer

    distiller.model_setattr(quantizer.model, denorm_layer_name,
                            quantized_layer)
    quantizer.model.eval()
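
The callable init_mode path described in the docstring can also be exercised directly for a single layer. A hedged sketch, assuming the quantizer, the FP32 model, a layer name and an eval_fn with the documented signature exist (all are assumptions):

import torch.nn.functional as F

init_layer_linear_quant_params(quantizer,
                               original_model,
                               'conv1',                # hypothetical layer name
                               init_mode=F.l1_loss,    # scale, zero_point = argmin_{s, zp} l1_loss(layer(x), q_layer(x; s, zp))
                               init_method='Powell',
                               eval_fn=eval_fn,        # needed to capture the layer's input when init_mode is callable
                               search_clipping=False)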