def __init__(self, model, stat_name, classes):
    """
    Args:
        model - the model we are monitoring.
        stat_name - name for the statistics being collected.
            You can access a module's activation statistics by referring to module.<stat_name>
            For example:
                print(module.sparsity)
        classes - a list of class types for which we collect activation statistics.
            Passing an empty list or None will collect statistics for all class types.
    """
    super(ActivationStatsCollector, self).__init__()
    self.model = model
    self.stat_name = stat_name
    self.classes = classes
    self.fwd_hook_handles = []

    # The layer names are mangled, because torch.Modules don't have names and we need to invent
    # a unique, human-readable name per layer.
    distiller.utils.assign_layer_fq_names(model)

    # Currently this is internal, and its only purpose is to enable skipping collection
    # for wrapped modules inside post-training quantization wrapper classes.
    # When doing PTQ, the outputs of these wrapped modules are actually intermediate results
    # which are not relevant for tracking.
    self._dont_collect_list = [module.wrapped_module.distiller_name for module in model.modules()
                               if is_post_train_quant_wrapper(module)]
def init_linear_quant_params(quantizer, original_model, eval_fn, dummy_input, init_mode,
                             init_method='Powell', search_clipping=False, run_device='cpu'):
    """
    Initializes all linear quantization parameters of the model.

    Args:
        quantizer (PostTrainLinearQuantizer): the quantizer, **after** calling prepare_model.
        original_model (nn.Module): the original, pre-quantized, model.
        eval_fn: evaluation function for the model. Assumed to have a signature of the form
            `eval_fn(model) -> float`. This is the function to be minimized by the optimization
            algorithm. Note - unlike in `init_layer_linear_quant_params`, this argument is
            required here.
        dummy_input: dummy sample input to the model.
        init_mode (ClipMode or callable or str or dict): See `init_layer_linear_quant_params`.
            If init_mode is a dict, it holds a per-layer configuration, i.e.
            init_mode = Dict[layer_name: str, init_mode_layer: ClipMode or callable or str].
        init_method: See `init_layer_linear_quant_params`.
        search_clipping (bool): if set, optimize the clipping values, otherwise optimize the
            scale factor.
        run_device: device on which the optimization is run when init_mode is a callable.
            Defaults to 'cpu'.
    """
    non_parallel_model = _make_non_parallel_copy(original_model).to(
        device=run_device if callable(init_mode) else 'cpu')
    layers_topological_order = SummaryGraph(non_parallel_model, dummy_input).layers_topological_order()
    q_named_modules = OrderedDict(quantizer.model.named_modules())
    for module_name in layers_topological_order:
        # Check whether this module was actually quantized:
        q_module = q_named_modules[distiller.denormalize_module_name(quantizer.model, module_name)]
        if not is_post_train_quant_wrapper(q_module, False):
            continue
        module_init_mode = init_mode[module_name] if isinstance(init_mode, dict) else init_mode
        msglogger.debug('Initializing layer \'%s\' using %s mode' % (module_name, module_init_mode))
        init_layer_linear_quant_params(quantizer, non_parallel_model, module_name, module_init_mode,
                                       init_method=init_method,
                                       eval_fn=eval_fn,
                                       search_clipping=search_clipping,
                                       run_device=run_device)
    if non_parallel_model != original_model:
        del non_parallel_model

    quantizer._post_prepare_model()
    quantizer.model.eval()
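# Illustrative sketch (not part of the library): one way the pieces above might be wired
# together. The names `my_model`, `val_loader` and `evaluate_top1`, the input shape, and the
# layer names in `init_mode` are assumptions for illustration; `import torch` is assumed.
def _example_init_linear_quant_params(my_model, val_loader, evaluate_top1):
    quantizer = PostTrainLinearQuantizer(my_model)
    dummy_input = torch.randn(1, 3, 224, 224)
    quantizer.prepare_model(dummy_input)

    def eval_fn(model):
        # Return a scalar to minimize - e.g. negative top-1 accuracy on a validation set.
        return -evaluate_top1(model, val_loader)

    # A dict maps layer names to per-layer modes; note that when a dict is passed, every
    # quantized layer is looked up in it (a single ClipMode/str can be passed instead).
    init_mode = {'conv1': ClipMode.AVG, 'fc': 'LAPLACE'}
    init_linear_quant_params(quantizer, my_model, eval_fn, dummy_input, init_mode,
                             init_method='Powell', search_clipping=False)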
def _should_collect(self, module):
    if module.distiller_name in self._dont_collect_list:
        return False
    # In general, we only collect stats for "leaf" modules.
    # We make an exception for models that were quantized with 'PostTrainLinearQuantizer'. In these
    # models, the quantized modules are actually wrappers of the original FP32 modules, so they are
    # NOT leaf modules - but we still want to track them.
    if distiller.has_children(module) and not is_post_train_quant_wrapper(module):
        return False
    if isinstance(module, torch.nn.Identity):
        return False

    register_all_class_types = not self.classes
    if register_all_class_types or isinstance(module, tuple(self.classes)):
        return True

    return False
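# Illustrative sketch (not part of the library): the `classes` argument from __init__ and the
# `_should_collect` filter above combine so that only leaf modules of the listed types are
# tracked, while `classes=None` (or an empty list) tracks every leaf module. `make_collector`
# is a hypothetical factory standing in for any concrete ActivationStatsCollector subclass.
def _example_collection_filter(make_collector, model):
    relu_only = make_collector(model, 'sparsity', classes=[torch.nn.ReLU])
    everything = make_collector(model, 'sparsity', classes=None)
    relu_modules = [m for m in model.modules() if relu_only._should_collect(m)]
    all_leaves = [m for m in model.modules() if everything._should_collect(m)]
    return relu_modules, all_leaves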
def validate_quantization_settings(quantized_model, search_clipping):
    if search_clipping:
        return
    for n, m in quantized_model.named_modules():
        if not is_post_train_quant_wrapper(m, False):
            continue

        err_msg = 'Detected asymmetric quantization of {}. ' \
                  'Switch to symmetric quantization or enable search_clipping.'
        if not isinstance(m, RangeLinearEmbeddingWrapper):
            if m.output_quant_settings.num_bits and \
                    is_linear_quant_mode_asymmetric(m.mode.activations) and \
                    not m.clip_half_range:
                raise ValueError(err_msg.format('activations without fused ReLU'))
        if isinstance(m, (RangeLinearEmbeddingWrapper, RangeLinearQuantParamLayerWrapper)):
            if is_linear_quant_mode_asymmetric(m.mode.weights):
                raise ValueError(err_msg.format('weights'))
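# Illustrative sketch (not part of the library): calling the validation above on an already
# prepared model. With asymmetric quantization and `search_clipping=False` it raises ValueError;
# the try/except here is purely for demonstration.
def _example_validate(quantized_model):
    try:
        validate_quantization_settings(quantized_model, search_clipping=False)
    except ValueError as e:
        # e.g. "Detected asymmetric quantization of weights. Switch to symmetric quantization
        # or enable search_clipping."
        print(e)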
def init_layer_linear_quant_params(quantizer, original_model, layer_name, init_mode=ClipMode.NONE,
                                   init_method='Powell', eval_fn=None, search_clipping=False,
                                   run_device='cpu'):
    """
    Initializes a layer's linear quantization parameters.
    This is done to set the scipy.optimize.minimize initial guess.

    Args:
        quantizer (PostTrainLinearQuantizer): the quantizer, **after** calling prepare_model.
        original_model (nn.Module): the original, pre-quantized, model.
        layer_name (str): the name of the layer.
        init_mode (ClipMode or callable or str): the initialization mode.
            If ClipMode, the initialization will be according to the respective ClipMode.
            If callable, init_mode will be treated as a loss function between the activations
            pre- and post-quantization, and the initialization process will attempt to find
            the minimum of that loss function. E.g. if l1_loss has been passed, the
            initialization vector will be
                scale, zero_point = argmin_{s, zp} (l1_loss(layer(input), q_layer(input; s, zp)))
            If str, the mode will be chosen from a list of options. The options are:
                [NONE, AVG, LAPLACE, GAUSS, L1, L2, L3].
            Defaults to ClipMode.NONE.
        init_method (str or callable): applicable only when init_mode is 'L1'/'L2'/'L3' or a
            callable. Chooses the minimization method for finding the local argmin_{s, zp}.
            Defaults to 'Powell'.
        eval_fn: evaluation function for the model. Assumed to have a signature of the form
            `eval_fn(model) -> float`. This is the function to be minimized by the optimization
            algorithm. Applicable only when init_mode is 'L1'/'L2'/'L3' or a callable.
        search_clipping (bool): if set, optimize the clipping values, otherwise optimize the
            scale factor.
        run_device: device on which the optimization is run when init_mode is a callable.
            Defaults to 'cpu'.
    """
    denorm_layer_name = distiller.denormalize_module_name(quantizer.model, layer_name)
    msglogger.info(denorm_layer_name)
    if isinstance(init_mode, str):
        init_mode = _init_mode_from_str(init_mode)
    layer = dict(original_model.named_modules())[layer_name]
    local_args, local_kwargs = quantizer.modules_processed_args[denorm_layer_name]
    if isinstance(init_mode, ClipMode):
        local_kwargs['clip_acts'] = init_mode
    replace_fn = quantizer.replacement_factory.get(type(layer), quantizer.default_repalcement_fn)
    quantized_layer = replace_fn(deepcopy(layer), *local_args, **local_kwargs).eval()
    if not is_post_train_quant_wrapper(quantized_layer, False):
        # The module wasn't quantized - nothing to do here.
        return

    if callable(init_mode):
        input_for_layer = get_input_for_layer(original_model, layer_name, eval_fn)
        quantized_layer = optimize_for_layer(layer.to(device=run_device),
                                             quantized_layer.to(device=run_device),
                                             init_mode, input_for_layer, init_method,
                                             search_clipping=search_clipping)
        del input_for_layer

    distiller.model_setattr(quantizer.model, denorm_layer_name, quantized_layer)
    quantizer.model.eval()
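# Illustrative sketch (not part of the library): initializing a single layer with a callable
# `init_mode`, using torch.nn.functional.l1_loss as the distance between the FP32 and quantized
# activations (the docstring above uses l1_loss as its example). The layer name 'conv1' and the
# `eval_fn` passed in are assumptions for illustration; `import torch` is assumed.
def _example_init_single_layer(quantizer, original_model, eval_fn):
    init_layer_linear_quant_params(quantizer, original_model, 'conv1',
                                   init_mode=torch.nn.functional.l1_loss,
                                   init_method='Powell',
                                   eval_fn=eval_fn,
                                   search_clipping=False)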