def __init__(self, model, config_list, optimizer):
    assert isinstance(optimizer, torch.optim.Optimizer), "unrecognized optimizer type"
    super().__init__(model, config_list, optimizer)
    device = next(model.parameters()).device
    modules_to_compress = self.get_modules_to_compress()
    for layer, config in modules_to_compress:
        if "weight" in config.get("quant_types", []):
            # record the weight bit-width on the layer so it is available at quantization time
            weight_bits = get_bits_length(config, 'weight')
            layer.module.register_buffer('weight_bits', torch.Tensor([int(weight_bits)]))
    self.bound_model.to(device)
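
# Illustrative sketch only: `get_bits_length` is provided elsewhere in this module and is
# not reproduced here. Based on the documented config format (quant_bits may be a plain
# int shared by all quantization types, or a dict keyed by quantization type), its
# behavior is assumed to be equivalent to the following hypothetical helper.
def _get_bits_length_sketch(config, quant_type):
    quant_bits = config['quant_bits']
    if isinstance(quant_bits, int):
        # a single int applies to every quantization type
        return quant_bits
    # otherwise a dict such as {'weight': 8} maps each quantization type to its bit-width
    return quant_bits[quant_type]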
def __init__(self, model, config_list, optimizer, dummy_input=None):
    """
    Parameters
    ----------
    model : torch.nn.Module
        the model to be quantized
    config_list : list of dict
        list of configurations for quantization; supported keys for each dict:
            - quant_types : list of string
                types of quantization to apply, currently supporting 'weight',
                'input' and 'output'
            - quant_bits : int or dict of {str : int}
                bit length of quantization; the key is the quantization type and the
                value is the bit length, e.g. {'weight': 8}. When an int is given,
                all quantization types share the same bit length.
            - quant_start_step : int
                disable quantization until the model has been run for this number of
                steps, which allows the network to reach a more stable state where
                output quantization ranges do not exclude a significant fraction of
                values; default value is 0
            - op_types : list of string
                types of nn.Module to apply quantization to, e.g. 'Conv2d'
    optimizer : torch.optim.Optimizer
        optimizer used to train the model; the learnable scale parameters created
        below are added to it via add_param_group
    dummy_input : tuple of tensor
        inputs to the model, used to trace the graph of the module. The graph is
        used to find Conv-Bn patterns so that batch normalization folding can be
        enabled. If dummy_input is not given, batch normalization folding is
        disabled.
    """
    assert isinstance(optimizer, torch.optim.Optimizer), "unrecognized optimizer type"
    super().__init__(model, config_list, optimizer, dummy_input)
    device = next(model.parameters()).device
    self.quant_grad = QuantForward()
    modules_to_compress = self.get_modules_to_compress()
    self.bound_model.register_buffer("steps", torch.Tensor([1]))
    for layer, config in modules_to_compress:
        if "weight" in config.get("quant_types", []):
            layer.module.register_parameter("weight_scale", torch.nn.Parameter(torch.Tensor([1.0])))
            # todo: support per-channel quantization for weight since TensorRT uses it for conv weights
            weight_bits = get_bits_length(config, "weight")
            layer.module.register_buffer('weight_bits', torch.Tensor([weight_bits]))
            qmax = 2 ** (weight_bits - 1) - 1
            qmin = -2 ** (weight_bits - 1)
            # initialize the learnable scale from the mean absolute weight: 2 * mean(|w|) / sqrt(qmax)
            init_weight_scale = layer.module.weight.data.detach().abs().mean() * 2 / (qmax ** 0.5)
            layer.module.weight_scale = torch.nn.Parameter(init_weight_scale)
            layer.module.weight_qmax = qmax
            layer.module.weight_qmin = qmin
            self.optimizer.add_param_group({"params": layer.module.weight_scale})

        if "output" in config.get("quant_types", []):
            # scale of output will be initialized using the first batch of data
            layer.module.register_parameter("output_scale", torch.nn.Parameter(torch.Tensor([1.0])))
            output_bits = get_bits_length(config, "output")
            layer.module.register_buffer('output_bits', torch.Tensor([output_bits]))
            qmax = 2 ** (output_bits - 1) - 1
            qmin = -2 ** (output_bits - 1)
            layer.module.output_qmax = qmax
            layer.module.output_qmin = qmin
            self.optimizer.add_param_group({"params": layer.module.output_scale})

        if "input" in config.get("quant_types", []):
            # scale of input will be initialized using the first batch of data
            layer.module.register_parameter("input_scale", torch.nn.Parameter(torch.Tensor([1.0])))
            input_bits = get_bits_length(config, "input")
            layer.module.register_buffer('input_bits', torch.Tensor([input_bits]))
            qmax = 2 ** (input_bits - 1) - 1
            qmin = -2 ** (input_bits - 1)
            layer.module.input_qmax = qmax
            layer.module.input_qmin = qmin
            self.optimizer.add_param_group({"params": layer.module.input_scale})

    self.bound_model.to(device)
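
# Illustrative sketch only (not part of the quantizer): shows an example config_list using
# the documented keys, plus the signed quantization range and the initial weight scale
# computed by the __init__ above. The tensor is random demo data and the names below are
# hypothetical, chosen for this example.
_example_config_list = [{
    'quant_types': ['weight', 'output'],
    'quant_bits': {'weight': 8, 'output': 8},
    'quant_start_step': 0,
    'op_types': ['Conv2d'],
}]

def _demo_weight_scale_init(bits=8):
    import torch
    qmax = 2 ** (bits - 1) - 1            # 127 for 8-bit signed quantization
    qmin = -2 ** (bits - 1)               # -128 for 8-bit signed quantization
    weight = torch.randn(16, 3, 3, 3)     # stand-in for layer.module.weight.data
    init_scale = weight.detach().abs().mean() * 2 / (qmax ** 0.5)
    return qmin, qmax, init_scale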