Code Example #1
    def __init__(self, model, config_list, optimizer, dummy_input=None):
        assert isinstance(optimizer, torch.optim.Optimizer), "unrecognized optimizer type"
        super().__init__(model, config_list, optimizer, dummy_input)
        device = next(model.parameters()).device
        self.quant_grad = QuantForward()
        modules_to_compress = self.get_modules_to_compress()
        self.bound_model.register_buffer("steps", torch.Tensor([1]))
        for layer, config in modules_to_compress:
            if "weight" in config.get("quant_types", []):
                layer.module.register_parameter("weight_scale", torch.nn.Parameter(torch.Tensor([1.0])))
                # TODO: support per-channel quantization for weights, since TensorRT uses it for conv weights
                weight_bits = get_bits_length(config, "weight")
                layer.module.register_buffer('weight_bits', torch.Tensor([weight_bits]))
                qmax = 2 ** (weight_bits - 1) - 1
                qmin = -2 ** (weight_bits - 1)
                init_weight_scale = layer.module.weight.data.detach().abs().mean() * 2 / (qmax ** 0.5)
                layer.module.weight_scale = torch.nn.Parameter(init_weight_scale)
                layer.module.weight_qmax = qmax
                layer.module.weight_qmin = qmin

                self.optimizer.add_param_group({"params": layer.module.weight_scale})

            if "output" in config.get("quant_types", []):
                # scale of output will be initialized using the first batch of data
                layer.module.register_parameter("output_scale", torch.nn.Parameter(torch.Tensor([1.0])))
                output_bits = get_bits_length(config, "output")
                layer.module.register_buffer('output_bits', torch.Tensor([output_bits]))
                qmax = 2 ** (output_bits - 1) - 1
                qmin = -2 ** (output_bits - 1)
                layer.module.output_qmax = qmax
                layer.module.output_qmin = qmin

                self.optimizer.add_param_group({"params": layer.module.output_scale})

            if "input" in config.get("quant_types", []):
                # scale of input will be initialized using the first batch of data
                layer.module.register_parameter("input_scale", torch.nn.Parameter(torch.Tensor([1.0])))
                input_bits = get_bits_length(config, "input")
                layer.module.register_buffer('input_bits', torch.Tensor([input_bits]))
                qmax = 2 ** (input_bits - 1) - 1
                qmin = -2 ** (input_bits - 1)
                layer.module.input_qmax = qmax
                layer.module.input_qmin = qmin

                self.optimizer.add_param_group({"params": layer.module.input_scale})

        self.bound_model.to(device)
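
In the example above, qmax and qmin define a signed b-bit range and the weight scale is initialized to 2 * mean(|w|) / sqrt(qmax) before being added to the optimizer as a learnable parameter. A minimal standalone sketch of that computation, assuming an 8-bit width and a hypothetical Conv2d layer (neither is taken from the example):

import torch

# Assumed signed 8-bit range: qmax = 127, qmin = -128.
weight_bits = 8
qmax = 2 ** (weight_bits - 1) - 1
qmin = -2 ** (weight_bits - 1)

# Hypothetical layer, used only to demonstrate the scale-initialization heuristic.
conv = torch.nn.Conv2d(3, 16, kernel_size=3)
init_weight_scale = conv.weight.data.detach().abs().mean() * 2 / (qmax ** 0.5)
print(qmin, qmax, init_weight_scale.item())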
Code Example #2
File: dorefa_quantizer.py  Project: yinfupai/nni
    def __init__(self, model, config_list, optimizer):
        assert isinstance(optimizer,
                          torch.optim.Optimizer), "unrecognized optimizer type"
        super().__init__(model, config_list, optimizer)
        device = next(model.parameters()).device
        modules_to_compress = self.get_modules_to_compress()
        for layer, config in modules_to_compress:
            if "weight" in config.get("quant_types", []):
                weight_bits = get_bits_length(config, 'weight')
                layer.module.register_buffer('weight_bits',
                                             torch.Tensor([int(weight_bits)]))
        self.bound_model.to(device)
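
This variant only records the configured bit width for each compressed layer. Registering it with register_buffer (rather than as a plain attribute) means the value moves with the module on .to(device) and is saved in the module's state_dict. A minimal sketch of that behavior, using a hypothetical module and an assumed 8-bit width:

import torch

m = torch.nn.Conv2d(3, 8, kernel_size=3)
m.register_buffer('weight_bits', torch.Tensor([8]))

print('weight_bits' in m.state_dict())  # True: buffers are serialized with the module
print(m.weight_bits)                    # tensor([8.]); the buffer moves with m on .to(device)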
Code Example #3
    def __init__(self, model, config_list, optimizer, dummy_input=None):
        """
        Parameters
        ----------
        model : torch.nn.Module
            the model to be quantized
        config_list : list of dict
            list of configurations for quantization
            supported keys for dict:
                - quant_types : list of string
                    types of quantization you want to apply; currently 'weight', 'input' and 'output' are supported
                - quant_bits : int or dict of {str : int}
                    bit length of quantization; the key is the quantization type and the value is the bit length, e.g. {'weight': 8}.
                    When an int is given, all quantization types share the same bit length
                - quant_start_step : int
                    disable quantization until the model has been run for this many steps. This lets the network reach a more stable
                    state, where the output quantization ranges no longer exclude a significant fraction of values. Default value is 0
                - op_types : list of string
                    types of nn.Module to which quantization should be applied, e.g. 'Conv2d'
        dummy_input : tuple of tensor
            inputs to the model, which are used to trace the graph of the module. The graph is used to find
            Conv-BN patterns so that batch normalization folding can be enabled. If dummy_input is not
            given, batch normalization folding is disabled.
        """
        assert isinstance(optimizer,
                          torch.optim.Optimizer), "unrecognized optimizer type"
        super().__init__(model, config_list, optimizer, dummy_input)
        device = next(model.parameters()).device
        self.quant_grad = QuantForward()
        modules_to_compress = self.get_modules_to_compress()
        self.bound_model.register_buffer("steps", torch.Tensor([1]))
        for layer, config in modules_to_compress:
            if "weight" in config.get("quant_types", []):
                layer.module.register_parameter(
                    "weight_scale", torch.nn.Parameter(torch.Tensor([1.0])))
                # TODO: support per-channel quantization for weights, since TensorRT uses it for conv weights
                weight_bits = get_bits_length(config, "weight")
                layer.module.register_buffer('weight_bits',
                                             torch.Tensor([weight_bits]))
                qmax = 2**(weight_bits - 1) - 1
                qmin = -2**(weight_bits - 1)
                init_weight_scale = layer.module.weight.data.detach().abs().mean() * 2 / (qmax**0.5)
                layer.module.weight_scale = torch.nn.Parameter(init_weight_scale)
                layer.module.weight_qmax = qmax
                layer.module.weight_qmin = qmin

                self.optimizer.add_param_group(
                    {"params": layer.module.weight_scale})

            if "output" in config.get("quant_types", []):
                # scale of output will be initialized using the first batch of data
                layer.module.register_parameter(
                    "output_scale", torch.nn.Parameter(torch.Tensor([1.0])))
                output_bits = get_bits_length(config, "output")
                layer.module.register_buffer('output_bits',
                                             torch.Tensor([output_bits]))
                qmax = 2**(output_bits - 1) - 1
                qmin = -2**(output_bits - 1)
                layer.module.output_qmax = qmax
                layer.module.output_qmin = qmin

                self.optimizer.add_param_group(
                    {"params": layer.module.output_scale})

            if "input" in config.get("quant_types", []):
                # scale of input will be initialized using the first batch of data
                layer.module.register_parameter(
                    "input_scale", torch.nn.Parameter(torch.Tensor([1.0])))
                input_bits = get_bits_length(config, "input")
                layer.module.register_buffer('input_bits',
                                             torch.Tensor([input_bits]))
                qmax = 2**(input_bits - 1) - 1
                qmin = -2**(input_bits - 1)
                layer.module.input_qmax = qmax
                layer.module.input_qmin = qmin

                self.optimizer.add_param_group(
                    {"params": layer.module.input_scale})

        self.bound_model.to(device)
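
Going by the docstring above, a minimal sketch of how the configuration could be assembled; the model, optimizer, tensor shapes, and the commented-out quantizer class name are assumptions for illustration only:

import torch

# Hypothetical model and optimizer, purely for illustration.
model = torch.nn.Sequential(torch.nn.Conv2d(3, 16, 3), torch.nn.BatchNorm2d(16), torch.nn.ReLU())
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

# config_list using the keys documented in the docstring above.
config_list = [{
    'quant_types': ['weight', 'output'],
    'quant_bits': {'weight': 8, 'output': 8},  # a single int would apply to all quant_types
    'quant_start_step': 100,                   # quantization stays disabled for the first 100 steps
    'op_types': ['Conv2d'],
}]

# dummy_input lets the quantizer trace the graph and fold Conv-BN pairs; omit it to leave folding disabled.
dummy_input = (torch.randn(1, 3, 32, 32),)

# quantizer = SomeQuantizer(model, config_list, optimizer, dummy_input=dummy_input)  # hypothetical class name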