Beispiel #1
0
    def test_fake_quant_quant_per_channel_bias(self):
        """QuantConv3d with bias matches F.conv3d on manually fake-quantized tensors.

        The reference path fake-quantizes the input with a single amax (its
        absolute maximum) and the weight with one amax per slice along dim 0,
        then runs a plain ``F.conv3d`` with the layer's bias.
        """
        layer = quant_conv.QuantConv3d(
            _NUM_IN_CHANNELS,
            _NUM_OUT_CHANNELS,
            3,
            bias=True,
            quant_desc_weight=QuantDescriptor(axis=0))
        inputs = torch.randn(8, _NUM_IN_CHANNELS, 8, 8, 8)

        # Reference input: one scale for the whole tensor.
        ref_input = tensor_quant.fake_tensor_quant(inputs, inputs.abs().max())

        # Reference weight: flatten everything but dim 0, take the row-wise
        # absolute maximum and reshape it so it broadcasts against the weight.
        weight = layer.weight.clone()
        flat_abs = weight.abs().view(_NUM_OUT_CHANNELS, -1)
        per_channel_amax = flat_abs.max(dim=1, keepdim=True)[0]
        per_channel_amax = per_channel_amax.view(_NUM_OUT_CHANNELS, 1, 1, 1, 1)
        ref_weight = tensor_quant.fake_tensor_quant(weight, per_channel_amax)

        expected = F.conv3d(ref_input, ref_weight, bias=layer.bias)
        actual = layer(inputs)
        np.testing.assert_array_equal(expected.detach().cpu().numpy(),
                                      actual.detach().cpu().numpy())
    def test_cuda_ext(self):
        """cuda_ext.fake_tensor_quant agrees exactly with tensor_quant.fake_tensor_quant.

        Checked for several bit widths, signed and unsigned, on float32 data,
        and once with default arguments on float16 data.
        """
        x = torch.Tensor(np.random.rand(1023).astype('float32')).cuda()

        for num_bits in (3, 4, 5, 7, 8, 11):
            for unsigned in (True, False):
                ext_out = cuda_ext.fake_tensor_quant(
                    x, torch.max(torch.abs(x)), num_bits, unsigned)
                ref_out = tensor_quant.fake_tensor_quant(
                    x, torch.max(torch.abs(x)), num_bits, unsigned)
                # Zero tolerances: the two implementations must match exactly.
                test_utils.compare(ext_out, ref_out, rtol=0, atol=0)

        # Same comparison in half precision.
        x_half = torch.Tensor(np.random.rand(1023).astype('float16')).cuda().half()
        ext_half = cuda_ext.fake_tensor_quant(x_half,
                                              torch.max(torch.abs(x_half)))
        ref_half = tensor_quant.fake_tensor_quant(x_half,
                                                  torch.max(torch.abs(x_half)))
        test_utils.compare(ext_half, ref_half, rtol=0, atol=0)
Beispiel #3
0
    def test_fake_quant_per_channel_bias(self):
        """QuantConv2d with bias and per-channel weight descriptor matches F.conv2d.

        The reference output is built from an input fake-quantized with its
        absolute maximum and a weight fake-quantized with one amax per slice
        along dim 0.
        """
        conv = quant_conv.QuantConv2d(
            _NUM_IN_CHANNELS,
            _NUM_OUT_CHANNELS,
            3,
            bias=True,
            quant_desc_weight=tensor_quant.QUANT_DESC_8BIT_CONV2D_WEIGHT_PER_CHANNEL)
        batch = torch.randn(16, _NUM_IN_CHANNELS, 16, 16)

        input_amax = batch.abs().max()
        ref_input = tensor_quant.fake_tensor_quant(batch, input_amax)

        weight = conv.weight.clone()
        # Row-wise abs max over the flattened (C, R, S) part of each filter,
        # reshaped to broadcast over the 4-D weight.
        rowwise_max = weight.abs().view(_NUM_OUT_CHANNELS, -1).max(dim=1, keepdim=True)[0]
        weight_amax = rowwise_max.view(_NUM_OUT_CHANNELS, 1, 1, 1)
        ref_weight = tensor_quant.fake_tensor_quant(weight, weight_amax)

        expected = F.conv2d(ref_input, ref_weight, bias=conv.bias)
        actual = conv(batch)
        np.testing.assert_array_equal(expected.detach().cpu().numpy(),
                                      actual.detach().cpu().numpy())
Beispiel #4
0
    def test_fake_quant_per_channel_bias(self):
        """QuantConvTranspose3d with bias matches F.conv_transpose3d on fake-quantized tensors.

        The weight amax is obtained by reducing over axes (0, 2, 3, 4), i.e.
        every axis except dim 1.
        """
        layer = quant_conv.QuantConvTranspose3d(
            _NUM_IN_CHANNELS,
            _NUM_OUT_CHANNELS,
            3,
            bias=True,
            quant_desc_weight=tensor_quant.QUANT_DESC_8BIT_CONVTRANSPOSE3D_WEIGHT_PER_CHANNEL)
        inputs = torch.randn(2, _NUM_IN_CHANNELS, 2, 2, 2)

        ref_input = tensor_quant.fake_tensor_quant(inputs, inputs.abs().max())

        weight = layer.weight.clone()
        weight_amax = quant_utils.reduce_amax(weight, axis=(0, 2, 3, 4))
        ref_weight = tensor_quant.fake_tensor_quant(weight, weight_amax)

        expected = F.conv_transpose3d(ref_input, ref_weight, bias=layer.bias)
        actual = layer(inputs)
        np.testing.assert_array_equal(expected.detach().cpu().numpy(),
                                      actual.detach().cpu().numpy())
Beispiel #5
0
    def test_fake_quant_per_tensor(self):
        """QuantConv1d with a default weight QuantDescriptor matches F.conv1d.

        Both the input and the weight of the reference path are fake-quantized
        with a single amax each (their absolute maximum).
        """
        conv = quant_conv.QuantConv1d(
            _NUM_IN_CHANNELS,
            _NUM_OUT_CHANNELS,
            3,
            bias=False,
            quant_desc_weight=QuantDescriptor())
        signal = torch.randn(16, _NUM_IN_CHANNELS, 16)

        ref_input = tensor_quant.fake_tensor_quant(signal, signal.abs().max())

        weight = conv.weight.clone()
        ref_weight = tensor_quant.fake_tensor_quant(weight, weight.abs().max())

        expected = F.conv1d(ref_input, ref_weight)
        actual = conv(signal)
        np.testing.assert_array_equal(expected.detach().cpu().numpy(),
                                      actual.detach().cpu().numpy())
Beispiel #6
0
    def _compute_amax_mse(self, stride, start_bin):
        """Return the amax that minimizes the MSE over the collected histogram.

        Every ``stride``-th histogram bin center, starting at index
        ``start_bin``, is tried as a candidate amax. For each candidate the
        bin centers are fake-quantized and the count-weighted squared error
        against the unquantized centers is averaged; the candidate with the
        smallest error wins.

        Args:
            stride: Step between candidate bin indices.
            start_bin: Index of the first candidate bin.

        Returns:
            The winning bin center (an element of the ``centers`` tensor), or
            ``None`` when no histogram has been collected yet.
        """
        # Both arrays are dereferenced below, so bail out when EITHER one is
        # missing instead of only when both are.
        if self._calib_bin_edges is None or self._calib_hist is None:
            return None

        counts = torch.from_numpy(self._calib_hist[:]).float()
        edges = torch.from_numpy(self._calib_bin_edges[:]).float()
        # Midpoint of every pair of adjacent bin edges.
        centers = (edges[1:] + edges[:-1]) / 2

        mses = []
        arguments = []

        for i in range(start_bin, len(centers), stride):
            amax = centers[i]
            quant_centers = fake_tensor_quant(centers, amax, self._num_bits,
                                              self._unsigned)

            # Squared quantization error of each center, weighted by how many
            # samples fell into its bin.
            mses.append(((quant_centers - centers)**2 * counts).mean())
            arguments.append(i)

        # Lazy %-style arguments: the list is only rendered if DEBUG is enabled.
        logging.debug("mses=%s", mses)
        argmin = np.argmin(mses)

        return centers[arguments[argmin]]
    def test_per_channel_scale(self):
        """fake_tensor_quant with a per-channel amax agrees with the numpy reference.

        The amax has shape [K, 1, 1, 1] (one value per slice along dim 0) and
        is shrunk by 10% so that clipping is exercised as well.
        """
        x_np = np.random.rand(15, 15, 64, 128).astype('float32')
        x_torch = torch.Tensor(x_np).cuda()

        amax_np = 0.9 * np.max(np.abs(x_np), axis=(1, 2, 3), keepdims=True)

        # torch.max reduces a single dim per call and returns a (values,
        # indices) pair, so the three trailing dims are reduced one by one.
        reduced = x_torch
        for dim in (1, 2, 3):
            reduced = torch.max(reduced, dim=dim, keepdim=True)[0]
        amax_torch = 0.9 * reduced

        quant_x_np = test_utils.quant_np(x_np, amax_np, fake=True)
        quant_x_torch = tensor_quant.fake_tensor_quant(x_torch, amax_torch)

        # PyTorch and numpy numerics differ slightly, so exact equality is not
        # required; agreement to two decimals is.
        np.testing.assert_array_almost_equal(quant_x_torch.cpu().numpy(),
                                             quant_x_np,
                                             decimal=2)
        if verbose:
            torch_result = quant_x_torch.cpu().numpy()
            mismatches = np.where(np.abs(torch_result - quant_x_np) >= 1e-5)
            print("Mismatches:")
            print(" Original: ", x_np[mismatches])
            print(" numpy: ", quant_x_np[mismatches])
            print(" Pytorch: ", quant_x_torch.cpu().numpy()[mismatches])
 def test_overflow_fp16(self):
     """fake_tensor_quant on fp16 data with a very small amax yields no inf or nan."""
     x_torch = torch.randn(1023).cuda().half()
     tiny_amax = torch.tensor(1e-4).cuda().half()
     quant_x_torch = tensor_quant.fake_tensor_quant(x_torch, tiny_amax, 8,
                                                    False)
     has_inf = torch.isinf(quant_x_torch).any()
     has_nan = torch.isnan(quant_x_torch).any()
     assert not has_inf and not has_nan
Beispiel #9
0
def quantize_by_range_fused(x_tuple, num_bits):
    """Fake-quantize several tensors to ``num_bits`` using one shared amax.

    The shared amax is the largest absolute value found in any of the tensors.

    Args:
        x_tuple: Tensors to quantize together.
        num_bits: Bit width passed to ``tensor_quant.fake_tensor_quant``.

    Returns:
        A tuple with one fake-quantized tensor per entry of ``x_tuple``, in
        the same order.
    """
    # Largest magnitude across the whole group.
    shared_amax = max(tensor.abs().max() for tensor in x_tuple)

    quantized = []
    for tensor in x_tuple:
        quantized.append(
            tensor_quant.fake_tensor_quant(tensor, shared_amax, num_bits))
    return tuple(quantized)
Beispiel #10
0
 def test_simple_run(self):
     """A default TensorQuantizer produces the same output as fake_tensor_quant.

     The functional reference uses the absolute maximum of the input as amax.
     """
     x = torch.randn(3, 7).cuda()
     expected = tensor_quant.fake_tensor_quant(x, torch.max(torch.abs(x)))
     actual = tensor_quantizer.TensorQuantizer()(x)
     np.testing.assert_array_equal(expected.cpu().numpy(),
                                   actual.cpu().numpy())
 def test_clip_gradient(self):
     """The input gradient is non-zero exactly where the input lies in [-amax, amax].

     amax is set to half of the absolute maximum so that part of the input
     falls outside the range.
     """
     x = torch.randn(3, 7, requires_grad=True).cuda()
     # x is the result of .cuda(), so ask autograd to keep its gradient.
     x.retain_grad()
     amax = x.abs().max() / 2
     inside_range = (x >= -amax) & (x <= amax)

     quant_x = tensor_quant.fake_tensor_quant(x, amax, 8)
     loss = (quant_x - 0.5).pow(2).sum()
     loss.backward()

     grad_is_nonzero = x.grad.cpu().numpy() != 0
     np.testing.assert_array_equal(grad_is_nonzero,
                                   inside_range.cpu().numpy())
 def test_per_tensor_scale(self):
     """fake_tensor_quant with a single amax matches the numpy reference.

     Both implementations use the absolute maximum of the input as amax and
     are compared with ``assert_array_almost_equal``.
     """
     x_np = np.random.rand(13).astype('float32')
     x_torch = torch.Tensor(x_np).cuda()
     quant_x_np = test_utils.quant_np(x_np, np.max(np.abs(x_np)), fake=True)
     quant_x_torch = tensor_quant.fake_tensor_quant(
         x_torch, torch.max(torch.abs(x_torch)))
     np.testing.assert_array_almost_equal(quant_x_torch.cpu().numpy(),
                                          quant_x_np)
 def test_unsigned(self):
     """Unsigned 8-bit fake_tensor_quant matches the numpy reference run with num_bits=9."""
     data_np = np.random.rand(1023).astype('float32')
     data_torch = torch.Tensor(data_np).cuda()

     # NOTE(review): the reference is called with 9 bits while torch uses 8
     # unsigned bits -- presumably because both give the same number of
     # positive levels; confirm against test_utils.quant_np.
     expected = test_utils.quant_np(data_np,
                                    np.max(np.abs(data_np)),
                                    num_bits=9,
                                    fake=True)
     amax = torch.max(torch.abs(data_torch))
     actual = tensor_quant.fake_tensor_quant(data_torch, amax, 8, True)
     np.testing.assert_array_almost_equal(actual.cpu().numpy(), expected)
    def test_fake_quant_per_channel(self):
        """QuantConvTranspose1d with a weight descriptor on axis 1 matches F.conv_transpose1d.

        The reference weight amax is reduced over axes (0, 2), i.e. every axis
        except dim 1.
        """
        layer = quant_conv.QuantConvTranspose1d(
            _NUM_IN_CHANNELS,
            _NUM_OUT_CHANNELS,
            3,
            bias=False,
            quant_desc_weight=QuantDescriptor(axis=1))
        signal = torch.randn(16, _NUM_IN_CHANNELS, 16)

        ref_input = tensor_quant.fake_tensor_quant(signal, signal.abs().max())

        weight = layer.weight.clone()
        weight_amax = quant_utils.reduce_amax(weight, axis=(0, 2))
        ref_weight = tensor_quant.fake_tensor_quant(weight, weight_amax)

        expected = F.conv_transpose1d(ref_input, ref_weight)
        actual = layer(signal)
        np.testing.assert_array_equal(expected.detach().cpu().numpy(),
                                      actual.detach().cpu().numpy())
    def test_fake_quant_per_tensor(self):
        """QuantLinear with a default weight QuantDescriptor matches F.linear.

        Input and weight of the reference path are each fake-quantized with a
        single amax (their absolute maximum).
        """
        in_features, out_features = 255, 257
        linear = quant_linear.QuantLinear(
            in_features,
            out_features,
            bias=False,
            quant_desc_weight=tensor_quant.QuantDescriptor())
        batch = torch.randn(32, in_features)

        weight = linear.weight.clone()
        ref_input = tensor_quant.fake_tensor_quant(batch, batch.abs().max())
        ref_weight = tensor_quant.fake_tensor_quant(weight, weight.abs().max())

        expected = F.linear(ref_input, ref_weight)
        actual = linear(batch)
        np.testing.assert_array_equal(expected.detach().cpu().numpy(),
                                      actual.detach().cpu().numpy())
 def test_backward(self):
     """The gradient reaching the input equals the gradient of the quantized output.

     This is the straight-through behaviour expected from fake_tensor_quant
     on the backward pass.
     """
     x = torch.randn(3, 7, requires_grad=True).cuda()
     targets = torch.randint(6, (3, )).type(torch.LongTensor).cuda()

     quant_x = tensor_quant.fake_tensor_quant(x, torch.max(torch.abs(x)), 7)
     # Neither tensor is a leaf, so their gradients must be retained explicitly.
     for tensor in (x, quant_x):
         tensor.retain_grad()

     loss_fn = torch.nn.CrossEntropyLoss().cuda()
     loss_fn(quant_x, targets).backward()

     np.testing.assert_array_equal(quant_x.grad.cpu().numpy(),
                                   x.grad.cpu().numpy())
Beispiel #17
0
    def test_input_fake_quant(self):
        """QuantAdaptiveAvgPool3d equals F.adaptive_avg_pool3d on a fake-quantized input."""
        pool = quant_pooling.QuantAdaptiveAvgPool3d(output_size=3)
        volume = torch.randn(5, 5, 5, 5, dtype=torch.double)

        # Reference: quantize with the absolute maximum, then pool.
        ref_input = tensor_quant.fake_tensor_quant(volume, volume.abs().max())
        expected = F.adaptive_avg_pool3d(ref_input, 3)

        actual = pool(volume)
        np.testing.assert_array_equal(expected.detach().cpu().numpy(),
                                      actual.detach().cpu().numpy())
Beispiel #18
0
    def test_input_fake_quant(self):
        """QuantMaxPool2d equals F.max_pool2d on a fake-quantized input."""
        pool = quant_pooling.QuantMaxPool2d(kernel_size=3, stride=1)
        image = torch.randn(1, 5, 5, 5, dtype=torch.double)

        # Reference: quantize with the absolute maximum, then pool with the
        # same kernel size and stride as the quantized module.
        ref_input = tensor_quant.fake_tensor_quant(image, image.abs().max())
        expected = F.max_pool2d(ref_input,
                                kernel_size=3,
                                stride=1,
                                padding=0,
                                dilation=1,
                                ceil_mode=False,
                                return_indices=False)

        actual = pool(image)
        np.testing.assert_array_equal(expected.detach().cpu().numpy(),
                                      actual.detach().cpu().numpy())
    def test_cuda_ext_with_axis(self):
        """cuda_ext.fake_tensor_quant_with_axis agrees exactly with a broadcast amax.

        The extension receives a 1-D amax plus the axis index 1; the reference
        receives the same amax reshaped to (1, -1, 1, 1).
        """
        x = torch.Tensor(np.random.rand(3, 4, 5, 6).astype('float32')).cuda()

        # One amax per index along axis 1 (size 4).
        amax = torch.tensor([0.8, 0.9, 0.7, 0.6], device="cuda")
        broadcast_amax = amax.view(1, -1, 1, 1)

        for num_bits in (3, 4, 5, 7, 8, 11):
            for unsigned in (True, False):
                ext_out = cuda_ext.fake_tensor_quant_with_axis(
                    x, amax, 1, num_bits, unsigned)
                ref_out = tensor_quant.fake_tensor_quant(
                    x, broadcast_amax, num_bits, unsigned)
                test_utils.compare(ext_out, ref_out, rtol=0, atol=0)
    def test_fake_quant_per_channel(self):
        """QuantLinear with the per-row weight descriptor matches F.linear.

        The reference input uses a single amax; the reference weight uses one
        amax per row (reduction over dim 1 with keepdim).
        """
        in_features, out_features = 255, 257
        linear = quant_linear.QuantLinear(
            in_features,
            out_features,
            bias=False,
            quant_desc_weight=tensor_quant.QUANT_DESC_8BIT_LINEAR_WEIGHT_PER_ROW)
        batch = torch.randn(32, in_features)

        weight = linear.weight.clone()
        ref_input = tensor_quant.fake_tensor_quant(batch, batch.abs().max())
        row_amax = weight.abs().max(dim=1, keepdim=True)[0]
        ref_weight = tensor_quant.fake_tensor_quant(weight, row_amax)

        expected = F.linear(ref_input, ref_weight)
        actual = linear(batch)
        np.testing.assert_array_equal(expected.detach().cpu().numpy(),
                                      actual.detach().cpu().numpy())
 def test_full_range(self):
     """fake_tensor_quant with the narrow-range flag off matches the numpy reference.

     The numpy reference is called with ``narrow_range=False`` and
     ``num_bits=9``; the torch call passes 8 bits, unsigned, and ``False`` as
     its last positional argument.
     """
     data_np = np.random.rand(1023).astype('float32')
     data_torch = torch.Tensor(data_np).cuda()

     amax_np = np.max(np.abs(data_np))
     expected = test_utils.quant_np(data_np,
                                    amax_np,
                                    num_bits=9,
                                    fake=True,
                                    narrow_range=False)

     amax_torch = torch.max(torch.abs(data_torch))
     actual = tensor_quant.fake_tensor_quant(data_torch, amax_torch, 8, True,
                                             False)
     np.testing.assert_array_almost_equal(actual.cpu().numpy(), expected)
Beispiel #22
0
    def test_fake_quant_input(self):
        """With the weight quantizer disabled, QuantConv1d only fake-quantizes its input.

        The reference is ``F.conv1d`` on the fake-quantized input and the
        layer's unmodified weight.
        """
        conv = quant_conv.QuantConv1d(_NUM_IN_CHANNELS,
                                      _NUM_OUT_CHANNELS,
                                      3,
                                      bias=False)
        conv.weight_quantizer.disable()
        signal = torch.randn(20, _NUM_IN_CHANNELS, 50)

        ref_input = tensor_quant.fake_tensor_quant(signal, signal.abs().max())

        expected = F.conv1d(ref_input, conv.weight)
        actual = conv(signal)
        np.testing.assert_array_equal(expected.detach().cpu().numpy(),
                                      actual.detach().cpu().numpy())
    def test_test_input_fake_per_tensor(self):
        """With the weight quantizer disabled, QuantLinear only fake-quantizes its input.

        The reference is ``F.linear`` on the fake-quantized input and a copy
        of the layer's weight.
        """
        in_features, out_features = 255, 257
        linear = quant_linear.QuantLinear(in_features, out_features, bias=False)
        linear.weight_quantizer.disable()
        batch = torch.randn(32, in_features)

        plain_weight = linear.weight.clone()
        ref_input = tensor_quant.fake_tensor_quant(batch, batch.abs().max())

        expected = F.linear(ref_input, plain_weight)
        actual = linear(batch)
        np.testing.assert_array_equal(expected.detach().cpu().numpy(),
                                      actual.detach().cpu().numpy())
    def test_fake_quant_per_tensor(self):
        """QuantInstanceNorm1d equals F.instance_norm on a fake-quantized input.

        The functional reference reuses the module's running statistics and
        affine parameters.
        """
        norm = quant_instancenorm.QuantInstanceNorm1d(
            NUM_CHANNELS, affine=True, quant_desc_input=QuantDescriptor())

        features = torch.randn(8, NUM_CHANNELS, 128)
        ref_input = tensor_quant.fake_tensor_quant(features,
                                                   features.abs().max())

        actual = norm(features)
        expected = F.instance_norm(ref_input, norm.running_mean,
                                   norm.running_var, norm.weight, norm.bias)
        np.testing.assert_array_equal(actual.detach().cpu().numpy(),
                                      expected.detach().cpu().numpy())
Beispiel #25
0
    def test_input_variable_bits(self):
        """QuantMaxPool2d honours the default input descriptor for several bit widths.

        For each bit width the class-level default input descriptor is
        replaced, a fresh module is built, and its output is compared with
        ``F.max_pool2d`` on an input fake-quantized with the same bit width.
        """
        # NOTE(review): the class-level default descriptor is left at the last
        # value after this test -- confirm other tests do not depend on it.
        for num_bits in (2, 4, 6):
            descriptor = tensor_quant.QuantDescriptor(num_bits=num_bits)
            quant_pooling.QuantMaxPool2d.set_default_quant_desc_input(descriptor)

            pool = quant_pooling.QuantMaxPool2d(kernel_size=3, stride=1)
            image = torch.randn(1, 5, 5, 5, dtype=torch.double)

            ref_input = tensor_quant.fake_tensor_quant(image,
                                                       image.abs().max(),
                                                       num_bits)

            expected = F.max_pool2d(ref_input,
                                    kernel_size=3,
                                    stride=1,
                                    padding=0,
                                    dilation=1,
                                    ceil_mode=False,
                                    return_indices=False)
            actual = pool(image)
            np.testing.assert_array_equal(expected.detach().cpu().numpy(),
                                          actual.detach().cpu().numpy())
    def test_weight_fake_per_tensor(self):
        """With the input quantizer disabled, QuantLinear only fake-quantizes its weight.

        The weight descriptor uses ``axis=None`` and the reference weight is
        fake-quantized with a single amax. Runs inside the CUDA device 0
        context.
        """
        with torch.cuda.device(0):
            features = 256
            linear = quant_linear.QuantLinear(
                features,
                features,
                bias=False,
                quant_desc_weight=tensor_quant.QuantDescriptor(axis=None))
            linear.input_quantizer.disable()
            batch = torch.randn(features, features)

            weight = linear.weight.clone()
            ref_weight = tensor_quant.fake_tensor_quant(weight,
                                                        weight.abs().max())

            expected = F.linear(batch, ref_weight)
            actual = linear(batch)
            np.testing.assert_array_equal(expected.detach().cpu().numpy(),
                                          actual.detach().cpu().numpy())
    def test_weight_fake_quant_per_channel(self):
        """With the input quantizer disabled, QuantConvTranspose2d only fake-quantizes its weight.

        The reference weight amax is reduced over axes (0, 2, 3), i.e. every
        axis except dim 1.
        """
        layer = quant_conv.QuantConvTranspose2d(
            _NUM_IN_CHANNELS,
            _NUM_OUT_CHANNELS,
            3,
            bias=False,
            quant_desc_weight=tensor_quant.QUANT_DESC_8BIT_CONVTRANSPOSE2D_WEIGHT_PER_CHANNEL)
        layer.input_quantizer.disable()
        images = torch.randn(16, _NUM_IN_CHANNELS, 256, 256)

        weight = layer.weight.clone()
        weight_amax = quant_utils.reduce_amax(weight, axis=(0, 2, 3))
        ref_weight = tensor_quant.fake_tensor_quant(weight, weight_amax)

        expected = F.conv_transpose2d(images, ref_weight)
        actual = layer(images)
        np.testing.assert_array_equal(expected.detach().cpu().numpy(),
                                      actual.detach().cpu().numpy())
    def _quant_forward(self, inputs):
        """Quantized forward pass.

        Determines amax (from the learned clip bounds or from
        ``self._get_amax``) and then applies either fake quantization or real
        quantization; the real-quantization path also stores the returned
        scale in ``self._scale``.
        """
        if not self._learn_amax:
            amax = self._get_amax(inputs)
        else:
            # Learned range: clip first, then take the larger of the two
            # bounds' magnitudes, detached from the graph.
            inputs = self.clip(inputs)
            amax = torch.max(-self.clip.clip_value_min,
                             self.clip.clip_value_max).detach()

        if not self._fake_quant:
            # NOTE(review): self._narrow_range is not forwarded here, unlike
            # the fake-quant path below -- confirm this is intended.
            outputs, self._scale = tensor_quant(inputs, amax, self._num_bits,
                                                self._unsigned)
            return outputs

        if TensorQuantizer.use_fb_fake_quant:
            return self._fb_fake_quant(inputs, amax)

        return fake_tensor_quant(inputs, amax, self._num_bits, self._unsigned,
                                 self._narrow_range)
Beispiel #29
0
def copy_state_and_quantize_fused(dst, src, num_bits):
    """Load ``src``'s state into ``dst``, fake-quantizing every 'weight' entry.

    All state-dict entries whose key contains 'weight' are fake-quantized to
    ``num_bits`` with one shared amax: the largest absolute value found in any
    of them. Every other entry is cloned unchanged.

    Args:
        dst: Module that receives the resulting state dict.
        src: Module whose state dict is read.
        num_bits: Bit width passed to ``tensor_quant.fake_tensor_quant``.
    """
    source_state = src.state_dict()

    # Shared amax over all weight tensors; the leading 0 is the value used
    # when there is no weight entry at all.
    weight_maxima = [
        tensor.abs().max()
        for name, tensor in source_state.items()
        if 'weight' in name
    ]
    shared_amax = max([0] + weight_maxima)

    quantized_state = {}
    for name, tensor in source_state.items():
        if 'weight' in name:
            quantized_state[name] = tensor_quant.fake_tensor_quant(
                tensor, shared_amax, num_bits)
        else:
            quantized_state[name] = tensor.clone()

    dst.load_state_dict(quantized_state)
    def test_weight_fake_per_channel(self):
        """With the input quantizer disabled, QuantLinear quantizes its weight per row.

        The reference weight amax is reduced over axis 1 with ``keepdims=True``.
        """
        in_features, out_features = 255, 257
        linear = quant_linear.QuantLinear(
            in_features,
            out_features,
            bias=False,
            quant_desc_weight=tensor_quant.QUANT_DESC_8BIT_LINEAR_WEIGHT_PER_ROW)
        linear.input_quantizer.disable()
        batch = torch.randn(32, in_features)

        weight = linear.weight.clone()
        row_amax = quant_utils.reduce_amax(weight, axis=1, keepdims=True)
        ref_weight = tensor_quant.fake_tensor_quant(weight, row_amax)

        expected = F.linear(batch, ref_weight)
        actual = linear(batch)
        np.testing.assert_array_equal(expected.detach().cpu().numpy(),
                                      actual.detach().cpu().numpy())