def test_s_prep_before_fusion(self):
        (
            mod,
            sparsifier,
            sparse_config,
        ) = _get_model_and_sparsifier_and_sparse_config(tq.get_default_qconfig("fbgemm"))
        sparsifier.prepare(mod, config=sparse_config)
        tq.fuse_modules(mod, [["5", "6"]], inplace=True)
        mod[5].qconfig = tq.get_default_qconfig("fbgemm")
        tq.prepare(mod, inplace=True)

        # check that correct modules had parametrizations added and
        # that none were lost during prepare or fusion
        self.assertTrue(hasattr(mod[0], "parametrizations"))
        self.assertTrue(hasattr(mod[5][0], "parametrizations"))

        # check that correct observers were inserted and that matching
        # occurred successfully
        self.assertTrue(hasattr(mod[5], "activation_post_process"))
        _squash_mask_calibrate_and_convert(
            mod, sparsifier, torch.randn(1, 4, 4, 4)
        )

        # check that final module is the expected quantized module and that the model runs
        self.assertTrue(isinstance(mod[5], torch.nn.intrinsic.quantized.LinearReLU))
        self.assertEqual(mod(torch.randn(1, 4, 4, 4)).shape, torch.Size([1, 4, 4, 4]))
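_squash_mask_calibrate_and_convert, used above and in later examples, is defined elsewhere in the original test file; a minimal sketch consistent with how it is called:

def _squash_mask_calibrate_and_convert(model, sparsifier, input):
    # fold the sparsity masks into the weights, run one calibration
    # pass through the observers, then convert to quantized modules
    sparsifier.squash_mask()
    model(input)
    tq.convert(model, inplace=True)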
Example #2
    def _get_model_and_sparsifier_and_sparse_config(qconfig=None):
        model = nn.Sequential(
            nn.Linear(4, 4),  # 0
            nn.ReLU(),
            nn.Linear(4, 4),  # 2
            nn.ReLU(),
            tq.QuantStub(),
            nn.Linear(4, 4),  # 5
            nn.ReLU(),
            tq.DeQuantStub(),
        )
        if qconfig is None:
            model[4].qconfig = tq.get_default_qconfig("fbgemm")
            model[5].qconfig = tq.get_default_qconfig("fbgemm")
        else:
            model[4].qconfig = qconfig
            model[5].qconfig = qconfig

        sparsifier = sparsity.WeightNormSparsifier(**sparse_defaults)

        sparse_config = [
            {
                "module": model[5],
                "sparsity_level": 0.7,
                "sparse_block_shape": (1, 4),
                "zeros_per_block": 4,
            },
            model[0],
        ]
        return model, sparsifier, sparse_config
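The helper above reads sparse_defaults, and later tests call _calculate_sparsity; both are defined elsewhere in the original file. Plausible definitions consistent with their usage here (the exact default values are assumptions):

import torch
import torch.nn as nn
import torch.ao.quantization as tq
from torch.ao import sparsity  # later renamed torch.ao.pruning

# assumed defaults for WeightNormSparsifier; the real values live
# elsewhere in the original test file
sparse_defaults = {
    "sparsity_level": 0.8,
    "sparse_block_shape": (1, 4),
    "zeros_per_block": 4,
}

def _calculate_sparsity(tensor):
    # fraction of exactly-zero elements in the tensor
    return ((tensor == 0).sum() / tensor.numel()).item()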
    def test_convert_without_squash_mask(self):
        (
            mod,
            sparsifier,
            sparse_config,
        ) = _get_model_and_sparsifier_and_sparse_config(tq.get_default_qconfig("fbgemm"))

        sparsifier.prepare(mod, config=sparse_config)
        tq.prepare(mod, inplace=True)

        # check that correct modules had parametrizations added and
        # that none were lost during prepare
        self.assertTrue(hasattr(mod[0], "parametrizations"))
        self.assertTrue(hasattr(mod[5], "parametrizations"))

        # check that correct observers were inserted and that matching
        # occurred successfully
        self.assertTrue(hasattr(mod[5], "activation_post_process"))
        sparsifier.step()
        sparsity_level = _calculate_sparsity(mod[5].weight)
        mod(torch.randn(1, 4, 4, 4))
        tq.convert(mod, inplace=True)

        # check that final module is the expected quantized module and that the model runs
        self.assertTrue(isinstance(mod[5], torch.nn.quantized.Linear))
        self.assertEqual(mod(torch.randn(1, 4, 4, 4)).shape, torch.Size([1, 4, 4, 4]))

        # check that module was actually sparsified
        cur_sparsity = _calculate_sparsity(mod[5]._weight_bias()[0])
        self.assertGreaterAlmostEqual(cur_sparsity, sparsity_level)
        self.assertGreaterAlmostEqual(
            sparsity_level, sparse_config[0]["sparsity_level"]
        )
        self.assertGreaterAlmostEqual(cur_sparsity, sparse_config[0]["sparsity_level"])
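assertGreaterAlmostEqual is a custom assertion on the test class rather than part of unittest; a sketch of the semantics these tests assume (greater than, or almost equal within rounding):

def assertGreaterAlmostEqual(self, first, second, places=7):
    # passes when first > second, or when the two are equal
    # up to `places` decimal places
    if first > second:
        return
    self.assertAlmostEqual(first, second, places=places)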
Example #4
    def graph_mode_quantize(self, inputs, data_loader, calibration_num_batches=64):
        """Quantize the model during export with graph mode quantization for linformer encoder."""
        if (
            isinstance(self.right_encoder, RoBERTaEncoder)
            and self.right_encoder.use_linformer_encoder
            and isinstance(self.left_encoder, RoBERTaEncoder)
            and self.left_encoder.use_linformer_encoder
        ):
            trace = self.trace(inputs)
            qconfig = get_default_qconfig("fbgemm")
            qconfig_dict = {"": qconfig}
            prepare_m = prepare_jit(trace, qconfig_dict, inplace=False)
            prepare_m.eval()
            with torch.no_grad():
                for i, (_, batch) in enumerate(data_loader):
                    print("Running calibration with batch {}".format(i))
                    input_data = self.onnx_trace_input(batch)
                    prepare_m(*input_data)
                    if i == calibration_num_batches - 1:
                        break
            trace = convert_jit(prepare_m, inplace=True)
        else:
            super().quantize()
            trace = self.trace(inputs)

        return trace
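For reference, the same prepare_jit -> calibrate -> convert_jit flow (also used by the similar method in Example #6 below) works on any traced module; a minimal self-contained sketch with a toy model (fbgemm backend assumed):

import torch
import torch.nn as nn
from torch.ao.quantization import get_default_qconfig
from torch.ao.quantization.quantize_jit import prepare_jit, convert_jit

model = nn.Sequential(nn.Linear(8, 8), nn.ReLU()).eval()
example = torch.randn(2, 8)
traced = torch.jit.trace(model, example)

prepared = prepare_jit(traced, {"": get_default_qconfig("fbgemm")}, inplace=False)
with torch.no_grad():
    for _ in range(4):  # calibration passes over representative data
        prepared(torch.randn(2, 8))
quantized = convert_jit(prepared, inplace=True)
out = quantized(example)  # runs through the quantized graph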
Example #5
    def test_sparse_qlinear_serdes(self):
        # Note: at the moment, fbgemm supports only statically quantized
        # sparse linear and qnnpack supports only dynamically quantized
        # sparse linear, hence the two branches: fbgemm tests the static
        # flow and qnnpack tests the dynamic flow. These should be unified
        # later and the tests fixed accordingly.
        model_class = SparseQuantizedModel
        fqn_to_check = "linear"
        if qengine_is_fbgemm():
            sparse_mapping = tq.get_default_static_sparse_quant_module_mappings()
            ref_mapping = tq.get_default_static_quant_module_mappings()
            qconfig_dict = {nn.Linear: tq.get_default_qconfig("fbgemm")}
        elif qengine_is_qnnpack():
            sparse_mapping = tq.get_default_dynamic_sparse_quant_module_mappings()
            ref_mapping = tq.get_default_dynamic_quant_module_mappings()
            qconfig_dict = {nn.Linear: tq.qconfig.default_dynamic_qconfig}
        else:
            return

        _sparse_layer_test_helper(
            model_class=model_class,
            sparse_mapping=sparse_mapping,
            ref_mapping=ref_mapping,
            qconfig_dict=qconfig_dict,
            fqn_to_check=fqn_to_check,
            test_class=self,
            test_scripting=True,
        )
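Both serdes tests reference SparseQuantizedModel, and this one also calls _sparse_layer_test_helper; both are defined alongside the tests in the original file. The helper is not reproduced here, but a plausible sketch of the model, consistent with how it is constructed and used, is:

import torch.nn as nn

class SparseQuantizedModel(nn.Module):
    # single quantizable linear layer whose weight gets sparsified
    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.linear = nn.Linear(in_channels, out_channels)

    def forward(self, x):
        return self.linear(x)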
Example #6
    def graph_mode_quantize(
        self,
        inputs,
        data_loader,
        calibration_num_batches=64,
        qconfig_dict=None,
        force_quantize=False,
    ):
        """Quantize the model during export with graph mode quantization."""
        if force_quantize:
            trace = self.trace(inputs)
            if not qconfig_dict:
                qconfig_dict = {"": get_default_qconfig("fbgemm")}
            prepare_m = prepare_jit(trace, qconfig_dict, inplace=False)
            prepare_m.eval()
            with torch.no_grad():
                for i, (_, batch) in enumerate(data_loader):
                    print("Running calibration with batch {}".format(i))
                    input_data = self.onnx_trace_input(batch)
                    prepare_m(*input_data)
                    if i == calibration_num_batches - 1:
                        break
            trace = convert_jit(prepare_m, inplace=True)
        else:
            super().quantize()
            trace = self.trace(inputs)

        return trace
    def test_fusion_before_s_prep(self):
        (
            mod,
            sparsifier,
            _,
        ) = _get_model_and_sparsifier_and_sparse_config(tq.get_default_qconfig("fbgemm"))
        tq.fuse_modules(mod, [["5", "6"]], inplace=True)

        # fusion breaks the module-based sparse config, but sparse prepare
        # still works if you supply the correct tensor FQNs
        sparse_config = [
            {
                "tensor_fqn": "5.0.weight",
                "sparsity_level": 0.7,
                "sparse_block_shape": (1, 4),
                "zeros_per_block": 4,
            },
            {"tensor_fqn": "0.weight"},
        ]

        sparsifier.prepare(mod, config=sparse_config)
        mod[5].qconfig = tq.get_default_qconfig("fbgemm")
        tq.prepare(mod, inplace=True)

        # check that correct modules had parametrizations added and
        # that none were lost during prepare
        self.assertTrue(hasattr(mod[0], "parametrizations"))
        self.assertTrue(hasattr(mod[5][0], "parametrizations"))

        # check that correct observers were inserted and that matching
        # occurred successfully
        self.assertTrue(hasattr(mod[5], "activation_post_process"))
        sparsifier.step()
        sparsity_level = _calculate_sparsity(mod[5][0].weight)
        mod(torch.randn(1, 4, 4, 4))
        tq.convert(mod, inplace=True)

        # check that final module is the expected quantized module and that the model runs
        self.assertTrue(isinstance(mod[5], torch.nn.intrinsic.quantized.LinearReLU))
        self.assertEqual(mod(torch.randn(1, 4, 4, 4)).shape, torch.Size([1, 4, 4, 4]))

        # check that module was actually sparsified
        cur_sparsity = _calculate_sparsity(mod[5]._weight_bias()[0])
        self.assertGreaterAlmostEqual(cur_sparsity, sparsity_level)
        self.assertGreaterAlmostEqual(
            sparsity_level, sparse_config[0]["sparsity_level"]
        )
        self.assertGreaterAlmostEqual(cur_sparsity, sparse_config[0]["sparsity_level"])
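The FQN change that test_fusion_before_s_prep works around comes from fusion wrapping the Linear inside an intrinsic LinearReLU, which pushes the weight one level deeper; a minimal illustration:

import torch.nn as nn
import torch.ao.quantization as tq

m = nn.Sequential(nn.Linear(4, 4), nn.ReLU()).eval()
tq.fuse_modules(m, [["0", "1"]], inplace=True)
print(type(m[0]))  # torch.nn.intrinsic.LinearReLU
print([name for name, _ in m.named_parameters()])  # ['0.0.weight', '0.0.bias']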
    def test_q_prep_fx_before_s_prep(self):
        r"""
        This test checks that the ordering prepare_fx -> sparse prepare -> convert_fx
        composes cleanly and that the final result is sparsified without
        having to call squash_mask between sparse prepare and convert_fx. It also tests
        the automatic fusion that occurs during prepare_fx.
        """
        (
            mod,
            sparsifier,
            _,
        ) = _get_model_and_sparsifier_and_sparse_config()

        example = torch.randn(1, 4, 4, 4)
        qconfig = tq.get_default_qconfig("fbgemm")
        qconfig_mapping = tq.QConfigMapping() \
            .set_module_name("4", qconfig) \
            .set_module_name("5", qconfig)

        mod = prepare_fx(mod, qconfig_mapping, (example,))

        # the automatic fusion in fx breaks the module-based config,
        # but it still works if you supply the correct tensor FQNs
        sparse_config = [
            {
                "tensor_fqn": "5.0.weight",
                "sparsity_level": 0.7,
                "sparse_block_shape": (1, 4),
                "zeros_per_block": 4,
            },
            {"tensor_fqn": "0.0.weight"},
        ]
        sparsifier.prepare(mod, config=sparse_config)

        # check that correct modules had parametrizations added and
        # that none were lost during prepare
        self.assertTrue(hasattr(fqn_to_module(mod, "0.0"), "parametrizations"))
        self.assertTrue(hasattr(fqn_to_module(mod, "5.0"), "parametrizations"))

        # check that correct observers were inserted and that matching
        # occurred successfully
        self.assertTrue(_module_has_activation_post_process(mod, "5"))
        sparsifier.step()
        sparsity_level = _calculate_sparsity(fqn_to_module(mod, "5.0.weight"))
        mod(example)
        mod = convert_fx(mod)

        # check that final module is the expected quantized module and that the model runs
        self.assertTrue(isinstance(fqn_to_module(mod, "5"), torch.nn.intrinsic.quantized.LinearReLU))
        self.assertEqual(mod(example).shape, torch.Size([1, 4, 4, 4]))

        # check that module was actually sparsified
        cur_sparsity = _calculate_sparsity(fqn_to_module(mod, "5")._weight_bias()[0])
        self.assertGreaterAlmostEqual(cur_sparsity, sparsity_level)
        self.assertGreaterAlmostEqual(
            sparsity_level, sparse_config[0]["sparsity_level"]
        )
        self.assertGreaterAlmostEqual(cur_sparsity, sparse_config[0]["sparsity_level"])
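fqn_to_module, used throughout the FX tests, resolves a dotted fully-qualified name against the module tree (in the original it is imported from the torch.ao.sparsity utilities); a minimal sketch:

def fqn_to_module(model, fqn):
    # walk the dotted path; returns a submodule/tensor, or None if any
    # component is missing
    for name in fqn.split("."):
        model = getattr(model, name, None)
        if model is None:
            return None
    return model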
    def test_s_prep_q_prep_fx_ref(self):
        r"""
        This checks that the ordering sparse prepare -> prepare_fx -> convert_to_reference_fx
        composes cleanly and that the final result is sparsified without
        having to call squash_mask before convert_to_reference_fx.
        """
        (
            mod,
            sparsifier,
            sparse_config,
        ) = _get_model_and_sparsifier_and_sparse_config()
        sparsifier.prepare(mod, config=sparse_config)

        example = torch.randn(1, 4, 4, 4)
        qconfig = tq.get_default_qconfig("fbgemm")
        qconfig_mapping = tq.QConfigMapping() \
            .set_module_name("4", qconfig) \
            .set_module_name("5", qconfig)
        mod = prepare_fx(mod, qconfig_mapping, (example,))

        # check that correct modules had parametrizations added and
        # that none were lost during prepare
        self.assertTrue(hasattr(fqn_to_module(mod, "0.0"), "parametrizations"))
        self.assertTrue(hasattr(fqn_to_module(mod, "5.0"), "parametrizations"))

        # check that correct observers were inserted and that matching
        # occurred successfully
        self.assertTrue(_module_has_activation_post_process(mod, "5"))
        sparsifier.step()
        sparsity_level = _calculate_sparsity(fqn_to_module(mod, "5.0.weight"))
        mod(example)
        mod = convert_to_reference_fx(mod)

        # check that final module is the expected quantized module and that the model runs
        self.assertTrue(isinstance(fqn_to_module(mod, "5"), torch.nn.intrinsic.LinearReLU))
        self.assertEqual(mod(example).shape, torch.Size([1, 4, 4, 4]))
        self.assertTrue(isinstance(fqn_to_module(mod, "5.0"), torch.nn.quantized._reference.Linear))

        # check that module was actually sparsified
        cur_sparsity = _calculate_sparsity(fqn_to_module(mod, "5.0.weight"))
        self.assertGreaterAlmostEqual(cur_sparsity, sparsity_level)
        self.assertGreaterAlmostEqual(
            sparsity_level, sparse_config[0]["sparsity_level"]
        )
        self.assertGreaterAlmostEqual(cur_sparsity, sparse_config[0]["sparsity_level"])
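_module_has_activation_post_process checks the FX graph for an observer attached to the output of the named module; a sketch consistent with how these tests use it (an assumption about the original implementation):

def _module_has_activation_post_process(model, fqn_of_module):
    # look for an activation_post_process node whose input is the
    # call to the module at fqn_of_module
    for node in model.graph.nodes:
        if "activation_post_process" in node.name:
            if node.args[0].target == fqn_of_module:
                return True
    return False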
Example #10
    def test_q_prep_before_s_prep(self):
        (
            mod,
            sparsifier,
            sparse_config,
        ) = _get_model_and_sparsifier_and_sparse_config(tq.get_default_qconfig("fbgemm"))

        tq.prepare(mod, inplace=True)
        sparsifier.prepare(mod, config=sparse_config)

        # check that correct modules had parametrizations added
        self.assertTrue(hasattr(mod[0], "parametrizations"))
        self.assertTrue(hasattr(mod[5], "parametrizations"))
        # check that correct observers were inserted
        self.assertTrue(hasattr(mod[5], "activation_post_process"))

        _squash_mask_calibrate_and_convert(
            mod, sparsifier, torch.randn(1, 4, 4, 4)
        )

        # check that final module is the expected quantized module and that the model runs
        self.assertTrue(isinstance(mod[5], torch.nn.quantized.Linear))
        self.assertEqual(mod(torch.randn(1, 4, 4, 4)).shape, torch.Size([1, 4, 4, 4]))
    def test_post_training_static_quantization(self, root_dir):
        """Validate post-training static quantization."""
        seed_everything(100)

        model = TestModule()
        num_epochs = 4
        static_quantization = PostTrainingQuantization(
            qconfig_dicts={"": {"": get_default_qconfig()}}
        )
        trainer = Trainer(
            default_root_dir=os.path.join(root_dir, "quantized"),
            enable_checkpointing=False,
            callbacks=[static_quantization],
            max_epochs=num_epochs,
            logger=False,
        )
        # This will both train the model + quantize it.
        trainer.fit(model)

        self.assertIsNotNone(static_quantization.quantized)
        # Default qconfig requires calibration.
        self.assertTrue(static_quantization.should_calibrate)

        test_in = torch.randn(12, 32)
        with mode(model, training=False) as m:
            base_out = m(test_in)
        with mode(static_quantization.quantized, training=False) as q:
            test_out = q(test_in)

        # While quantized/original won't be exact, they should be close.
        self.assertLess(
            ((((test_out - base_out)**2).sum(axis=1))**(1 / 2)).mean(),
            0.015,
            "RMSE should be less than 0.015 between quantized and original.",
        )
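The per-sample error asserted above can be written more directly with torch.linalg.vector_norm; an equivalent formulation of the same quantity:

# per-sample L2 error averaged over the batch (identical to the
# expression asserted above)
rmse = torch.linalg.vector_norm(test_out - base_out, dim=1).mean()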
Example #12
    def test_sparse_qlinear_serdes(self):
        batch_size = 12
        input_channels = 4
        output_channels = 7
        model = self.SparseQuantizedModel(input_channels, output_channels)

        # For sparse kernels, both the activation and weight zero points must be 0
        X_scale = 0.2
        X_zp = 0
        W_scale = 1e-2
        W_zp = 0

        with override_cpu_allocator_for_qnnpack(qengine_is_qnnpack()):
            X_fp32 = torch.randn(batch_size,
                                 input_channels,
                                 dtype=torch.float32)
            float_bias = torch.randn(output_channels, dtype=torch.float32)

            X_q = torch.quantize_per_tensor(X_fp32,
                                            scale=X_scale,
                                            zero_point=X_zp,
                                            dtype=torch.quint8)
            X_fp32 = X_q.dequantize()

            W_fp32 = torch.randn(output_channels,
                                 input_channels,
                                 dtype=torch.float32)
            mask = torch.randint(0, 2, W_fp32.shape)
            W_fp32 *= mask
            W_q = torch.quantize_per_tensor(W_fp32, W_scale, W_zp, torch.qint8)

            model.linear.weight = nn.Parameter(W_q.dequantize())
            model.linear.sparse_params = {'sparse_block_shape': (1, 4)}
            model.eval()

            # Note: at the moment, fbgemm supports only statically quantized
            # sparse linear and qnnpack supports only dynamically quantized
            # sparse linear, hence the two branches: fbgemm tests the static
            # flow and qnnpack tests the dynamic flow. These should be unified
            # later and the tests fixed accordingly.
            if qengine_is_fbgemm():
                model.qconfig = tq.get_default_qconfig('fbgemm')
                qmodel = copy.deepcopy(model)
                sqmodel = copy.deepcopy(model)

                tq.prepare(qmodel, inplace=True)
                tq.prepare(sqmodel, inplace=True)

                with torch.no_grad():
                    qmodel(X_fp32)
                    sqmodel(X_fp32)

                # Make sure the quantization parameters are computed the same way
                qparams = qmodel.linear.qconfig.weight().calculate_qparams()
                sqparams = sqmodel.linear.qconfig.weight().calculate_qparams()
                self.assertEqual(qparams, sqparams)

                # Make sure the sparse-kernel mapping does not affect the non-sparse modules
                sparse_mapping = tq.get_default_static_quant_module_mappings()
                sparse_mapping[nn.Linear] = ao_nn_sq.Linear
                tq.convert(sqmodel, inplace=True, mapping=sparse_mapping)
                tq.convert(qmodel, inplace=True)

                assert isinstance(sqmodel.linear,
                                  ao_nn_sq.Linear), "Convert failed"
                assert isinstance(qmodel.linear,
                                  nn.quantized.Linear), "Mapping failed"

                scripted_sqmodel = torch.jit.script(sqmodel)
                scripted_sqmodel.eval()
                buffer = io.BytesIO()
                torch.jit.save(scripted_sqmodel, buffer)
                buffer.seek(0)
                sqmodel = torch.jit.load(buffer)

                # Make sure numerics are right
                Y_ref = qmodel(X_q)
                Y_hat = sqmodel(X_q)
                self.assertEqual(Y_ref.dequantize(), Y_hat.dequantize())

            elif qengine_is_qnnpack():
                qconfig = {nn.Linear: tq.qconfig.default_dynamic_qconfig}
                dqmodel = copy.deepcopy(model)
                sdqmodel = copy.deepcopy(model)

                tq.propagate_qconfig_(dqmodel, qconfig)
                tq.propagate_qconfig_(sdqmodel, qconfig)

                # Make sure the quantization parameters are computed the same way
                qparams = dqmodel.linear.qconfig.weight().calculate_qparams()
                sqparams = sdqmodel.linear.qconfig.weight().calculate_qparams()
                self.assertEqual(qparams, sqparams)

                # Make sure the sparse-kernel mapping does not affect the non-sparse modules
                sparse_mapping = copy.deepcopy(
                    tq.get_default_dynamic_quant_module_mappings())
                sparse_mapping[nn.Linear] = ao_nn_sq.dynamic.Linear
                with LinearBlockSparsePattern(1, 4):
                    tq.convert(sdqmodel, inplace=True, mapping=sparse_mapping)
                tq.convert(
                    dqmodel,
                    mapping=tq.get_default_dynamic_quant_module_mappings(),
                    inplace=True)

                assert isinstance(sdqmodel.linear,
                                  ao_nn_sq.dynamic.Linear), "Convert failed"
                assert isinstance(
                    dqmodel.linear,
                    nn.quantized.dynamic.Linear), "Mapping failed"

                scripted_sdqmodel = torch.jit.script(sdqmodel)
                scripted_sdqmodel.eval()
                buffer = io.BytesIO()
                torch.jit.save(scripted_sdqmodel, buffer)
                buffer.seek(0)
                sdqmodel = torch.jit.load(buffer)

                # Make sure numerics are right
                Y_ref = dqmodel(X_fp32)
                Y_hat = sdqmodel(X_fp32)
                self.assertEqual(Y_ref, Y_hat)
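Example #12 additionally assumes these imports (names inferred from usage; the sparse quantized modules live under torch.ao.nn.sparse.quantized):

import copy
import io
import torch
import torch.nn as nn
import torch.ao.quantization as tq
import torch.ao.nn.sparse.quantized as ao_nn_sq
from torch.ao.nn.sparse.quantized.utils import LinearBlockSparsePattern
from torch.testing._internal.common_quantized import (
    override_cpu_allocator_for_qnnpack,
    qengine_is_fbgemm,
    qengine_is_qnnpack,
)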