def quantize_generator(self, quantize=False, embed_quantize=EmbedQuantizeType.NONE):
    """Dynamically quantize the generator's sub-models.

    Nothing happens unless ``quantize`` is true.  Otherwise ``self.model``
    and ``self.length_prediction_model`` are each replaced by the result of
    ``tq.quantize_dynamic`` (``dtype=torch.qint8``, ``inplace=False``).

    Args:
        quantize: Whether to quantize at all.
        embed_quantize: Embedding quantization requested for ``self.model``.
            ``EmbedQuantizeType.NONE`` leaves ``torch.nn.Embedding`` out of
            the qconfig; ``EmbedQuantizeType.BIT_8`` maps it to
            ``float_qparams_weight_only_qconfig``.

    Raises:
        NotImplementedError: If ``embed_quantize`` is ``BIT_4`` or any value
            other than ``NONE`` / ``BIT_8``.
    """
    if not quantize:
        return

    linear_qconfig = tq.per_channel_dynamic_qconfig
    model_qconfig = {torch.nn.Linear: linear_qconfig}

    # Embedding quantization only affects the qconfig used for ``self.model``.
    if embed_quantize == EmbedQuantizeType.BIT_8:
        model_qconfig[torch.nn.Embedding] = float_qparams_weight_only_qconfig
    elif embed_quantize == EmbedQuantizeType.BIT_4:
        raise NotImplementedError(
            "4bit embedding quantization not yet supported")
    elif embed_quantize != EmbedQuantizeType.NONE:
        raise NotImplementedError(
            "Embedding Quantization should be either 8bit or 4bit")

    self.model = tq.quantize_dynamic(
        self.model,
        model_qconfig,
        dtype=torch.qint8,
        inplace=False,
    )
    # The length predictor only ever gets its Linear modules quantized.
    self.length_prediction_model = tq.quantize_dynamic(
        self.length_prediction_model,
        {torch.nn.Linear: linear_qconfig},
        dtype=torch.qint8,
        inplace=False,
    )
def quantize(self):
    """Quantize the model during export.

    Applies dynamic quantization to ``self`` in place (``inplace=True``)
    with ``dtype=torch.qint8``; nothing is returned.
    """
    # By default the Linear and LSTM modules are dynamically quantized.
    # Override this method if your model wants other modules quantized.
    # TODO: quantized torch.nn.GRU support could be added in the future.
    quantizable_modules = {torch.nn.Linear, torch.nn.LSTM}
    tq.quantize_dynamic(
        self,
        qconfig_spec=quantizable_modules,
        dtype=torch.qint8,
        inplace=True,
    )
def test_compare_model_outputs_linear_dynamic(self):
    r"""Compare the output of the linear layer in a dynamic quantized model
    with the corresponding output of the linear layer in the float model.
    """
    qengine = torch.backends.quantized.engine

    def compare_and_validate_results(float_model, q_model, data):
        # Run both models on ``data`` and collect the matched activations.
        act_compare_dict = compare_model_outputs(float_model, q_model, data)
        # Exactly one entry, keyed "fc1.stats", is expected.
        self.assertTrue(act_compare_dict.keys() == {"fc1.stats"})
        for stats in act_compare_dict.values():
            float_outs = stats["float"]
            quant_outs = stats["quantized"]
            self.assertTrue(len(float_outs) == len(quant_outs))
            # Each recorded float output must match its quantized twin in shape.
            for float_out, quant_out in zip(float_outs, quant_outs):
                self.assertTrue(float_out.shape == quant_out.shape)

    linear_data = self.calib_data[0][0]
    for model in [SingleLayerLinearDynamicModel(qengine)]:
        model.eval()
        if hasattr(model, "fuse_model"):
            model.fuse_model()
        q_model = quantize_dynamic(model)
        compare_and_validate_results(model, q_model, linear_data)
def test_compare_model_stub_lstm_dynamic(self):
    r"""Compare the output of a dynamic quantized LSTM layer with the output
    of its float shadow module.
    """
    qengine = torch.backends.quantized.engine

    def compare_and_validate_results(
        float_model, q_model, swap_list, lstm_in, lstm_state
    ):
        ob_dict = compare_model_stub(
            float_model, q_model, swap_list, lstm_in, lstm_state
        )
        # A single entry is expected in the comparison result.
        self.assertEqual(len(ob_dict), 1)
        for stats in ob_dict.values():
            float_outs = stats["float"]
            quant_outs = stats["quantized"]
            self.assertTrue(len(float_outs) == len(quant_outs))
            for float_out, quant_out in zip(float_outs, quant_outs):
                self.assertTrue(float_out.shape == quant_out.shape)

    # Inputs are drawn before the model is built, so the order in which the
    # random generator is consumed stays the same.
    lstm_input = torch.rand((1, 1, 2))
    lstm_hidden = (torch.rand(1, 1, 2), torch.rand(1, 1, 2))
    models = [LSTMwithHiddenDynamicModel(qengine)]
    module_swap_list = [nn.Linear, nn.LSTM]
    for model in models:
        model.eval()
        if hasattr(model, "fuse_model"):
            model.fuse_model()
        q_model = quantize_dynamic(model)
        compare_and_validate_results(
            model, q_model, module_swap_list, lstm_input, lstm_hidden
        )
def test_compare_model_stub_linear_dynamic(self):
    r"""Compare the output of a dynamic quantized linear layer with the
    output of its float shadow module.
    """
    qengine = torch.backends.quantized.engine

    def compare_and_validate_results(float_model, q_model, swap_list, data):
        ob_dict = compare_model_stub(float_model, q_model, swap_list, data)
        # A single entry is expected in the comparison result.
        self.assertEqual(len(ob_dict), 1)
        for stats in ob_dict.values():
            float_outs = stats["float"]
            quant_outs = stats["quantized"]
            self.assertTrue(len(float_outs) == len(quant_outs))
            for float_out, quant_out in zip(float_outs, quant_outs):
                self.assertTrue(float_out.shape == quant_out.shape)

    linear_data = self.calib_data[0][0]
    models = [SingleLayerLinearDynamicModel(qengine)]
    module_swap_list = [nn.Linear, nn.LSTM]
    for model in models:
        model.eval()
        if hasattr(model, "fuse_model"):
            model.fuse_model()
        q_model = quantize_dynamic(model)
        compare_and_validate_results(
            model, q_model, module_swap_list, linear_data
        )
def _main():
    """Build the loader/encoder/decoder pipeline and export it as TorchScript.

    Reads its settings from ``_parse_args()``.  The encoder is optionally
    dynamically quantized (``args.quantize``), optionally exercised on
    ``args.test_file``, and optionally passed through ``optimize_for_mobile``
    (``args.optimize_for_mobile``) before being saved.  ``loader.zip``,
    ``decoder.zip`` and ``encoder.zip`` are written under ``args.output_path``.
    """
    args = _parse_args()
    _init_logging(args.debug)

    loader = Loader()
    model = _get_model(args.model_file, args.dict_dir).eval()
    encoder = Encoder(model)
    decoder = _get_decoder()
    _LG.info(encoder)

    if args.quantize:
        _LG.info('Quantizing the model')
        model.encoder.transformer.pos_conv_embed.__prepare_scriptable__()
        encoder = tq.quantize_dynamic(
            encoder, qconfig_spec={torch.nn.Linear}, dtype=torch.qint8)
        _LG.info(encoder)

    # Optional smoke test of the eager pipeline on a single file.
    if args.test_file:
        _LG.info('Testing with %s', args.test_file)
        waveform = loader(args.test_file)
        emission = encoder(waveform)
        transcript = decoder(emission)
        _LG.info(transcript)

    def _save(scripted_module, filename):
        # Every artifact lands in the same output directory.
        scripted_module.save(os.path.join(args.output_path, filename))

    _save(torch.jit.script(loader), 'loader.zip')
    _save(torch.jit.script(decoder), 'decoder.zip')

    scripted_encoder = torch.jit.script(encoder)
    if args.optimize_for_mobile:
        scripted_encoder = optimize_for_mobile(scripted_encoder)
    _save(scripted_encoder, 'encoder.zip')
def _test_quantize_torchscript(self, model):
    """Check that a dynamically quantized ``model`` survives TorchScript.

    The Linear modules of ``model`` are dynamically quantized, the quantized
    module is run on random input, and the scripted version of it is required
    to produce equal outputs and lengths.
    """
    model.eval()

    # Remove the weight normalization forward hook
    model.encoder.transformer.pos_conv_embed.__prepare_scriptable__()
    quantized = tq.quantize_dynamic(
        model, qconfig_spec={torch.nn.Linear}, dtype=torch.qint8)

    # A lazy way to check that Modules are different
    assert str(quantized) != str(model), "Dynamic quantization did not modify the module."

    batch_size = 3
    num_frames = 1024
    torch.manual_seed(0)
    waveforms = torch.randn(batch_size, num_frames)
    lengths = torch.randint(low=0, high=num_frames, size=[batch_size])

    # Reference result from the eager quantized module.
    expected_out, expected_len = quantized(waveforms, lengths)

    # Script it and run the same input through the scripted module.
    scripted = torch_script(quantized)
    actual_out, actual_len = scripted(waveforms, lengths)

    self.assertEqual(actual_out, expected_out)
    self.assertEqual(actual_len, expected_len)
def _main():
    """Build the loader/encoder/decoder pipeline and export it as TorchScript.

    Reads its settings from ``_parse_args()``.  The encoder is optionally
    dynamically quantized (``args.quantize``) and optionally exercised on
    ``args.test_file``.  ``loader.zip``, ``encoder.zip`` and ``decoder.zip``
    are then written under ``args.output_path``.
    """
    args = _parse_args()
    _init_logging(args.debug)

    _LG.info('Loading model: %s', args.model)
    model, labels = _get_model(args.model)
    _LG.info('Labels: %s', labels)

    _LG.info('Building pipeline')
    loader = Loader()
    encoder = Encoder(model)
    decoder = _get_decoder(labels)
    _LG.info(encoder)

    if args.quantize:
        _LG.info('Quantizing the model')
        model.encoder.transformer.pos_conv_embed.__prepare_scriptable__()
        encoder = tq.quantize_dynamic(
            encoder, qconfig_spec={torch.nn.Linear}, dtype=torch.qint8)
        _LG.info(encoder)

    # Optional smoke test of the eager pipeline on a single file.
    if args.test_file:
        _LG.info('Testing with %s', args.test_file)
        waveform = loader(args.test_file)
        emission = encoder(waveform)
        transcript = decoder(emission)
        _LG.info(transcript)

    # Script and save each stage, in the order loader -> encoder -> decoder.
    artifacts = (
        (loader, 'loader.zip'),
        (encoder, 'encoder.zip'),
        (decoder, 'decoder.zip'),
    )
    for module, filename in artifacts:
        torch.jit.script(module).save(
            os.path.join(args.output_path, filename))
def __init__(
    self,
    model_list,
    tgt_dict_eos,
    beam_size: int = 2,
    quantize: bool = False,
    record_attention: bool = False,
):
    """Build scripted encoder and decoder-step ensembles from ``model_list``.

    Args:
        model_list: Models handed to ``EncoderEnsemble`` and
            ``DecoderBatchedStepEnsemble``; also stored as ``self.models``.
        tgt_dict_eos: Stored as ``self.target_dict_eos``.
        beam_size: Beam size passed to both ensembles.
        quantize: When true, the Linear modules of both ensembles are
            dynamically quantized (``torch.qint8``) before scripting.
        record_attention: Forwarded to ``DecoderBatchedStepEnsemble``.
    """
    super().__init__()
    self.models = model_list
    self.target_dict_eos = tgt_dict_eos
    self.beam_size = beam_size
    self.record_attention = record_attention

    def _maybe_quantize(module):
        # Returns ``module`` untouched unless quantization was requested.
        if not quantize:
            return module
        return tq.quantize_dynamic(
            module,
            {torch.nn.Linear},  # Add after bug fix torch.nn.LSTM
            dtype=torch.qint8,
            inplace=False,
        )

    # Script the encoder model
    encoder_ens = _maybe_quantize(EncoderEnsemble(self.models, self.beam_size))
    self.encoder_ens = torch.jit.script(encoder_ens)

    # Script the decoder step
    decoder_ens = _maybe_quantize(
        DecoderBatchedStepEnsemble(
            self.models, beam_size, record_attention=record_attention
        )
    )
    self.decoder_ens = torch.jit.script(decoder_ens)
def test_compare_weights_lstm_dynamic(self):
    r"""Compare the weights of the float and the dynamic quantized LSTM layer."""
    qengine = torch.backends.quantized.engine

    def compare_and_validate_results(float_model, q_model):
        float_state = float_model.state_dict()
        quant_state = q_model.state_dict()
        weight_dict = compare_weights(float_state, quant_state)
        # A single entry is expected in the comparison result.
        self.assertEqual(len(weight_dict), 1)
        for weights in weight_dict.values():
            float_w = weights["float"]
            quant_w = weights["quantized"]
            self.assertTrue(len(float_w) == len(quant_w))
            for float_item, quant_item in zip(float_w, quant_w):
                self.assertTrue(float_item.shape == quant_item.shape)

    for model in [LSTMwithHiddenDynamicModel(qengine)]:
        model.eval()
        if hasattr(model, "fuse_model"):
            model.fuse_model()
        q_model = quantize_dynamic(model)
        compare_and_validate_results(model, q_model)
def test_compare_model_outputs_lstm_dynamic(self):
    r"""Compare the output of the LSTM layer in a dynamic quantized model
    with the corresponding output of the LSTM layer in the float model.
    """
    qengine = torch.backends.quantized.engine

    def compare_and_validate_results(float_model, q_model, lstm_in, lstm_state):
        act_compare_dict = compare_model_outputs(
            float_model, q_model, lstm_in, lstm_state
        )
        # Exactly one entry, keyed "lstm.stats", is expected.
        self.assertTrue(act_compare_dict.keys() == {"lstm.stats"})
        for stats in act_compare_dict.values():
            float_outs = stats["float"]
            quant_outs = stats["quantized"]
            self.assertTrue(len(float_outs) == len(quant_outs))
            for idx, (float_out, quant_out) in enumerate(
                zip(float_outs, quant_outs)
            ):
                self.assertTrue(len(float_out) == len(quant_out))
                # Element 0 is checked for every recorded output ...
                self.assertTrue(float_out[0].shape == quant_out[0].shape)
                # ... element 1 only for outputs after the first one.
                if idx != 0:
                    self.assertTrue(float_out[1].shape == quant_out[1].shape)

    # Inputs are drawn before the model is built, so the order in which the
    # random generator is consumed stays the same.
    lstm_input = torch.rand((1, 1, 2))
    lstm_hidden = (torch.rand(1, 1, 2), torch.rand(1, 1, 2))
    for model in [LSTMwithHiddenDynamicModel(qengine)]:
        model.eval()
        if hasattr(model, "fuse_model"):
            model.fuse_model()
        q_model = quantize_dynamic(model)
        compare_and_validate_results(model, q_model, lstm_input, lstm_hidden)