Beispiel #1
0
def convert_hubert_checkpoint(pytorch_dump_folder_path, config_path=None):
    """
    Copy/paste/tweak model's weights to transformers design.
    """
    model = distilhubert().model.model

    if config_path is not None:
        config = HubertConfig.from_pretrained(config_path)
    else:
        config = convert_config(model)
    model = model.eval()

    feature_extractor = Wav2Vec2FeatureExtractor(
        feature_size=1,
        sampling_rate=16000,
        padding_value=0,
        do_normalize=False,
        return_attention_mask=False,
    )
    hf_model = HubertModel(config)

    recursively_load_weights(model, hf_model)

    feature_extractor.save_pretrained(pytorch_dump_folder_path)
    hf_model.save_pretrained(pytorch_dump_folder_path)
    def create_and_check_batch_inference(self, config, input_values, *args):
        # test does not pass for models making use of `group_norm`
        # check: https://github.com/pytorch/fairseq/issues/3227
        model = HubertModel(config=config)
        model.to(torch_device)
        model.eval()

        input_values = input_values[:3]
        attention_mask = torch.ones(input_values.shape,
                                    device=torch_device,
                                    dtype=torch.bool)

        input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]]

        # pad input
        for i in range(len(input_lengths)):
            input_values[i, input_lengths[i]:] = 0.0
            attention_mask[i, input_lengths[i]:] = 0.0

        batch_outputs = model(input_values,
                              attention_mask=attention_mask).last_hidden_state

        for i in range(input_values.shape[0]):
            input_slice = input_values[i:i + 1, :input_lengths[i]]
            output = model(input_slice).last_hidden_state

            batch_output = batch_outputs[i:i + 1, :output.shape[1]]
            self.parent.assertTrue(
                torch.allclose(output, batch_output, atol=1e-3))
 def create_and_check_model(self, config, input_values, attention_mask):
     model = HubertModel(config=config)
     model.to(torch_device)
     model.eval()
     result = model(input_values, attention_mask=attention_mask)
     self.parent.assertEqual(
         result.last_hidden_state.shape,
         (self.batch_size, self.output_seq_length, self.hidden_size))
    def test_inference_distilhubert(self):
        model = HubertModel.from_pretrained("ntu-spml/distilhubert").to(
            torch_device)
        processor = Wav2Vec2FeatureExtractor.from_pretrained(
            "ntu-spml/distilhubert")

        # TODO: can't test on batched inputs due to incompatible padding https://github.com/pytorch/fairseq/pull/3572
        input_speech = self._load_datasamples(1)

        inputs = processor(input_speech, return_tensors="pt", padding=True)

        input_values = inputs.input_values.to(torch_device)

        with torch.no_grad():
            outputs = model(input_values).last_hidden_state

        # expected outputs taken from the original SEW implementation
        expected_outputs_first = torch.tensor(
            [[
                [-0.3505, 0.1167, 0.0608, 0.1294],
                [-0.3085, 0.0481, 0.1106, 0.0955],
                [-0.3107, -0.0391, 0.0739, 0.1360],
                [-0.2385, -0.1795, -0.0928, 0.2389],
            ]],
            device=torch_device,
        )
        expected_outputs_last = torch.tensor(
            [[
                [-0.0732, 0.0255, 0.0529, -0.1372],
                [-0.0812, 0.1259, 0.0564, -0.0438],
                [-0.0054, 0.0758, -0.0002, -0.1617],
                [0.0133, -0.0320, -0.0687, 0.0062],
            ]],
            device=torch_device,
        )
        expected_output_sum = -3776.0730

        self.assertTrue(
            torch.allclose(outputs[:, :4, :4],
                           expected_outputs_first,
                           atol=5e-3))
        self.assertTrue(
            torch.allclose(outputs[:, -4:, -4:],
                           expected_outputs_last,
                           atol=5e-3))
        self.assertTrue(abs(outputs.sum() - expected_output_sum) < 0.1)
 def test_model_from_pretrained(self):
     model = HubertModel.from_pretrained("facebook/hubert-large-ls960-ft")
     self.assertIsNotNone(model)
def convert_hubert_checkpoint(checkpoint_path,
                              pytorch_dump_folder_path,
                              config_path=None,
                              dict_path=None,
                              is_finetuned=True):
    """
    Copy/paste/tweak model's weights to transformers design.
    """
    if config_path is not None:
        config = HubertConfig.from_pretrained(config_path)
    else:
        config = HubertConfig()

    if is_finetuned:
        if dict_path:
            target_dict = Dictionary.load(dict_path)

            # important change bos & pad token id since CTC symbol is <pad> and
            # not <s> as in fairseq
            config.bos_token_id = target_dict.pad_index
            config.pad_token_id = target_dict.bos_index
            config.eos_token_id = target_dict.eos_index
            config.vocab_size = len(target_dict.symbols)
            vocab_path = os.path.join(pytorch_dump_folder_path, "vocab.json")
            if not os.path.isdir(pytorch_dump_folder_path):
                logger.error(
                    "--pytorch_dump_folder_path ({}) should be a directory".
                    format(pytorch_dump_folder_path))
                return
            os.makedirs(pytorch_dump_folder_path, exist_ok=True)
            with open(vocab_path, "w", encoding="utf-8") as vocab_handle:
                json.dump(target_dict.indices, vocab_handle)
            tokenizer = Wav2Vec2CTCTokenizer(
                vocab_path,
                unk_token=target_dict.unk_word,
                pad_token=target_dict.pad_word,
                bos_token=target_dict.bos_word,
                eos_token=target_dict.eos_word,
                word_delimiter_token="|",
                do_lower_case=False,
            )
            return_attention_mask = True if config.feat_extract_norm == "layer" else False
            feature_extractor = Wav2Vec2FeatureExtractor(
                feature_size=1,
                sampling_rate=16000,
                padding_value=0,
                do_normalize=True,
                return_attention_mask=return_attention_mask,
            )
            processor = Wav2Vec2Processor(feature_extractor=feature_extractor,
                                          tokenizer=tokenizer)
            processor.save_pretrained(pytorch_dump_folder_path)

        hf_wav2vec = HubertForCTC(config)
    else:
        hf_wav2vec = HubertModel(config)

    if is_finetuned:
        model, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task(
            [checkpoint_path],
            arg_overrides={"data": "/".join(dict_path.split("/")[:-1])})
    else:
        model, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task(
            [checkpoint_path])

    model = model[0].eval()

    recursively_load_weights(model, hf_wav2vec, is_finetuned)

    hf_wav2vec.save_pretrained(pytorch_dump_folder_path)