def test_inference_pretraining(self): model = UniSpeechForPreTraining.from_pretrained("microsoft/unispeech-large-1500h-cv") model.to(torch_device) feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-large-xlsr-53") input_speech = self._load_datasamples(2) inputs_dict = feature_extractor(input_speech, return_tensors="pt", padding=True) with torch.no_grad(): torch.manual_seed(0) outputs = model( inputs_dict.input_values.to(torch_device), attention_mask=inputs_dict.attention_mask.to(torch_device), ) # compute cosine similarity cosine_sim = torch.cosine_similarity(outputs.projected_states, outputs.projected_quantized_states, dim=-1) # pretrained model should have learned a high cosine similarity self.assertTrue(cosine_sim.mean() > 0.5) # fmt: off expected_cosine_sim_slice = torch.tensor( [[0.8290, 0.8335, 0.8815, 0.8580, 0.8249], [0.8892, 0.9221, 0.8711, 0.8601, 0.8482]], device=torch_device, ) # fmt: on self.assertTrue(torch.allclose(cosine_sim[:, :5], expected_cosine_sim_slice, atol=1e-3))
def convert_unispeech_checkpoint(checkpoint_path, pytorch_dump_folder_path, config_path=None): """ Copy/paste/tweak model's weights to transformers design. """ if config_path is not None: config = UniSpeechConfig.from_pretrained(config_path) else: config = UniSpeechConfig() hf_unispeech = UniSpeechForPreTraining(config) model, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task([checkpoint_path]) model = model[0].eval() recursively_load_weights(model, hf_unispeech) hf_unispeech.save_pretrained(pytorch_dump_folder_path)
def convert_unispeech_checkpoint(checkpoint_path, pytorch_dump_folder_path, config_path=None, dict_path=None, is_finetuned=True): """ Copy/paste/tweak model's weights to transformers design. """ if config_path is not None: config = UniSpeechConfig.from_pretrained(config_path) else: config = UniSpeechConfig() if is_finetuned: if dict_path: target_dict = Dictionary.load_from_json(dict_path) # important change bos & pad token id since CTC symbol is <pad> and # not <s> as in fairseq config.bos_token_id = target_dict.pad_index config.pad_token_id = target_dict.bos_index config.eos_token_id = target_dict.eos_index config.vocab_size = len(target_dict.symbols) vocab_path = os.path.join(pytorch_dump_folder_path, "vocab.json") if not os.path.isdir(pytorch_dump_folder_path): logger.error( "--pytorch_dump_folder_path ({}) should be a directory". format(pytorch_dump_folder_path)) return os.makedirs(pytorch_dump_folder_path, exist_ok=True) vocab_dict = target_dict.indices # fairseq has the <pad> and <s> switched vocab_dict["<pad>"] = 42 vocab_dict["<s>"] = 43 with open(vocab_path, "w", encoding="utf-8") as vocab_handle: json.dump(vocab_dict, vocab_handle) tokenizer = Wav2Vec2PhonemeCTCTokenizer( vocab_path, unk_token=target_dict.unk_word, pad_token=target_dict.pad_word, bos_token=target_dict.bos_word, eos_token=target_dict.eos_word, word_delimiter_token="|", do_lower_case=False, ) return_attention_mask = True if config.feat_extract_norm == "layer" else False feature_extractor = Wav2Vec2FeatureExtractor( feature_size=1, sampling_rate=16000, padding_value=0, do_normalize=True, return_attention_mask=return_attention_mask, ) processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer) processor.save_pretrained(pytorch_dump_folder_path) hf_unispeech = UniSpeechForCTC(config) else: hf_unispeech = UniSpeechForPreTraining(config) if is_finetuned: model, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task( [checkpoint_path], arg_overrides={ "data": "/".join(dict_path.split("/")[:-1]), "w2v_path": checkpoint_path }) else: model, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task( [checkpoint_path]) model = model[0].eval() recursively_load_weights(model, hf_unispeech, is_finetuned) hf_unispeech.save_pretrained(pytorch_dump_folder_path)