def convert_wav2vec2_checkpoint(checkpoint_path, pytorch_dump_folder_path, dict_path=None):
    """
    Port a fairseq wav2vec2 checkpoint into a `Wav2Vec2ForMaskedLM` and save it.

    Args:
        checkpoint_path: Checkpoint handed to fairseq's
            `load_model_ensemble_and_task` (wrapped in a one-element list).
        pytorch_dump_folder_path: Destination passed to `save_pretrained`.
        dict_path: Value forwarded to fairseq as the `"data"` argument
            override. Defaults to `None`.
    """
    # The target model is built from a default-constructed configuration.
    target_config = Wav2Vec2Config()
    hf_wav2vec = Wav2Vec2ForMaskedLM(target_config)

    # Only the ensemble is used; the two other return values are discarded.
    overrides = {"data": dict_path}
    ensemble, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task(
        [checkpoint_path],
        arg_overrides=overrides,
    )
    # Take the first model of the ensemble and switch it to eval mode.
    fairseq_model = ensemble[0].eval()

    recursively_load_weights(fairseq_model, hf_wav2vec)

    hf_wav2vec.save_pretrained(pytorch_dump_folder_path)
def test_inference_masked_lm_normal(self):
    """
    Transcribe one data sample with the "facebook/wav2vec2-base-960h"
    checkpoint and compare the decoded text with the expected transcription.
    """
    checkpoint = "facebook/wav2vec2-base-960h"

    model = Wav2Vec2ForMaskedLM.from_pretrained(checkpoint)
    model.to(torch_device)
    tokenizer = Wav2Vec2Tokenizer.from_pretrained(checkpoint, do_lower_case=True)

    samples = self._load_datasamples(1)
    encoded = tokenizer(samples, return_tensors="pt")
    input_values = encoded.input_values.to(torch_device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        logits = model(input_values).logits

    # Pick the highest-scoring id along the last dimension, then decode.
    predicted_ids = logits.argmax(dim=-1)
    transcriptions = tokenizer.batch_decode(predicted_ids)

    expected = ["a man said to the universe sir i exist"]
    self.assertListEqual(transcriptions, expected)
def test_inference_masked_lm_normal_batched(self):
    """
    Transcribe a padded batch of two data samples with the
    "facebook/wav2vec2-base-960h" checkpoint and compare the decoded texts
    with the expected transcriptions.
    """
    checkpoint = "facebook/wav2vec2-base-960h"

    model = Wav2Vec2ForMaskedLM.from_pretrained(checkpoint)
    model.to(torch_device)
    tokenizer = Wav2Vec2Tokenizer.from_pretrained(checkpoint, do_lower_case=True)

    samples = self._load_datasamples(2)
    # Padding/truncation are enabled so both samples fit in one tensor.
    encoded = tokenizer(samples, return_tensors="pt", padding=True, truncation=True)
    input_values = encoded.input_values.to(torch_device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        logits = model(input_values).logits

    # Pick the highest-scoring id along the last dimension, then decode.
    predicted_ids = logits.argmax(dim=-1)
    transcriptions = tokenizer.batch_decode(predicted_ids)

    expected = [
        "a man said to the universe sir i exist",
        "sweat covered brion's body trickling into the tight lowing cloth that was the only garment he wore",
    ]
    self.assertListEqual(transcriptions, expected)
def test_inference_masked_lm_robust_batched(self):
    """
    Transcribe a padded batch of four data samples with the
    "facebook/wav2vec2-large-960h-lv60-self" checkpoint and compare the
    decoded texts with the expected transcriptions.
    """
    checkpoint = "facebook/wav2vec2-large-960h-lv60-self"

    model = Wav2Vec2ForMaskedLM.from_pretrained(checkpoint)
    model = model.to(torch_device)
    tokenizer = Wav2Vec2Tokenizer.from_pretrained(checkpoint, do_lower_case=True)

    samples = self._load_datasamples(4)
    # Padding/truncation are enabled so all samples fit in one tensor.
    encoded = tokenizer(samples, return_tensors="pt", padding=True, truncation=True)
    input_values = encoded.input_values.to(torch_device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        logits = model(input_values).logits

    # Pick the highest-scoring id along the last dimension, then decode.
    predicted_ids = logits.argmax(dim=-1)
    transcriptions = tokenizer.batch_decode(predicted_ids)

    expected = [
        "a man said to the universe sir i exist",
        "sweat covered brion's body trickling into the tight loin cloth that was the only garment he wore",
        "the cut on his chest still dripping blood the ache of his overstrained eyes even the soaring arena around him with the thousands of spectators were trivialities not worth thinking about",
        "his instant panic was followed by a small sharp blow high on his chest",
    ]
    self.assertListEqual(transcriptions, expected)
# Feature-extraction modules are imported only when their name is listed
# in feature_sets.
if "speechmetrics_features" in feature_sets:
    import speechmetrics_features
if "standard_features" in feature_sets:
    import standard_features
if "surfboard_features" in feature_sets:
    import surfboard_features

# Transcription back ends are imported only when listed in
# default_audio_transcribers.
if "azure" in default_audio_transcribers:
    import azure.cognitiveservices.speech as speechsdk

if "wav2vec" not in default_audio_transcribers:
    # Empty-string placeholders keep both names defined when wav2vec is off.
    tokenizer = ""
    wav_model = ""
else:
    import os
    import pandas as pd
    import soundfile as sf
    import torch
    import glob
    from pathlib import Path
    from transformers import Wav2Vec2ForMaskedLM, Wav2Vec2Tokenizer

    tokenizer = Wav2Vec2Tokenizer.from_pretrained("facebook/wav2vec2-base-960h")
    wav_model = Wav2Vec2ForMaskedLM.from_pretrained("facebook/wav2vec2-base-960h")

# The Hubert transcriber objects are created here only when 'hubert' is
# requested and 'hubert_features' is not in feature_sets.
if "hubert" not in default_audio_transcribers or "hubert_features" in feature_sets:
    # NOTE(review): only the processor gets a placeholder in this branch;
    # hubert_model_ is left unassigned here -- confirm that is intended.
    hubert_processor_ = ""
else:
    import torch
    from transformers import HubertModel, HubertConfig
    from transformers import Wav2Vec2Processor, HubertForCTC
    import soundfile as sf

    # Processor and CTC model used for Hubert transcription.
    hubert_processor_ = Wav2Vec2Processor.from_pretrained("facebook/hubert-large-ls960-ft")
    hubert_model_ = HubertForCTC.from_pretrained("facebook/hubert-large-ls960-ft")