def test_inference_ctc_batched(self):
    model = HubertForCTC.from_pretrained(
        "facebook/hubert-large-ls960-ft", torch_dtype=torch.float16
    ).to(torch_device)
    processor = Wav2Vec2Processor.from_pretrained(
        "facebook/hubert-large-ls960-ft", do_lower_case=True
    )

    input_speech = self._load_datasamples(2)

    inputs = processor(input_speech, return_tensors="pt", padding=True)

    input_values = inputs.input_values.half().to(torch_device)
    attention_mask = inputs.attention_mask.to(torch_device)

    with torch.no_grad():
        logits = model(input_values, attention_mask=attention_mask).logits

    predicted_ids = torch.argmax(logits, dim=-1)
    predicted_trans = processor.batch_decode(predicted_ids)

    EXPECTED_TRANSCRIPTIONS = [
        "a man said to the universe sir i exist",
        "sweat covered brion's body trickling into the tight loin cloth that was the only garment he wore",
    ]
    self.assertListEqual(predicted_trans, EXPECTED_TRANSCRIPTIONS)
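# A minimal standalone sketch of what the test's `_load_datasamples` helper does,
# assuming the `datasets` library and the "hf-internal-testing/librispeech_asr_dummy"
# dataset; the name `load_datasamples` is illustrative, not the test suite's actual
# implementation.
from datasets import load_dataset

def load_datasamples(num_samples):
    ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
    # sort by id so the clips line up with the expected transcriptions above
    speech_samples = ds.sort("id").select(range(num_samples))
    return [sample["array"] for sample in speech_samples["audio"]]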
# only load the relevant featuresets for featurization to save memory
if 'allosaurus_features' in feature_sets:
    import allosaurus_features
if 'audioset_features' in feature_sets:
    import audioset_features
if 'audiotext_features' in feature_sets:
    import audiotext_features
if 'hubert_features' in feature_sets:
    import hubert_features
    import torch
    from transformers import HubertModel, HubertConfig
    from transformers import Wav2Vec2Processor, HubertForCTC
    import soundfile as sf
    hubert_processor_ = Wav2Vec2Processor.from_pretrained("facebook/hubert-large-ls960-ft")
    hubert_model_ = HubertForCTC.from_pretrained("facebook/hubert-large-ls960-ft")
else:
    # placeholders so downstream code can check whether the model was loaded
    hubert_model_ = ''
    hubert_processor_ = ''
if 'librosa_features' in feature_sets:
    import librosa_features
if 'loudness_features' in feature_sets:
    import loudness_features
if 'meta_features' in feature_sets:
    import meta_features
    os.system('pip3 install scikit-learn==0.19.1')
if 'mixed_features' in feature_sets:
    import mixed_features
if 'multispeaker_features' in feature_sets:
    import multispeaker_features
if 'myprosody_features' in feature_sets:
    import myprosody_features
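# A minimal sketch of how `hubert_processor_` and `hubert_model_` might be applied
# to a single file downstream, assuming 16 kHz mono audio; the actual hubert_features
# module may extract features differently, so this is illustrative only.
def hubert_transcribe(wav_path, processor, model):
    speech, sr = sf.read(wav_path)  # soundfile returns (samples, sample_rate)
    input_values = processor(speech, sampling_rate=sr, return_tensors="pt").input_values
    with torch.no_grad():  # inference only, no gradients needed
        logits = model(input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    return processor.decode(predicted_ids[0])

# e.g. hubert_transcribe('sample.wav', hubert_processor_, hubert_model_)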
sampling_rate = 16000
channels = 1
batch_size = 1

my_dataset = LPAudioSet(
    os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', 'data', 'audio'),
    sr=sampling_rate, channels=channels)
train_loader = torch.utils.data.DataLoader(
    my_dataset, batch_size=batch_size, shuffle=True, num_workers=1,
    drop_last=True, collate_fn=LPAudioSet.collate_fn)
for idx, audio in enumerate(train_loader):
    print(idx, audio.shape)
#sys.exit(0)

processor = Wav2Vec2Processor.from_pretrained(
    "facebook/hubert-xlarge-ls960-ft", cache_dir=os.getenv("cache_dir", "../../models"))
model = HubertForCTC.from_pretrained(
    "facebook/hubert-xlarge-ls960-ft", cache_dir=os.getenv("cache_dir", "../../models"))

for idx, audio in enumerate(train_loader):
    input_values = processor(audio, sampling_rate=sampling_rate,
                             return_tensors="pt").input_values  # Batch size 1
    with torch.no_grad():  # inference only, no gradients needed
        logits = model(input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.decode(predicted_ids[0])
    print(transcription)
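# A sketch of the same loop generalized to batch_size > 1, assuming LPAudioSet.collate_fn
# yields a (batch, samples) float tensor; it mirrors the padded, attention-masked batched
# inference shown in the test at the top of this file.
for idx, audio in enumerate(train_loader):
    inputs = processor([clip.numpy() for clip in audio], sampling_rate=sampling_rate,
                       return_tensors="pt", padding=True)
    with torch.no_grad():
        logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    for transcription in processor.batch_decode(predicted_ids):
        print(transcription)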