def check_ctc_training(self, config, input_values, *args):
    config.ctc_zero_infinity = True
    model = HubertForCTC(config=config)
    model.to(torch_device)
    model.train()

    # freeze feature encoder
    model.freeze_feature_encoder()

    input_values = input_values[:3]

    input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]]
    max_length_labels = model._get_feat_extract_output_lengths(torch.tensor(input_lengths))
    labels = ids_tensor((input_values.shape[0], max(max_length_labels) - 2), model.config.vocab_size)

    # pad input
    for i in range(len(input_lengths)):
        input_values[i, input_lengths[i]:] = 0.0

        if max_length_labels[i] < labels.shape[-1]:
            # it's important that we make sure that target lengths are at least
            # one shorter than logit lengths to prevent -inf
            labels[i, max_length_labels[i] - 1:] = -100

    loss = model(input_values, labels=labels).loss
    self.parent.assertFalse(torch.isinf(loss).item())

    loss.backward()
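# A minimal sketch (not part of the test suite) of what ctc_zero_infinity guards
# against: when a target sequence is longer than the logit sequence, no valid CTC
# alignment exists and the loss is infinite; zero_infinity=True replaces it (and
# its gradient) with 0 so training does not diverge. All names below are local
# to this illustration.
import torch

T, N, C = 10, 2, 5  # time steps, batch size, vocab size (blank = 0)
log_probs = torch.randn(T, N, C).log_softmax(-1).requires_grad_()
targets = torch.randint(1, C, (N, T + 2))  # deliberately longer than T
input_lengths = torch.full((N,), T, dtype=torch.long)
target_lengths = torch.full((N,), T + 2, dtype=torch.long)

ctc = torch.nn.CTCLoss(blank=0, zero_infinity=True)
loss = ctc(log_probs, targets, input_lengths, target_lengths)
assert not torch.isinf(loss)  # would be inf with zero_infinity=False
loss.backward()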
def test_inference_ctc_batched(self):
    model = HubertForCTC.from_pretrained("facebook/hubert-large-ls960-ft", torch_dtype=torch.float16).to(
        torch_device
    )
    processor = Wav2Vec2Processor.from_pretrained("facebook/hubert-large-ls960-ft", do_lower_case=True)

    input_speech = self._load_datasamples(2)

    inputs = processor(input_speech, return_tensors="pt", padding=True)

    input_values = inputs.input_values.half().to(torch_device)
    attention_mask = inputs.attention_mask.to(torch_device)

    with torch.no_grad():
        logits = model(input_values, attention_mask=attention_mask).logits

    predicted_ids = torch.argmax(logits, dim=-1)
    predicted_trans = processor.batch_decode(predicted_ids)

    EXPECTED_TRANSCRIPTIONS = [
        "a man said to the universe sir i exist",
        "sweat covered brion's body trickling into the tight loin cloth that was the only garment he wore",
    ]
    self.assertListEqual(predicted_trans, EXPECTED_TRANSCRIPTIONS)
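# For context, a rough sketch of the greedy CTC decoding that
# processor.batch_decode performs on the argmax ids: collapse runs of repeated
# ids, then drop the blank/pad token. Mapping the surviving ids back to
# characters is the tokenizer's job and is omitted; blank_id=0 matches the pad
# token of the released fine-tuned checkpoints (an assumption here).
import itertools

def greedy_ctc_collapse(ids, blank_id=0):
    # collapse consecutive duplicates, then remove blanks
    return [i for i, _ in itertools.groupby(ids) if i != blank_id]

# e.g. [5, 5, 0, 5, 3, 3] -> [5, 5, 3]: repeats merge, blanks separate true repeats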
def check_ctc_loss(self, config, input_values, *args):
    model = HubertForCTC(config=config)
    model.to(torch_device)

    # make sure that dropout is disabled
    model.eval()

    input_values = input_values[:3]
    attention_mask = torch.ones(input_values.shape, device=torch_device, dtype=torch.long)

    input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]]
    max_length_labels = model._get_feat_extract_output_lengths(torch.tensor(input_lengths))
    labels = ids_tensor((input_values.shape[0], min(max_length_labels) - 1), model.config.vocab_size)

    # pad input
    for i in range(len(input_lengths)):
        input_values[i, input_lengths[i]:] = 0.0
        attention_mask[i, input_lengths[i]:] = 0

    model.config.ctc_loss_reduction = "sum"
    sum_loss = model(input_values, attention_mask=attention_mask, labels=labels).loss.item()

    model.config.ctc_loss_reduction = "mean"
    mean_loss = model(input_values, attention_mask=attention_mask, labels=labels).loss.item()

    self.parent.assertTrue(isinstance(sum_loss, float))
    self.parent.assertTrue(isinstance(mean_loss, float))
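# A small illustration of the two reductions being tested (assumption:
# HubertForCTC forwards ctc_loss_reduction straight to
# torch.nn.functional.ctc_loss, as in the transformers source). Note that
# PyTorch's "mean" divides each sample's loss by its target length before
# averaging, so mean_loss is generally not sum_loss / batch_size.
import torch
import torch.nn.functional as F

T, N, C = 20, 3, 10
log_probs = torch.randn(T, N, C).log_softmax(-1)
targets = torch.randint(1, C, (N, 8))
input_lengths = torch.full((N,), T, dtype=torch.long)
target_lengths = torch.full((N,), 8, dtype=torch.long)

sum_loss = F.ctc_loss(log_probs, targets, input_lengths, target_lengths, reduction="sum")
mean_loss = F.ctc_loss(log_probs, targets, input_lengths, target_lengths, reduction="mean")
per_sample = F.ctc_loss(log_probs, targets, input_lengths, target_lengths, reduction="none")
assert torch.isclose(sum_loss, per_sample.sum())
assert torch.isclose(mean_loss, (per_sample / target_lengths).mean())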
def check_labels_out_of_vocab(self, config, input_values, *args):
    model = HubertForCTC(config)
    model.to(torch_device)
    model.train()

    input_values = input_values[:3]

    input_lengths = [input_values.shape[-1] // i for i in [4, 2, 1]]
    max_length_labels = model._get_feat_extract_output_lengths(torch.tensor(input_lengths))
    labels = ids_tensor((input_values.shape[0], max(max_length_labels) - 2), model.config.vocab_size + 100)

    with pytest.raises(ValueError):
        model(input_values, labels=labels)
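# For reference, the guard this test exercises looks roughly like the following
# inside HubertForCTC.forward in transformers (paraphrased, not copied verbatim):
#
#     if labels.max() >= self.config.vocab_size:
#         raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}")
#
# ids_tensor with vocab_size + 100 guarantees at least some labels trip this check.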
# only load the relevant feature sets for featurization to save memory
if 'allosaurus_features' in feature_sets:
    import allosaurus_features
if 'audioset_features' in feature_sets:
    import audioset_features
if 'audiotext_features' in feature_sets:
    import audiotext_features
if 'hubert_features' in feature_sets:
    import hubert_features
    import torch
    from transformers import HubertModel, HubertConfig
    from transformers import Wav2Vec2Processor, HubertForCTC
    import soundfile as sf
    hubert_processor_ = Wav2Vec2Processor.from_pretrained("facebook/hubert-large-ls960-ft")
    hubert_model_ = HubertForCTC.from_pretrained("facebook/hubert-large-ls960-ft")
else:
    hubert_model_ = ''
    hubert_processor_ = ''
if 'librosa_features' in feature_sets:
    import librosa_features
if 'loudness_features' in feature_sets:
    import loudness_features
if 'meta_features' in feature_sets:
    import meta_features
    os.system('pip3 install scikit-learn==0.19.1')
if 'mixed_features' in feature_sets:
    import mixed_features
if 'multispeaker_features' in feature_sets:
    import multispeaker_features
if 'myprosody_features' in feature_sets:
def convert_hubert_checkpoint(
    checkpoint_path, pytorch_dump_folder_path, config_path=None, dict_path=None, is_finetuned=True
):
    """
    Copy/paste/tweak model's weights to transformers design.
    """
    if config_path is not None:
        config = HubertConfig.from_pretrained(config_path)
    else:
        config = HubertConfig()

    if is_finetuned:
        if dict_path:
            target_dict = Dictionary.load(dict_path)

            # important: change bos & pad token id since CTC symbol is <pad> and
            # not <s> as in fairseq
            config.bos_token_id = target_dict.pad_index
            config.pad_token_id = target_dict.bos_index
            config.eos_token_id = target_dict.eos_index
            config.vocab_size = len(target_dict.symbols)
            vocab_path = os.path.join(pytorch_dump_folder_path, "vocab.json")
            if not os.path.isdir(pytorch_dump_folder_path):
                logger.error("--pytorch_dump_folder_path ({}) should be a directory".format(pytorch_dump_folder_path))
                return
            os.makedirs(pytorch_dump_folder_path, exist_ok=True)
            with open(vocab_path, "w", encoding="utf-8") as vocab_handle:
                json.dump(target_dict.indices, vocab_handle)
            tokenizer = Wav2Vec2CTCTokenizer(
                vocab_path,
                unk_token=target_dict.unk_word,
                pad_token=target_dict.pad_word,
                bos_token=target_dict.bos_word,
                eos_token=target_dict.eos_word,
                word_delimiter_token="|",
                do_lower_case=False,
            )
            return_attention_mask = True if config.feat_extract_norm == "layer" else False
            feature_extractor = Wav2Vec2FeatureExtractor(
                feature_size=1,
                sampling_rate=16000,
                padding_value=0,
                do_normalize=True,
                return_attention_mask=return_attention_mask,
            )
            processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)
            processor.save_pretrained(pytorch_dump_folder_path)

        hf_wav2vec = HubertForCTC(config)
    else:
        hf_wav2vec = HubertModel(config)

    if is_finetuned:
        model, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task(
            [checkpoint_path], arg_overrides={"data": "/".join(dict_path.split("/")[:-1])}
        )
    else:
        model, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task([checkpoint_path])

    model = model[0].eval()

    recursively_load_weights(model, hf_wav2vec, is_finetuned)

    hf_wav2vec.save_pretrained(pytorch_dump_folder_path)
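# A plausible command-line entry point for the converter above, a sketch
# following the argparse pattern of the transformers conversion scripts; the
# flag names mirror the function's parameters and are assumptions here, not the
# script's confirmed interface.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--checkpoint_path", required=True, type=str, help="Path to the fairseq checkpoint.")
    parser.add_argument("--pytorch_dump_folder_path", required=True, type=str, help="Output directory for the converted model.")
    parser.add_argument("--config_path", default=None, type=str, help="Optional HubertConfig to convert against.")
    parser.add_argument("--dict_path", default=None, type=str, help="fairseq Dictionary, needed for fine-tuned checkpoints.")
    parser.add_argument("--not_finetuned", action="store_true", help="Convert a pretraining (non-CTC) checkpoint.")
    args = parser.parse_args()

    convert_hubert_checkpoint(
        args.checkpoint_path, args.pytorch_dump_folder_path, args.config_path, args.dict_path, not args.not_finetuned
    )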
sampling_rate = 16000
channels = 1
batch_size = 1

my_dataset = LPAudioSet(
    os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', 'data', 'audio'),
    sr=sampling_rate,
    channels=channels,
)
train_loader = torch.utils.data.DataLoader(
    my_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=1,
    drop_last=True,
    collate_fn=LPAudioSet.collate_fn,
)
for idx, audio in enumerate(train_loader):
    print(idx, audio.shape)
# sys.exit(0)

processor = Wav2Vec2Processor.from_pretrained(
    "facebook/hubert-xlarge-ls960-ft", cache_dir=os.getenv("cache_dir", "../../models")
)
model = HubertForCTC.from_pretrained(
    "facebook/hubert-xlarge-ls960-ft", cache_dir=os.getenv("cache_dir", "../../models")
)

for idx, audio in enumerate(train_loader):
    input_values = processor(audio, sampling_rate=sampling_rate, return_tensors="pt").input_values  # Batch size 1
    logits = model(input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.decode(predicted_ids[0])
    print(transcription)
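# The checkpoint expects 16 kHz mono input. If a source file is at another
# rate, resample before featurizing; a hedged sketch using torchaudio
# (torchaudio is an extra dependency here, and "example.wav" is illustrative):
import torchaudio

waveform, file_sr = torchaudio.load("example.wav")  # (channels, samples)
waveform = waveform.mean(dim=0)  # downmix to mono
if file_sr != sampling_rate:
    waveform = torchaudio.functional.resample(waveform, orig_freq=file_sr, new_freq=sampling_rate)
input_values = processor(waveform, sampling_rate=sampling_rate, return_tensors="pt").input_values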