    def test_special_characters_in_vocab(self):
        sent = "ʈʰ æ æ̃ ˧ kʰ"

        vocab_dict = {phoneme: idx for idx, phoneme in enumerate(set(sent.split()))}
        vocab_file = os.path.join(self.tmpdirname, "vocab_special.json")

        with open(vocab_file, "w") as f:
            json.dump(vocab_dict, f)

        tokenizer = Wav2Vec2CTCTokenizer(vocab_file)

        expected_sent = tokenizer.decode(tokenizer(sent).input_ids,
                                         spaces_between_special_tokens=True)
        self.assertEqual(sent, expected_sent)

        tokenizer.save_pretrained(
            os.path.join(self.tmpdirname, "special_tokenizer"))
        tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(
            os.path.join(self.tmpdirname, "special_tokenizer"))

        expected_sent = tokenizer.decode(tokenizer(sent).input_ids,
                                         spaces_between_special_tokens=True)
        self.assertEqual(sent, expected_sent)
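        # Note: without spaces_between_special_tokens=True, decode() would join
        # the phoneme tokens with no separator (yielding "ʈʰææ̃˧kʰ"), since each
        # vocab entry here is a whole token rather than a single character.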
def load_tokenizer():
    tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(recognizer_dir,
                                                     unk_token='[UNK]',
                                                     pad_token='[PAD]',
                                                     word_delimiter_token='|',
                                                     cache_dir=cache_dir)
    return tokenizer
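# For reference, from_pretrained() reads a vocab.json from recognizer_dir; a
# minimal sketch of its layout (the character entries are illustrative, but
# the special tokens must match those passed above):
#
#   {"[PAD]": 0, "[UNK]": 1, "|": 2, "a": 3, "b": 4, "c": 5}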
Example #3
    def create_processor(self, model_args: ModelArguments) -> Wav2Vec2Processor:
        feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(
            model_args.model_name_or_path, cache_dir=model_args.cache_dir
        )
        if self.vocab_file:
            tokenizer = Wav2Vec2CTCTokenizer(
                self.vocab_file,
                cache_dir=model_args.cache_dir,
                do_lower_case=self.do_lower_case,
                word_delimiter_token=self.word_delimiter_token,
            )
        else:
            tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(
                model_args.model_name_or_path,
                cache_dir=model_args.cache_dir,
                do_lower_case=self.do_lower_case,
                word_delimiter_token=self.word_delimiter_token,
            )
        return Wav2Vec2Processor(feature_extractor, tokenizer)

    def get_tokenizer(self, **kwargs):
        kwargs.update(self.special_tokens_map)
        return Wav2Vec2CTCTokenizer.from_pretrained(self.tmpdirname, **kwargs)
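    # A hedged usage sketch for create_processor() above; model_args, the
    # 16 kHz audio_array, and transcript are assumptions, not names from this
    # example:
    #
    #   processor = self.create_processor(model_args)
    #   inputs = processor(audio_array, sampling_rate=16_000, return_tensors="pt")
    #   labels = processor(text=transcript).input_ids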
Example #5
# In[15]:


import json
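# vocab_dict is not defined anywhere in this excerpt; a minimal sketch of how
# it is typically built from the training transcripts. The `texts` iterable
# and the special-token layout are assumptions, not the notebook's own code.
if from_start:
    chars = sorted({ch for text in texts for ch in text})
    vocab_dict = {ch: i for i, ch in enumerate(chars)}
    if " " in vocab_dict:                      # use "|" as the word delimiter
        vocab_dict["|"] = vocab_dict.pop(" ")
    vocab_dict["[UNK]"] = len(vocab_dict)
    vocab_dict["[PAD]"] = len(vocab_dict)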
if from_start:
    with open('vocab.json', 'w') as vocab_file:
        json.dump(vocab_dict, vocab_file)


# In[16]:


from transformers import Wav2Vec2CTCTokenizer

if from_start:
    tokenizer = Wav2Vec2CTCTokenizer.from_pretrained("./", unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|")


# In[17]:


repo_name = "wav2vec2-large-xls-r-300m-irish-colab"


# In[16]:


from transformers import Wav2Vec2CTCTokenizer

if not from_start:
    tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(repo_name, unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|")
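# A hedged sketch of the step implied between these two cells: after building
# the tokenizer from scratch, it is pushed to the Hub under repo_name so the
# `not from_start` branch above can reload it with from_pretrained(repo_name).
# Requires a Hugging Face login token.
if from_start:
    tokenizer.push_to_hub(repo_name)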
Example #6
    def call_huggingface(self, df):
        assert self.model_url != '', "Error! A model URL is needed for HuggingFace scoring, but --asr_download_model is empty"
        if self.tokenizer_url == '':
            print(
                f"Setting empty --tokenizer_url field identically to --asr_download_model: {self.model_url}"
            )
            self.tokenizer_url = self.model_url

        if self.scoring_sorting == 'ascending':
            df = df.sort_values(by=['n_frames']).reset_index(drop=True)
        elif self.scoring_sorting == 'descending':
            df = df.sort_values(by=['n_frames'],
                                ascending=False).reset_index(drop=True)
        elif self.scoring_sorting == '':
            pass
        else:
            raise NotImplementedError

        print(f"Preparing dataloader for manifest {self.manifest}...")
        dataset = AudioDataset(df)
        dataloader = DataLoader(dataset,
                                batch_size=self.batch_size,
                                collate_fn=dataset.collater,
                                num_workers=self.num_workers,
                                pin_memory=True)

        if self.hf_username == 'facebook':
            print(f"Downloading tokenizer: {self.tokenizer_url}")
            tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(
                self.tokenizer_url)

            print(f"Downloading model: {self.model_url}")
            model = Wav2Vec2ForCTC.from_pretrained(self.model_url)
        elif self.hf_username == 'speechbrain':
            if torch.cuda.is_available():
                run_opts = {"device": "cuda"}
            else:
                run_opts = {"device": "cpu"}
            print(f"Downloading model: {self.model_url}")
            model = EncoderDecoderASR.from_hparams(source=self.model_url,
                                                   run_opts=run_opts,
                                                   savedir=os.path.join(
                                                       'pretrained_models',
                                                       self.hf_modelname))
        else:
            raise NotImplementedError

        model.eval()

        print("Scoring dataset...")
        df['wer'] = np.nan

        for batch in tqdm(dataloader):
            indexes, waveforms, transcripts, wav_lens = batch

            if self.hf_username == 'facebook':
                output_logits = model(waveforms.squeeze()).logits
                predicted_ids = torch.argmax(output_logits, dim=-1)
                pred_transcripts = tokenizer.batch_decode(predicted_ids)
            elif self.hf_username == 'speechbrain':
                waveforms = waveforms.squeeze()
                #waveforms = model.audio_normalizer(waveforms, self.sampling_rate)
                pred_transcripts = model.transcribe_batch(waveforms,
                                                          wav_lens)[0]

            for index, ref in enumerate(transcripts):
                sample_id = indexes[index]
                pred = pred_transcripts[index]
                measures = jiwer.compute_measures(ref, pred)
                wer = measures['wer'] * 100.0
                assert (
                    ref == df.loc[int(sample_id), 'tgt_text']
                ), "The reference text indicated by the sample ID in the transcripts file does not match with the one stored in the dataset!"
                df.at[int(sample_id), 'wer'] = wer

        return df
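    # For reference, the loop above assumes AudioDataset.collater yields
    # batches of (indexes, waveforms, transcripts, wav_lens), where indexes
    # are row ids into df, waveforms is a padded float tensor, transcripts
    # are the reference strings, and wav_lens are the relative lengths that
    # speechbrain's transcribe_batch() expects. (Descriptive note; the exact
    # shapes are assumptions.)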
Example #7
    wer = wer_metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}


# dir_path="/media/nas/samir-data/wav2vec2_models/inputs"
dir_path = "/media/nas/samir-data/asr_transformers"

test_data_folder = "/media/nas/CORPUS_FINAL/Corpus_audio/Corpus_FR/COMMONVOICE/common-voice-fr/clips"
test_annotation_file = "/media/nas/CORPUS_FINAL/Corpus_audio/Corpus_FR/COMMONVOICE/common-voice-fr/test1.tsv"
test_set = AudioDataset(test_annotation_file, test_data_folder, MAX_LEN)
test_generator = torch.utils.data.DataLoader(test_set, batch_size=1)

processor = Wav2Vec2Processor.from_pretrained(
    "/media/nas/samir-data/wav2vec2_models/checkpoint-94000")
tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(
    "/media/nas/samir-data/wav2vec2_models/checkpoint-94000")

model = Wav2Vec2ForCTC.from_pretrained(
    "facebook/wav2vec2-large-xlsr-53-french")
model.eval()

wer_metric = load_metric("wer")

# initialize the prediction
predictions = []
references = []
for audio_file in test_generator:
    # for block in sf.blocks(audio_file["input_values"], blocksize=50000):
    input_dict = processor(np.squeeze(audio_file["input_values"], 0),
                           return_attention_mask=False,
                           return_tensors="pt")
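    # A minimal sketch of how such an evaluation loop typically finishes; the
    # original is truncated above, so the forward pass, the decoding, and the
    # "target_text" key are assumptions rather than the author's code.
    with torch.no_grad():
        logits = model(input_dict.input_values).logits
    pred_ids = torch.argmax(logits, dim=-1)
    predictions.append(processor.batch_decode(pred_ids)[0])
    references.append(audio_file["target_text"][0])

print(wer_metric.compute(predictions=predictions, references=references))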
Example #8
    def __setstate__(self, state):
        self.__dict__.update(state)
        self._tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(self.backbone)
Example #9
    @property
    def tokenizer(self):
        if self.backbone is not None and self.backbone != self._backbone:
            self._tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(
                self.backbone)
            self._backbone = self.backbone
        return self._tokenizer

    def __init__(self, backbone: str):
        super().__init__()

        self.backbone = backbone
        self._tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(self.backbone)
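    # A hedged usage sketch of the lazy-reload pattern above; the class name
    # SpeechBackbone is an assumption. Note that as excerpted, __init__ never
    # sets self._backbone, which the tokenizer property compares against, so a
    # real class would also need self._backbone = backbone in __init__:
    #
    #   obj = SpeechBackbone("facebook/wav2vec2-base-960h")
    #   obj.backbone = "facebook/wav2vec2-large-960h"
    #   tok = obj.tokenizer  # reloads because the backbone changed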