def transcribeAudio(self, utterances: list[Utterance], batch_size: Optional[int] = 1, decoder: Optional[Decoder] = None) -> list[dict]:
    """Transcribe in-memory audio utterances.

    Parameters
    ----------
    utterances: list[Utterance]
        List of audio utterances to transcribe. Each utterance provides
        its raw waveform (``.audio``), start time (``.timestamp``) and
        length (``.duration``).
    batch_size: Optional[int] = 1
        Batch size to use for inference.
    decoder: Optional[Decoder] = None
        Decoder to use for transcription. If you don't specify this,
        the engine will use the GreedyDecoder.

    Returns
    -------
    list[dict]:
        One dictionary per utterance whose decoded transcription is
        non-empty (empty results are skipped):
        [{
            "transcription": str,
            "utterance_start": str,     # seconds, formatted "%.2f"
            "utterance_duration": str,  # seconds, formatted "%.2f"
            "tokens": ...               # per-token entries from convertToCTM
        }, ...]
        plus whatever additional keys the decoder itself produced.

    Raises
    ------
    ValueError
        If the model has not been fine-tuned yet.
    """
    if not self.is_finetuned:
        raise ValueError("Not fine-tuned model! Please, fine-tune the model first.")

    if decoder is None:
        decoder = GreedyDecoder(self.token_set)

    sampling_rate = self.processor.feature_extractor.sampling_rate
    result = []

    for utts_batch in tqdm(list(get_chunks(utterances, batch_size))):
        waveforms = [utt.audio for utt in utts_batch]
        inputs = self.processor(waveforms, sampling_rate=sampling_rate,
                                return_tensors="pt", padding=True, do_normalize=True)

        # Inference only: no gradients needed.
        with torch.no_grad():
            logits = self.model(
                inputs.input_values.to(self.device),
                attention_mask=inputs.attention_mask.to(self.device),
            ).logits

        batch_results = decoder(logits)
        for index, br in enumerate(batch_results):
            # Skip utterances that decoded to an empty transcription.
            if not br['transcription'].strip():
                continue
            br['utterance_start'] = '%.2f' % utts_batch[index].timestamp
            br['utterance_duration'] = '%.2f' % utts_batch[index].duration
            br['tokens'] = self.convertToCTM(br)
            result.append(br)

    return result
def transcribeFiles(self, paths: list[str], batch_size: Optional[int] = 1, decoder: Optional[Decoder] = None) -> list[dict]:
    """Transcribe audio files.

    Parameters
    ----------
    paths: list[str]
        Paths of the audio files to transcribe.
    batch_size: Optional[int] = 1
        Number of files fed to the model per forward pass.
    decoder: Optional[Decoder] = None
        Decoder used to turn model logits into text; when omitted, a
        GreedyDecoder is created from the engine's token set.

    Returns
    -------
    list[dict]:
        A list of dictionaries containing the transcription for each
        audio file:
        [{
            "transcription": str,
            "start_timesteps": list[int],
            "end_timesteps": list[int],
            "probabilities": list[float]
        }, ...]

    Raises
    ------
    ValueError
        If the model has not been fine-tuned yet.
    """
    if not self.is_finetuned:
        raise ValueError(
            "Not fine-tuned model! Please, fine-tune the model first.")

    active_decoder = GreedyDecoder(self.token_set) if decoder is None else decoder
    sampling_rate = self.processor.feature_extractor.sampling_rate

    transcriptions: list[dict] = []
    for chunk in tqdm(list(get_chunks(paths, batch_size))):
        audio_batch = get_waveforms(chunk, sampling_rate)
        features = self.processor(audio_batch, sampling_rate=sampling_rate,
                                  return_tensors="pt", padding=True,
                                  do_normalize=True)

        input_values = features.input_values.to(self.device)
        attention_mask = features.attention_mask.to(self.device)

        # Inference only: no gradients needed.
        with torch.no_grad():
            logits = self.model(input_values, attention_mask=attention_mask).logits

        transcriptions.extend(active_decoder(logits))

    return transcriptions