Example #1
    def transcribeAudio(self, utterances: list[Utterance], batch_size: Optional[int] = 1, decoder: Optional[Decoder] = None) -> list[dict]:
        """ 
        Transcribe audio files.

        Parameters:
        ----------
            utterances: list[Utterance]
                List of audio utterances to transcribe

            batch_size: Optional[int] = 1
                Batch size to use for inference

            decoder: Optional[Decoder] = None
                Decoder to use for transcription. If you don't specify this, the engine will use the GreedyDecoder.

        Returns:
        ----------
            list[dict]:
                A list of dictionaries containing the transcription for each utterance:

                [{
                    "transcription": str,
                    "start_timesteps": list[int],
                    "end_timesteps": list[int],
                    "probabilities": list[float]
                }, ...]

                Utterances with an empty transcription are skipped. Each dictionary
                also carries "utterance_start" and "utterance_duration" (seconds,
                formatted to two decimals) and "tokens", the output of convertToCTM.
        """

        if not self.is_finetuned:
            raise ValueError("Not fine-tuned model! Please, fine-tune the model first.")

        if decoder is None:
            decoder = GreedyDecoder(self.token_set)

        sampling_rate = self.processor.feature_extractor.sampling_rate
        result = []

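        # Process the utterances in chunks of batch_size, with a tqdm progress bar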
        for utts_batch in tqdm(list(get_chunks(utterances, batch_size))):

            # Collect the raw waveform for each utterance in this batch
            waveforms = [utt.audio for utt in utts_batch]

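            # Pad and normalize the batch of waveforms into model-ready tensors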
            inputs = self.processor(waveforms,
                                    sampling_rate=sampling_rate,
                                    return_tensors="pt",
                                    padding=True,
                                    do_normalize=True)

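            # Run the acoustic model without gradient tracking to get per-frame logits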
            with torch.no_grad():
                logits = self.model(inputs.input_values.to(self.device),
                                    attention_mask=inputs.attention_mask.to(self.device)).logits

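            # Decode the logits, then attach per-utterance timing metadata and CTM tokens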
            batchResults = decoder(logits)
            for index, br in enumerate(batchResults):
                if len(br['transcription'].strip()) == 0:
                    continue
                br['utterance_start'] = '%.2f' % utts_batch[index].timestamp
                br['utterance_duration'] = '%.2f' % utts_batch[index].duration
                br['tokens'] = self.convertToCTM(br)
                result.append(br)

        return result

    def transcribeFiles(self,
                        paths: list[str],
                        batch_size: Optional[int] = 1,
                        decoder: Optional[Decoder] = None) -> list[dict]:
        """ 
        Transcribe audio files.

        Parameters:
        ----------
            paths: list[str]
                List of paths to audio files to transcribe

            batch_size: Optional[int] = 1
                Batch size to use for inference

            decoder: Optional[Decoder] = None
                Decoder to use for transcription. If you don't specify this, the engine will use the GreedyDecoder.

        Returns:
        ----------
            list[dict]:
                A list of dictionaries containing the transcription for each audio file:

                [{
                    "transcription": str,
                    "start_timesteps": list[int],
                    "end_timesteps": list[int],
                    "probabilities": list[float]
                }, ...]
        """

        if not self.is_finetuned:
            raise ValueError(
                "The model is not fine-tuned. Please fine-tune the model first.")

        if decoder is None:
            decoder = GreedyDecoder(self.token_set)

        sampling_rate = self.processor.feature_extractor.sampling_rate
        result = []

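        # Process the audio files in chunks of batch_size, with a tqdm progress bar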
        for paths_batch in tqdm(list(get_chunks(paths, batch_size))):

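            # Load each audio file as a waveform at the expected sampling rate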
            waveforms = get_waveforms(paths_batch, sampling_rate)

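            # Pad and normalize the batch of waveforms into model-ready tensors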
            inputs = self.processor(waveforms,
                                    sampling_rate=sampling_rate,
                                    return_tensors="pt",
                                    padding=True,
                                    do_normalize=True)

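            # Run the acoustic model without gradient tracking to get per-frame logits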
            with torch.no_grad():
                logits = self.model(inputs.input_values.to(self.device),
                                    attention_mask=inputs.attention_mask.to(self.device)).logits

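            # Decode the logits and append the batch transcriptions to the overall result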
            result += decoder(logits)

        return result
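
A minimal usage sketch. The engine class name (SpeechEngine), its constructor arguments, and the way the utterances are obtained are assumptions for illustration only; only transcribeFiles, transcribeAudio, and the result keys come from the code above.

# Hypothetical setup: SpeechEngine and its constructor are illustrative placeholders.
engine = SpeechEngine(model_path="my-finetuned-wav2vec2", device="cuda")

# Transcribe audio files straight from disk.
file_results = engine.transcribeFiles(["audio/call_01.wav", "audio/call_02.wav"], batch_size=4)
for r in file_results:
    print(r["transcription"])

# Transcribe pre-segmented utterances (assumed to be prepared upstream, e.g. by a VAD step);
# each result also carries "utterance_start", "utterance_duration" and "tokens".
utt_results = engine.transcribeAudio(utterances, batch_size=4)
for r in utt_results:
    print(r["utterance_start"], r["utterance_duration"], r["transcription"])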