class FullTranscriber():
    """Transcribe audio with a full language model (no transcript required)."""

    def __init__(self, resources, nthreads=2):
        # Stay unavailable unless every prerequisite is met.
        self.available = False
        if nthreads <= 0:
            return
        if not os.path.exists(resources.full_hclg_path):
            return
        worker_queue = kaldi_queue.build(resources, nthreads=nthreads)
        self.mtt = MultiThreadedTranscriber(worker_queue, nthreads=nthreads)
        self.available = True

    def transcribe(self, wavfile, progress_cb=None, logging=None):
        """Run the multi-threaded transcriber and wrap its word list."""
        words, duration = self.mtt.transcribe(wavfile, progress_cb=progress_cb)
        return self.make_transcription_alignment(words)

    @staticmethod
    def make_transcription_alignment(trans):
        # Spoof the `diff_align` output format
        transcript = ""
        words = []
        for hyp in trans:
            offset = len(transcript)
            words.append(transcription.Word(
                case=transcription.Word.SUCCESS,
                startOffset=offset,
                endOffset=offset + len(hyp.word),
                word=hyp.word,
                alignedWord=hyp.word,
                phones=hyp.phones,
                start=hyp.start,
                end=hyp.end))
            transcript = transcript + hyp.word + " "
        return Transcription(words=words, transcript=transcript)
class FullTranscriber():
    """Transcribe audio with a full language model (no transcript required)."""

    def __init__(self, resources, nthreads=2):
        # Stay unavailable unless we have workers and a full-LM decoding graph.
        self.available = False
        if nthreads <= 0:
            return
        if not os.path.exists(resources.full_hclg_path):
            return
        queue = kaldi_queue.build(resources, nthreads=nthreads)
        self.mtt = MultiThreadedTranscriber(queue, nthreads=nthreads)
        self.available = True

    def transcribe(self, wavfile, progress_cb=None, logging=None):
        """Run the multi-threaded transcriber and wrap its word list.

        BUG FIX: `mtt.transcribe` returns a `(words, duration)` pair — every
        other call site in this file unpacks two values.  The original bound
        the whole tuple to `words` and passed it to
        `make_transcription_alignment`, which would then iterate the tuple
        instead of the word list.
        """
        words, duration = self.mtt.transcribe(wavfile, progress_cb=progress_cb)
        return self.make_transcription_alignment(words)

    @staticmethod
    def make_transcription_alignment(trans):
        # Spoof the `diff_align` output format
        # NOTE(review): a sibling implementation of this class uses the
        # `transcription.Word.SUCCESS` constant here — confirm whether this
        # literal should be unified with it.
        transcript = ""
        words = []
        for t_wd in trans:
            word = transcription.Word(
                case="success",
                startOffset=len(transcript),
                endOffset=len(transcript) + len(t_wd.word),
                word=t_wd.word,
                alignedWord=t_wd.word,
                phones=t_wd.phones,
                start=t_wd.start,
                end=t_wd.end)
            words.append(word)
            transcript += word.word + " "
        return Transcription(words=words, transcript=transcript)
class ForcedAligner():
    """Align a known transcript against audio using a per-transcript bigram LM."""

    def __init__(self, resources, transcript, nthreads=4, context_width=3,
                 **kwargs):
        self.kwargs = kwargs
        self.nthreads = nthreads
        self.transcript = transcript
        self.resources = resources
        self.context_width = context_width
        self.ms = metasentence.MetaSentence(transcript, resources.vocab)

        # Build a decoding graph restricted to this transcript's vocabulary.
        kaldi_seq = self.ms.get_kaldi_sequence()
        hclg_path = language_model.make_bigram_language_model(
            kaldi_seq, resources.proto_langdir,
            context_width=context_width, **kwargs)
        self.queue = kaldi_queue.build(
            resources, hclg_path=hclg_path, nthreads=nthreads)
        self.mtt = MultiThreadedTranscriber(self.queue, nthreads=nthreads)

    def transcribe(self, wavfile, progress_cb=None, logging=None):
        """Transcribe and align `wavfile`; returns a Transcription."""
        words, duration = self.mtt.transcribe(wavfile, progress_cb=progress_cb)

        # Drain and stop the kaldi workers (would this be gc'ed?)
        for _ in range(self.nthreads):
            worker = self.queue.get()
            worker.stop()

        # First pass: line the hypothesis words up with the transcript.
        words = diff_align.align(words, self.ms, **self.kwargs)

        if logging is not None:
            unaligned = sum(1 for w in words if w.not_found_in_audio())
            logging.info(
                "%d unaligned words (of %d)" % (unaligned, len(words)))

        if progress_cb is not None:
            progress_cb({'status': 'ALIGNING'})

        # Second pass: re-decode the regions the first pass could not place.
        words = multipass.realign(
            wavfile, words, self.ms,
            context_width=self.context_width,
            resources=self.resources,
            nthreads=self.nthreads,
            progress_cb=progress_cb)

        if logging is not None:
            unaligned = sum(1 for w in words if w.not_found_in_audio())
            logging.info(
                "after 2nd pass: %d unaligned words (of %d)"
                % (unaligned, len(words)))

        words = AdjacencyOptimizer(words, duration).optimize()
        return Transcription(words=words, transcript=self.transcript)
def test_transcriber(self):
    """Smoke-test MultiThreadedTranscriber on a known clip of lucier.mp3."""
    from gentle import resampled, kaldi_queue, standard_kaldi, Resources
    from gentle.transcriber import MultiThreadedTranscriber

    resources = Resources()
    transcriber = MultiThreadedTranscriber(kaldi_queue.build(resources, 1))
    with resampled('examples/data/lucier.mp3', 10.5, 2.5) as wav_path:
        words, duration = transcriber.transcribe(wav_path)
        self.assertEqual(words[0].word, "different")
def test_transcriber(self):
    """Smoke-test MultiThreadedTranscriber on a known clip of self.audio."""
    import subprocess
    from gentle import resampled, kaldi_queue, standard_kaldi, Resources
    from gentle.transcriber import MultiThreadedTranscriber

    # Fold kaldi's stderr into stdout so test output stays readable.
    standard_kaldi.STDERR = subprocess.STDOUT

    resources = Resources()
    transcriber = MultiThreadedTranscriber(kaldi_queue.build(resources, 1))
    with resampled(self.audio, 10.5, 2.5) as wav_path:
        words, duration = transcriber.transcribe(wav_path)
        self.assertEqual(words[0].word, "different")
class ForcedAligner():
    """Align a known transcript against audio using a per-transcript bigram LM."""

    def __init__(self, resources, transcript, nthreads=4, **kwargs):
        self.kwargs = kwargs
        self.nthreads = nthreads
        self.transcript = transcript
        self.resources = resources
        self.ms = metasentence.MetaSentence(transcript, resources.vocab)

        # Build a decoding graph restricted to this transcript's vocabulary.
        kaldi_seq = self.ms.get_kaldi_sequence()
        hclg_path = language_model.make_bigram_language_model(
            kaldi_seq, resources.proto_langdir, **kwargs)
        self.queue = kaldi_queue.build(
            resources, hclg_path=hclg_path, nthreads=nthreads)
        self.mtt = MultiThreadedTranscriber(self.queue, nthreads=nthreads)

    def transcribe(self, wavfile, progress_cb=None, logging=None):
        """Transcribe and align `wavfile`; returns a Transcription."""
        words, duration = self.mtt.transcribe(wavfile, progress_cb=progress_cb)

        # Drain and stop the kaldi workers (would this be gc'ed?)
        for _ in range(self.nthreads):
            worker = self.queue.get()
            worker.stop()

        # First pass: line the hypothesis words up with the transcript.
        words = diff_align.align(words, self.ms, **self.kwargs)

        if logging is not None:
            unaligned = sum(1 for w in words if w.not_found_in_audio())
            logging.info(
                "%d unaligned words (of %d)" % (unaligned, len(words)))

        if progress_cb is not None:
            progress_cb({'status': 'ALIGNING'})

        # Second pass: re-decode the regions the first pass could not place.
        words = multipass.realign(
            wavfile, words, self.ms,
            resources=self.resources,
            nthreads=self.nthreads,
            progress_cb=progress_cb)

        if logging is not None:
            unaligned = sum(1 for w in words if w.not_found_in_audio())
            logging.info(
                "after 2nd pass: %d unaligned words (of %d)"
                % (unaligned, len(words)))

        words = AdjacencyOptimizer(words, duration).optimize()
        return Transcription(words=words, transcript=self.transcript)