def realign(chunk):
    """Re-align the words of one gap against the matching slice of audio.

    ``chunk`` is a mapping with three keys:

    * ``"start"`` -- object whose ``.end`` is the gap's start time, or
      ``None`` to start at time 0;
    * ``"end"`` -- object whose ``.start`` is the gap's end time, or
      ``None`` to run to the end of the WAV file;
    * ``"words"`` -- the words to realign; the first and last supply the
      transcript offsets of the gap.

    Gaps shorter than 0.01s or longer than 60s are logged and skipped.
    Otherwise the result is appended to the enclosing ``realignments``
    list and ``progress_cb`` (when set) is told the completed fraction.
    Returns ``None`` in every case.
    """
    # Open the WAV once, pull out everything that is needed from it, and
    # always release the handle -- including on the early return below.
    wav_obj = wave.open(wavfile, 'r')
    try:
        framerate = wav_obj.getframerate()

        start_t = 0 if chunk["start"] is None else chunk["start"].end
        if chunk["end"] is None:
            end_t = wav_obj.getnframes() / float(framerate)
        else:
            end_t = chunk["end"].start

        duration = end_t - start_t
        if duration < 0.01 or duration > 60:
            logging.debug("cannot realign %d words with duration %f",
                          len(chunk['words']), duration)
            return

        wav_obj.setpos(int(start_t * framerate))
        buf = wav_obj.readframes(int(duration * framerate))
    finally:
        wav_obj.close()

    # Create a language model
    offset_offset = chunk['words'][0].startOffset
    chunk_len = chunk['words'][-1].endOffset - offset_offset
    chunk_transcript = ms.raw_sentence[
        offset_offset:offset_offset + chunk_len].encode("utf-8")
    chunk_ms = metasentence.MetaSentence(chunk_transcript, resources.vocab)
    chunk_ks = chunk_ms.get_kaldi_sequence()

    chunk_gen_hclg_filename = language_model.make_bigram_language_model(
        chunk_ks, resources.proto_langdir)
    k = standard_kaldi.Kaldi(
        resources.nnet_gpu_path,
        chunk_gen_hclg_filename,
        resources.proto_langdir)

    k.push_chunk(buf)
    ret = [transcription.Word(**wd) for wd in k.get_final()]
    k.stop()

    word_alignment = diff_align.align(ret, chunk_ms)

    # Adjust startOffset, endOffset, and timing to match originals:
    # the chunk was aligned relative to its own start, so shift back.
    for wd in word_alignment:
        if wd.end is not None:
            # Apply timing offset
            wd.start += start_t
            wd.end += start_t

        if wd.endOffset is not None:
            wd.startOffset += offset_offset
            wd.endOffset += offset_offset

    # "chunk" should be replaced by "words"
    realignments.append({"chunk": chunk, "words": word_alignment})

    if progress_cb is not None:
        progress_cb(
            {"percent": len(realignments) / float(len(to_realign))})
def __init__(self, resources, transcript, nthreads=4, **kwargs):
    """Set up the objects needed to align ``transcript``.

    The arguments are stored on the instance. A ``MetaSentence`` is built
    from the transcript and ``resources.vocab``; its Kaldi sequence is
    turned into a bigram language model (``**kwargs`` are forwarded to
    ``make_bigram_language_model``). The resulting HCLG path is used to
    build a Kaldi queue, which is wrapped in a
    ``MultiThreadedTranscriber`` using ``nthreads`` threads.
    """
    self.kwargs = kwargs
    self.nthreads = nthreads
    self.transcript = transcript
    self.resources = resources

    self.ms = metasentence.MetaSentence(transcript, resources.vocab)

    # Language model restricted to this transcript's word sequence.
    hclg_path = language_model.make_bigram_language_model(
        self.ms.get_kaldi_sequence(),
        resources.proto_langdir,
        **kwargs
    )

    self.queue = kaldi_queue.build(
        resources,
        hclg_path=hclg_path,
        nthreads=nthreads,
    )
    self.mtt = MultiThreadedTranscriber(self.queue, nthreads=nthreads)
def realign_sub(chunk):
    """Re-align the words of one gap against the matching slice of audio.

    ``chunk["start"]`` / ``chunk["end"]`` bound the gap (``None`` means the
    beginning / end of the WAV file) and ``chunk["words"]`` holds the words
    to realign. Gaps shorter than 0.75s or longer than 60s are logged and
    skipped. Otherwise the new alignment is appended to the enclosing
    ``realignments`` list and ``progress_cb`` (when set) receives the
    completed fraction. Always returns ``None``.
    """
    with wave.open(wavfile, "rb") as wav_obj:
        start_t = 0 if chunk["start"] is None else chunk["start"].end
        end_t = (
            wav_obj.getnframes() / float(wav_obj.getframerate())
            if chunk["end"] is None
            else chunk["end"].start
        )

    duration = end_t - start_t
    # XXX: the minimum length seems bigger now (?)
    if duration < 0.75 or duration > 60:
        logging.debug(
            "cannot realign %d words with duration %f",
            len(chunk["words"]),
            duration,
        )
        return

    # Create a language model
    gap_words = chunk["words"]
    offset_offset = gap_words[0].startOffset
    chunk_len = gap_words[-1].endOffset - offset_offset
    chunk_transcript = ms.raw_sentence[
        offset_offset:offset_offset + chunk_len
    ].encode("utf-8")
    chunk_ms = metasentence.MetaSentence(chunk_transcript, resources.vocab)
    chunk_gen_hclg_filename = language_model.make_bigram_language_model(
        chunk_ms.get_kaldi_sequence(), resources.proto_langdir)

    k = standard_kaldi.Kaldi(
        resources.nnet_gpu_path,
        chunk_gen_hclg_filename,
        resources.proto_langdir,
    )

    # Pull just the gap's frames out of the audio file.
    with wave.open(wavfile, "rb") as wav_obj:
        wav_obj.setpos(int(start_t * wav_obj.getframerate()))
        buf = wav_obj.readframes(int(duration * wav_obj.getframerate()))

    k.push_chunk(buf)
    ret = [transcription.Word(**raw) for raw in k.get_final()]
    k.stop()

    word_alignment = diff_align.align(ret, chunk_ms)

    # Results are relative to the gap; move them back into file coordinates.
    for aligned in word_alignment:
        aligned.shift(time=start_t, offset=offset_offset)

    # "chunk" should be replaced by "words"
    realignments.append({"chunk": chunk, "words": word_alignment})

    if progress_cb is not None:
        fraction_done = len(realignments) / float(len(to_realign))
        progress_cb({"percent": fraction_done})
def realign(chunk):
    """Re-align the words of one gap against the matching slice of audio.

    ``chunk`` is a mapping with three keys:

    * ``"start"`` -- dict whose ``"end"`` is the gap's start time; a falsy
      value means the gap starts at time 0;
    * ``"end"`` -- dict whose ``"start"`` is the gap's end time, or
      ``None`` to run to the end of the WAV file;
    * ``"words"`` -- the word dicts to realign; the first and last supply
      the transcript offsets of the gap.

    Gaps shorter than 0.01s or longer than 60s are logged and skipped.
    Otherwise the result is appended to the enclosing ``realignments``
    list and ``progress_cb`` (when set) is told the completed fraction.
    Returns ``None`` in every case.
    """
    # Open the WAV once, pull out everything that is needed from it, and
    # always release the handle -- including on the early return below.
    wav_obj = wave.open(wavfile, 'r')
    try:
        framerate = wav_obj.getframerate()

        start_t = (chunk["start"] or {"end": 0})["end"]
        end_t = chunk["end"]
        if end_t is None:
            end_t = wav_obj.getnframes() / float(framerate)
        else:
            end_t = end_t["start"]

        duration = end_t - start_t
        if duration < 0.01 or duration > 60:
            logging.debug("cannot realign %d words with duration %f",
                          len(chunk['words']), duration)
            return

        wav_obj.setpos(int(start_t * framerate))
        buf = wav_obj.readframes(int(duration * framerate))
    finally:
        wav_obj.close()

    # Create a language model
    offset_offset = chunk['words'][0]['startOffset']
    chunk_len = chunk['words'][-1]['endOffset'] - offset_offset
    chunk_transcript = ms.raw_sentence[
        offset_offset:offset_offset + chunk_len].encode("utf-8")
    chunk_ms = metasentence.MetaSentence(chunk_transcript, vocab)
    chunk_ks = chunk_ms.get_kaldi_sequence()

    chunk_gen_hclg_filename = language_model.make_bigram_language_model(
        chunk_ks, proto_langdir)
    k = standard_kaldi.Kaldi(
        get_resource('data/nnet_a_gpu_online'),
        chunk_gen_hclg_filename,
        proto_langdir)

    k.push_chunk(buf)
    ret = k.get_final()
    k.stop()

    word_alignment = diff_align.align(ret, chunk_ms)

    # Adjust startOffset, endOffset, and timing to match originals.
    # Test against None rather than truthiness so that a legitimate
    # value of 0 is still shifted.
    for wd in word_alignment:
        if wd.get("end") is not None:
            # Apply timing offset
            wd['start'] += start_t
            wd['end'] += start_t

        if wd.get("endOffset") is not None:
            wd['startOffset'] += offset_offset
            wd['endOffset'] += offset_offset

    # "chunk" should be replaced by "words"
    realignments.append({"chunk": chunk, "words": word_alignment})

    if progress_cb is not None:
        progress_cb({"percent": len(realignments) / float(len(to_realign))})
def create_language_model(self):
    """Build a bigram language model from the command sequences.

    Returns the path of the generated HCLG file. When the instance
    already has a ``gen_hclg_filename`` attribute, the freshly generated
    file is moved over that path and the existing path is returned
    instead. This method does not assign ``self.gen_hclg_filename``.
    """
    fresh_hclg = language_model.make_bigram_language_model(
        self._command_seqs.values(),
        proto_langdir,
        conservative=True)

    if not hasattr(self, 'gen_hclg_filename'):
        return fresh_hclg

    # Overwrite the old file so the known path keeps pointing at the
    # current model.
    shutil.move(fresh_hclg, self.gen_hclg_filename)
    return self.gen_hclg_filename
def create_language_model(self):
    """Generate the command-sequence language model and return its path.

    A bigram model is built from ``self._command_seqs.values()``. If the
    instance already carries ``gen_hclg_filename``, the new file replaces
    the one at that path and that path is returned; otherwise the path of
    the newly generated file is returned. The attribute itself is not
    assigned here.
    """
    command_sequences = self._command_seqs.values()
    new_hclg_path = language_model.make_bigram_language_model(
        command_sequences, proto_langdir, conservative=True)

    if hasattr(self, 'gen_hclg_filename'):
        # Replace the previous model file in place.
        shutil.move(new_hclg_path, self.gen_hclg_filename)
        return self.gen_hclg_filename

    return new_hclg_path
def realign(chunk):
    """Re-align the words of one gap against the matching slice of audio.

    ``chunk`` is a mapping with three keys:

    * ``"start"`` -- object whose ``.end`` is the gap's start time, or
      ``None`` to start at time 0;
    * ``"end"`` -- object whose ``.start`` is the gap's end time, or
      ``None`` to run to the end of the WAV file;
    * ``"words"`` -- the words to realign; the first and last supply the
      transcript offsets of the gap.

    Gaps shorter than 0.75s or longer than 60s are logged and skipped.
    Otherwise the result is appended to the enclosing ``realignments``
    list and ``progress_cb`` (when set) is told the completed fraction.
    Returns ``None`` in every case.
    """
    # Open the WAV once, pull out everything that is needed from it, and
    # always release the handle -- including on the early return below.
    wav_obj = wave.open(wavfile, 'rb')
    try:
        framerate = wav_obj.getframerate()

        start_t = 0 if chunk["start"] is None else chunk["start"].end
        if chunk["end"] is None:
            end_t = wav_obj.getnframes() / float(framerate)
        else:
            end_t = chunk["end"].start

        duration = end_t - start_t
        # XXX: the minimum length seems bigger now (?)
        if duration < 0.75 or duration > 60:
            logging.debug("cannot realign %d words with duration %f",
                          len(chunk['words']), duration)
            return

        wav_obj.setpos(int(start_t * framerate))
        buf = wav_obj.readframes(int(duration * framerate))
    finally:
        wav_obj.close()

    # Create a language model
    offset_offset = chunk['words'][0].startOffset
    chunk_len = chunk['words'][-1].endOffset - offset_offset
    chunk_transcript = ms.raw_sentence[
        offset_offset:offset_offset + chunk_len].encode("utf-8")
    chunk_ms = metasentence.MetaSentence(chunk_transcript, resources.vocab)
    chunk_ks = chunk_ms.get_kaldi_sequence()

    chunk_gen_hclg_filename = language_model.make_bigram_language_model(
        chunk_ks, resources.proto_langdir)
    k = standard_kaldi.Kaldi(
        resources.nnet_gpu_path,
        chunk_gen_hclg_filename,
        resources.proto_langdir)

    k.push_chunk(buf)
    ret = [transcription.Word(**wd) for wd in k.get_final()]
    k.stop()

    word_alignment = diff_align.align(ret, chunk_ms)

    # The chunk was aligned relative to its own start; shift times and
    # transcript offsets back.
    for wd in word_alignment:
        wd.shift(time=start_t, offset=offset_offset)

    # "chunk" should be replaced by "words"
    realignments.append({"chunk": chunk, "words": word_alignment})

    if progress_cb is not None:
        progress_cb({"percent": len(realignments) / float(len(to_realign))})
def realign(chunk): nonlocal ignored if chunk["start"] is None: start_t = 0 else: start_t = chunk["start"].end if chunk["end"] is None: end_t = final_end_t else: end_t = chunk["end"].start duration = end_t - start_t # XXX: the minimum length seems bigger now (?) if duration < 0.75 or duration > 60: logging.debug("cannot realign %d words with duration %f" % (len(chunk["words"]), duration)) ignored += 1 return # Create a language model offset_offset = chunk["words"][0].startOffset chunk_len = chunk["words"][-1].endOffset - offset_offset chunk_transcript = ms.raw_sentence[offset_offset:offset_offset + chunk_len].encode("utf-8") realign_transcript = chunk_transcript.decode("utf-8").replace( "\n", " ") logging.debug("realign transcript: %s", realign_transcript) chunk_ms = metasentence.MetaSentence(chunk_transcript, resources.vocab) chunk_ks = chunk_ms.get_kaldi_sequence() chunk_gen_hclg_filename = language_model.make_bigram_language_model( chunk_ks, resources.proto_langdir) wav_obj = wave.open(wavfile, "rb") wav_obj.setpos(int(start_t * wav_obj.getframerate())) buf = wav_obj.readframes(int(duration * wav_obj.getframerate())) wav_obj.close() retries = 5 while retries > 0: try: k = standard_kaldi.Kaldi( resources.nnet_gpu_path, chunk_gen_hclg_filename, resources.proto_langdir, ) k.push_chunk(buf) ret = [transcription.Word(**wd) for wd in k.get_final()] k.stop() break except BrokenPipeError: retries -= 1 if retries == 0: raise word_alignment = diff_align.align(ret, chunk_ms) for wd in word_alignment: wd.shift(time=start_t, offset=offset_offset) # "chunk" should be replaced by "words" realignments.append({"chunk": chunk, "words": word_alignment}) if progress_cb is not None: progress_cb({ "percent": (ignored + len(realignments)) / float(len(to_realign)) })
def transcribe(self, uid, transcript, audio, async): proto_langdir = get_resource('PROTO_LANGDIR') status = self.get_status(uid) status['status'] = 'STARTED' output = { 'transcript': transcript } outdir = os.path.join(self.data_dir, 'transcriptions', uid) tran_path = os.path.join(outdir, 'transcript.txt') with codecs.open(tran_path, 'w', 'utf-8') as tranfile: tranfile.write(transcript) audio_path = os.path.join(outdir, 'upload') with open(audio_path, 'w') as wavfile: wavfile.write(audio) status['status'] = 'ENCODING' wavfile = os.path.join(outdir, 'a.wav') if to_wav(os.path.join(outdir, 'upload'), wavfile) != 0: status['status'] = 'ERROR' status['error'] = "Encoding failed. Make sure that you've uploaded a valid media file." # Save the status so that errors are recovered on restart of the server # XXX: This won't work, because the endpoint will override this file with open(os.path.join(outdir, 'status.json'), 'w') as jsfile: json.dump(status, jsfile, indent=2) return #XXX: Maybe we should pass this wave object instead of the # file path to align_progress wav_obj = wave.open(wavfile, 'r') status['duration'] = wav_obj.getnframes() / float(wav_obj.getframerate()) status['status'] = 'TRANSCRIBING' def on_progress(p): for k,v in p.items(): status[k] = v if len(transcript.strip()) > 0: ms = metasentence.MetaSentence(transcript, self.vocab) ks = ms.get_kaldi_sequence() gen_hclg_filename = language_model.make_bigram_language_model(ks, proto_langdir) kaldi_queue = Queue() for i in range(self.nthreads): kaldi_queue.put(standard_kaldi.Kaldi( get_resource('data/nnet_a_gpu_online'), gen_hclg_filename, proto_langdir) ) mtt = MultiThreadedTranscriber(kaldi_queue, nthreads=self.nthreads) elif hasattr(self, 'full_transcriber'): mtt = self.full_transcriber else: status['status'] = 'ERROR' status['error'] = 'No transcript provided and no language model for full transcription' return words = mtt.transcribe(wavfile, progress_cb=on_progress) output = {} if len(transcript.strip()) > 0: # 
Clear queue (would this be gc'ed?) for i in range(self.nthreads): k = kaldi_queue.get() k.stop() # Align words output['words'] = diff_align.align(words, ms) output['transcript'] = transcript # Perform a second-pass with unaligned words logging.info("%d unaligned words (of %d)" % (len([X for X in output['words'] if X.get("case") == "not-found-in-audio"]), len(output['words']))) status['status'] = 'ALIGNING' output['words'] = multipass.realign(wavfile, output['words'], ms, nthreads=self.nthreads, progress_cb=on_progress) logging.info("after 2nd pass: %d unaligned words (of %d)" % (len([X for X in output['words'] if X.get("case") == "not-found-in-audio"]), len(output['words']))) else: # Match format output = make_transcription_alignment({"words": words}) # ...remove the original upload os.unlink(os.path.join(outdir, 'upload')) # Save with open(os.path.join(outdir, 'align.json'), 'w') as jsfile: json.dump(output, jsfile, indent=2) with open(os.path.join(outdir, 'align.csv'), 'w') as csvfile: csvfile.write(to_csv(output)) # Inline the alignment into the index.html file. htmltxt = open(get_resource('www/view_alignment.html')).read() htmltxt = htmltxt.replace("var INLINE_JSON;", "var INLINE_JSON=%s;" % (json.dumps(output))); open(os.path.join(outdir, 'index.html'), 'w').write(htmltxt) status['status'] = 'OK' logging.info('done with transcription.') return output
# Generating HCLG.fst (using Gentle here)
# Read both inputs through context managers so the handles are closed.
with open(text) as txt_file:
    txt_in = txt_file.read()
with open(proto_dir + "/tdnn_7b_chain_online/graph_pp/words.txt") as vocab_file:
    vocab_in = ms.load_vocabulary(vocab_file)
print("My Vocab", vocab_in)

# Split on any whitespace: splitting on a single space left the trailing
# newline glued to the last word and turned repeated spaces into empty
# tokens, both of which were then treated as out-of-vocabulary.
# The first token is skipped (presumably an utterance id -- confirm).
source_words_list = txt_in.split()[1:]

# We must supply a version of `words_in` that only has words within our
# vocabulary (ie. proto_langdir/words.txt); anything else becomes OOV.
new_wdlist = [
    wd if wd in vocab_in else lm.OOV_TERM
    for wd in source_words_list
]
print("Supplying these words", new_wdlist)

HCLGFile = lm.make_bigram_language_model([new_wdlist], proto_dir)
print(HCLGFile)

# saving HCLG.fst in proto_langdir
# renaming temp_HCLG.fst to HCLG.fst
# storing it in proto_langdir (argv[2])
shutil.move(HCLGFile, proto_dir + "/tdnn_7b_chain_online/graph_pp/HCLG.fst")