def realign(chunk):
    """Re-align the words of one chunk against just its slice of the audio.

    ``chunk`` is a mapping with the keys ``"start"``, ``"end"`` and
    ``"words"``.  ``chunk["start"]`` / ``chunk["end"]`` are either ``None``
    or objects whose ``.end`` / ``.start`` attribute bounds the audio slice;
    ``chunk["words"]`` is a sequence of objects carrying ``startOffset`` and
    ``endOffset``.

    The names ``wavfile``, ``ms``, ``resources``, ``realignments``,
    ``progress_cb`` and ``to_realign`` are not defined here and must come
    from the enclosing scope.

    Returns ``None``.  On success a ``{"chunk": ..., "words": ...}`` entry is
    appended to ``realignments`` and ``progress_cb`` (if set) is notified.
    Slices shorter than 0.01s or longer than 60s are skipped.
    """
    # The wave handle is only needed to learn the total duration; close it
    # as soon as that is known instead of leaking the descriptor.
    wav_obj = wave.open(wavfile, 'r')
    try:
        if chunk["start"] is None:
            start_t = 0
        else:
            start_t = chunk["start"].end

        if chunk["end"] is None:
            end_t = wav_obj.getnframes() / float(wav_obj.getframerate())
        else:
            end_t = chunk["end"].start
    finally:
        wav_obj.close()

    duration = end_t - start_t
    if duration < 0.01 or duration > 60:
        logging.debug("cannot realign %d words with duration %f",
                      len(chunk['words']), duration)
        return

    # Create a language model restricted to this chunk's text
    offset_offset = chunk['words'][0].startOffset
    chunk_len = chunk['words'][-1].endOffset - offset_offset
    chunk_transcript = ms.raw_sentence[offset_offset:offset_offset + chunk_len].encode("utf-8")
    chunk_ms = metasentence.MetaSentence(chunk_transcript, resources.vocab)
    chunk_ks = chunk_ms.get_kaldi_sequence()

    chunk_gen_hclg_filename = language_model.make_bigram_language_model(
        chunk_ks, resources.proto_langdir)

    # Read the audio slice before starting the decoder, so a failed read
    # does not leave a decoder instance behind.
    wav_obj = wave.open(wavfile, 'r')
    try:
        framerate = wav_obj.getframerate()
        wav_obj.setpos(int(start_t * framerate))
        buf = wav_obj.readframes(int(duration * framerate))
    finally:
        wav_obj.close()

    k = standard_kaldi.Kaldi(resources.nnet_gpu_path,
                             chunk_gen_hclg_filename,
                             resources.proto_langdir)
    try:
        k.push_chunk(buf)
        ret = [transcription.Word(**wd) for wd in k.get_final()]
    finally:
        # Always stop the decoder, even when decoding raised.
        k.stop()

    word_alignment = diff_align.align(ret, chunk_ms)

    # Adjust startOffset, endOffset, and timing to match originals
    for wd in word_alignment:
        if wd.end is not None:
            # Apply timing offset
            wd.start += start_t
            wd.end += start_t

        if wd.endOffset is not None:
            wd.startOffset += offset_offset
            wd.endOffset += offset_offset

    # "chunk" should be replaced by "words"
    realignments.append({"chunk": chunk, "words": word_alignment})

    if progress_cb is not None:
        progress_cb(
            {"percent": len(realignments) / float(len(to_realign))})
def onchange(self, sender, change_doc):
    """React to a database change, then forward it to ``minidb.DBFactory``.

    * A doc of type ``"command"`` gets its kaldi sequence computed, stored
      on the doc under ``"_ks"`` and cached in ``self._command_seqs``.
    * A ``'delete'`` change for a cached id drops that cache entry.
    * A doc of type ``"audio-command"`` is queued in
      ``self._pending_audio_commands`` and the factory is asked to check
      its pending audio commands.

    When the cached command set changed, the language model is rebuilt and
    the factory's ``re_run_everything`` is scheduled via
    ``reactor.callInThread``.
    """
    doc_kind = change_doc.get("doc", {}).get("type")
    commands_changed = False

    if doc_kind == "command":
        # Save kaldi-sequence from the text
        text = change_doc["doc"].get("text", "")
        kaldi_seq = metasentence.MetaSentence(text, vocab).get_kaldi_sequence()
        change_doc["doc"]["_ks"] = kaldi_seq
        self._command_seqs[change_doc["id"]] = kaldi_seq

        # Set "sender" to None so that all peers get a change update
        sender = None
        commands_changed = True
    elif change_doc["type"] == 'delete' and change_doc["id"] in self._command_seqs:
        del self._command_seqs[change_doc["id"]]
        commands_changed = True
    elif doc_kind == "audio-command":
        # One parenthesised expression: same output as the two-item print
        # statement, and it parses with or without print_function.
        print('got new audio command' + ' ' + str(change_doc['doc']))
        self._pending_audio_commands.append(change_doc["doc"])
        self.subdir_resources['factory'].check_pending_audio_commands()

    minidb.DBFactory.onchange(self, sender, change_doc)

    if not commands_changed:
        return

    self.create_language_model()
    reactor.callInThread(
        self.subdir_resources['factory'].re_run_everything)
def __init__(self, resources, transcript, nthreads=4, **kwargs):
    """Prepare everything needed to transcribe against ``transcript``.

    Builds a ``MetaSentence`` from the transcript and ``resources.vocab``,
    generates a bigram language model from its kaldi sequence (extra
    ``kwargs`` are forwarded to the model builder), then creates a kaldi
    queue and a ``MultiThreadedTranscriber`` using ``nthreads``.
    """
    self.resources = resources
    self.transcript = transcript
    self.nthreads = nthreads
    self.kwargs = kwargs

    self.ms = metasentence.MetaSentence(transcript, resources.vocab)
    kaldi_seq = self.ms.get_kaldi_sequence()
    hclg_path = language_model.make_bigram_language_model(
        kaldi_seq, resources.proto_langdir, **kwargs)

    self.queue = kaldi_queue.build(
        resources, hclg_path=hclg_path, nthreads=nthreads)
    self.mtt = MultiThreadedTranscriber(self.queue, nthreads=nthreads)
def realign_sub(chunk):
    """Re-align the words of one chunk against just its slice of the audio.

    ``chunk`` is a mapping with the keys ``"start"``, ``"end"`` and
    ``"words"``.  ``chunk["start"]`` / ``chunk["end"]`` are either ``None``
    or objects whose ``.end`` / ``.start`` attribute bounds the audio slice;
    ``chunk["words"]`` is a sequence of objects carrying ``startOffset`` and
    ``endOffset``.

    The names ``wavfile``, ``ms``, ``resources``, ``realignments``,
    ``progress_cb`` and ``to_realign`` are not defined here and must come
    from the enclosing scope.

    Returns ``None``.  On success a ``{"chunk": ..., "words": ...}`` entry is
    appended to ``realignments`` and ``progress_cb`` (if set) is notified.
    Slices shorter than 0.75s or longer than 60s are skipped.
    """
    with wave.open(wavfile, "rb") as wav_obj:
        if chunk["start"] is None:
            start_t = 0
        else:
            start_t = chunk["start"].end

        if chunk["end"] is None:
            end_t = wav_obj.getnframes() / float(wav_obj.getframerate())
        else:
            end_t = chunk["end"].start

    duration = end_t - start_t
    # XXX: the minimum length seems bigger now (?)
    if duration < 0.75 or duration > 60:
        logging.debug(
            "cannot realign %d words with duration %f",
            len(chunk["words"]),
            duration,
        )
        return

    # Create a language model restricted to this chunk's text
    offset_offset = chunk["words"][0].startOffset
    chunk_len = chunk["words"][-1].endOffset - offset_offset
    chunk_transcript = ms.raw_sentence[offset_offset:offset_offset + chunk_len].encode("utf-8")
    chunk_ms = metasentence.MetaSentence(chunk_transcript, resources.vocab)
    chunk_ks = chunk_ms.get_kaldi_sequence()

    chunk_gen_hclg_filename = language_model.make_bigram_language_model(
        chunk_ks, resources.proto_langdir)

    # Read the audio slice before starting the decoder, so a failed read
    # does not leave a decoder instance behind.
    with wave.open(wavfile, "rb") as wav_obj:
        framerate = wav_obj.getframerate()
        wav_obj.setpos(int(start_t * framerate))
        buf = wav_obj.readframes(int(duration * framerate))

    k = standard_kaldi.Kaldi(resources.nnet_gpu_path,
                             chunk_gen_hclg_filename,
                             resources.proto_langdir)
    try:
        k.push_chunk(buf)
        ret = [transcription.Word(**wd) for wd in k.get_final()]
    finally:
        # Always stop the decoder, even when decoding raised.
        k.stop()

    word_alignment = diff_align.align(ret, chunk_ms)

    # Move timings and text offsets from chunk-relative back to the
    # coordinates of the full recording / transcript.
    for wd in word_alignment:
        wd.shift(time=start_t, offset=offset_offset)

    # "chunk" should be replaced by "words"
    realignments.append({"chunk": chunk, "words": word_alignment})

    if progress_cb is not None:
        progress_cb(
            {"percent": len(realignments) / float(len(to_realign))})
# NOTE(review): this loop uses `yield`, so it is the body of a generator
# whose `def` line lies outside the visible source; its real indentation
# could not be recovered from here.
#
# Every (op, s1, e1, s2, e2) opcode is split into opcodes that each cover at
# most one item on either side.
for op, s1, e1, s2, e2 in opcodes:
    if op == 'delete':
        # One opcode per deleted item; the second range stays empty at s2.
        for i in range(s1, e1):
            yield (op, i, i+1, s2, s2)
    elif op == 'insert':
        # One opcode per inserted item; the first range stays empty at s1.
        for i in range(s2, e2):
            yield (op, s1, s1, i, i+1)
    else:
        len1 = e1-s1
        len2 = e2-s2
        # Pair items one-to-one for as long as both ranges have items left.
        for i1, i2 in zip(range(s1, e1), range(s2, e2)):
            yield (op, i1, i1 + 1, i2, i2 + 1)
        # Leftovers on the first side are emitted as single-item deletes.
        if len1 > len2:
            for i in range(s1 + len2, e1):
                yield ('delete', i, i+1, e2, e2)
        # Leftovers on the second side are emitted as single-item inserts.
        # NOTE(review): this anchors at s1, whereas the delete branch above
        # anchors at e2 (the END of the other range) -- possibly e1 was
        # intended here; confirm.
        if len2 > len1:
            for i in range(s2 + len1, e2):
                yield ('insert', s1, s1, i, i+1)

# Command-line entry point: <text file> <json file> <output file>.
# Aligns the 'words' list of the JSON file against the text and writes the
# result as indented JSON.
if __name__=='__main__':
    TEXT_FILE = sys.argv[1]
    JSON_FILE = sys.argv[2]
    OUTPUT_FILE = sys.argv[3]

    # NOTE(review): none of the three files opened below is closed
    # explicitly.
    ms = metasentence.MetaSentence(open(TEXT_FILE).read(), Resources().vocab)
    alignment = json.load(open(JSON_FILE))['words']

    out = align(alignment, ms)

    json.dump(out, open(OUTPUT_FILE, 'w'), indent=2)
def realign(chunk): nonlocal ignored if chunk["start"] is None: start_t = 0 else: start_t = chunk["start"].end if chunk["end"] is None: end_t = final_end_t else: end_t = chunk["end"].start duration = end_t - start_t # XXX: the minimum length seems bigger now (?) if duration < 0.75 or duration > 60: logging.debug("cannot realign %d words with duration %f" % (len(chunk["words"]), duration)) ignored += 1 return # Create a language model offset_offset = chunk["words"][0].startOffset chunk_len = chunk["words"][-1].endOffset - offset_offset chunk_transcript = ms.raw_sentence[offset_offset:offset_offset + chunk_len].encode("utf-8") realign_transcript = chunk_transcript.decode("utf-8").replace( "\n", " ") logging.debug("realign transcript: %s", realign_transcript) chunk_ms = metasentence.MetaSentence(chunk_transcript, resources.vocab) chunk_ks = chunk_ms.get_kaldi_sequence() chunk_gen_hclg_filename = language_model.make_bigram_language_model( chunk_ks, resources.proto_langdir) wav_obj = wave.open(wavfile, "rb") wav_obj.setpos(int(start_t * wav_obj.getframerate())) buf = wav_obj.readframes(int(duration * wav_obj.getframerate())) wav_obj.close() retries = 5 while retries > 0: try: k = standard_kaldi.Kaldi( resources.nnet_gpu_path, chunk_gen_hclg_filename, resources.proto_langdir, ) k.push_chunk(buf) ret = [transcription.Word(**wd) for wd in k.get_final()] k.stop() break except BrokenPipeError: retries -= 1 if retries == 0: raise word_alignment = diff_align.align(ret, chunk_ms) for wd in word_alignment: wd.shift(time=start_t, offset=offset_offset) # "chunk" should be replaced by "words" realignments.append({"chunk": chunk, "words": word_alignment}) if progress_cb is not None: progress_cb({ "percent": (ignored + len(realignments)) / float(len(to_realign)) })
def transcribe(self, uid, transcript, audio, async): proto_langdir = get_resource('PROTO_LANGDIR') status = self.get_status(uid) status['status'] = 'STARTED' output = { 'transcript': transcript } outdir = os.path.join(self.data_dir, 'transcriptions', uid) tran_path = os.path.join(outdir, 'transcript.txt') with codecs.open(tran_path, 'w', 'utf-8') as tranfile: tranfile.write(transcript) audio_path = os.path.join(outdir, 'upload') with open(audio_path, 'w') as wavfile: wavfile.write(audio) status['status'] = 'ENCODING' wavfile = os.path.join(outdir, 'a.wav') if to_wav(os.path.join(outdir, 'upload'), wavfile) != 0: status['status'] = 'ERROR' status['error'] = "Encoding failed. Make sure that you've uploaded a valid media file." # Save the status so that errors are recovered on restart of the server # XXX: This won't work, because the endpoint will override this file with open(os.path.join(outdir, 'status.json'), 'w') as jsfile: json.dump(status, jsfile, indent=2) return #XXX: Maybe we should pass this wave object instead of the # file path to align_progress wav_obj = wave.open(wavfile, 'r') status['duration'] = wav_obj.getnframes() / float(wav_obj.getframerate()) status['status'] = 'TRANSCRIBING' def on_progress(p): for k,v in p.items(): status[k] = v if len(transcript.strip()) > 0: ms = metasentence.MetaSentence(transcript, self.vocab) ks = ms.get_kaldi_sequence() gen_hclg_filename = language_model.make_bigram_language_model(ks, proto_langdir) kaldi_queue = Queue() for i in range(self.nthreads): kaldi_queue.put(standard_kaldi.Kaldi( get_resource('data/nnet_a_gpu_online'), gen_hclg_filename, proto_langdir) ) mtt = MultiThreadedTranscriber(kaldi_queue, nthreads=self.nthreads) elif hasattr(self, 'full_transcriber'): mtt = self.full_transcriber else: status['status'] = 'ERROR' status['error'] = 'No transcript provided and no language model for full transcription' return words = mtt.transcribe(wavfile, progress_cb=on_progress) output = {} if len(transcript.strip()) > 0: # 
Clear queue (would this be gc'ed?) for i in range(self.nthreads): k = kaldi_queue.get() k.stop() # Align words output['words'] = diff_align.align(words, ms) output['transcript'] = transcript # Perform a second-pass with unaligned words logging.info("%d unaligned words (of %d)" % (len([X for X in output['words'] if X.get("case") == "not-found-in-audio"]), len(output['words']))) status['status'] = 'ALIGNING' output['words'] = multipass.realign(wavfile, output['words'], ms, nthreads=self.nthreads, progress_cb=on_progress) logging.info("after 2nd pass: %d unaligned words (of %d)" % (len([X for X in output['words'] if X.get("case") == "not-found-in-audio"]), len(output['words']))) else: # Match format output = make_transcription_alignment({"words": words}) # ...remove the original upload os.unlink(os.path.join(outdir, 'upload')) # Save with open(os.path.join(outdir, 'align.json'), 'w') as jsfile: json.dump(output, jsfile, indent=2) with open(os.path.join(outdir, 'align.csv'), 'w') as csvfile: csvfile.write(to_csv(output)) # Inline the alignment into the index.html file. htmltxt = open(get_resource('www/view_alignment.html')).read() htmltxt = htmltxt.replace("var INLINE_JSON;", "var INLINE_JSON=%s;" % (json.dumps(output))); open(os.path.join(outdir, 'index.html'), 'w').write(htmltxt) status['status'] = 'OK' logging.info('done with transcription.') return output