def __init__(self, data_dir, nthreads=4, ntranscriptionthreads=2):
    """Store configuration, load the vocabulary and optionally pre-load Kaldi.

    data_dir -- kept on the instance as ``self.data_dir``
    nthreads -- kept on the instance as ``self.nthreads``
    ntranscriptionthreads -- how many Kaldi instances to pre-load for the
        shared ``self.full_transcriber``; nothing is pre-loaded when this
        is not positive or when the HCLG graph file does not exist.
    """
    self.data_dir = data_dir
    self.nthreads = nthreads
    self.ntranscriptionthreads = ntranscriptionthreads

    lang_dir = get_resource('PROTO_LANGDIR')
    words_path = os.path.join(lang_dir, "graphdir/words.txt")
    with open(words_path) as words_file:
        self.vocab = metasentence.load_vocabulary(words_file)

    # Kaldi instances for full transcription are only created when the
    # general-purpose graph is present on disk.
    hclg_path = get_resource('data/graph/HCLG.fst')
    preload = os.path.exists(hclg_path) and self.ntranscriptionthreads > 0
    if preload:
        lang_dir = get_resource('PROTO_LANGDIR')
        nnet_dir = get_resource('data/nnet_a_gpu_online')

        instances = Queue()
        for _ in range(self.ntranscriptionthreads):
            instances.put(standard_kaldi.Kaldi(nnet_dir, hclg_path, lang_dir))
        self.full_transcriber = MultiThreadedTranscriber(
            instances, nthreads=self.ntranscriptionthreads)

    self._status_dicts = {}
def serve(port=8765, interface='0.0.0.0', installSignalHandlers=0, data_dir=get_datadir('webdata')):
    """Assemble the web resource tree and run the reactor.

    Creates ``data_dir`` and its ``zip`` subdirectory when they are missing,
    mounts the static pages, the transcription controller and the zipper
    under a root ``File`` resource, then listens on ``port``/``interface``
    and blocks in ``default_reactor.run``.
    """
    logging.info("SERVE %d, %s, %d", port, interface, installSignalHandlers)

    zip_dir = os.path.join(data_dir, 'zip')
    for required_dir in (data_dir, zip_dir):
        if not os.path.exists(required_dir):
            os.makedirs(required_dir)

    root = File(data_dir)
    static_children = (
        ('', 'www/index.html'),
        ('status.html', 'www/status.html'),
        ('preloader.gif', 'www/preloader.gif'),
    )
    for child_name, resource_name in static_children:
        root.putChild(child_name, File(get_resource(resource_name)))

    transcriber = Transcriber(data_dir)
    root.putChild('transcriptions', TranscriptionsController(transcriber))
    root.putChild('zip', TranscriptionZipper(zip_dir, transcriber))

    site = Site(root)
    logging.info("about to listen")
    default_reactor.listenTCP(port, site, interface=interface)
    logging.info("listening")

    default_reactor.run(installSignalHandlers=installSignalHandlers)
def get_kaldi(self):
    """Build and return a new Kaldi instance for ``self.gen_hclg_filename``.

    A new instance is constructed on every call; nothing is cached here.
    """
    # In theory these instances could be preserved through a session.
    nnet_dir = get_resource('data/nnet_a_gpu_online')
    hclg_path = self.gen_hclg_filename
    lang_dir = get_resource('PROTO_LANGDIR')
    return standard_kaldi.Kaldi(nnet_dir, hclg_path, lang_dir)
def realign(chunk):
    """Re-recognize the audio behind one run of unaligned words.

    ``chunk`` is a dict with keys ``"start"`` and ``"end"`` (the aligned
    words on either side, or a falsy value / ``None`` at the edges of the
    recording) and ``"words"`` (the unaligned words in between).

    The function reads ``wavfile``, ``ms``, ``realignments``, ``to_realign``
    and ``progress_cb`` from the enclosing scope, and ``vocab`` and
    ``proto_langdir`` from outside this block.  On success it appends
    ``{"chunk": chunk, "words": <new alignment>}`` to ``realignments`` and
    reports progress.  Chunks shorter than 0.01s or longer than 60s are
    skipped, and nothing is appended for them.
    """
    # Read the audio up front with a single handle, and always close it.
    wav_obj = wave.open(wavfile, 'r')
    try:
        framerate = wav_obj.getframerate()

        start_t = (chunk["start"] or {"end": 0})["end"]
        end_t = chunk["end"]
        if end_t is None:
            # No aligned word after the chunk: run to the end of the file.
            end_t = wav_obj.getnframes() / float(framerate)
        else:
            end_t = end_t["start"]

        duration = end_t - start_t
        if duration < 0.01 or duration > 60:
            logging.debug("cannot realign %d words with duration %f",
                          len(chunk['words']), duration)
            return

        wav_obj.setpos(int(start_t * framerate))
        buf = wav_obj.readframes(int(duration * framerate))
    finally:
        wav_obj.close()

    # Create a language model restricted to this chunk's text.
    offset_offset = chunk['words'][0]['startOffset']
    chunk_len = chunk['words'][-1]['endOffset'] - offset_offset
    chunk_transcript = ms.raw_sentence[
        offset_offset:offset_offset + chunk_len].encode("utf-8")
    chunk_ms = metasentence.MetaSentence(chunk_transcript, vocab)
    chunk_ks = chunk_ms.get_kaldi_sequence()

    chunk_gen_hclg_filename = language_model.make_bigram_language_model(
        chunk_ks, proto_langdir)
    k = standard_kaldi.Kaldi(
        get_resource('data/nnet_a_gpu_online'),
        chunk_gen_hclg_filename,
        proto_langdir)
    try:
        k.push_chunk(buf)
        ret = k.get_final()
    finally:
        # Stop the recognizer even when decoding fails.
        k.stop()

    word_alignment = diff_align.align(ret, chunk_ms)

    # Adjust startOffset, endOffset, and timing to match originals
    for wd in word_alignment:
        if wd.get("end"):
            # Apply timing offset
            wd['start'] += start_t
            wd['end'] += start_t
        if wd.get("endOffset"):
            wd['startOffset'] += offset_offset
            wd['endOffset'] += offset_offset

    # "chunk" should be replaced by "words"
    realignments.append({"chunk": chunk, "words": word_alignment})

    if progress_cb is not None:
        progress_cb({"percent": len(realignments) / float(len(to_realign))})
def realign(chunk):
    """Re-recognize the audio behind one run of unaligned words.

    ``chunk`` is a dict with keys ``"start"`` and ``"end"`` (the aligned
    words on either side, or a falsy value / ``None`` at the edges of the
    recording) and ``"words"`` (the unaligned words in between).

    The function reads ``wavfile``, ``ms``, ``realignments``, ``to_realign``
    and ``progress_cb`` from the enclosing scope, and ``vocab`` and
    ``proto_langdir`` from outside this block.  On success it appends
    ``{"chunk": chunk, "words": <new alignment>}`` to ``realignments`` and
    reports progress.  Chunks shorter than 0.01s or longer than 60s are
    skipped, and nothing is appended for them.
    """
    # Read the audio up front with a single handle, and always close it.
    wav_obj = wave.open(wavfile, 'r')
    try:
        framerate = wav_obj.getframerate()

        start_t = (chunk["start"] or {"end": 0})["end"]
        end_t = chunk["end"]
        if end_t is None:
            # No aligned word after the chunk: run to the end of the file.
            end_t = wav_obj.getnframes() / float(framerate)
        else:
            end_t = end_t["start"]

        duration = end_t - start_t
        if duration < 0.01 or duration > 60:
            logging.debug("cannot realign %d words with duration %f",
                          len(chunk['words']), duration)
            return

        wav_obj.setpos(int(start_t * framerate))
        buf = wav_obj.readframes(int(duration * framerate))
    finally:
        wav_obj.close()

    # Create a language model restricted to this chunk's text.
    offset_offset = chunk['words'][0]['startOffset']
    chunk_len = chunk['words'][-1]['endOffset'] - offset_offset
    chunk_transcript = ms.raw_sentence[
        offset_offset:offset_offset + chunk_len].encode("utf-8")
    chunk_ms = metasentence.MetaSentence(chunk_transcript, vocab)
    chunk_ks = chunk_ms.get_kaldi_sequence()

    chunk_gen_hclg_filename = language_model.make_bigram_language_model(
        chunk_ks, proto_langdir)
    k = standard_kaldi.Kaldi(
        get_resource('data/nnet_a_gpu_online'),
        chunk_gen_hclg_filename,
        proto_langdir)
    try:
        k.push_chunk(buf)
        ret = k.get_final()
    finally:
        # Stop the recognizer even when decoding fails.
        k.stop()

    word_alignment = diff_align.align(ret, chunk_ms)

    # Adjust startOffset, endOffset, and timing to match originals
    for wd in word_alignment:
        if wd.get("end"):
            # Apply timing offset
            wd['start'] += start_t
            wd['end'] += start_t
        if wd.get("endOffset"):
            wd['startOffset'] += offset_offset
            wd['endOffset'] += offset_offset

    # "chunk" should be replaced by "words"
    realignments.append({"chunk": chunk, "words": word_alignment})

    if progress_cb is not None:
        progress_cb({"percent": len(realignments) / float(len(to_realign))})
def re_run(self, utt):
    """Re-recognize a stored utterance and publish the updated document.

    ``utt`` is a dict; it is ignored (returns ``None``) when it has no
    ``'wavpath'`` key.  Otherwise the audio is decoded as mono at R=8000,
    pushed through a new Kaldi instance, and ``utt`` is updated in place
    with ``'command_words'`` (the recognized words, with any ``'phones'``
    entry removed) and ``'command'`` (those words joined by spaces).  The
    change notification is scheduled on the reactor thread.
    """
    if 'wavpath' not in utt:
        return

    # Load the audio before creating the recognizer so that a decoding
    # failure does not leave a Kaldi instance running.
    wav_path = os.path.join(self.resources['attach'].attachdir, utt['wavpath'])
    audio = numm3.sound2np(wav_path, nchannels=1, R=8000)

    k = Kaldi(
        get_resource('data/nnet_a_gpu_online'),
        self.gen_hclg_filename,
        get_resource('PROTO_LANGDIR'))
    try:
        # NOTE(review): if `audio` is a NumPy array, `tostring` is a
        # deprecated alias of `tobytes` -- confirm before switching.
        k.push_chunk(audio.tostring())
        wds = k.get_final()
    finally:
        # Stop the recognizer even when decoding fails.
        k.stop()

    for wd in wds:
        # Tolerate words that carry no phone list.
        wd.pop('phones', None)

    utt['command_words'] = wds
    utt['command'] = ' '.join([X['word'] for X in wds])

    reactor.callFromThread(self.db.onchange, None, {
        "type": "change",
        "id": utt["_id"],
        "doc": utt,
    })
def re_run(self, utt):
    """Re-recognize a stored utterance and publish the updated document.

    ``utt`` is a dict; it is ignored (returns ``None``) when it has no
    ``'wavpath'`` key.  Otherwise the audio is decoded as mono at R=8000,
    pushed through a new Kaldi instance, and ``utt`` is updated in place
    with ``'command_words'`` (the recognized words, with any ``'phones'``
    entry removed) and ``'command'`` (those words joined by spaces).  The
    change notification is scheduled on the reactor thread.
    """
    if 'wavpath' not in utt:
        return

    # Load the audio before creating the recognizer so that a decoding
    # failure does not leave a Kaldi instance running.
    wav_path = os.path.join(self.resources['attach'].attachdir, utt['wavpath'])
    audio = numm3.sound2np(wav_path, nchannels=1, R=8000)

    k = Kaldi(
        get_resource('data/nnet_a_gpu_online'),
        self.gen_hclg_filename,
        get_resource('PROTO_LANGDIR'))
    try:
        # NOTE(review): if `audio` is a NumPy array, `tostring` is a
        # deprecated alias of `tobytes` -- confirm before switching.
        k.push_chunk(audio.tostring())
        wds = k.get_final()
    finally:
        # Stop the recognizer even when decoding fails.
        k.stop()

    for wd in wds:
        # Tolerate words that carry no phone list.
        wd.pop('phones', None)

    utt['command_words'] = wds
    utt['command'] = ' '.join([X['word'] for X in wds])

    reactor.callFromThread(self.db.onchange, None, {
        "type": "change",
        "id": utt["_id"],
        "doc": utt,
    })
def render_POST(self, req):
    """Accept an upload and start a transcription job for it.

    Reads ``transcript`` (optional, defaults to ``''``) and ``audio``
    (required) from ``req.args``.  The presence of ``disfluency`` or
    ``conservative`` in the arguments switches those options on.  The job
    runs on the reactor's thread pool.

    Unless the request carries ``async=false`` the client is redirected
    (FOUND) to ``/transcriptions/<uid>`` immediately.  With ``async=false``
    the response is held open and the result is written as JSON when the
    job completes.
    """
    uid = self.transcriber.next_id()

    tran = req.args.get('transcript', [''])[0]
    audio = req.args['audio'][0]

    kwargs = {
        'disfluency': 'disfluency' in req.args,
        'conservative': 'conservative' in req.args,
        'disfluencies': set(['uh', 'um']),
    }

    # `async` is a reserved word from Python 3.7 on, so the flag gets a
    # different local name; it is still passed positionally below.
    run_async = True
    if 'async' in req.args and req.args['async'][0] == 'false':
        run_async = False

    # We need to make the transcription directory here, so that
    # when we redirect the user we are sure that there's a place
    # for them to go.
    outdir = os.path.join(self.transcriber.data_dir, 'transcriptions', uid)
    os.makedirs(outdir)

    # Copy over the HTML
    shutil.copy(get_resource('www/view_alignment.html'),
                os.path.join(outdir, 'index.html'))

    result_promise = threads.deferToThreadPool(
        reactor, reactor.getThreadPool(),
        self.transcriber.transcribe,
        uid, tran, audio, run_async, **kwargs)

    if not run_async:
        def write_result(result):
            '''Write JSON to client on completion'''
            req.setHeader("Content-Type", "application/json")
            req.write(json.dumps(result, indent=2))
            req.finish()
        result_promise.addCallback(write_result)
        result_promise.addErrback(lambda _: None)  # ignore errors

        # Cancel the job if the client goes away before it finishes.
        req.notifyFinish().addErrback(lambda _: result_promise.cancel())

        return NOT_DONE_YET

    req.setResponseCode(FOUND)
    req.setHeader(b"Location", "/transcriptions/%s" % (uid))
    return ''
def render_POST(self, req):
    """Accept an upload and start a transcription job for it.

    Reads the required ``transcript`` and ``audio`` values from
    ``req.args`` and runs the job on ``self.reactor``'s thread pool.

    Unless the request carries ``async=false`` the client is redirected
    (FOUND) to ``/transcriptions/<uid>`` immediately.  With ``async=false``
    the response is held open and the result is written as JSON when the
    job completes.
    """
    uid = self.transcriber.next_id()

    tran = req.args['transcript'][0]
    audio = req.args['audio'][0]

    # `async` is a reserved word from Python 3.7 on, so the flag gets a
    # different local name; it is still passed positionally below.
    run_async = True
    if 'async' in req.args and req.args['async'][0] == 'false':
        run_async = False

    # We need to make the transcription directory here, so that
    # when we redirect the user we are sure that there's a place
    # for them to go.
    outdir = os.path.join(self.transcriber.data_dir, 'transcriptions', uid)
    os.makedirs(outdir)

    # Copy over the HTML
    shutil.copy(get_resource('www/view_alignment.html'),
                os.path.join(outdir, 'index.html'))

    result_promise = threads.deferToThreadPool(
        self.reactor, self.reactor.getThreadPool(),
        self.transcriber.transcribe,
        uid, tran, audio, run_async)

    if not run_async:
        def write_result(result):
            '''Write JSON to client on completion'''
            # Use the same header API as the redirect branch below rather
            # than assigning into the request's header mapping.
            req.setHeader("Content-Type", "application/json")
            req.write(json.dumps(result, indent=2))
            req.finish()
        result_promise.addCallback(write_result)
        result_promise.addErrback(lambda _: None)  # ignore errors

        # Cancel the job if the client goes away before it finishes.
        req.notifyFinish().addErrback(lambda _: result_promise.cancel())

        return NOT_DONE_YET

    req.setResponseCode(FOUND)
    req.setHeader(b"Location", "/transcriptions/%s" % (uid))
    return ''
import multiprocessing
from multiprocessing.pool import ThreadPool as Pool
import numpy as np
import os
import shutil
import tempfile
import time
import zipfile

from gentle.paths import get_resource
from gentle.standard_kaldi import Kaldi
import gentle.metasentence as metasentence
import gentle.language_model as language_model

# kaldi quirk...
# The vocabulary is loaded once, at import time, from the language
# directory's word list.
proto_langdir = get_resource('PROTO_LANGDIR')
vocab_path = os.path.join(proto_langdir, "graphdir/words.txt")
with open(vocab_path) as f:
    vocab = metasentence.load_vocabulary(f)


class AudioConferenceFactory(WebSocketServerFactory):
    """WebSocket server factory holding shared resources and the database."""

    def __init__(self, resources, dbdir="db", db=None):
        """Initialize the factory.

        resources -- stored as ``self.resources``
        dbdir -- accepted but not used in the part of the class shown here
        db -- stored as ``self.db``; when truthy, its ``gen_hclg_filename``
            is copied onto the factory, otherwise that attribute is None
        """
        WebSocketServerFactory.__init__(self, None)

        self.resources = resources
        self.db = db
        self.clients = {}  # peerstr -> client

        if db:
            self.gen_hclg_filename = db.gen_hclg_filename
        else:
            self.gen_hclg_filename = None
def transcribe(self, uid, transcript, audio):
    """Run a transcription job and write its artifacts under the job directory.

    uid -- job identifier; output goes to ``<data_dir>/transcriptions/<uid>``
    transcript -- transcript text, written to ``transcript.txt``
    audio -- raw uploaded media, written to ``upload`` and converted to
        ``a.wav``

    ``align.json`` and ``align.csv`` are rewritten as the status moves
    through STARTED, ENCODING, TRANSCRIBING and OK (or ERROR).  Returns the
    last item yielded by ``lm_transcribe_progress`` (``None`` if it yields
    nothing), or ``None`` when encoding fails.
    """
    output = {
        'status': 'STARTED',
        'transcript': transcript,
    }

    outdir = os.path.join(self.data_dir, 'transcriptions', uid)
    upload_path = os.path.join(outdir, 'upload')
    index_path = os.path.join(outdir, 'index.html')
    template_path = get_resource('www/view_alignment.html')

    def save():
        # Persist the current state in both output formats.
        with open(os.path.join(outdir, 'align.json'), 'w') as jsfile:
            json.dump(output, jsfile, indent=2)
        with open(os.path.join(outdir, 'align.csv'), 'w') as csvfile:
            csvfile.write(to_csv(output))

    os.makedirs(outdir)

    # Copy over the HTML
    shutil.copy(template_path, index_path)

    with open(os.path.join(outdir, 'transcript.txt'), 'w') as tranfile:
        tranfile.write(transcript)

    # The upload is arbitrary media: write it in binary mode so no newline
    # translation or text encoding is applied to it.
    with open(upload_path, 'wb') as upload_file:
        upload_file.write(audio)

    output['status'] = 'ENCODING'
    # Only the JSON file is written at this stage.
    with open(os.path.join(outdir, 'align.json'), 'w') as alignfile:
        json.dump(output, alignfile, indent=2)

    wavfile = os.path.join(outdir, 'a.wav')
    if to_wav(upload_path, wavfile) != 0:
        output['status'] = 'ERROR'
        output['error'] = "Encoding failed. Make sure that you've uploaded a valid media file."
        save()
        return

    output['status'] = 'TRANSCRIBING'
    save()

    # Run transcription
    progress = lm_transcribe_progress(
        wavfile,
        transcript,
        # XXX: should be configurable
        get_resource('PROTO_LANGDIR'),
        get_resource('data/nnet_a_gpu_online'))
    result = None
    for result in progress:
        output['words'] = result['words']
        output['transcript'] = result['transcript']
        save()

    # ...and remove the original upload
    os.unlink(upload_path)

    output['status'] = 'OK'
    save()

    # Inline the alignment into the index.html file.
    with open(template_path) as template:
        htmltxt = template.read()
    htmltxt = htmltxt.replace("var INLINE_JSON;", "var INLINE_JSON=%s;" % (json.dumps(output)))
    with open(index_path, 'w') as index_file:
        index_file.write(htmltxt)

    logging.info('done with transcription.')

    return result
def transcribe(self, uid, transcript, audio, async): status = self.get_status(uid) status['status'] = 'STARTED' output = { 'transcript': transcript } def save(): with open(os.path.join(outdir, 'align.json'), 'w') as jsfile: json.dump(output, jsfile, indent=2) with open(os.path.join(outdir, 'align.csv'), 'w') as csvfile: csvfile.write(to_csv(output)) outdir = os.path.join(self.data_dir, 'transcriptions', uid) tran_path = os.path.join(outdir, 'transcript.txt') with open(tran_path, 'w') as tranfile: tranfile.write(transcript) audio_path = os.path.join(outdir, 'upload') with open(audio_path, 'w') as wavfile: wavfile.write(audio) status['status'] = 'ENCODING' # with open(os.path.join(outdir, 'align.json'), 'w') as alignfile: # json.dump(output, alignfile, indent=2) wavfile = os.path.join(outdir, 'a.wav') if to_wav(os.path.join(outdir, 'upload'), wavfile) != 0: status['status'] = 'ERROR' status['error'] = "Encoding failed. Make sure that you've uploaded a valid media file." # Save the status so that errors are recovered on restart of the server # XXX: This won't work, because the endpoint will override this file with open(os.path.join(outdir, 'status.json'), 'w') as jsfile: json.dump(status, jsfile, indent=2) return # Find the duration #XXX: Maybe we should pass this wave object instead of the # file path to align_progress wav_obj = wave.open(wavfile, 'r') status['duration'] = wav_obj.getnframes() / float(wav_obj.getframerate()) status['status'] = 'TRANSCRIBING' # Run transcription progress = align_progress( wavfile, transcript, # XXX: should be configurable get_resource('PROTO_LANGDIR'), get_resource('data/nnet_a_gpu_online'), want_progress=True) result = None for result in progress: if result.get("preview") is not None: status["message"] = result["preview"] status["t"] = result["t"] else: output['words'] = result['words'] output['transcript'] = result['transcript'] #save() # ...and remove the original upload os.unlink(os.path.join(outdir, 'upload')) save() # Inline the 
alignment into the index.html file. htmltxt = open(get_resource('www/view_alignment.html')).read() htmltxt = htmltxt.replace("var INLINE_JSON;", "var INLINE_JSON=%s;" % (json.dumps(output))); open(os.path.join(outdir, 'index.html'), 'w').write(htmltxt) status['status'] = 'OK' logging.info('done with transcription.') return result
def transcribe(self, uid, transcript, audio, async): proto_langdir = get_resource('PROTO_LANGDIR') status = self.get_status(uid) status['status'] = 'STARTED' output = { 'transcript': transcript } outdir = os.path.join(self.data_dir, 'transcriptions', uid) tran_path = os.path.join(outdir, 'transcript.txt') with codecs.open(tran_path, 'w', 'utf-8') as tranfile: tranfile.write(transcript) audio_path = os.path.join(outdir, 'upload') with open(audio_path, 'w') as wavfile: wavfile.write(audio) status['status'] = 'ENCODING' wavfile = os.path.join(outdir, 'a.wav') if to_wav(os.path.join(outdir, 'upload'), wavfile) != 0: status['status'] = 'ERROR' status['error'] = "Encoding failed. Make sure that you've uploaded a valid media file." # Save the status so that errors are recovered on restart of the server # XXX: This won't work, because the endpoint will override this file with open(os.path.join(outdir, 'status.json'), 'w') as jsfile: json.dump(status, jsfile, indent=2) return #XXX: Maybe we should pass this wave object instead of the # file path to align_progress wav_obj = wave.open(wavfile, 'r') status['duration'] = wav_obj.getnframes() / float(wav_obj.getframerate()) status['status'] = 'TRANSCRIBING' def on_progress(p): for k,v in p.items(): status[k] = v if len(transcript.strip()) > 0: ms = metasentence.MetaSentence(transcript, self.vocab) ks = ms.get_kaldi_sequence() gen_hclg_filename = language_model.make_bigram_language_model(ks, proto_langdir) kaldi_queue = Queue() for i in range(self.nthreads): kaldi_queue.put(standard_kaldi.Kaldi( get_resource('data/nnet_a_gpu_online'), gen_hclg_filename, proto_langdir) ) mtt = MultiThreadedTranscriber(kaldi_queue, nthreads=self.nthreads) elif hasattr(self, 'full_transcriber'): mtt = self.full_transcriber else: status['status'] = 'ERROR' status['error'] = 'No transcript provided and no language model for full transcription' return words = mtt.transcribe(wavfile, progress_cb=on_progress) output = {} if len(transcript.strip()) > 0: # 
Clear queue (would this be gc'ed?) for i in range(self.nthreads): k = kaldi_queue.get() k.stop() # Align words output['words'] = diff_align.align(words, ms) output['transcript'] = transcript # Perform a second-pass with unaligned words logging.info("%d unaligned words (of %d)" % (len([X for X in output['words'] if X.get("case") == "not-found-in-audio"]), len(output['words']))) status['status'] = 'ALIGNING' output['words'] = multipass.realign(wavfile, output['words'], ms, nthreads=self.nthreads, progress_cb=on_progress) logging.info("after 2nd pass: %d unaligned words (of %d)" % (len([X for X in output['words'] if X.get("case") == "not-found-in-audio"]), len(output['words']))) else: # Match format output = make_transcription_alignment({"words": words}) # ...remove the original upload os.unlink(os.path.join(outdir, 'upload')) # Save with open(os.path.join(outdir, 'align.json'), 'w') as jsfile: json.dump(output, jsfile, indent=2) with open(os.path.join(outdir, 'align.csv'), 'w') as csvfile: csvfile.write(to_csv(output)) # Inline the alignment into the index.html file. htmltxt = open(get_resource('www/view_alignment.html')).read() htmltxt = htmltxt.replace("var INLINE_JSON;", "var INLINE_JSON=%s;" % (json.dumps(output))); open(os.path.join(outdir, 'index.html'), 'w').write(htmltxt) status['status'] = 'OK' logging.info('done with transcription.') return output
def transcribe(self, uid, transcript, audio):
    """Run a transcription job and write its artifacts under the job directory.

    uid -- job identifier; output goes to ``<data_dir>/transcriptions/<uid>``
    transcript -- transcript text, written to ``transcript.txt``
    audio -- raw uploaded media, written to ``upload`` and converted to
        ``a.wav``

    ``align.json`` and ``align.csv`` are rewritten as the status moves
    through STARTED, ENCODING, TRANSCRIBING and OK (or ERROR).  Returns the
    last item yielded by ``lm_transcribe_progress`` (``None`` if it yields
    nothing), or ``None`` when encoding fails.
    """
    output = {
        'status': 'STARTED',
        'transcript': transcript,
    }

    outdir = os.path.join(self.data_dir, 'transcriptions', uid)
    upload_path = os.path.join(outdir, 'upload')
    index_path = os.path.join(outdir, 'index.html')
    template_path = get_resource('www/view_alignment.html')

    def save():
        # Persist the current state in both output formats.
        with open(os.path.join(outdir, 'align.json'), 'w') as jsfile:
            json.dump(output, jsfile, indent=2)
        with open(os.path.join(outdir, 'align.csv'), 'w') as csvfile:
            csvfile.write(to_csv(output))

    os.makedirs(outdir)

    # Copy over the HTML
    shutil.copy(template_path, index_path)

    with open(os.path.join(outdir, 'transcript.txt'), 'w') as tranfile:
        tranfile.write(transcript)

    # The upload is arbitrary media: write it in binary mode so no newline
    # translation or text encoding is applied to it.
    with open(upload_path, 'wb') as upload_file:
        upload_file.write(audio)

    output['status'] = 'ENCODING'
    # Only the JSON file is written at this stage.
    with open(os.path.join(outdir, 'align.json'), 'w') as alignfile:
        json.dump(output, alignfile, indent=2)

    wavfile = os.path.join(outdir, 'a.wav')
    if to_wav(upload_path, wavfile) != 0:
        output['status'] = 'ERROR'
        output['error'] = "Encoding failed. Make sure that you've uploaded a valid media file."
        save()
        return

    output['status'] = 'TRANSCRIBING'
    save()

    # Run transcription
    progress = lm_transcribe_progress(
        wavfile,
        transcript,
        # XXX: should be configurable
        get_resource('PROTO_LANGDIR'),
        get_resource('data/nnet_a_gpu_online'))
    result = None
    for result in progress:
        output['words'] = result['words']
        output['transcript'] = result['transcript']
        save()

    # ...and remove the original upload
    os.unlink(upload_path)

    output['status'] = 'OK'
    save()

    # Inline the alignment into the index.html file.
    with open(template_path) as template:
        htmltxt = template.read()
    htmltxt = htmltxt.replace("var INLINE_JSON;", "var INLINE_JSON=%s;" % (json.dumps(output)))
    with open(index_path, 'w') as index_file:
        index_file.write(htmltxt)

    logging.info('done with transcription.')

    return result
import logging from multiprocessing.pool import ThreadPool as Pool import os import wave from gentle import standard_kaldi from gentle import metasentence from gentle import language_model from gentle.paths import get_resource from gentle import diff_align # XXX: refactor out somewhere proto_langdir = get_resource('PROTO_LANGDIR') vocab_path = os.path.join(proto_langdir, "graphdir/words.txt") with open(vocab_path) as f: vocab = metasentence.load_vocabulary(f) def prepare_multipass(alignment): to_realign = [] last_aligned_word = None cur_unaligned_words = [] for wd_idx,wd in enumerate(alignment): if wd['case'] == 'not-found-in-audio': cur_unaligned_words.append(wd) elif wd['case'] == 'success': if len(cur_unaligned_words) > 0: to_realign.append({ "start": last_aligned_word, "end": wd, "words": cur_unaligned_words})
def transcribe(self, uid, transcript, audio, async): status = self.get_status(uid) status['status'] = 'STARTED' output = {'transcript': transcript} def save(): with open(os.path.join(outdir, 'align.json'), 'w') as jsfile: json.dump(output, jsfile, indent=2) with open(os.path.join(outdir, 'align.csv'), 'w') as csvfile: csvfile.write(to_csv(output)) outdir = os.path.join(self.data_dir, 'transcriptions', uid) tran_path = os.path.join(outdir, 'transcript.txt') with open(tran_path, 'w') as tranfile: tranfile.write(transcript) audio_path = os.path.join(outdir, 'upload') with open(audio_path, 'w') as wavfile: wavfile.write(audio) status['status'] = 'ENCODING' # with open(os.path.join(outdir, 'align.json'), 'w') as alignfile: # json.dump(output, alignfile, indent=2) wavfile = os.path.join(outdir, 'a.wav') if to_wav(os.path.join(outdir, 'upload'), wavfile) != 0: status['status'] = 'ERROR' status[ 'error'] = "Encoding failed. Make sure that you've uploaded a valid media file." # Save the status so that errors are recovered on restart of the server # XXX: This won't work, because the endpoint will override this file with open(os.path.join(outdir, 'status.json'), 'w') as jsfile: json.dump(status, jsfile, indent=2) return # Find the duration #XXX: Maybe we should pass this wave object instead of the # file path to align_progress wav_obj = wave.open(wavfile, 'r') status['duration'] = wav_obj.getnframes() / float( wav_obj.getframerate()) status['status'] = 'TRANSCRIBING' # Run transcription progress = align_progress( wavfile, transcript, # XXX: should be configurable get_resource('PROTO_LANGDIR'), get_resource('data/nnet_a_gpu_online'), want_progress=True) result = None for result in progress: if result.get("error") is not None: status["status"] = "ERROR" status["error"] = result["error"] # Save the status so that errors are recovered on restart of the server # XXX: This won't work, because the endpoint will override this file # XXX(2): duplicated code. 
with open(os.path.join(outdir, 'status.json'), 'w') as jsfile: json.dump(status, jsfile, indent=2) return elif result.get("preview") is not None: status["message"] = result["preview"] status["t"] = result["t"] else: output['words'] = result['words'] output['transcript'] = result['transcript'] #save() # ...and remove the original upload os.unlink(os.path.join(outdir, 'upload')) save() # Inline the alignment into the index.html file. htmltxt = open(get_resource('www/view_alignment.html')).read() htmltxt = htmltxt.replace("var INLINE_JSON;", "var INLINE_JSON=%s;" % (json.dumps(output))) open(os.path.join(outdir, 'index.html'), 'w').write(htmltxt) status['status'] = 'OK' logging.info('done with transcription.') return result