def create_asr():
    """Build and return a fully wired ASR object.

    Reads the decoder configuration from the ``config`` module, sets up the
    Kaldi online lattice recogniser and the word-symbol dictionary, then
    composes the n-best and best-path post-processing helpers around them.
    """
    import config
    from kaldi.utils import lattice_to_nbest, wst2dict
    from kaldi.decoders import PyOnlineLatgenRecogniser
    from asr_utils import lattice_calibration

    decoder = PyOnlineLatgenRecogniser()
    decoder.setup(config.kaldi_config)
    word_table = wst2dict(config.wst_path)
    words_from_path = PathToText(word_table)
    nbest_builder = ToNBest(words_from_path, lattice_to_nbest, lattice_calibration)
    best_path_builder = ToBestPath(words_from_path)
    return ASR(decoder, nbest_builder, best_path_builder)
def decode_wrap(argv, audio_batch_size, wav_paths, file_output, wst_path=None):
    """Decode a batch of wav files and write the n-best hypotheses.

    Sets up a ``PyOnlineLatgenRecogniser`` from ``argv``, then for each
    ``(wav_name, wav_path)`` pair decodes the audio, optionally dumps the
    lattice (SVG + FST) when ``DEBUG`` is set, prints the per-frame
    log-likelihood, and writes the 10-best word-id paths to ``file_output``.

    :param argv: decoder setup arguments forwarded to ``d.setup``
    :param audio_batch_size: unused here; kept for interface compatibility
    :param wav_paths: iterable of ``(wav_name, wav_path)`` pairs
    :param file_output: destination handle passed to ``write_decoded``
    :param wst_path: path to the word-symbol table (may be ``None``)
    """
    wst = wst2dict(wst_path)
    d = PyOnlineLatgenRecogniser()
    d.setup(argv)
    for wav_name, wav_path in wav_paths:
        sw, sr = 2, 16000  # 16-bit audio so 1 sample_width = 2 chars
        pcm = load_wav(wav_path, def_sample_width=sw, def_sample_rate=sr)
        print('%s has %f sec' % (wav_name, (float(len(pcm)) / sw) / sr))
        lat, lik, decoded_frames = decode(d, pcm)
        lat.isyms = lat.osyms = fst.read_symbols_text(wst_path)
        if DEBUG:
            with open('pykaldi_%s.svg' % wav_name, 'w') as f:
                f.write(lat._repr_svg_())
            lat.write('%s_pykaldi.fst' % wav_name)
        # BUG FIX: the value was wrapped in int(), truncating the fractional
        # part of the per-frame log-likelihood even though the message formats
        # it with %f (the sibling decode_wrap in this file divides without
        # truncation).
        print(
            "Log-likelihood per frame for utterance %s is %f over %d frames"
            % (wav_name, lik / decoded_frames, decoded_frames))
        word_ids = lattice_to_nbest(lat, n=10)
        write_decoded(file_output, wav_name, word_ids, wst)
def decode_wrap(argv, audio_batch_size, wav_paths, file_output, wst_path=None):
    """Decode a batch of wav files and write the n-best hypotheses.

    Sets up a ``PyOnlineLatgenRecogniser`` from ``argv``, then for each
    ``(wav_name, wav_path)`` pair decodes the audio, optionally dumps the
    lattice (SVG + FST) when ``DEBUG`` is set, prints the per-frame
    log-likelihood, and writes the 10-best word-id paths to ``file_output``.

    :param argv: decoder setup arguments forwarded to ``d.setup``
    :param audio_batch_size: unused here; kept for interface compatibility
    :param wav_paths: iterable of ``(wav_name, wav_path)`` pairs
    :param file_output: destination handle passed to ``write_decoded``
    :param wst_path: path to the word-symbol table (may be ``None``)
    """
    wst = wst2dict(wst_path)
    d = PyOnlineLatgenRecogniser()
    d.setup(argv)
    for wav_name, wav_path in wav_paths:
        sw, sr = 2, 16000  # 16-bit audio so 1 sample_width = 2 chars
        pcm = load_wav(wav_path, def_sample_width=sw, def_sample_rate=sr)
        # FIX: replaced Python-2-only print statements with parenthesised
        # print(...) calls — a single parenthesised expression prints the same
        # text under Python 2 and is required syntax under Python 3, matching
        # the print() style used elsewhere in this file.
        print('%s has %f sec' % (wav_name, (float(len(pcm)) / sw) / sr))
        lat, lik, decoded_frames = decode(d, pcm)
        lat.isyms = lat.osyms = fst.read_symbols_text(wst_path)
        if DEBUG:
            with open('pykaldi_%s.svg' % wav_name, 'w') as f:
                f.write(lat._repr_svg_())
            lat.write('%s_pykaldi.fst' % wav_name)
        print("Log-likelihood per frame for utterance %s is %f over %d frames"
              % (wav_name, (lik / decoded_frames), decoded_frames))
        word_ids = lattice_to_nbest(lat, n=10)
        write_decoded(file_output, wav_name, word_ids, wst)
# NOTE(review): this chunk starts mid-way through a method of what appears to
# be a LiveDemo class — the `if` that pairs with the `else:` below is outside
# the visible source, so the leading statements carry reconstructed
# indentation; confirm against the full file.
            # Map best-path word ids back to words via the symbol table.
            decoded = ' '.join([wst[w] for w in best_path])
        else:
            decoded = 'Empty hypothesis'
        print(
            "%s secs, frames: %d, prob: %f, %s " % (
                str(time.time() - start), self.utt_frames, prob, decoded))
        # Reset per-utterance state before decoding the next utterance.
        self.utt_frames = 0
        self.d.reset(keep_buffer_data=False)
        if self.dialog_end:
            # Persist the captured audio before leaving the loop.
            self.save_wav()
            break

    def save_wav(self):
        """Write the buffered audio frames to 'live-demo-record.wav'."""
        wf = wave.open('live-demo-record.wav', 'wb')
        wf.setnchannels(CHANNELS)
        wf.setframerate(RATE)
        # Sample width is derived from the PyAudio format in use.
        wf.setsampwidth(self.pin.get_sample_size(FORMAT))
        wf.writeframes(b''.join(self.frames))
        wf.close()


if __name__ == '__main__':
    # CLI: <audio_batch_size> <wst_path> <decoder args...>
    audio_batch_size, wst_path = int(sys.argv[1]), sys.argv[2]
    argv = sys.argv[3:]
    print('Python args: %s' % str(sys.argv), file=sys.stderr)
    wst = wst2dict(wst_path)
    demo = LiveDemo(audio_batch_size, wst, argv)
    demo.setup()
    demo.run()
def create_dictionary(basedir):
    """Load the word-symbol table under *basedir* into the module-level ``wst``."""
    global wst
    words_path = '%s/models/words.txt' % basedir
    wst = wst2dict(words_path)
# NOTE(review): this chunk begins inside a function whose definition is outside
# the visible source (`return result` below belongs to it); the indentation of
# the first two statements is reconstructed — confirm against the full file.
    d.reset(keep_buffer_data=False)
    return result


def get_audio_callback():
    """Return a PyAudio stream callback that feeds incoming audio to the decoder.

    The returned ``frame_in`` handler forwards each incoming buffer to the
    module-level recogniser ``d`` and tells PyAudio to keep streaming.
    """
    def frame_in(in_data, frame_count, time_info, status):
        d.frame_in(in_data)
        return in_data, pyaudio.paContinue
    return frame_in


@app.route('/')
def index():
    # Serve the demo front-end page.
    return render_template('index.html')


# Script entry: <audio_batch_size> <wst_path> <decoder args...>
audio_batch_size, wst_path = int(sys.argv[1]), sys.argv[2]
argv = sys.argv[3:]
# NOTE(review): Python-2-only print statement — inconsistent with the print()
# calls used elsewhere in this file.
print >> sys.stderr, 'Python args: %s' % str(sys.argv)
wst = wst2dict(wst_path)
d.setup(argv)
pin = pyaudio.PyAudio()
# Open a live input stream; decoding is driven from the stream callback.
stream = pin.open(format=FORMAT, channels=CHANNELS, rate=RATE, input=True,
                  frames_per_buffer=audio_batch_size,
                  stream_callback=get_audio_callback())
app.run(host='0.0.0.0', debug=True)