def rec_wav_file(output_dir, wav_path):
    """ Recognise speech in a wav file and profile the speech recognition.

    The decoding and the ASR-output extraction times are measured separately
    with wall-clock timestamps.

    NOTE(review): this function reads the module-level globals ``cfg`` and
    ``asr`` (not parameters) — confirm they are initialised before calling.

    Args:
        output_dir (str): Directory where the recognition lattice is saved
            (when the recogniser supports lattices)
        wav_path (str): Path to the wave file which is recognised

    Returns:
        Tuple of decoded ASR hypothesis, time of decoding,
        time of hypothesis extraction
    """
    pcm = load_wav(cfg, wav_path)
    frame = Frame(pcm)

    # Time the two phases separately: feeding audio in vs. extracting the
    # hypothesis out.
    start = time.time()
    asr.rec_in(frame)
    rec_in_end = time.time()
    res = asr.hyp_out()
    hyp_out_end = time.time()

    # Best effort: not every ASR backend exposes get_last_lattice(), so a
    # missing attribute is deliberately ignored.
    try:
        save_lattice(asr.get_last_lattice(), output_dir, wav_path)
    except AttributeError:
        pass

    asr.flush()

    return res, rec_in_end - start, hyp_out_end - rec_in_end
def read_write_audio(self):
    """Send as much as possible of the available data to the output and read
    as much as possible from the input.

    It should be a non-blocking operation.

    Playback side: pops one item from ``self.local_audio_play``. A ``Frame``
    is written to the memory player and logged; a ``Command`` toggles the
    ``utterance_start``/``utterance_end`` bookkeeping, queues a notification
    for the HUB, and starts/ends the session-logger recording.

    Capture side: reads one frame from ``self.mem_capture`` and forwards it
    to ``self.audio_record`` while a call is being recorded.
    """
    if (self.local_audio_play
            and (self.mem_player.get_write_available()
                 > self.cfg['Audio']['samples_per_frame'] * 2)):
        # Send a frame from the input to be played.
        data_play = self.local_audio_play.popleft()

        if self.audio_playing and isinstance(data_play, Frame):
            # Only full frames are played (2 bytes per 16-bit sample).
            if len(data_play) == self.cfg['Audio']['samples_per_frame'] * 2:
                self.last_frame_id = self.mem_player.put_frame(
                    data_play.payload)
                self.cfg['Logging']['session_logger'].rec_write(
                    self.audio_playing, data_play.payload)

        elif isinstance(data_play, Command):
            if data_play.parsed['__name__'] == 'utterance_start':
                # Remember the fname of the utterance being played; it also
                # serves as the "currently playing" flag.
                self.audio_playing = data_play.parsed['fname']
                self.message_queue.append(
                    (Command('play_utterance_start(user_id="{uid}",'
                             'fname="{fname}")'
                             .format(uid=data_play.parsed['user_id'],
                                     fname=data_play.parsed['fname']),
                             'VoipIO', 'HUB'),
                     self.last_frame_id))
                try:
                    if data_play.parsed['log'] == "true":
                        self.cfg['Logging']['session_logger'].rec_start(
                            "system", data_play.parsed['fname'])
                except SessionLoggerException as e:
                    self.cfg['Logging']['system_logger'].exception(e)

            if self.audio_playing and data_play.parsed[
                    '__name__'] == 'utterance_end':
                self.audio_playing = None
                # BUGFIX: the format string used to read `fname="{fname})` —
                # the closing double quote before the parenthesis was
                # missing, producing an unbalanced message.
                self.message_queue.append(
                    (Command('play_utterance_end(user_id="{uid}",'
                             'fname="{fname}")'
                             .format(uid=data_play.parsed['user_id'],
                                     fname=data_play.parsed['fname']),
                             'VoipIO', 'HUB'),
                     self.last_frame_id))
                try:
                    if data_play.parsed['log'] == "true":
                        self.cfg['Logging']['session_logger'].rec_end(
                            data_play.parsed['fname'])
                except SessionLoggerException as e:
                    self.cfg['Logging']['system_logger'].exception(e)

    if (self.mem_capture.get_read_available()
            > self.cfg['Audio']['samples_per_frame'] * 2):
        # Get and send recorded data, it must be read at the other end.
        data_rec = self.mem_capture.get_frame()

        # Send the audio only if the call is connected; ignore any audio
        # signal left after the call was disconnected.
        if self.audio_recording:
            self.audio_record.send(Frame(data_rec))
def send_wav(self, filename, stream=None):
    """Send the given wav file to the dialogue system as if it was said
    through the microphone.

    The audio is loaded, chopped into frame-sized chunks and sent frame by
    frame to ``self.audio_record``; a short run of silence frames is
    appended so the VAD detects the end of the recording.

    Arguments:
        filename -- path of the wav file to send
        stream -- optional audio stream that each chunk is also written to
    """
    frame_bytes = 2 * self.cfg['Audio']['samples_per_frame']

    # Load the audio and split it into frame-sized binary chunks.
    pcm_chunks = various.split_to_bins(
        load_wav(self.cfg, filename), frame_bytes)

    # Feed the chunks to the recogniser input, mirroring to the stream.
    for chunk in pcm_chunks:
        if stream is not None:
            stream.write(chunk)
        self.audio_record.send(Frame(chunk))

    # Ten frames of silence so that the VAD recognises end of recording.
    silence = Frame(b"\x00\x00" * self.cfg['Audio']['samples_per_frame'])
    for _ in range(10):
        self.audio_record.send(silence)
def on_client_message_received(self, payload):
    """Handle a serialized ClientToAlex message.

    Messages whose key does not match ``self.key`` are dropped; otherwise
    the carried speech data is forwarded to the audio-record channel and
    the currently-playing utterance id is updated.
    """
    incoming = ClientToAlex()
    incoming.ParseFromString(payload)

    # Ignore messages that are not addressed to this session.
    if incoming.key != self.key:
        return

    self.audio_record.send(Frame(incoming.speech))
    self.update_current_utterance_id(incoming.currently_playing_utterance)
def synthesize(self, user_id, text, log="true"):
    """Synthesize ``text`` and stream the audio frames to the audio output.

    Sends ``tts_start``/``tts_end`` commands to the HUB and
    ``utterance_start``/``utterance_end`` commands to the audio output
    around the synthesized frames. The text is synthesized segment by
    segment, with silence inserted between (but not after) segments.

    Arguments:
        user_id -- id of the user the utterance is synthesized for
        text -- the text to synthesize; the special values "_silence_" and
            "silence()" produce an empty utterance
        log -- "true"/"false" flag passed through to the audio output
    """
    if text == "_silence_" or text == "silence()":
        # Just let the TTS generate an empty wav.
        # BUGFIX: this used to be `text == ""`, a no-op comparison; the
        # comment shows an assignment was intended.
        text = ""

    timestamp = datetime.now().strftime('%Y-%m-%d--%H-%M-%S.%f')
    fname = 'tts-{stamp}.wav'.format(stamp=timestamp)

    self.commands.send(
        Command(
            'tts_start(user_id="%s",text="%s",fname="%s")'
            % (user_id, text, fname), 'TTS', 'HUB'))
    self.audio_out.send(
        Command(
            'utterance_start(user_id="%s",text="%s",fname="%s",log="%s")'
            % (user_id, text, fname, log), 'TTS', 'AudioOut'))

    segments = self.parse_into_segments(text)
    for i, segment_text in enumerate(segments):
        segment_wav = self.tts.synthesize(segment_text)
        segment_wav = self.remove_start_and_final_silence(segment_wav)
        if i < len(segments) - 1:
            # Add silence only for non-final segments.
            segment_wav += self.gen_silence()

        # NOTE: the original also collected the segments in a local `wav`
        # list that was never read afterwards; that dead accumulation was
        # removed.
        segment_wav = various.split_to_bins(
            segment_wav, 2 * self.cfg['Audio']['samples_per_frame'])
        for frame in segment_wav:
            self.audio_out.send(Frame(frame))

    self.commands.send(
        Command(
            'tts_end(user_id="%s",text="%s",fname="%s")'
            % (user_id, text, fname), 'TTS', 'HUB'))
    self.audio_out.send(
        Command(
            'utterance_end(user_id="%s",text="%s",fname="%s",log="%s")'
            % (user_id, text, fname, log), 'TTS', 'AudioOut'))
def rec_wav_file(self, wav_path):
    """Recognise the speech in the given wav file.

    Loads the audio, runs it through ``self.rec_wave`` as a single frame,
    flushes the recogniser and returns the recognition result.
    """
    pcm_data = load_wav(self.cfg, wav_path)
    hypothesis = self.rec_wave(Frame(pcm_data))
    self.flush()
    return hypothesis
# Actively call a number configured. # vio_commands.send(Command('make_call(destination="sip:4366@SECRET:5066")', 'HUB', 'VoipIO')) count = 0 max_count = 50000 wav = None while count < max_count: time.sleep(cfg['Hub']['main_loop_sleep_time']) count += 1 # write one frame into the audio output if wav: data_play = wav.pop(0) #print len(wav), len(data_play) vio_play.send(Frame(data_play)) # read all recorded audio if vio_record.poll(): data_rec = vio_record.recv() # read all messages from VoipIO if vio_commands.poll(): command = vio_commands.recv() if isinstance(command, Command): if command.parsed[ '__name__'] == "incoming_call" or command.parsed[ '__name__'] == "make_call": wav = audio.load_wav(cfg, './resources/test16k-mono.wav') # split audio into frames
def read_write_audio(self, p, stream, wf, play_buffer):
    """Send some of the available data to the output.

    It should be a non-blocking operation. Therefore:

    1) do not send more then play_buffer_frames
    2) send only if stream.get_write_available() is more then the
       frame size

    One call plays queued frames (or a silence frame when nothing is
    queued), reads exactly one frame from the input stream, forwards it to
    ``self.audio_record``, and writes an interleaved stereo mix of the
    recorded and played audio to the wave file ``wf``.
    """
    if self.audio_play.poll():
        # Drain queued playback data, bounded by the play buffer size and
        # by how much the output stream can accept without blocking.
        while self.audio_play.poll() \
            and len(play_buffer) < self.cfg['AudioIO']['play_buffer_size'] \
            and stream.get_write_available() > self.cfg['Audio']['samples_per_frame']:

            # send to play frames from input
            data_play = self.audio_play.recv()
            if isinstance(data_play, Frame):
                stream.write(data_play.payload)

                # Keep the played frame so it can be mixed into the
                # stereo log below.
                play_buffer.append(data_play)

                if self.cfg['AudioIO']['debug']:
                    print '.',
                    sys.stdout.flush()

            elif isinstance(data_play, Command):
                # Forward utterance boundary notifications to the HUB.
                if data_play.parsed['__name__'] == 'utterance_start':
                    self.commands.send(
                        Command('play_utterance_start()', 'AudioIO', 'HUB'))
                if data_play.parsed['__name__'] == 'utterance_end':
                    self.commands.send(
                        Command('play_utterance_end()', 'AudioIO', 'HUB'))
    else:
        # Nothing queued: pad the play buffer with one frame of silence so
        # the stereo mix below always has a "played" side.
        data_play = Frame(
            b"\x00\x00" * self.cfg['Audio']['samples_per_frame'])
        play_buffer.append(data_play)
        if self.cfg['AudioIO']['debug']:
            print '.',
            sys.stdout.flush()

    # record one packet of audio data
    # it will be blocked until the data is recorded
    data_rec = stream.read(self.cfg['Audio']['samples_per_frame'])
    # send recorded data it must be read at the other end
    self.audio_record.send(Frame(data_rec))

    # get played audio block
    data_play = play_buffer.pop(0)

    # send played audio
    # FIXME: I should save what I am playing
    # self.audio_played.send(data_play)

    # save the recorded and played data
    # NOTE(review): this indexing relies on Python 2 semantics, where
    # indexing a byte string yields a 1-char str that bytearray.extend
    # accepts; under Python 3 it would yield an int and fail. Also assumes
    # Frame supports item access over its payload bytes — verify.
    data_stereo = bytearray()
    for i in range(self.cfg['Audio']['samples_per_frame']):
        # Interleave: recorded sample (2 bytes) then played sample.
        data_stereo.extend(data_rec[i * 2])
        data_stereo.extend(data_rec[i * 2 + 1])

        # there might not be enough data to be played
        # then add zeros
        try:
            data_stereo.extend(data_play[i * 2])
        except IndexError:
            data_stereo.extend(b'\x00')

        try:
            data_stereo.extend(data_play[i * 2 + 1])
        except IndexError:
            data_stereo.extend(b'\x00')

    wf.writeframes(data_stereo)
def main(dirname, outfname, cfg, skip=0, ignore_list_file=None):
    """ Decode all wavs under ``dirname`` with Julius and append the
    hypotheses to ``outfname``.

    Julius is (re)started on demand and restarted whenever it crashes
    (detected as ECONNRESET, errno 104); feeding a wav's audio is retried
    until it goes through without a crash.

    Arguments:
        dirname -- the directory to search for WAVs
        outfname -- path towards the file to output to
        cfg -- a configuration dictionary (of the Config class)
        skip -- how many wavs to skip (default: 0)
        ignore_list_file -- a file open for reading whose lines specify path
            globs for logs that should be ignored, or None if no such file
            should be used. The format of this file is described in some
            alex/corpustools scripts.
    """
    # Fetch relevant config arguments.
    frame_size = cfg['corpustools']['get_jasr_confnets']['frame_size']
    rt_ratio = cfg['corpustools']['get_jasr_confnets']['rt_ratio']
    # Pace the feeding at rt_ratio x real time (16 kHz, 16-bit samples).
    sleep_time = rt_ratio * frame_size / 32000.

    # Sort by the wav id so output order is stable across runs.
    wavs = sorted(get_wav_fnames(dirname, ignore_list_file),
                  key=itemgetter(1))

    jul = None
    try:
        with codecs.open(outfname, 'a+', encoding='UTF-8') as outfile:
            for wav_fname, wav_id in wavs[skip:]:
                # Load the wav.
                mywav = load_wav(cfg, wav_fname)
                # Start Julius (lazily; it may have been torn down after a
                # previous crash).
                if jul is None:
                    jul, grep, errfile = start_julius(cfg, on_no_context)

                # Insist on feeding all the input data to Julius, regardless
                # of how many times it crashes.
                # `exception` doubles as the retry flag: truthy (initially 1,
                # later the caught exception) means "feed again".
                exception = 1
                while exception:
                    try:
                        for startidx in xrange(0, len(mywav), frame_size):
                            jul.rec_in(
                                Frame(mywav[startidx:startidx + frame_size]))
                            sleep(sleep_time)
                        # sleep(rt_ratio * len(mywav) / 32000.)
                    except socket.error as e:
                        # Julius crashing results in
                        # error: [Errno 104] Connection reset by peer
                        # Catch only that one.
                        if e.errno != 104:
                            raise e
                        exception = e
                        traceback.print_exc()
                        print "get_jasr_confnets: Restarting Julius."
                        clean_up(jul, grep, errfile)
                        jul, grep, errfile = start_julius(cfg, on_no_context)
                    else:
                        # Whole wav fed without a crash — stop retrying.
                        exception = None
                exception = None

                # Extract the hypothesis; a crash here is not retried, the
                # wav is recorded as 'None' instead.
                try:
                    hyp = jul.hyp_out()
                except ASRException as e:
                    exception = e
                except socket.error as e:
                    # Julius crashing results in
                    # error: [Errno 104] Connection reset by peer
                    # Catch only that one.
                    if e.errno != 104:
                        raise e
                    exception = e
                if exception is not None:
                    traceback.print_exc()
                    # Tear Julius down; it is restarted for the next wav.
                    clean_up(jul, grep, errfile)
                    jul = None
                    hyp = 'None'
                    exception = None

                outfile.write('{id_} => {hyp!r}\n'.format(id_=wav_id,
                                                          hyp=hyp))
                # Progress indicator: one dot per wav.
                sys.stderr.write('.')
                sys.stderr.flush()
    finally:
        if jul is not None:
            clean_up(jul, grep, errfile)