def __init__(self, cfg):
    self.cfg = cfg

    if self.cfg['VAD']['ffnn']['frontend'] != 'MFCC':
        raise ASRException('Unsupported frontend: %s' % (self.cfg['VAD']['ffnn']['frontend'], ))

    super(FFNNVAD, self).__init__(
        self.cfg['VAD']['ffnn']['model'],
        self.cfg['VAD']['ffnn']['filter_length'],
        self.cfg['Audio']['sample_rate'],
        self.cfg['VAD']['ffnn']['framesize'],
        self.cfg['VAD']['ffnn']['frameshift'],
        self.cfg['VAD']['ffnn']['usehamming'],
        self.cfg['VAD']['ffnn']['preemcoef'],
        self.cfg['VAD']['ffnn']['numchans'],
        self.cfg['VAD']['ffnn']['ceplifter'],
        self.cfg['VAD']['ffnn']['numceps'],
        self.cfg['VAD']['ffnn']['enormalise'],
        self.cfg['VAD']['ffnn']['zmeansource'],
        self.cfg['VAD']['ffnn']['usepower'],
        self.cfg['VAD']['ffnn']['usec0'],
        self.cfg['VAD']['ffnn']['usecmn'],
        self.cfg['VAD']['ffnn']['usedelta'],
        self.cfg['VAD']['ffnn']['useacc'],
        self.cfg['VAD']['ffnn']['n_last_frames'],
        self.cfg['VAD']['ffnn']['n_prev_frames'],
        self.cfg['VAD']['ffnn']['lofreq'],
        self.cfg['VAD']['ffnn']['hifreq'],
        self.cfg['VAD']['ffnn']['mel_banks_only'])
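# A minimal sketch of the configuration structure read by the constructor above.
# Only the key names are taken from the code; every value below is an
# illustrative assumption, not a tested or recommended setting.
EXAMPLE_FFNN_VAD_CFG = {
    'Audio': {
        'sample_rate': 16000,
    },
    'VAD': {
        'type': 'ffnn',
        'ffnn': {
            'frontend': 'MFCC',           # the only front-end this VAD accepts
            'model': 'vad_model.tffnn',   # hypothetical model file name
            'filter_length': 2,
            'framesize': 512,
            'frameshift': 160,
            'usehamming': True,
            'preemcoef': 0.97,
            'numchans': 26,
            'ceplifter': 22,
            'numceps': 12,
            'enormalise': True,
            'zmeansource': True,
            'usepower': True,
            'usec0': False,
            'usecmn': False,
            'usedelta': False,
            'useacc': False,
            'n_last_frames': 0,
            'n_prev_frames': 15,
            'lofreq': 125,
            'hifreq': 3800,
            'mel_banks_only': True,
        },
    },
}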
def hyp_out(self):
    """ This defines the asynchronous interface for speech recognition.

    Returns the recognizer's hypotheses about the input speech audio.
    """
    raise ASRException("Not implemented")
def __init__(self, cfg, commands, audio_in, audio_out, close_event):
    multiprocessing.Process.__init__(self)

    self.cfg = cfg
    self.system_logger = cfg['Logging']['system_logger']
    self.session_logger = cfg['Logging']['session_logger']
    self.commands = commands
    self.local_commands = deque()
    self.audio_in = audio_in
    self.local_audio_in = deque()
    self.audio_out = audio_out
    self.close_event = close_event

    self.vad_fname = None

    if self.cfg['VAD']['type'] == 'power':
        self.vad = PVAD.PowerVAD(cfg)
    elif self.cfg['VAD']['type'] == 'gmm':
        self.vad = GVAD.GMMVAD(cfg)
    elif self.cfg['VAD']['type'] == 'ffnn':
        self.vad = NNVAD.FFNNVAD(cfg)
    else:
        raise ASRException('Unsupported VAD engine: %s' % (self.cfg['VAD']['type'], ))

    # Stores, for each recent frame, whether it was classified as speech or non-speech.
    self.detection_window_speech = deque(
        maxlen=self.cfg['VAD']['decision_frames_speech'])
    self.detection_window_sil = deque(
        maxlen=self.cfg['VAD']['decision_frames_sil'])
    self.deque_audio_in = deque(
        maxlen=self.cfg['VAD']['speech_buffer_frames'])

    # Keeps the last decision about whether there is speech or non-speech.
    self.last_vad = False
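# The deques set up above act as sliding windows of per-frame VAD flags. A
# hedged sketch of how such windows are commonly turned into a debounced
# speech/non-speech decision follows; the helper name and the 0.7 threshold
# are illustrative assumptions, not values taken from this class.
def _debounced_vad(window_speech, window_sil, last_vad, ratio=0.7):
    """window_speech/window_sil are deques of booleans (True = speech frame)."""
    vad = last_vad

    # Switch to speech only once the speech window is full and mostly speech.
    if len(window_speech) == window_speech.maxlen and \
            sum(window_speech) >= ratio * window_speech.maxlen:
        vad = True

    # Switch back to silence only once the silence window is mostly non-speech.
    if len(window_sil) == window_sil.maxlen and \
            sum(1 for f in window_sil if not f) >= ratio * window_sil.maxlen:
        vad = False

    return vad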
def rec_in(self, frame):
    """ This defines the asynchronous interface for speech recognition.

    Call this input function with audio data belonging to one speech segment
    that should be recognized. The output hypothesis is obtained by calling
    hyp_out().
    """
    raise ASRException("Not implemented")
def asr_factory(cfg, asr_type=None):
    ''' Returns an instance of the ASR decoder specified by asr_type.

    The ASR decoders are imported on the fly because they need external
    non-Python libraries.
    '''
    if asr_type is None:
        asr_type = get_asr_type(cfg)

    if asr_type == 'Kaldi':
        from alex.components.asr.pykaldi import KaldiASR
        asr = KaldiASR(cfg)
    elif asr_type == 'Google':
        from alex.components.asr.google import GoogleASR
        asr = GoogleASR(cfg)
    else:
        raise ASRException('Unsupported ASR decoder: %s' % asr_type)

    return asr
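# A minimal usage sketch for asr_factory(). The configuration handling is an
# illustrative assumption: cfg is expected to be the usual Alex configuration
# dictionary with the 'ASR' section filled in for the selected decoder, and the
# variable names below are illustrative only.
def _example_build_asr(cfg):
    # Let the configuration pick the decoder ...
    asr = asr_factory(cfg)
    # ... or force a specific decoder explicitly.
    kaldi_asr = asr_factory(cfg, asr_type='Kaldi')
    return asr, kaldi_asr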
def __init__(self, cfg):
    self.cfg = cfg
    self.audio_recorded_in = []

    self.ffnn = TheanoFFNN()
    self.ffnn.load(self.cfg['VAD']['ffnn']['model'])

    self.log_probs_speech = deque(
        maxlen=self.cfg['VAD']['ffnn']['filter_length'])
    self.log_probs_sil = deque(
        maxlen=self.cfg['VAD']['ffnn']['filter_length'])

    self.last_decision = 0.0

    if self.cfg['VAD']['ffnn']['frontend'] == 'MFCC':
        self.front_end = MFCCFrontEnd(
            self.cfg['Audio']['sample_rate'],
            self.cfg['VAD']['ffnn']['framesize'],
            self.cfg['VAD']['ffnn']['usehamming'],
            self.cfg['VAD']['ffnn']['preemcoef'],
            self.cfg['VAD']['ffnn']['numchans'],
            self.cfg['VAD']['ffnn']['ceplifter'],
            self.cfg['VAD']['ffnn']['numceps'],
            self.cfg['VAD']['ffnn']['enormalise'],
            self.cfg['VAD']['ffnn']['zmeansource'],
            self.cfg['VAD']['ffnn']['usepower'],
            self.cfg['VAD']['ffnn']['usec0'],
            self.cfg['VAD']['ffnn']['usecmn'],
            self.cfg['VAD']['ffnn']['usedelta'],
            self.cfg['VAD']['ffnn']['useacc'],
            self.cfg['VAD']['ffnn']['n_last_frames'] + self.cfg['VAD']['ffnn']['n_prev_frames'],
            self.cfg['VAD']['ffnn']['lofreq'],
            self.cfg['VAD']['ffnn']['hifreq'],
            self.cfg['VAD']['ffnn']['mel_banks_only'])
    else:
        raise ASRException('Unsupported frontend: %s' % (self.cfg['VAD']['ffnn']['frontend'], ))
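# A hedged sketch of how the two log-probability deques above can be turned
# into a smoothed speech probability: average the per-frame normalised
# log-probabilities over the last 'filter_length' frames and exponentiate.
# The helper below is an illustrative assumption, not the actual decision code
# of this class.
import math

def _filtered_speech_prob(log_probs_speech, log_probs_sil):
    """Return the filtered probability of speech over the buffered frames."""
    if not log_probs_speech:
        return 0.0

    avg = 0.0
    for lp_speech, lp_sil in zip(log_probs_speech, log_probs_sil):
        # Normalise the two-class log-probabilities before averaging.
        norm = math.log(math.exp(lp_speech) + math.exp(lp_sil))
        avg += lp_speech - norm
    avg /= len(log_probs_speech)

    return math.exp(avg)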
def flush(self):
    """ Should reset the decoder immediately so that it is ready for the next
    recognition task.
    """
    raise ASRException("Not implemented")
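# A hedged sketch of a recognizer implementing the rec_in()/hyp_out()/flush()
# contract defined above. The class name and the frame-counting "hypothesis"
# are illustrative assumptions; a real decoder would pass the frames to a
# recognition engine instead of counting them.
class _FrameCountingASR(object):
    """Toy recognizer: its only 'hypothesis' is how many frames it received."""

    def __init__(self):
        self.n_frames = 0

    def rec_in(self, frame):
        # Accumulate audio belonging to the current speech segment.
        self.n_frames += 1

    def hyp_out(self):
        # Return a (trivial) hypothesis about the accumulated segment.
        hyp = "segment of %d frames" % self.n_frames
        self.flush()
        return hyp

    def flush(self):
        # Reset the decoder so it is ready for the next recognition task.
        self.n_frames = 0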
def read_audio_write_asr_hypotheses(self):
    # Read input audio.
    if self.local_audio_in:
        if len(self.local_audio_in) > 40:
            print "ASR unprocessed frames:", len(self.local_audio_in)

        if len(self.local_audio_in) > 200:
            print "ASR too many unprocessed frames:", len(self.local_audio_in)
            print " skipping everything until the end of the segment:", len(self.local_audio_in)

            while len(self.local_audio_in) > 2 and isinstance(self.local_audio_in[0], Frame):
                skip = self.local_audio_in.popleft()

        # Read recorded audio.
        data_rec = self.local_audio_in.popleft()

        if isinstance(data_rec, Frame):
            if self.recognition_on:
                self.asr.rec_in(data_rec)
        elif isinstance(data_rec, Command):
            dr_speech_start = False
            fname = None

            if data_rec.parsed['__name__'] == "speech_start":
                # Check whether there is more than one unprocessed speech segment.
                segments = [cmd for cmd in self.local_audio_in
                            if isinstance(cmd, Command) and cmd.parsed['__name__'] == "speech_start"]

                if len(segments):
                    # There are multiple unprocessed segments in the queue;
                    # remove all unprocessed segments except the last.
                    print "ASR too many unprocessed speech segments:", len(segments)
                    print " removed all segments but the last"

                    removed_segments = 0
                    while removed_segments < len(segments):
                        data_rec = self.local_audio_in.popleft()
                        if isinstance(data_rec, Command) and data_rec.parsed['__name__'] == "speech_start":
                            removed_segments += 1

                dr_speech_start = "speech_start"
                fname = data_rec.parsed['fname']
            elif data_rec.parsed['__name__'] == "speech_end":
                dr_speech_start = "speech_end"
                fname = data_rec.parsed['fname']

            # Check consistency of the input command.
            if dr_speech_start:
                if ((not self.recognition_on and dr_speech_start != "speech_start")
                        or (self.recognition_on and dr_speech_start != "speech_end")):
                    msg = ('Commands received by the ASR component are '
                           'inconsistent (recognition_on: {rec}; the new '
                           'command: {cmd}).').format(rec=self.recognition_on,
                                                      cmd=dr_speech_start)
                    self.system_logger.exception(msg)

            if dr_speech_start == "speech_start":
                self.commands.send(Command('asr_start(fname="%s")' % fname, 'ASR', 'HUB'))
                self.recognition_on = True

                if self.cfg['ASR']['debug']:
                    self.system_logger.debug('ASR: speech_start(fname="%s")' % fname)

            elif dr_speech_start == "speech_end":
                self.recognition_on = False

                if self.cfg['ASR']['debug']:
                    self.system_logger.debug('ASR: speech_end(fname="%s")' % fname)

                try:
                    asr_hyp = self.asr.hyp_out()

                    if self.cfg['ASR']['debug']:
                        msg = list()
                        msg.append("ASR Hypothesis")
                        msg.append("-" * 60)
                        msg.append(unicode(asr_hyp))
                        msg.append(u"")
                        msg = u'\n'.join(msg)
                        self.system_logger.debug(msg)
                except (ASRException, JuliusASRTimeoutException):
                    self.system_logger.debug("Julius ASR Result Timeout.")

                    if self.cfg['ASR']['debug']:
                        msg = list()
                        msg.append("ASR Alternative hypothesis")
                        msg.append("-" * 60)
                        msg.append("sil")
                        msg.append("")
                        msg = u'\n'.join(msg)
                        self.system_logger.debug(msg)

                    asr_hyp = UtteranceConfusionNetwork()
                    asr_hyp.add([[1.0, "_other_"], ])

                # The ASR component can return either an NBList or a confusion
                # network.
                if isinstance(asr_hyp, UtteranceNBList):
                    self.session_logger.asr("user", fname, asr_hyp, None)
                elif isinstance(asr_hyp, UtteranceConfusionNetwork):
                    self.session_logger.asr("user", fname, asr_hyp.get_utterance_nblist(), asr_hyp)
                else:
                    self.session_logger.asr("user", fname, [(-1, asr_hyp)], None)

                self.commands.send(Command('asr_end(fname="%s")' % fname, 'ASR', 'HUB'))
                self.commands.send(ASRHyp(asr_hyp, fname=fname))
                self.asr_hypotheses_out.send(ASRHyp(asr_hyp, fname=fname))
        else:
            raise ASRException('Unsupported input.')
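# A hedged sketch of the message protocol the loop above consumes: a
# speech_start command, the Frame objects of one segment, then a speech_end
# command. Frame and Command are the same classes used above; the function
# name and the source/target labels passed to Command are illustrative
# assumptions about how a producer might feed the audio pipe.
def _feed_one_segment(audio_in, frames, fname):
    # Mark the start of a speech segment; this turns recognition on.
    audio_in.send(Command('speech_start(fname="%s")' % fname, 'VAD', 'ASR'))

    # Stream the recorded frames belonging to the segment.
    for frame in frames:
        audio_in.send(frame)  # each item is a Frame instance

    # Mark the end of the segment; this triggers hyp_out() in the loop above.
    audio_in.send(Command('speech_end(fname="%s")' % fname, 'VAD', 'ASR'))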
def on_no_context(outstr):
    if DEBUG:
        print "REACHED ON_NO_CONTEXT"

    raise ASRException('Julius said: "NO CONTEXT?"')