Ejemplo n.º 1
0
    def __init__(self, cfg):
        """Configure the VAD from the ``cfg['VAD']['ffnn']`` section.

        All settings are read from ``cfg['VAD']['ffnn']``, except the sample
        rate which comes from ``cfg['Audio']``, and are handed to the parent
        constructor as positional arguments.

        Raises:
            ASRException: if the configured frontend is not 'MFCC'.
        """
        self.cfg = cfg
        ffnn_cfg = self.cfg['VAD']['ffnn']

        frontend = ffnn_cfg['frontend']
        if frontend != 'MFCC':
            raise ASRException('Unsupported frontend: %s' % (frontend, ))

        # The parent constructor takes everything positionally, so the order
        # of the values below is significant.
        head_args = [
            ffnn_cfg['model'],
            ffnn_cfg['filter_length'],
            self.cfg['Audio']['sample_rate'],
        ]
        tail_keys = (
            'framesize', 'frameshift', 'usehamming', 'preemcoef', 'numchans',
            'ceplifter', 'numceps', 'enormalise', 'zmeansource', 'usepower',
            'usec0', 'usecmn', 'usedelta', 'useacc', 'n_last_frames',
            'n_prev_frames', 'lofreq', 'hifreq', 'mel_banks_only',
        )
        tail_args = [ffnn_cfg[key] for key in tail_keys]

        super(FFNNVAD, self).__init__(*(head_args + tail_args))
Ejemplo n.º 2
0
    def hyp_out(self):
        """
        Asynchronous speech recognition interface: return the recognizer's
        hypotheses about the input speech audio.

        This implementation provides no result and always raises
        ASRException("Not implemented").

        """
        reason = "Not implemented"
        raise ASRException(reason)
Ejemplo n.º 3
0
    def __init__(self, cfg, commands, audio_in, audio_out, close_event):
        """Store the communication endpoints and build the configured VAD.

        Args:
            cfg: configuration; the 'Logging' and 'VAD' sections are read here.
            commands: stored as ``self.commands``.
            audio_in: stored as ``self.audio_in``.
            audio_out: stored as ``self.audio_out``.
            close_event: stored as ``self.close_event``.

        Raises:
            ASRException: if ``cfg['VAD']['type']`` is not one of 'power',
                'gmm' or 'ffnn'.
        """
        multiprocessing.Process.__init__(self)

        self.cfg = cfg
        logging_cfg = cfg['Logging']
        self.system_logger = logging_cfg['system_logger']
        self.session_logger = logging_cfg['session_logger']

        self.commands = commands
        self.local_commands = deque()
        self.audio_in = audio_in
        self.local_audio_in = deque()
        self.audio_out = audio_out
        self.close_event = close_event

        self.vad_fname = None

        # Select the voice activity detector named in the configuration.
        vad_type = self.cfg['VAD']['type']
        if vad_type == 'power':
            self.vad = PVAD.PowerVAD(cfg)
        elif vad_type == 'gmm':
            self.vad = GVAD.GMMVAD(cfg)
        elif vad_type == 'ffnn':
            self.vad = NNVAD.FFNNVAD(cfg)
        else:
            raise ASRException('Unsupported VAD engine: %s' % (vad_type, ))

        vad_cfg = self.cfg['VAD']

        # Bounded windows holding the per-frame speech / non-speech
        # classifications.
        self.detection_window_speech = deque(
            maxlen=vad_cfg['decision_frames_speech'])
        self.detection_window_sil = deque(
            maxlen=vad_cfg['decision_frames_sil'])
        self.deque_audio_in = deque(maxlen=vad_cfg['speech_buffer_frames'])

        # The most recent speech / non-speech decision.
        self.last_vad = False
Ejemplo n.º 4
0
    def rec_in(self, frame):
        """
        Asynchronous speech recognition interface: feed audio data.

        Call this with audio data belonging to the one speech segment that
        should be recognized; the resulting hypothesis is obtained by calling
        hyp_out().

        This implementation accepts no audio and always raises
        ASRException("Not implemented").

        """
        reason = "Not implemented"
        raise ASRException(reason)
Ejemplo n.º 5
0
def asr_factory(cfg, asr_type=None):
    ''' Returns instance of the ASR decoder specified in asr_type.

    The ASR decoders are imported on the fly,
    because they need external non Python libraries.

    Args:
        cfg: configuration passed to get_asr_type() (when asr_type is not
            given) and to the decoder's constructor.
        asr_type: name of the decoder, 'Kaldi' or 'Google'. When None, the
            type returned by get_asr_type(cfg) is used.

    Returns:
        The constructed decoder instance.

    Raises:
        ASRException: if the requested decoder type is not supported.
    '''
    if asr_type is None:
        asr_type = get_asr_type(cfg)

    # Dispatch on asr_type itself so that an explicitly requested decoder is
    # honoured instead of being overridden by the configured one.
    if asr_type == 'Kaldi':
        from alex.components.asr.pykaldi import KaldiASR
        asr = KaldiASR(cfg)
    elif asr_type == 'Google':
        from alex.components.asr.google import GoogleASR
        asr = GoogleASR(cfg)
    else:
        raise ASRException('Unsupported ASR decoder: %s' % asr_type)

    return asr
Ejemplo n.º 6
0
    def __init__(self, cfg):
        """Load the neural network model and build the MFCC front end.

        Settings are read from ``cfg['VAD']['ffnn']`` and the sample rate
        from ``cfg['Audio']``.

        Raises:
            ASRException: if the configured frontend is not 'MFCC'. The model
                has already been loaded by the time this is raised.
        """
        self.cfg = cfg

        self.audio_recorded_in = []

        self.ffnn = TheanoFFNN()
        ffnn_cfg = self.cfg['VAD']['ffnn']
        self.ffnn.load(ffnn_cfg['model'])

        # Both log-probability histories are bounded by the filter length.
        self.log_probs_speech = deque(maxlen=ffnn_cfg['filter_length'])
        self.log_probs_sil = deque(maxlen=ffnn_cfg['filter_length'])

        self.last_decision = 0.0

        frontend = ffnn_cfg['frontend']
        if frontend != 'MFCC':
            raise ASRException('Unsupported frontend: %s' % (frontend, ))

        # Arguments are positional; the 15th one is the sum of the
        # n_last_frames and n_prev_frames settings.
        self.front_end = MFCCFrontEnd(
            self.cfg['Audio']['sample_rate'],
            ffnn_cfg['framesize'],
            ffnn_cfg['usehamming'],
            ffnn_cfg['preemcoef'],
            ffnn_cfg['numchans'],
            ffnn_cfg['ceplifter'],
            ffnn_cfg['numceps'],
            ffnn_cfg['enormalise'],
            ffnn_cfg['zmeansource'],
            ffnn_cfg['usepower'],
            ffnn_cfg['usec0'],
            ffnn_cfg['usecmn'],
            ffnn_cfg['usedelta'],
            ffnn_cfg['useacc'],
            ffnn_cfg['n_last_frames'] + ffnn_cfg['n_prev_frames'],
            ffnn_cfg['lofreq'],
            ffnn_cfg['hifreq'],
            ffnn_cfg['mel_banks_only'])
Ejemplo n.º 7
0
    def flush(self):
        """
        Should reset the decoder immediately so that it is ready for the next
        recognition task.

        This implementation performs no reset and always raises
        ASRException("Not implemented").

        """
        reason = "Not implemented"
        raise ASRException(reason)
Ejemplo n.º 8
0
    def read_audio_write_asr_hypotheses(self):
        """
        Process one item from self.local_audio_in and emit ASR results.

        Does nothing when the local queue is empty. Otherwise a single item
        is taken from the left end of the queue:

        - a Frame is passed to self.asr.rec_in() while recognition is on;
        - a "speech_start" Command switches recognition on and sends an
          asr_start command to self.commands;
        - a "speech_end" Command switches recognition off, obtains the
          hypothesis from self.asr.hyp_out(), logs it to the session logger
          and sends asr_end plus the ASRHyp to self.commands and the ASRHyp
          to self.asr_hypotheses_out.

        When the queue has grown too long, queued items are discarded before
        the item is taken (see the inline comments).

        Raises ASRException('Unsupported input.') when the item is neither
        a Frame nor a Command.

        """
        # Read input audio.
        if self.local_audio_in:
            # Report a growing backlog.
            if len(self.local_audio_in) > 40:
                print "ASR unprocessed frames:", len(self.local_audio_in)

            # With a very large backlog, drop frames from the head of the
            # queue until a non-Frame item is first or only two items remain.
            if len(self.local_audio_in) > 200:
                print "ASR too many unprocessed frames:", len(
                    self.local_audio_in)
                print "    skipping everything until the end of the segment:", len(
                    self.local_audio_in)
                while len(self.local_audio_in) > 2 and isinstance(
                        self.local_audio_in[0], Frame):
                    skip = self.local_audio_in.popleft()

            # read recorded audio
            data_rec = self.local_audio_in.popleft()

            if isinstance(data_rec, Frame):
                # Frames are forwarded to the recognizer only inside a speech
                # segment; otherwise they are dropped.
                if self.recognition_on:
                    self.asr.rec_in(data_rec)
            elif isinstance(data_rec, Command):
                # dr_speech_start stays False for any other command name;
                # otherwise it holds the name "speech_start" or "speech_end".
                dr_speech_start = False
                fname = None

                if data_rec.parsed['__name__'] == "speech_start":
                    # check whether there is more than one speech segment
                    # (speech_start commands still waiting in the queue)
                    segments = [
                        cmd for cmd in self.local_audio_in
                        if isinstance(cmd, Command)
                        and cmd.parsed['__name__'] == "speech_start"
                    ]
                    if len(segments):
                        # there are multiple unprocessed segments in the queue
                        # remove all unprocessed segments except the last
                        print "ASR too many unprocessed speech segments:", len(
                            segments)
                        print "    removed all segments but the last"
                        removed_segments = 0
                        # Discard queued items until the last queued
                        # speech_start has been popped; data_rec then refers
                        # to that command.
                        while removed_segments < len(segments):
                            data_rec = self.local_audio_in.popleft()
                            if isinstance(data_rec,
                                          Command) and data_rec.parsed[
                                              '__name__'] == "speech_start":
                                removed_segments += 1

                    dr_speech_start = "speech_start"
                    fname = data_rec.parsed['fname']
                elif data_rec.parsed['__name__'] == "speech_end":
                    dr_speech_start = "speech_end"
                    fname = data_rec.parsed['fname']

                # Check consistency of the input command: a speech_start is
                # expected while recognition is off and a speech_end while it
                # is on.
                if dr_speech_start:
                    if ((not self.recognition_on
                         and dr_speech_start != "speech_start")
                            or (self.recognition_on
                                and dr_speech_start != "speech_end")):
                        msg = ('Commands received by the ASR component are '
                               'inconsistent (recognition_on: {rec}; the new '
                               'command: {cmd}').format(
                                   rec=self.recognition_on,
                                   cmd=dr_speech_start)
                        # The inconsistency is only logged; processing of the
                        # command continues below.
                        self.system_logger.exception(msg)

                if dr_speech_start == "speech_start":
                    # Announce the start of recognition and begin accepting
                    # frames.
                    self.commands.send(
                        Command('asr_start(fname="%s")' % fname, 'ASR', 'HUB'))
                    self.recognition_on = True

                    if self.cfg['ASR']['debug']:
                        self.system_logger.debug(
                            'ASR: speech_start(fname="%s")' % fname)

                elif dr_speech_start == "speech_end":
                    self.recognition_on = False

                    if self.cfg['ASR']['debug']:
                        self.system_logger.debug(
                            'ASR: speech_end(fname="%s")' % fname)

                    try:
                        asr_hyp = self.asr.hyp_out()

                        if self.cfg['ASR']['debug']:
                            msg = list()
                            msg.append("ASR Hypothesis")
                            msg.append("-" * 60)
                            msg.append(unicode(asr_hyp))
                            msg.append(u"")
                            msg = u'\n'.join(msg)
                            self.system_logger.debug(msg)

                    except (ASRException, JuliusASRTimeoutException):
                        # Any ASRException is reported with the Julius timeout
                        # message, not only a JuliusASRTimeoutException.
                        self.system_logger.debug("Julius ASR Result Timeout.")
                        if self.cfg['ASR']['debug']:
                            msg = list()
                            msg.append("ASR Alternative hypothesis")
                            msg.append("-" * 60)
                            msg.append("sil")
                            msg.append("")
                            msg = u'\n'.join(msg)
                            self.system_logger.debug(msg)

                        # Fall back to a confusion network holding the single
                        # entry "_other_" with weight 1.0.
                        asr_hyp = UtteranceConfusionNetwork()
                        asr_hyp.add([
                            [1.0, "_other_"],
                        ])

                    # The ASR component can return either NBList or a confusion
                    # network.
                    if isinstance(asr_hyp, UtteranceNBList):
                        self.session_logger.asr("user", fname, asr_hyp, None)
                    elif isinstance(asr_hyp, UtteranceConfusionNetwork):
                        self.session_logger.asr("user", fname,
                                                asr_hyp.get_utterance_nblist(),
                                                asr_hyp)
                    else:
                        # Any other hypothesis type is logged as a one-item
                        # list with the value -1 in the first position.
                        self.session_logger.asr("user", fname, [(-1, asr_hyp)],
                                                None)

                    # Publish the end of recognition and the hypothesis.
                    self.commands.send(
                        Command('asr_end(fname="%s")' % fname, 'ASR', 'HUB'))
                    self.commands.send(ASRHyp(asr_hyp, fname=fname))
                    self.asr_hypotheses_out.send(ASRHyp(asr_hyp, fname=fname))
            else:
                raise ASRException('Unsupported input.')
Ejemplo n.º 9
0
def on_no_context(outstr):
    """Raise an ASRException carrying the 'Julius said: "NO CONTEXT?"' message.

    The outstr argument is accepted but not used. When the module-level DEBUG
    flag is set, a trace line is printed before raising.
    """
    if DEBUG:
        print("REACHED ON_NO_CONTEXT")
    raise ASRException('Julius said: "NO CONTEXT?"')