Ejemplo n.º 1
0
def main():
    """
    Recognise speech from the microphone in an endless loop.

    A decoder is created with its log output suppressed, the callback
    functions are attached to it and audio read from the audio device is
    passed to the decoder roughly every 0.02 seconds.
    """
    # The decoder optionally accepts a command-line argument list for its
    # configuration. Sending the log to os.devnull suppresses log output;
    # use PocketSphinx() with no arguments for the default configuration.
    decoder = PocketSphinx(["-logfn", os.devnull])

    # Attach the callback functions.
    decoder.speech_start_callback = speech_start_callback
    decoder.hypothesis_callback = hyp_callback

    # Open the audio device and start recording from it.
    device = AudioDevice()
    device.open()
    device.record()

    # Feed the decoder with recorded audio, pausing between reads.
    poll_interval = 0.02
    while True:
        decoder.process_audio(device.read_audio())
        time.sleep(poll_interval)
Ejemplo n.º 2
0
class SphinxEngine(EngineBase, DelegateTimerManagerInterface):
    """ Speech recognition engine back-end for CMU Pocket Sphinx. """

    _name = "sphinx"
    DictationContainer = DictationContainerBase

    def __init__(self):
        """
        Initialise the engine's state without creating a decoder.

        The decoder itself is created later by :meth:`connect`.

        :raises: EngineError if ``ENGINE_AVAILABLE`` is false, i.e. the
            engine's dependencies could not be imported.
        """
        EngineBase.__init__(self)
        DelegateTimerManagerInterface.__init__(self)

        # Set up the engine logger
        logging.basicConfig()

        # Refuse to continue without the engine's dependencies.
        if not ENGINE_AVAILABLE:
            self._log.error("%s: Failed to import jsgf, pyaudio and/or "
                            "sphinxwrapper. Are they installed?" % self)
            raise EngineError("Failed to import Pocket Sphinx engine "
                              "dependencies.")

        # Set the default engine configuration.
        # This can be changed later using the config property.
        # Assigning to the property (rather than to _config) runs
        # validate_config() on the default configuration.
        self._config = None
        self.config = EngineConfig

        # Set other variables
        # The decoder stays None until connect() is called.
        self._decoder = None
        # Audio buffers kept for re-processing with other searches.
        self._audio_buffers = []
        self.compiler = SphinxJSGFCompiler(self)
        self._recognition_observer_manager = SphinxRecObsManager(self)
        # Keyphrase -> threshold and keyphrase -> function mappings.
        self._keyphrase_thresholds = {}
        self._keyphrase_functions = {}
        self._training_session_active = False
        # Last hypothesis object of the default search.
        self._default_search_result = None

        # Timer-related members.
        self._timer_manager = SphinxTimerManager(0.02, self)

        # Set up keyphrase search names and valid search names for grammars.
        self._keyphrase_search_names = ["_key_phrases", "_wake_phrase"]
        self._valid_searches = set()

        # Recognising loop members.
        self._recorder = PyAudioRecorder(self.config)
        self._cancel_recognition_next_time = False
        self._recognising = False
        self._recognition_paused = False

    @property
    def config(self):
        """
        Python module/object containing engine configuration.

        Assigning a new value validates it with :meth:`validate_config`
        before it is stored.

        You will need to restart the engine with :meth:`disconnect` and
        :meth:`connect` if the configuration has been changed after
        :meth:`connect` has been called.

        :returns: config module/object
        """
        return self._config

    @config.setter
    def config(self, value):
        """ Validate *value* and store it as the engine configuration. """
        # Validate configuration object. This also sets defaults for any
        # options that are missing from it.
        self.validate_config(value)
        self._config = value

    @classmethod
    def validate_config(cls, engine_config):
        # Check configuration options and set defaults where appropriate.
        # Set a new decoder config if necessary.
        if not hasattr(engine_config, "DECODER_CONFIG"):
            setattr(engine_config, "DECODER_CONFIG",
                    get_decoder_config_object())
        options = [
            "LANGUAGE",
            "START_ASLEEP",
            "WAKE_PHRASE",
            "WAKE_PHRASE_THRESHOLD",
            "SLEEP_PHRASE",
            "SLEEP_PHRASE_THRESHOLD",
            "TRAINING_DATA_DIR",
            "TRANSCRIPT_NAME",
            "START_TRAINING_PHRASE",
            "START_TRAINING_PHRASE_THRESHOLD",
            "END_TRAINING_PHRASE",
            "END_TRAINING_PHRASE_THRESHOLD",
            "CHANNELS",
            "RATE",
            "SAMPLE_WIDTH",
            "FRAMES_PER_BUFFER",
        ]

        # Get default values and set them they are missing.
        for option in options:
            if hasattr(engine_config, option):
                continue

            default_value = getattr(EngineConfig, option)
            if "PHRASE" in option:
                # Disable missing phrases by default if using a language
                # other than English.
                if not engine_config.LANGUAGE.startswith("en"):
                    default_value = "" if option.endswith("PHRASE") else 0.0
            setattr(engine_config, option, default_value)

    def connect(self):
        """
        Set up the CMU Pocket Sphinx decoder.

        This method does nothing if the engine is already connected.
        """
        if self._decoder:
            return

        # Initialise a new decoder with the given configuration
        decoder_config = self._config.DECODER_CONFIG
        self._decoder = PocketSphinx(decoder_config)
        self._valid_searches.add(self._default_search_name)

        # Set up callback function wrappers
        def hypothesis(hyp):
            # Set default search result.
            self._default_search_result = hyp

            # Set speech to the hypothesis string or None if there isn't one
            speech = hyp.hypstr if hyp else None
            return self._hypothesis_callback(speech, False)

        def speech_start():
            # Reset the default search result and call the engine's callback
            # method.
            self._default_search_result = None
            return self._speech_start_callback(False)

        self._decoder.hypothesis_callback = hypothesis
        self._decoder.speech_start_callback = speech_start

        # Set up built-in keyphrases if they set. Catch and log any
        # UnknownWordErrors because all keyphrases are optional.
        def get_phrase_values(name):
            phrase_attr = name + "_PHRASE"
            threshold_attr = name + "_PHRASE_THRESHOLD"
            return (getattr(self.config, phrase_attr,
                            ""), getattr(self.config, threshold_attr, 0))

        def safe_set_keyphrase(name, func):
            phrase, threshold = get_phrase_values(name)
            if phrase and threshold:
                try:
                    self.set_keyphrase(phrase, threshold, func)
                except UnknownWordError as e:
                    self._log.error(e)

        # Set the wake phrase using set_kws_list directly because it uses a
        # different search.
        wake_phrase, wake_threshold = get_phrase_values("WAKE")
        if wake_phrase and wake_threshold:
            try:
                self._validate_words(wake_phrase.split(), "keyphrase")
                self._decoder.set_kws_list("_wake_phrase",
                                           {wake_phrase: wake_threshold})
            except UnknownWordError as e:
                self._log.error(e)

        # Set the other keyphrases using safe_set_keyphrase().
        safe_set_keyphrase("SLEEP", self.pause_recognition)
        safe_set_keyphrase("START_TRAINING", self.start_training_session)
        safe_set_keyphrase("END_TRAINING", self.end_training_session)

        # Set the PyAudioRecorder instance's config object.
        self._recorder.config = self.config

        # Start in sleep mode if requested.
        if self.config.START_ASLEEP:
            self.pause_recognition()
            self._log.warning("Starting in sleep mode as requested.")

    def _free_engine_resources(self):
        """
        Internal method for freeing the resources used by the engine.
        """
        # Stop the audio recorder if it is running.
        self._recognising = False
        self._recorder.stop()

        # Free the decoder and clear audio buffers.
        self._decoder = None
        self._audio_buffers = []

        # Reset other variables
        self._cancel_recognition_next_time = False
        self._training_session_active = False
        self._recognition_paused = False

        # Clear dictionaries and sets
        self._grammar_wrappers.clear()
        self._valid_searches.clear()
        self._keyphrase_thresholds.clear()
        self._keyphrase_functions.clear()

    def disconnect(self):
        """
        Deallocate the CMU Sphinx decoder and any other resources used by
        it.

        This method effectively unloads all loaded grammars and key
        phrases.
        """
        # Free resources if the decoder isn't currently being used to
        # recognise, otherwise stop the recognising loop, which will free
        # the resources safely.
        if not self.recognising:
            self._free_engine_resources()
        else:
            self._recognising = False
            self._recorder.stop()

    # -----------------------------------------------------------------------
    # Multiplexing timer methods.

    def create_timer(self, callback, interval, repeating=True):
        """
        Create and return a timer using the specified callback and repeat
        interval.

        A warning is logged if the engine is not recognising when the timer
        is created.

        **Note**: Timers will not run unless the engine is recognising
        audio. Normal threads can be used instead with no downsides.
        """
        is_recognising = self.recognising
        if not is_recognising:
            self._log.warning("Timers will not run unless the engine is "
                              "recognising audio.")

        # Timer creation itself is delegated to the parent class.
        parent = super(SphinxEngine, self)
        return parent.create_timer(callback, interval, repeating)

    # -----------------------------------------------------------------------
    # Methods for working with grammars.

    def check_valid_word(self, word):
        """
        Check if a word is in the current Sphinx pronunciation dictionary.

        :rtype: bool
        """
        if not self._decoder:
            self.connect()

        return bool(self._decoder.lookup_word(word.lower()))

    def _validate_words(self, words, search_type):
        unknown_words = []

        # Use 'set' to de-duplicate the 'words' list.
        for word in set(words):
            if not self.check_valid_word(word):
                unknown_words.append(word)

        if unknown_words:
            # Sort the word list before using it.
            unknown_words.sort()
            raise UnknownWordError(
                "%s used words not found in the pronunciation dictionary: "
                "%s" % (search_type, ", ".join(unknown_words)))

    def _build_grammar_wrapper(self, grammar):
        """
        Return a new ``GrammarWrapper`` for *grammar*, constructed with this
        engine and its recognition observer manager.
        """
        return GrammarWrapper(grammar, self,
                              self._recognition_observer_manager)

    def _set_grammar(self, wrapper, activate, partial=False):
        if not wrapper:
            return

        # Connect to the engine if it isn't connected already.
        self.connect()

        def activate_search_if_necessary():
            if activate:
                self._decoder.end_utterance()
                self._decoder.active_search = wrapper.search_name

        # Check if the wrapper's search name is valid.
        # Set the search (again) if necessary.
        valid_search = wrapper.search_name in self._valid_searches
        if valid_search and not wrapper.set_search:
            # wrapper.search_name is a valid search, so return.
            activate_search_if_necessary()
            return

        # Return early if 'partial' is True as an optimisation to avoid
        # recompiling grammars for every rule activation/deactivation.
        # Also return if the search doesn't need to be set.
        if partial or not wrapper.set_search:
            return

        # Compile and set the jsgf search.
        compiled = wrapper.compile_jsgf()
        self._log.debug(compiled)

        # Raise an error if there are no active public rules.
        if "public <root> = " not in compiled:
            raise EngineError("no public rules found in the grammar")

        # Set the JSGF search.
        self._decoder.end_utterance()
        self._decoder.set_jsgf_string(wrapper.search_name, compiled)
        activate_search_if_necessary()

        # Grammar search has been loaded, so set the wrapper's flag.
        wrapper.set_search = False

    def _unset_search(self, name):
        # Unset a Pocket Sphinx search with the given name.
        # Don't unset the default or keyphrase searches.
        default_search = self._default_search_name
        reserved = [default_search] + self._keyphrase_search_names
        if name in reserved:
            return

        # Unset the Pocket Sphinx search.
        if name in self._valid_searches:
            # Unfortunately, the C function for doing this (ps_unset_search)
            # is not exposed. Pocket Sphinx searches are pretty lighweight
            # however. This would only be an issue on hardware with limited
            # memory.

            # Remove the search from the valid searches set.
            self._valid_searches.remove(name)

        # Change to the default search to avoid possible segmentation faults
        # from Pocket Sphinx which crash Python.
        self._set_default_search()

    # TODO Add optional context parameter
    def set_keyphrase(self, keyphrase, threshold, func):
        """
        Add a keyphrase to listen for.

        Key phrases take precedence over grammars as they are processed first.
        They cannot be set for specific contexts (yet).

        :param keyphrase: keyphrase to add.
        :param threshold: keyphrase threshold value to use.
        :param func: function or method to call when the keyphrase is heard.
        :type keyphrase: str
        :type threshold: float
        :type func: callable
        :raises: UnknownWordError
        """
        # Check that all words in the keyphrase are in the pronunciation dictionary.
        # This can raise an UnknownWordError.
        self._validate_words(keyphrase.split(), "keyphrase")

        # Check that the threshold is a float.
        if not isinstance(threshold, float):
            raise TypeError("threshold must be a float, not %s" % threshold)

        # Add parameters to the relevant dictionaries.
        self._keyphrase_thresholds[keyphrase] = threshold
        self._keyphrase_functions[keyphrase] = func

        # Set the keyphrase search (again)
        self._decoder.end_utterance()
        self._decoder.set_kws_list("_key_phrases", self._keyphrase_thresholds)

    def unset_keyphrase(self, keyphrase):
        """
        Remove a set keyphrase so that the engine no longer listens for it.

        :param keyphrase: keyphrase to remove.
        :type keyphrase: str
        """
        # Remove parameters from the relevant dictionaries. Don't raise an error
        # if there is no such keyphrase.
        self._keyphrase_thresholds.pop(keyphrase, None)
        self._keyphrase_functions.pop(keyphrase, None)

        # Set the keyphrase search (again)
        self._decoder.end_utterance()
        self._decoder.set_kws_list("_key_phrases", self._keyphrase_thresholds)

    def _set_default_search(self):
        # Change the active search to the one used for processing speech as
        # it is heard.
        swap_to_wake_search = (self.recognition_paused
                               and self.config.WAKE_PHRASE
                               and self.config.WAKE_PHRASE_THRESHOLD)

        # Ensure we're not processing.
        self._decoder.end_utterance()
        if swap_to_wake_search:
            self._decoder.active_search = "_wake_phrase"
        else:
            self._decoder.active_search = self._default_search_name

    def _load_grammar(self, grammar):
        """ Load the given *grammar* and return a wrapper. """
        self._log.debug("Engine %s: loading grammar %s." %
                        (self, grammar.name))

        grammar.engine = self
        # Dependency checking.
        memo = []
        for r in grammar.rules:
            for d in r.dependencies(memo):
                grammar.add_dependency(d)

        wrapper = self._build_grammar_wrapper(grammar)

        # Check that the engine doesn't already have a grammar with the same
        # search name. This will include grammars with the same reference
        # name, e.g. "some grammar" and "some_grammar".
        if wrapper.search_name in self._valid_searches:
            message = "Failed to load grammar %s: multiple grammars with " \
                "the same name are not allowed" % grammar
            self._log.error(message)
            raise EngineError(message)

        # Attempt to set the grammar search.
        try:
            self._set_grammar(wrapper, False)
        except Exception as e:
            self._log.exception("Failed to load grammar %s: %s." %
                                (grammar, e))
            raise EngineError("Failed to load grammar %s: %s." % (grammar, e))

        # Set the grammar wrapper's search name as valid and return the
        # wrapper.
        self._valid_searches.add(wrapper.search_name)
        return wrapper

    def _unload_grammar(self, grammar, wrapper):
        try:
            # Unset the search names for the grammar.
            self._unset_search(wrapper.search_name)
        except Exception as e:
            self._log.exception("Failed to unload grammar %s: %s." %
                                (grammar, e))

    def activate_grammar(self, grammar):
        self._log.debug("Activating grammar %s." % grammar.name)

    def deactivate_grammar(self, grammar):
        self._log.debug("Deactivating grammar %s." % grammar.name)

    def activate_rule(self, rule, grammar):
        self._log.debug("Activating rule %s in grammar %s." %
                        (rule.name, grammar.name))
        wrapper = self._get_grammar_wrapper(grammar)
        if not wrapper:
            return
        try:
            wrapper.enable_rule(rule.name)
            self._set_grammar(wrapper, False, True)
        except Exception as e:
            self._log.exception("Failed to activate grammar %s: %s." %
                                (grammar, e))

    def deactivate_rule(self, rule, grammar):
        self._log.debug("Deactivating rule %s in grammar %s." %
                        (rule.name, grammar.name))
        wrapper = self._get_grammar_wrapper(grammar)
        if not wrapper:
            return
        try:
            wrapper.disable_rule(rule.name)
            self._set_grammar(wrapper, False, True)
        except Exception as e:
            self._log.exception("Failed to activate grammar %s: %s." %
                                (grammar, e))

    def update_list(self, lst, grammar):
        wrapper = self._get_grammar_wrapper(grammar)
        if not wrapper:
            return

        # Unfortunately there is no way to update lists for Pocket Sphinx
        # without reloading the grammar, so we'll update the list's JSGF
        # rule and reload.
        wrapper.update_list(lst)

        # Reload the grammar.
        try:
            self._set_grammar(wrapper, False)
        except Exception as e:
            self._log.exception("Failed to update list %s: %s." % (lst, e))

    def set_exclusiveness(self, grammar, exclusive):
        # Disable/enable each grammar.
        for g in self.grammars:
            if exclusive:
                g.disable()
            else:
                g.enable()

        # Enable the specified grammar if it was supposed to be exclusive.
        if exclusive:
            grammar.enable()

    # -----------------------------------------------------------------------
    # Miscellaneous methods.

    @property
    def recognising(self):
        """
        Whether the engine is currently recognising speech.

        To stop recognition, use :meth:`disconnect`.

        :rtype: bool
        """
        return self._recorder.recording or self._recognising

    @property
    def default_search_result(self):
        """
        The last hypothesis object of the default search.

        This is None if no result has been stored yet.

        This does not currently reach recognition observers because it is
        intended to be used for dictation results, which are currently
        disabled. Nevertheless this object can be useful sometimes.

        :returns: Sphinx Hypothesis object | None
        """
        return self._default_search_result

    @property
    def _default_search_name(self):
        """ Name of the default Pocket Sphinx search (always "_default"). """
        # The name of the Pocket Sphinx search used for processing speech as
        # it is heard.
        return "_default"

    def _get_best_hypothesis(self, hypotheses):
        """
        Take a list of speech hypotheses and return the most likely one.

        :type hypotheses: iterable
        :return: str | None
        """
        # Get all distinct, non-null hypotheses.
        distinct = tuple([h for h in set(hypotheses) if bool(h)])
        if not distinct:
            return None
        elif len(distinct) == 1:
            return distinct[0]  # only one choice

        # Decide between non-null hypotheses using a Pocket Sphinx search with
        # each hypothesis as a grammar rule.
        grammar = RootGrammar()
        grammar.language_name = self.language
        for i, hypothesis in enumerate(distinct):
            grammar.add_rule(PublicRule("rule%d" % i, Literal(hypothesis)))

        compiled = grammar.compile_grammar()
        name = "_temp"

        # Store the current search name.
        original = self._decoder.active_search

        # Note that there is no need to validate words in this case because
        # each literal in the _temp grammar came from a Pocket Sphinx
        # hypothesis.
        self._decoder.end_utterance()
        self._decoder.set_jsgf_string(name, compiled)
        self._decoder.active_search = name

        # Do the processing.
        hyp = self._decoder.batch_process(self._audio_buffers,
                                          use_callbacks=False)
        result = hyp.hypstr if hyp else None

        # Switch back to the previous search.
        self._decoder.end_utterance()  # just in case
        self._decoder.active_search = original
        return result

    def _speech_start_callback(self, mimicking):
        """
        Internal callback for the start of speech.

        :param mimicking: whether the speech is mimicked speech. Stored
            audio buffers are only trimmed if this is false.
        """
        # Get context info. Dragonfly has a handy static method for this:
        foreground = Window.get_foreground()

        # Call process_begin for all grammars so that any out of context
        # grammar will not be used.
        for wrapper in self._grammar_wrappers.values():
            wrapper.process_begin(foreground)

        if not mimicking:
            # Trim excess audio buffers from the start of the list. Keep a
            # maximum 1 second of silence before speech start was detected.
            # This should help increase the performance of batch
            # reprocessing later.
            seconds = 1
            frames_per_buffer = self.config.FRAMES_PER_BUFFER
            sample_rate = self.config.RATE
            keep = int(sample_rate / frames_per_buffer * seconds)
            self._audio_buffers = self._audio_buffers[-keep:]

        # Notify observers
        self._recognition_observer_manager.notify_begin()

    def _hypothesis_callback(self, speech, mimicking):
        """
        Internal Pocket Sphinx hypothesis callback method. Calls _process_hypothesis
        and does post-processing afterwards.
        :param speech: speech hypothesis
        :type speech: str | None
        :param mimicking:  whether to treat speech as mimicked speech.
        :rtype: bool
        """
        # Clear any recorded audio buffers.
        self._recorder.clear_buffers()

        # Process speech. We should get back a boolean for whether processing
        # occurred as well as the final speech hypothesis.
        processing_occurred, final_speech = self._process_hypotheses(
            speech, mimicking)

        # Notify observers of failure.
        if not processing_occurred:
            self._recognition_observer_manager.notify_failure()

        # Write the training data files if necessary.
        data_dir = self.config.TRAINING_DATA_DIR
        if not mimicking and data_dir and os.path.isdir(data_dir):
            # Use the default search's hypothesis if final_speech was nil.
            if not final_speech:
                final_speech = speech
            try:
                write_training_data(self.config, self._audio_buffers,
                                    final_speech)
            except Exception as e:
                self._log.exception("Failed to write training data: %s" % e)

        # Clear audio buffer list because utterance processing has finished.
        self._audio_buffers = []

        # Ensure that the correct search is used.
        self._set_default_search()

        # Return whether processing occurred in case this method was called
        # by mimic.
        return processing_occurred

    def _process_key_phrases(self, speech, mimicking):
        """
        Processing key phrase searches and return the matched keyphrase
        (if any).

        :type speech: str
        :param mimicking: whether to treat speech as mimicked speech.
        :rtype: str
        """
        # Return if speech is empty/null or if there are no key phrases set.
        if not (speech and self._keyphrase_thresholds):
            return ""  # no matches

        if not mimicking:
            # Reprocess using the key phrases search
            self._decoder.end_utterance()
            self._decoder.active_search = "_key_phrases"
            hyp = self._decoder.batch_process(self._audio_buffers,
                                              use_callbacks=False)

            # Get the hypothesis string.
            speech = hyp.hypstr if hyp else ""

            # Restore search to the default search.
            self._set_default_search()

            # Return if no key phrase matched.
            if not speech:
                return ""

            # Handle multiple matching key phrases. This appears to be a
            # quirk of how Pocket Sphinx 'kws' searches work. Get the best
            # match instead if this is the case.
            recognised_phrases = speech.split("  ")
            if len(recognised_phrases) > 1:
                # Remove trailing space from the last phrase.
                recognised_phrases[len(recognised_phrases) - 1].rstrip()
                speech = self._get_best_hypothesis(recognised_phrases)
            else:
                speech = speech.rstrip()  # remove trailing whitespace.

        # Notify observers if a keyphrase was matched.
        result = speech if speech in self._keyphrase_functions else ""
        if result:
            words = tuple(result.split())
            self._recognition_observer_manager.notify_recognition(words)

        # Call the registered function if there was a match and the function
        # is callable.
        func = self._keyphrase_functions.get(speech, None)
        if callable(func):
            try:
                func()
            except Exception as e:
                self._log.exception(
                    "Exception caught when executing the function for "
                    "keyphrase '%s': %s" % (speech, e))

        return result

    @classmethod
    def _generate_words_rules(cls, words, mimicking):
        # Convert words to Unicode, treat all uppercase words as dictation
        # words and other words as grammar words.
        # Minor note: this won't work for languages without capitalisation.
        result = []
        for word in words.split():
            if PY2 and isinstance(word, str):
                word = text_type(word, encoding="utf-8")
            if word.isupper() and mimicking:
                # Convert dictation words to lowercase for consistent
                # output.
                result.append((word.lower(), 1000000))
            else:
                result.append((word, 0))
        return tuple(result)

    def _process_hypotheses(self, speech, mimicking):
        """
        Internal method to process speech hypotheses. This should only be called
        from 'SphinxEngine._hypothesis_callback' because that method does important
        post processing.

        :param speech: speech
        :param mimicking: whether to treat speech as mimicked speech.
        :rtype: tuple
        """
        # Check key phrases search first.
        keyphrase = self._process_key_phrases(speech, mimicking)
        if keyphrase:
            # Keyphrase search matched.
            return True, keyphrase

        # Otherwise do grammar processing.
        processing_occurred = False
        hypotheses = {}

        # Collect each active grammar's GrammarWrapper.
        wrappers = [
            w for w in self._grammar_wrappers.values() if w.grammar_active
        ]

        # No grammar has been loaded.
        if not wrappers:
            # TODO What should we do here? Output formatted Dictation like DNS?
            return processing_occurred, speech

        # Batch process audio buffers for each active grammar. Store each
        # hypothesis.
        for wrapper in wrappers:
            if mimicking:
                # Just use 'speech' for everything if mimicking.
                hyp = speech
            else:
                # Switch to the search for this grammar and re-process the
                # audio.
                self._set_grammar(wrapper, True)
                hyp = self._decoder.batch_process(self._audio_buffers,
                                                  use_callbacks=False)
                if hyp:
                    hyp = hyp.hypstr

            # Set the hypothesis in the dictionary.
            hypotheses[wrapper.search_name] = hyp

        # Get the best hypothesis.
        speech = self._get_best_hypothesis(list(hypotheses.values()))
        if not speech:
            return processing_occurred, speech

        # Process speech using the first matching grammar.
        words_rules = self._generate_words_rules(speech, mimicking)
        for wrapper in wrappers:
            if hypotheses[wrapper.search_name] != speech:
                continue

            processing_occurred = wrapper.process_words(words_rules)
            if processing_occurred:
                break

        # Return whether processing occurred and the final speech hypothesis for
        # post processing.
        return processing_occurred, speech

    def process_buffer(self, buf):
        """
        Recognise speech from an audio buffer.

        This method is meant to be called in sequence for multiple audio
        buffers. It will do nothing if :meth:`connect` hasn't been called.

        :param buf: audio buffer
        :type buf: str
        """
        if not self._decoder:
            return

        # Cancel current recognition if it has been requested.
        if self._cancel_recognition_next_time:
            self._decoder.end_utterance()
            self._audio_buffers = []
            self._cancel_recognition_next_time = False

        # Keep a list of buffers for possible reprocessing using different Pocket
        # Sphinx searches later.
        self._audio_buffers.append(buf)

        # Call the timer callback if it is set.
        self.call_timer_callback()

        # Process audio.
        try:
            self._recognising = True
            self._decoder.process_audio(buf)
        finally:
            self._recognising = False

    def process_wave_file(self, path):
        """
        Recognise speech from a wave file and yield the recognition results.

        This method is a generator, so nothing is done (including the
        validation described below) until the result is iterated.

        The wave file is checked before processing. An error is raised if
        the path is not an existing file or if the WAV header values do not
        match those in the engine configuration: the file must use the same
        number of channels, sample width and sample rate as the
        ``CHANNELS``, ``SAMPLE_WIDTH`` and ``RATE`` configuration options.

        If the engine is not connected, :meth:`connect` is called first.
        If recognition is paused (sleep mode), this method will call
        :meth:`resume_recognition` without notifying observers.

        If the file is valid, :meth:`process_buffer` is then used to process
        the audio in chunks of ``FRAMES_PER_BUFFER`` frames.

        Multiple utterances are supported.

        :param path: wave file path
        :raises: IOError | OSError | ValueError
        :returns: recognition results
        :rtype: generator
        """
        if not self._decoder:
            self.connect()

        # This method's implementation has been adapted from the PyAudio
        # play wave example:
        # http://people.csail.mit.edu/hubert/pyaudio/#play-wave-example

        # Check that path is a valid file. The path is interpolated into
        # the message so the user can see which file was rejected.
        if not os.path.isfile(path):
            raise IOError(
                "'%s' is not a file. Please use a different file path."
                % path)

        # Get required audio configuration from the engine config.
        channels, sample_width, rate, chunk = (self.config.CHANNELS,
                                               self.config.SAMPLE_WIDTH,
                                               self.config.RATE,
                                               self.config.FRAMES_PER_BUFFER)

        # Make sure recognition is not paused.
        if self.recognition_paused:
            self.resume_recognition(notify=False)

        # Open the wave file. Use contextlib to make sure that the file is
        # closed whether errors are raised or not.
        # Also register a custom recognition observer for the duration.
        obs = WaveRecognitionObserver(self)
        with contextlib.closing(wave.open(path, "rb")) as wf, obs as obs:
            # Validate the wave file's header.
            if wf.getnchannels() != channels:
                message = ("WAV file '%s' should use %d channel(s), not %d!" %
                           (path, channels, wf.getnchannels()))
            elif wf.getsampwidth() != sample_width:
                message = ("WAV file '%s' should use sample width %d, not "
                           "%d!" % (path, sample_width, wf.getsampwidth()))
            elif wf.getframerate() != rate:
                message = ("WAV file '%s' should use sample rate %d, not "
                           "%d!" % (path, rate, wf.getframerate()))
            else:
                message = None

            if message:
                raise ValueError(message)

            # Use process_buffer to process each buffer. The "+ 1" covers a
            # final partial chunk; an empty read ends the loop early.
            for _ in range(0, int(wf.getnframes() / chunk) + 1):
                data = wf.readframes(chunk)
                if not data:
                    break

                self.process_buffer(data)

                # Get the results from the observer and reset it so that
                # the next utterance is reported separately.
                if obs.words:
                    yield obs.words
                    obs.words = ""

        # Log warnings if speech start or end weren't detected.
        if not obs.complete:
            self._log.warning("Speech start/end wasn't detected in the wave "
                              "file!")
            self._log.warning("Perhaps the Sphinx '-vad_prespeech' value "
                              "should be higher?")
            self._log.warning("Or maybe '-vad_startspeech' or "
                              "'-vad_postspeech' should be lower?")

    def recognise_forever(self):
        """
        Recognise speech from the recorder until recognition stops.

        The engine is connected first if necessary. Audio buffers from the
        recorder are then passed to :meth:`process_buffer` for as long as
        :attr:`recognising` is true; use :meth:`disconnect` to stop.

        While running, recognition can be paused and resumed with the
        sleep/wake key phrases or with :meth:`pause_recognition` and
        :meth:`resume_recognition`.

        Audio input is configured through the engine's ``CHANNELS``,
        ``RATE``, ``SAMPLE_WIDTH`` and ``FRAMES_PER_BUFFER`` configuration
        options.
        """
        if not self._decoder:
            self.connect()

        # Begin recording and discard any stale cancellation request.
        self._recorder.start()
        self._cancel_recognition_next_time = False

        # Main loop: drain whatever the recorder has buffered.
        while self.recognising:
            for audio in self._recorder.get_buffers():
                self.process_buffer(audio)

        # The loop has ended, so engine resources can now be released.
        self._free_engine_resources()

    def mimic(self, words):
        """
        Mimic a recognition of the given *words*.

        *words* may be a string or a list/tuple of words, which is joined
        with spaces. If recognition is paused and the words are the wake
        phrase, recognition is resumed and nothing else happens.

        :raises: MimicFailure if the hypothesis callback reports no match
        """
        phrase = " ".join(words) if isinstance(words, (list, tuple)) else words

        # While asleep, the wake phrase only wakes the engine up.
        if self.recognition_paused and phrase == self.config.WAKE_PHRASE:
            self.resume_recognition()
            return

        # Act as though Sphinx had just detected the start of speech, then
        # hand over the words as if they had been spoken.
        self._speech_start_callback(True)
        if not self._hypothesis_callback(phrase, True):
            raise MimicFailure("No matching rule found for words %s." % phrase)

    def mimic_phrases(self, *phrases):
        """
        Mimic a recognition of each of the given *phrases* in turn.

        Unlike :meth:`mimic`, this method takes any number of phrases
        rather than a list of words. A phrase equal to the wake phrase
        resumes recognition if the engine is paused at that point.

        :raises: MimicFailure if the hypothesis callback reports no match
        """
        # Act as though Sphinx had just detected the start of speech.
        self._speech_start_callback(True)

        wake_phrase = self.config.WAKE_PHRASE
        for spoken in phrases:
            waking = self.recognition_paused and spoken == wake_phrase
            if waking:
                self.resume_recognition()
            elif not self._hypothesis_callback(spoken, True):
                raise MimicFailure("No matching rule found for words %s." %
                                   spoken)

    def speak(self, text):
        """
        Print *text*; this engine does not implement text-to-speech.

        Two warnings explaining the fallback are logged before printing.
        """
        notices = ("text-to-speech is not implemented for this engine.",
                   "Printing text instead.")
        for notice in notices:
            self._log.warning(notice)
        print(text)

    def _get_language(self):
        return self.config.LANGUAGE

    # ----------------------------------------------------------------------
    # Training-related methods

    def write_transcript_files(self, fileids_path, transcription_path):
        """
        Create .fileids and .transcription files for the training data.

        The work is delegated to the module-level
        ``write_transcript_files`` function, which is given the engine
        configuration and both output paths.

        An error is raised if the ``TRAINING_DATA_DIR`` configuration
        option is not set to an existing directory.

        :param fileids_path: path to .fileids file to create.
        :param transcription_path: path to .transcription file to create.
        :type fileids_path: str
        :type transcription_path: str
        :raises: IOError | OSError
        """
        engine_config = self.config
        write_transcript_files(engine_config, fileids_path,
                               transcription_path)

    @property
    def training_session_active(self):
        """
        Whether the engine is currently in a training session.

        :rtype: bool
        """
        active = self._training_session_active
        return active

    def start_training_session(self):
        """
        Begin a training session.

        Recognition processing stops until :meth:`end_training_session` is
        called or the end training keyphrase is heard. A warning is logged
        if the ``TRAINING_DATA_DIR`` option is not an existing directory.
        Calling this while a session is already active changes nothing.
        """
        data_dir = self.config.TRAINING_DATA_DIR
        usable_dir = bool(data_dir) and os.path.isdir(data_dir)
        if not usable_dir:
            self._log.warning("Training data will not be recorded; '%s' is "
                              "not a directory" % data_dir)

        # Nothing more to do if a session is already running.
        if self._training_session_active:
            return

        self._log.info("Training session has started. No rule "
                       "actions will be processed. ")
        self._log.info("Say '%s' to end the session." %
                       self.config.END_TRAINING_PHRASE)
        self._training_session_active = True

    def end_training_session(self):
        """
        Finish the current training session, if there is one.

        Afterwards recognition processing is allowed again. Nothing happens
        when no session is in progress.
        """
        if not self._training_session_active:
            return

        for line in ("Ending training session.",
                     "Rule actions will now be processed normally again."):
            self._log.info(line)
        self._training_session_active = False

    # ----------------------------------------------------------------------
    # Recognition loop control methods
    # Stopping recognition loop is done using disconnect()

    @property
    def recognition_paused(self):
        """
        Whether recognition is paused, i.e. the engine is waiting for the
        wake phrase or for a call to :meth:`resume_recognition`.

        :rtype: bool
        """
        paused = self._recognition_paused
        return paused

    def pause_recognition(self):
        """
        Put the engine to sleep.

        Recognition stays paused until :meth:`resume_recognition` is called
        or the wake keyphrase is spoken. Does nothing if the engine has no
        decoder.
        """
        if not self._decoder:
            return

        self._recognition_paused = True

        # With the paused flag set, this selects the wake keyphrase search
        # when a wake keyphrase is configured.
        self._set_default_search()

        if not self.config.WAKE_PHRASE:
            for line in ("No wake phrase has been set.",
                         "Use engine.resume_recognition() to wake up."):
                self._log.warning(line)

        # Replacement decoder callback used while asleep: it only ever
        # reacts to the wake keyphrase.
        def hypothesis(hyp):
            # Recorded audio is of no use while asleep.
            self._recorder.clear_buffers()

            heard = hyp.hypstr if hyp else None
            wake_phrase = self.config.WAKE_PHRASE
            if heard and heard.strip() == wake_phrase.strip():
                self.resume_recognition()
            elif self.config.WAKE_PHRASE:
                self._log.debug("Didn't hear %s" % self.config.WAKE_PHRASE)

            # Drop the buffers kept for reprocessing as well.
            self._audio_buffers = []

        self._decoder.hypothesis_callback = hypothesis

    def resume_recognition(self, notify=True):
        """
        Wake the engine up so that it listens for grammar rules and key
        phrases again.

        Does nothing if the engine has no decoder.

        :param notify: whether recognition observers are notified with the
            words of the wake phrase (only done if the phrase is non-empty)
        """
        if not self._decoder:
            return

        self._recognition_paused = False

        # Tell observers that the wake phrase was "recognised".
        wake_words = tuple(self.config.WAKE_PHRASE.strip().split())
        if wake_words and notify:
            self._recognition_observer_manager.notify_recognition(wake_words)

        # Put the normal hypothesis callback back in place.
        def hypothesis(hyp):
            # Keep the hypothesis object as the default search result and
            # pass on its string, or None if there is no hypothesis.
            self._default_search_result = hyp
            text = hyp.hypstr if hyp else None
            return self._hypothesis_callback(text, False)

        self._decoder.hypothesis_callback = hypothesis

        # With the paused flag cleared, this selects the default search.
        self._set_default_search()

    def cancel_recognition(self):
        """
        Request that any recognition in progress be cancelled.

        Only a flag is set here; the cancellation itself happens just
        before the next audio buffer is processed.
        """
        cancel_requested = True
        self._cancel_recognition_next_time = cancel_requested
Ejemplo n.º 3
0
    def connect(self):
        """
        Create and configure the CMU Pocket Sphinx decoder.

        Calling this method while the engine is already connected has no
        effect.
        """
        if self._decoder:
            return

        # Create the decoder from the configured decoder settings and mark
        # the default search as usable.
        self._decoder = PocketSphinx(self._config.DECODER_CONFIG)
        self._valid_searches.add(self._default_search_name)

        # Decoder callbacks: thin wrappers around the engine's own handlers.
        def hypothesis(hyp):
            # Keep the hypothesis object as the default search result and
            # pass on its string, or None if there is no hypothesis.
            self._default_search_result = hyp
            text = hyp.hypstr if hyp else None
            return self._hypothesis_callback(text, False)

        def speech_start():
            # A new utterance invalidates the previous search result.
            self._default_search_result = None
            return self._speech_start_callback(False)

        self._decoder.hypothesis_callback = hypothesis
        self._decoder.speech_start_callback = speech_start

        # Built-in keyphrases are optional: each one is only set up if both
        # its phrase and its threshold are set, and unknown words are
        # logged instead of being raised.
        def phrase_and_threshold(prefix):
            phrase = getattr(self.config, prefix + "_PHRASE", "")
            threshold = getattr(self.config, prefix + "_PHRASE_THRESHOLD", 0)
            return phrase, threshold

        # The wake phrase lives in its own search, so it is registered
        # with set_kws_list directly.
        wake_phrase, wake_threshold = phrase_and_threshold("WAKE")
        if wake_phrase and wake_threshold:
            try:
                self._validate_words(wake_phrase.split(), "keyphrase")
                wake_list = {wake_phrase: wake_threshold}
                self._decoder.set_kws_list("_wake_phrase", wake_list)
            except UnknownWordError as err:
                self._log.error(err)

        # All other keyphrases go through set_keyphrase().
        handlers = (
            ("SLEEP", self.pause_recognition),
            ("START_TRAINING", self.start_training_session),
            ("END_TRAINING", self.end_training_session),
        )
        for prefix, handler in handlers:
            phrase, threshold = phrase_and_threshold(prefix)
            if not (phrase and threshold):
                continue
            try:
                self.set_keyphrase(phrase, threshold, handler)
            except UnknownWordError as err:
                self._log.error(err)

        # Hand the current configuration to the PyAudioRecorder instance.
        self._recorder.config = self.config

        # Honour the START_ASLEEP option.
        if self.config.START_ASLEEP:
            self.pause_recognition()
            self._log.warning("Starting in sleep mode as requested.")
Ejemplo n.º 4
0
class SphinxEngine(EngineBase):
    """ Speech recognition engine back-end for CMU Pocket Sphinx. """

    _name = "sphinx"
    DictationContainer = SphinxDictationContainer

    def __init__(self):
        """
        Initialise the engine's state without creating a decoder.

        The default ``EngineConfig`` is installed through the
        :attr:`config` property, so it is validated on assignment.

        :raises: EngineError if ``ENGINE_AVAILABLE`` is false, i.e. the
            engine's dependencies could not be imported
        """
        EngineBase.__init__(self)

        # Set up the engine logger
        logging.basicConfig()

        # Fail early, with a logged explanation, if the dependencies are
        # missing.
        if not ENGINE_AVAILABLE:
            self._log.error("%s: Failed to import jsgf, pyaudio and/or "
                            "sphinxwrapper. Are they installed?" % self)
            raise EngineError("Failed to import Pocket Sphinx engine "
                              "dependencies.")

        # Set the default engine configuration.
        # This can be changed later using the config property.
        # NOTE: _config must exist before the property setter is used.
        self._config = None
        self.config = EngineConfig

        # Set other variables
        # The decoder stays None until connect() is called.
        self._decoder = None
        self._audio_buffers = []
        self.compiler = SphinxJSGFCompiler()
        self._recognition_observer_manager = SphinxRecObsManager(self)
        # Keyphrase -> threshold and keyphrase -> function mappings.
        self._keyphrase_thresholds = {}
        self._keyphrase_functions = {}
        self._training_session_active = False
        self._default_search_result = None

        # Timer-related members.
        self._timer_manager = SphinxTimerManager(0.02, self)
        self._timer_callback = None
        self._timer_interval = None
        self._timer_next_time = 0

        # Set up keyphrase search names and valid search names for grammars.
        self._keyphrase_search_names = ["_key_phrases", "_wake_phrase"]
        self._valid_searches = set()

        # Recognising loop members.
        # The recorder is created with the configuration set above.
        self._recorder = PyAudioRecorder(self.config)
        self._cancel_recognition_next_time = False
        self._recognising = False
        self._recognition_paused = False

    @property
    def config(self):
        """
        The Python module/object that holds the engine configuration.

        If the configuration is changed after :meth:`connect` has been
        called, the engine needs to be restarted with :meth:`disconnect`
        and :meth:`connect`.

        :returns: config module/object
        """
        current = self._config
        return current

    @config.setter
    def config(self, value):
        # The object is validated before it is stored, so a value that
        # fails validation never replaces the current configuration.
        self.validate_config(value)
        self._config = value

    @classmethod
    def validate_config(cls, engine_config):
        """
        Fill in any options missing from *engine_config*.

        A missing ``DECODER_CONFIG`` is replaced by a new decoder config
        object. Every other missing option gets its value from
        ``EngineConfig``, except that missing phrase options are disabled
        (empty phrase, ``0.0`` threshold) when the configured language
        does not start with ``"en"``.

        :param engine_config: config module/object, modified in place
        """
        if not hasattr(engine_config, "DECODER_CONFIG"):
            engine_config.DECODER_CONFIG = get_decoder_config_object()

        option_names = (
            "LANGUAGE",

            "START_ASLEEP",
            "WAKE_PHRASE",
            "WAKE_PHRASE_THRESHOLD",
            "SLEEP_PHRASE",
            "SLEEP_PHRASE_THRESHOLD",

            "TRAINING_DATA_DIR",
            "TRANSCRIPT_NAME",
            "START_TRAINING_PHRASE",
            "START_TRAINING_PHRASE_THRESHOLD",
            "END_TRAINING_PHRASE",
            "END_TRAINING_PHRASE_THRESHOLD",

            "CHANNELS",
            "RATE",
            "SAMPLE_WIDTH",
            "FRAMES_PER_BUFFER",
        )

        for name in option_names:
            # Options that are already present are left alone.
            if hasattr(engine_config, name):
                continue

            value = getattr(EngineConfig, name)

            # The default phrases only apply to English, so for other
            # languages a missing phrase option is switched off instead.
            # LANGUAGE is listed first, so it is always set by this point.
            switch_off = ("PHRASE" in name and
                          not engine_config.LANGUAGE.startswith("en"))
            if switch_off:
                value = "" if name.endswith("PHRASE") else 0.0

            setattr(engine_config, name, value)

    def connect(self):
        """
        Create and configure the CMU Pocket Sphinx decoder.

        Calling this method while the engine is already connected has no
        effect.
        """
        if self._decoder:
            return

        # Create the decoder from the configured decoder settings and mark
        # the default search as usable.
        self._decoder = PocketSphinx(self._config.DECODER_CONFIG)
        self._valid_searches.add(self._default_search_name)

        # Decoder callbacks: thin wrappers around the engine's own handlers.
        def hypothesis(hyp):
            # Keep the hypothesis object as the default search result and
            # pass on its string, or None if there is no hypothesis.
            self._default_search_result = hyp
            text = hyp.hypstr if hyp else None
            return self._hypothesis_callback(text, False)

        def speech_start():
            # A new utterance invalidates the previous search result.
            self._default_search_result = None
            return self._speech_start_callback(False)

        self._decoder.hypothesis_callback = hypothesis
        self._decoder.speech_start_callback = speech_start

        # Built-in keyphrases are optional: each one is only set up if both
        # its phrase and its threshold are set, and unknown words are
        # logged instead of being raised.
        def phrase_and_threshold(prefix):
            phrase = getattr(self.config, prefix + "_PHRASE", "")
            threshold = getattr(self.config, prefix + "_PHRASE_THRESHOLD", 0)
            return phrase, threshold

        # The wake phrase lives in its own search, so it is registered
        # with set_kws_list directly.
        wake_phrase, wake_threshold = phrase_and_threshold("WAKE")
        if wake_phrase and wake_threshold:
            try:
                self._validate_words(wake_phrase.split(), "keyphrase")
                wake_list = {wake_phrase: wake_threshold}
                self._decoder.set_kws_list("_wake_phrase", wake_list)
            except UnknownWordError as err:
                self._log.error(err)

        # All other keyphrases go through set_keyphrase().
        handlers = (
            ("SLEEP", self.pause_recognition),
            ("START_TRAINING", self.start_training_session),
            ("END_TRAINING", self.end_training_session),
        )
        for prefix, handler in handlers:
            phrase, threshold = phrase_and_threshold(prefix)
            if not (phrase and threshold):
                continue
            try:
                self.set_keyphrase(phrase, threshold, handler)
            except UnknownWordError as err:
                self._log.error(err)

        # Hand the current configuration to the PyAudioRecorder instance.
        self._recorder.config = self.config

        # Honour the START_ASLEEP option.
        if self.config.START_ASLEEP:
            self.pause_recognition()
            self._log.warning("Starting in sleep mode as requested.")

    def _free_engine_resources(self):
        """
        Internal method for freeing the resources used by the engine.
        """
        # Stop the audio recorder if it is running.
        self._recognising = False
        self._recorder.stop()

        # Free the decoder and clear audio buffers.
        self._decoder = None
        self._audio_buffers = []

        # Reset other variables
        self._cancel_recognition_next_time = False
        self._training_session_active = False
        self._recognition_paused = False

        # Clear dictionaries and sets
        self._grammar_wrappers.clear()
        self._valid_searches.clear()
        self._keyphrase_thresholds.clear()
        self._keyphrase_functions.clear()

    def disconnect(self):
        """
        Release the CMU Sphinx decoder and the other resources used with
        it.

        As a result, all loaded grammars and key phrases are unloaded.
        """
        if self.recognising:
            # The decoder is in use, so only stop the recognising loop;
            # the loop frees the resources safely once it has ended.
            self._recognising = False
            self._recorder.stop()
        else:
            # Nothing is using the decoder, so free it immediately.
            self._free_engine_resources()

    # -----------------------------------------------------------------------
    # Multiplexing timer methods.

    def create_timer(self, callback, interval, repeating=True):
        """
        Create a timer for *callback* with the given repeat *interval* and
        return it.

        **Note**: Timers only run while the engine is recognising audio; a
        warning is logged if it currently is not. Normal threads can be
        used instead with no downsides.
        """
        if not self.recognising:
            self._log.warning("Timers will not run unless the engine is "
                              "recognising audio.")

        # Timer creation itself is left to the base class.
        base = super(SphinxEngine, self)
        return base.create_timer(callback, interval, repeating)

    def set_timer_callback(self, callback, sec):
        """
        Store *callback* and its interval *sec* for later timer calls.

        The next call time is set to the current time. This method is
        meant for the timer manager rather than for direct use.
        """
        self._timer_callback, self._timer_interval = callback, sec
        self._timer_next_time = time.time()

    def _call_timer_callback(self):
        if not (callable(self._timer_callback) or self._timer_interval):
            return

        now = time.time()
        if self._timer_next_time < now:
            self._timer_next_time = now + self._timer_interval
            self._timer_callback()

    # -----------------------------------------------------------------------
    # Methods for working with grammars.

    def check_valid_word(self, word):
        """
        Tell whether *word* is in the current Sphinx pronunciation
        dictionary.

        The result is always False while the engine has no decoder, i.e.
        before :meth:`connect` has been called.

        :rtype: bool
        """
        decoder = self._decoder
        if not decoder:
            return False

        return bool(decoder.lookup_word(word))

    def _validate_words(self, words, search_type):
        unknown_words = []

        # Use 'set' to de-duplicate the 'words' list.
        for word in set(words):
            if not self.check_valid_word(word):
                unknown_words.append(word)

        if unknown_words:
            # Sort the word list before using it.
            unknown_words.sort()
            raise UnknownWordError(
                "%s used words not found in the pronunciation dictionary: "
                "%s" % (search_type, ", ".join(unknown_words)))

    def _build_grammar_wrapper(self, grammar):
        """ Return a new ``GrammarWrapper`` for the given *grammar*. """
        observer_manager = self._recognition_observer_manager
        return GrammarWrapper(grammar, self, observer_manager)

    def _set_grammar(self, wrapper, activate, partial=False):
        if not wrapper:
            return

        # Connect to the engine if it isn't connected already.
        self.connect()

        def activate_search_if_necessary():
            if activate:
                self._decoder.end_utterance()
                self._decoder.active_search = wrapper.search_name

        # Check if the wrapper's search name is valid.
        # Set the search (again) if necessary.
        valid_search = wrapper.search_name in self._valid_searches
        if valid_search and not wrapper.set_search:
            # wrapper.search_name is a valid search, so return.
            activate_search_if_necessary()
            return

        # Return early if 'partial' is True as an optimisation to avoid
        # recompiling grammars for every rule activation/deactivation.
        # Also return if the search doesn't need to be set.
        if partial or not wrapper.set_search:
            return

        # Compile and set the jsgf search.
        compiled = wrapper.compile_jsgf()

        # Raise an error if there are no active public rules.
        if "public <root> = " not in compiled:
            raise EngineError("no public rules found in the grammar")

        # Check that each word in the grammar is in the pronunciation
        # dictionary. This will raise an UnknownWordError if one or more
        # aren't.
        self._validate_words(wrapper.grammar_words,
                             "grammar '%s'" % wrapper.grammar.name)

        # Set the JSGF search.
        self._decoder.end_utterance()
        self._decoder.set_jsgf_string(wrapper.search_name, compiled)
        activate_search_if_necessary()

        # Grammar search has been loaded, so set the wrapper's flag.
        wrapper.set_search = False

    def _unset_search(self, name):
        # Unset a Pocket Sphinx search with the given name.
        # Don't unset the default or keyphrase searches.
        default_search = self._default_search_name
        reserved = [default_search] + self._keyphrase_search_names
        if name in reserved:
            return

        # Unset the Pocket Sphinx search.
        if name in self._valid_searches:
            # Unfortunately, the C function for doing this (ps_unset_search)
            # is not exposed. Pocket Sphinx searches are pretty lighweight
            # however. This would only be an issue on hardware with limited
            # memory.

            # Remove the search from the valid searches set.
            self._valid_searches.remove(name)

        # Change to the default search to avoid possible segmentation faults
        # from Pocket Sphinx which crash Python.
        self._set_default_search()

    # TODO Add optional context parameter
    def set_keyphrase(self, keyphrase, threshold, func):
        """
        Register a keyphrase for the engine to listen for.

        Key phrases are processed before grammars and therefore take
        precedence over them. They cannot be set for specific contexts
        (yet).

        :param keyphrase: keyphrase to add.
        :param threshold: keyphrase threshold value to use.
        :param func: function or method to call when the keyphrase is heard.
        :type keyphrase: str
        :type threshold: float
        :type func: callable
        :raises: UnknownWordError
        """
        # All words of the keyphrase have to be in the pronunciation
        # dictionary. This can raise an UnknownWordError.
        keyphrase_words = keyphrase.split()
        self._validate_words(keyphrase_words, "keyphrase")

        # Only float thresholds are accepted.
        if not isinstance(threshold, float):
            raise TypeError("threshold must be a float, not %s" % threshold)

        # Remember the threshold and the function for this keyphrase.
        thresholds = self._keyphrase_thresholds
        thresholds[keyphrase] = threshold
        self._keyphrase_functions[keyphrase] = func

        # Rebuild the keyphrase search from all known thresholds.
        decoder = self._decoder
        decoder.end_utterance()
        decoder.set_kws_list("_key_phrases", thresholds)

    def unset_keyphrase(self, keyphrase):
        """
        Stop listening for a previously set keyphrase.

        A keyphrase that was never set is ignored.

        :param keyphrase: keyphrase to remove.
        :type keyphrase: str
        """
        # Forget the keyphrase without complaining if it is unknown.
        for mapping in (self._keyphrase_thresholds,
                        self._keyphrase_functions):
            mapping.pop(keyphrase, None)

        # Rebuild the keyphrase search from the remaining thresholds.
        decoder = self._decoder
        decoder.end_utterance()
        decoder.set_kws_list("_key_phrases", self._keyphrase_thresholds)

    def _set_default_search(self):
        # Change the active search to the one used for processing speech as
        # it is heard.
        swap_to_wake_search = (
            self.recognition_paused and self.config.WAKE_PHRASE and
            self.config.WAKE_PHRASE_THRESHOLD
        )

        # Ensure we're not processing.
        self._decoder.end_utterance()
        if swap_to_wake_search:
            self._decoder.active_search = "_wake_phrase"
        else:
            self._decoder.active_search = self._default_search_name

    def _load_grammar(self, grammar):
        """ Load the given *grammar* and return a wrapper. """
        self._log.debug("Engine %s: loading grammar %s."
                        % (self, grammar.name))

        grammar.engine = self
        # Dependency checking.
        memo = []
        for r in grammar.rules:
            for d in r.dependencies(memo):
                grammar.add_dependency(d)

        wrapper = self._build_grammar_wrapper(grammar)

        # Check that the engine doesn't already have a grammar with the same
        # search name. This will include grammars with the same reference
        # name, e.g. "some grammar" and "some_grammar".
        if wrapper.search_name in self._valid_searches:
            message = "Failed to load grammar %s: multiple grammars with " \
                "the same name are not allowed" % grammar
            self._log.error(message)
            raise EngineError(message)

        # Attempt to set the grammar search.
        try:
            self._set_grammar(wrapper, False)
        except UnknownWordError as e:
            # Unknown words should be logged as plain error messages, not
            # exception stack traces.
            self._log.error(e)
            raise EngineError("Failed to load grammar %s: %s."
                              % (grammar, e))
        except Exception as e:
            self._log.exception("Failed to load grammar %s: %s."
                                % (grammar, e))
            raise EngineError("Failed to load grammar %s: %s."
                              % (grammar, e))

        # Set the grammar wrapper's search name as valid and return the
        # wrapper.
        self._valid_searches.add(wrapper.search_name)
        return wrapper

    def _unload_grammar(self, grammar, wrapper):
        try:
            # Unset the search names for the grammar.
            self._unset_search(wrapper.search_name)
        except Exception as e:
            self._log.exception("Failed to unload grammar %s: %s."
                                % (grammar, e))

    def activate_grammar(self, grammar):
        """ Log the activation of *grammar*; nothing else is done here. """
        message = "Activating grammar %s." % grammar.name
        self._log.debug(message)

    def deactivate_grammar(self, grammar):
        """ Log the deactivation of *grammar*; nothing else is done here. """
        message = "Deactivating grammar %s." % grammar.name
        self._log.debug(message)

    def activate_rule(self, rule, grammar):
        """
        Enable *rule* in the wrapper of *grammar*.

        The grammar is then updated partially, i.e. without being
        recompiled. Nothing is done if the grammar has no wrapper. Errors
        are logged rather than raised.
        """
        self._log.debug("Activating rule %s in grammar %s."
                        % (rule.name, grammar.name))

        wrapper = self._get_grammar_wrapper(grammar)
        if wrapper:
            try:
                wrapper.enable_rule(rule.name)
                self._set_grammar(wrapper, False, True)
            except UnknownWordError as err:
                # Unknown words are logged without a stack trace.
                self._log.error(err)
            except Exception as err:
                self._log.exception("Failed to activate grammar %s: %s."
                                    % (grammar, err))

    def deactivate_rule(self, rule, grammar):
        """
        Disable *rule* in the wrapper for *grammar* and reload the grammar.

        Nothing happens beyond the debug log entry if no wrapper exists for
        the grammar. Errors are logged and not re-raised; unknown-word
        errors are logged without a stack trace.

        :param rule: rule object exposing a ``name`` attribute
        :param grammar: grammar that owns the rule
        """
        self._log.debug("Deactivating rule %s in grammar %s."
                        % (rule.name, grammar.name))
        wrapper = self._get_grammar_wrapper(grammar)
        if not wrapper:
            return
        try:
            wrapper.disable_rule(rule.name)
            self._set_grammar(wrapper, False, True)
        except UnknownWordError as e:
            self._log.error(e)
        except Exception as e:
            # The message names the operation that actually failed; it
            # previously claimed that *activating* the grammar had failed.
            self._log.exception("Failed to deactivate rule %s in grammar "
                                "%s: %s." % (rule.name, grammar, e))

    def update_list(self, lst, grammar):
        """
        Apply a changed list to its grammar and reload the grammar.

        Pocket Sphinx offers no way to update a list in place, so the
        wrapper's rule for the list is refreshed and the whole grammar is
        set again. Reload errors are logged and not re-raised.

        :param lst: the list that changed
        :param grammar: grammar that references the list
        """
        grammar_wrapper = self._get_grammar_wrapper(grammar)
        if grammar_wrapper:
            # Refresh the rule generated for the list, then reload.
            grammar_wrapper.update_list(lst)
            try:
                self._set_grammar(grammar_wrapper, False)
            except Exception as error:
                self._log.exception("Failed to update list %s: %s."
                                    % (lst, error))

    def set_exclusiveness(self, grammar, exclusive):
        """
        Make *grammar* exclusive, or lift exclusiveness for all grammars.

        When *exclusive* is true every grammar in ``self.grammars`` is
        disabled and then *grammar* is enabled. Otherwise every grammar is
        enabled.

        :param grammar: grammar to make exclusive
        :param exclusive: whether the grammar should be exclusive
        """
        if exclusive:
            for loaded in self.grammars:
                loaded.disable()
            # Only the requested grammar stays enabled.
            grammar.enable()
        else:
            for loaded in self.grammars:
                loaded.enable()

    # -----------------------------------------------------------------------
    # Miscellaneous methods.

    @property
    def recognising(self):
        """
        Whether the engine is currently recognising speech.

        This is true while the recorder is recording or while an audio
        buffer is being processed. Use :meth:`disconnect` to stop
        recognition.

        :rtype: bool
        """
        recording = self._recorder.recording
        if recording:
            return recording
        return self._recognising

    @property
    def default_search_result(self):
        """
        The most recent hypothesis object produced by the default search.

        Recognition observers do not currently receive this object: it is
        meant for dictation results, which are currently disabled. It can
        still be useful on its own.

        :returns: Sphinx Hypothesis object | None
        """
        latest = self._default_search_result
        return latest

    @property
    def _default_search_name(self):
        # The name of the Pocket Sphinx search used for processing speech as
        # it is heard.
        return "_default"

    def _get_best_hypothesis(self, hypotheses):
        """
        Take a list of speech hypotheses and return the most likely one.

        :type hypotheses: iterable
        :return: str | None
        """
        # Get all distinct, non-null hypotheses.
        distinct = tuple([h for h in set(hypotheses) if bool(h)])
        if not distinct:
            return None
        elif len(distinct) == 1:
            return distinct[0]  # only one choice

        # Decide between non-null hypotheses using a Pocket Sphinx search with
        # each hypothesis as a grammar rule.
        grammar = RootGrammar()
        grammar.language_name = self.language
        for i, hypothesis in enumerate(distinct):
            grammar.add_rule(PublicRule("rule%d" % i, Literal(hypothesis)))

        compiled = grammar.compile_grammar()
        name = "_temp"

        # Store the current search name.
        original = self._decoder.active_search

        # Note that there is no need to validate words in this case because
        # each literal in the _temp grammar came from a Pocket Sphinx
        # hypothesis.
        self._decoder.end_utterance()
        self._decoder.set_jsgf_string(name, compiled)
        self._decoder.active_search = name

        # Do the processing.
        hyp = self._decoder.batch_process(
            self._audio_buffers,
            use_callbacks=False
        )
        result = hyp.hypstr if hyp else None

        # Switch back to the previous search.
        self._decoder.end_utterance()  # just in case
        self._decoder.active_search = original
        return result

    def _speech_start_callback(self, mimicking):
        """
        Prepare grammars and observers for the start of an utterance.

        Every grammar wrapper is given the foreground window via
        ``process_begin`` so that out-of-context grammars are not used.
        For real (non-mimicked) speech the stored audio buffers are trimmed
        to roughly the last second, and observers are then notified.

        :param mimicking: whether the speech is mimicked rather than heard
        """
        # Context information comes from the current foreground window.
        foreground = Window.get_foreground()
        for grammar_wrapper in self._grammar_wrappers.values():
            grammar_wrapper.process_begin(foreground)

        if not mimicking:
            # Keep at most one second of audio from before speech start was
            # detected; shorter buffer lists make later batch reprocessing
            # cheaper.
            frames_per_buffer = self.config.FRAMES_PER_BUFFER
            sample_rate = self.config.RATE
            keep_seconds = 1
            keep = int(sample_rate / frames_per_buffer * keep_seconds)
            self._audio_buffers = self._audio_buffers[-keep:]

        # Tell observers that recognition has begun.
        self._recognition_observer_manager.notify_begin()

    def _hypothesis_callback(self, speech, mimicking):
        """
        Internal Pocket Sphinx hypothesis callback method. Calls _process_hypothesis
        and does post-processing afterwards.
        :param speech: speech hypothesis
        :type speech: str | None
        :param mimicking:  whether to treat speech as mimicked speech.
        :rtype: bool
        """
        # Clear any recorded audio buffers.
        self._recorder.clear_buffers()

        # Process speech. We should get back a boolean for whether processing
        # occurred as well as the final speech hypothesis.
        processing_occurred, final_speech = self._process_hypotheses(
            speech, mimicking
        )

        # Notify observers of failure.
        if not processing_occurred:
            self._recognition_observer_manager.notify_failure()

        # Write the training data files if necessary.
        data_dir = self.config.TRAINING_DATA_DIR
        if not mimicking and data_dir and os.path.isdir(data_dir):
            # Use the default search's hypothesis if final_speech was nil.
            if not final_speech:
                final_speech = speech
            try:
                write_training_data(self.config, self._audio_buffers,
                                    final_speech)
            except Exception as e:
                self._log.exception("Failed to write training data: %s" % e)

        # Clear audio buffer list because utterance processing has finished.
        self._audio_buffers = []

        # Ensure that the correct search is used.
        self._set_default_search()

        # Return whether processing occurred in case this method was called
        # by mimic.
        return processing_occurred

    def _process_key_phrases(self, speech, mimicking):
        """
        Processing key phrase searches and return the matched keyphrase
        (if any).

        :type speech: str
        :param mimicking: whether to treat speech as mimicked speech.
        :rtype: str
        """
        # Return if speech is empty/null or if there are no key phrases set.
        if not (speech and self._keyphrase_thresholds):
            return ""  # no matches

        if not mimicking:
            # Reprocess using the key phrases search
            self._decoder.end_utterance()
            self._decoder.active_search = "_key_phrases"
            hyp = self._decoder.batch_process(self._audio_buffers,
                                              use_callbacks=False)

            # Get the hypothesis string.
            speech = hyp.hypstr if hyp else ""

            # Restore search to the default search.
            self._set_default_search()

            # Return if no key phrase matched.
            if not speech:
                return ""

            # Handle multiple matching key phrases. This appears to be a
            # quirk of how Pocket Sphinx 'kws' searches work. Get the best
            # match instead if this is the case.
            recognised_phrases = speech.split("  ")
            if len(recognised_phrases) > 1:
                # Remove trailing space from the last phrase.
                recognised_phrases[len(recognised_phrases) - 1].rstrip()
                speech = self._get_best_hypothesis(recognised_phrases)
            else:
                speech = speech.rstrip()  # remove trailing whitespace.

        # Notify observers if a keyphrase was matched.
        result = speech if speech in self._keyphrase_functions else ""
        if result:
            words = tuple(result.split())
            self._recognition_observer_manager.notify_recognition(words)

        # Call the registered function if there was a match and the function
        # is callable.
        func = self._keyphrase_functions.get(speech, None)
        if callable(func):
            try:
                func()
            except Exception as e:
                self._log.exception(
                    "Exception caught when executing the function for "
                    "keyphrase '%s': %s" % (speech, e)
                )

        return result

    @classmethod
    def _generate_words_rules(cls, words, mimicking):
        """
        Split *words* into ``(word, rule id)`` pairs.

        Words are converted to Unicode on Python 2. When mimicking, words
        written entirely in uppercase are treated as dictation words: they
        are lowercased and paired with ``1000000``. Every other word is
        paired with ``0``. This cannot work for languages without
        capitalisation.

        :param words: whitespace-separated words
        :param mimicking: whether the words come from mimicked speech
        :rtype: tuple
        """
        def to_pair(token):
            if PY2 and isinstance(token, str):
                token = text_type(token, encoding="utf-8")
            if token.isupper() and mimicking:
                # Dictation words are lowercased for consistent output.
                return token.lower(), 1000000
            return token, 0

        return tuple(to_pair(token) for token in words.split())

    def _process_hypotheses(self, speech, mimicking):
        """
        Internal method to process speech hypotheses. This should only be called
        from 'SphinxEngine._hypothesis_callback' because that method does important
        post processing.

        :param speech: speech
        :param mimicking: whether to treat speech as mimicked speech.
        :rtype: tuple
        """
        # Check key phrases search first.
        keyphrase = self._process_key_phrases(speech, mimicking)
        if keyphrase:
            # Keyphrase search matched.
            return True, keyphrase

        # Otherwise do grammar processing.
        processing_occurred = False
        hypotheses = {}

        # Collect each active grammar's GrammarWrapper.
        wrappers = [w for w in self._grammar_wrappers.values()
                    if w.grammar_active]

        # No grammar has been loaded.
        if not wrappers:
            # TODO What should we do here? Output formatted Dictation like DNS?
            return processing_occurred, speech

        # Batch process audio buffers for each active grammar. Store each
        # hypothesis.
        for wrapper in wrappers:
            if mimicking:
                # Just use 'speech' for everything if mimicking.
                hyp = speech
            else:
                # Switch to the search for this grammar and re-process the
                # audio.
                self._set_grammar(wrapper, True)
                hyp = self._decoder.batch_process(
                    self._audio_buffers,
                    use_callbacks=False
                )
                if hyp:
                    hyp = hyp.hypstr

            # Set the hypothesis in the dictionary.
            hypotheses[wrapper.search_name] = hyp

        # Get the best hypothesis.
        speech = self._get_best_hypothesis(list(hypotheses.values()))
        if not speech:
            return processing_occurred, speech

        # Process speech using the first matching grammar.
        words_rules = self._generate_words_rules(speech, mimicking)
        for wrapper in wrappers:
            if hypotheses[wrapper.search_name] != speech:
                continue

            processing_occurred = wrapper.process_words(words_rules)
            if processing_occurred:
                break

        # Return whether processing occurred and the final speech hypothesis for
        # post processing.
        return processing_occurred, speech

    def process_buffer(self, buf):
        """
        Recognise speech from an audio buffer.

        Intended to be called repeatedly, once per consecutive audio
        buffer. Nothing happens if there is no decoder, i.e. if
        :meth:`connect` hasn't been called.

        :param buf: audio buffer
        :type buf: str
        """
        if not self._decoder:
            return

        # Honour a pending cancel_recognition() request before handling the
        # new buffer.
        if self._cancel_recognition_next_time:
            self._decoder.end_utterance()
            self._audio_buffers = []
            self._cancel_recognition_next_time = False

        # Buffers are retained so that the utterance can be re-processed
        # later with other Pocket Sphinx searches.
        self._audio_buffers.append(buf)

        # Give the timer callback a chance to run, if one is set.
        self._call_timer_callback()

        # Flag the engine as recognising for the duration of the decoder
        # call, whatever its outcome.
        self._recognising = True
        try:
            self._decoder.process_audio(buf)
        finally:
            self._recognising = False

    def process_wave_file(self, path):
        """
        Recognise speech from a wave file and return the recognition results.

        This method checks that the wave file is valid. It raises an error
        if the file doesn't exist, if it can't be read or if the WAV header
        values do not match those in the engine configuration.

        If recognition is paused (sleep mode), this method will call
        :meth:`resume_recognition`.

        The wave file must use the same sample width, sample rate and number
        of channels that the acoustic model uses.

        If the file is valid, :meth:`process_buffer` is then used to process
        the audio.

        Multiple utterances are supported.

        Note that this is a generator function: nothing, including the
        validation, runs until the returned generator is iterated.

        :param path: wave file path
        :raises: IOError | OSError | ValueError
        :returns: recognition results
        :rtype: generator
        """
        if not self._decoder:
            self.connect()

        # This method's implementation has been adapted from the PyAudio
        # play wave example:
        # http://people.csail.mit.edu/hubert/pyaudio/#play-wave-example

        # Check that path is a valid file. The path is interpolated into the
        # message; previously the literal "%s" placeholder was shown.
        if not os.path.isfile(path):
            raise IOError("'%s' is not a file. Please use a different file "
                          "path." % path)

        # Get required audio configuration from the engine config.
        channels, sample_width, rate, chunk = (
            self.config.CHANNELS,
            self.config.SAMPLE_WIDTH,
            self.config.RATE,
            self.config.FRAMES_PER_BUFFER
        )

        # Make sure recognition is not paused.
        if self.recognition_paused:
            self.resume_recognition(notify=False)

        # Open the wave file. Use contextlib to make sure that the file is
        # closed whether errors are raised or not.
        # Also register a custom recognition observer for the duration.
        obs = WaveRecognitionObserver(self)
        with contextlib.closing(wave.open(path, "rb")) as wf, obs as obs:
            # Validate the wave file's header.
            if wf.getnchannels() != channels:
                message = ("WAV file '%s' should use %d channel(s), not %d!"
                           % (path, channels, wf.getnchannels()))
            elif wf.getsampwidth() != sample_width:
                message = ("WAV file '%s' should use sample width %d, not "
                           "%d!" % (path, sample_width, wf.getsampwidth()))
            elif wf.getframerate() != rate:
                message = ("WAV file '%s' should use sample rate %d, not "
                           "%d!" % (path, rate, wf.getframerate()))
            else:
                message = None

            if message:
                raise ValueError(message)

            # Use process_buffer to process each buffer.
            for _ in range(0, int(wf.getnframes() / chunk) + 1):
                data = wf.readframes(chunk)
                if not data:
                    break

                self.process_buffer(data)

                # Get the results from the observer.
                if obs.words:
                    yield obs.words
                    obs.words = ""

        # Log warnings if speech start or end weren't detected.
        if not obs.complete:
            self._log.warning("Speech start/end wasn't detected in the wave "
                              "file!")
            self._log.warning("Perhaps the Sphinx '-vad_prespeech' value "
                              "should be higher?")
            self._log.warning("Or maybe '-vad_startspeech' or "
                              "'-vad_postspeech' should be lower?")

    def recognise_forever(self):
        """
        Recognise speech from the default recording device until
        :meth:`disconnect` is called.

        The engine connects first if there is no decoder yet. Recognition
        can be paused and resumed with the sleep/wake key phrases or with
        :meth:`pause_recognition` and :meth:`resume_recognition`.

        Audio input is configured through the engine's ``CHANNELS``,
        ``RATE``, ``SAMPLE_WIDTH`` and/or ``FRAMES_PER_BUFFER``
        configuration options.
        """
        if not self._decoder:
            self.connect()

        # Begin recording and discard any stale cancellation request.
        self._recorder.start()
        self._cancel_recognition_next_time = False

        # Feed recorded audio to the decoder for as long as the engine
        # reports that it is recognising.
        while self.recognising:
            for audio_buffer in self._recorder.get_buffers():
                self.process_buffer(audio_buffer)

        # Recognition has stopped, so release the engine's resources.
        self._free_engine_resources()

    def mimic(self, words):
        """
        Mimic a recognition of the given *words*.

        A list or tuple of words is joined with spaces. If recognition is
        paused and the words are the wake phrase, recognition is resumed
        and nothing else happens.

        :param words: words to mimic (str, list or tuple)
        :raises: MimicFailure
        """
        phrase = " ".join(words) if isinstance(words, (list, tuple)) else words

        waking_up = (self.recognition_paused
                     and phrase == self.config.WAKE_PHRASE)
        if waking_up:
            self.resume_recognition()
            return

        # Behave as though Sphinx had just detected the start of speech,
        # then handle the phrase as if it had been spoken.
        self._speech_start_callback(True)
        processed = self._hypothesis_callback(phrase, True)
        if not processed:
            raise MimicFailure("No matching rule found for words %s."
                               % phrase)

    def mimic_phrases(self, *phrases):
        """
        Mimic a recognition of each of the given *phrases* in order.

        Unlike :meth:`mimic`, this method takes a variable number of
        phrases rather than a list of words. A phrase equal to the wake
        phrase resumes recognition if it is paused and is then skipped.

        :param phrases: phrases to mimic
        :raises: MimicFailure
        """
        # Behave as though Sphinx had just detected the start of speech.
        self._speech_start_callback(True)

        # Handle each phrase as if it had been spoken.
        wake_phrase = self.config.WAKE_PHRASE
        for spoken in phrases:
            if self.recognition_paused and spoken == wake_phrase:
                self.resume_recognition()
            else:
                processed = self._hypothesis_callback(spoken, True)
                if not processed:
                    raise MimicFailure("No matching rule found for words %s."
                                       % spoken)

    def speak(self, text):
        """
        Print *text*, since this engine has no text-to-speech support.

        Two warnings are logged before the text is printed.

        :param text: text that would have been spoken
        """
        for warning in ("text-to-speech is not implemented for this "
                        "engine.",
                        "Printing text instead."):
            self._log.warning(warning)
        print(text)

    def _get_language(self):
        return self.config.LANGUAGE

    # ----------------------------------------------------------------------
    # Training-related methods

    def write_transcript_files(self, fileids_path, transcription_path):
        """
        Create .fileids and .transcription files for the files in the
        training data directory at the specified paths.

        An error is raised if the ``TRAINING_DATA_DIR`` configuration
        option is not set to an existing directory.

        :param fileids_path: path to .fileids file to create.
        :param transcription_path: path to .transcription file to create.
        :type fileids_path: str
        :type transcription_path: str
        :raises: IOError | OSError
        """
        # The work is done by the module-level function of the same name.
        engine_config = self.config
        write_transcript_files(engine_config, fileids_path,
                               transcription_path)

    @property
    def training_session_active(self):
        """
        Whether a training session is currently in progress.

        :rtype: bool
        """
        active = self._training_session_active
        return active

    def start_training_session(self):
        """
        Start the training session.

        Recognition processing stops until :meth:`end_training_session` is
        called or the end training keyphrase is heard. A warning is logged
        if ``TRAINING_DATA_DIR`` is not an existing directory. Starting a
        session that is already active changes nothing.
        """
        training_dir = self.config.TRAINING_DATA_DIR
        if not (training_dir and os.path.isdir(training_dir)):
            self._log.warning("Training data will not be recorded; '%s' is "
                              "not a directory" % training_dir)

        if self._training_session_active:
            return

        self._log.info("Training session has started. No rule "
                       "actions will be processed. ")
        self._log.info("Say '%s' to end the session."
                       % self.config.END_TRAINING_PHRASE)
        self._training_session_active = True

    def end_training_session(self):
        """
        End the training session if one is in progress, allowing
        recognition processing once again.

        Nothing happens if no session is active.
        """
        if not self._training_session_active:
            return

        self._log.info("Ending training session.")
        self._log.info("Rule actions will now be processed normally "
                       "again.")
        self._training_session_active = False

    # ----------------------------------------------------------------------
    # Recognition loop control methods
    # Stopping recognition loop is done using disconnect()

    @property
    def recognition_paused(self):
        """
        Whether the engine is paused, i.e. waiting for the wake phrase to
        be heard or for :meth:`resume_recognition` to be called.

        :rtype: bool
        """
        paused = self._recognition_paused
        return paused

    def pause_recognition(self):
        """
        Pause recognition until :meth:`resume_recognition` is called or the
        wake keyphrase is spoken.

        Nothing happens if there is no decoder. The decoder's hypothesis
        callback is replaced with one that only listens for the wake
        phrase.
        """
        if not self._decoder:
            return

        self._recognition_paused = True

        # Let the engine pick the search appropriate for the paused state.
        self._set_default_search()
        if not self.config.WAKE_PHRASE:
            self._log.warning("No wake phrase has been set.")
            self._log.warning("Use engine.resume_recognition() to wake up.")

        def on_hypothesis(hyp):
            # Temporary decoder callback used while paused.
            self._recorder.clear_buffers()
            heard = hyp.hypstr if hyp else None

            # Wake up only when the wake keyphrase itself was heard.
            if heard and heard.strip() == self.config.WAKE_PHRASE.strip():
                self.resume_recognition()
            elif self.config.WAKE_PHRASE:
                self._log.debug("Didn't hear %s" % self.config.WAKE_PHRASE)

            # The utterance is over, so its audio is no longer needed.
            self._audio_buffers = []

        # Install the temporary callback on the decoder.
        self._decoder.hypothesis_callback = on_hypothesis

    def resume_recognition(self, notify=True):
        """
        Resume listening for grammar rules and key phrases.

        Nothing happens if there is no decoder. Otherwise the normal
        hypothesis callback is reinstalled and the default search is set.

        :param notify: whether to notify recognition observers with the
            words of the wake phrase
        """
        if not self._decoder:
            return

        self._recognition_paused = False

        # Observers see the wake phrase as a recognition, unless it is
        # empty or notification was declined.
        wake_words = tuple(self.config.WAKE_PHRASE.strip().split())
        if wake_words and notify:
            self._recognition_observer_manager.notify_recognition(wake_words)

        def on_hypothesis(hyp):
            # Normal decoder callback: remember the default search result
            # and pass on the hypothesis string (None if there isn't one).
            self._default_search_result = hyp
            heard = hyp.hypstr if hyp else None
            return self._hypothesis_callback(heard, False)

        self._decoder.hypothesis_callback = on_hypothesis

        # Go back to the default search.
        self._set_default_search()

    def cancel_recognition(self):
        """
        Request cancellation of any recognition in progress.

        Only a flag is set here; it is acted upon before the next audio
        buffer is processed.
        """
        cancel_requested = True
        self._cancel_recognition_next_time = cancel_requested
Ejemplo n.º 5
0
    def connect(self):
        """
        Set up the CMU Pocket Sphinx decoder.

        A new decoder is created from the configuration's
        ``DECODER_CONFIG``, the hypothesis and speech start callbacks are
        installed, the configured keyphrases are registered and the
        recorder is given the engine configuration. If ``START_ASLEEP`` is
        set, recognition is paused straight away.

        This method does nothing if the engine is already connected.
        """
        if self._decoder:
            return

        # Initialise a new decoder with the given configuration and mark the
        # default search name as valid.
        decoder_config = self._config.DECODER_CONFIG
        self._decoder = PocketSphinx(decoder_config)
        self._valid_searches.add(self._default_search_name)

        # Set up callback function wrappers
        def hypothesis(hyp):
            # Set default search result.
            self._default_search_result = hyp

            # Set speech to the hypothesis string or None if there isn't one
            speech = hyp.hypstr if hyp else None
            return self._hypothesis_callback(speech, False)

        def speech_start():
            # Reset the default search result and call the engine's callback
            # method.
            self._default_search_result = None
            return self._speech_start_callback(False)

        self._decoder.hypothesis_callback = hypothesis
        self._decoder.speech_start_callback = speech_start

        # Set up built-in keyphrases if they are set. Catch and log any
        # UnknownWordErrors because all keyphrases are optional.
        def get_phrase_values(name):
            # Look up the "<name>_PHRASE" and "<name>_PHRASE_THRESHOLD"
            # config values, falling back to "" and 0 if they are missing.
            phrase_attr = name + "_PHRASE"
            threshold_attr = name + "_PHRASE_THRESHOLD"
            return (getattr(self.config, phrase_attr, ""),
                    getattr(self.config, threshold_attr, 0))

        def safe_set_keyphrase(name, func):
            # Register the keyphrase only if both its phrase and threshold
            # are set; unknown words are logged instead of raised.
            phrase, threshold = get_phrase_values(name)
            if phrase and threshold:
                try:
                    self.set_keyphrase(phrase, threshold, func)
                except UnknownWordError as e:
                    self._log.error(e)

        # Set the wake phrase using set_kws_list directly because it uses a
        # different search.
        wake_phrase, wake_threshold = get_phrase_values("WAKE")
        if wake_phrase and wake_threshold:
            try:
                self._validate_words(wake_phrase.split(), "keyphrase")
                self._decoder.set_kws_list("_wake_phrase", {
                    wake_phrase: wake_threshold
                })
            except UnknownWordError as e:
                self._log.error(e)

        # Set the other keyphrases using safe_set_keyphrase().
        safe_set_keyphrase("SLEEP", self.pause_recognition)
        safe_set_keyphrase("START_TRAINING",
                           self.start_training_session)
        safe_set_keyphrase("END_TRAINING",
                           self.end_training_session)

        # Set the PyAudioRecorder instance's config object.
        self._recorder.config = self.config

        # Start in sleep mode if requested.
        if self.config.START_ASLEEP:
            self.pause_recognition()
            self._log.warning("Starting in sleep mode as requested.")