Example #1

import os

# In some pocketsphinx-python versions Jsgf must be imported from sphinxbase.
from pocketsphinx import DefaultConfig, Decoder, Jsgf, get_model_path

# print_segments() and retrieve_segments() are project helpers assumed to be
# defined elsewhere in the source module.
def retrieve_scores(word):
    filename = word + '.wav'
    grammarname = word + '-align.jsgf'
    model_path = get_model_path()

    # Initialize the config values
    config = DefaultConfig()
    config.set_boolean('-verbose', False)
    config.set_string('-hmm', os.path.join(model_path, 'en-us'))
    # Disable the default language model; the JSGF grammar set below drives
    # the search instead.
    config.set_boolean('-lm', False)
    config.set_string('-dict', 'phonemes.dict.txt')
    # Keep the backtrace so per-segment scores can be read back afterwards.
    config.set_boolean('-backtrace', True)
    config.set_boolean('-bestpath', False)
    config.set_boolean('-fsgusefiller', False)

    decoder = Decoder(config)

    # Switch the search to the JSGF grammar. Looking up the rule first acts
    # as a sanity check that the grammar defines a rule for this word.
    jsgf = Jsgf(grammarname)
    rule = jsgf.get_rule('forcing.' + word)
    decoder.set_jsgf_file('grammar', grammarname)
    decoder.set_search('grammar')

    stream = open(filename, 'rb')
    utt_started = False
    scores = []
    decoder.start_utt()

    while True:
        buf = stream.read(1024)
        if buf:
            decoder.process_raw(buf, False, False)
            in_speech = decoder.get_in_speech()
            if in_speech and not utt_started:
                utt_started = True
            if not in_speech and utt_started:
                decoder.end_utt()
                hyp = decoder.hyp()
                if hyp is not None:
                    print('hyp: %s' % (hyp.best_score))
                print_segments(decoder)
                scores = retrieve_segments(decoder)
                decoder.start_utt()
                utt_started = False
        else:
            break

    decoder.end_utt()
    stream.close()
    print('scores:', scores)

    return scores
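
A minimal driver for this function might look like the sketch below; 'hello.wav', 'hello-align.jsgf', and the 'forcing.hello' rule are hypothetical stand-ins for whatever word is being force-aligned.

# Hypothetical usage: align hello.wav against hello-align.jsgf and
# collect the per-segment scores.
if __name__ == '__main__':
    scores = retrieve_scores('hello')
    for segment in scores:
        print(segment)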
Example #2

import queue
import subprocess
import time

import pyttsx3
import sphinxbase
from pocketsphinx import Decoder

# get_decoder_config() is a project helper assumed to be defined elsewhere
# in the source module.
class VoiceService(object):
    audio_device = None
    buffer_size = 2048
    sampling_rate = 16000

    def __init__(self):
        config = get_decoder_config()
        self.decoder = Decoder(config)

        self.speech = pyttsx3.init()

        self.audio = sphinxbase.Ad(self.audio_device, self.sampling_rate)
        self.buffer = bytearray(self.buffer_size)

        self.default_search = self.decoder.get_search()
        self.in_speech = False
        self.max_history = 100
        self.phrases = []
        self.prompts = {}

        self.next_prompt_id = 1

        self.current_prompt = None
        self.prompt_queue = queue.Queue()

    def create_prompt(self,
                      message=None,
                      message_url=None,
                      search="enable",
                      timeout=15):
        """
        Create a new prompt and add it to the queue.

        Currently, only one type of prompt is supported. We play a message,
        then wait for someone to say a specific word (the search word) within
        the allotted amount of time.

        The status of the prompt can be retrieved by calling get_prompt with
        the appropriate id.

        timeout: prompt timeout in seconds, expected to be either None or numeric.
        """
        if timeout is not None:
            # Be forgiving of caller who may have passed timeout as a string.
            timeout = float(timeout)

        prompt = {
            "created_time": time.time(),
            "detected": False,
            "detected_time": None,
            "id": self.get_next_prompt_id(),
            "message": message,
            "message_url": message_url,
            "search": search,
            "search_started": False,
            "search_started_time": None,
            "played": False,
            "played_time": None,
            "timeout": timeout,
            "timed_out": False
        }
        self.prompts[str(prompt['id'])] = prompt
        self.prompt_queue.put(prompt)
        return prompt

    def get_next_prompt_id(self):
        """
        Get a unique ID for a prompt.
        """
        tmp = self.next_prompt_id
        self.next_prompt_id += 1
        return tmp

    def get_phrases(self):
        """
        Get the history of detected phrases.
        """
        return self.phrases

    def get_prompt(self, prompt_id):
        """
        Get information about a prompt.
        """
        return self.prompts[str(prompt_id)]

    def get_status(self):
        """
        Get the system status.
        """
        status = {
            "current_prompt": self.current_prompt,
            "in_speech": self.decoder.get_in_speech(),
            "queue_length": self.prompt_queue.qsize(),
            "search": self.decoder.get_search()
        }
        return status

    def play_prompt(self, prompt):
        prompt['played_time'] = time.time()

        if prompt.get("message_url", None) is not None:
            cmd = ["mplayer", "-ao", "pulse", prompt['message_url']]
            subprocess.call(cmd)
        elif prompt.get("message", None) is not None:
            self.speech.say(prompt['message'])
            self.speech.runAndWait()

        prompt['played'] = True

    def process_hypothesis(self, hypothesis):
        print("SPEECH {}".format(hypothesis.hypstr))

        phrase = {
            "search": self.decoder.get_search(),
            "time": time.time(),
            "text": hypothesis.hypstr
        }
        self.phrases.append(phrase)
        # Trim history to the most recent max_history phrases.
        del self.phrases[:-self.max_history]

    def run_next_prompt(self):
        if self.prompt_queue.empty():
            self.create_prompt(None, search="paradrop", timeout=None)

        self.current_prompt = self.prompt_queue.get_nowait()
        self.decoder.set_search(self.current_prompt['search'])

        self.audio.stop_recording()
        self.play_prompt(self.current_prompt)
        self.audio.start_recording()

        self.current_prompt['search_started_time'] = time.time()
        self.current_prompt['search_started'] = True

    def detect_timeout(self):
        """
        Check if the current prompt has timed out.
        """
        if self.current_prompt is None:
            # No active prompt to timeout.
            return False

        if self.decoder.get_in_speech():
            # Defer timeout if decoder reports that speech is in progress.  A
            # person may be speaking the target phrase currently.
            return False

        if self.current_prompt['timeout'] is None:
            # If timeout is None, then only timeout when there is another item
            # in the queue.
            return not self.prompt_queue.empty()
        else:
            diff = time.time() - self.current_prompt['search_started_time']
            return diff >= self.current_prompt['timeout']

    def run(self):
        self.decoder.set_keyphrase("activate", "activate")
        self.decoder.set_keyphrase("allow", "allow")
        self.decoder.set_keyphrase("enable", "enable")
        self.decoder.set_keyphrase("paradrop", "para drop")

        self.audio.start_recording()
        while True:
            if self.current_prompt is None:
                self.run_next_prompt()
                self.decoder.start_utt()

            self.audio.readinto(self.buffer)
            self.decoder.process_raw(self.buffer, False, False)

            # Falling edge: speech just ended, so finalize the utterance.
            if self.in_speech and not self.decoder.get_in_speech():
                self.decoder.end_utt()

                hypothesis = self.decoder.hyp()
                if hypothesis is not None:
                    self.process_hypothesis(hypothesis)
                    self.current_prompt['detected'] = True
                    self.current_prompt['detected_time'] = time.time()
                    self.current_prompt = None
                else:
                    self.decoder.start_utt()

            if self.detect_timeout():
                self.decoder.end_utt()
                self.current_prompt['timed_out'] = True
                self.current_prompt = None

            self.in_speech = self.decoder.get_in_speech()
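
One hypothetical way to drive this service is to run the recognition loop in a background thread and queue prompts from the main thread; the polling interval and prompt wording below are illustrative only.

import threading

# Hypothetical usage: start the blocking run() loop in the background,
# queue a prompt, then poll its status until it resolves.
service = VoiceService()
threading.Thread(target=service.run, daemon=True).start()

prompt = service.create_prompt(message="Say 'enable' to continue.",
                               search="enable", timeout=15)
while True:
    status = service.get_prompt(prompt['id'])
    if status['detected'] or status['timed_out']:
        break
    time.sleep(0.5)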
Example #3

import os

import sphinxbase
from pocketsphinx import Decoder
class SphinxWrapper(object):
    '''
    Audio is fed to the decoder through the `process_raw(...)` method, which
    also updates the VAD state (whether voice was found in the signal).
    Before audio is fed in, the decoder must be told that a new utterance is
    expected (`startListening(...)`). Once the VAD reports that the speech
    segment has ended, call `stopListening(...)`; only then can a hypothesis
    of what was said be requested via `calculateHypothesis(...)`.
    '''

    #MODELDIR = "../models"
    #MODELDIR = "/home/as/src/speech/sphinx/lt-pocketsphinx-tutorial/impl/models"
    MODELDIR = "../../lt-pocketsphinx-tutorial/impl/models"

    decoder = None
    config = None
    previousVadState = 0
    currentVadState = 0

    def __init__(self):
        '''
        Constructor
        '''

    def prepareDecoder(self, pGramma):
        '''
        Entry point where sphinx decoder is initialized or grammar updated
        '''
        if self.decoder is None:
            self.config = self.createConfig(pGramma)
            self.decoder = Decoder(self.config)
        else:
            self.updateGrammar(pGramma)

    def createConfig(self, pGramma):
        '''
        Create configuration with acoustic model path, grammar and dictionary
        '''
        print("[createConfig]+++")
        config = Decoder.default_config()
        config.set_string('-hmm', os.path.join(self.MODELDIR, 'hmm/lt.cd_cont_200/'))
        config.set_string('-fsg', os.path.join("../resource/", pGramma + '.fsg'))
        #config.set_string('-jsgf', os.path.join("../resource/", pGramma+'.gram'))
        config.set_string('-dict', os.path.join("../resource/", 'service.dict'))
        print("[createConfig]---")
        return config

    def updateGrammar(self, pGramma):
        '''
        Update decoder language model from fsg file
        '''
        print("[updateGrammar]+++" + pGramma)
        logmath = self.decoder.get_logmath()
        fsg = sphinxbase.FsgModel(os.path.join("../resource/", pGramma + '.fsg'), logmath, 7.5)
        self.decoder.set_fsg("default", fsg)
        self.decoder.set_search("default")
        print("[updateGrammar]---")

    def startListening(self):
        """
        Instruct decoder that a new utterance should be expected
        """
        self.decoder.start_utt()


    def stopListening(self):
        """
        Instruct decoder that an utterance is not expected any more
        """
        self.decoder.end_utt()


    def process_raw(self, data):
        """
        Feed decoder with raw audio data, then refresh the VAD state
        """
        #print("process_raw...\n")
        self.decoder.process_raw(data, False, False)
        self.previousVadState = self.currentVadState
        self.currentVadState = self.decoder.get_vad_state()
        #print("process_raw", self.currentVadState and True, self.previousVadState and True)

    def calculateHypothesis(self):
        return self.decoder.hyp()

    def calculateVadState(self):
        return self.decoder.get_vad_state()

    def isVoiceStarted(self):
        '''
        silence -> speech transition,
        '''
        return self.currentVadState and not self.previousVadState

    def isVoiceEnded(self):
        '''
        speech -> silence transition,
        '''
        return not self.currentVadState and self.previousVadState
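
The protocol described in the class docstring, as a hypothetical sketch; read_audio_chunk() is a stand-in for whatever audio source the project uses.

# Hypothetical usage: VAD-driven listen/decode loop.
wrapper = SphinxWrapper()
wrapper.prepareDecoder('service')   # expects ../resource/service.fsg

wrapper.startListening()
while True:
    wrapper.process_raw(read_audio_chunk())
    if wrapper.isVoiceEnded():
        wrapper.stopListening()
        hyp = wrapper.calculateHypothesis()
        if hyp is not None:
            print(hyp.hypstr)
        wrapper.startListening()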
Example #4

import os
import signal
import sys

import rospy
from pocketsphinx import Decoder, Jsgf, get_model_path

# SpeechInfo, SpeechHypothesis, RecordingTimeStamps and msg_from_string are
# assumed to come from the surrounding ROS project (esiaf message types).
class Wrapper():
    def __init__(self, **kwargs):
        signal.signal(signal.SIGINT, self.stop)

        model_path = get_model_path()

        # Expand environment variables in any string-valued options.
        kwargs = {
            x: os.path.expandvars(kwargs[x])
            if type(kwargs[x]) is str else kwargs[x]
            for x in kwargs
        }

        nodename = kwargs.pop('nodename')
        grammar_file = kwargs.pop('grammar_file', None)
        grammar_rule = kwargs.pop('grammar_rule', None)
        grammar_name = kwargs.pop('grammar_name', None)

        kwargs.pop('esiaf_input_topic')

        if kwargs.get('dic') is not None and kwargs.get('dict') is None:
            kwargs['dict'] = kwargs.pop('dic')

        if kwargs.get('hmm') is None:
            kwargs['hmm'] = os.path.join(model_path, 'en-us')

        if kwargs.get('lm') is None:
            kwargs['lm'] = os.path.join(model_path, 'en-us.lm.bin')

        if kwargs.get('dict') is None and kwargs.get('dic') is None:
            kwargs['dict'] = os.path.join(model_path, 'cmudict-en-us.dict')

        if kwargs.pop('verbose', False) is False:
            if sys.platform.startswith('win'):
                kwargs['logfn'] = 'nul'
            else:
                kwargs['logfn'] = '/dev/null'

        config = Decoder.default_config()

        print(kwargs)

        for key, value in kwargs.items():
            if isinstance(value, bool):
                config.set_boolean('-{}'.format(key), value)
            elif isinstance(value, int):
                config.set_int('-{}'.format(key), value)
            elif isinstance(value, float):
                config.set_float('-{}'.format(key), value)
            elif isinstance(value, str):
                config.set_string('-{}'.format(key), value)

        self.decoder = Decoder(config)

        if grammar_file and grammar_rule and grammar_name:
            jsgf = Jsgf(grammar_file)
            rule = jsgf.get_rule(grammar_name + '.' + grammar_rule)
            fsg = jsgf.build_fsg(rule, self.decoder.get_logmath(), 7.5)
            self.decoder.set_fsg(grammar_name, fsg)
            self.decoder.set_search(grammar_name)

        self.start = None
        self.finish = None

        self.speech_publisher = rospy.Publisher(nodename + '/' + 'SpeechRec',
                                                SpeechInfo,
                                                queue_size=10)

    def stop(self, *args, **kwargs):
        # Registered as the SIGINT handler in __init__.
        raise StopIteration

    def hypothesis(self):
        hyp = self.decoder.hyp()
        if hyp:
            return hyp.hypstr
        else:
            return ''

    def vad_finished_callback(self):
        self.decoder.end_utt()
        result = ''
        if self.decoder.hyp():
            result = self.hypothesis()
        rospy.loginfo('understood: \'' + str(result) + '\'')

        hypo = SpeechHypothesis()
        hypo.recognizedSpeech = result
        hypo.probability = 1.0

        timestamps = RecordingTimeStamps()
        timestamps.start = self.start
        timestamps.finish = self.finish

        speechInfo = SpeechInfo()
        speechInfo.hypotheses = [hypo]
        speechInfo.duration = timestamps

        self.speech_publisher.publish(speechInfo)

        self.start = None
        self.finish = None

    def add_audio_data(self, audio_data, recording_timestamps):
        _recording_timestamps = RecordingTimeStamps()
        msg_from_string(_recording_timestamps, recording_timestamps)
        rospy.loginfo('got audio!')
        if not self.start:
            self.start = _recording_timestamps.start
            self.decoder.start_utt()
        self.finish = _recording_timestamps.finish
        raw_audio = audio_data.tobytes()
        self.decoder.process_raw(raw_audio, False, False)
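
The intended call pattern, as a hypothetical sketch: feed audio chunks while speech is active, then signal end-of-utterance so a hypothesis is published. incoming_audio() and its chunk/timestamp values are stand-ins for what the esiaf pipeline would deliver.

# Hypothetical usage inside an initialized ROS node.
rospy.init_node('sphinx_wrapper_demo')
wrapper = Wrapper(nodename='sphinx', esiaf_input_topic='/audio')
for chunk, stamps in incoming_audio():  # assumed audio source
    wrapper.add_audio_data(chunk, stamps)
wrapper.vad_finished_callback()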