def transcribe(decoder: pocketsphinx.Decoder,
               audio_data: bytes,
               nbest: int = 0) -> Dict[str, Any]:
    """Transcribes audio data to text."""
    # Process data as an entire utterance
    start_time = time.time()
    decoder.start_utt()
    decoder.process_raw(audio_data, False, True)
    decoder.end_utt()
    end_time = time.time()
    decode_seconds = end_time - start_time

    logger.debug(f"Decoded audio in {decode_seconds} second(s)")

    transcription = ""
    likelihood = 0.0

    hyp = decoder.hyp()
    if hyp is not None:
        likelihood = decoder.get_logmath().exp(hyp.prob)
        transcription = hyp.hypstr

    result = {
        "text": transcription,
        "transcribe_seconds": decode_seconds,
        "likelihood": likelihood,
    }

    if nbest > 0:
        # Include alternative transcriptions. nbest() yields an iterator,
        # so pair it with range() instead of slicing.
        result["nbest"] = {
            nb.hypstr: nb.score
            for nb, _ in zip(decoder.nbest(), range(nbest))
        }

    return result
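A minimal usage sketch for transcribe() (a sketch only: it assumes the default en-us model bundled with pocketsphinx, and 'audio.raw' is a hypothetical 16 kHz, 16-bit mono raw PCM file):

import os
import pocketsphinx
from pocketsphinx import get_model_path

model_path = get_model_path()
config = pocketsphinx.Decoder.default_config()
config.set_string('-hmm', os.path.join(model_path, 'en-us'))
config.set_string('-lm', os.path.join(model_path, 'en-us.lm.bin'))
config.set_string('-dict', os.path.join(model_path, 'cmudict-en-us.dict'))
decoder = pocketsphinx.Decoder(config)

with open('audio.raw', 'rb') as f:  # hypothetical raw PCM capture
    result = transcribe(decoder, f.read(), nbest=3)
print(result["text"], result["likelihood"])
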
class Wrapper:
    def __init__(self, **kwargs):
        signal.signal(signal.SIGINT, self.stop)

        model_path = get_model_path()

        # Expand environment variables in any string-valued settings
        kwargs = {
            key: os.path.expandvars(value) if isinstance(value, str) else value
            for key, value in kwargs.items()
        }

        nodename = kwargs.pop('nodename')
        grammar_file = kwargs.pop('grammar_file', None)
        grammar_rule = kwargs.pop('grammar_rule', None)
        grammar_name = kwargs.pop('grammar_name', None)

        # Not a pocketsphinx option; drop it before building the config
        kwargs.pop('esiaf_input_topic')

        # Accept 'dic' as an alias for the 'dict' option
        if kwargs.get('dic') is not None and kwargs.get('dict') is None:
            kwargs['dict'] = kwargs.pop('dic')

        # Fall back to the bundled US English model files
        if kwargs.get('hmm') is None:
            kwargs['hmm'] = os.path.join(model_path, 'en-us')

        if kwargs.get('lm') is None:
            kwargs['lm'] = os.path.join(model_path, 'en-us.lm.bin')

        if kwargs.get('dict') is None and kwargs.get('dic') is None:
            kwargs['dict'] = os.path.join(model_path, 'cmudict-en-us.dict')

        if kwargs.pop('verbose', False) is False:
            if sys.platform.startswith('win'):
                kwargs['logfn'] = 'nul'
            else:
                kwargs['logfn'] = '/dev/null'

        config = Decoder.default_config()

        print('pocketsphinx configuration:', kwargs)

        for key, value in kwargs.items():
            # Check bool before int: bool is a subclass of int in Python
            if isinstance(value, bool):
                config.set_boolean('-{}'.format(key), value)
            elif isinstance(value, int):
                config.set_int('-{}'.format(key), value)
            elif isinstance(value, float):
                config.set_float('-{}'.format(key), value)
            elif isinstance(value, str):
                config.set_string('-{}'.format(key), value)

        self.decoder = Decoder(config)

        # Optionally switch from the n-gram LM to a JSGF-derived grammar search
        if grammar_file and grammar_rule and grammar_name:
            jsgf = Jsgf(grammar_file)
            rule = jsgf.get_rule(grammar_name + '.' + grammar_rule)
            fsg = jsgf.build_fsg(rule, self.decoder.get_logmath(), 7.5)
            self.decoder.set_fsg(grammar_name, fsg)
            self.decoder.set_search(grammar_name)

        self.start = None
        self.finish = None

        self.speech_publisher = rospy.Publisher(nodename + '/SpeechRec',
                                                SpeechInfo,
                                                queue_size=10)

    def stop(self, *args, **kwargs):
        # SIGINT handler; the exception breaks out of the surrounding loop
        raise StopIteration

    def hypothesis(self):
        hyp = self.decoder.hyp()
        if hyp:
            return hyp.hypstr
        else:
            return ''

    def vad_finished_callback(self):
        self.decoder.end_utt()
        # hypothesis() already returns '' when there is no hypothesis
        result = self.hypothesis()
        rospy.loginfo("understood: '%s'", result)

        hypo = SpeechHypothesis()
        hypo.recognizedSpeech = result
        hypo.probability = 1.0

        timestamps = RecordingTimeStamps()
        timestamps.start = self.start
        timestamps.finish = self.finish

        speechInfo = SpeechInfo()
        speechInfo.hypotheses = [hypo]
        speechInfo.duration = timestamps

        self.speech_publisher.publish(speechInfo)

        self.start = None
        self.finish = None

    def add_audio_data(self, audio_data, recording_timestamps):
        _recording_timestamps = RecordingTimeStamps()
        msg_from_string(_recording_timestamps, recording_timestamps)
        rospy.loginfo('got audio!')
        if not self.start:
            # First chunk of a new utterance
            self.start = _recording_timestamps.start
            self.decoder.start_utt()
        self.finish = _recording_timestamps.finish
        raw_bytes = audio_data.tobytes()
        # no_search=False, full_utt=False: stream chunks within one utterance
        self.decoder.process_raw(raw_bytes, False, False)
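The JSGF-to-FSG switch in __init__ can also be done standalone. A minimal sketch, assuming `decoder` was built as above and 'commands.gram' is a hypothetical grammar file defining a rule named 'command' (7.5 is the language weight):

from pocketsphinx import Jsgf

jsgf = Jsgf('commands.gram')              # hypothetical JSGF file
rule = jsgf.get_rule('commands.command')  # '<grammar name>.<rule name>'
fsg = jsgf.build_fsg(rule, decoder.get_logmath(), 7.5)
decoder.set_fsg('commands', fsg)
decoder.set_search('commands')            # make the grammar the active search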
Example #3
class SphinxWrapper(object):
    '''
    Audio is fed to the decoder via the `process_raw(...)` method, which also
    updates the VAD status (whether voice was found in the signal). Before any
    audio is fed, the decoder must be instructed that a new utterance is
    expected (`startListening(...)`). Once the VAD reports that the speech
    segment has ended, call `stopListening(...)`; only then can the hypothesis
    of what was said be requested via `calculateHypothesis(...)`.
    '''

    #MODELDIR = "../models"
    #MODELDIR = "/home/as/src/speech/sphinx/lt-pocketsphinx-tutorial/impl/models"
    MODELDIR = "../../lt-pocketsphinx-tutorial/impl/models"

    decoder = None
    config = None
    previousVadState = 0
    currentVadState = 0

    def __init__(self):
        '''
        Constructor
        '''

    def prepareDecoder(self, pGramma):
        '''
        Entry point where the sphinx decoder is initialized or its grammar updated
        '''
        if self.decoder is None:
            self.config = self.createConfig(pGramma)
            self.decoder = Decoder(self.config)
        else:
            self.updateGrammar(pGramma)

    def createConfig(self, pGramma):
        '''
        Create a configuration with the acoustic model path, grammar and dictionary
        '''
        print("[createConfig]+++")
        config = Decoder.default_config()
        config.set_string('-hmm', os.path.join(self.MODELDIR, 'hmm/lt.cd_cont_200/'))
        config.set_string('-fsg', os.path.join("../resource/", pGramma + '.fsg'))
        #config.set_string('-jsgf', os.path.join("../resource/", pGramma+'.gram'))
        config.set_string('-dict', os.path.join("../resource/", 'service.dict'))
        print("[createConfig]---")
        return config

    def updateGrammar(self, pGramma):
        '''
        Update the decoder language model from an FSG file
        '''
        print("[updateGrammar]+++" + pGramma)
        logmath = self.decoder.get_logmath()
        fsg = sphinxbase.FsgModel(
            os.path.join("../resource/", pGramma + '.fsg'), logmath, 7.5)
        self.decoder.set_fsg("default", fsg)
        self.decoder.set_search("default")
        print("[updateGrammar]---")

    def startListening(self):
        """
        Instruct the decoder that a new utterance is expected
        """
        # Newer pocketsphinx bindings take no utterance id argument
        self.decoder.start_utt()


    def stopListening(self):
        """
        Instruct the decoder that no further utterance data is expected
        """
        self.decoder.end_utt()


    def process_raw(self, data):
        """
        Feed the decoder with raw audio data, then refresh the VAD state
        """
        self.decoder.process_raw(data, False, False)
        self.previousVadState = self.currentVadState
        self.currentVadState = self.decoder.get_vad_state()

    def calculateHypothesis(self):
        return self.decoder.hyp()

    def calculateVadState(self):
        return self.decoder.get_vad_state()

    def isVoiceStarted(self):
        '''
        silence -> speech transition
        '''
        return self.currentVadState and not self.previousVadState

    def isVoiceEnded(self):
        '''
        speech -> silence transition
        '''
        return not self.currentVadState and self.previousVadState
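
A sketch of the drive loop the class docstring describes (assumptions: `audio_source` is a hypothetical stand-in for any 16 kHz, 16-bit mono PCM source, and 'service' is a hypothetical grammar name resolving to ../resource/service.fsg):

wrapper = SphinxWrapper()
wrapper.prepareDecoder('service')
wrapper.startListening()
while True:
    chunk = audio_source.read(1024)  # hypothetical audio source
    wrapper.process_raw(chunk)
    if wrapper.isVoiceEnded():
        wrapper.stopListening()
        hyp = wrapper.calculateHypothesis()
        print(hyp.hypstr if hyp else '')
        wrapper.startListening()     # re-arm for the next utterance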