Example #1
def audio2phoneme(audio_file):
    wave_read = wave.open(audio_file, 'rb')
    length = wave_read.getnframes()/wave_read.getframerate()
    wave_read.close()

    # Decode streaming data.
    decoder = Decoder(config)

    buf = bytearray(1024)
    with open(audio_file, 'rb') as f:
        decoder.start_utt()
        while True:
            n = f.readinto(buf)
            if not n:
                break
            # Feed only the bytes actually read; a short final read would
            # otherwise pass stale data from the previous chunk.
            decoder.process_raw(buf[:n], False, False)
        decoder.end_utt()

    nframes = decoder.n_frames()


    phonemes = []
    offset = None
    for seg in decoder.seg():
        if offset is None:
            offset = seg.start_frame
        start_frame = seg.start_frame - offset
        end_frame = seg.end_frame - offset
        phonemes.append((
            seg.word, start_frame/nframes*length, end_frame/nframes*length))

    return phonemes
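The function above relies on a `config` object defined elsewhere in the module. A minimal sketch of what it might look like, assuming the bundled US English model from the pocketsphinx Python package, with `-allphone` set so that `seg()` yields phonemes rather than words:

import os
from pocketsphinx import Decoder, get_model_path

# Hypothetical config (not part of the original excerpt): phoneme decoding
# with the packaged en-us acoustic model and phonetic language model.
config = Decoder.default_config()
config.set_string('-hmm', os.path.join(get_model_path(), 'en-us'))
config.set_string('-allphone', os.path.join(get_model_path(), 'en-us-phone.lm.bin'))
config.set_string('-logfn', os.devnull)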
Example #2
class LocalRecognizer(object):
    def __init__(self, sample_rate=16000, lang="en-us", key_phrase="mycroft"):
        self.lang = lang
        self.key_phrase = key_phrase
        self.sample_rate = sample_rate
        self.configure()

    def configure(self):
        config = Decoder.default_config()
        config.set_string('-hmm', os.path.join(BASEDIR, 'model', self.lang,
                                               'hmm'))
        config.set_string('-dict', os.path.join(BASEDIR, 'model', self.lang,
                                                'mycroft-en-us.dict'))
        config.set_string('-keyphrase', self.key_phrase)
        config.set_float('-kws_threshold', 1e-45)
        config.set_float('-samprate', self.sample_rate)
        config.set_int('-nfft', 2048)
        config.set_string('-logfn', '/dev/null')
        self.decoder = Decoder(config)

    def transcribe(self, byte_data, metrics=None):
        start = time.time()
        self.decoder.start_utt()
        self.decoder.process_raw(byte_data, False, False)
        self.decoder.end_utt()
        if metrics:
            metrics.timer("mycroft.stt.local.time_s", time.time() - start)
        return self.decoder.hyp()

    def is_recognized(self, byte_data, metrics):
        hyp = self.transcribe(byte_data, metrics)
        return hyp and self.key_phrase in hyp.hypstr.lower()

    def found_wake_word(self, hypothesis):
        return hypothesis and self.key_phrase in hypothesis.hypstr.lower()
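One hypothetical way to drive this class, assuming raw 16 kHz, 16-bit mono PCM captured with PyAudio (the capture code is not part of the original):

import pyaudio

# Sketch: record roughly three seconds of audio and test it for the wake word.
recognizer = LocalRecognizer()
pa = pyaudio.PyAudio()
stream = pa.open(format=pyaudio.paInt16, channels=1,
                 rate=recognizer.sample_rate, input=True,
                 frames_per_buffer=1024)
audio = stream.read(recognizer.sample_rate * 3)  # ~3 s worth of frames
stream.close()
print(recognizer.is_recognized(audio, metrics=None))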
Example #3
    def speech_recog(self, model):
        # Create a decoder with certain model
        config = Decoder.default_config()
        config.set_string('-hmm', '/usr/local/share/pocketsphinx/model/en-us/en-us')
        config.set_int('-ds', 2)
        config.set_int('-topn', 3)
        config.set_int('-maxwpf', 5)
        #config.set_string('-kws', MODELDIR + model + '.txt')
        config.set_string('-lm', MODELDIR + model + '.lm')
        config.set_string('-dict', MODELDIR + model + '.dict')
        decoder = Decoder(config)

        decoder.start_utt()
        recog_text = ''

        with self.stream_in as stream:
            audio_generator = stream.generator()
            for content in audio_generator:
                decoder.process_raw(content, False, False)
                hyp = decoder.hyp()
                if hyp and hyp.hypstr != '':
                    recog_text += hyp.hypstr
                    if len(recog_text) > 1:
                        decoder.end_utt()
                        logging.info("recog text: %s", recog_text)
                        return recog_text
        decoder.end_utt()
        return recog_text
Example #4
class LocalRecognizer(object):
    def __init__(self, key_phrase, phonemes, threshold, sample_rate=16000,
                 lang="en-us"):
        self.lang = lang
        self.key_phrase = key_phrase
        self.sample_rate = sample_rate
        self.threshold = threshold
        self.phonemes = phonemes
        dict_name = self.create_dict(key_phrase, phonemes)
        self.decoder = Decoder(self.create_config(dict_name))

    def create_dict(self, key_phrase, phonemes):
        (fd, file_name) = tempfile.mkstemp()
        words = key_phrase.split()
        phoneme_groups = phonemes.split('.')
        with os.fdopen(fd, 'w') as f:
            for word, phoneme in zip(words, phoneme_groups):
                f.write(word + ' ' + phoneme + '\n')
        return file_name

    def create_config(self, dict_name):
        config = Decoder.default_config()
        config.set_string('-hmm', os.path.join(BASEDIR, 'model', self.lang,
                                               'hmm'))
        config.set_string('-dict', dict_name)
        config.set_string('-keyphrase', self.key_phrase)
        config.set_float('-kws_threshold', float(self.threshold))
        config.set_float('-samprate', self.sample_rate)
        config.set_int('-nfft', 2048)
        config.set_string('-logfn', '/dev/null')
        return config

    def transcribe(self, byte_data, metrics=None):
        start = time.time()
        self.decoder.start_utt()
        self.decoder.process_raw(byte_data, False, False)
        self.decoder.end_utt()
        if metrics:
            metrics.timer("mycroft.stt.local.time_s", time.time() - start)
        return self.decoder.hyp()

    def is_recognized(self, byte_data, metrics):
        hyp = self.transcribe(byte_data, metrics)
        return hyp and self.key_phrase in hyp.hypstr.lower()

    def found_wake_word(self, hypothesis):
        return hypothesis and self.key_phrase in hypothesis.hypstr.lower()
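Hypothetical usage of this variant (the phoneme string and threshold below are illustrative, not from the original source): `create_dict()` expects one '.'-separated group of CMUdict-style phonemes per word in the key phrase:

# Sketch only: phonemes and threshold are illustrative values.
recognizer = LocalRecognizer('hey mycroft', 'HH EY . M AY K R AO F T',
                             threshold=1e-90)
hyp = recognizer.transcribe(pcm_bytes)  # pcm_bytes: raw 16 kHz 16-bit mono PCM
print(recognizer.found_wake_word(hyp))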
Example #5
def main():
    abspath = os.path.dirname(os.path.abspath(__file__))
    abspath = os.path.join(abspath, '..')

    model_dir = os.path.join(abspath, 'model')

    hmm = os.path.join(model_dir, HMM)
    lm = os.path.join(model_dir, LM)
    dic = os.path.join(model_dir, DIC)

    config = Decoder.default_config()
    config.set_string('-hmm', hmm)
    config.set_string('-lm', lm)
    config.set_string('-dict', dic)
    config.set_string('-logfn', '/dev/null')
    decoder = Decoder(config)

    p = pyaudio.PyAudio()
    stream = p.open(format=pyaudio.paInt16, channels=1, rate=16000, input=True, frames_per_buffer=BUFFER)
    stream.start_stream()
    in_speech_bf = True
    decoder.start_utt()
    while True:
        buf = stream.read(BUFFER)
        if buf:
            decoder.process_raw(buf, False, False)
            if decoder.get_in_speech():
                sys.stdout.write('.')
                sys.stdout.flush()
            if decoder.get_in_speech() == in_speech_bf:
                continue

            in_speech_bf = decoder.get_in_speech()
            if in_speech_bf:
                continue

            decoder.end_utt()
            try:
                if decoder.hyp().hypstr != '':
                    print('You said:', decoder.hyp().hypstr)
            except AttributeError:
                pass
            decoder.start_utt()
        else:
            break
    decoder.end_utt()
    hyp = decoder.hyp()
    print('An error occurred; last hypothesis:', hyp.hypstr if hyp else '(none)')
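The constants this script assumes are defined at module level; hypothetical values consistent with the usual pocketsphinx model layout:

# Illustrative values only, not from the original source.
HMM = 'en-us'               # acoustic model directory under ./model
LM = 'en-us.lm.bin'         # language model
DIC = 'cmudict-en-us.dict'  # pronunciation dictionary
BUFFER = 1024               # frames read from the stream per iteration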
Example #6
  def speech_recog(self, model):

    # Create a decoder with certain model
    config = Decoder.default_config()
    config.set_string('-hmm', '/usr/local/share/pocketsphinx/model/en-us/en-us')
    config.set_int('-ds', 2)
    config.set_int('-topn', 3)
    config.set_int('-maxwpf', 5)
    #config.set_string('-kws', MODELDIR + model + '.txt')
    config.set_string('-lm', MODELDIR + model + '.lm')
    config.set_string('-dict', MODELDIR + model + '.dict')
    decoder = Decoder(config)

    decoder.start_utt()
    tstamp = time.time()
    recog_text = ''

    while len(recog_text) < 1:
      try:
        buf = self.stream_in.read(CHUNK_SIZE)
        logging.info("actual voice")
        decoder.process_raw(buf, False, False)
        hyp = decoder.hyp()
        if hyp and hyp.hypstr != '':
          recog_text += hyp.hypstr
          print("text: " + hyp.hypstr)
          tstamp = time.time()
          tstamp = time.time()
      except IOError as ex:
        if ex.args[1] != pyaudio.paInputOverflowed:
          raise
        buf = b'\x00' * CHUNK_SIZE  # silence fill on input overflow
        logging.info("input overflowed, substituting silence")
      except AttributeError:
        pass

    decoder.end_utt()

    logging.info("recog text: " + recog_text)
    return recog_text
Example #7
if len(sys.argv) > 1:
    stream = open(sys.argv[1], "rb")
else:
    p = pyaudio.PyAudio()
    stream = p.open(format=pyaudio.paInt16, channels=1, rate=16000, input=True, frames_per_buffer=1024)
    stream.start_stream()


print('start...')

while True:
    buf = stream.read(1024)
    if buf:
        decoder.process_raw(buf, False, False)
    else:
        break

    hypothesis = decoder.hyp()
    if hypothesis:
        print('\nhypothesis: %s, score: %d' % (hypothesis.hypstr, hypothesis.best_score))
        print([(seg.word, seg.prob, seg.start_frame, seg.end_frame) for seg in decoder.seg()])
        print("Detected keyword, restarting search")
        os.system('mpg123 ' + os.path.join(script_dir, 'hi.mp3'))

        print('restart...')
        decoder.end_utt()
        decoder.start_utt()
        print('ok')
        # break

stream.close()
Example #8
    def detect(self):
        # create decoders on the fly
        if not self.decoders:
            self.decoders = []

            for id, phrase in self.config['triggers'].items():
                config = Decoder.default_config()

                # set recognition model to US
                config.set_string('-hmm',
                                  os.path.join(get_model_path(), 'en-us'))
                config.set_string(
                    '-dict',
                    os.path.join(get_model_path(), 'cmudict-en-us.dict'))

                # specify recognition key phrase
                config.set_string('-keyphrase', phrase)
                config.set_float('-kws_threshold', 1e-5)

                # hide the VERY verbose logging information
                # if not self.config['debug']:
                config.set_string('-logfn', '/dev/null')

                decoder = Decoder(config)
                decoder.id = id

                self.decoders.append(decoder)

        events.fire('detection_started')

        # start decoding
        for decoder in self.decoders:
            decoder.start_utt()

        pcm = alsaaudio.PCM(alsaaudio.PCM_CAPTURE, alsaaudio.PCM_NORMAL,
                            self.config['device'])
        pcm.setchannels(1)
        pcm.setrate(16000)
        pcm.setformat(alsaaudio.PCM_FORMAT_S16_LE)
        pcm.setperiodsize(1024)

        phrase = None
        triggered = False
        while not triggered:
            _, buffer = pcm.read()

            for decoder in self.decoders:
                decoder.process_raw(buffer, False, False)
                triggered = decoder.hyp() is not None

                if triggered:
                    phrase = decoder.id
                    break

        pcm.close()
        pcm = None

        for decoder in self.decoders:
            decoder.end_utt()

        events.fire('detection_fullfilled', id=phrase)
Example #9
class PocketsphinxTrigger(BaseTrigger):

    type = triggers.TYPES.VOICE

    AUDIO_CHUNK_SIZE = 1024
    AUDIO_RATE = 16000

    _capture = None

    def __init__(self, config, trigger_callback, capture):
        super(PocketsphinxTrigger, self).__init__(config, trigger_callback,
                                                  'pocketsphinx')

        self._capture = capture

        self._enabled_lock = threading.Event()
        # self._disabled_sync_lock = threading.Event()
        self._decoder = None

    def setup(self):
        # PocketSphinx configuration
        ps_config = Decoder.default_config()

        # Set recognition model to US
        ps_config.set_string('-hmm', os.path.join(get_model_path(), 'en-us'))
        ps_config.set_string(
            '-dict', os.path.join(get_model_path(), 'cmudict-en-us.dict'))

        # Specify recognition key phrase
        ps_config.set_string('-keyphrase', self._tconfig['phrase'])
        ps_config.set_float('-kws_threshold',
                            float(self._tconfig['threshold']))

        # Hide the VERY verbose logging information when not in debug
        if logging.getLogger('alexapi').getEffectiveLevel() != logging.DEBUG:

            null_path = '/dev/null'
            if platform.system() == 'Windows':
                null_path = 'nul'

            ps_config.set_string('-logfn', null_path)

        # Process audio chunk by chunk. On keyword detected perform action and restart search
        self._decoder = Decoder(ps_config)

    def run(self):
        thread = threading.Thread(target=self.thread, args=())
        thread.setDaemon(True)
        thread.start()

    def thread(self):
        while True:
            self._enabled_lock.wait()

            self._capture.handle_init(self.AUDIO_RATE, self.AUDIO_CHUNK_SIZE)

            self._decoder.start_utt()

            triggered = False
            while not triggered:

                if not self._enabled_lock.isSet():
                    break

                # Read from microphone
                data = self._capture.handle_read()

                # Detect if keyword/trigger word was said
                self._decoder.process_raw(data, False, False)

                triggered = self._decoder.hyp() is not None

            self._capture.handle_release()

            self._decoder.end_utt()

            if triggered:
                self._trigger_callback(self)

    def enable(self):
        self._enabled_lock.set()

    def disable(self):
        self._enabled_lock.clear()
Example #10
class PocketsphinxTrigger(BaseTrigger):

    type = triggers.TYPES.VOICE

    def __init__(self, config, trigger_callback):
        super(PocketsphinxTrigger, self).__init__(config, trigger_callback,
                                                  'pocketsphinx')

        self._enabled_lock = threading.Event()
        self._disabled_sync_lock = threading.Event()
        self._decoder = None

    def setup(self):
        # PocketSphinx configuration
        ps_config = Decoder.default_config()

        # Set recognition model to US
        ps_config.set_string(
            '-hmm', os.path.join(get_model_path(), self._tconfig['language']))
        ps_config.set_string(
            '-dict', os.path.join(get_model_path(),
                                  self._tconfig['dictionary']))

        # Specify recognition key phrase
        #ps_config.set_string('-keyphrase', self._tconfig['phrase'])
        #ps_config.set_float('-kws_threshold', float(self._tconfig['threshold']))

        ### Multiple Hotwords
        #ps_config.set_string('-inmic', 'yes')
        ps_config.set_string('-kws', '/opt/AlexaPi/src/keyphrase.list')

        # Hide the VERY verbose logging information when not in debug
        if logging.getLogger('alexapi').getEffectiveLevel() != logging.DEBUG:
            ps_config.set_string('-logfn', '/dev/null')

        # Process audio chunk by chunk. On keyword detected perform action and restart search
        self._decoder = Decoder(ps_config)

    def run(self):
        thread = threading.Thread(target=self.thread, args=())
        thread.setDaemon(True)
        thread.start()

    def thread(self):
        while True:
            self._enabled_lock.wait()

            # Enable reading microphone raw data
            inp = alsaaudio.PCM(alsaaudio.PCM_CAPTURE, alsaaudio.PCM_NORMAL,
                                self._config['sound']['input_device'])
            inp.setchannels(1)
            inp.setrate(16000)
            inp.setformat(alsaaudio.PCM_FORMAT_S16_LE)
            inp.setperiodsize(1024)

            self._decoder.start_utt()

            triggered = False
            #assistantTriggered = False
            voice_command = ""

            while not triggered:

                if not self._enabled_lock.isSet():
                    break

                # Read from microphone
                _, buf = inp.read()

                # Detect if keyword/trigger word was said
                self._decoder.process_raw(buf, False, False)

                triggered = self._decoder.hyp() is not None

            # To avoid overflows close the microphone connection
            inp.close()

            self._decoder.end_utt()

            self._disabled_sync_lock.set()

            if triggered:
                ### Assistant Starts Here
                voice_command = self._decoder.hyp().hypstr
                self._trigger_callback(self, voice_command)
                ###

    def enable(self):
        self._enabled_lock.set()
        self._disabled_sync_lock.clear()

    def disable(self):
        self._enabled_lock.clear()
        self._disabled_sync_lock.wait()
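The file passed to `-kws` lists one key phrase per line, each followed by its detection threshold between slashes. A hypothetical keyphrase.list (phrases and thresholds illustrative):

alexa /1e-20/
hey assistant /1e-30/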
Example #11
class PocketGrammar(object):

    AUDIO_CHUNK_SIZE = 1024
    AUDIO_RATE = 16000
    HMM = 'cmusphinx-5prealpha-en-us-ptm-2.0/'
    DIC = 'dictionary.dic'
    GRAMMAR = 'grammar.jsgf'

    def __init__(self, device_index=0, model_path=None):

        self._decoder = None
        self._pa = None
        self._device_no = device_index
        self._model_path = model_path

        # PocketSphinx configuration
        logging.info('Grammar file: ' + os.path.join(model_path, self.GRAMMAR))
        ps_config = Decoder.default_config()

        # Set recognition model to ...
        ps_config.set_string('-hmm', os.path.join(model_path, self.HMM))
        ps_config.set_string('-dict', os.path.join(model_path, self.DIC))
        ps_config.set_string('-jsgf', os.path.join(model_path, self.GRAMMAR))
        ps_config.set_string('-logfn', '/dev/null')

        # Process audio chunk by chunk. On keyword detected perform action and restart search
        self._decoder = Decoder(ps_config)
        self._pa = pyaudio.PyAudio()

    def _handle_init(self, rate, chunk_size):
        self._handle = self._pa.open(input=True,
                                     input_device_index=self._device_no,
                                     format=pyaudio.paInt16,
                                     channels=1,
                                     rate=rate,
                                     frames_per_buffer=chunk_size)

    def _handle_release(self):
        self._handle.stop_stream()
        self._handle.close()

    def _handle_read(self, chunk_size):
        return self._handle.read(chunk_size, exception_on_overflow=False)

    def getHypothesys(self):

        # init microphone
        self._handle_init(self.AUDIO_RATE, self.AUDIO_CHUNK_SIZE)
        self._decoder.start_utt()

        #  from speech to silence or from silence to speech?
        utteranceStarted = False
        triggered = False
        while not triggered:
            # Read from microphone and process
            data = self._handle_read(self.AUDIO_CHUNK_SIZE)
            self._decoder.process_raw(data, False, False)

            # check for the transition from silence to speech
            inSpeech = self._decoder.get_in_speech()
            if inSpeech and not utteranceStarted:
                utteranceStarted = True
                logging.debug("Speech started")

            # checks for the transition from speech to silence
            if not inSpeech and utteranceStarted:
                hypothesis = self._decoder.hyp()
                triggered = hypothesis is not None

        # close microphone
        self._handle_release()
        self._decoder.end_utt()
        if triggered:
            return hypothesis.hypstr
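The decoder above is constrained by the JSGF grammar at model_path/grammar.jsgf. A hypothetical minimal grammar (illustrative only):

#JSGF V1.0;
grammar commands;
public <command> = (turn on | turn off) the (light | fan);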
Example #12
class PocketKeyword(object):

    AUDIO_CHUNK_SIZE = 1024
    AUDIO_RATE = 16000

    def __init__(self, phrase, threshold, device_index=0):

        self._decoder = None
        self._pa = None
        self._device_no = device_index
        self._phrase = phrase
        self._threshold = float(threshold)

        # PocketSphinx configuration
        logging.info('Phrase: ' + phrase + ' Threshold: ' + str(threshold))
        ps_config = Decoder.default_config()

        # Set recognition model to US
        ps_config.set_string('-hmm',
                             os.path.join(get_model_path_keyword(), 'en-us'))
        ps_config.set_string(
            '-dict',
            os.path.join(get_model_path_keyword(), 'cmudict-en-us.dict'))
        # Specify recognition key phrase
        ps_config.set_string('-keyphrase', self._phrase)
        ps_config.set_float('-kws_threshold', self._threshold)
        ps_config.set_string('-logfn', '/dev/null')

        # Process audio chunk by chunk. On keyword detected perform action and restart search
        self._decoder = Decoder(ps_config)
        self._pa = pyaudio.PyAudio()

    def _handle_init(self, rate, chunk_size):
        self._handle = self._pa.open(input=True,
                                     input_device_index=self._device_no,
                                     format=pyaudio.paInt16,
                                     channels=1,
                                     rate=rate,
                                     frames_per_buffer=chunk_size)

    def _handle_release(self):
        self._handle.stop_stream()
        self._handle.close()

    def _handle_read(self, chunk_size):
        return self._handle.read(chunk_size, exception_on_overflow=False)

    def getHypothesys(self):

        # init microphone
        self._handle_init(self.AUDIO_RATE, self.AUDIO_CHUNK_SIZE)
        self._decoder.start_utt()

        triggered = False
        while not triggered:
            # Read from microphone and process
            data = self._handle_read(self.AUDIO_CHUNK_SIZE)
            self._decoder.process_raw(data, False, False)

            # best guess from CMU Sphinx STT
            hypothesis = self._decoder.hyp()
            triggered = hypothesis is not None

        # close microphone
        self._handle_release()
        self._decoder.end_utt()
        if triggered:
            return hypothesis.hypstr
Example #13
class SpeechRecognizer(Interpreter):
    def __init__(self, name: str, sr: str = "pocketsphinx"):
        super().__init__(name, True)
        self.logger = self.get_logger()
        self.sr = sr
        self.current_data = []
        self.setup()

    def setup(self) -> None:
        self.RATE = int(os.getenv("RATE"))
        self.CHUNK = int(os.getenv("CHUNK"))
        self.setup_pocketsphinx()

        if (self.sr == "googlespeech"):
            self.setup_googlespeech()

    def setup_pocketsphinx(self) -> None:
        self.logger.info("Setting up PocketSphinx.")
        self.MODELDIR = "resources/model"

        config = Decoder.default_config()
        config.set_string('-hmm', os.path.join(self.MODELDIR, 'es-es'))
        config.set_string('-lm', os.path.join(self.MODELDIR, 'es-es.lm'))
        config.set_string('-dict', os.path.join(self.MODELDIR, 'es.dict'))
        config.set_string('-logfn', '/dev/null')

        self.decoder = Decoder(config)

        self.prev_buf_is_speech = False
        self.decoder.start_utt()
        self.logger.info("Done setting up PocketSphinx.")

    def setup_googlespeech(self) -> None:
        self.logger.info("Setting up Google Speech.")
        credentials = service_account.Credentials.from_service_account_file(
            'resources/keys/credentials.json')
        config = speech.types.RecognitionConfig(
            encoding=speech.enums.RecognitionConfig.AudioEncoding.LINEAR16,
            language_code='es-PE',
            sample_rate_hertz=self.RATE,
        )
        self.client = speech.SpeechClient(credentials=credentials)
        self.streaming_config = speech.types.StreamingRecognitionConfig(
            config=config)
        self.logger.info("Done setting up Google Speech.")

    def get_destinations_ID(self, raw_data) -> List[Identifier]:
        return [self.destinations_ID[0]]

    def preprocess(self, raw_data):
        """Filtering"""
        return raw_data

    def query_gs(self):
        requests = (speech.types.StreamingRecognizeRequest(audio_content=chunk)
                    for chunk in self.current_data)
        responses = self.client.streaming_recognize(
            config=self.streaming_config, requests=requests)
        try:
            response = next(responses)
            data = response.results[0].alternatives[0].transcript
            conf = response.results[0].alternatives[0].confidence
        except Exception as e:
            self.logger.info(f"{self.name}>> {e}")
            conf = None
            data = None
        self.current_data.clear()
        return data, conf

    def query_ps(self):
        try:
            data = self.decoder.hyp().hypstr
            conf = self.decoder.hyp().best_score
            if data == "":
                data = None
        except Exception as e:
            self.logger.info(f"{self.name}>> {e}")
            conf = None
            data = None
        return data, conf

    def process(self, raw_data) -> Generator:
        self.decoder.process_raw(raw_data, False, False)
        cur_buf_is_speech = self.decoder.get_in_speech()
        data = None
        self.logger.info(
            f"prev: {self.prev_buf_is_speech}, current: {cur_buf_is_speech}")

        force_speech = False
        if raw_data == bytes([0] * self.CHUNK * 16):
            force_speech = True
            self.logger.info("RECEIVED FORCE STOP")

        if force_speech or (self.prev_buf_is_speech and not cur_buf_is_speech):
            # No longer in speech -> stop listening and process
            self.logger.info("No longer in speech, yielding True.")
            yield True
            self.decoder.end_utt()
            if (self.sr == "googlespeech"):
                data, conf = self.query_gs()
            elif (self.sr == "pocketsphinx"):
                data, conf = self.query_ps()
            self.logger.info(
                f"{self.name}>> Heard DATA: '{data}' with confidence: {conf}.")
            self.decoder.start_utt()
            self.prev_buf_is_speech = cur_buf_is_speech
        elif not self.prev_buf_is_speech and cur_buf_is_speech:
            # Now in speech -> Start listening
            self.current_data.append(raw_data)
            self.prev_buf_is_speech = cur_buf_is_speech
            yield False

        elif self.prev_buf_is_speech and cur_buf_is_speech:
            # Still in speech -> Keep on listening
            self.current_data.append(raw_data)
            self.prev_buf_is_speech = cur_buf_is_speech
            yield False

        else:
            self.prev_buf_is_speech = cur_buf_is_speech
            yield False

        yield data
        return

    def pass_msg(self, msg: str) -> None:
        if msg == "RESUME":
            self.e.set()

    def dump_history(self, filename: str, data: List[Any]) -> None:
        pass
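A hypothetical driver for SpeechRecognizer.process() (a sketch; the `recognizer` instance and the `audio_chunks` capture loop are assumed): each call yields a finished flag first, then the transcript for that chunk, or None.

# Sketch: consume the two values process() yields per chunk.
for chunk in audio_chunks:            # raw PCM chunks from a capture loop
    gen = recognizer.process(chunk)
    finished = next(gen)              # True on a speech -> silence transition
    text = next(gen)                  # transcript once finished, else None
    if finished and text:
        print(text)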
Example #14
class PocketsphinxTrigger(BaseTrigger):


	type = triggers.TYPES.VOICE

	def __init__(self, config, trigger_callback):
		super(PocketsphinxTrigger, self).__init__(config, trigger_callback, 'pocketsphinx')

		self._enabled_lock = threading.Event()
		self._disabled_sync_lock = threading.Event()
		self._decoder = None

	def setup(self):
		# PocketSphinx configuration
		ps_config = Decoder.default_config()

		# Set recognition model to US
		ps_config.set_string('-hmm', os.path.join(get_model_path(), self._tconfig['language']))
		ps_config.set_string('-dict', os.path.join(get_model_path(), self._tconfig['dictionary']))

		# Specify recognition key phrase
		#ps_config.set_string('-keyphrase', self._tconfig['phrase'])
		#ps_config.set_float('-kws_threshold', float(self._tconfig['threshold']))

		### Multiple Hotwords
		#ps_config.set_string('-inmic', 'yes')
		ps_config.set_string('-kws', '/opt/AlexaPi/src/keyphrase.list')


		# Hide the VERY verbose logging information when not in debug
		if logging.getLogger('alexapi').getEffectiveLevel() != logging.DEBUG:
			ps_config.set_string('-logfn', '/dev/null')

		# Process audio chunk by chunk. On keyword detected perform action and restart search
		self._decoder = Decoder(ps_config)

	def run(self):
		thread = threading.Thread(target=self.thread, args=())
		thread.setDaemon(True)
		thread.start()

	def thread(self):
		while True:
			self._enabled_lock.wait()

			# Enable reading microphone raw data
			inp = alsaaudio.PCM(alsaaudio.PCM_CAPTURE, alsaaudio.PCM_NORMAL, self._config['sound']['input_device'])
			inp.setchannels(1)
			inp.setrate(16000)
			inp.setformat(alsaaudio.PCM_FORMAT_S16_LE)
			inp.setperiodsize(1024)

			self._decoder.start_utt()

			triggered = False
			#assistantTriggered = False
			voice_command = ""

			while not triggered:

				if not self._enabled_lock.isSet():
					break

				# Read from microphone
				_, buf = inp.read()

				# Detect if keyword/trigger word was said
				self._decoder.process_raw(buf, False, False)

				triggered = self._decoder.hyp() is not None

			# To avoid overflows close the microphone connection
			inp.close()

			self._decoder.end_utt()

			self._disabled_sync_lock.set()

			if triggered:
				### Assistant Starts Here
				try:
					voice_command = self._decoder.hyp().hypstr
				except AttributeError:
					voice_command = ""
				self._trigger_callback(self, voice_command)
				###

	def enable(self):
		self._enabled_lock.set()
		self._disabled_sync_lock.clear()

	def disable(self):
		self._enabled_lock.clear()
		self._disabled_sync_lock.wait()
Example #15
def main():
    """ A main method to that does a simple matching of sentences and executes scripts
    """

    notifier = sdnotify.SystemdNotifier()

    # Load config first
    config_file = open(os.path.join(os.getcwd(), 'config.yaml'), 'r')
    config = yaml.safe_load(config_file)

    interaction_timeout = int(config['interaction_timeout'])

    # Create Decoder config
    pocketsphinx_config = Decoder.default_config()
    pocketsphinx_config.set_string('-hmm', os.path.join(os.getcwd(), config['hmm_path']))
    pocketsphinx_config.set_string('-dict', os.path.join(os.getcwd(), config['dict_path']))
    pocketsphinx_config.set_string('-featparams', os.path.join(os.getcwd(), config['feat_params_path']))
    pocketsphinx_config.set_boolean("-allphone_ci", True)
    # Using decoder.set_kws & decoder.set_lm_file
    # pocketsphinx_config.set_string('-lm', os.path.join(os.getcwd(), config['lm_path']))
    # pocketsphinx_config.set_string('-kws', os.path.join(os.getcwd(), config['keyphrase_path']))

    # Initialize audio
    p = pyaudio.PyAudio()
    stream = p.open(format=pyaudio.paInt16, channels=1, rate=16000, input=True, frames_per_buffer=1024)
    stream.start_stream()

    # Load invocations and commands
    invocations = config['invocations']

    # Process audio chunk by chunk. On keyword detected perform action and restart search
    decoder = Decoder(pocketsphinx_config)
    logmath = decoder.get_logmath()
    decoder.set_kws('keyword', os.path.join(os.getcwd(), config['invocation_path']))
    decoder.set_lm_file('lm', os.path.join(os.getcwd(), config['lm_path']))

    invocation_ctx = None
    in_speech_bf = False

    # Run some initialization scripts for terminal displays
    subprocess.Popen([os.path.join(os.getcwd(), config['init_exec'])]).communicate()

    decoder.set_search('keyword')
    decoder.start_utt()
    notifier.notify("READY=1")

    interaction_time = None

    while True:
        notifier.notify("WATCHDOG=1")
        buf = stream.read(1024, exception_on_overflow=False)
        if buf:
            decoder.process_raw(buf, False, False)
        else:
            logging.error("Unable to get audio, exiting")
            break

        hyp = decoder.hyp()
        # seg = decoder.seg()
        hyp_str = hyp.hypstr.lower().strip() if hyp else None
        now_in_speech = decoder.get_in_speech()

        if now_in_speech != in_speech_bf:
            in_speech_bf = now_in_speech
            if not in_speech_bf:
                decoder.end_utt()
                if hyp_str:
                    logging.info("Heard: '%s' while being in '%s' context (score: %d, confidence: %d -> in log scale %d)" %
                                 (hyp_str, invocation_ctx, hyp.best_score, logmath.exp(hyp.prob), hyp.prob))

                    if not invocation_ctx:
                        if hyp_str in invocations:
                            logging.info("Matched invocation: '%s'" % hyp_str) 
                            invocation_ctx = hyp_str
                            subprocess.Popen([os.path.join(os.getcwd(), invocations[invocation_ctx]['enter']),
                                             invocations[invocation_ctx]['voice_params'], invocation_ctx, hyp_str]).communicate()
                            interaction_time = time.time()
                            decoder.set_search('lm')
                        else:
                            logging.debug('Unknown or misheard invocation, silently ignoring')
                    else:
                        matched = False
                        score_dict = defaultdict(list)

                        commands = invocations[invocation_ctx]['commands']
                        for command in commands:
                            logging.info("- command: '%s':" % command['name'])
                            for sentence in command['sentence']:
                                score = calc_similarity(command, sentence.lower(), hyp_str)
                                score_dict[score].append(command)
                                logging.debug("   - similarity: %d for sentence: %s" % (score, sentence))
                                if score == 1000:
                                    logging.debug("... seems like found perfect match, ignoring the rest")
                                    break

                        for best in sorted(score_dict.items(), reverse=True):
                            if best[0] > 90:
                                command = best[1][0]  # here might be some randomness
                                logging.info("The best matching command is '%s', executing: %s" % (command['name'], command['exec']))
                                subprocess.Popen([os.path.join(os.getcwd(), invocations[invocation_ctx]['ack']),
                                                 invocations[invocation_ctx]['voice_params'], invocation_ctx, hyp_str]).communicate()
                                subprocess.Popen([os.path.join(os.getcwd(), command['exec']),
                                                 invocations[invocation_ctx]['voice_params'], invocation_ctx, command['name']]).communicate()
                                subprocess.Popen([os.path.join(os.getcwd(), invocations[invocation_ctx]['exit']),
                                                 invocations[invocation_ctx]['voice_params'], invocation_ctx, hyp_str])
                                invocation_ctx = None
                                decoder.set_search('keyword')
                                matched = True
                            break  # take only the first which should be the best

                        if not matched:
                            logging.info("... not matched, ignoring")
                            subprocess.Popen([os.path.join(os.getcwd(), invocations[invocation_ctx]['noop']),
                                              invocations[invocation_ctx]['voice_params'], invocation_ctx, hyp_str]).communicate()

                decoder.start_utt()

        if invocation_ctx and interaction_time and time.time() > interaction_time + interaction_timeout:
            logging.info("The invocation context has just timed out, returning to listen for invocation word.")
            subprocess.Popen([os.path.join(os.getcwd(), invocations[invocation_ctx]['exit']),
                              invocations[invocation_ctx]['voice_params'], invocation_ctx])
            invocation_ctx = None
            interaction_time = None
            decoder.end_utt()
            decoder.set_search('keyword')
            decoder.start_utt()
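A hypothetical config.yaml shape consistent with the keys this script reads (all paths and values illustrative):

interaction_timeout: 30
hmm_path: model/en-us
dict_path: model/cmudict-en-us.dict
feat_params_path: model/en-us/feat.params
lm_path: model/en-us.lm.bin
invocation_path: model/keyword.list
init_exec: scripts/init.sh
invocations:
  computer:
    enter: scripts/enter.sh
    ack: scripts/ack.sh
    noop: scripts/noop.sh
    exit: scripts/exit.sh
    voice_params: default
    commands:
      - name: lights on
        exec: scripts/lights_on.sh
        sentence:
          - turn on the lights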
Example #16
class NLUAudio(NLUBase):
    """Define NLUAudio component

    For now hotword uses pocketsphinx with speech_recognition
    and Nuance services has NLU
    """
    def __init__(self, settings, action_queue, tts_queue, logger):
        NLUBase.__init__(self, settings, action_queue, None, tts_queue, logger)
        # Init private attributes
        self._rerun = True

        self._answer_sound_path = "sounds/answer.wav"
        self._config = Decoder.default_config()
        if not self._prepare_decoder():
            self._must_run = False

    def _prepare_decoder(self):
        """Set decoder config"""
        # prepare config
        self._hotword = self._settings['speech']['hotword']
        # self._answer = self._settings['hotword']['answer']
        if not os.path.isdir("pocketsphinx-data"):
            raise HotWordError("Missing pocketsphinx-data folder. Please run `make hotword`")

        acoustic_model = os.path.join("pocketsphinx-data",
                                      self._settings['speech']['language'],
                                      'acoustic-model',
                                      )
        language_model = os.path.join("pocketsphinx-data",
                                      self._settings['speech']['language'],
                                      'language-model.lm.bin',
                                      )
        pocket_dict = os.path.join("pocketsphinx-data",
                                   self._settings['speech']['language'],
                                   'pronounciation-dictionary.dict',
                                   )
        self._config.set_string('-logfn', "/dev/null")
        self._config.set_string('-hmm', acoustic_model)
        self._config.set_string('-lm', language_model)
        self._config.set_string('-dict', pocket_dict)
        try:
            self._decoder = Decoder(self._config)
        except RuntimeError:
            self.logger.critical("Error get audio decoder. Hotword not started")
            return False
        self._decoder.set_keyphrase('wakeup', self._hotword)
        self._decoder.set_search('wakeup')
        # signal success to __init__, which otherwise disables the component
        return True

    def stop(self):
        """Stop process"""
        self._rerun = False
        NLUBase.stop(self)

    def _answering(self):
        """Play the hotwoard confirmation sound"""
        f_ans = wave.open(self._answer_sound_path, "rb")
        stream = self._paudio.open(format=self._paudio.get_format_from_width(f_ans.getsampwidth()),
                                   channels=f_ans.getnchannels(),
                                   rate=f_ans.getframerate(),
                                   output=True)
        data = f_ans.readframes(1024)
        while len(data) > 0:
            stream.write(data)
            data = f_ans.readframes(1024)
        f_ans.close()

    def run(self):
        """Listen for NLU"""
        self._rerun = True
        self._must_run = True
        self.logger.debug("starting listening hotword %s", self._hotword)
        while self._rerun:
            self._rerun = False
            try:
                self._paudio = pyaudio.PyAudio()
                stream = self._paudio.open(format=pyaudio.paInt16, channels=1, rate=16000,
                                           input=True, frames_per_buffer=1024)
            except OSError:
                self.logger.warning("No audio device found can not listen for NLU")
                self.logger.warning("Disabling NLU audio")
                self._must_run = False
                self._rerun = False
                return
            stream.start_stream()
            self._paudio.get_default_input_device_info()

            self._decoder.start_utt()
            while self._must_run:
                buf = stream.read(1024)
                self._decoder.process_raw(buf, False, False)
                if not self.tts_queue.empty():
                    # If tts_queue is not empty, this means the Droid
                    # is currently speaking, so we don't want it to listen to itself
                    # TODO: replace this with speaker cancellation
                    continue
                if self._decoder.hyp() and self._decoder.hyp().hypstr == self._hotword:
                    self.logger.debug("Hotword detected")
                    # self.tts_queue.put(gtt(self._answer))
                    # self.tts_queue.put(gtt("mmm"))
                    self._answering()
                    ret = nlu_audio(self._settings, self.logger)

                    # GOT ACTIONS
                    interpretations = ret.get("nlu_interpretation_results", {}).\
                        get("payload", {}).get("interpretations", {})
                    # TODO: what about if len(interpretations) > 1 ??
                    for interpretation in interpretations:
                        intent = interpretation.get("action", {}).get("intent", {})
                        self.logger.info("Intent: {}".format(intent.get("value")))
                        self.logger.info("Confidence: {}".format(intent.get("confidence")))
                        # TODO log arguments
                        if intent.get("value") == "NO_MATCH":
                            # I don't understand :/
                            self._misunderstand(0, True, True)
                        elif intent.get("confidence") < 0.8:
                            # I'm not sure I understand :/
                            self._misunderstand(intent.get("confidence"), True, True)
                        else:
                            # Check intent name
                            if len(intent.get("value").split("__")) != 2:
                                self.logger.critical("BAD Intent name: "
                                                     "{}".format(intent.get("value")))
                                self._misunderstand(0, True, True)
                                continue
                            # Run function with parameters
                            action, method = intent.get("value").split("__")
                            # Run action
                            # TODO add parameters from NLU response
                            self._run_action(action, method, {}, False, True, True)
                    # TODO run nlu audio detection
                    self._rerun = True
                    break
            self._decoder.end_utt()
Example #17
class InstructionRecogniser(QThread):
    '''
    You should only use keyIn/keyOut, and shut down after use. The thread
    starts itself when appropriate. Signals are emitted with any recognised
    instructions.
    '''
    def __init__(self, gui):
        QThread.__init__(self, gui)
        if settings.sphinx_acoustic_model_dir == '':  # use default acoustic model
            acoustic_model_directory = path.join(get_model_path(), 'en-us')
        else:  # use custom acoustic model
            acoustic_model_directory = settings.sphinx_acoustic_model_dir
        config = Decoder.default_config()
        config.set_string('-hmm', acoustic_model_directory)  # acoustic model
        config.set_string(
            '-dict', settings.prepared_lexicon_file)  # lexicon pronunciation
        config.set_string(
            '-jsgf',
            settings.prepared_grammar_file)  # language model from grammar
        config.set_string(
            '-logfn',
            settings.outputFileName(sphinx_decoder_log_file_base_name,
                                    ext='log'))
        self.listen = False
        self.decoder = Decoder(config)
        self.audio = None
        self.device = None

    def startup(self):
        self.audio = PyAudio()
        # out of range or -1 selects the default device
        if 0 <= settings.audio_input_device_index < self.audio.get_device_count():
            self.device = settings.audio_input_device_index
        else:
            self.device = None

    def shutdown(self):
        self.listen = False
        self.wait()
        self.audio.terminate()
        self.audio = None

    def keyIn(self):
        if not self.isRunning():
            self.listen = True
            self.start()

    def keyOut(self):
        self.listen = False

    def run(self):
        audio_stream = self.audio.open(input_device_index=self.device,
                                       channels=1,
                                       format=paInt16,
                                       rate=audio_sample_rate,
                                       frames_per_buffer=audio_chunk_size,
                                       input=True)
        chunks = []
        msg_duration = 0
        buff = audio_stream.read(audio_chunk_size)
        while self.listen and len(buff) > 0 and msg_duration < message_duration_limit:
            chunks.append(buff)
            buff = audio_stream.read(audio_chunk_size)
            msg_duration += audio_chunk_size / audio_sample_rate
        audio_stream.close()
        audio_message = b''.join(chunks)

        self.decoder.start_utt()  # STYLE: catch failures here (e.g. grammar/lex files not found)
        self.decoder.process_raw(audio_message, False, True)
        self.decoder.end_utt()
        hyp = self.decoder.hyp()
        if hyp:
            SR_log('VOICE: "%s"' % hyp.hypstr)
            if settings.show_recognised_voice_strings:
                signals.statusBarMsg.emit('VOICE: "%s"' % hyp.hypstr)
            callsign_tokens, instr_lst = interpret_string(hyp.hypstr)
            signals.voiceMsgRecognised.emit(callsign_tokens, instr_lst)
        else:
            SR_log('VOICE: no hypothesis, message duration was %g s' %
                   msg_duration)
            signals.voiceMsgNotRecognised.emit()