Example #1
    def test_wait_while_speaking(self):
        # Check that wait_while_speaking() terminates once the signal clears
        create_signal('isSpeaking')
        Thread(target=wait_while_speaking_thread).start()
        sleep(2)
        self.assertFalse(done_waiting)
        check_for_signal('isSpeaking')
        sleep(2)
        self.assertTrue(done_waiting)
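The test relies on a module-level helper and flag that are not shown in this example. A plausible reconstruction, assuming wait_while_speaking_thread and done_waiting simply wrap mycroft.audio.wait_while_speaking (both names come from the test, not from shown source):

from mycroft.audio import wait_while_speaking

# Module-level flag the test polls; reconstructed for illustration only.
done_waiting = False


def wait_while_speaking_thread():
    """Block until the 'isSpeaking' signal is cleared, then set the flag."""
    global done_waiting
    done_waiting = False
    wait_while_speaking()  # returns once the 'isSpeaking' signal is gone
    done_waiting = True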
Example #2
    def test_check_signal(self):
        if exists('/tmp/mycroft'):
            rmtree('/tmp/mycroft')
        # Check that the signal is not found if the file does not exist
        self.assertFalse(check_for_signal('test_signal'))

        # Check that the signal is found when created
        create_signal('test_signal')
        self.assertTrue(check_for_signal('test_signal'))
        # Check that the signal is removed after use
        self.assertFalse(isfile('/tmp/mycroft/ipc/signal/test_signal'))
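Together these tests pin down the signal contract: create_signal drops a marker file under the IPC directory, and check_for_signal reports it, consuming it by default. A minimal sketch consistent with that behaviour, assuming the /tmp/mycroft/ipc/signal layout the test asserts against (the real implementation lives in mycroft.util.signal and may differ in detail):

import os
import time

SIGNAL_DIR = '/tmp/mycroft/ipc/signal'  # layout taken from the test above


def create_signal(signal_name):
    """Create a marker file for the named signal."""
    os.makedirs(SIGNAL_DIR, exist_ok=True)
    with open(os.path.join(SIGNAL_DIR, signal_name), 'w') as f:
        f.write(str(time.time()))


def check_for_signal(signal_name, sec_lifetime=0):
    """Report whether the signal exists.

    sec_lifetime == 0  -> single-use: the marker is consumed on detection
    sec_lifetime == -1 -> peek only: the marker is left in place
    sec_lifetime > 0   -> valid for that many seconds after creation
    """
    path = os.path.join(SIGNAL_DIR, signal_name)
    if not os.path.isfile(path):
        return False
    if sec_lifetime == 0:
        os.remove(path)
    elif sec_lifetime > 0 and os.path.getmtime(path) + sec_lifetime < time.time():
        os.remove(path)  # expired
        return False
    return True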
Example #3
    def end_audio(self):
        """
            Helper function for child classes to call in execute().

            Sends the recognizer_loop:audio_output_end message, indicating
            that speaking is done for the moment. It also checks if cache
            directory needs cleaning to free up disk space.
        """

        self.bus.emit(Message("recognizer_loop:audio_output_end"))
        # Clean the cache as needed
        cache_dir = mycroft.util.get_cache_directory("tts/" + self.tts_name)
        mycroft.util.curate_cache(cache_dir, min_free_percent=100)

        # This check will clear the "signal"
        check_for_signal("isSpeaking")
Example #4
def main():
    """ Main function. Run when file is invoked. """
    reset_sigint_handler()
    check_for_signal("isSpeaking")  # clear any stale signal from a previous run
    bus = WebsocketClient()  # Connect to the Mycroft Messagebus
    Configuration.init(bus)
    speech.init(bus)

    LOG.info("Starting Audio Services")
    bus.on('message', create_echo_function('AUDIO', ['mycroft.audio.service']))
    audio = AudioService(bus)  # Connect audio service instance to message bus
    create_daemon(bus.run_forever)

    wait_for_exit_signal()

    speech.shutdown()
    audio.shutdown()
Example #5
def handle_stop(event):
    """
        handle stop message
    """
    global _last_stop_signal
    if check_for_signal("isSpeaking", -1):
        _last_stop_signal = time.time()
        tts.playback.clear()  # Clear here to get instant stop
        bus.emit(Message("mycroft.stop.handled", {"by": "TTS"}))
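The -1 passed as sec_lifetime matters here: it peeks at 'isSpeaking' without consuming it, so the speech client that created the signal can still clear it when audio output actually ends (compare Example #3). A quick illustration of the difference, using the semantics the tests above exercise:

create_signal('demo')
check_for_signal('demo', -1)   # True: peek only, the marker survives
check_for_signal('demo')       # True: default lifetime consumes the marker
check_for_signal('demo')       # False: already consumed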
Example #6
def handle_stop(event):
    """
        handle stop message
    """
    global _last_stop_signal
    if check_for_signal("isSpeaking", -1):
        _last_stop_signal = time.time()
        tts.playback.clear_queue()
        tts.playback.clear_visimes()
Example #7
    def _skip_wake_word(self):
        # Check if told programmatically to skip the wake word, like
        # when we are in a dialog with the user.
        if check_for_signal('startListening'):
            return True

        # Pressing the Mark 1 button can start recording (unless
        # it is being used to mean 'stop' instead)
        if check_for_signal('buttonPress', 1):
            # give other processes time to consume this signal if
            # it was meant to be a 'stop'
            sleep(0.25)
            if check_for_signal('buttonPress'):
                # Signal is still here, assume it was intended to
                # begin recording
                logger.debug("Button Pressed, wakeword not needed")
                return True

        return False
Example #8
def handle_stop(event):
    """
        handle stop message
    """
    global _last_stop_signal
    if check_for_signal("isSpeaking", -1):
        _last_stop_signal = time.time()
        tts.playback.clear_queue()
        tts.playback.clear_visimes()
        bus.emit(Message("mycroft.stop.handled", {"by": "TTS"}))
Example #9
def handle_speak(event):
    """
        Handle "speak" message
    """
    config = Configuration.get()
    Configuration.init(bus)
    global _last_stop_signal

    # Get conversation ID
    if event.context and 'ident' in event.context:
        ident = event.context['ident']
    else:
        ident = 'unknown'

    start = time.time()  # Time of speech request
    with lock:
        stopwatch = Stopwatch()
        stopwatch.start()
        utterance = event.data['utterance']
        if event.data.get('expect_response', False):
            # When expect_response is requested, the listener will be restarted
            # at the end of the next bit of spoken audio.
            bus.once('recognizer_loop:audio_output_end', _start_listener)

        # This is a bit of a hack for Picroft.  The analog audio on a Pi blocks
        # for 30 seconds fairly often, so we don't want to break on periods
        # (decreasing the chance of encountering the block).  But we will
        # keep the split for non-Picroft installs since it gives user feedback
        # faster on longer phrases.
        #
        # TODO: Remove or make an option?  This is really a hack, anyway,
        # so we likely will want to get rid of this when not running on Mimic
        if (config.get('enclosure', {}).get('platform') != "picroft" and
                len(re.findall('<[^>]*>', utterance)) == 0):
            chunks = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\;|\?)\s',
                              utterance)
            for chunk in chunks:
                # Check if something has aborted the speech
                if (_last_stop_signal > start or
                        check_for_signal('buttonPress')):
                    # Clear any newly queued speech
                    tts.playback.clear()
                    break
                try:
                    mute_and_speak(chunk, ident)
                except KeyboardInterrupt:
                    raise
                except Exception:
                    LOG.error('Error in mute_and_speak', exc_info=True)
        else:
            mute_and_speak(utterance, ident)

        stopwatch.stop()
    report_timing(ident, 'speech', stopwatch, {'utterance': utterance,
                                               'tts': tts.__class__.__name__})
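The splitting regex is easier to follow with a concrete input. A quick check with a made-up utterance (note how the lookbehinds keep abbreviations such as "Dr." intact while '.', ';' and '?' followed by whitespace still split):

import re

utterance = "Dr. Smith arrived. How are you today? Fine; thanks."
chunks = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\;|\?)\s', utterance)
print(chunks)
# ['Dr. Smith arrived.', 'How are you today?', 'Fine;', 'thanks.']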
Example #10
def handle_speak(event):
    global _last_stop_signal

    # Mild abuse of the signal system to allow other processes to detect
    # when TTS is happening.  See mycroft.util.is_speaking()
    create_signal("isSpeaking")

    utterance = event.data['utterance']
    expect_response = event.data.get('expect_response', False)

    # This is a bit of a hack for Picroft.  The analog audio on a Pi blocks
    # for 30 seconds fairly often, so we don't want to break on periods
    # (decreasing the chance of encountering the block).  But we will
    # keep the split for non-Picroft installs since it gives user feedback
    # faster on longer phrases.
    #
    # TODO: Remove or make an option?  This is really a hack, anyway,
    # so we likely will want to get rid of this when not running on Mimic
    if config.get('enclosure', {}).get('platform') != "picroft":
        start = time.time()
        chunks = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s',
                          utterance)
        for chunk in chunks:
            try:
                mute_and_speak(chunk)
            except KeyboardInterrupt:
                raise
            except Exception:
                logger.error('Error in mute_and_speak', exc_info=True)
            if _last_stop_signal > start or check_for_signal('buttonPress'):
                break
    else:
        mute_and_speak(utterance)

    # This check will clear the "signal"
    check_for_signal("isSpeaking")

    if expect_response:
        create_signal('startListening')
Example #11
    def show_visimes(self, pairs):
        """
            Send visime data to enclosure

            Args:
                pairs(list): Visime and timing pair

            Returns:
                True if button has been pressed.
        """
        start = time()
        for code, duration in pairs:
            if check_for_signal('stoppingTTS', -1):
                return True
            if check_for_signal('buttonPress'):
                return True
            if self.enclosure:
                self.enclosure.mouth_viseme(code)
            delta = time() - start
            if delta < duration:
                sleep(duration - delta)
        return False
Example #12
    def visime(self, output):
        start = time()
        pairs = output.split(" ")
        for pair in pairs:
            if check_for_signal('buttonPress'):
                return
            pho_dur = pair.split(":")  # phoneme:duration
            if len(pho_dur) == 2:
                code = VISIMES.get(pho_dur[0], '4')
                self.enclosure.mouth_viseme(code)
                duration = float(pho_dur[1])
                delta = time() - start
                if delta < duration:
                    sleep(duration - delta)
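The output argument is the phoneme timing string produced by the TTS engine; each space-separated pair is phoneme:end_time, where the time is seconds from the start of playback (that reading follows from the delta computed against start). A made-up example of the expected shape, assuming tts is the object exposing this method:

# Hypothetical Mimic-style timing string; values are illustrative only.
output = "HH:0.11 AH:0.21 L:0.33 OW:0.58 pau:0.80"
tts.visime(output)  # drives enclosure.mouth_viseme() in sync with playback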
Example #13
    def wait_until_wake_word(self, source, sec_per_buffer):
        num_silent_bytes = int(self.SILENCE_SEC * source.SAMPLE_RATE *
                               source.SAMPLE_WIDTH)

        silence = '\0' * num_silent_bytes

        # bytearray to store audio in
        byte_data = silence

        buffers_per_check = self.SEC_BETWEEN_WW_CHECKS / sec_per_buffer
        buffers_since_check = 0.0

        # Max bytes for byte_data before audio is removed from the front
        max_size = self.sec_to_bytes(self.SAVED_WW_SEC, source)

        said_wake_word = False
        while not said_wake_word:
            if check_for_signal('buttonPress'):
                said_wake_word = True
                continue

            chunk = self.record_sound_chunk(source)

            energy = self.calc_energy(chunk, source.SAMPLE_WIDTH)
            if energy < self.energy_threshold * self.multiplier:
                self.adjust_threshold(energy, sec_per_buffer)

            needs_to_grow = len(byte_data) < max_size
            if needs_to_grow:
                byte_data += chunk
            else:  # Remove beginning of audio and add new chunk to end
                byte_data = byte_data[len(chunk):] + chunk

            buffers_since_check += 1.0
            if buffers_since_check > buffers_per_check:
                buffers_since_check -= buffers_per_check
                said_wake_word = self.wake_word_in_audio(byte_data + silence)
Example #14
    def _record_phrase(self, source, sec_per_buffer):
        """Record an entire spoken phrase.

        Essentially, this code waits for a period of silence and then returns
        the audio.  If silence isn't detected, it will terminate and return
        a buffer of RECORDING_TIMEOUT duration.

        Args:
            source (AudioSource):  Source producing the audio chunks
            sec_per_buffer (float):  Fractional number of seconds in each chunk

        Returns:
            bytearray: complete audio buffer recorded, including any
                       silence at the end of the user's utterance
        """

        num_loud_chunks = 0
        noise = 0

        max_noise = 25
        min_noise = 0

        silence_duration = 0

        def increase_noise(level):
            if level < max_noise:
                return level + 200 * sec_per_buffer
            return level

        def decrease_noise(level):
            if level > min_noise:
                return level - 100 * sec_per_buffer
            return level

        # Smallest number of loud chunks required to return
        min_loud_chunks = int(self.MIN_LOUD_SEC_PER_PHRASE / sec_per_buffer)

        # Maximum number of chunks to record before timing out
        max_chunks = int(self.RECORDING_TIMEOUT / sec_per_buffer)
        num_chunks = 0

        # Will return once this is exceeded, even without enough loud chunks
        max_chunks_of_silence = int(self.RECORDING_TIMEOUT_WITH_SILENCE /
                                    sec_per_buffer)

        # bytearray to store audio in
        byte_data = '\0' * source.SAMPLE_WIDTH

        phrase_complete = False
        while num_chunks < max_chunks and not phrase_complete:
            chunk = self.record_sound_chunk(source)
            byte_data += chunk
            num_chunks += 1

            energy = self.calc_energy(chunk, source.SAMPLE_WIDTH)
            test_threshold = self.energy_threshold * self.multiplier
            is_loud = energy > test_threshold
            if is_loud:
                noise = increase_noise(noise)
                num_loud_chunks += 1
            else:
                noise = decrease_noise(noise)
                self._adjust_threshold(energy, sec_per_buffer)

            if num_chunks % 10 == 0:
                with open(self.mic_level_file, 'w') as f:
                    f.write("Energy:  cur=" + str(energy) + " thresh=" +
                            str(self.energy_threshold))

            was_loud_enough = num_loud_chunks > min_loud_chunks

            quiet_enough = noise <= min_noise
            if quiet_enough:
                silence_duration += sec_per_buffer
                if silence_duration < self.MIN_SILENCE_AT_END:
                    quiet_enough = False  # gotta be silent for min of 1/4 sec
            else:
                silence_duration = 0
            recorded_too_much_silence = num_chunks > max_chunks_of_silence
            if quiet_enough and (was_loud_enough or recorded_too_much_silence):
                phrase_complete = True

            # Pressing top-button will end recording immediately
            if check_for_signal('buttonPress'):
                phrase_complete = True

        return byte_data
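The noise constants imply concrete timings. Assuming a sec_per_buffer of 0.1 (illustrative; the real value depends on chunk size and sample rate), each loud chunk raises noise by 20 up to the ceiling of 25, and each quiet chunk lowers it by 10, so draining from the ceiling back to zero takes about three chunks, roughly the quarter second the MIN_SILENCE_AT_END comment mentions:

sec_per_buffer = 0.1        # assumed chunk duration, for illustration
noise = 25.0                # start at the ceiling (end of loud speech)

quiet_chunks = 0
while noise > 0:
    noise -= 100 * sec_per_buffer   # decrease_noise() step
    quiet_chunks += 1

print(quiet_chunks, quiet_chunks * sec_per_buffer)  # 3 chunks, ~0.3 s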
Example #15
    def _record_phrase(self, source, sec_per_buffer):
        """Record an entire spoken phrase.

        Essentially, this code waits for a period of silence and then returns
        the audio.  If silence isn't detected, it will terminate and return
        a buffer of RECORDING_TIMEOUT duration.

        Args:
            source (AudioSource):  Source producing the audio chunks
            sec_per_buffer (float):  Fractional number of seconds in each chunk

        Returns:
            bytearray: complete audio buffer recorded, including any
                       silence at the end of the user's utterance
        """

        num_loud_chunks = 0
        noise = 0

        max_noise = 25
        min_noise = 0

        silence_duration = 0

        def increase_noise(level):
            if level < max_noise:
                return level + 200 * sec_per_buffer
            return level

        def decrease_noise(level):
            if level > min_noise:
                return level - 100 * sec_per_buffer
            return level

        # Smallest number of loud chunks required to return
        min_loud_chunks = int(self.MIN_LOUD_SEC_PER_PHRASE / sec_per_buffer)

        # Maximum number of chunks to record before timing out
        max_chunks = int(self.RECORDING_TIMEOUT / sec_per_buffer)
        num_chunks = 0

        # Will return once this is exceeded, even without enough loud chunks
        max_chunks_of_silence = int(self.RECORDING_TIMEOUT_WITH_SILENCE /
                                    sec_per_buffer)

        # bytearray to store audio in
        byte_data = get_silence(source.SAMPLE_WIDTH)

        phrase_complete = False
        while num_chunks < max_chunks and not phrase_complete:
            chunk = self.record_sound_chunk(source)
            byte_data += chunk
            num_chunks += 1

            energy = self.calc_energy(chunk, source.SAMPLE_WIDTH)
            test_threshold = self.energy_threshold * self.multiplier
            is_loud = energy > test_threshold
            if is_loud:
                noise = increase_noise(noise)
                num_loud_chunks += 1
            else:
                noise = decrease_noise(noise)
                self._adjust_threshold(energy, sec_per_buffer)

            if num_chunks % 10 == 0:
                with open(self.mic_level_file, 'w') as f:
                    f.write("Energy:  cur=" + str(energy) + " thresh=" +
                            str(self.energy_threshold))

            was_loud_enough = num_loud_chunks > min_loud_chunks

            quiet_enough = noise <= min_noise
            if quiet_enough:
                silence_duration += sec_per_buffer
                if silence_duration < self.MIN_SILENCE_AT_END:
                    quiet_enough = False  # gotta be silent for min of 1/4 sec
            else:
                silence_duration = 0
            recorded_too_much_silence = num_chunks > max_chunks_of_silence
            if quiet_enough and (was_loud_enough or recorded_too_much_silence):
                phrase_complete = True

            # Pressing top-button will end recording immediately
            if check_for_signal('buttonPress'):
                phrase_complete = True

        return byte_data
Example #16
def handle_speak(event):
    """Handle "speak" message

    Parse sentences and invoke text to speech service.
    """
    config = Configuration.get()
    Configuration.set_config_update_handlers(bus)
    global _last_stop_signal

    # If the message is targeted and audio is not the target,
    # don't synthesize speech
    event.context = event.context or {}
    if event.context.get('destination') and not \
            ('debug_cli' in event.context['destination'] or
             'audio' in event.context['destination']):
        return

    # Get conversation ID
    if event.context and 'ident' in event.context:
        ident = event.context['ident']
    else:
        ident = 'unknown'

    start = time.time()  # Time of speech request
    with lock:
        stopwatch = Stopwatch()
        stopwatch.start()
        play_error = _should_play_error(event.data)
        utterance = event.data['utterance']
        listen = event.data.get('expect_response', False)
        # This is a bit of a hack for Picroft.  The analog audio on a Pi blocks
        # for 30 seconds fairly often, so we don't want to break on periods
        # (decreasing the chance of encountering the block).  But we will
        # keep the split for non-Picroft installs since it gives user feedback
        # faster on longer phrases.
        #
        # TODO: Remove or make an option?  This is really a hack, anyway,
        # so we likely will want to get rid of this when not running on Mimic
        if (config.get('enclosure', {}).get('platform') != "picroft" and len(
                re.findall('<[^>]*>', utterance)) == 0) and not play_error:
            # Remove any whitespace present after the period,
            # if a character (only alpha) ends with a period
            # ex: A. Lincoln -> A.Lincoln
            # so that we don't split at the period
            utterance = re.sub(r'\b([A-Za-z][\.])(\s+)', r'\g<1>', utterance)
            chunks = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\;|\?)\s',
                              utterance)
            # Apply the listen flag to the last chunk, set the rest to False
            chunks = [(chunks[i], listen if i == len(chunks) - 1 else False)
                      for i in range(len(chunks))]
            for chunk, listen in chunks:
                # Check if something has aborted the speech
                if (_last_stop_signal > start
                        or check_for_signal('buttonPress')):
                    # Clear any newly queued speech
                    tts.playback.clear()
                    break
                try:
                    mute_and_speak(chunk, ident, listen)
                except KeyboardInterrupt:
                    raise
                except Exception:
                    LOG.error('Error in mute_and_speak', exc_info=True)
        else:
            mute_and_speak(utterance, ident, listen, play_error)

        stopwatch.stop()
    report_timing(ident, 'speech', stopwatch, {
        'utterance': utterance,
        'tts': tts.__class__.__name__
    })
Example #17
    def on_stop_handled(self, event):
        # A skill handled a stop; consume any pending 'buttonPress' signal
        check_for_signal('buttonPress')
Example #18
    def record_phrase(self, source, sec_per_buffer):
        """
        This attempts to record an entire spoken phrase. Essentially,
        this waits for a period of silence and then returns the audio

        :rtype: bytearray
        :param source: AudioSource
        :param sec_per_buffer: Based on source.SAMPLE_RATE
        :return: bytearray representing the frame_data of the recorded phrase
        """
        num_loud_chunks = 0
        noise = 0

        max_noise = 25
        min_noise = 0

        def increase_noise(level):
            if level < max_noise:
                return level + 200 * sec_per_buffer
            return level

        def decrease_noise(level):
            if level > min_noise:
                return level - 100 * sec_per_buffer
            return level

        # Smallest number of loud chunks required to return
        min_loud_chunks = int(self.MIN_LOUD_SEC_PER_PHRASE / sec_per_buffer)

        # Maximum number of chunks to record before timing out
        max_chunks = int(self.RECORDING_TIMEOUT / sec_per_buffer)
        num_chunks = 0

        # Will return once this is exceeded, even without enough loud chunks
        max_chunks_of_silence = int(self.RECORDING_TIMEOUT_WITH_SILENCE /
                                    sec_per_buffer)

        # bytearray to store audio in
        byte_data = '\0' * source.SAMPLE_WIDTH

        phrase_complete = False
        while num_chunks < max_chunks and not phrase_complete:
            chunk = self.record_sound_chunk(source)
            byte_data += chunk
            num_chunks += 1

            energy = self.calc_energy(chunk, source.SAMPLE_WIDTH)
            test_threshold = self.energy_threshold * self.threshold_multiplier
            is_loud = energy > test_threshold
            if is_loud:
                noise = increase_noise(noise)
                num_loud_chunks += 1
            else:
                noise = decrease_noise(noise)
                self.adjust_threshold(energy, sec_per_buffer)

            was_loud_enough = num_loud_chunks > min_loud_chunks
            quiet_enough = noise <= min_noise
            recorded_too_much_silence = num_chunks > max_chunks_of_silence
            if quiet_enough and (was_loud_enough or recorded_too_much_silence):
                phrase_complete = True
            if check_for_signal('buttonPress'):
                phrase_complete = True

        return byte_data
Example #19
def handle_speak(event):
    """Handle "speak" message
    Parse sentences and invoke text to speech service.
    """
    #/home/insitelabdev/mycroft-core/mycroft/client/speech/set_config.txt
    path = pathlib.Path().absolute()
    settings_file = open(
        str(path) + '/mycroft/client/speech/set_config.txt', 'r')
    # NOTE: ast.literal_eval would be safer than eval here
    settings_dict = eval(settings_file.read())
    config = Configuration.get()
    Configuration.set_config_update_handlers(bus)
    global _last_stop_signal

    # Get conversation ID
    if event.context and 'ident' in event.context:
        ident = event.context['ident']
    else:
        ident = 'unknown'

    start = time.time()  # Time of speech request
    with lock:
        stopwatch = Stopwatch()
        stopwatch.start()
        utterance = event.data['utterance']
        listen = event.data.get('expect_response', False)
        # This is a bit of a hack for Picroft.  The analog audio on a Pi blocks
        # for 30 seconds fairly often, so we don't want to break on periods
        # (decreasing the chance of encountering the block).  But we will
        # keep the split for non-Picroft installs since it gives user feedback
        # faster on longer phrases.
        #
        # TODO: Remove or make an option?  This is really a hack, anyway,
        # so we likely will want to get rid of this when not running on Mimic
        if (config.get('enclosure', {}).get('platform') != "picroft"
                and len(re.findall('<[^>]*>', utterance)) == 0):
            # Remove any whitespace present after the period,
            # if a character (only alpha) ends with a period
            # ex: A. Lincoln -> A.Lincoln
            # so that we don't split at the period
            utterance = re.sub(r'\b([A-Za-z][\.])(\s+)', r'\g<1>', utterance)
            chunks = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\;|\?)\s',
                              utterance)
            # Apply the listen flag to the last chunk, set the rest to False
            chunks = [(chunks[i], listen if i == len(chunks) - 1 else False)
                      for i in range(len(chunks))]
            for chunk, listen in chunks:
                # Check if something has aborted the speech
                if (_last_stop_signal > start
                        or check_for_signal('buttonPress')):
                    # Clear any newly queued speech
                    tts.playback.clear()
                    break
                try:
                    # ana added this
                    LOG.info("*******THIS IS THE SPEECH SPEED: " +
                             settings_dict["rate"])

                    mute_and_speak(
                        "<speak><prosody volume='" + settings_dict["volume"] +
                        "' rate='" + settings_dict["rate"] + "'>" + chunk +
                        "</prosody></speak>", ident, listen)
                except KeyboardInterrupt:
                    raise
                except Exception:
                    LOG.error('Error in mute_and_speak', exc_info=True)
        else:
            mute_and_speak(utterance, ident, listen)

        stopwatch.stop()
    report_timing(ident, 'speech', stopwatch, {
        'utterance': utterance,
        'tts': tts.__class__.__name__
    })

    settings_file.close()
Example #20
    def _record_phrase(self,
                       source,
                       sec_per_buffer,
                       stream=None,
                       ww_frames=None):
        """Record an entire spoken phrase.

        Essentially, this code waits for a period of silence and then returns
        the audio.  If silence isn't detected, it will terminate and return
        a buffer of self.recording_timeout duration.

        Args:
            source (AudioSource):  Source producing the audio chunks
            sec_per_buffer (float):  Fractional number of seconds in each chunk
            stream (AudioStreamHandler): Stream target that will receive chunks
                                         of the utterance audio while it is
                                         being recorded.
            ww_frames (deque):  Frames of audio data from the last part of wake
                                word detection.

        Returns:
            bytearray: complete audio buffer recorded, including any
                       silence at the end of the user's utterance
        """

        num_loud_chunks = 0
        noise = 0

        max_noise = 25
        min_noise = 0

        silence_duration = 0

        def increase_noise(level):
            if level < max_noise:
                return level + 200 * sec_per_buffer
            return level

        def decrease_noise(level):
            if level > min_noise:
                return level - 100 * sec_per_buffer
            return level

        # Smallest number of loud chunks required to return
        min_loud_chunks = int(self.MIN_LOUD_SEC_PER_PHRASE / sec_per_buffer)

        # Maximum number of chunks to record before timing out
        max_chunks = int(self.recording_timeout / sec_per_buffer)
        num_chunks = 0

        # Will return once this is exceeded, even without enough loud chunks
        max_chunks_of_silence = int(self.recording_timeout_with_silence /
                                    sec_per_buffer)

        # bytearray to store audio in
        byte_data = get_silence(source.SAMPLE_WIDTH)

        if stream:
            stream.stream_start()

        phrase_complete = False
        while num_chunks < max_chunks and not phrase_complete:
            if ww_frames:
                chunk = ww_frames.popleft()
            else:
                chunk = self.record_sound_chunk(source)
            byte_data += chunk
            num_chunks += 1

            if stream:
                stream.stream_chunk(chunk)

            energy = self.calc_energy(chunk, source.SAMPLE_WIDTH)
            test_threshold = self.energy_threshold * self.multiplier
            is_loud = energy > test_threshold
            if is_loud:
                noise = increase_noise(noise)
                num_loud_chunks += 1
            else:
                noise = decrease_noise(noise)
                self._adjust_threshold(energy, sec_per_buffer)

            if num_chunks % 10 == 0:
                self.write_mic_level(energy, source)

            was_loud_enough = num_loud_chunks > min_loud_chunks

            quiet_enough = noise <= min_noise
            if quiet_enough:
                silence_duration += sec_per_buffer
                if silence_duration < self.MIN_SILENCE_AT_END:
                    quiet_enough = False  # gotta be silent for min of 1/4 sec
            else:
                silence_duration = 0
            recorded_too_much_silence = num_chunks > max_chunks_of_silence
            if quiet_enough and (was_loud_enough or recorded_too_much_silence):
                phrase_complete = True

            # Pressing top-button will end recording immediately
            if check_for_signal('buttonPress'):
                phrase_complete = True

        return byte_data
Example #21
    def test_is_speaking(self):
        create_signal('isSpeaking')
        self.assertTrue(mycroft.audio.is_speaking())
        # Check that the signal hasn't been removed
        self.assertTrue(check_for_signal('isSpeaking'))
        self.assertFalse(mycroft.audio.is_speaking())
Example #22
    def _record_phrase(self,
                       source,
                       sec_per_buffer,
                       stream=None,
                       ww_frames=None):
        """Record an entire spoken phrase.

        Essentially, this code waits for a period of silence and then returns
        the audio.  If silence isn't detected, it will terminate and return
        a buffer of self.recording_timeout duration.

        Args:
            source (AudioSource):  Source producing the audio chunks
            sec_per_buffer (float):  Fractional number of seconds in each chunk
            stream (AudioStreamHandler): Stream target that will receive chunks
                                         of the utterance audio while it is
                                         being recorded.
            ww_frames (deque):  Frames of audio data from the last part of wake
                                word detection.

        Returns:
            bytearray: complete audio buffer recorded, including any
                       silence at the end of the user's utterance
        """
        noise_tracker = NoiseTracker(0, 25, sec_per_buffer,
                                     self.MIN_LOUD_SEC_PER_PHRASE,
                                     self.recording_timeout_with_silence)

        # Maximum number of chunks to record before timing out
        max_chunks = int(self.recording_timeout / sec_per_buffer)
        num_chunks = 0

        # bytearray to store audio in, initialized with a single sample of
        # silence.
        byte_data = get_silence(source.SAMPLE_WIDTH)

        if stream:
            stream.stream_start()

        phrase_complete = False
        while num_chunks < max_chunks and not phrase_complete:
            if ww_frames:
                chunk = ww_frames.popleft()
            else:
                chunk = self.record_sound_chunk(source)
            byte_data += chunk
            num_chunks += 1

            if stream:
                stream.stream_chunk(chunk)

            energy = self.calc_energy(chunk, source.SAMPLE_WIDTH)
            test_threshold = self.energy_threshold * self.multiplier
            is_loud = energy > test_threshold
            noise_tracker.update(is_loud)
            if not is_loud:
                self._adjust_threshold(energy, sec_per_buffer)

            # The phrase is complete if the noise_tracker end-of-sentence
            # criteria are met or if the top button is pressed
            phrase_complete = (noise_tracker.recording_complete()
                               or check_for_signal('buttonPress'))

            # Periodically write the energy level to the mic level file.
            if num_chunks % 10 == 0:
                self._watchdog()
                self.write_mic_level(energy, source)

        return byte_data
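This version refactors the inline noise bookkeeping of Examples #14 and #20 into a NoiseTracker class. A minimal sketch reconstructed from that earlier inline logic, matching the constructor call above but omitting the minimum-silence-at-end handling for brevity (the real class ships with Mycroft's listener and may differ):

class NoiseTracker:
    """Tracks a noise level to decide when an utterance has ended."""

    def __init__(self, minimum, maximum, sec_per_buffer,
                 loud_time_limit, silence_time_limit):
        self.min_level = minimum
        self.max_level = maximum
        self.sec_per_buffer = sec_per_buffer
        self.level = minimum
        self.num_loud_chunks = 0
        self.num_chunks = 0
        # Loud chunks required before silence may end the recording
        self.min_loud_chunks = int(loud_time_limit / sec_per_buffer)
        # Chunk count after which silence alone ends the recording
        self.max_chunks_of_silence = int(silence_time_limit / sec_per_buffer)

    def update(self, is_loud):
        """Raise the level on a loud chunk, decay it on a quiet one."""
        self.num_chunks += 1
        if is_loud:
            self.level = min(self.level + 200 * self.sec_per_buffer,
                             self.max_level)
            self.num_loud_chunks += 1
        else:
            self.level = max(self.level - 100 * self.sec_per_buffer,
                             self.min_level)

    def recording_complete(self):
        """True once it is quiet again after enough (or too much) audio."""
        quiet_enough = self.level <= self.min_level
        was_loud_enough = self.num_loud_chunks > self.min_loud_chunks
        too_much_silence = self.num_chunks > self.max_chunks_of_silence
        return quiet_enough and (was_loud_enough or too_much_silence)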
Example #23
    def end_audio(self):
        """Helper function for child classes to call in execute()"""
        self.ws.emit(Message("recognizer_loop:audio_output_end"))

        # This check will clear the "signal"
        check_for_signal("isSpeaking")
Example #24
def handle_speak(event):
    """
        Handle "speak" message
    """
    config = Configuration.get()
    Configuration.init(bus)
    global _last_stop_signal

    # Get conversation ID
    if event.context and 'ident' in event.context:
        ident = event.context['ident']
    else:
        ident = 'unknown'

    start = time.time()  # Time of speech request
    with lock:
        stopwatch = Stopwatch()
        stopwatch.start()
        utterance = event.data['utterance']
        if event.data.get('expect_response', False):
            # When expect_response is requested, the listener will be restarted
            # at the end of the next bit of spoken audio.
            bus.once('recognizer_loop:audio_output_end', _start_listener)

        # This is a bit of a hack for Picroft.  The analog audio on a Pi blocks
        # for 30 seconds fairly often, so we don't want to break on periods
        # (decreasing the chance of encountering the block).  But we will
        # keep the split for non-Picroft installs since it gives user feedback
        # faster on longer phrases.
        #
        # TODO: Remove or make an option?  This is really a hack, anyway,
        # so we likely will want to get rid of this when not running on Mimic
        if (config.get('enclosure', {}).get('platform') != "picroft"
                and len(re.findall('<[^>]*>', utterance)) == 0):
            # Remove any whitespace present after the period,
            # if a character (only alpha) ends with a period
            # ex: A. Lincoln -> A.Lincoln
            # so that we don't split at the period
            utterance = re.sub(r'\b([A-Za-z][\.])(\s+)', r'\g<1>', utterance)
            chunks = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\;|\?)\s',
                              utterance)
            for chunk in chunks:
                # Check if something has aborted the speech
                if (_last_stop_signal > start
                        or check_for_signal('buttonPress')):
                    # Clear any newly queued speech
                    tts.playback.clear()
                    break
                try:
                    mute_and_speak(chunk, ident)
                except KeyboardInterrupt:
                    raise
                except Exception:
                    LOG.error('Error in mute_and_speak', exc_info=True)
        else:
            mute_and_speak(utterance, ident)

        stopwatch.stop()
    report_timing(ident, 'speech', stopwatch, {
        'utterance': utterance,
        'tts': tts.__class__.__name__
    })
Example #25
    def _wait_until_wake_word(self, source, sec_per_buffer):
        """Listen continuously on source until a wake word is spoken

        Args:
            source (AudioSource):  Source producing the audio chunks
            sec_per_buffer (float):  Fractional number of seconds in each chunk
        """
        num_silent_bytes = int(self.SILENCE_SEC * source.SAMPLE_RATE *
                               source.SAMPLE_WIDTH)

        silence = '\0' * num_silent_bytes

        # bytearray to store audio in
        byte_data = silence

        buffers_per_check = self.SEC_BETWEEN_WW_CHECKS / sec_per_buffer
        buffers_since_check = 0.0

        # Max bytes for byte_data before audio is removed from the front
        max_size = self.sec_to_bytes(self.SAVED_WW_SEC, source)

        said_wake_word = False

        # Rolling buffer to track the audio energy (loudness) heard on
        # the source recently.  An average audio energy is maintained
        # based on these levels.
        energies = []
        idx_energy = 0
        avg_energy = 0.0
        energy_avg_samples = int(5 / sec_per_buffer)  # avg over last 5 secs

        counter = 0

        while not said_wake_word:
            if check_for_signal('buttonPress'):
                said_wake_word = True
                continue

            chunk = self.record_sound_chunk(source)

            energy = self.calc_energy(chunk, source.SAMPLE_WIDTH)
            if energy < self.energy_threshold * self.multiplier:
                self._adjust_threshold(energy, sec_per_buffer)

            if len(energies) < energy_avg_samples:
                # build the average
                energies.append(energy)
                avg_energy += float(energy)/energy_avg_samples
            else:
                # maintain the running average and rolling buffer
                avg_energy -= float(energies[idx_energy])/energy_avg_samples
                avg_energy += float(energy)/energy_avg_samples
                energies[idx_energy] = energy
                idx_energy = (idx_energy+1) % energy_avg_samples

                # maintain the threshold using average
                if energy < avg_energy*1.5:
                    if energy > self.energy_threshold:
                        # bump the threshold to just above this value
                        self.energy_threshold = energy*1.2

            # Periodically output energy level stats.  This can be used to
            # visualize the microphone input, e.g. a needle on a meter.
            if counter % 3:
                with open(self.mic_level_file, 'w') as f:
                    f.write("Energy:  cur=" + str(energy) + " thresh=" +
                            str(self.energy_threshold))
            counter += 1

            # At first, the buffer is empty and must fill up.  After that
            # just drop the first chunk bytes to keep it the same size.
            needs_to_grow = len(byte_data) < max_size
            if needs_to_grow:
                byte_data += chunk
            else:  # Remove beginning of audio and add new chunk to end
                byte_data = byte_data[len(chunk):] + chunk

            buffers_since_check += 1.0
            if buffers_since_check > buffers_per_check:
                buffers_since_check -= buffers_per_check
                said_wake_word = self.wake_word_in_audio(byte_data + silence)
Example #26
    def record_phrase(self, source, sec_per_buffer):
        """
        This attempts to record an entire spoken phrase. Essentially,
        this waits for a period of silence and then returns the audio

        :rtype: bytearray
        :param source: AudioSource
        :param sec_per_buffer: Based on source.SAMPLE_RATE
        :return: bytearray representing the frame_data of the recorded phrase
        """
        num_loud_chunks = 0
        noise = 0

        max_noise = 25
        min_noise = 0

        def increase_noise(level):
            if level < max_noise:
                return level + 200 * sec_per_buffer
            return level

        def decrease_noise(level):
            if level > min_noise:
                return level - 100 * sec_per_buffer
            return level

        # Smallest number of loud chunks required to return
        min_loud_chunks = int(self.MIN_LOUD_SEC_PER_PHRASE / sec_per_buffer)

        # Maximum number of chunks to record before timing out
        max_chunks = int(self.RECORDING_TIMEOUT / sec_per_buffer)
        num_chunks = 0

        # Will return once this is exceeded, even without enough loud chunks
        max_chunks_of_silence = int(self.RECORDING_TIMEOUT_WITH_SILENCE /
                                    sec_per_buffer)

        # bytearray to store audio in
        byte_data = '\0' * source.SAMPLE_WIDTH

        phrase_complete = False
        while num_chunks < max_chunks and not phrase_complete:
            chunk = self.record_sound_chunk(source)
            byte_data += chunk
            num_chunks += 1

            energy = self.calc_energy(chunk, source.SAMPLE_WIDTH)
            test_threshold = self.energy_threshold * self.multiplier
            is_loud = energy > test_threshold
            if is_loud:
                noise = increase_noise(noise)
                num_loud_chunks += 1
            else:
                noise = decrease_noise(noise)
                self.adjust_threshold(energy, sec_per_buffer)

            was_loud_enough = num_loud_chunks > min_loud_chunks
            quiet_enough = noise <= min_noise
            recorded_too_much_silence = num_chunks > max_chunks_of_silence
            if quiet_enough and (was_loud_enough or recorded_too_much_silence):
                phrase_complete = True
            if check_for_signal('buttonPress'):
                phrase_complete = True

        return byte_data
Example #29
    def _wait_until_wake_word(self, source, sec_per_buffer):
        """Listen continuously on source until a wake word is spoken

        Args:
            source (AudioSource):  Source producing the audio chunks
            sec_per_buffer (float):  Fractional number of seconds in each chunk
        """
        num_silent_bytes = int(self.SILENCE_SEC * source.SAMPLE_RATE *
                               source.SAMPLE_WIDTH)

        silence = '\0' * num_silent_bytes

        # bytearray to store audio in
        byte_data = silence

        buffers_per_check = self.SEC_BETWEEN_WW_CHECKS / sec_per_buffer
        buffers_since_check = 0.0

        # Max bytes for byte_data before audio is removed from the front
        max_size = self.sec_to_bytes(self.SAVED_WW_SEC, source)

        said_wake_word = False

        # Rolling buffer to track the audio energy (loudness) heard on
        # the source recently.  An average audio energy is maintained
        # based on these levels.
        energies = []
        idx_energy = 0
        avg_energy = 0.0
        energy_avg_samples = int(5 / sec_per_buffer)  # avg over last 5 secs

        counter = 0

        while not said_wake_word:
            if check_for_signal('buttonPress'):
                said_wake_word = True
                continue

            chunk = self.record_sound_chunk(source)

            energy = self.calc_energy(chunk, source.SAMPLE_WIDTH)
            if energy < self.energy_threshold * self.multiplier:
                self._adjust_threshold(energy, sec_per_buffer)

            if len(energies) < energy_avg_samples:
                # build the average
                energies.append(energy)
                avg_energy += float(energy) / energy_avg_samples
            else:
                # maintain the running average and rolling buffer
                avg_energy -= float(energies[idx_energy]) / energy_avg_samples
                avg_energy += float(energy) / energy_avg_samples
                energies[idx_energy] = energy
                idx_energy = (idx_energy + 1) % energy_avg_samples

                # maintain the threshold using average
                if energy < avg_energy * 1.5:
                    if energy > self.energy_threshold:
                        # bump the threshold to just above this value
                        self.energy_threshold = energy * 1.2

            # Periodically output energy level stats.  This can be used to
            # visualize the microphone input, e.g. a needle on a meter.
            if counter % 3:
                with open(self.mic_level_file, 'w') as f:
                    f.write("Energy:  cur=" + str(energy) + " thresh=" +
                            str(self.energy_threshold))
            counter += 1

            # At first, the buffer is empty and must fill up.  After that
            # just drop the first chunk bytes to keep it the same size.
            needs_to_grow = len(byte_data) < max_size
            if needs_to_grow:
                byte_data += chunk
            else:  # Remove beginning of audio and add new chunk to end
                byte_data = byte_data[len(chunk):] + chunk

            buffers_since_check += 1.0
            if buffers_since_check > buffers_per_check:
                buffers_since_check -= buffers_per_check
                said_wake_word = self.wake_word_in_audio(byte_data + silence)