def test_wait_while_speaking(self):
    # Check that test terminates
    create_signal('isSpeaking')
    Thread(target=wait_while_speaking_thread).start()
    sleep(2)
    self.assertFalse(done_waiting)
    check_for_signal('isSpeaking')
    sleep(2)
    self.assertTrue(done_waiting)
def test_check_signal(self):
    if exists('/tmp/mycroft'):
        rmtree('/tmp/mycroft')
    # Check that signal is not found if file does not exist
    self.assertFalse(check_for_signal('test_signal'))
    # Check that the signal is found when created
    create_signal('test_signal')
    self.assertTrue(check_for_signal('test_signal'))
    # Check that the signal is removed after use
    self.assertFalse(isfile('/tmp/mycroft/ipc/signal/test_signal'))
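# The two tests above exercise Mycroft's file-based IPC signals. As a rough
# sketch of the mechanism (not the exact library source): a signal is just a
# marker file under the IPC directory, created by create_signal() and consumed
# by check_for_signal(). The directory layout follows the path the test
# asserts against; the expiry handling shown here is an assumption.

import os
import time


def _sketch_create_signal(signal_name, ipc_dir='/tmp/mycroft/ipc'):
    """Create an empty marker file representing the named signal."""
    path = os.path.join(ipc_dir, 'signal', signal_name)
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, 'w'):
        pass  # the file's existence is the signal; content is unused
    return True


def _sketch_check_for_signal(signal_name, sec_lifetime=0,
                             ipc_dir='/tmp/mycroft/ipc'):
    """Return True if the signal exists.

    sec_lifetime=0 makes it single-use (consumed on read), -1 makes it
    persistent, and a positive value expires it after that many seconds.
    """
    path = os.path.join(ipc_dir, 'signal', signal_name)
    if not os.path.isfile(path):
        return False
    if sec_lifetime == 0:
        os.remove(path)  # single-use: consume the signal
    elif sec_lifetime > 0 and \
            os.path.getmtime(path) + sec_lifetime < time.time():
        os.remove(path)  # expired
        return False
    return True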
def end_audio(self):
    """Helper function for child classes to call in execute().

    Sends the recognizer_loop:audio_output_end message, indicating
    that speaking is done for the moment. It also checks if the cache
    directory needs cleaning to free up disk space.
    """
    self.bus.emit(Message("recognizer_loop:audio_output_end"))

    # Clean the cache as needed
    cache_dir = mycroft.util.get_cache_directory("tts/" + self.tts_name)
    mycroft.util.curate_cache(cache_dir, min_free_percent=100)

    # This check will clear the "signal"
    check_for_signal("isSpeaking")
def main():
    """Main function. Run when file is invoked."""
    reset_sigint_handler()
    check_for_signal("isSpeaking")
    bus = WebsocketClient()  # Connect to the Mycroft Messagebus
    Configuration.init(bus)
    speech.init(bus)

    LOG.info("Starting Audio Services")
    bus.on('message', create_echo_function('AUDIO',
                                           ['mycroft.audio.service']))
    audio = AudioService(bus)  # Connect audio service instance to message bus
    create_daemon(bus.run_forever)

    wait_for_exit_signal()

    speech.shutdown()
    audio.shutdown()
def handle_stop(event):
    """Handle stop message."""
    global _last_stop_signal
    if check_for_signal("isSpeaking", -1):
        _last_stop_signal = time.time()
        tts.playback.clear()  # Clear here to get instant stop
        bus.emit(Message("mycroft.stop.handled", {"by": "TTS"}))
def handle_stop(event):
    """Handle stop message."""
    global _last_stop_signal
    if check_for_signal("isSpeaking", -1):
        _last_stop_signal = time.time()
        tts.playback.clear_queue()
        tts.playback.clear_visimes()
def _skip_wake_word(self):
    # Check if told programmatically to skip the wake word, like
    # when we are in a dialog with the user.
    if check_for_signal('startListening'):
        return True

    # Pressing the Mark 1 button can start recording (unless
    # it is being used to mean 'stop' instead)
    if check_for_signal('buttonPress', 1):
        # Give other processes time to consume this signal if
        # it was meant to be a 'stop'
        sleep(0.25)
        if check_for_signal('buttonPress'):
            # Signal is still here, assume it was intended to
            # begin recording
            logger.debug("Button Pressed, wakeword not needed")
            return True

    return False
def handle_stop(event):
    """Handle stop message."""
    global _last_stop_signal
    if check_for_signal("isSpeaking", -1):
        _last_stop_signal = time.time()
        tts.playback.clear_queue()
        tts.playback.clear_visimes()
        bus.emit(Message("mycroft.stop.handled", {"by": "TTS"}))
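# A note on the handle_stop() variants above: each is a messagebus handler,
# registered in mycroft-core against the 'mycroft.stop' message that is
# broadcast when the user asks Mycroft to stop. A minimal wiring sketch
# (registration point shown for illustration):
#
#     bus.on('mycroft.stop', handle_stop)
#
# Because 'isSpeaking' is checked with sec_lifetime=-1, the handler only
# observes the signal without consuming it; clearing is left to end_audio()
# (elsewhere in this section) once audio output actually ends.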
def handle_speak(event):
    """Handle "speak" message."""
    config = Configuration.get()
    Configuration.init(bus)
    global _last_stop_signal

    # Get conversation ID
    if event.context and 'ident' in event.context:
        ident = event.context['ident']
    else:
        ident = 'unknown'

    start = time.time()  # Time of speech request
    with lock:
        stopwatch = Stopwatch()
        stopwatch.start()
        utterance = event.data['utterance']
        if event.data.get('expect_response', False):
            # When expect_response is requested, the listener will be
            # restarted at the end of the next bit of spoken audio.
            bus.once('recognizer_loop:audio_output_end', _start_listener)

        # This is a bit of a hack for Picroft. The analog audio on a Pi
        # blocks for 30 seconds fairly often, so we don't want to break on
        # periods (decreasing the chance of encountering the block). But we
        # will keep the split for non-Picroft installs since it gives user
        # feedback faster on longer phrases.
        #
        # TODO: Remove or make an option? This is really a hack, anyway,
        # so we likely will want to get rid of this when not running on Mimic
        if (config.get('enclosure', {}).get('platform') != "picroft" and
                len(re.findall('<[^>]*>', utterance)) == 0):
            chunks = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\;|\?)\s',
                              utterance)
            for chunk in chunks:
                # Check if something has aborted the speech
                if (_last_stop_signal > start or
                        check_for_signal('buttonPress')):
                    # Clear any newly queued speech
                    tts.playback.clear()
                    break
                try:
                    mute_and_speak(chunk, ident)
                except KeyboardInterrupt:
                    raise
                except Exception:
                    LOG.error('Error in mute_and_speak', exc_info=True)
        else:
            mute_and_speak(utterance, ident)

        stopwatch.stop()
    report_timing(ident, 'speech', stopwatch,
                  {'utterance': utterance, 'tts': tts.__class__.__name__})
def handle_speak(event):
    global _last_stop_signal

    # Mild abuse of the signal system to allow other processes to detect
    # when TTS is happening. See mycroft.util.is_speaking()
    create_signal("isSpeaking")

    utterance = event.data['utterance']
    expect_response = event.data.get('expect_response', False)

    # This is a bit of a hack for Picroft. The analog audio on a Pi blocks
    # for 30 seconds fairly often, so we don't want to break on periods
    # (decreasing the chance of encountering the block). But we will
    # keep the split for non-Picroft installs since it gives user feedback
    # faster on longer phrases.
    #
    # TODO: Remove or make an option? This is really a hack, anyway,
    # so we likely will want to get rid of this when not running on Mimic
    if not config.get('enclosure', {}).get('platform') == "picroft":
        start = time.time()
        chunks = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s',
                          utterance)
        for chunk in chunks:
            try:
                mute_and_speak(chunk)
            except KeyboardInterrupt:
                raise
            except Exception:
                logger.error('Error in mute_and_speak', exc_info=True)
            if _last_stop_signal > start or check_for_signal('buttonPress'):
                break
    else:
        mute_and_speak(utterance)

    # This check will clear the "signal"
    check_for_signal("isSpeaking")

    if expect_response:
        create_signal('startListening')
def show_visimes(self, pairs):
    """Send visime data to enclosure

    Args:
        pairs (list): Visime and timing pairs

    Returns:
        True if button has been pressed.
    """
    start = time()
    for code, duration in pairs:
        if check_for_signal('stoppingTTS', -1):
            return True
        if check_for_signal('buttonPress'):
            return True
        if self.enclosure:
            self.enclosure.mouth_viseme(code)
        delta = time() - start
        if delta < duration:
            sleep(duration - delta)
    return False
def visime(self, output):
    start = time()
    pairs = output.split(" ")
    for pair in pairs:
        if check_for_signal('buttonPress'):
            return
        pho_dur = pair.split(":")  # phoneme:duration
        if len(pho_dur) == 2:
            code = VISIMES.get(pho_dur[0], '4')
            self.enclosure.mouth_viseme(code)
            duration = float(pho_dur[1])
            delta = time() - start
            if delta < duration:
                sleep(duration - delta)
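# Example input for visime() above: a whitespace-separated string of
# phoneme:time pairs produced by the TTS engine. Since 'start' is captured
# once at function entry and compared against each pair's time, the times
# must be cumulative end times within the utterance, not per-phoneme
# lengths, for the pacing loop to work. Illustrative values only:
#
#     self.visime("HH:0.12 AH:0.23 L:0.41 OW:0.60")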
def wait_until_wake_word(self, source, sec_per_buffer):
    num_silent_bytes = int(self.SILENCE_SEC * source.SAMPLE_RATE *
                           source.SAMPLE_WIDTH)

    silence = '\0' * num_silent_bytes

    # bytearray to store audio in
    byte_data = silence

    buffers_per_check = self.SEC_BETWEEN_WW_CHECKS / sec_per_buffer
    buffers_since_check = 0.0

    # Max bytes for byte_data before audio is removed from the front
    max_size = self.sec_to_bytes(self.SAVED_WW_SEC, source)

    said_wake_word = False
    while not said_wake_word:
        if check_for_signal('buttonPress'):
            said_wake_word = True
            continue

        chunk = self.record_sound_chunk(source)

        energy = self.calc_energy(chunk, source.SAMPLE_WIDTH)
        if energy < self.energy_threshold * self.multiplier:
            self.adjust_threshold(energy, sec_per_buffer)

        needs_to_grow = len(byte_data) < max_size
        if needs_to_grow:
            byte_data += chunk
        else:  # Remove beginning of audio and add new chunk to end
            byte_data = byte_data[len(chunk):] + chunk

        buffers_since_check += 1.0
        # Only run the wake word check once enough new audio has accumulated
        if buffers_since_check > buffers_per_check:
            buffers_since_check -= buffers_per_check
            said_wake_word = self.wake_word_in_audio(byte_data + silence)
def _record_phrase(self, source, sec_per_buffer):
    """Record an entire spoken phrase.

    Essentially, this code waits for a period of silence and then returns
    the audio.  If silence isn't detected, it will terminate and return a
    buffer of RECORDING_TIMEOUT duration.

    Args:
        source (AudioSource):  Source producing the audio chunks
        sec_per_buffer (float):  Fractional number of seconds in each chunk

    Returns:
        bytearray: complete audio buffer recorded, including any
                   silence at the end of the user's utterance
    """
    num_loud_chunks = 0
    noise = 0

    max_noise = 25
    min_noise = 0

    silence_duration = 0

    def increase_noise(level):
        if level < max_noise:
            return level + 200 * sec_per_buffer
        return level

    def decrease_noise(level):
        if level > min_noise:
            return level - 100 * sec_per_buffer
        return level

    # Smallest number of loud chunks required to return
    min_loud_chunks = int(self.MIN_LOUD_SEC_PER_PHRASE / sec_per_buffer)

    # Maximum number of chunks to record before timing out
    max_chunks = int(self.RECORDING_TIMEOUT / sec_per_buffer)
    num_chunks = 0

    # Will return if exceeded this even if there's not enough loud chunks
    max_chunks_of_silence = int(self.RECORDING_TIMEOUT_WITH_SILENCE /
                                sec_per_buffer)

    # bytearray to store audio in
    byte_data = '\0' * source.SAMPLE_WIDTH

    phrase_complete = False
    while num_chunks < max_chunks and not phrase_complete:
        chunk = self.record_sound_chunk(source)
        byte_data += chunk
        num_chunks += 1

        energy = self.calc_energy(chunk, source.SAMPLE_WIDTH)
        test_threshold = self.energy_threshold * self.multiplier
        is_loud = energy > test_threshold
        if is_loud:
            noise = increase_noise(noise)
            num_loud_chunks += 1
        else:
            noise = decrease_noise(noise)
            self._adjust_threshold(energy, sec_per_buffer)

        if num_chunks % 10 == 0:
            with open(self.mic_level_file, 'w') as f:
                f.write("Energy: cur=" + str(energy) +
                        " thresh=" + str(self.energy_threshold))

        was_loud_enough = num_loud_chunks > min_loud_chunks
        quiet_enough = noise <= min_noise
        if quiet_enough:
            silence_duration += sec_per_buffer
            if silence_duration < self.MIN_SILENCE_AT_END:
                quiet_enough = False  # gotta be silent for min of 1/4 sec
        else:
            silence_duration = 0
        recorded_too_much_silence = num_chunks > max_chunks_of_silence
        if quiet_enough and (was_loud_enough or recorded_too_much_silence):
            phrase_complete = True

        # Pressing top-button will end recording immediately
        if check_for_signal('buttonPress'):
            phrase_complete = True

    return byte_data
def _record_phrase(self, source, sec_per_buffer):
    """Record an entire spoken phrase.

    Essentially, this code waits for a period of silence and then returns
    the audio.  If silence isn't detected, it will terminate and return a
    buffer of RECORDING_TIMEOUT duration.

    Args:
        source (AudioSource):  Source producing the audio chunks
        sec_per_buffer (float):  Fractional number of seconds in each chunk

    Returns:
        bytearray: complete audio buffer recorded, including any
                   silence at the end of the user's utterance
    """
    num_loud_chunks = 0
    noise = 0

    max_noise = 25
    min_noise = 0

    silence_duration = 0

    def increase_noise(level):
        if level < max_noise:
            return level + 200 * sec_per_buffer
        return level

    def decrease_noise(level):
        if level > min_noise:
            return level - 100 * sec_per_buffer
        return level

    # Smallest number of loud chunks required to return
    min_loud_chunks = int(self.MIN_LOUD_SEC_PER_PHRASE / sec_per_buffer)

    # Maximum number of chunks to record before timing out
    max_chunks = int(self.RECORDING_TIMEOUT / sec_per_buffer)
    num_chunks = 0

    # Will return if exceeded this even if there's not enough loud chunks
    max_chunks_of_silence = int(self.RECORDING_TIMEOUT_WITH_SILENCE /
                                sec_per_buffer)

    # bytearray to store audio in
    byte_data = get_silence(source.SAMPLE_WIDTH)

    phrase_complete = False
    while num_chunks < max_chunks and not phrase_complete:
        chunk = self.record_sound_chunk(source)
        byte_data += chunk
        num_chunks += 1

        energy = self.calc_energy(chunk, source.SAMPLE_WIDTH)
        test_threshold = self.energy_threshold * self.multiplier
        is_loud = energy > test_threshold
        if is_loud:
            noise = increase_noise(noise)
            num_loud_chunks += 1
        else:
            noise = decrease_noise(noise)
            self._adjust_threshold(energy, sec_per_buffer)

        if num_chunks % 10 == 0:
            with open(self.mic_level_file, 'w') as f:
                f.write("Energy: cur=" + str(energy) +
                        " thresh=" + str(self.energy_threshold))

        was_loud_enough = num_loud_chunks > min_loud_chunks
        quiet_enough = noise <= min_noise
        if quiet_enough:
            silence_duration += sec_per_buffer
            if silence_duration < self.MIN_SILENCE_AT_END:
                quiet_enough = False  # gotta be silent for min of 1/4 sec
        else:
            silence_duration = 0
        recorded_too_much_silence = num_chunks > max_chunks_of_silence
        if quiet_enough and (was_loud_enough or recorded_too_much_silence):
            phrase_complete = True

        # Pressing top-button will end recording immediately
        if check_for_signal('buttonPress'):
            phrase_complete = True

    return byte_data
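# A tiny, self-contained check of the noise hysteresis shared by the
# _record_phrase() variants above. The buffer duration (0.05 s) is an
# illustrative assumption; the increase/decrease rates and bounds are the
# ones used in the code (+200/s per loud chunk capped at 25, -100/s per
# quiet chunk floored at 0).

def _sketch_quiet_chunks_to_silence(sec_per_buffer=0.05, max_noise=25):
    """How many quiet chunks until the noise level decays back to zero."""
    noise = float(max_noise)  # level after sustained speech
    chunks = 0
    while noise > 0:
        noise -= 100 * sec_per_buffer  # decrease_noise(): -5 per quiet chunk
        chunks += 1
    return chunks


# 5 quiet chunks * 0.05 s = 0.25 s of trailing silence, which lines up with
# the quarter-second MIN_SILENCE_AT_END check in the loop above.
assert _sketch_quiet_chunks_to_silence() == 5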
def handle_speak(event):
    """Handle "speak" message

    Parse sentences and invoke text to speech service.
    """
    config = Configuration.get()
    Configuration.set_config_update_handlers(bus)
    global _last_stop_signal

    # If the message is targeted and audio is not the target,
    # don't synthesize speech
    event.context = event.context or {}
    if event.context.get('destination') and not \
            ('debug_cli' in event.context['destination'] or
             'audio' in event.context['destination']):
        return

    # Get conversation ID
    if event.context and 'ident' in event.context:
        ident = event.context['ident']
    else:
        ident = 'unknown'

    start = time.time()  # Time of speech request
    with lock:
        stopwatch = Stopwatch()
        stopwatch.start()

        play_error = _should_play_error(event.data)
        utterance = event.data['utterance']
        listen = event.data.get('expect_response', False)

        # This is a bit of a hack for Picroft. The analog audio on a Pi
        # blocks for 30 seconds fairly often, so we don't want to break on
        # periods (decreasing the chance of encountering the block). But we
        # will keep the split for non-Picroft installs since it gives user
        # feedback faster on longer phrases.
        #
        # TODO: Remove or make an option? This is really a hack, anyway,
        # so we likely will want to get rid of this when not running on Mimic
        if (config.get('enclosure', {}).get('platform') != "picroft" and
                len(re.findall('<[^>]*>', utterance)) == 0) and \
                not play_error:
            # Remove any whitespace present after the period,
            # if a character (only alpha) ends with a period
            # ex: A. Lincoln -> A.Lincoln
            # so that we don't split at the period
            utterance = re.sub(r'\b([A-Za-z][\.])(\s+)', r'\g<1>', utterance)
            chunks = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\;|\?)\s',
                              utterance)
            # Apply the listen flag to the last chunk, set the rest to False
            chunks = [(chunks[i], listen if i == len(chunks) - 1 else False)
                      for i in range(len(chunks))]
            for chunk, listen in chunks:
                # Check if something has aborted the speech
                if (_last_stop_signal > start or
                        check_for_signal('buttonPress')):
                    # Clear any newly queued speech
                    tts.playback.clear()
                    break
                try:
                    mute_and_speak(chunk, ident, listen)
                except KeyboardInterrupt:
                    raise
                except Exception:
                    LOG.error('Error in mute_and_speak', exc_info=True)
        else:
            mute_and_speak(utterance, ident, listen, play_error)

        stopwatch.stop()
    report_timing(ident, 'speech', stopwatch,
                  {'utterance': utterance, 'tts': tts.__class__.__name__})
def on_stop_handled(self, event):
    # A skill performed a stop
    check_for_signal('buttonPress')
def record_phrase(self, source, sec_per_buffer):
    """
    This attempts to record an entire spoken phrase. Essentially,
    this waits for a period of silence and then returns the audio

    :rtype: bytearray
    :param source: AudioSource
    :param sec_per_buffer: Based on source.SAMPLE_RATE
    :return: bytearray representing the frame_data of the recorded phrase
    """
    num_loud_chunks = 0
    noise = 0

    max_noise = 25
    min_noise = 0

    def increase_noise(level):
        if level < max_noise:
            return level + 200 * sec_per_buffer
        return level

    def decrease_noise(level):
        if level > min_noise:
            return level - 100 * sec_per_buffer
        return level

    # Smallest number of loud chunks required to return
    min_loud_chunks = int(self.MIN_LOUD_SEC_PER_PHRASE / sec_per_buffer)

    # Maximum number of chunks to record before timing out
    max_chunks = int(self.RECORDING_TIMEOUT / sec_per_buffer)
    num_chunks = 0

    # Will return if exceeded this even if there's not enough loud chunks
    max_chunks_of_silence = int(self.RECORDING_TIMEOUT_WITH_SILENCE /
                                sec_per_buffer)

    # bytearray to store audio in
    byte_data = '\0' * source.SAMPLE_WIDTH

    phrase_complete = False
    while num_chunks < max_chunks and not phrase_complete:
        chunk = self.record_sound_chunk(source)
        byte_data += chunk
        num_chunks += 1

        energy = self.calc_energy(chunk, source.SAMPLE_WIDTH)
        test_threshold = self.energy_threshold * self.threshold_multiplier
        is_loud = energy > test_threshold
        if is_loud:
            noise = increase_noise(noise)
            num_loud_chunks += 1
        else:
            noise = decrease_noise(noise)
            self.adjust_threshold(energy, sec_per_buffer)

        was_loud_enough = num_loud_chunks > min_loud_chunks
        quiet_enough = noise <= min_noise
        recorded_too_much_silence = num_chunks > max_chunks_of_silence
        if quiet_enough and (was_loud_enough or recorded_too_much_silence):
            phrase_complete = True

        if check_for_signal('buttonPress'):
            phrase_complete = True

    return byte_data
def handle_speak(event):
    """Handle "speak" message

    Parse sentences and invoke text to speech service.
    """
    # /home/insitelabdev/mycroft-core/mycroft/client/speech/set_config.txt
    path = pathlib.Path().absolute()
    settings_file = open(
        str(path) + '/mycroft/client/speech/set_config.txt', 'r')
    settings_dict = eval(settings_file.read())

    config = Configuration.get()
    Configuration.set_config_update_handlers(bus)
    global _last_stop_signal

    # Get conversation ID
    if event.context and 'ident' in event.context:
        ident = event.context['ident']
    else:
        ident = 'unknown'

    start = time.time()  # Time of speech request
    with lock:
        stopwatch = Stopwatch()
        stopwatch.start()
        utterance = event.data['utterance']
        listen = event.data.get('expect_response', False)

        # This is a bit of a hack for Picroft. The analog audio on a Pi
        # blocks for 30 seconds fairly often, so we don't want to break on
        # periods (decreasing the chance of encountering the block). But we
        # will keep the split for non-Picroft installs since it gives user
        # feedback faster on longer phrases.
        #
        # TODO: Remove or make an option? This is really a hack, anyway,
        # so we likely will want to get rid of this when not running on Mimic
        if (config.get('enclosure', {}).get('platform') != "picroft" and
                len(re.findall('<[^>]*>', utterance)) == 0):
            # Remove any whitespace present after the period,
            # if a character (only alpha) ends with a period
            # ex: A. Lincoln -> A.Lincoln
            # so that we don't split at the period
            utterance = re.sub(r'\b([A-Za-z][\.])(\s+)', r'\g<1>', utterance)
            chunks = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\;|\?)\s',
                              utterance)
            # Apply the listen flag to the last chunk, set the rest to False
            chunks = [(chunks[i], listen if i == len(chunks) - 1 else False)
                      for i in range(len(chunks))]
            for chunk, listen in chunks:
                # Check if something has aborted the speech
                if (_last_stop_signal > start or
                        check_for_signal('buttonPress')):
                    # Clear any newly queued speech
                    tts.playback.clear()
                    break
                try:
                    # ana added this
                    LOG.info("*******THIS IS THE SPEECH SPEED: " +
                             settings_dict["rate"])
                    mute_and_speak(
                        "<speak><prosody volume= " + settings_dict["volume"] +
                        " rate= " + settings_dict["rate"] + ">" + chunk +
                        "</prosody></speak>", ident, listen)
                except KeyboardInterrupt:
                    raise
                except Exception:
                    LOG.error('Error in mute_and_speak', exc_info=True)
        else:
            mute_and_speak(utterance, ident, listen)

        stopwatch.stop()
    report_timing(ident, 'speech', stopwatch,
                  {'utterance': utterance, 'tts': tts.__class__.__name__})
    settings_file.close()
def _record_phrase(self, source, sec_per_buffer, stream=None,
                   ww_frames=None):
    """Record an entire spoken phrase.

    Essentially, this code waits for a period of silence and then returns
    the audio.  If silence isn't detected, it will terminate and return a
    buffer of self.recording_timeout duration.

    Args:
        source (AudioSource):  Source producing the audio chunks
        sec_per_buffer (float):  Fractional number of seconds in each chunk
        stream (AudioStreamHandler): Stream target that will receive chunks
                                     of the utterance audio while it is
                                     being recorded.
        ww_frames (deque):  Frames of audio data from the last part of wake
                            word detection.

    Returns:
        bytearray: complete audio buffer recorded, including any
                   silence at the end of the user's utterance
    """
    num_loud_chunks = 0
    noise = 0

    max_noise = 25
    min_noise = 0

    silence_duration = 0

    def increase_noise(level):
        if level < max_noise:
            return level + 200 * sec_per_buffer
        return level

    def decrease_noise(level):
        if level > min_noise:
            return level - 100 * sec_per_buffer
        return level

    # Smallest number of loud chunks required to return
    min_loud_chunks = int(self.MIN_LOUD_SEC_PER_PHRASE / sec_per_buffer)

    # Maximum number of chunks to record before timing out
    max_chunks = int(self.recording_timeout / sec_per_buffer)
    num_chunks = 0

    # Will return if exceeded this even if there's not enough loud chunks
    max_chunks_of_silence = int(self.recording_timeout_with_silence /
                                sec_per_buffer)

    # bytearray to store audio in
    byte_data = get_silence(source.SAMPLE_WIDTH)

    if stream:
        stream.stream_start()

    phrase_complete = False
    while num_chunks < max_chunks and not phrase_complete:
        if ww_frames:
            chunk = ww_frames.popleft()
        else:
            chunk = self.record_sound_chunk(source)
        byte_data += chunk
        num_chunks += 1

        if stream:
            stream.stream_chunk(chunk)

        energy = self.calc_energy(chunk, source.SAMPLE_WIDTH)
        test_threshold = self.energy_threshold * self.multiplier
        is_loud = energy > test_threshold
        if is_loud:
            noise = increase_noise(noise)
            num_loud_chunks += 1
        else:
            noise = decrease_noise(noise)
            self._adjust_threshold(energy, sec_per_buffer)

        if num_chunks % 10 == 0:
            self.write_mic_level(energy, source)

        was_loud_enough = num_loud_chunks > min_loud_chunks
        quiet_enough = noise <= min_noise
        if quiet_enough:
            silence_duration += sec_per_buffer
            if silence_duration < self.MIN_SILENCE_AT_END:
                quiet_enough = False  # gotta be silent for min of 1/4 sec
        else:
            silence_duration = 0
        recorded_too_much_silence = num_chunks > max_chunks_of_silence
        if quiet_enough and (was_loud_enough or recorded_too_much_silence):
            phrase_complete = True

        # Pressing top-button will end recording immediately
        if check_for_signal('buttonPress'):
            phrase_complete = True

    return byte_data
def test_is_speaking(self):
    create_signal('isSpeaking')
    self.assertTrue(mycroft.audio.is_speaking())
    # Check that the signal hasn't been removed
    self.assertTrue(check_for_signal('isSpeaking'))
    self.assertFalse(mycroft.audio.is_speaking())
def _record_phrase(self, source, sec_per_buffer, stream=None,
                   ww_frames=None):
    """Record an entire spoken phrase.

    Essentially, this code waits for a period of silence and then returns
    the audio.  If silence isn't detected, it will terminate and return a
    buffer of self.recording_timeout duration.

    Args:
        source (AudioSource):  Source producing the audio chunks
        sec_per_buffer (float):  Fractional number of seconds in each chunk
        stream (AudioStreamHandler): Stream target that will receive chunks
                                     of the utterance audio while it is
                                     being recorded.
        ww_frames (deque):  Frames of audio data from the last part of wake
                            word detection.

    Returns:
        bytearray: complete audio buffer recorded, including any
                   silence at the end of the user's utterance
    """
    noise_tracker = NoiseTracker(0, 25, sec_per_buffer,
                                 self.MIN_LOUD_SEC_PER_PHRASE,
                                 self.recording_timeout_with_silence)

    # Maximum number of chunks to record before timing out
    max_chunks = int(self.recording_timeout / sec_per_buffer)
    num_chunks = 0

    # bytearray to store audio in, initialized with a single sample of
    # silence.
    byte_data = get_silence(source.SAMPLE_WIDTH)

    if stream:
        stream.stream_start()

    phrase_complete = False
    while num_chunks < max_chunks and not phrase_complete:
        if ww_frames:
            chunk = ww_frames.popleft()
        else:
            chunk = self.record_sound_chunk(source)
        byte_data += chunk
        num_chunks += 1

        if stream:
            stream.stream_chunk(chunk)

        energy = self.calc_energy(chunk, source.SAMPLE_WIDTH)
        test_threshold = self.energy_threshold * self.multiplier
        is_loud = energy > test_threshold
        noise_tracker.update(is_loud)
        if not is_loud:
            self._adjust_threshold(energy, sec_per_buffer)

        # The phrase is complete if the noise_tracker end of sentence
        # criteria is met or if the top-button is pressed
        phrase_complete = (noise_tracker.recording_complete() or
                           check_for_signal('buttonPress'))

        # Periodically write the energy level to the mic level file.
        if num_chunks % 10 == 0:
            self._watchdog()
            self.write_mic_level(energy, source)

    return byte_data
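# The NoiseTracker used above encapsulates the inline bookkeeping that the
# earlier _record_phrase() variants carry around by hand. A plausible
# reconstruction from that inline logic -- the constructor signature is
# matched to the call site above, but the internals are an assumption, not
# the library source:

class SketchNoiseTracker:
    """Tracks speech/silence levels to decide when an utterance is over."""

    def __init__(self, minimum, maximum, sec_per_buffer,
                 loud_time_limit, silence_time_limit):
        self.min_noise = minimum
        self.max_noise = maximum
        self.sec_per_buffer = sec_per_buffer
        # Smallest number of loud chunks required before completion
        self.min_loud_chunks = int(loud_time_limit / sec_per_buffer)
        # Give up waiting for speech after this much total recording time
        self.max_chunks_of_silence = int(silence_time_limit / sec_per_buffer)
        self.num_chunks = 0
        self.num_loud_chunks = 0
        self.noise = 0
        self.silence_duration = 0
        self.min_silence_at_end = 0.25  # gotta be silent for min of 1/4 sec

    def update(self, is_loud):
        """Feed one chunk's loudness verdict into the tracker."""
        self.num_chunks += 1
        if is_loud:
            self.noise = min(self.noise + 200 * self.sec_per_buffer,
                             self.max_noise)
            self.num_loud_chunks += 1
        else:
            self.noise = max(self.noise - 100 * self.sec_per_buffer,
                             self.min_noise)
        if self.noise <= self.min_noise:
            self.silence_duration += self.sec_per_buffer
        else:
            self.silence_duration = 0

    def recording_complete(self):
        """True once trailing silence follows enough speech (or a timeout)."""
        was_loud_enough = self.num_loud_chunks > self.min_loud_chunks
        quiet_enough = (self.noise <= self.min_noise and
                        self.silence_duration >= self.min_silence_at_end)
        too_much_silence = self.num_chunks > self.max_chunks_of_silence
        return quiet_enough and (was_loud_enough or too_much_silence)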
def end_audio(self):
    """Helper function for child classes to call in execute()"""
    self.ws.emit(Message("recognizer_loop:audio_output_end"))
    # This check will clear the "signal"
    check_for_signal("isSpeaking")
def handle_speak(event):
    """Handle "speak" message."""
    config = Configuration.get()
    Configuration.init(bus)
    global _last_stop_signal

    # Get conversation ID
    if event.context and 'ident' in event.context:
        ident = event.context['ident']
    else:
        ident = 'unknown'

    start = time.time()  # Time of speech request
    with lock:
        stopwatch = Stopwatch()
        stopwatch.start()
        utterance = event.data['utterance']
        if event.data.get('expect_response', False):
            # When expect_response is requested, the listener will be
            # restarted at the end of the next bit of spoken audio.
            bus.once('recognizer_loop:audio_output_end', _start_listener)

        # This is a bit of a hack for Picroft. The analog audio on a Pi
        # blocks for 30 seconds fairly often, so we don't want to break on
        # periods (decreasing the chance of encountering the block). But we
        # will keep the split for non-Picroft installs since it gives user
        # feedback faster on longer phrases.
        #
        # TODO: Remove or make an option? This is really a hack, anyway,
        # so we likely will want to get rid of this when not running on Mimic
        if (config.get('enclosure', {}).get('platform') != "picroft" and
                len(re.findall('<[^>]*>', utterance)) == 0):
            # Remove any whitespace present after the period,
            # if a character (only alpha) ends with a period
            # ex: A. Lincoln -> A.Lincoln
            # so that we don't split at the period
            utterance = re.sub(r'\b([A-Za-z][\.])(\s+)', r'\g<1>', utterance)
            chunks = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\;|\?)\s',
                              utterance)
            for chunk in chunks:
                # Check if something has aborted the speech
                if (_last_stop_signal > start or
                        check_for_signal('buttonPress')):
                    # Clear any newly queued speech
                    tts.playback.clear()
                    break
                try:
                    mute_and_speak(chunk, ident)
                except KeyboardInterrupt:
                    raise
                except Exception:
                    LOG.error('Error in mute_and_speak', exc_info=True)
        else:
            mute_and_speak(utterance, ident)

        stopwatch.stop()
    report_timing(ident, 'speech', stopwatch,
                  {'utterance': utterance, 'tts': tts.__class__.__name__})
def _wait_until_wake_word(self, source, sec_per_buffer):
    """Listen continuously on source until a wake word is spoken

    Args:
        source (AudioSource):  Source producing the audio chunks
        sec_per_buffer (float):  Fractional number of seconds in each chunk
    """
    num_silent_bytes = int(self.SILENCE_SEC * source.SAMPLE_RATE *
                           source.SAMPLE_WIDTH)

    silence = '\0' * num_silent_bytes

    # bytearray to store audio in
    byte_data = silence

    buffers_per_check = self.SEC_BETWEEN_WW_CHECKS / sec_per_buffer
    buffers_since_check = 0.0

    # Max bytes for byte_data before audio is removed from the front
    max_size = self.sec_to_bytes(self.SAVED_WW_SEC, source)

    said_wake_word = False

    # Rolling buffer to track the audio energy (loudness) heard on
    # the source recently.  An average audio energy is maintained
    # based on these levels.
    energies = []
    idx_energy = 0
    avg_energy = 0.0
    energy_avg_samples = int(5 / sec_per_buffer)  # avg over last 5 secs
    counter = 0

    while not said_wake_word:
        if check_for_signal('buttonPress'):
            said_wake_word = True
            continue

        chunk = self.record_sound_chunk(source)

        energy = self.calc_energy(chunk, source.SAMPLE_WIDTH)
        if energy < self.energy_threshold * self.multiplier:
            self._adjust_threshold(energy, sec_per_buffer)

        if len(energies) < energy_avg_samples:
            # build the average
            energies.append(energy)
            avg_energy += float(energy) / energy_avg_samples
        else:
            # maintain the running average and rolling buffer
            avg_energy -= float(energies[idx_energy]) / energy_avg_samples
            avg_energy += float(energy) / energy_avg_samples
            energies[idx_energy] = energy
            idx_energy = (idx_energy + 1) % energy_avg_samples

        # maintain the threshold using average
        if energy < avg_energy * 1.5:
            if energy > self.energy_threshold:
                # bump the threshold to just above this value
                self.energy_threshold = energy * 1.2

        # Periodically output energy level stats.  This can be used to
        # visualize the microphone input, e.g. a needle on a meter.
        if counter % 3:
            with open(self.mic_level_file, 'w') as f:
                f.write("Energy: cur=" + str(energy) +
                        " thresh=" + str(self.energy_threshold))
        counter += 1

        # At first, the buffer is empty and must fill up.  After that
        # just drop the first chunk bytes to keep it the same size.
        needs_to_grow = len(byte_data) < max_size
        if needs_to_grow:
            byte_data += chunk
        else:  # Remove beginning of audio and add new chunk to end
            byte_data = byte_data[len(chunk):] + chunk

        buffers_since_check += 1.0
        if buffers_since_check > buffers_per_check:
            buffers_since_check -= buffers_per_check
            said_wake_word = self.wake_word_in_audio(byte_data + silence)
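# The rolling-average bookkeeping in _wait_until_wake_word() above is easy to
# get wrong, so here is the same update rule isolated and checked against a
# directly computed mean. The window size and sample values are illustrative.

def _sketch_running_average(samples, window=4):
    energies = []
    idx = 0
    avg = 0.0
    for energy in samples:
        if len(energies) < window:
            # build the average
            energies.append(energy)
            avg += float(energy) / window
        else:
            # maintain the running average and rolling buffer
            avg -= float(energies[idx]) / window
            avg += float(energy) / window
            energies[idx] = energy
            idx = (idx + 1) % window
    return avg, energies


avg, window_contents = _sketch_running_average([1, 2, 3, 4, 5, 6], window=4)
assert window_contents == [5, 6, 3, 4]  # rolling buffer wraps in place
assert abs(avg - sum(window_contents) / 4) < 1e-9  # mean of last 4 samples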
def record_phrase(self, source, sec_per_buffer):
    """
    This attempts to record an entire spoken phrase. Essentially,
    this waits for a period of silence and then returns the audio

    :rtype: bytearray
    :param source: AudioSource
    :param sec_per_buffer: Based on source.SAMPLE_RATE
    :return: bytearray representing the frame_data of the recorded phrase
    """
    num_loud_chunks = 0
    noise = 0

    max_noise = 25
    min_noise = 0

    def increase_noise(level):
        if level < max_noise:
            return level + 200 * sec_per_buffer
        return level

    def decrease_noise(level):
        if level > min_noise:
            return level - 100 * sec_per_buffer
        return level

    # Smallest number of loud chunks required to return
    min_loud_chunks = int(self.MIN_LOUD_SEC_PER_PHRASE / sec_per_buffer)

    # Maximum number of chunks to record before timing out
    max_chunks = int(self.RECORDING_TIMEOUT / sec_per_buffer)
    num_chunks = 0

    # Will return if exceeded this even if there's not enough loud chunks
    max_chunks_of_silence = int(self.RECORDING_TIMEOUT_WITH_SILENCE /
                                sec_per_buffer)

    # bytearray to store audio in
    byte_data = '\0' * source.SAMPLE_WIDTH

    phrase_complete = False
    while num_chunks < max_chunks and not phrase_complete:
        chunk = self.record_sound_chunk(source)
        byte_data += chunk
        num_chunks += 1

        energy = self.calc_energy(chunk, source.SAMPLE_WIDTH)
        test_threshold = self.energy_threshold * self.multiplier
        is_loud = energy > test_threshold
        if is_loud:
            noise = increase_noise(noise)
            num_loud_chunks += 1
        else:
            noise = decrease_noise(noise)
            self.adjust_threshold(energy, sec_per_buffer)

        was_loud_enough = num_loud_chunks > min_loud_chunks
        quiet_enough = noise <= min_noise
        recorded_too_much_silence = num_chunks > max_chunks_of_silence
        if quiet_enough and (was_loud_enough or recorded_too_much_silence):
            phrase_complete = True

        if check_for_signal('buttonPress'):
            phrase_complete = True

    return byte_data