def webrtc_split(audio, rate, aggressiveness=3, frame_duration_ms=30, window_duration_ms=300):
    # adapted from https://github.com/wiseman/py-webrtcvad/blob/master/example.py
    audio_bytes, audio_rate = to_pcm16(audio, rate)

    vad = Vad(aggressiveness)

    num_window_frames = int(window_duration_ms / frame_duration_ms)
    sliding_window = collections.deque(maxlen=num_window_frames)
    triggered = False

    voiced_frames = []
    for frame in generate_frames(audio_bytes, audio_rate, frame_duration_ms):
        is_speech = vad.is_speech(frame.bytes, audio_rate)
        sliding_window.append((frame, is_speech))

        if not triggered:
            num_voiced = len([f for f, speech in sliding_window if speech])
            if num_voiced > 0.9 * sliding_window.maxlen:
                triggered = True
                voiced_frames += [frame for frame, _ in sliding_window]
                sliding_window.clear()
        else:
            voiced_frames.append(frame)
            num_unvoiced = len([f for f, speech in sliding_window if not speech])
            if num_unvoiced > 0.9 * sliding_window.maxlen:
                triggered = False
                yield voiced_frames, audio_rate
                sliding_window.clear()
                voiced_frames = []

    if voiced_frames:
        yield voiced_frames, audio_rate
class VAD(Element):
    def __init__(self, rate=16000, mode=0, duration=1000, on_inactive=None):
        super(VAD, self).__init__()

        self.rate = rate
        self.vad = Vad(mode)
        self.on_inactive = on_inactive
        self.limit_inactive_cnt = duration / 10  # a frame is 10 ms
        self.current_inactive_cnt = 0

    def put(self, data):
        active = self.vad.is_speech(data, self.rate)
        if active:
            self.current_inactive_cnt = 0
        else:
            self.current_inactive_cnt += 1
            if self.current_inactive_cnt == self.limit_inactive_cnt:
                if callable(self.on_inactive):
                    self.on_inactive()
                self.current_inactive_cnt = 0

        super(VAD, self).put(data)

    def on_inactive(self, cb):
        self.on_inactive = cb
class VoiceOver(object):
    """
    Bag of state for tracking stuff in the stream callback.
    """

    def __init__(self):
        self.input_buf = np.empty((BLOCK_SIZE, 1), dtype=np.int16)
        self.vad = Vad(2)
        self.vad_q = deque([False], LAG_TIME)
        self.output = cycle(open_wav())

    def output_take(self, n_bytes):
        return list(islice(self.output, n_bytes))

    def input_is_talking(self):
        return sum(self.vad_q) > len(self.vad_q) * NECESSARY_FRACTION

    def callback(self, in_data, out_data, time_info, status):
        self.input_buf = np.concatenate((self.input_buf, in_data))

        if len(self.input_buf) > HOP_SIZE:  # we can pass data to vad
            ten_ms, rest = (self.input_buf[0:HOP_SIZE], self.input_buf[HOP_SIZE:])
            resampled_to_32k = resample(ten_ms, 3 * 320, axis=0).astype(np.int16).tostring()
            self.vad_q.append(self.vad.is_speech(resampled_to_32k, 32000))
            self.input_buf = rest

        if self.input_is_talking():
            out_data[:] = self.output_take(len(out_data))
        else:
            out_data[:] = np.zeros(out_data.shape)

        return continue_flag
def vad_filter(sample_rate: int, vad: webrtcvad.Vad, frames: List[bytes]) -> bytes:
    """
    # Adapted from https://github.com/wiseman/py-webrtcvad/blob/3b39545dbb026d998bf407f1cb86e0ed6192a5a6/example.py#L45
    Filters out non-voiced audio frames.

    Given a webrtcvad.Vad and a source of audio frames, returns the voiced audio.

    Uses a padded, sliding window algorithm over the audio frames.
    All frames between [prev(first(speech_frame)), next(last(speech_frame))] will be
    considered as speech frames. This is done because we want to remove leading and
    trailing silence here.

    Arguments:

    sample_rate - The audio sample rate, in Hz.
    vad - An instance of webrtcvad.Vad.
    frames - a source of audio frames.

    Returns: the speech within the frames.
    """
    voiced_frames_offsets = [
        i for i, frame in enumerate(frames) if vad.is_speech(frame, sample_rate)
    ]
    if len(voiced_frames_offsets) > 0:
        # + 2 because b is not included in list[a:b]
        return b''.join(frames[max(voiced_frames_offsets[0] - 1, 0):
                               min(voiced_frames_offsets[-1] + 2, len(frames))])
    else:
        return b''.join(frames)
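# Usage sketch for vad_filter above (not part of the original source). It assumes `pcm`
# holds raw 16-bit mono PCM at 16 kHz, obtained elsewhere; the frame splitter below is an
# illustrative helper, not the project's own.
import webrtcvad


def split_into_frames(pcm: bytes, sample_rate: int = 16000, frame_ms: int = 30):
    # 2 bytes per sample for 16-bit PCM; drop any trailing partial frame
    frame_bytes = int(sample_rate * frame_ms / 1000) * 2
    return [pcm[i:i + frame_bytes]
            for i in range(0, len(pcm) - frame_bytes + 1, frame_bytes)]

# pcm = ...  # 16-bit mono PCM at 16 kHz
# voiced = vad_filter(16000, webrtcvad.Vad(3), split_into_frames(pcm))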
def vad_split(
    audio_frames,
    audio_format=DEFAULT_FORMAT,
    num_padding_frames=10,
    threshold=0.5,
    aggressiveness=3,
):
    sample_rate, channels, width = audio_format
    if channels != 1:
        raise ValueError("VAD-splitting requires mono samples")
    if width != 2:
        raise ValueError("VAD-splitting requires 16 bit samples")
    if sample_rate not in [8000, 16000, 32000, 48000]:
        raise ValueError(
            "VAD-splitting only supported for sample rates 8000, 16000, 32000, or 48000"
        )
    if aggressiveness not in [0, 1, 2, 3]:
        raise ValueError(
            "VAD-splitting aggressiveness mode has to be one of 0, 1, 2, or 3")

    ring_buffer = collections.deque(maxlen=num_padding_frames)
    triggered = False
    vad = Vad(int(aggressiveness))
    voiced_frames = []
    frame_duration_ms = 0
    frame_index = 0

    for frame_index, frame in enumerate(audio_frames):
        frame_duration_ms = get_pcm_duration(len(frame), audio_format) * 1000
        if int(frame_duration_ms) not in [10, 20, 30]:
            raise ValueError(
                "VAD-splitting only supported for frame durations 10, 20, or 30 ms"
            )
        is_speech = vad.is_speech(frame, sample_rate)
        if not triggered:
            ring_buffer.append((frame, is_speech))
            num_voiced = len([f for f, speech in ring_buffer if speech])
            if num_voiced > threshold * ring_buffer.maxlen:
                triggered = True
                for f, s in ring_buffer:
                    voiced_frames.append(f)
                ring_buffer.clear()
        else:
            voiced_frames.append(frame)
            ring_buffer.append((frame, is_speech))
            num_unvoiced = len([f for f, speech in ring_buffer if not speech])
            if num_unvoiced > threshold * ring_buffer.maxlen:
                triggered = False
                yield b"".join(voiced_frames), \
                    frame_duration_ms * max(0, frame_index - len(voiced_frames)), \
                    frame_duration_ms * frame_index
                ring_buffer.clear()
                voiced_frames = []

    if len(voiced_frames) > 0:
        yield b"".join(voiced_frames), \
            frame_duration_ms * (frame_index - len(voiced_frames)), \
            frame_duration_ms * (frame_index + 1)
class VAD(Element):
    def __init__(self, rate=16000, mode=0):
        super(VAD, self).__init__()

        self.rate = rate
        self.vad = Vad(mode)

    def put(self, data):
        voice = '1' if self.vad.is_speech(data, self.rate) else '0'
        sys.stdout.write(voice)
        sys.stdout.flush()

        super(VAD, self).put(data)
def check_for_speech(self, frame_duration_ms=20):
    """Checks for speech.

    :param int frame_duration_ms: Audio frame length in ms.

    """
    vad = Vad(self.vad_aggressiveness)
    speaking = False  # to keep track of if vocalization ongoing
    n = int(SAMPLE_RATE * (frame_duration_ms / 1000.) * 2)
    # duration = n / SAMPLE_RATE / 2.0
    last_timestamp_sent = 0

    while not self.done.is_set():
        chunk = self.data_queue.get()
        offset = 0
        framecount = []

        while offset + n < len(chunk):
            now = time.time() * 1000.0  # caveat: this is not the same as PyEPL's clock...
            frame = chunk[offset:offset + n]

            if vad.is_speech(frame, SAMPLE_RATE):
                framecount.append({"timestamp": now})
                if len(framecount) >= self.consecutive_frames and not speaking:
                    speaking = True
                    payload = {
                        "speaking": True,
                        "timestamp": framecount[0]["timestamp"]
                    }
                    self.pipe.send(ipc.message("VOCALIZATION", payload))
                    self.logger.debug("Started speaking at %f", now)
            else:
                if speaking:
                    speaking = False
                    payload = {"speaking": False, "timestamp": now}
                    self.pipe.send(ipc.message("VOCALIZATION", payload))
                    self.logger.debug("Stopped speaking at %f", now)
                framecount = []

            offset += n

        now = time.time() * 1000
        if now - last_timestamp_sent >= 1000:
            self.pipe.send(ipc.message("TIMESTAMP", dict(timestamp=now)))
            last_timestamp_sent = now
def iter_wav_chunks(input_uri, input_format,
                    framerate=16000, vad_duration=0.02,
                    min_chunk_len=2, max_chunk_len=10):
    vad = Vad(2)
    bufferbytes = io.BytesIO()
    buffersize = 0
    bufferduration = 0
    remains = b''
    audio_offset = .0

    for ok, *payload in \
            stream2wav(input_uri, input_format, framerate):
        if not ok:
            raise RuntimeError(payload[0])

        header, body, _, secondsize = payload
        chunksize = round(secondsize * 0.02)  # 20ms
        body = remains + body

        if min_chunk_len < 0:  # no limit
            bufferbytes.write(body)
            buffersize += len(body)
            bufferduration = buffersize / secondsize
            continue

        for offset in range(0, len(body), chunksize):
            chunk = body[offset:offset + chunksize]
            if len(chunk) < chunksize:
                remains = chunk
                break

            if bufferduration < min_chunk_len or \
                    (bufferduration < max_chunk_len and vad.is_speech(chunk, framerate)):
                bufferbytes.write(chunk)
                buffersize += chunksize
                bufferduration += chunksize / secondsize
            elif buffersize > 0:
                audiodata = bufferbytes.getvalue() + chunk
                duration = len(audiodata) / secondsize
                yield (header, audiodata, duration, audio_offset)
                audio_offset += duration
                bufferbytes = io.BytesIO()
                buffersize = 0
                bufferduration = 0

    if buffersize > 0:
        audiodata = bufferbytes.getvalue() + remains
        duration = len(audiodata) / secondsize
        yield (header, audiodata, duration, audio_offset)
def trim_long_silences(wav: np.ndarray, vad: webrtcvad.Vad = None) -> np.ndarray:
    """
    Ensures that segments without voice in the waveform remain no longer than a threshold
    determined by the VAD parameters in params.py.

    :param wav: the raw waveform as a numpy array of floats
    :param vad: a webrtcvad.Vad object. A new one with mode=3 will be created if None.
    :return: the same waveform with silences trimmed away (length <= original wav length)
    """
    # Compute the voice detection window size
    samples_per_window = (vad_window_length * sampling_rate) // 1000

    # Trim the end of the audio to have a multiple of the window size
    wav = wav[:len(wav) - (len(wav) % samples_per_window)]

    # Convert the float waveform to 16-bit mono PCM
    pcm_wave = struct.pack(f'{len(wav)}h', *(np.round(wav * int16_max)).astype(np.int16))

    # Perform voice activation detection
    voice_flags = []
    if vad is None:
        vad = webrtcvad.Vad(mode=3)
    for window_start in range(0, len(wav), samples_per_window):
        window_end = window_start + samples_per_window
        voice_flags.append(vad.is_speech(pcm_wave[window_start * 2:window_end * 2],
                                         sample_rate=sampling_rate))
    voice_flags = np.array(voice_flags)

    # Smooth the voice detection with a moving average
    def moving_average(array, width):
        array_padded = np.concatenate((np.zeros((width - 1) // 2), array, np.zeros(width // 2)))
        ret = np.cumsum(array_padded, dtype=float)
        ret[width:] = ret[width:] - ret[:-width]
        return ret[width - 1:] / width

    audio_mask = moving_average(voice_flags, vad_moving_average_width)
    audio_mask = np.round(audio_mask).astype(np.bool)

    # Dilate the voiced regions
    audio_mask = binary_dilation(audio_mask, np.ones(vad_max_silence_length + 1))
    audio_mask = np.repeat(audio_mask, samples_per_window)

    return wav[audio_mask]
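# Usage sketch for trim_long_silences above (not part of the original source). The function
# relies on module-level constants normally defined in params.py; the values below are
# illustrative assumptions, not the original project's settings.
import struct

import numpy as np
import webrtcvad
from scipy.ndimage import binary_dilation

int16_max = (2 ** 15) - 1
sampling_rate = 16000          # Hz
vad_window_length = 30         # ms, must be 10, 20 or 30
vad_moving_average_width = 8   # windows to average over
vad_max_silence_length = 6     # maximum number of consecutive silent windows kept

# wav = ...                       # float waveform in [-1, 1] at sampling_rate
# trimmed = trim_long_silences(wav)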
class Listener:
    q = Queue()

    def __init__(self, samplerate, on_noise=None):
        self.samplerate = samplerate
        self.speech_timeout = SPEECH_TIMEOUT
        self.on_noise = on_noise
        self.listening = Lock()
        self.vad = Vad()
        self.vad.set_mode(3)  # very restrictive filtering

    @staticmethod
    def _device_callback(indata, frames, time, status):
        """ This is called (from a separate thread) for each audio block. """
        Listener.q.put(bytes(indata))

    def record(self):
        recorded_data = b''
        current = time.time()
        end = time.time() + self.speech_timeout

        # record until no sound is detected or time is over
        while current <= end:
            data = Listener.q.get()
            recorded_data += data
            if self.vad.is_speech(data, self.samplerate):
                end = time.time() + self.speech_timeout
            current = time.time()
            # print(end - start)

        return recorded_data

    def _start(self):
        self.listening.acquire()
        with sd.RawInputStream(samplerate=self.samplerate,
                               channels=1,
                               callback=Listener._device_callback,
                               dtype='int16',
                               blocksize=int(self.samplerate * 0.03)):
            while self.listening.locked():
                data = Listener.q.get()
                if self.on_noise is not None:
                    self.on_noise(data)

    def start(self):
        Thread(target=self._start).start()

    def stop(self):
        if self.listening.locked():
            self.listening.release()
def get_voice_events(filename, frame_dur, aggressiveness):
    """Evaluate the file for voice events.

    :param str filename:
    :param int frame_dur:
    :param int aggressiveness:

    """
    assert frame_dur in [10, 20, 30]
    assert aggressiveness in range(4)

    vad = Vad()
    vad.set_mode(aggressiveness)

    sample_rate = 16000
    clip = downsample(filename, sample_rate).read()

    return [
        (frame_dur * n, vad.is_speech(frame.bytes, sample_rate))
        for n, frame in enumerate(frame_generator(clip, frame_dur, sample_rate))
    ]
class VAD(object):
    AUDIO_FRAME_MS = 10
    BUFFER_SIZE = 100

    AUDIO_TYPE = np.int16
    AUDIO_TYPE_BYTES = 2

    VOICE_THRESHOLD = 0.6
    VOICE_WINDOW = 50

    MODE = 3

    def __init__(self, microphone):
        # type: (AbstractMicrophone) -> VAD
        """
        Perform Voice Activity Detection on Microphone Input

        Parameters
        ----------
        microphone: AbstractMicrophone
        """
        self._microphone = microphone
        self._vad = Vad(VAD.MODE)

        # Voice Activity Detection Frame Size, Atomic VAD Unit
        self._frame_size = VAD.AUDIO_FRAME_MS * self.microphone.rate // 1000
        self._frame_size_bytes = self._frame_size * VAD.AUDIO_TYPE_BYTES

        self._audio_buffer = np.zeros((VAD.BUFFER_SIZE, self._frame_size), VAD.AUDIO_TYPE)
        self._voice_buffer = np.zeros(VAD.BUFFER_SIZE, np.bool)
        self._buffer_index = 0

        self._utterance = None
        self._utterance_queue = Queue()

        self._frame_buffer = bytearray()
        self._activation = 0

        self.microphone.callbacks += [self._on_audio]

    @property
    def microphone(self):
        """
        VAD Microphone

        Returns
        -------
        microphone: AbstractMicrophone
        """
        return self._microphone

    @property
    def activation(self):
        """
        VAD Activation

        Returns
        -------
        activation: float
        """
        return self._activation

    @property
    def utterances(self):
        # type: () -> Iterable[Utterance]
        """
        Get Utterances from Microphone Stream

        Yields
        -------
        voices: Iterable of Voice
        """
        while True:
            yield self._utterance_queue.get()

    def _on_audio(self, audio):
        # type: (np.ndarray) -> None
        # Work through Microphone Stream Frame by Frame
        self._frame_buffer.extend(audio.tobytes())
        while len(self._frame_buffer) >= self._frame_size_bytes:
            self._on_frame(np.frombuffer(self._frame_buffer[:self._frame_size_bytes], VAD.AUDIO_TYPE))
            del self._frame_buffer[:self._frame_size_bytes]

    def _on_frame(self, frame):
        self._activation = self._calculate_activation(frame)

        if not self._utterance:
            if self.activation > VAD.VOICE_THRESHOLD:
                # Create New Utterance Object
                self._utterance = Utterance()

                # Add Buffer Contents to Utterance
                self._utterance.add_frame(self._audio_buffer[self._buffer_index:].ravel())
                self._utterance.add_frame(self._audio_buffer[:self._buffer_index].ravel())

                # Add Utterance to Utterance Queue
                self._utterance_queue.put(self._utterance)
        else:
            # If Utterance Ongoing: Add Frame to Utterance Object
            if self.activation > VAD.VOICE_THRESHOLD:
                self._utterance.add_frame(frame)
            # Else: Terminate Utterance
            else:
                self._utterance.add_frame(None)
                self._utterance = None

    def _calculate_activation(self, frame):
        # Update Buffers
        self._audio_buffer[self._buffer_index] = frame
        self._voice_buffer[self._buffer_index] = self._vad.is_speech(
            frame.tobytes(), self.microphone.rate, len(frame))
        self._buffer_index = (self._buffer_index + 1) % VAD.BUFFER_SIZE

        # Calculate Activation
        voice_window = np.arange(self._buffer_index - VAD.VOICE_WINDOW,
                                 self._buffer_index) % VAD.BUFFER_SIZE
        return np.mean(self._voice_buffer[voice_window])

    def __iter__(self):
        return self.utterances
def vad_collector(sample_rate, frame_duration_ms, padding_duration_ms, aggressiveness, audio):
    """Filters out non-voiced audio frames.

    Given a webrtcvad.Vad and a source of audio frames, yields only
    the voiced audio.

    Uses a padded, sliding window algorithm over the audio frames.
    When more than 90% of the frames in the window are voiced (as
    reported by the VAD), the collector triggers and begins yielding
    audio frames. Then the collector waits until 90% of the frames in
    the window are unvoiced to detrigger.

    The window is padded at the front and back to provide a small
    amount of silence or the beginnings/endings of speech around the
    voiced frames.

    Arguments:

    sample_rate - The audio sample rate, in Hz.
    frame_duration_ms - The frame duration in milliseconds.
    padding_duration_ms - The amount to pad the window, in milliseconds.
    aggressiveness - The aggressiveness mode (0-3) used to create the webrtcvad.Vad.
    audio - The PCM audio data, as a byte string.

    Returns: A generator that yields PCM audio data.
    """
    vad = Vad(aggressiveness)
    num_padding_frames = int(padding_duration_ms / frame_duration_ms)
    # We use a deque for our sliding window/ring buffer.
    ring_buffer = collections.deque(maxlen=num_padding_frames)
    # We have two states: TRIGGERED and NOTTRIGGERED. We start in the
    # NOTTRIGGERED state.
    triggered = False

    voiced_frames = []
    for frame in frame_generator(30, audio, sample_rate):
        is_speech = vad.is_speech(frame, sample_rate)

        if not triggered:
            ring_buffer.append((frame, is_speech))
            num_voiced = len([f for f, speech in ring_buffer if speech])
            # If we're NOTTRIGGERED and more than 90% of the frames in
            # the ring buffer are voiced frames, then enter the
            # TRIGGERED state.
            if num_voiced > 0.9 * ring_buffer.maxlen:
                triggered = True
                # We want to yield all the audio we see from now until
                # we are NOTTRIGGERED, but we have to start with the
                # audio that's already in the ring buffer.
                for f, s in ring_buffer:
                    voiced_frames.append(f)
                ring_buffer.clear()
        else:
            # We're in the TRIGGERED state, so collect the audio data
            # and add it to the ring buffer.
            voiced_frames.append(frame)
            ring_buffer.append((frame, is_speech))
            num_unvoiced = len([f for f, speech in ring_buffer if not speech])
            # If more than 90% of the frames in the ring buffer are
            # unvoiced, then enter NOTTRIGGERED and yield whatever
            # audio we've collected.
            if num_unvoiced > 0.9 * ring_buffer.maxlen:
                triggered = False
                yield b''.join([f for f in voiced_frames])
                ring_buffer.clear()
                voiced_frames = []

    # If we have any leftover voiced audio when we run out of input,
    # yield it.
    if voiced_frames:
        yield b''.join([f for f in voiced_frames])
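# Usage sketch for vad_collector above (not part of the original source). It assumes a
# frame_generator() helper like the one in the py-webrtcvad example, and a 16 kHz mono
# 16-bit PCM WAV file whose path is illustrative.
import wave


def read_pcm(path: str):
    # Return the raw PCM payload and the sample rate of a mono 16-bit WAV file
    with wave.open(path, 'rb') as wf:
        assert wf.getnchannels() == 1 and wf.getsampwidth() == 2
        return wf.readframes(wf.getnframes()), wf.getframerate()

# audio, sample_rate = read_pcm("speech.wav")
# for i, segment in enumerate(vad_collector(sample_rate, 30, 300, 2, audio)):
#     print(f"segment {i}: {len(segment)} bytes of voiced audio")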
class AudioManager(Manager):
    SAMPLERATE = 16000
    FRAMES_PER_BUFFER = 320

    LAST_USER_SPEECH = 'var/cache/lastUserpeech_{}_{}.wav'
    SECOND_LAST_USER_SPEECH = 'var/cache/secondLastUserSpeech_{}_{}.wav'

    def __init__(self):
        super().__init__()
        self._stopPlayingFlag: Optional[AliceEvent] = None
        self._playing = False
        self._waves: Dict[str, wave.Wave_write] = dict()
        self._audioInputStream = None

        if not self.ConfigManager.getAliceConfigByName('disableCapture'):
            self._vad = Vad(2)

        self._audioInput = None
        self._audioOutput = None

    def onStart(self):
        super().onStart()

        if not self.ConfigManager.getAliceConfigByName('inputDevice'):
            self.logWarning('Input device not set in config, trying to find default device')
            try:
                self._audioInput = sd.query_devices(kind='input')['name']
            except:
                self.logFatal('Audio input not found, cannot continue')
                return
            self.ConfigManager.updateAliceConfiguration(key='inputDevice', value=self._audioInput)
        else:
            self._audioInput = self.ConfigManager.getAliceConfigByName('inputDevice')

        if not self.ConfigManager.getAliceConfigByName('outputDevice'):
            self.logWarning('Output device not set in config, trying to find default device')
            try:
                self._audioOutput = sd.query_devices(kind='output')['name']
            except:
                self.logFatal('Audio output not found, cannot continue')
                return
            self.ConfigManager.updateAliceConfiguration(key='outputDevice', value=self._audioOutput)
        else:
            self._audioOutput = self.ConfigManager.getAliceConfigByName('outputDevice')

        self.setDefaults()

        self._stopPlayingFlag = self.ThreadManager.newEvent('stopPlaying')
        self.MqttManager.mqttClient.subscribe(
            constants.TOPIC_AUDIO_FRAME.format(self.ConfigManager.getAliceConfigByName('uuid')))

    def onBooted(self):
        if not self.ConfigManager.getAliceConfigByName('disableCapture'):
            self.ThreadManager.newThread(name='audioPublisher', target=self.publishAudio)

    def setDefaults(self):
        self.logInfo(f'Using **{self._audioInput}** for audio input')
        self.logInfo(f'Using **{self._audioOutput}** for audio output')
        sd.default.device = self._audioInput, self._audioOutput

    def onStop(self):
        super().onStop()

        if self._audioInputStream:
            self._audioInputStream.stop(ignore_errors=True)
            self._audioInputStream.close(ignore_errors=True)

        self.MqttManager.mqttClient.unsubscribe(
            constants.TOPIC_AUDIO_FRAME.format(self.DeviceManager.getMainDevice().uid))

    def onStartListening(self, session: DialogSession):
        if not self.ConfigManager.getAliceConfigByName('recordAudioAfterWakeword') \
                and self.WakewordRecorder.state != WakewordRecorderState.RECORDING:
            return

        path = Path(self.LAST_USER_SPEECH.format(session.user, session.deviceUid))

        if path.exists():
            path.rename(Path(self.SECOND_LAST_USER_SPEECH.format(session.user, session.deviceUid)))

        waveFile = wave.open(str(path), 'wb')
        waveFile.setsampwidth(2)
        waveFile.setframerate(self.AudioServer.SAMPLERATE)
        waveFile.setnchannels(1)
        self._waves[session.deviceUid] = waveFile

    def onCaptured(self, session: DialogSession):
        wav = self._waves.pop(session.deviceUid, None)
        if not wav:
            return
        wav.close()

    def recordFrame(self, deviceUid: str, frame: bytes):
        if deviceUid not in self._waves:
            return
        self._waves[deviceUid].writeframes(frame)

    def publishAudio(self) -> None:
        """
        captures the audio and broadcasts it via publishAudioFrames to the topic
        'hermes/audioServer/{}/audioFrame'
        furthermore it will publish VAD_UP and VAD_DOWN when detected
        :return:
        """
        self.logInfo('Starting audio publisher')
        self._audioInputStream = sd.RawInputStream(
            dtype='int16',
            channels=1,
            samplerate=self.SAMPLERATE,
            blocksize=self.FRAMES_PER_BUFFER,
        )
        self._audioInputStream.start()

        speech = False
        silence = self.SAMPLERATE / self.FRAMES_PER_BUFFER
        speechFrames = 0
        minSpeechFrames = round(silence / 3)

        while True:
            if self.ProjectAlice.shuttingDown:
                break

            try:
                frames = self._audioInputStream.read(frames=self.FRAMES_PER_BUFFER)[0]

                if self._vad.is_speech(frames, self.SAMPLERATE):
                    if not speech and speechFrames < minSpeechFrames:
                        speechFrames += 1
                    elif speechFrames >= minSpeechFrames:
                        speech = True
                        self.MqttManager.publish(
                            topic=constants.TOPIC_VAD_UP.format(self.DeviceManager.getMainDevice().uid),
                            payload={'siteId': self.DeviceManager.getMainDevice().uid})
                        silence = self.SAMPLERATE / self.FRAMES_PER_BUFFER
                        speechFrames = 0
                else:
                    if speech:
                        if silence > 0:
                            silence -= 1
                        else:
                            speech = False
                            self.MqttManager.publish(
                                topic=constants.TOPIC_VAD_DOWN.format(self.DeviceManager.getMainDevice().uid),
                                payload={'siteId': self.DeviceManager.getMainDevice().uid})
                    else:
                        speechFrames = 0

                self.publishAudioFrames(frames)
            except Exception as e:
                self.logDebug(f'Error publishing frame: {e}')

    def publishAudioFrames(self, frames: bytes) -> None:
        """
        receives some audio frames, adds them to the buffer and publishes them to MQTT
        :param frames:
        :return:
        """
        with io.BytesIO() as buffer:
            with wave.open(buffer, 'wb') as wav:
                wav.setnchannels(1)
                wav.setsampwidth(2)
                wav.setframerate(self.SAMPLERATE)
                wav.writeframes(frames)

            audioFrames = buffer.getvalue()
            self.MqttManager.publish(
                topic=constants.TOPIC_AUDIO_FRAME.format(self.DeviceManager.getMainDevice().uid),
                payload=bytearray(audioFrames))

    def onPlayBytes(self, payload: bytearray, deviceUid: str, sessionId: str = None, requestId: str = None):
        """
        Handles the playing of arbitrary bytes, be it sound, voice or even music. Triggered via MQTT onPlayBytes topic.
        Ignoring any request when sound is disabled via config
        :param payload:
        :param deviceUid:
        :param sessionId:
        :param requestId:
        :return:
        """
        if deviceUid != self.DeviceManager.getMainDevice().uid \
                or self.ConfigManager.getAliceConfigByName('disableSound') \
                or self.DeviceManager.getDevice(uid=deviceUid).getParam('soundMuted'):
            return

        requestId = requestId or sessionId or str(uuid.uuid4())

        if self.ConfigManager.getAliceConfigByName('debug'):
            with Path('/tmp/onPlayBytes.wav').open('wb') as file:
                file.write(payload)

        self._playing = True
        with io.BytesIO(payload) as buffer:
            try:
                with wave.open(buffer, 'rb') as wav:
                    channels = wav.getnchannels()
                    framerate = wav.getframerate()

                    def streamCallback(outData: buffer, frames: int, _time: CData, _status: sd.CallbackFlags):
                        data = wav.readframes(frames)
                        if len(data) < len(outData):
                            outData[:len(data)] = data
                            outData[len(data):] = b'\x00' * (len(outData) - len(data))
                            raise sd.CallbackStop
                        else:
                            outData[:] = data

                    stream = sd.RawOutputStream(
                        dtype='int16',
                        channels=channels,
                        samplerate=framerate,
                        callback=streamCallback)

                    self.logDebug(
                        f'Playing wav stream using **{self._audioOutput}** audio output from device **{self.DeviceManager.getDevice(uid=deviceUid).displayName}** (channels: {channels}, rate: {framerate})')
                    stream.start()

                    while stream.active:
                        if self._stopPlayingFlag.is_set():
                            if not sessionId:
                                raise PlayBytesStopped

                            session = self.DialogManager.getSession(sessionId=sessionId)
                            if session.lastWasSoundPlayOnly:
                                raise PlayBytesStopped

                            self.MqttManager.publish(
                                topic=constants.TOPIC_TTS_FINISHED,
                                payload={
                                    'id': requestId,
                                    'sessionId': sessionId,
                                    'siteId': deviceUid
                                })
                            self.DialogManager.onEndSession(session)

                        time.sleep(0.1)
            except PlayBytesStopped:
                self.logDebug('Playing bytes stopped')
            except Exception as e:
                self.logError(f'Playing wav failed with error: {e}')
            finally:
                self.logDebug('Playing bytes finished')
                stream.stop()
                stream.close()
                self._stopPlayingFlag.clear()
                self._playing = False

        # Session id support is not Hermes protocol official
        self.MqttManager.publish(
            topic=constants.TOPIC_PLAY_BYTES_FINISHED.format(deviceUid),
            payload={
                'id': requestId,
                'sessionId': sessionId
            })

    def stopPlaying(self):
        self._stopPlayingFlag.set()

    def updateAudioDevices(self):
        self._audioInput = self.ConfigManager.getAliceConfigByName('inputDevice')
        self._audioOutput = self.ConfigManager.getAliceConfigByName('outputDevice')
        self.setDefaults()

    @property
    def isPlaying(self) -> bool:
        return self._playing
class VAD:
    """This class implements a Voice Activity Detector.

    The voice activity detector is a critical component in any speech processing
    application. It is able to identify the presence or absence of human speech in an
    audio frame. Generally, it is used to deactivate some processes during non-speech
    sections of an audio session, saving on computation and on network bandwidth.

    Notes:
        This algorithm was implemented in the WebRTC project. The algorithm was
        originally designed to work with 8kHz, 16 bit PCM, mono audio samples. The
        algorithm accepts sampling rates of 8000Hz, 16000Hz, 32000Hz and 48000Hz, but
        internally all processing is done at 8000Hz; input data at higher sample rates
        is simply down-sampled first.
    """

    def __init__(self, rate: int = 8000, mode: int = 0):
        """Creates a VAD detector with the given configuration

        Args:
            rate (int): The audio sample rate, in Hz.
            mode (int): Operational mode, must be [0, 3]
        """
        self.__rate = rate
        self.__mode = mode
        self.__vad = Vad(mode=mode)

    @property
    def mode(self) -> int:
        """Returns an integer representing the operational mode"""
        return self.__mode

    @property
    def sample_rate(self) -> int:
        """Returns the sampling rate in Hz."""
        return self.__rate

    @mode.setter
    def mode(self, mode: int):
        """Set the operational mode of the VAD

        A more aggressive (higher mode) VAD is more restrictive in reporting speech.
        Put in other words, the probability of being speech when the VAD returns 1 is
        increased with increasing mode. As a consequence, the missed detection rate
        also goes up.

        Valid modes are:
            - 0 ("quality")
            - 1 ("low bitrate")
            - 2 ("aggressive")
            - 3 ("very aggressive")

        The default mode is 0.

        Args:
            mode (int): Operational mode, must be [0, 3]
        """
        self.__mode = mode
        self.__vad.set_mode(mode)

    @profile
    def process(self, data: np.ndarray) -> bool:
        """Checks if the given data contains human speech.

        Args:
            data (np.ndarray): An array containing the data

        Returns:
            True if the audio data contains speech, false otherwise

        Notes:
            The input data must be an array of signed 16-bit samples or an array of
            floating points storing values in the same range [-32768, 32767].

            Only mono frames with a length of 10, 20 or 30 ms are supported. For
            instance, if the class is using a sampling rate of 8kHz, the processing
            function expects a numpy.ndarray of shape [80, N], [160, N] or [240, N],
            where N is the number of channels in the input data. The signal may be
            down-mixed to a single channel before processing.
        """
        mono = np.mean(a=data, axis=0, dtype=np.float32)
        mono = Converter.fromFloatToInt16(mono)
        mono = Converter.interleave(mono)

        result = self.__vad.is_speech(buf=mono, sample_rate=self.sample_rate, length=mono.size())
        if result < 0:
            raise RuntimeError(
                "Invalid frame length. Only frames with a length of 10, 20 or 30 ms are supported."
            )
        return result
def vad_split(audio_frames, audio_format=DEFAULT_FORMAT, num_padding_frames=10, threshold=0.5, aggressiveness=3):
    """
    Credit: https://github.com/mozilla/DSAlign
    Splits audio into segments using Voice Activity Detection.

    Parameters
    ----------
    audio_frames : list
        List of audio frames
    audio_format : tuple
        Tuple containing the audio sample rate, channels & width
    num_padding_frames : int
        Number of frames to pad
    threshold : float
        Minimum threshold
    aggressiveness : int
        Aggressiveness of VAD split

    Yields
    -------
        Audio segments (tuples containing the voiced frames, start time & end time)
    """
    sample_rate, channels, width = audio_format
    if channels != 1:
        raise ValueError("VAD-splitting requires mono samples")
    if width != 2:
        raise ValueError("VAD-splitting requires 16 bit samples")
    if sample_rate not in [8000, 16000, 32000, 48000]:
        raise ValueError("VAD-splitting only supported for sample rates 8000, 16000, 32000, or 48000")
    if aggressiveness not in [0, 1, 2, 3]:
        raise ValueError("VAD-splitting aggressiveness mode has to be one of 0, 1, 2, or 3")

    ring_buffer = collections.deque(maxlen=num_padding_frames)
    triggered = False
    vad = Vad(int(aggressiveness))
    voiced_frames = []
    frame_duration_ms = 0
    frame_index = 0

    for frame_index, frame in enumerate(audio_frames):
        frame_duration_ms = get_pcm_duration(len(frame), audio_format) * 1000
        if int(frame_duration_ms) not in [10, 20, 30]:
            raise ValueError("VAD-splitting only supported for frame durations 10, 20, or 30 ms")
        is_speech = vad.is_speech(frame, sample_rate)
        if not triggered:
            ring_buffer.append((frame, is_speech))
            num_voiced = len([f for f, speech in ring_buffer if speech])
            if num_voiced > threshold * ring_buffer.maxlen:
                triggered = True
                for f, s in ring_buffer:
                    voiced_frames.append(f)
                ring_buffer.clear()
        else:
            voiced_frames.append(frame)
            ring_buffer.append((frame, is_speech))
            num_unvoiced = len([f for f, speech in ring_buffer if not speech])
            if num_unvoiced > threshold * ring_buffer.maxlen:
                triggered = False
                yield b"".join(voiced_frames), \
                    frame_duration_ms * max(0, frame_index - len(voiced_frames)), \
                    frame_duration_ms * frame_index
                ring_buffer.clear()
                voiced_frames = []

    if len(voiced_frames) > 0:
        yield b"".join(voiced_frames), \
            frame_duration_ms * (frame_index - len(voiced_frames)), \
            frame_duration_ms * (frame_index + 1)
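# Usage sketch for vad_split above (not part of the original source). DEFAULT_FORMAT and
# get_pcm_duration() come from the surrounding DSAlign-style audio helpers; the frame
# generator below is an illustrative assumption for 16 kHz mono 16-bit PCM.
def pcm_frames(pcm: bytes, sample_rate: int = 16000, frame_ms: int = 30):
    # Yield fixed-size frames of frame_ms milliseconds, dropping any trailing remainder
    frame_bytes = int(sample_rate * frame_ms / 1000) * 2
    for i in range(0, len(pcm) - frame_bytes + 1, frame_bytes):
        yield pcm[i:i + frame_bytes]

# for segment, start_ms, end_ms in vad_split(pcm_frames(pcm), audio_format=(16000, 1, 2)):
#     print(f"voiced segment from {start_ms:.0f} ms to {end_ms:.0f} ms ({len(segment)} bytes)")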
class AudioManager(Manager):
    SAMPLERATE = 16000
    FRAMES_PER_BUFFER = 320

    LAST_USER_SPEECH = 'var/cache/lastUserpeech_{}_{}.wav'
    SECOND_LAST_USER_SPEECH = 'var/cache/secondLastUserSpeech_{}_{}.wav'

    # Inspired by https://github.com/koenvervloesem/hermes-audio-server

    def __init__(self):
        super().__init__()
        self._stopPlayingFlag: Optional[AliceEvent] = None
        self._playing = False
        self._waves: Dict[str, wave.Wave_write] = dict()

        if self.ConfigManager.getAliceConfigByName('disableSoundAndMic'):
            return

        with self.Commons.shutUpAlsaFFS():
            self._audio = pyaudio.PyAudio()

        self._vad = Vad(2)

        try:
            self._audioOutput = self._audio.get_default_output_device_info()
        except:
            self.logFatal('Audio output not found, cannot continue')
            return
        else:
            self.logInfo(f'Using **{self._audioOutput["name"]}** for audio output')

        try:
            self._audioInput = self._audio.get_default_input_device_info()
        except:
            self.logFatal('Audio input not found, cannot continue')
        else:
            self.logInfo(f'Using **{self._audioInput["name"]}** for audio input')

    def onStart(self):
        super().onStart()
        self._stopPlayingFlag = self.ThreadManager.newEvent('stopPlaying')
        self.MqttManager.mqttClient.subscribe(
            constants.TOPIC_AUDIO_FRAME.format(self.ConfigManager.getAliceConfigByName('uuid')))

        if not self.ConfigManager.getAliceConfigByName('disableSoundAndMic'):
            self.ThreadManager.newThread(name='audioPublisher', target=self.publishAudio)

    def onStop(self):
        super().onStop()
        self.MqttManager.mqttClient.unsubscribe(
            constants.TOPIC_AUDIO_FRAME.format(self.ConfigManager.getAliceConfigByName('uuid')))

        if not self.ConfigManager.getAliceConfigByName('disableSoundAndMic'):
            self._audio.terminate()

    def onStartListening(self, session: DialogSession):
        if not self.ConfigManager.getAliceConfigByName('recordAudioAfterWakeword'):
            return

        path = Path(self.LAST_USER_SPEECH.format(session.user, session.siteId))

        if path.exists():
            path.rename(Path(self.SECOND_LAST_USER_SPEECH.format(session.user, session.siteId)))

        waveFile = wave.open(str(path), 'wb')
        waveFile.setsampwidth(2)
        waveFile.setframerate(self.AudioServer.SAMPLERATE)
        waveFile.setnchannels(1)
        self._waves[session.siteId] = waveFile

    def onCaptured(self, session: DialogSession):
        wav = self._waves.pop(session.siteId, None)
        if not wav:
            return
        wav.close()

    def recordFrame(self, siteId: str, frame: bytes):
        if siteId not in self._waves:
            return
        self._waves[siteId].writeframes(frame)

    def publishAudio(self):
        self.logInfo('Starting audio publisher')
        audioStream = self._audio.open(
            format=pyaudio.paInt16,
            channels=1,
            rate=self.SAMPLERATE,
            frames_per_buffer=self.FRAMES_PER_BUFFER,
            input=True)

        speech = False
        silence = self.SAMPLERATE / self.FRAMES_PER_BUFFER
        speechFrames = 0
        minSpeechFrames = round(silence / 3)

        while True:
            if self.ProjectAlice.shuttingDown:
                break

            try:
                frames = audioStream.read(num_frames=self.FRAMES_PER_BUFFER, exception_on_overflow=False)

                if self._vad.is_speech(frames, self.SAMPLERATE):
                    if not speech and speechFrames < minSpeechFrames:
                        speechFrames += 1
                    elif speechFrames >= minSpeechFrames:
                        speech = True
                        self.MqttManager.publish(
                            topic=constants.TOPIC_VAD_UP.format(self.ConfigManager.getAliceConfigByName('uuid')),
                            payload={'siteId': self.ConfigManager.getAliceConfigByName('uuid')})
                        silence = self.SAMPLERATE / self.FRAMES_PER_BUFFER
                        speechFrames = 0
                else:
                    if speech:
                        if silence > 0:
                            silence -= 1
                        else:
                            speech = False
                            self.MqttManager.publish(
                                topic=constants.TOPIC_VAD_DOWN.format(self.ConfigManager.getAliceConfigByName('uuid')),
                                payload={'siteId': self.ConfigManager.getAliceConfigByName('uuid')})
                    else:
                        speechFrames = 0

                self.publishAudioFrames(frames)
            except Exception as e:
                self.logDebug(f'Error publishing frame: {e}')

    def publishAudioFrames(self, frames: bytes):
        with io.BytesIO() as buffer:
            with wave.open(buffer, 'wb') as wav:
                wav.setnchannels(1)
                wav.setsampwidth(2)
                wav.setframerate(self.SAMPLERATE)
                wav.writeframes(frames)

            audioFrames = buffer.getvalue()
            self.MqttManager.publish(
                topic=constants.TOPIC_AUDIO_FRAME.format(self.ConfigManager.getAliceConfigByName('uuid')),
                payload=bytearray(audioFrames))

    def onPlayBytes(self, requestId: str, payload: bytearray, siteId: str, sessionId: str = None):
        if siteId != self.ConfigManager.getAliceConfigByName('uuid') \
                or self.ConfigManager.getAliceConfigByName('disableSoundAndMic'):
            return

        self._playing = True
        with io.BytesIO(payload) as buffer:
            try:
                with wave.open(buffer, 'rb') as wav:
                    sampleWidth = wav.getsampwidth()
                    nFormat = self._audio.get_format_from_width(sampleWidth)
                    channels = wav.getnchannels()
                    framerate = wav.getframerate()

                    def streamCallback(_inData, frameCount, _timeInfo, _status) -> tuple:
                        data = wav.readframes(frameCount)
                        return data, pyaudio.paContinue

                    audioStream = self._audio.open(
                        format=nFormat,
                        channels=channels,
                        rate=framerate,
                        output=True,
                        output_device_index=self._audioOutput['index'],
                        stream_callback=streamCallback)

                    self.logDebug(
                        f'Playing wav stream using **{self._audioOutput["name"]}** audio output from site id **{self.DeviceManager.siteIdToDeviceName(siteId)}** (Format: {nFormat}, channels: {channels}, rate: {framerate})')
                    audioStream.start_stream()

                    while audioStream.is_active():
                        if self._stopPlayingFlag.is_set():
                            audioStream.stop_stream()
                            audioStream.close()

                            if sessionId:
                                self.MqttManager.publish(
                                    topic=constants.TOPIC_TTS_FINISHED,
                                    payload={
                                        'id': requestId,
                                        'sessionId': sessionId,
                                        'siteId': siteId
                                    })
                                self.DialogManager.onEndSession(self.DialogManager.getSession(sessionId))

                            raise PlayBytesStopped
                        time.sleep(0.1)

                    audioStream.stop_stream()
                    audioStream.close()
            except PlayBytesStopped:
                self.logDebug('Playing bytes stopped')
            except Exception as e:
                self.logError(f'Playing wav failed with error: {e}')
            finally:
                self._stopPlayingFlag.clear()
                self._playing = False

        # Session id support is not Hermes protocol official
        self.MqttManager.publish(
            topic=constants.TOPIC_PLAY_BYTES_FINISHED.format(siteId),
            payload={
                'id': requestId,
                'sessionId': sessionId
            })

    def stopPlaying(self):
        self._stopPlayingFlag.set()

    @property
    def isPlaying(self) -> bool:
        return self._playing
class AudioManager(Manager):
    SAMPLERATE = 16000
    FRAMES_PER_BUFFER = 320

    LAST_USER_SPEECH = 'var/cache/lastUserpeech_{}_{}.wav'
    SECOND_LAST_USER_SPEECH = 'var/cache/secondLastUserSpeech_{}_{}.wav'

    def __init__(self):
        super().__init__()
        self._stopPlayingFlag: Optional[AliceEvent] = None
        self._playing = False
        self._waves: Dict[str, wave.Wave_write] = dict()
        self._audioInputStream = None

        if self.ConfigManager.getAliceConfigByName('disableSoundAndMic'):
            return

        self._vad = Vad(2)
        self._audioInput = None
        self._audioOutput = None

    def onStart(self):
        super().onStart()

        if not self.ConfigManager.getAliceConfigByName('inputDevice'):
            self.logWarning('Input device not set in config, trying to find default device')
            try:
                self._audioInput = sd.query_devices(kind='input')['name']
            except:
                self.logFatal('Audio input not found, cannot continue')
                return
            self.ConfigManager.updateAliceConfiguration(key='inputDevice', value=self._audioInput)
        else:
            self._audioInput = self.ConfigManager.getAliceConfigByName('inputDevice')

        if not self.ConfigManager.getAliceConfigByName('outputDevice'):
            self.logWarning('Output device not set in config, trying to find default device')
            try:
                self._audioOutput = sd.query_devices(kind='output')['name']
            except:
                self.logFatal('Audio output not found, cannot continue')
                return
            self.ConfigManager.updateAliceConfiguration(key='outputDevice', value=self._audioOutput)
        else:
            self._audioOutput = self.ConfigManager.getAliceConfigByName('outputDevice')

        self.setDefaults()

        self._stopPlayingFlag = self.ThreadManager.newEvent('stopPlaying')
        self.MqttManager.mqttClient.subscribe(
            constants.TOPIC_AUDIO_FRAME.format(self.ConfigManager.getAliceConfigByName('uuid')))

        if not self.ConfigManager.getAliceConfigByName('disableSoundAndMic'):
            self.ThreadManager.newThread(name='audioPublisher', target=self.publishAudio)

    def setDefaults(self):
        self.logInfo(f'Using **{self._audioInput}** for audio input')
        self.logInfo(f'Using **{self._audioOutput}** for audio output')
        sd.default.device = self._audioInput, self._audioOutput

    def onStop(self):
        super().onStop()
        self._audioInputStream.stop(ignore_errors=True)
        self._audioInputStream.close(ignore_errors=True)
        self.MqttManager.mqttClient.unsubscribe(
            constants.TOPIC_AUDIO_FRAME.format(self.ConfigManager.getAliceConfigByName('uuid')))

    def onStartListening(self, session: DialogSession):
        if not self.ConfigManager.getAliceConfigByName('recordAudioAfterWakeword'):
            return

        path = Path(self.LAST_USER_SPEECH.format(session.user, session.siteId))

        if path.exists():
            path.rename(Path(self.SECOND_LAST_USER_SPEECH.format(session.user, session.siteId)))

        waveFile = wave.open(str(path), 'wb')
        waveFile.setsampwidth(2)
        waveFile.setframerate(self.AudioServer.SAMPLERATE)
        waveFile.setnchannels(1)
        self._waves[session.siteId] = waveFile

    def onCaptured(self, session: DialogSession):
        wav = self._waves.pop(session.siteId, None)
        if not wav:
            return
        wav.close()

    def recordFrame(self, siteId: str, frame: bytes):
        if siteId not in self._waves:
            return
        self._waves[siteId].writeframes(frame)

    def publishAudio(self):
        self.logInfo('Starting audio publisher')
        self._audioInputStream = sd.RawInputStream(
            dtype='int16',
            channels=1,
            samplerate=self.SAMPLERATE,
            blocksize=self.FRAMES_PER_BUFFER,
        )
        self._audioInputStream.start()

        speech = False
        silence = self.SAMPLERATE / self.FRAMES_PER_BUFFER
        speechFrames = 0
        minSpeechFrames = round(silence / 3)

        while True:
            if self.ProjectAlice.shuttingDown:
                break

            try:
                frames = self._audioInputStream.read(frames=self.FRAMES_PER_BUFFER)[0]

                if self._vad.is_speech(frames, self.SAMPLERATE):
                    if not speech and speechFrames < minSpeechFrames:
                        speechFrames += 1
                    elif speechFrames >= minSpeechFrames:
                        speech = True
                        self.MqttManager.publish(
                            topic=constants.TOPIC_VAD_UP.format(self.ConfigManager.getAliceConfigByName('uuid')),
                            payload={'siteId': self.ConfigManager.getAliceConfigByName('uuid')})
                        silence = self.SAMPLERATE / self.FRAMES_PER_BUFFER
                        speechFrames = 0
                else:
                    if speech:
                        if silence > 0:
                            silence -= 1
                        else:
                            speech = False
                            self.MqttManager.publish(
                                topic=constants.TOPIC_VAD_DOWN.format(self.ConfigManager.getAliceConfigByName('uuid')),
                                payload={'siteId': self.ConfigManager.getAliceConfigByName('uuid')})
                    else:
                        speechFrames = 0

                self.publishAudioFrames(frames)
            except Exception as e:
                self.logDebug(f'Error publishing frame: {e}')

    def publishAudioFrames(self, frames: bytes):
        with io.BytesIO() as buffer:
            with wave.open(buffer, 'wb') as wav:
                wav.setnchannels(1)
                wav.setsampwidth(2)
                wav.setframerate(self.SAMPLERATE)
                wav.writeframes(frames)

            audioFrames = buffer.getvalue()
            self.MqttManager.publish(
                topic=constants.TOPIC_AUDIO_FRAME.format(self.ConfigManager.getAliceConfigByName('uuid')),
                payload=bytearray(audioFrames))

    def onPlayBytes(self, requestId: str, payload: bytearray, siteId: str, sessionId: str = None):
        if siteId != self.ConfigManager.getAliceConfigByName('uuid') \
                or self.ConfigManager.getAliceConfigByName('disableSoundAndMic'):
            return

        self._playing = True
        with io.BytesIO(payload) as buffer:
            try:
                with wave.open(buffer, 'rb') as wav:
                    channels = wav.getnchannels()
                    framerate = wav.getframerate()

                    def streamCallback(outdata, frameCount, _timeInfo, _status):
                        data = wav.readframes(frameCount)
                        if len(data) < len(outdata):
                            outdata[:len(data)] = data
                            outdata[len(data):] = b'\x00' * (len(outdata) - len(data))
                            raise sd.CallbackStop
                        else:
                            outdata[:] = data

                    stream = sd.RawOutputStream(
                        dtype='int16',
                        channels=channels,
                        samplerate=framerate,
                        callback=streamCallback)

                    self.logDebug(
                        f'Playing wav stream using **{self._audioOutput}** audio output from site id **{self.DeviceManager.siteIdToDeviceName(siteId)}** (channels: {channels}, rate: {framerate})')
                    stream.start()

                    while stream.active:
                        if self._stopPlayingFlag.is_set():
                            stream.stop()
                            stream.close()

                            if sessionId:
                                self.MqttManager.publish(
                                    topic=constants.TOPIC_TTS_FINISHED,
                                    payload={
                                        'id': requestId,
                                        'sessionId': sessionId,
                                        'siteId': siteId
                                    })
                                self.DialogManager.onEndSession(self.DialogManager.getSession(sessionId))

                            raise PlayBytesStopped
                        time.sleep(0.1)

                    stream.stop()
                    stream.close()
            except PlayBytesStopped:
                self.logDebug('Playing bytes stopped')
            except Exception as e:
                self.logError(f'Playing wav failed with error: {e}')
            finally:
                self._stopPlayingFlag.clear()
                self._playing = False

        # Session id support is not Hermes protocol official
        self.MqttManager.publish(
            topic=constants.TOPIC_PLAY_BYTES_FINISHED.format(siteId),
            payload={
                'id': requestId,
                'sessionId': sessionId
            })

    def stopPlaying(self):
        self._stopPlayingFlag.set()

    def updateAudioDevices(self):
        self._audioInput = self.ConfigManager.getAliceConfigByName('inputDevice')
        self._audioOutput = self.ConfigManager.getAliceConfigByName('outputDevice')
        self.setDefaults()

    @property
    def isPlaying(self) -> bool:
        return self._playing