def __init__(self, microphone):
    # type: (AbstractMicrophone) -> None
    self._microphone = microphone
    self._vad = Vad(VAD.MODE)

    # Voice Activity Detection Frame Size: VAD works in units of 'frames'
    self._frame_size = VAD.AUDIO_FRAME_MS * self.microphone.rate // 1000
    self._frame_size_bytes = self._frame_size * VAD.AUDIO_TYPE_BYTES

    # Audio & Voice Ring-Buffers
    self._audio_buffer = np.zeros((VAD.BUFFER_SIZE, self._frame_size), VAD.AUDIO_TYPE)
    self._voice_buffer = np.zeros(VAD.BUFFER_SIZE, bool)
    self._buffer_index = 0

    self._voice = None
    self._voice_queue = Queue()
    self._frame_buffer = bytearray()

    self._activation = 0

    # Subscribe VAD to Microphone on_audio event
    self.microphone.callbacks += [self._on_audio]
def __init__(self):
    super().__init__()

    self._stopPlayingFlag: Optional[AliceEvent] = None
    self._playing = False
    self._waves: Dict[str, wave.Wave_write] = dict()

    if self.ConfigManager.getAliceConfigByName('disableSoundAndMic'):
        return

    with self.Commons.shutUpAlsaFFS():
        self._audio = pyaudio.PyAudio()

    self._vad = Vad(2)

    try:
        self._audioOutput = self._audio.get_default_output_device_info()
    except:
        self.logFatal('Audio output not found, cannot continue')
        return
    else:
        self.logInfo(f'Using **{self._audioOutput["name"]}** for audio output')

    try:
        self._audioInput = self._audio.get_default_input_device_info()
    except:
        self.logFatal('Audio input not found, cannot continue')
    else:
        self.logInfo(f'Using **{self._audioInput["name"]}** for audio input')
def __init__(self, microphone):
    # type: (AbstractMicrophone) -> None
    """
    Perform Voice Activity Detection on Microphone Input

    Parameters
    ----------
    microphone: AbstractMicrophone
    """
    self._microphone = microphone
    self._vad = Vad(VAD.MODE)

    # Voice Activity Detection Frame Size, Atomic VAD Unit
    self._frame_size = VAD.AUDIO_FRAME_MS * self.microphone.rate // 1000
    self._frame_size_bytes = self._frame_size * VAD.AUDIO_TYPE_BYTES

    self._audio_buffer = np.zeros((VAD.BUFFER_SIZE, self._frame_size), VAD.AUDIO_TYPE)
    self._voice_buffer = np.zeros(VAD.BUFFER_SIZE, bool)
    self._buffer_index = 0

    self._utterance = None
    self._utterance_queue = Queue()
    self._frame_buffer = bytearray()

    self._activation = 0

    self.microphone.callbacks += [self._on_audio]
def __init__(self, samplerate, on_noise=None):
    self.samplerate = samplerate
    self.speech_timeout = SPEECH_TIMEOUT
    self.on_noise = on_noise
    self.listening = Lock()
    self.vad = Vad()
    self.vad.set_mode(3)  # very restrictive filtering
def webrtc_split(audio, rate, aggressiveness=3, frame_duration_ms=30, window_duration_ms=300):
    # adapted from https://github.com/wiseman/py-webrtcvad/blob/master/example.py
    audio_bytes, audio_rate = to_pcm16(audio, rate)

    vad = Vad(aggressiveness)
    num_window_frames = int(window_duration_ms / frame_duration_ms)
    sliding_window = collections.deque(maxlen=num_window_frames)
    triggered = False

    voiced_frames = []
    for frame in generate_frames(audio_bytes, audio_rate, frame_duration_ms):
        is_speech = vad.is_speech(frame.bytes, audio_rate)
        sliding_window.append((frame, is_speech))

        if not triggered:
            num_voiced = len([f for f, speech in sliding_window if speech])
            if num_voiced > 0.9 * sliding_window.maxlen:
                triggered = True
                voiced_frames += [frame for frame, _ in sliding_window]
                sliding_window.clear()
        else:
            voiced_frames.append(frame)
            num_unvoiced = len([f for f, speech in sliding_window if not speech])
            if num_unvoiced > 0.9 * sliding_window.maxlen:
                triggered = False
                yield voiced_frames, audio_rate
                sliding_window.clear()
                voiced_frames = []

    if voiced_frames:
        yield voiced_frames, audio_rate
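# Usage sketch for webrtc_split above (illustrative, not from the original source).
# It assumes `soundfile` is available for loading the waveform, that the
# to_pcm16/generate_frames helpers used above are importable, and that each yielded
# frame exposes a `.bytes` attribute of 16-bit mono PCM, as in the function itself.
def webrtc_split_example(path):
    import soundfile as sf
    audio, rate = sf.read(path)  # float samples at the file's native rate
    for voiced_frames, audio_rate in webrtc_split(audio, rate, aggressiveness=2):
        segment = b''.join(frame.bytes for frame in voiced_frames)
        duration = len(segment) / (2 * audio_rate)  # 2 bytes per 16-bit sample
        print(f"voiced segment: {duration:.2f}s")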
def __init__(self, rate=16000, mode=0, duration=1000, on_inactive=None):
    super(VAD, self).__init__()
    self.rate = rate
    self.vad = Vad(mode)
    self.on_inactive = on_inactive
    self.limit_inactive_cnt = duration / 10  # a frame is 10 ms
    self.current_inactive_cnt = 0
def vad_split(
    audio_frames,
    audio_format=DEFAULT_FORMAT,
    num_padding_frames=10,
    threshold=0.5,
    aggressiveness=3,
):
    sample_rate, channels, width = audio_format
    if channels != 1:
        raise ValueError("VAD-splitting requires mono samples")
    if width != 2:
        raise ValueError("VAD-splitting requires 16 bit samples")
    if sample_rate not in [8000, 16000, 32000, 48000]:
        raise ValueError(
            "VAD-splitting only supported for sample rates 8000, 16000, 32000, or 48000"
        )
    if aggressiveness not in [0, 1, 2, 3]:
        raise ValueError(
            "VAD-splitting aggressiveness mode has to be one of 0, 1, 2, or 3")

    ring_buffer = collections.deque(maxlen=num_padding_frames)
    triggered = False
    vad = Vad(int(aggressiveness))
    voiced_frames = []
    frame_duration_ms = 0
    frame_index = 0
    for frame_index, frame in enumerate(audio_frames):
        frame_duration_ms = get_pcm_duration(len(frame), audio_format) * 1000
        if int(frame_duration_ms) not in [10, 20, 30]:
            raise ValueError(
                "VAD-splitting only supported for frame durations 10, 20, or 30 ms"
            )
        is_speech = vad.is_speech(frame, sample_rate)
        if not triggered:
            ring_buffer.append((frame, is_speech))
            num_voiced = len([f for f, speech in ring_buffer if speech])
            if num_voiced > threshold * ring_buffer.maxlen:
                triggered = True
                for f, s in ring_buffer:
                    voiced_frames.append(f)
                ring_buffer.clear()
        else:
            voiced_frames.append(frame)
            ring_buffer.append((frame, is_speech))
            num_unvoiced = len([f for f, speech in ring_buffer if not speech])
            if num_unvoiced > threshold * ring_buffer.maxlen:
                triggered = False
                yield b"".join(voiced_frames), frame_duration_ms * max(
                    0, frame_index - len(voiced_frames)), frame_duration_ms * frame_index
                ring_buffer.clear()
                voiced_frames = []
    if len(voiced_frames) > 0:
        yield b"".join(voiced_frames), frame_duration_ms * (
            frame_index - len(voiced_frames)), frame_duration_ms * (frame_index + 1)
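# A minimal driver sketch for vad_split above (not part of the original source).
# It assumes 16 kHz, 16-bit mono PCM in `pcm_bytes`, which it slices into 20 ms
# frames, one of the durations the function accepts; get_pcm_duration and
# DEFAULT_FORMAT are taken from the surrounding module.
def vad_split_example(pcm_bytes, sample_rate=16000):
    audio_format = (sample_rate, 1, 2)          # rate, channels, sample width in bytes
    frame_bytes = sample_rate * 2 * 20 // 1000  # bytes per 20 ms of 16-bit mono PCM
    frames = [pcm_bytes[i:i + frame_bytes]
              for i in range(0, len(pcm_bytes) - frame_bytes + 1, frame_bytes)]
    for segment, start_ms, end_ms in vad_split(frames, audio_format=audio_format,
                                               aggressiveness=2):
        print(f"voiced {start_ms:.0f}-{end_ms:.0f} ms, {len(segment)} bytes")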
def __init__(self, rate: int = 8000, mode: int = 0):
    """Creates a VAD detector with the given configuration

    Args:
        rate (int): The audio sample rate, in Hz.
        mode (int): Operational mode, must be in [0, 3].
    """
    self.__rate = rate
    self.__mode = mode
    self.__vad = Vad(mode=mode)
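# Standalone sanity-check sketch (not from the original class): webrtcvad only
# accepts 10/20/30 ms frames of 16-bit mono PCM at 8/16/32/48 kHz, so a frame's
# length in bytes is rate * frame_ms / 1000 * 2. The helper name is illustrative.
from webrtcvad import Vad

def frame_is_speech(frame: bytes, rate: int = 8000, mode: int = 0) -> bool:
    frame_ms = len(frame) * 1000 // (rate * 2)   # derive duration from byte length
    assert frame_ms in (10, 20, 30), "unsupported frame duration"
    return Vad(mode).is_speech(frame, rate)

# Example: 20 ms at 8 kHz is 8000 * 0.020 * 2 = 320 bytes.
# frame_is_speech(b"\x00" * 320)  # all-zero frame, usually reported as non-speech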
def __init__(self):
    super().__init__()

    self._stopPlayingFlag: Optional[AliceEvent] = None
    self._playing = False
    self._waves: Dict[str, wave.Wave_write] = dict()
    self._audioInputStream = None

    if not self.ConfigManager.getAliceConfigByName('disableCapture'):
        self._vad = Vad(2)

    self._audioInput = None
    self._audioOutput = None
def check_for_speech(self, frame_duration_ms=20):
    """Checks for speech.

    :param int frame_duration_ms: Audio frame length in ms.
    """
    vad = Vad(self.vad_aggressiveness)
    speaking = False  # to keep track of if vocalization ongoing
    n = int(SAMPLE_RATE * (frame_duration_ms / 1000.) * 2)
    # duration = n / SAMPLE_RATE / 2.0
    last_timestamp_sent = 0

    while not self.done.is_set():
        chunk = self.data_queue.get()
        offset = 0
        framecount = []
        while offset + n < len(chunk):
            now = time.time() * 1000.0  # caveat: this is not the same as PyEPL's clock...
            frame = chunk[offset:offset + n]
            if vad.is_speech(frame, SAMPLE_RATE):
                framecount.append({"timestamp": now})
                if len(framecount) >= self.consecutive_frames and not speaking:
                    speaking = True
                    payload = {
                        "speaking": True,
                        "timestamp": framecount[0]["timestamp"]
                    }
                    self.pipe.send(ipc.message("VOCALIZATION", payload))
                    self.logger.debug("Started speaking at %f", now)
            else:
                if speaking:
                    speaking = False
                    payload = {"speaking": False, "timestamp": now}
                    self.pipe.send(ipc.message("VOCALIZATION", payload))
                    self.logger.debug("Stopped speaking at %f", now)
                framecount = []
            offset += n

        now = time.time() * 1000
        if now - last_timestamp_sent >= 1000:
            self.pipe.send(ipc.message("TIMESTAMP", dict(timestamp=now)))
            last_timestamp_sent = now
def iter_wav_chunks(input_uri, input_format,
                    framerate=16000, vad_duration=0.02,
                    min_chunk_len=2, max_chunk_len=10):
    vad = Vad(2)
    bufferbytes = io.BytesIO()
    buffersize = 0
    bufferduration = 0
    remains = b''
    audio_offset = 0.0
    for ok, *payload in stream2wav(input_uri, input_format, framerate):
        if not ok:
            raise RuntimeError(payload[0])
        header, body, _, secondsize = payload
        chunksize = round(secondsize * 0.02)  # 20ms
        body = remains + body
        if min_chunk_len < 0:
            # no limit
            bufferbytes.write(body)
            buffersize += len(body)
            bufferduration = buffersize / secondsize
            continue
        for offset in range(0, len(body), chunksize):
            chunk = body[offset:offset + chunksize]
            if len(chunk) < chunksize:
                remains = chunk
                break
            if bufferduration < min_chunk_len or \
                    (bufferduration < max_chunk_len and vad.is_speech(chunk, framerate)):
                bufferbytes.write(chunk)
                buffersize += chunksize
                bufferduration += chunksize / secondsize
            elif buffersize > 0:
                audiodata = bufferbytes.getvalue() + chunk
                duration = len(audiodata) / secondsize
                yield (header, audiodata, duration, audio_offset)
                audio_offset += duration
                bufferbytes = io.BytesIO()
                buffersize = 0
                bufferduration = 0
    if buffersize > 0:
        audiodata = bufferbytes.getvalue() + remains
        duration = len(audiodata) / secondsize
        yield (header, audiodata, duration, audio_offset)
def vad_segment_generator(audio_file, aggressiveness):
    """
    Generate VAD segments. Filters out non-voiced audio frames.

    :param audio_file: Input audio file to run VAD on.
    :param aggressiveness: How aggressive filtering out non-speech is (between 0 and 3)
    :return: Returns tuple of
        segments: a bytearray of multiple smaller audio frames
                  (the longer audio split into multiple smaller ones)
        sample_rate: Sample rate of the input audio file
        audio_length: Duration of the input audio file
    """
    audio = av.open(audio_file)
    sample_rate = 16000
    frames = frame_generator(30, audio.decode(audio=0), sample_rate)
    vad = Vad(int(aggressiveness))
    segments = vad_collector(sample_rate, 30, 300, 0.5, vad, frames)

    return segments, sample_rate, audio.duration / av.time_base
def vad_segment_generator(audio_file, aggressiveness):
    """
    Generate VAD segments. Filters out non-voiced audio frames.

    :param audio_file: Input audio file to run VAD on.
    :param aggressiveness: How aggressive filtering out non-speech is (between 0 and 3)
    :return: Returns tuple of
        segments: a bytearray of multiple smaller audio frames
                  (the longer audio split into multiple smaller ones)
        sample_rate: Sample rate of the input audio file
        audio_length: Duration of the input audio file
    """
    audio = (AudioSegment.from_file(audio_file)
             .set_channels(1)
             .set_frame_rate(16000))
    vad = Vad(int(aggressiveness))
    frames = frame_generator(30, audio.raw_data, audio.frame_rate)
    segments = vad_collector(audio.frame_rate, 30, 300, 0.5, vad, frames)

    return segments, audio.frame_rate, audio.duration_seconds * 1000
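# Possible caller sketch for vad_segment_generator above (illustrative only).
# It assumes the frame_generator/vad_collector helpers used above yield raw
# 16-bit mono PCM chunks, and writes each voiced segment to a numbered wav file.
import wave

def dump_segments(audio_file, aggressiveness=1, out_prefix="segment"):
    segments, sample_rate, audio_length_ms = vad_segment_generator(audio_file, aggressiveness)
    for i, segment in enumerate(segments):
        with wave.open(f"{out_prefix}_{i:03d}.wav", "wb") as out:
            out.setnchannels(1)    # mono
            out.setsampwidth(2)    # 16-bit samples
            out.setframerate(sample_rate)
            out.writeframes(segment)
    print(f"source length: {audio_length_ms / 1000:.1f}s")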
def __init__(self, microphone, callbacks, mode=3):
    """
    Detect Utterances of People using Voice Activity Detection

    Parameters
    ----------
    microphone: AbstractMicrophone
        Microphone to extract Utterances from
    callbacks: list of callable
        On Utterance Callback
    mode: int
        Voice Activity Detection (VAD) 'Aggressiveness' (0..3)
    """
    self._microphone = microphone
    self._microphone.callbacks += [self._on_audio]
    self._rate = microphone.rate

    self._callbacks = callbacks

    self._vad = Vad(mode)

    # Number of Elements (np.int16) in Frame
    self._frame_size = self.FRAME_MS * self.rate // 1000

    self._ringbuffer_index = 0
    self._activation = 0

    # Initialize Ringbuffers, which will hold Audio data and Vad.is_speech results, respectively
    self._audio_ringbuffer = np.zeros((self.BUFFER_SIZE, self._frame_size), np.int16)
    self._vad_ringbuffer = np.zeros(self.BUFFER_SIZE, bool)

    self._audio_buffer = bytearray()  # Audio Buffer will be filled with raw Microphone Audio
    self._voice_buffer = bytearray()  # Voice Buffer will be filled with Voiced Audio

    self._voice = False  # No Voice is present at start

    self._log = logger.getChild(self.__class__.__name__)
    self._log.debug("Booted")
def get_voice_events(filename, frame_dur, aggressiveness):
    """Evaluate the file for voice events.

    :param str filename:
    :param int frame_dur:
    :param int aggressiveness:
    """
    assert frame_dur in [10, 20, 30]
    assert aggressiveness in range(4)

    vad = Vad()
    vad.set_mode(aggressiveness)

    sample_rate = 16000
    clip = downsample(filename, sample_rate).read()

    return [
        (frame_dur * n, vad.is_speech(frame.bytes, sample_rate))
        for n, frame in enumerate(frame_generator(clip, frame_dur, sample_rate))
    ]
def vad_collector(sample_rate, frame_duration_ms, padding_duration_ms, aggressiveness, audio):
    """Filters out non-voiced audio frames.

    Given an aggressiveness level and raw PCM audio, yields only the voiced audio.

    Uses a padded, sliding window algorithm over the audio frames.
    When more than 90% of the frames in the window are voiced (as reported
    by the VAD), the collector triggers and begins yielding audio frames.
    Then the collector waits until 90% of the frames in the window are
    unvoiced to detrigger.

    The window is padded at the front and back to provide a small amount of
    silence or the beginnings/endings of speech around the voiced frames.

    Arguments:

    sample_rate - The audio sample rate, in Hz.
    frame_duration_ms - The frame duration in milliseconds.
    padding_duration_ms - The amount to pad the window, in milliseconds.
    aggressiveness - The webrtcvad aggressiveness mode (0..3).
    audio - PCM audio data to split into frames.

    Returns: A generator that yields PCM audio data.
    """
    vad = Vad(aggressiveness)
    num_padding_frames = int(padding_duration_ms / frame_duration_ms)
    # We use a deque for our sliding window/ring buffer.
    ring_buffer = collections.deque(maxlen=num_padding_frames)
    # We have two states: TRIGGERED and NOTTRIGGERED. We start in the
    # NOTTRIGGERED state.
    triggered = False

    voiced_frames = []
    for frame in frame_generator(30, audio, sample_rate):
        is_speech = vad.is_speech(frame, sample_rate)

        if not triggered:
            ring_buffer.append((frame, is_speech))
            num_voiced = len([f for f, speech in ring_buffer if speech])
            # If we're NOTTRIGGERED and more than 90% of the frames in
            # the ring buffer are voiced frames, then enter the
            # TRIGGERED state.
            if num_voiced > 0.9 * ring_buffer.maxlen:
                triggered = True
                # We want to yield all the audio we see from now until
                # we are NOTTRIGGERED, but we have to start with the
                # audio that's already in the ring buffer.
                for f, s in ring_buffer:
                    voiced_frames.append(f)
                ring_buffer.clear()
        else:
            # We're in the TRIGGERED state, so collect the audio data
            # and add it to the ring buffer.
            voiced_frames.append(frame)
            ring_buffer.append((frame, is_speech))
            num_unvoiced = len([f for f, speech in ring_buffer if not speech])
            # If more than 90% of the frames in the ring buffer are
            # unvoiced, then enter NOTTRIGGERED and yield whatever
            # audio we've collected.
            if num_unvoiced > 0.9 * ring_buffer.maxlen:
                triggered = False
                yield b''.join(voiced_frames)
                ring_buffer.clear()
                voiced_frames = []

    # If we have any leftover voiced audio when we run out of input,
    # yield it.
    if voiced_frames:
        yield b''.join(voiced_frames)
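# A small end-to-end sketch around vad_collector above (illustrative; it relies on
# the same frame_generator helper used inside the function). It reads 16-bit mono
# PCM from a wav file with the standard-library wave module and reports how much
# of the audio is voiced.
import wave

def voiced_ratio(path, aggressiveness=2):
    with wave.open(path, "rb") as wav:
        assert wav.getnchannels() == 1 and wav.getsampwidth() == 2
        sample_rate = wav.getframerate()
        pcm = wav.readframes(wav.getnframes())
    voiced_bytes = sum(len(chunk) for chunk in
                       vad_collector(sample_rate, 30, 300, aggressiveness, pcm))
    return voiced_bytes / max(len(pcm), 1)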
def __init__(self, rate=16000, mode=0):
    super(VAD, self).__init__()
    self.rate = rate
    self.vad = Vad(mode)
def vad_split(audio_frames, audio_format=DEFAULT_FORMAT, num_padding_frames=10, threshold=0.5, aggressiveness=3):
    """
    Credit: https://github.com/mozilla/DSAlign

    Splits audio into segments using Voice Activity Detection.

    Parameters
    ----------
    audio_frames : list
        List of audio frames
    audio_format : tuple
        Tuple containing the audio sample rate, channels & width
    num_padding_frames : int
        Number of frames to pad
    threshold : float
        Minimum threshold
    aggressiveness : int
        Aggressiveness of VAD split

    Yields
    -------
    Audio segments (tuples containing the voiced audio bytes, start time & end time)
    """
    sample_rate, channels, width = audio_format
    if channels != 1:
        raise ValueError("VAD-splitting requires mono samples")
    if width != 2:
        raise ValueError("VAD-splitting requires 16 bit samples")
    if sample_rate not in [8000, 16000, 32000, 48000]:
        raise ValueError("VAD-splitting only supported for sample rates 8000, 16000, 32000, or 48000")
    if aggressiveness not in [0, 1, 2, 3]:
        raise ValueError("VAD-splitting aggressiveness mode has to be one of 0, 1, 2, or 3")

    ring_buffer = collections.deque(maxlen=num_padding_frames)
    triggered = False
    vad = Vad(int(aggressiveness))
    voiced_frames = []
    frame_duration_ms = 0
    frame_index = 0
    for frame_index, frame in enumerate(audio_frames):
        frame_duration_ms = get_pcm_duration(len(frame), audio_format) * 1000
        if int(frame_duration_ms) not in [10, 20, 30]:
            raise ValueError("VAD-splitting only supported for frame durations 10, 20, or 30 ms")
        is_speech = vad.is_speech(frame, sample_rate)
        if not triggered:
            ring_buffer.append((frame, is_speech))
            num_voiced = len([f for f, speech in ring_buffer if speech])
            if num_voiced > threshold * ring_buffer.maxlen:
                triggered = True
                for f, s in ring_buffer:
                    voiced_frames.append(f)
                ring_buffer.clear()
        else:
            voiced_frames.append(frame)
            ring_buffer.append((frame, is_speech))
            num_unvoiced = len([f for f, speech in ring_buffer if not speech])
            if num_unvoiced > threshold * ring_buffer.maxlen:
                triggered = False
                yield b"".join(voiced_frames), frame_duration_ms * max(
                    0, frame_index - len(voiced_frames)
                ), frame_duration_ms * frame_index
                ring_buffer.clear()
                voiced_frames = []
    if len(voiced_frames) > 0:
        yield b"".join(voiced_frames), frame_duration_ms * (frame_index - len(voiced_frames)), frame_duration_ms * (
            frame_index + 1
        )
def __init__(self):
    self.input_buf = np.empty((BLOCK_SIZE, 1), dtype=np.int16)
    self.vad = Vad(2)
    self.vad_q = deque([False], LAG_TIME)
    self.output = cycle(open_wav())