Example No. 1
    def __init__(self, microphone):
        # type: (AbstractMicrophone) -> None

        self._microphone = microphone
        self._vad = Vad(VAD.MODE)

        # Voice Activity Detection Frame Size: VAD works in units of 'frames'
        self._frame_size = VAD.AUDIO_FRAME_MS * self.microphone.rate // 1000
        self._frame_size_bytes = self._frame_size * VAD.AUDIO_TYPE_BYTES

        # Audio & Voice Ring-Buffers
        self._audio_buffer = np.zeros((VAD.BUFFER_SIZE, self._frame_size),
                                      VAD.AUDIO_TYPE)
        self._voice_buffer = np.zeros(VAD.BUFFER_SIZE, bool)  # np.bool was removed in newer NumPy
        self._buffer_index = 0

        self._voice = None
        self._voice_queue = Queue()

        self._frame_buffer = bytearray()

        self._activation = 0

        # Subscribe VAD to Microphone on_audio event
        self.microphone.callbacks += [self._on_audio]
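
For context, the frame-size arithmetic above works out as follows. This is a minimal sketch assuming a 16 kHz microphone, 10 ms VAD frames, and 16-bit samples; the real values come from the VAD class constants (VAD.AUDIO_FRAME_MS, VAD.AUDIO_TYPE_BYTES) and the microphone rate.

# Illustrative values only; the actual constants live on the VAD class.
rate = 16000            # microphone sample rate (Hz)
audio_frame_ms = 10     # length of one VAD frame (ms)
audio_type_bytes = 2    # np.int16 -> 2 bytes per sample

frame_size = audio_frame_ms * rate // 1000        # 160 samples per frame
frame_size_bytes = frame_size * audio_type_bytes  # 320 bytes per frame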
Example No. 2
    def __init__(self):
        super().__init__()

        self._stopPlayingFlag: Optional[AliceEvent] = None
        self._playing = False
        self._waves: Dict[str, wave.Wave_write] = dict()

        if self.ConfigManager.getAliceConfigByName('disableSoundAndMic'):
            return

        with self.Commons.shutUpAlsaFFS():
            self._audio = pyaudio.PyAudio()

        self._vad = Vad(2)

        try:
            self._audioOutput = self._audio.get_default_output_device_info()
        except:
            self.logFatal('Audio output not found, cannot continue')
            return
        else:
            self.logInfo(
                f'Using **{self._audioOutput["name"]}** for audio output')

        try:
            self._audioInput = self._audio.get_default_input_device_info()
        except:
            self.logFatal('Audio input not found, cannot continue')
        else:
            self.logInfo(
                f'Using **{self._audioInput["name"]}** for audio input')
Example No. 3
    def __init__(self, microphone):
        # type: (AbstractMicrophone) -> None
        """
        Perform Voice Activity Detection on Microphone Input
        
        Parameters
        ----------
        microphone: AbstractMicrophone
        """

        self._microphone = microphone

        self._vad = Vad(VAD.MODE)

        # Voice Activity Detection Frame Size, Atomic VAD Unit
        self._frame_size = VAD.AUDIO_FRAME_MS * self.microphone.rate // 1000
        self._frame_size_bytes = self._frame_size * VAD.AUDIO_TYPE_BYTES

        self._audio_buffer = np.zeros((VAD.BUFFER_SIZE, self._frame_size),
                                      VAD.AUDIO_TYPE)
        self._voice_buffer = np.zeros(VAD.BUFFER_SIZE, bool)  # np.bool was removed in newer NumPy
        self._buffer_index = 0

        self._utterance = None
        self._utterance_queue = Queue()

        self._frame_buffer = bytearray()

        self._activation = 0

        self.microphone.callbacks += [self._on_audio]
Example No. 4
    def __init__(self, samplerate, on_noise=None):
        self.samplerate = samplerate
        self.speech_timeout = SPEECH_TIMEOUT
        self.on_noise = on_noise
        self.listening = Lock()
        self.vad = Vad()
        self.vad.set_mode(3)  # very restrictive filtering
Example No. 5
def webrtc_split(audio,
                 rate,
                 aggressiveness=3,
                 frame_duration_ms=30,
                 window_duration_ms=300):
    # adapted from https://github.com/wiseman/py-webrtcvad/blob/master/example.py
    audio_bytes, audio_rate = to_pcm16(audio, rate)

    vad = Vad(aggressiveness)
    num_window_frames = int(window_duration_ms / frame_duration_ms)
    sliding_window = collections.deque(maxlen=num_window_frames)
    triggered = False

    voiced_frames = []
    for frame in generate_frames(audio_bytes, audio_rate, frame_duration_ms):
        is_speech = vad.is_speech(frame.bytes, audio_rate)
        sliding_window.append((frame, is_speech))

        if not triggered:
            num_voiced = len([f for f, speech in sliding_window if speech])
            if num_voiced > 0.9 * sliding_window.maxlen:
                triggered = True
                voiced_frames += [frame for frame, _ in sliding_window]
                sliding_window.clear()
        else:
            voiced_frames.append(frame)
            num_unvoiced = len(
                [f for f, speech in sliding_window if not speech])
            if num_unvoiced > 0.9 * sliding_window.maxlen:
                triggered = False
                yield voiced_frames, audio_rate
                sliding_window.clear()
                voiced_frames = []
    if voiced_frames:
        yield voiced_frames, audio_rate
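
A minimal usage sketch for the generator above. It assumes `webrtc_split` and its helpers (`to_pcm16`, `generate_frames`) come from the same module and that each `Frame` exposes a `.bytes` attribute, as in the upstream py-webrtcvad example; the variable names are illustrative.

# Collect voiced segments as raw 16-bit mono PCM byte strings.
voiced_segments = []
for frames, segment_rate in webrtc_split(audio, rate, aggressiveness=2):
    pcm = b''.join(frame.bytes for frame in frames)
    duration_s = len(pcm) / (2.0 * segment_rate)  # 2 bytes per 16-bit sample
    voiced_segments.append((pcm, duration_s))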
Example No. 6
    def __init__(self, rate=16000, mode=0, duration=1000, on_inactive=None):
        super(VAD, self).__init__()

        self.rate = rate
        self.vad = Vad(mode)
        self.on_inactive = on_inactive
        self.limit_inactive_cnt = duration / 10  # a frame is 10 ms
        self.current_inactive_cnt = 0
Example No. 7
def vad_split(
    audio_frames,
    audio_format=DEFAULT_FORMAT,
    num_padding_frames=10,
    threshold=0.5,
    aggressiveness=3,
):
    sample_rate, channels, width = audio_format
    if channels != 1:
        raise ValueError("VAD-splitting requires mono samples")
    if width != 2:
        raise ValueError("VAD-splitting requires 16 bit samples")
    if sample_rate not in [8000, 16000, 32000, 48000]:
        raise ValueError(
            "VAD-splitting only supported for sample rates 8000, 16000, 32000, or 48000"
        )
    if aggressiveness not in [0, 1, 2, 3]:
        raise ValueError(
            "VAD-splitting aggressiveness mode has to be one of 0, 1, 2, or 3")
    ring_buffer = collections.deque(maxlen=num_padding_frames)
    triggered = False
    vad = Vad(int(aggressiveness))
    voiced_frames = []
    frame_duration_ms = 0
    frame_index = 0
    for frame_index, frame in enumerate(audio_frames):
        frame_duration_ms = get_pcm_duration(len(frame), audio_format) * 1000
        if int(frame_duration_ms) not in [10, 20, 30]:
            raise ValueError(
                "VAD-splitting only supported for frame durations 10, 20, or 30 ms"
            )
        is_speech = vad.is_speech(frame, sample_rate)
        if not triggered:
            ring_buffer.append((frame, is_speech))
            num_voiced = len([f for f, speech in ring_buffer if speech])
            if num_voiced > threshold * ring_buffer.maxlen:
                triggered = True
                for f, s in ring_buffer:
                    voiced_frames.append(f)
                ring_buffer.clear()
        else:
            voiced_frames.append(frame)
            ring_buffer.append((frame, is_speech))
            num_unvoiced = len([f for f, speech in ring_buffer if not speech])
            if num_unvoiced > threshold * ring_buffer.maxlen:
                triggered = False
                yield b"".join(voiced_frames), frame_duration_ms * max(
                    0, frame_index -
                    len(voiced_frames)), frame_duration_ms * frame_index
                ring_buffer.clear()
                voiced_frames = []
    if len(voiced_frames) > 0:
        yield b"".join(voiced_frames), frame_duration_ms * (
            frame_index -
            len(voiced_frames)), frame_duration_ms * (frame_index + 1)
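
A minimal consumption sketch for `vad_split` above. It assumes `audio_frames` is an iterable of fixed-size PCM frames and that `DEFAULT_FORMAT` is `(16000, 1, 2)` (rate, channels, sample width); the names and values are illustrative.

# Each yielded item is (segment bytes, start time in ms, end time in ms).
for segment, start_ms, end_ms in vad_split(audio_frames, audio_format=(16000, 1, 2)):
    print("voiced segment: %.0f-%.0f ms, %d bytes" % (start_ms, end_ms, len(segment)))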
Example No. 8
    def __init__(self, rate: int = 8000, mode: int = 0):
        """Creates a VAD detector with the given configuration

        Args:
            rate (int): The audio sample rate, in Hz.
            mode (int): Operational mode; must be in the range [0, 3].
        """
        self.__rate = rate
        self.__mode = mode
        self.__vad = Vad(mode=mode)
Example No. 9
    def __init__(self):
        super().__init__()

        self._stopPlayingFlag: Optional[AliceEvent] = None
        self._playing = False
        self._waves: Dict[str, wave.Wave_write] = dict()
        self._audioInputStream = None

        if not self.ConfigManager.getAliceConfigByName('disableCapture'):
            self._vad = Vad(2)

        self._audioInput = None
        self._audioOutput = None
Example No. 10
    def check_for_speech(self, frame_duration_ms=20):
        """Checks for speech.

        :param int frame_duration_ms: Audio frame length in ms.

        """
        vad = Vad(self.vad_aggressiveness)
        speaking = False  # to keep track of if vocalization ongoing

        n = int(SAMPLE_RATE * (frame_duration_ms / 1000.) * 2)
        # duration = n / SAMPLE_RATE / 2.0

        last_timestamp_sent = 0

        while not self.done.is_set():
            chunk = self.data_queue.get()

            offset = 0
            framecount = []
            while offset + n < len(chunk):
                now = time.time() * 1000.0  # caveat: this is not the same as PyEPL's clock...
                frame = chunk[offset:offset + n]
                if vad.is_speech(frame, SAMPLE_RATE):
                    framecount.append({"timestamp": now})

                    if len(framecount) >= self.consecutive_frames and not speaking:
                        speaking = True
                        payload = {
                            "speaking": True,
                            "timestamp": framecount[0]["timestamp"]
                        }
                        self.pipe.send(ipc.message("VOCALIZATION", payload))
                        self.logger.debug("Started speaking at %f", now)
                else:
                    if speaking:
                        speaking = False
                        payload = {"speaking": False, "timestamp": now}
                        self.pipe.send(ipc.message("VOCALIZATION", payload))
                        self.logger.debug("Stopped speaking at %f", now)
                    framecount = []

                offset += n

            now = time.time() * 1000
            if now - last_timestamp_sent >= 1000:
                self.pipe.send(ipc.message("TIMESTAMP", dict(timestamp=now)))
                last_timestamp_sent = now
Example No. 11
def iter_wav_chunks(input_uri,
                    input_format,
                    framerate=16000,
                    vad_duration=0.02,
                    min_chunk_len=2,
                    max_chunk_len=10):
    vad = Vad(2)
    bufferbytes = io.BytesIO()
    buffersize = 0
    bufferduration = 0
    remains = b''
    audio_offset = 0.0
    for ok, *payload in \
            stream2wav(input_uri, input_format, framerate):
        if not ok:
            raise RuntimeError(payload[0])
        header, body, _, secondsize = payload
        chunksize = round(secondsize * vad_duration)  # 20 ms frames by default
        body = remains + body
        remains = b''  # reset so leftover bytes are not prepended twice
        if min_chunk_len < 0:
            # no limit
            bufferbytes.write(body)
            buffersize += len(body)
            bufferduration = buffersize / secondsize
            continue
        for offset in range(0, len(body), chunksize):
            chunk = body[offset:offset + chunksize]
            if len(chunk) < chunksize:
                remains = chunk
                break
            if bufferduration < min_chunk_len or \
                    (bufferduration < max_chunk_len and
                     vad.is_speech(chunk, framerate)):
                bufferbytes.write(chunk)
                buffersize += chunksize
                bufferduration += chunksize / secondsize
            elif buffersize > 0:
                audiodata = bufferbytes.getvalue() + chunk
                duration = len(audiodata) / secondsize
                yield (header, audiodata, duration, audio_offset)
                audio_offset += duration
                bufferbytes = io.BytesIO()
                buffersize = 0
                bufferduration = 0
    if buffersize > 0:
        audiodata = bufferbytes.getvalue() + remains
        duration = len(audiodata) / secondsize
        yield (header, audiodata, duration, audio_offset)
Example No. 12
def vad_segment_generator(audio_file, aggressiveness):
    """
    Generate VAD segments. Filters out non-voiced audio frames.
    :param audio_file: Input audio file to run VAD on.
    :param aggressiveness: How aggressive filtering out non-speech is (between 0 and 3)
    :return: Returns tuple of
        segments: a bytearray of multiple smaller audio frames
                  (the longer audio split into multiple smaller ones)
        sample_rate: Sample rate of the input audio file
        audio_length: Duration of the input audio file
    """
    audio = av.open(audio_file)
    sample_rate = 16000
    frames = frame_generator(30, audio.decode(audio=0), sample_rate)
    vad = Vad(int(aggressiveness))
    segments = vad_collector(sample_rate, 30, 300, 0.5, vad, frames)
    return segments, sample_rate, audio.duration / av.time_base
Example No. 13
def vad_segment_generator(audio_file, aggressiveness):
    """
    Generate VAD segments. Filters out non-voiced audio frames.
    :param audio_file: Input audio file to run VAD on.
    :param aggressiveness: How aggressive filtering out non-speech is (between 0 and 3)
    :return: Returns tuple of
        segments: a bytearray of multiple smaller audio frames
                  (the longer audio split into multiple smaller ones)
        sample_rate: Sample rate of the input audio file
        audio_length: Duration of the input audio file
    """
    audio = (AudioSegment.from_file(audio_file)
                         .set_channels(1)
                         .set_frame_rate(16000))
    vad = Vad(int(aggressiveness))
    frames = frame_generator(30, audio.raw_data, audio.frame_rate)
    segments = vad_collector(audio.frame_rate, 30, 300, 0.5, vad, frames)
    return segments, audio.frame_rate, audio.duration_seconds * 1000
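
A short usage sketch for the generator above, assuming each yielded segment is raw 16-bit mono PCM at the returned sample rate; the input and output file names are illustrative.

import wave

segments, sample_rate, audio_length_ms = vad_segment_generator("input.wav", aggressiveness=2)
for i, segment in enumerate(segments):
    with wave.open("segment_%03d.wav" % i, "wb") as out:
        out.setnchannels(1)      # mono
        out.setsampwidth(2)      # 16-bit samples
        out.setframerate(sample_rate)
        out.writeframes(segment)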
Example No. 14
    def __init__(self, microphone, callbacks, mode=3):
        """
        Detect Utterances of People using Voice Activity Detection

        Parameters
        ----------
        microphone: AbstractMicrophone
            Microphone to extract Utterances from
        callbacks: list of callable
            On Utterance Callback
        mode: int
            Voice Activity Detection (VAD) 'Aggressiveness' (0..3)
        """
        self._microphone = microphone
        self._microphone.callbacks += [self._on_audio]
        self._rate = microphone.rate

        self._callbacks = callbacks
        self._vad = Vad(mode)

        # Number of Elements (np.int16) in Frame
        self._frame_size = self.FRAME_MS * self.rate // 1000

        self._ringbuffer_index = 0

        self._activation = 0

        # Initialize Ringbuffers, which will hold Audio data and Vad.is_speech results, respectively
        self._audio_ringbuffer = np.zeros((self.BUFFER_SIZE, self._frame_size),
                                          np.int16)
        self._vad_ringbuffer = np.zeros(self.BUFFER_SIZE, bool)  # np.bool was removed in newer NumPy

        self._audio_buffer = bytearray()  # Audio Buffer will be filled with raw Microphone Audio
        self._voice_buffer = bytearray()  # Voice Buffer will be filled with Voiced Audio

        self._voice = False  # No Voice is present at start

        self._log = logger.getChild(self.__class__.__name__)
        self._log.debug("Booted")
Example No. 15
def get_voice_events(filename, frame_dur, aggressiveness):
    """Evaluate the file for voice events.

    :param str filename:
    :param int frame_dur:
    :param int aggressiveness:

    """
    assert frame_dur in [10, 20, 30]
    assert aggressiveness in range(4)

    vad = Vad()
    vad.set_mode(aggressiveness)
    sample_rate = 16000

    clip = downsample(filename, sample_rate).read()
    return [
        (frame_dur*n, vad.is_speech(frame.bytes, sample_rate))
        for n, frame in enumerate(frame_generator(clip, frame_dur, sample_rate))
    ]
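
A small follow-up sketch using the event list above to estimate total voiced time, assuming `get_voice_events` returns `(offset_ms, is_speech)` pairs as written; the file name is illustrative.

events = get_voice_events("recording.wav", frame_dur=30, aggressiveness=2)
voiced_ms = sum(1 for _, is_speech in events if is_speech) * 30  # each voiced frame is 30 ms
print("voiced: %.1f s" % (voiced_ms / 1000.0))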
Example No. 16
def vad_collector(sample_rate, frame_duration_ms, padding_duration_ms,
                  aggressiveness, audio):
    """Filters out non-voiced audio frames.
    Given a VAD aggressiveness mode and raw PCM audio, yields only
    the voiced audio.
    Uses a padded, sliding window algorithm over the audio frames.
    When more than 90% of the frames in the window are voiced (as
    reported by the VAD), the collector triggers and begins yielding
    audio frames. Then the collector waits until 90% of the frames in
    the window are unvoiced to detrigger.
    The window is padded at the front and back to provide a small
    amount of silence or the beginnings/endings of speech around the
    voiced frames.
    Arguments:
    sample_rate - The audio sample rate, in Hz.
    frame_duration_ms - The frame duration in milliseconds.
    padding_duration_ms - The amount to pad the window, in milliseconds.
    aggressiveness - The VAD aggressiveness mode (0-3).
    audio - Raw PCM audio data (16-bit mono).
    Returns: A generator that yields PCM audio data.
    """
    vad = Vad(aggressiveness)
    num_padding_frames = int(padding_duration_ms / frame_duration_ms)
    # We use a deque for our sliding window/ring buffer.
    ring_buffer = collections.deque(maxlen=num_padding_frames)
    # We have two states: TRIGGERED and NOTTRIGGERED. We start in the
    # NOTTRIGGERED state.
    triggered = False

    voiced_frames = []
    for frame in frame_generator(frame_duration_ms, audio, sample_rate):
        is_speech = vad.is_speech(frame, sample_rate)

        if not triggered:
            ring_buffer.append((frame, is_speech))
            num_voiced = len([f for f, speech in ring_buffer if speech])
            # If we're NOTTRIGGERED and more than 90% of the frames in
            # the ring buffer are voiced frames, then enter the
            # TRIGGERED state.
            if num_voiced > 0.9 * ring_buffer.maxlen:
                triggered = True
                # We want to yield all the audio we see from now until
                # we are NOTTRIGGERED, but we have to start with the
                # audio that's already in the ring buffer.
                for f, s in ring_buffer:
                    voiced_frames.append(f)
                ring_buffer.clear()
        else:
            # We're in the TRIGGERED state, so collect the audio data
            # and add it to the ring buffer.
            voiced_frames.append(frame)
            ring_buffer.append((frame, is_speech))
            num_unvoiced = len([f for f, speech in ring_buffer if not speech])
            # If more than 90% of the frames in the ring buffer are
            # unvoiced, then enter NOTTRIGGERED and yield whatever
            # audio we've collected.
            if num_unvoiced > 0.9 * ring_buffer.maxlen:
                triggered = False
                yield b''.join(voiced_frames)
                ring_buffer.clear()
                voiced_frames = []
    # If we have any leftover voiced audio when we run out of input,
    # yield it.
    if voiced_frames:
        yield b''.join(voiced_frames)
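
A minimal call sketch for the collector above, assuming `audio` is raw 16-bit mono PCM bytes at `sample_rate` and that the `frame_generator` helper comes from the same module; the numbers are illustrative.

sample_rate = 16000
chunks = list(vad_collector(sample_rate, 30, 300, 3, audio))
voiced_bytes = sum(len(chunk) for chunk in chunks)
print("voiced audio: %.1f s of %.1f s total"
      % (voiced_bytes / (2.0 * sample_rate), len(audio) / (2.0 * sample_rate)))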
Example No. 17
    def __init__(self, rate=16000, mode=0):
        super(VAD, self).__init__()

        self.rate = rate
        self.vad = Vad(mode)
Example No. 18
def vad_split(audio_frames, audio_format=DEFAULT_FORMAT, num_padding_frames=10, threshold=0.5, aggressiveness=3):
    """
    Credit: https://github.com/mozilla/DSAlign

    Splits audio into segments using Voice Activity Detection.

    Parameters
    ----------
    audio_frames : list
        List of audio frames
    audio_format : tuple
        Tuple containing the audio sample rate, channels & width
    num_padding_frames : int
        Number of frames to pad
    threshold : float
        Minimum threshold
    aggressiveness : int
        Aggressiveness of the VAD split

    Yields
    -------
    Audio segments as tuples of (segment bytes, start time in ms, end time in ms)
    """

    sample_rate, channels, width = audio_format
    if channels != 1:
        raise ValueError("VAD-splitting requires mono samples")
    if width != 2:
        raise ValueError("VAD-splitting requires 16 bit samples")
    if sample_rate not in [8000, 16000, 32000, 48000]:
        raise ValueError("VAD-splitting only supported for sample rates 8000, 16000, 32000, or 48000")
    if aggressiveness not in [0, 1, 2, 3]:
        raise ValueError("VAD-splitting aggressiveness mode has to be one of 0, 1, 2, or 3")
    ring_buffer = collections.deque(maxlen=num_padding_frames)
    triggered = False
    vad = Vad(int(aggressiveness))
    voiced_frames = []
    frame_duration_ms = 0
    frame_index = 0
    for frame_index, frame in enumerate(audio_frames):
        frame_duration_ms = get_pcm_duration(len(frame), audio_format) * 1000
        if int(frame_duration_ms) not in [10, 20, 30]:
            raise ValueError("VAD-splitting only supported for frame durations 10, 20, or 30 ms")
        is_speech = vad.is_speech(frame, sample_rate)
        if not triggered:
            ring_buffer.append((frame, is_speech))
            num_voiced = len([f for f, speech in ring_buffer if speech])
            if num_voiced > threshold * ring_buffer.maxlen:
                triggered = True
                for f, s in ring_buffer:
                    voiced_frames.append(f)
                ring_buffer.clear()
        else:
            voiced_frames.append(frame)
            ring_buffer.append((frame, is_speech))
            num_unvoiced = len([f for f, speech in ring_buffer if not speech])
            if num_unvoiced > threshold * ring_buffer.maxlen:
                triggered = False
                yield b"".join(voiced_frames), frame_duration_ms * max(
                    0, frame_index - len(voiced_frames)
                ), frame_duration_ms * frame_index
                ring_buffer.clear()
                voiced_frames = []
    if len(voiced_frames) > 0:
        yield b"".join(voiced_frames), frame_duration_ms * (frame_index - len(voiced_frames)), frame_duration_ms * (
            frame_index + 1
        )
Example No. 19
    def __init__(self):
        self.input_buf = np.empty((BLOCK_SIZE, 1), dtype=np.int16)
        self.vad = Vad(2)
        self.vad_q = deque([False], LAG_TIME)
        self.output = cycle(open_wav())