Example #1
    def __init__(self, microphone):
        # type: (AbstractMicrophone) -> None
        """
        Perform Voice Activity Detection on Microphone Input
        
        Parameters
        ----------
        microphone: AbstractMicrophone
        """

        self._microphone = microphone

        self._vad = Vad(VAD.MODE)

        # Voice Activity Detection Frame Size, Atomic VAD Unit
        self._frame_size = VAD.AUDIO_FRAME_MS * self.microphone.rate // 1000
        self._frame_size_bytes = self._frame_size * VAD.AUDIO_TYPE_BYTES

        self._audio_buffer = np.zeros((VAD.BUFFER_SIZE, self._frame_size),
                                      VAD.AUDIO_TYPE)
        self._voice_buffer = np.zeros(VAD.BUFFER_SIZE, np.bool)
        self._buffer_index = 0

        self._utterance = None
        self._utterance_queue = Queue()

        self._frame_buffer = bytearray()

        self._activation = 0

        self.microphone.callbacks += [self._on_audio]
Example #2
    def __init__(self, microphone):
        # type: (AbstractMicrophone) -> None

        self._microphone = microphone
        self._vad = Vad(VAD.MODE)

        # Voice Activity Detection Frame Size: VAD works in units of 'frames'
        self._frame_size = VAD.AUDIO_FRAME_MS * self.microphone.rate // 1000
        self._frame_size_bytes = self._frame_size * VAD.AUDIO_TYPE_BYTES

        # Audio & Voice Ring-Buffers
        self._audio_buffer = np.zeros((VAD.BUFFER_SIZE, self._frame_size),
                                      VAD.AUDIO_TYPE)
        self._voice_buffer = np.zeros(VAD.BUFFER_SIZE, np.bool)
        self._buffer_index = 0

        self._voice = None
        self._voice_queue = Queue()

        self._frame_buffer = bytearray()

        self._activation = 0

        # Subscribe VAD to Microphone on_audio event
        self.microphone.callbacks += [self._on_audio]
Example #3
 def __init__(self, samplerate, on_noise=None):
     self.samplerate = samplerate
     self.speech_timeout = SPEECH_TIMEOUT
     self.on_noise = on_noise
     self.listening = Lock()
     self.vad = Vad()
     self.vad.set_mode(3) # very restrictive filtering
Example #4
def webrtc_split(audio,
                 rate,
                 aggressiveness=3,
                 frame_duration_ms=30,
                 window_duration_ms=300):
    # adapted from https://github.com/wiseman/py-webrtcvad/blob/master/example.py
    audio_bytes, audio_rate = to_pcm16(audio, rate)

    vad = Vad(aggressiveness)
    num_window_frames = int(window_duration_ms / frame_duration_ms)
    sliding_window = collections.deque(maxlen=num_window_frames)
    triggered = False

    voiced_frames = []
    for frame in generate_frames(audio_bytes, audio_rate, frame_duration_ms):
        is_speech = vad.is_speech(frame.bytes, audio_rate)
        sliding_window.append((frame, is_speech))

        if not triggered:
            num_voiced = len([f for f, speech in sliding_window if speech])
            if num_voiced > 0.9 * sliding_window.maxlen:
                triggered = True
                voiced_frames += [frame for frame, _ in sliding_window]
                sliding_window.clear()
        else:
            voiced_frames.append(frame)
            num_unvoiced = len(
                [f for f, speech in sliding_window if not speech])
            if num_unvoiced > 0.9 * sliding_window.maxlen:
                triggered = False
                yield voiced_frames, audio_rate
                sliding_window.clear()
                voiced_frames = []
    if voiced_frames:
        yield voiced_frames, audio_rate
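
The snippet above relies on two helpers that are not shown, to_pcm16 and generate_frames. Below is a minimal sketch of generate_frames, assuming frames are fixed-duration slices of 16-bit mono PCM carried in a small namedtuple with a bytes field (the only attribute the loop above uses); the real project may define it differently.

import collections

# Hypothetical sketch of the generate_frames helper assumed by webrtc_split.
Frame = collections.namedtuple("Frame", ["bytes", "timestamp", "duration"])

def generate_frames(audio_bytes, sample_rate, frame_duration_ms=30):
    # 16-bit mono PCM: 2 bytes per sample.
    frame_size = int(sample_rate * frame_duration_ms / 1000) * 2
    duration = frame_duration_ms / 1000.0
    timestamp = 0.0
    # Yield consecutive full frames; any trailing partial frame is dropped.
    for offset in range(0, len(audio_bytes) - frame_size + 1, frame_size):
        yield Frame(audio_bytes[offset:offset + frame_size], timestamp, duration)
        timestamp += duration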
Example #5
    def __init__(self):
        super().__init__()

        self._stopPlayingFlag: Optional[AliceEvent] = None
        self._playing = False
        self._waves: Dict[str, wave.Wave_write] = dict()

        if self.ConfigManager.getAliceConfigByName('disableSoundAndMic'):
            return

        with self.Commons.shutUpAlsaFFS():
            self._audio = pyaudio.PyAudio()

        self._vad = Vad(2)

        try:
            self._audioOutput = self._audio.get_default_output_device_info()
        except:
            self.logFatal('Audio output not found, cannot continue')
            return
        else:
            self.logInfo(
                f'Using **{self._audioOutput["name"]}** for audio output')

        try:
            self._audioInput = self._audio.get_default_input_device_info()
        except:
            self.logFatal('Audio input not found, cannot continue')
        else:
            self.logInfo(
                f'Using **{self._audioInput["name"]}** for audio input')
Example #6
    def __init__(self, rate=16000, mode=0, duration=1000, on_inactive=None):
        super(VAD, self).__init__()

        self.rate = rate
        self.vad = Vad(mode)
        self.on_inactive = on_inactive
        self.limit_inactive_cnt = duration / 10  # a frame is 10 ms
        self.current_inactive_cnt = 0
Example #7
def vad_split(
    audio_frames,
    audio_format=DEFAULT_FORMAT,
    num_padding_frames=10,
    threshold=0.5,
    aggressiveness=3,
):
    sample_rate, channels, width = audio_format
    if channels != 1:
        raise ValueError("VAD-splitting requires mono samples")
    if width != 2:
        raise ValueError("VAD-splitting requires 16 bit samples")
    if sample_rate not in [8000, 16000, 32000, 48000]:
        raise ValueError(
            "VAD-splitting only supported for sample rates 8000, 16000, 32000, or 48000"
        )
    if aggressiveness not in [0, 1, 2, 3]:
        raise ValueError(
            "VAD-splitting aggressiveness mode has to be one of 0, 1, 2, or 3")
    ring_buffer = collections.deque(maxlen=num_padding_frames)
    triggered = False
    vad = Vad(int(aggressiveness))
    voiced_frames = []
    frame_duration_ms = 0
    frame_index = 0
    for frame_index, frame in enumerate(audio_frames):
        frame_duration_ms = get_pcm_duration(len(frame), audio_format) * 1000
        if int(frame_duration_ms) not in [10, 20, 30]:
            raise ValueError(
                "VAD-splitting only supported for frame durations 10, 20, or 30 ms"
            )
        is_speech = vad.is_speech(frame, sample_rate)
        if not triggered:
            ring_buffer.append((frame, is_speech))
            num_voiced = len([f for f, speech in ring_buffer if speech])
            if num_voiced > threshold * ring_buffer.maxlen:
                triggered = True
                for f, s in ring_buffer:
                    voiced_frames.append(f)
                ring_buffer.clear()
        else:
            voiced_frames.append(frame)
            ring_buffer.append((frame, is_speech))
            num_unvoiced = len([f for f, speech in ring_buffer if not speech])
            if num_unvoiced > threshold * ring_buffer.maxlen:
                triggered = False
                yield b"".join(voiced_frames), frame_duration_ms * max(
                    0, frame_index -
                    len(voiced_frames)), frame_duration_ms * frame_index
                ring_buffer.clear()
                voiced_frames = []
    if len(voiced_frames) > 0:
        yield b"".join(voiced_frames), frame_duration_ms * (
            frame_index -
            len(voiced_frames)), frame_duration_ms * (frame_index + 1)
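
The function assumes a get_pcm_duration helper that converts a buffer length into seconds for the given audio format; a minimal sketch under that assumption:

def get_pcm_duration(num_bytes, audio_format):
    # Duration in seconds of a raw PCM buffer for (sample_rate, channels, width).
    sample_rate, channels, width = audio_format
    return num_bytes / (sample_rate * channels * width)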
Example #8
    def __init__(self, rate: int = 8000, mode: int = 0):
        """Creates a VAD detector with the given configuration

        Args:
            rate (int): The audio sample rate, in Hz.
            mode (int): Operational mode, must be in [0, 3]
        """
        self.__rate = rate
        self.__mode = mode
        self.__vad = Vad(mode=mode)
Example #9
    def __init__(self):
        super().__init__()

        self._stopPlayingFlag: Optional[AliceEvent] = None
        self._playing = False
        self._waves: Dict[str, wave.Wave_write] = dict()
        self._audioInputStream = None

        if not self.ConfigManager.getAliceConfigByName('disableCapture'):
            self._vad = Vad(2)

        self._audioInput = None
        self._audioOutput = None
Example #10
    def check_for_speech(self, frame_duration_ms=20):
        """Checks for speech.

        :param int frame_duration_ms: Audio frame length in ms.

        """
        vad = Vad(self.vad_aggressiveness)
        speaking = False  # tracks whether a vocalization is ongoing

        n = int(SAMPLE_RATE * (frame_duration_ms / 1000.) * 2)
        # duration = n / SAMPLE_RATE / 2.0

        last_timestamp_sent = 0

        while not self.done.is_set():
            chunk = self.data_queue.get()

            offset = 0
            framecount = []
            while offset + n < len(chunk):
                now = time.time() * 1000.0  # caveat: this is not the same as PyEPL's clock...
                frame = chunk[offset:offset + n]
                if vad.is_speech(frame, SAMPLE_RATE):
                    framecount.append({"timestamp": now})

                    if len(framecount) >= self.consecutive_frames and not speaking:
                        speaking = True
                        payload = {
                            "speaking": True,
                            "timestamp": framecount[0]["timestamp"]
                        }
                        self.pipe.send(ipc.message("VOCALIZATION", payload))
                        self.logger.debug("Started speaking at %f", now)
                else:
                    if speaking:
                        speaking = False
                        payload = {"speaking": False, "timestamp": now}
                        self.pipe.send(ipc.message("VOCALIZATION", payload))
                        self.logger.debug("Stopped speaking at %f", now)
                    framecount = []

                offset += n

            now = time.time() * 1000
            if now - last_timestamp_sent >= 1000:
                self.pipe.send(ipc.message("TIMESTAMP", dict(timestamp=now)))
                last_timestamp_sent = now
Example #11
def iter_wav_chunks(input_uri,
                    input_format,
                    framerate=16000,
                    vad_duration=0.02,
                    min_chunk_len=2,
                    max_chunk_len=10):
    vad = Vad(2)
    bufferbytes = io.BytesIO()
    buffersize = 0
    bufferduration = 0
    remains = b''
    audio_offset = .0
    for ok, *payload in \
            stream2wav(input_uri, input_format, framerate):
        if not ok:
            raise RuntimeError(payload[0])
        header, body, _, secondsize = payload
        chunksize = round(secondsize * 0.02)  # 20ms
        body = remains + body
        if min_chunk_len < 0:
            # no limit
            bufferbytes.write(body)
            buffersize += len(body)
            bufferduration = buffersize / secondsize
            continue
        for offset in range(0, len(body), chunksize):
            chunk = body[offset:offset + chunksize]
            if len(chunk) < chunksize:
                remains = chunk
                break
            if bufferduration < min_chunk_len or \
                    (bufferduration < max_chunk_len and
                     vad.is_speech(chunk, framerate)):
                bufferbytes.write(chunk)
                buffersize += chunksize
                bufferduration += chunksize / secondsize
            elif buffersize > 0:
                audiodata = bufferbytes.getvalue() + chunk
                duration = len(audiodata) / secondsize
                yield (header, audiodata, duration, audio_offset)
                audio_offset += duration
                bufferbytes = io.BytesIO()
                buffersize = 0
                bufferduration = 0
    if buffersize > 0:
        audiodata = bufferbytes.getvalue() + remains
        duration = len(audiodata) / secondsize
        yield (header, audiodata, duration, audio_offset)
Example #12
def vad_filter(sample_rate: int, vad: webrtcvad.Vad,
               frames: List[bytes]) -> bytes:
    """
    # Adapted from https://github.com/wiseman/py-webrtcvad/blob/3b39545dbb026d998bf407f1cb86e0ed6192a5a6/example.py#L45
    Filters out non-voiced audio frames.
    Given a webrtcvad.Vad and a source of audio frames, returns the voiced audio.
    Uses a padded, sliding window algorithm over the audio frames.
    All frames between [prev(first(speech_frame)), next(last(speech_frame))] will be considered as speech frames.
    This is done because we want to remove leading and trailing silence here.
    Arguments:
    sample_rate - The audio sample rate, in Hz.
    vad - An instance of webrtcvad.Vad.
    frames - a source of audio frames.
    Returns: the speech within the frames.
    """
    voiced_frames_offsets = [
        i for i, frame in enumerate(frames)
        if vad.is_speech(frame, sample_rate)
    ]
    if len(voiced_frames_offsets) > 0:
        start = max(voiced_frames_offsets[0] - 1, 0)
        end = min(voiced_frames_offsets[-1] + 2, len(frames))  # + 2 because b is not included in list[a:b]
        return b''.join(frames[start:end])
    else:
        return b''.join(frames)
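
A minimal usage sketch (not from the original project): read 16-bit mono PCM from a WAV file, cut it into 30 ms frames and trim the leading and trailing silence. The input file name is a hypothetical placeholder.

import wave

import webrtcvad

# Assumes a 16-bit mono WAV at 8, 16, 32 or 48 kHz (the rates webrtcvad supports).
with wave.open("speech.wav", "rb") as wav:            # hypothetical input file
    sample_rate = wav.getframerate()
    pcm = wav.readframes(wav.getnframes())

frame_bytes = int(sample_rate * 0.03) * 2             # 30 ms of int16 samples
frames = [pcm[i:i + frame_bytes]
          for i in range(0, len(pcm) - frame_bytes + 1, frame_bytes)]
trimmed = vad_filter(sample_rate, webrtcvad.Vad(3), frames)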
Example #13
class VoiceOver(object):
    """
    Bag of state for tracking stuff in the stream callback.
    """
    def __init__(self):
        self.input_buf = np.empty((BLOCK_SIZE, 1), dtype=np.int16)
        self.vad = Vad(2)
        self.vad_q = deque([False], LAG_TIME)
        self.output = cycle(open_wav())

    def output_take(self, n_bytes):
        return list(islice(self.output, n_bytes))

    def input_is_talking(self):
        return sum(self.vad_q) > len(self.vad_q) * NECESSARY_FRACTION

    def callback(self, in_data, out_data, time_info, status):
        self.input_buf = np.concatenate((self.input_buf, in_data))
        if len(self.input_buf) > HOP_SIZE:  # we can pass data to vad
            ten_ms, rest = (self.input_buf[0:HOP_SIZE],
                            self.input_buf[HOP_SIZE:])
            resampled_to_32k = resample(ten_ms, 3 * 320,
                                        axis=0).astype(np.int16).tostring()
            self.vad_q.append(self.vad.is_speech(resampled_to_32k, 32000))
            self.input_buf = rest
        if self.input_is_talking():
            out_data[:] = self.output_take(len(out_data))
        else:
            out_data[:] = np.zeros(out_data.shape)
        return continue_flag
Example #14
class VAD(Element):
    def __init__(self, rate=16000, mode=0, duration=1000, on_inactive=None):
        super(VAD, self).__init__()

        self.rate = rate
        self.vad = Vad(mode)
        self.on_inactive = on_inactive
        self.limit_inactive_cnt = duration / 10  # a frame is 10 ms
        self.current_inactive_cnt = 0

    def put(self, data):
        active = self.vad.is_speech(data, self.rate)
        if active:
            self.current_inactive_cnt = 0
        else:
            self.current_inactive_cnt += 1

        if self.current_inactive_cnt == self.limit_inactive_cnt:
            if callable(self.on_inactive):
                self.on_inactive()
            self.current_inactive_cnt = 0
        super(VAD, self).put(data)

    def on_inactive(self, cb):
        self.on_inactive = cb
Example #15
class Listener:
    q = Queue()
    def __init__(self, samplerate, on_noise=None):
        self.samplerate = samplerate
        self.speech_timeout = SPEECH_TIMEOUT
        self.on_noise = on_noise
        self.listening = Lock()
        self.vad = Vad()
        self.vad.set_mode(3) # very restrictive filtering

    @staticmethod
    def _device_callback(indata, frames, time, status):
        """
        This is called (from a separate thread) for each audio block.
        """
        Listener.q.put(bytes(indata))

    def record(self):
        recorded_data = b''
        current = time.time()
        end = time.time() + self.speech_timeout

        # record until no sound is detected or time is over
        while current <= end:
            data = Listener.q.get()
            recorded_data += data
            if self.vad.is_speech(data, self.samplerate):
                end = time.time() + self.speech_timeout
            current = time.time()
        #print(end - start)
        return recorded_data
        
    def _start(self):
        self.listening.acquire()
        with sd.RawInputStream(samplerate=self.samplerate, channels=1, callback=Listener._device_callback, dtype='int16', blocksize=int(self.samplerate * 0.03)):
            while self.listening.locked():
                data = Listener.q.get()
                if self.on_noise is not None:
                    self.on_noise(data)

    def start(self):
        Thread(target=self._start).start()

    def stop(self):
        if self.listening.locked():
            self.listening.release()
Example #16
def get_voice_events(filename, frame_dur, aggressiveness):
    """Evaluate the file for voice events.

    :param str filename:
    :param int frame_dur:
    :param int aggressiveness:

    """
    assert frame_dur in [10, 20, 30]
    assert aggressiveness in range(4)

    vad = Vad()
    vad.set_mode(aggressiveness)
    sample_rate = 16000

    clip = downsample(filename, sample_rate).read()
    return [
        (frame_dur*n, vad.is_speech(frame.bytes, sample_rate))
        for n, frame in enumerate(frame_generator(clip, frame_dur, sample_rate))
    ]
Example #17
class VAD(Element):
    def __init__(self, rate=16000, mode=0):
        super(VAD, self).__init__()

        self.rate = rate
        self.vad = Vad(mode)

    def put(self, data):
        voice = '1' if self.vad.is_speech(data, self.rate) else '0'
        sys.stdout.write(voice)
        sys.stdout.flush()

        super(VAD, self).put(data)
Example #18
def trim_long_silences(wav: np.ndarray,
                       vad: webrtcvad.Vad = None) -> np.ndarray:
    """
    Ensures that segments without voice in the waveform remain no longer than a
    threshold determined by the VAD parameters in params.py.

    :param wav: the raw waveform as a numpy array of floats
    :param vad: a webrtcvad.Vad object. A new one with mode=3 will be created if None.
    :return: the same waveform with silences trimmed away (length <= original wav length)
    """
    # Compute the voice detection window size
    samples_per_window = (vad_window_length * sampling_rate) // 1000

    # Trim the end of the audio to have a multiple of the window size
    wav = wav[:len(wav) - (len(wav) % samples_per_window)]

    # Convert the float waveform to 16-bit mono PCM
    pcm_wave = struct.pack(f'{len(wav)}h',
                           *(np.round(wav * int16_max)).astype(np.int16))

    # Perform voice activation detection
    voice_flags = []
    if vad is None:
        vad = webrtcvad.Vad(mode=3)
    for window_start in range(0, len(wav), samples_per_window):
        window_end = window_start + samples_per_window
        voice_flags.append(
            vad.is_speech(pcm_wave[window_start * 2:window_end * 2],
                          sample_rate=sampling_rate))
    voice_flags = np.array(voice_flags)

    # Smooth the voice detection with a moving average
    def moving_average(array, width):
        array_padded = np.concatenate((np.zeros(
            (width - 1) // 2), array, np.zeros(width // 2)))
        ret = np.cumsum(array_padded, dtype=float)
        ret[width:] = ret[width:] - ret[:-width]
        return ret[width - 1:] / width

    audio_mask = moving_average(voice_flags, vad_moving_average_width)
    audio_mask = np.round(audio_mask).astype(np.bool)

    # Dilate the voiced regions
    audio_mask = binary_dilation(audio_mask,
                                 np.ones(vad_max_silence_length + 1))
    audio_mask = np.repeat(audio_mask, samples_per_window)
    return wav[audio_mask]
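
The function reads several constants from params.py that are not shown. The values below are assumptions for illustration only; the actual project may use different settings.

# Assumed params.py constants used by trim_long_silences (illustrative only).
int16_max = (2 ** 15) - 1        # scale factor for float -> int16 conversion
sampling_rate = 16000            # audio sample rate in Hz
vad_window_length = 30           # VAD window in ms (must be 10, 20 or 30)
vad_moving_average_width = 8     # number of windows in the smoothing average
vad_max_silence_length = 6       # dilation width for voiced regions, in windows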
Example #19
def vad_segment_generator(audio_file, aggressiveness):
    """
    Generate VAD segments. Filters out non-voiced audio frames.
    :param audio_file: Input audio file to run VAD on.
    :param aggressiveness: How aggressive filtering out non-speech is (between 0 and 3)
    :return: Returns tuple of
        segments: a bytearray of multiple smaller audio frames
                  (The longer audio split into multiple smaller ones)
        sample_rate: Sample rate of the input audio file
        audio_length: Duration of the input audio file
    """
    audio = av.open(audio_file)
    sample_rate = 16000
    frames = frame_generator(30, audio.decode(audio=0), sample_rate)
    vad = Vad(int(aggressiveness))
    segments = vad_collector(sample_rate, 30, 300, 0.5, vad, frames)
    return segments, sample_rate, audio.duration / av.time_base
Example #20
def vad_segment_generator(audio_file, aggressiveness):
    """
    Generate VAD segments. Filters out non-voiced audio frames.
    :param audio_file: Input audio file to run VAD on.
    :param aggressiveness: How aggressive filtering out non-speech is (between 0 and 3)
    :return: Returns tuple of
        segments: a bytearray of multiple smaller audio frames
                  (The longer audio split into multiple smaller ones)
        sample_rate: Sample rate of the input audio file
        audio_length: Duration of the input audio file
    """
    audio = (AudioSegment.from_file(audio_file)
                         .set_channels(1)
                         .set_frame_rate(16000))
    vad = Vad(int(aggressiveness))
    frames = frame_generator(30, audio.raw_data, audio.frame_rate)
    segments = vad_collector(audio.frame_rate, 30, 300, 0.5, vad, frames)
    return segments, audio.frame_rate, audio.duration_seconds * 1000
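
A minimal usage sketch (assumed, not part of the original code): write each voiced segment returned by the generator to its own mono WAV file. The input path is a hypothetical placeholder, and frame_generator/vad_collector are the helpers the function above already depends on.

import wave

segments, sample_rate, audio_length_ms = vad_segment_generator("input.mp3", aggressiveness=2)
for i, segment in enumerate(segments):
    with wave.open(f"segment_{i:03d}.wav", "wb") as out:
        out.setnchannels(1)           # mono
        out.setsampwidth(2)           # 16-bit samples
        out.setframerate(sample_rate)
        out.writeframes(segment)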
Example #21
    def __init__(self, microphone, callbacks, mode=3):
        """
        Detect Utterances of People using Voice Activity Detection

        Parameters
        ----------
        microphone: AbstractMicrophone
            Microphone to extract Utterances from
        callbacks: list of callable
            On Utterance Callback
        mode: int
            Voice Activity Detection (VAD) 'Aggressiveness' (1..3)
        """
        self._microphone = microphone
        self._microphone.callbacks += [self._on_audio]
        self._rate = microphone.rate

        self._callbacks = callbacks
        self._vad = Vad(mode)

        # Number of Elements (np.int16) in Frame
        self._frame_size = self.FRAME_MS * self.rate // 1000

        self._ringbuffer_index = 0

        self._activation = 0

        # Initialize Ringbuffers, which will hold Audio data and Vad.is_speech results, respectively
        self._audio_ringbuffer = np.zeros((self.BUFFER_SIZE, self._frame_size),
                                          np.int16)
        self._vad_ringbuffer = np.zeros(self.BUFFER_SIZE, np.bool)

        self._audio_buffer = bytearray()  # filled with raw Microphone Audio
        self._voice_buffer = bytearray()  # filled with Voiced Audio

        self._voice = False  # No Voice is present at start

        self._log = logger.getChild(self.__class__.__name__)
        self._log.debug("Booted")
Example #22
    def __init__(self, rate=16000, mode=0):
        super(VAD, self).__init__()

        self.rate = rate
        self.vad = Vad(mode)
Example #23
class VAD:
    """This class implements a Voice Activity Detector.

    The voice activity detector is a critical component in any speech
    processing application. It is able to identify the presence or absence of
    human speech in an audio frame.

    Generally, it is used to deactivate some processes during the non-speech
    sections of an audio session, saving computation and network bandwidth.

    Notes:
        This algorithm was implemented in the WebRTC project. The algorithm was
        originally designed to work with 8KHz, 16 bit PCM, mono audio samples.

        The algorithm accepts sampling rates of 8000 Hz, 16000 Hz, 32000 Hz and
        48000 Hz, but internally all processing is done at 8000 Hz; input data
        at higher sample rates is simply down-sampled first.
    """
    def __init__(self, rate: int = 8000, mode: int = 0):
        """Creates a VAD detector with the given configuration

        Args:
            rate (int): The audio sample rate, in Hz.
            mode (int): Operational mode, must be in [0, 3]
        """
        self.__rate = rate
        self.__mode = mode
        self.__vad = Vad(mode=mode)

    @property
    def mode(self) -> int:
        """Returns an integer representing the operational mode"""
        return self.__mode

    @property
    def sample_rate(self) -> int:
        """Returns the sampling rate in Hz."""
        return self.__rate

    @mode.setter
    def mode(self, mode: int):
        """Set the operational mode of the VAD

        A more aggressive (higher mode) VAD is more restrictive in reporting
        speech.

        Put in other words the probability of being speech when the VAD
        returns 1 is increased with increasing mode. As a consequence also the
        missed detection rate goes up.

        Valid modes are:
            - 0 ("quality"):
            - 1 ("low bitrate"),
            - 2 ("aggressive")
            - 3 ("very aggressive").

        The default mode is 0.

        Args:
            mode (int): Operational mode, must be in [0, 3]
        """
        self.__mode = mode
        self.__vad.set_mode(mode)

    @profile
    def process(self, data: np.ndarray) -> bool:
        """Checks if the given data contains human speech.

        Args:
            data (np.ndarray): An array containing the data

        Returns:
            True if the audio data contains speech, false otherwise

        Notes:
            The input data must be an array of signed 16-bit samples or an array
            of floating points storing values in the same range [-32,768,
            32,767]

            Only mono frames with a length of 10, 20 or 30 ms are supported. For
            instance, if the class is using a sampling rate of 8KHz, the
            processing function is expecting a numpy.ndarray of shape [80, N],
            [160, N] or [240, N] where N is the number of channels in the input
            data. The signal may be down-mixed to a single channel before
            processing.
        """
        mono = np.mean(a=data, axis=0, dtype=np.float32)
        mono = Converter.fromFloatToInt16(mono)
        mono = Converter.interleave(mono)
        result = self.__vad.is_speech(buf=mono,
                                      sample_rate=self.sample_rate,
                                      length=mono.size())
        if (result < 0):
            raise RuntimeError(
                "Invalid frame length. Only frames with a length of 10, 20 or 30 ms are supported."
            )
        return result
Example #24
def vad_collector(sample_rate, frame_duration_ms, padding_duration_ms,
                  aggressiveness, audio):
    """Filters out non-voiced audio frames.
    Given a webrtcvad.Vad and a source of audio frames, yields only
    the voiced audio.
    Uses a padded, sliding window algorithm over the audio frames.
    When more than 90% of the frames in the window are voiced (as
    reported by the VAD), the collector triggers and begins yielding
    audio frames. Then the collector waits until 90% of the frames in
    the window are unvoiced to detrigger.
    The window is padded at the front and back to provide a small
    amount of silence or the beginnings/endings of speech around the
    voiced frames.
    Arguments:
    sample_rate - The audio sample rate, in Hz.
    frame_duration_ms - The frame duration in milliseconds.
    padding_duration_ms - The amount to pad the window, in milliseconds.
    vad - An instance of webrtcvad.Vad.
    frames - a source of audio frames (sequence or generator).
    Returns: A generator that yields PCM audio data.
    """
    vad = Vad(aggressiveness)
    num_padding_frames = int(padding_duration_ms / frame_duration_ms)
    # We use a deque for our sliding window/ring buffer.
    ring_buffer = collections.deque(maxlen=num_padding_frames)
    # We have two states: TRIGGERED and NOTTRIGGERED. We start in the
    # NOTTRIGGERED state.
    triggered = False

    voiced_frames = []
    for frame in frame_generator(30, audio, sample_rate):
        is_speech = vad.is_speech(frame, sample_rate)

        if not triggered:
            ring_buffer.append((frame, is_speech))
            num_voiced = len([f for f, speech in ring_buffer if speech])
            # If we're NOTTRIGGERED and more than 90% of the frames in
            # the ring buffer are voiced frames, then enter the
            # TRIGGERED state.
            if num_voiced > 0.9 * ring_buffer.maxlen:
                triggered = True
                # We want to yield all the audio we see from now until
                # we are NOTTRIGGERED, but we have to start with the
                # audio that's already in the ring buffer.
                for f, s in ring_buffer:
                    voiced_frames.append(f)
                ring_buffer.clear()
        else:
            # We're in the TRIGGERED state, so collect the audio data
            # and add it to the ring buffer.
            voiced_frames.append(frame)
            ring_buffer.append((frame, is_speech))
            num_unvoiced = len([f for f, speech in ring_buffer if not speech])
            # If more than 90% of the frames in the ring buffer are
            # unvoiced, then enter NOTTRIGGERED and yield whatever
            # audio we've collected.
            if num_unvoiced > 0.9 * ring_buffer.maxlen:
                triggered = False
                yield b''.join([f for f in voiced_frames])
                ring_buffer.clear()
                voiced_frames = []
    # If we have any leftover voiced audio when we run out of input,
    # yield it.
    if voiced_frames:
        yield b''.join([f for f in voiced_frames])
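
This version of vad_collector builds its own frames via a frame_generator helper that is not shown. A minimal sketch, assuming it yields raw byte slices of frame_duration_ms of 16-bit mono PCM (which is what is_speech and the b''.join above expect):

def frame_generator(frame_duration_ms, audio, sample_rate):
    # Yield consecutive frame_duration_ms slices of 16-bit mono PCM bytes.
    frame_size = int(sample_rate * frame_duration_ms / 1000) * 2
    for offset in range(0, len(audio) - frame_size + 1, frame_size):
        yield audio[offset:offset + frame_size]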
Example #25
class VAD(object):

    AUDIO_FRAME_MS = 10
    BUFFER_SIZE = 100

    AUDIO_TYPE = np.int16
    AUDIO_TYPE_BYTES = 2

    VOICE_THRESHOLD = 0.6
    VOICE_WINDOW = 50

    MODE = 3

    def __init__(self, microphone):
        # type: (AbstractMicrophone) -> None
        """
        Perform Voice Activity Detection on Microphone Input
        
        Parameters
        ----------
        microphone: AbstractMicrophone
        """

        self._microphone = microphone

        self._vad = Vad(VAD.MODE)

        # Voice Activity Detection Frame Size, Atomic VAD Unit
        self._frame_size = VAD.AUDIO_FRAME_MS * self.microphone.rate // 1000
        self._frame_size_bytes = self._frame_size * VAD.AUDIO_TYPE_BYTES

        self._audio_buffer = np.zeros((VAD.BUFFER_SIZE, self._frame_size),
                                      VAD.AUDIO_TYPE)
        self._voice_buffer = np.zeros(VAD.BUFFER_SIZE, np.bool)
        self._buffer_index = 0

        self._utterance = None
        self._utterance_queue = Queue()

        self._frame_buffer = bytearray()

        self._activation = 0

        self.microphone.callbacks += [self._on_audio]

    @property
    def microphone(self):
        """
        VAD Microphone
        
        Returns
        -------
        microphone: AbstractMicrophone
        """
        return self._microphone

    @property
    def activation(self):
        """
        VAD Activation
        
        Returns
        -------
        activation: float
        """
        return self._activation

    @property
    def utterances(self):
        # type: () -> Iterable[Utterance]
        """
        Get Utterances from Microphone Stream

        Yields
        -------
        utterances: Iterable of Utterance
        """
        while True:
            yield self._utterance_queue.get()

    def _on_audio(self, audio):
        # type: (np.ndarray) -> None

        # Work through Microphone Stream Frame by Frame
        self._frame_buffer.extend(audio.tobytes())
        while len(self._frame_buffer) >= self._frame_size_bytes:
            self._on_frame(
                np.frombuffer(self._frame_buffer[:self._frame_size_bytes],
                              VAD.AUDIO_TYPE))
            del self._frame_buffer[:self._frame_size_bytes]

    def _on_frame(self, frame):
        self._activation = self._calculate_activation(frame)

        if not self._utterance:
            if self.activation > VAD.VOICE_THRESHOLD:

                # Create New Utterance Object
                self._utterance = Utterance()

                # Add Buffer Contents to Utterance
                self._utterance.add_frame(
                    self._audio_buffer[self._buffer_index:].ravel())
                self._utterance.add_frame(
                    self._audio_buffer[:self._buffer_index].ravel())

                # Add Utterance to Utterance Queue
                self._utterance_queue.put(self._utterance)
        else:
            # If Utterance Ongoing: Add Frame to Utterance Object
            if self.activation > VAD.VOICE_THRESHOLD:
                self._utterance.add_frame(frame)

            # Else: Terminate Utterance
            else:
                self._utterance.add_frame(None)
                self._utterance = None

    def _calculate_activation(self, frame):
        # Update Buffers
        self._audio_buffer[self._buffer_index] = frame
        self._voice_buffer[self._buffer_index] = self._vad.is_speech(
            frame.tobytes(), self.microphone.rate, len(frame))
        self._buffer_index = (self._buffer_index + 1) % VAD.BUFFER_SIZE

        # Calculate Activation
        voice_window = np.arange(self._buffer_index - VAD.VOICE_WINDOW,
                                 self._buffer_index) % VAD.BUFFER_SIZE
        return np.mean(self._voice_buffer[voice_window])

    def __iter__(self):
        return self.utterances
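
A minimal usage sketch (assumed, not part of the original code): the class is iterable, so detected utterances can be consumed straight from its internal queue. Both the microphone object and the handler are hypothetical placeholders.

vad = VAD(microphone)              # microphone: any AbstractMicrophone implementation
for utterance in vad:              # blocks until the next Utterance is queued
    handle_utterance(utterance)    # hypothetical downstream handler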
Example #26
class AudioManager(Manager):
    SAMPLERATE = 16000
    FRAMES_PER_BUFFER = 320

    LAST_USER_SPEECH = 'var/cache/lastUserpeech_{}_{}.wav'
    SECOND_LAST_USER_SPEECH = 'var/cache/secondLastUserSpeech_{}_{}.wav'

    def __init__(self):
        super().__init__()

        self._stopPlayingFlag: Optional[AliceEvent] = None
        self._playing = False
        self._waves: Dict[str, wave.Wave_write] = dict()
        self._audioInputStream = None

        if not self.ConfigManager.getAliceConfigByName('disableCapture'):
            self._vad = Vad(2)

        self._audioInput = None
        self._audioOutput = None

    def onStart(self):
        super().onStart()

        if not self.ConfigManager.getAliceConfigByName('inputDevice'):
            self.logWarning(
                'Input device not set in config, trying to find default device'
            )
            try:
                self._audioInput = sd.query_devices(kind='input')['name']
            except:
                self.logFatal('Audio input not found, cannot continue')
                return
            self.ConfigManager.updateAliceConfiguration(key='inputDevice',
                                                        value=self._audioInput)
        else:
            self._audioInput = self.ConfigManager.getAliceConfigByName(
                'inputDevice')

        if not self.ConfigManager.getAliceConfigByName('outputDevice'):
            self.logWarning(
                'Output device not set in config, trying to find default device'
            )
            try:
                self._audioOutput = sd.query_devices(kind='output')['name']
            except:
                self.logFatal('Audio output not found, cannot continue')
                return
            self.ConfigManager.updateAliceConfiguration(
                key='outputDevice', value=self._audioOutput)
        else:
            self._audioOutput = self.ConfigManager.getAliceConfigByName(
                'outputDevice')

        self.setDefaults()

        self._stopPlayingFlag = self.ThreadManager.newEvent('stopPlaying')
        self.MqttManager.mqttClient.subscribe(
            constants.TOPIC_AUDIO_FRAME.format(
                self.ConfigManager.getAliceConfigByName('uuid')))

    def onBooted(self):
        if not self.ConfigManager.getAliceConfigByName('disableCapture'):
            self.ThreadManager.newThread(name='audioPublisher',
                                         target=self.publishAudio)

    def setDefaults(self):
        self.logInfo(f'Using **{self._audioInput}** for audio input')
        self.logInfo(f'Using **{self._audioOutput}** for audio output')

        sd.default.device = self._audioInput, self._audioOutput

    def onStop(self):
        super().onStop()
        if self._audioInputStream:
            self._audioInputStream.stop(ignore_errors=True)
            self._audioInputStream.close(ignore_errors=True)
        self.MqttManager.mqttClient.unsubscribe(
            constants.TOPIC_AUDIO_FRAME.format(
                self.DeviceManager.getMainDevice().uid))

    def onStartListening(self, session: DialogSession):
        if not self.ConfigManager.getAliceConfigByName(
                'recordAudioAfterWakeword'
        ) and self.WakewordRecorder.state != WakewordRecorderState.RECORDING:
            return

        path = Path(
            self.LAST_USER_SPEECH.format(session.user, session.deviceUid))

        if path.exists():
            path.rename(
                Path(
                    self.SECOND_LAST_USER_SPEECH.format(
                        session.user, session.deviceUid)))

        waveFile = wave.open(str(path), 'wb')
        waveFile.setsampwidth(2)
        waveFile.setframerate(self.AudioServer.SAMPLERATE)
        waveFile.setnchannels(1)
        self._waves[session.deviceUid] = waveFile

    def onCaptured(self, session: DialogSession):
        wav = self._waves.pop(session.deviceUid, None)
        if not wav:
            return
        wav.close()

    def recordFrame(self, deviceUid: str, frame: bytes):
        if deviceUid not in self._waves:
            return

        self._waves[deviceUid].writeframes(frame)

    def publishAudio(self) -> None:
        """
		captures the audio and broadcasts it via publishAudioFrames to the topic 'hermes/audioServer/{}/audioFrame'
		furthermore it will publish VAD_UP and VAD_DOWN when detected
		:return:
		"""
        self.logInfo('Starting audio publisher')
        self._audioInputStream = sd.RawInputStream(
            dtype='int16',
            channels=1,
            samplerate=self.SAMPLERATE,
            blocksize=self.FRAMES_PER_BUFFER,
        )
        self._audioInputStream.start()

        speech = False
        silence = self.SAMPLERATE / self.FRAMES_PER_BUFFER
        speechFrames = 0
        minSpeechFrames = round(silence / 3)

        while True:
            if self.ProjectAlice.shuttingDown:
                break

            try:
                frames = self._audioInputStream.read(
                    frames=self.FRAMES_PER_BUFFER)[0]

                if self._vad.is_speech(frames, self.SAMPLERATE):
                    if not speech and speechFrames < minSpeechFrames:
                        speechFrames += 1
                    elif speechFrames >= minSpeechFrames:
                        speech = True
                        self.MqttManager.publish(
                            topic=constants.TOPIC_VAD_UP.format(
                                self.DeviceManager.getMainDevice().uid),
                            payload={
                                'siteId':
                                self.DeviceManager.getMainDevice().uid
                            })
                        silence = self.SAMPLERATE / self.FRAMES_PER_BUFFER
                        speechFrames = 0
                else:
                    if speech:
                        if silence > 0:
                            silence -= 1
                        else:
                            speech = False
                            self.MqttManager.publish(
                                topic=constants.TOPIC_VAD_DOWN.format(
                                    self.DeviceManager.getMainDevice().uid),
                                payload={
                                    'siteId':
                                    self.DeviceManager.getMainDevice().uid
                                })
                    else:
                        speechFrames = 0

                self.publishAudioFrames(frames)
            except Exception as e:
                self.logDebug(f'Error publishing frame: {e}')

    def publishAudioFrames(self, frames: bytes) -> None:
        """
		receives some audio frames, adds them to the buffer and publishes them to MQTT
		:param frames:
		:return:
		"""
        with io.BytesIO() as buffer:
            with wave.open(buffer, 'wb') as wav:
                wav.setnchannels(1)
                wav.setsampwidth(2)
                wav.setframerate(self.SAMPLERATE)
                wav.writeframes(frames)

            audioFrames = buffer.getvalue()
            self.MqttManager.publish(topic=constants.TOPIC_AUDIO_FRAME.format(
                self.DeviceManager.getMainDevice().uid),
                                     payload=bytearray(audioFrames))

    def onPlayBytes(self,
                    payload: bytearray,
                    deviceUid: str,
                    sessionId: str = None,
                    requestId: str = None):
        """
		Handles the playing of arbitrary bytes, be it sound, voice or even music.
		Triggered via MQTT onPlayBytes topic.
		Ignoring any request when sound is disabled via config
		:param payload:
		:param deviceUid:
		:param sessionId:
		:param requestId:
		:return:
		"""
        if (deviceUid != self.DeviceManager.getMainDevice().uid
                or self.ConfigManager.getAliceConfigByName('disableSound')
                or self.DeviceManager.getDevice(uid=deviceUid).getParam('soundMuted')):
            return

        requestId = requestId or sessionId or str(uuid.uuid4())

        if self.ConfigManager.getAliceConfigByName('debug'):
            with Path('/tmp/onPlayBytes.wav').open('wb') as file:
                file.write(payload)

        self._playing = True
        with io.BytesIO(payload) as buffer:
            try:
                with wave.open(buffer, 'rb') as wav:
                    channels = wav.getnchannels()
                    framerate = wav.getframerate()

                    def streamCallback(outData: buffer, frames: int,
                                       _time: CData,
                                       _status: sd.CallbackFlags):
                        data = wav.readframes(frames)
                        if len(data) < len(outData):
                            outData[:len(data)] = data
                            outData[len(data):] = b'\x00' * (len(outData) -
                                                             len(data))
                            raise sd.CallbackStop
                        else:
                            outData[:] = data

                    stream = sd.RawOutputStream(dtype='int16',
                                                channels=channels,
                                                samplerate=framerate,
                                                callback=streamCallback)

                    self.logDebug(
                        f'Playing wav stream using **{self._audioOutput}** audio output from device **{self.DeviceManager.getDevice(uid=deviceUid).displayName}** (channels: {channels}, rate: {framerate})'
                    )
                    stream.start()
                    while stream.active:
                        if self._stopPlayingFlag.is_set():
                            if not sessionId:
                                raise PlayBytesStopped

                            session = self.DialogManager.getSession(
                                sessionId=sessionId)
                            if session.lastWasSoundPlayOnly:
                                raise PlayBytesStopped

                            self.MqttManager.publish(
                                topic=constants.TOPIC_TTS_FINISHED,
                                payload={
                                    'id': requestId,
                                    'sessionId': sessionId,
                                    'siteId': deviceUid
                                })
                            self.DialogManager.onEndSession(session)

                        time.sleep(0.1)
            except PlayBytesStopped:
                self.logDebug('Playing bytes stopped')
            except Exception as e:
                self.logError(f'Playing wav failed with error: {e}')
            finally:
                self.logDebug('Playing bytes finished')
                stream.stop()
                stream.close()
                self._stopPlayingFlag.clear()
                self._playing = False

        # Session id support is not Hermes protocol official
        self.MqttManager.publish(
            topic=constants.TOPIC_PLAY_BYTES_FINISHED.format(deviceUid),
            payload={
                'id': requestId,
                'sessionId': sessionId
            })

    def stopPlaying(self):
        self._stopPlayingFlag.set()

    def updateAudioDevices(self):
        self._audioInput = self.ConfigManager.getAliceConfigByName(
            'inputDevice')
        self._audioOutput = self.ConfigManager.getAliceConfigByName(
            'outputDevice')
        self.setDefaults()

    @property
    def isPlaying(self) -> bool:
        return self._playing
Example #27
class AudioManager(Manager):

    SAMPLERATE = 16000
    FRAMES_PER_BUFFER = 320

    LAST_USER_SPEECH = 'var/cache/lastUserpeech_{}_{}.wav'
    SECOND_LAST_USER_SPEECH = 'var/cache/secondLastUserSpeech_{}_{}.wav'

    # Inspired by https://github.com/koenvervloesem/hermes-audio-server

    def __init__(self):
        super().__init__()

        self._stopPlayingFlag: Optional[AliceEvent] = None
        self._playing = False
        self._waves: Dict[str, wave.Wave_write] = dict()

        if self.ConfigManager.getAliceConfigByName('disableSoundAndMic'):
            return

        with self.Commons.shutUpAlsaFFS():
            self._audio = pyaudio.PyAudio()

        self._vad = Vad(2)

        try:
            self._audioOutput = self._audio.get_default_output_device_info()
        except:
            self.logFatal('Audio output not found, cannot continue')
            return
        else:
            self.logInfo(
                f'Using **{self._audioOutput["name"]}** for audio output')

        try:
            self._audioInput = self._audio.get_default_input_device_info()
        except:
            self.logFatal('Audio input not found, cannot continue')
        else:
            self.logInfo(
                f'Using **{self._audioInput["name"]}** for audio input')

    def onStart(self):
        super().onStart()
        self._stopPlayingFlag = self.ThreadManager.newEvent('stopPlaying')
        self.MqttManager.mqttClient.subscribe(
            constants.TOPIC_AUDIO_FRAME.format(
                self.ConfigManager.getAliceConfigByName('uuid')))

        if not self.ConfigManager.getAliceConfigByName('disableSoundAndMic'):
            self.ThreadManager.newThread(name='audioPublisher',
                                         target=self.publishAudio)

    def onStop(self):
        super().onStop()
        self.MqttManager.mqttClient.unsubscribe(
            constants.TOPIC_AUDIO_FRAME.format(
                self.ConfigManager.getAliceConfigByName('uuid')))

        if not self.ConfigManager.getAliceConfigByName('disableSoundAndMic'):
            self._audio.terminate()

    def onStartListening(self, session: DialogSession):
        if not self.ConfigManager.getAliceConfigByName(
                'recordAudioAfterWakeword'):
            return

        path = Path(self.LAST_USER_SPEECH.format(session.user, session.siteId))

        if path.exists():
            path.rename(
                Path(
                    self.SECOND_LAST_USER_SPEECH.format(
                        session.user, session.siteId)))

        waveFile = wave.open(str(path), 'wb')
        waveFile.setsampwidth(2)
        waveFile.setframerate(self.AudioServer.SAMPLERATE)
        waveFile.setnchannels(1)
        self._waves[session.siteId] = waveFile

    def onCaptured(self, session: DialogSession):
        wav = self._waves.pop(session.siteId, None)
        if not wav:
            return
        wav.close()

    def recordFrame(self, siteId: str, frame: bytes):
        if siteId not in self._waves:
            return

        self._waves[siteId].writeframes(frame)

    def publishAudio(self):
        self.logInfo('Starting audio publisher')
        audioStream = self._audio.open(
            format=pyaudio.paInt16,
            channels=1,
            rate=self.SAMPLERATE,
            frames_per_buffer=self.FRAMES_PER_BUFFER,
            input=True)

        speech = False
        silence = self.SAMPLERATE / self.FRAMES_PER_BUFFER
        speechFrames = 0
        minSpeechFrames = round(silence / 3)

        while True:
            if self.ProjectAlice.shuttingDown:
                break

            try:
                frames = audioStream.read(num_frames=self.FRAMES_PER_BUFFER,
                                          exception_on_overflow=False)

                if self._vad.is_speech(frames, self.SAMPLERATE):
                    if not speech and speechFrames < minSpeechFrames:
                        speechFrames += 1
                    elif speechFrames >= minSpeechFrames:
                        speech = True
                        self.MqttManager.publish(
                            topic=constants.TOPIC_VAD_UP.format(
                                self.ConfigManager.getAliceConfigByName(
                                    'uuid')),
                            payload={
                                'siteId':
                                self.ConfigManager.getAliceConfigByName('uuid')
                            })
                        silence = self.SAMPLERATE / self.FRAMES_PER_BUFFER
                        speechFrames = 0
                else:
                    if speech:
                        if silence > 0:
                            silence -= 1
                        else:
                            speech = False
                            self.MqttManager.publish(
                                topic=constants.TOPIC_VAD_DOWN.format(
                                    self.ConfigManager.getAliceConfigByName(
                                        'uuid')),
                                payload={
                                    'siteId':
                                    self.ConfigManager.getAliceConfigByName(
                                        'uuid')
                                })
                    else:
                        speechFrames = 0

                self.publishAudioFrames(frames)
            except Exception as e:
                self.logDebug(f'Error publishing frame: {e}')

    def publishAudioFrames(self, frames: bytes):
        with io.BytesIO() as buffer:
            with wave.open(buffer, 'wb') as wav:
                wav.setnchannels(1)
                wav.setsampwidth(2)
                wav.setframerate(self.SAMPLERATE)
                wav.writeframes(frames)

            audioFrames = buffer.getvalue()
            self.MqttManager.publish(topic=constants.TOPIC_AUDIO_FRAME.format(
                self.ConfigManager.getAliceConfigByName('uuid')),
                                     payload=bytearray(audioFrames))

    def onPlayBytes(self,
                    requestId: str,
                    payload: bytearray,
                    siteId: str,
                    sessionId: str = None):
        if siteId != self.ConfigManager.getAliceConfigByName(
                'uuid') or self.ConfigManager.getAliceConfigByName(
                    'disableSoundAndMic'):
            return

        self._playing = True
        with io.BytesIO(payload) as buffer:
            try:
                with wave.open(buffer, 'rb') as wav:
                    sampleWidth = wav.getsampwidth()
                    nFormat = self._audio.get_format_from_width(sampleWidth)
                    channels = wav.getnchannels()
                    framerate = wav.getframerate()

                    def streamCallback(_inData, frameCount, _timeInfo,
                                       _status) -> tuple:
                        data = wav.readframes(frameCount)
                        return data, pyaudio.paContinue

                    audioStream = self._audio.open(
                        format=nFormat,
                        channels=channels,
                        rate=framerate,
                        output=True,
                        output_device_index=self._audioOutput['index'],
                        stream_callback=streamCallback)

                    self.logDebug(
                        f'Playing wav stream using **{self._audioOutput["name"]}** audio output from site id **{self.DeviceManager.siteIdToDeviceName(siteId)}** (Format: {nFormat}, channels: {channels}, rate: {framerate})'
                    )
                    audioStream.start_stream()
                    while audioStream.is_active():
                        if self._stopPlayingFlag.is_set():
                            audioStream.stop_stream()
                            audioStream.close()

                            if sessionId:
                                self.MqttManager.publish(
                                    topic=constants.TOPIC_TTS_FINISHED,
                                    payload={
                                        'id': requestId,
                                        'sessionId': sessionId,
                                        'siteId': siteId
                                    })
                                self.DialogManager.onEndSession(
                                    self.DialogManager.getSession(sessionId))

                            raise PlayBytesStopped
                        time.sleep(0.1)

                    audioStream.stop_stream()
                    audioStream.close()
            except PlayBytesStopped:
                self.logDebug('Playing bytes stopped')
            except Exception as e:
                self.logError(f'Playing wav failed with error: {e}')
            finally:
                self._stopPlayingFlag.clear()
                self._playing = False

        # Session id support is not Hermes protocol official
        self.MqttManager.publish(
            topic=constants.TOPIC_PLAY_BYTES_FINISHED.format(siteId),
            payload={
                'id': requestId,
                'sessionId': sessionId
            })

    def stopPlaying(self):
        self._stopPlayingFlag.set()

    @property
    def isPlaying(self) -> bool:
        return self._playing
Example #28
def vad_split(audio_frames, audio_format=DEFAULT_FORMAT, num_padding_frames=10, threshold=0.5, aggressiveness=3):
    """
    Credit: https://github.com/mozilla/DSAlign

    Splits audio into segments using Voice Activity Detection.

    Parameters
    ----------
    audio_frames : list
        List of audio frames
    audio_format : tuple
        Tuple containing the audio sample rate, channels & width
    num_padding_frames : int
        Number of frames to pad
    threshold : float
        Minimum threshold
    aggressiveness : int
        Aggressivess of VAD split

    Yields
    -------
    Audio segments (tuples containing number of frames, start time & end time))
    """

    sample_rate, channels, width = audio_format
    if channels != 1:
        raise ValueError("VAD-splitting requires mono samples")
    if width != 2:
        raise ValueError("VAD-splitting requires 16 bit samples")
    if sample_rate not in [8000, 16000, 32000, 48000]:
        raise ValueError("VAD-splitting only supported for sample rates 8000, 16000, 32000, or 48000")
    if aggressiveness not in [0, 1, 2, 3]:
        raise ValueError("VAD-splitting aggressiveness mode has to be one of 0, 1, 2, or 3")
    ring_buffer = collections.deque(maxlen=num_padding_frames)
    triggered = False
    vad = Vad(int(aggressiveness))
    voiced_frames = []
    frame_duration_ms = 0
    frame_index = 0
    for frame_index, frame in enumerate(audio_frames):
        frame_duration_ms = get_pcm_duration(len(frame), audio_format) * 1000
        if int(frame_duration_ms) not in [10, 20, 30]:
            raise ValueError("VAD-splitting only supported for frame durations 10, 20, or 30 ms")
        is_speech = vad.is_speech(frame, sample_rate)
        if not triggered:
            ring_buffer.append((frame, is_speech))
            num_voiced = len([f for f, speech in ring_buffer if speech])
            if num_voiced > threshold * ring_buffer.maxlen:
                triggered = True
                for f, s in ring_buffer:
                    voiced_frames.append(f)
                ring_buffer.clear()
        else:
            voiced_frames.append(frame)
            ring_buffer.append((frame, is_speech))
            num_unvoiced = len([f for f, speech in ring_buffer if not speech])
            if num_unvoiced > threshold * ring_buffer.maxlen:
                triggered = False
                yield b"".join(voiced_frames), frame_duration_ms * max(
                    0, frame_index - len(voiced_frames)
                ), frame_duration_ms * frame_index
                ring_buffer.clear()
                voiced_frames = []
    if len(voiced_frames) > 0:
        yield b"".join(voiced_frames), frame_duration_ms * (frame_index - len(voiced_frames)), frame_duration_ms * (
            frame_index + 1
        )
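A minimal usage sketch (not part of the original example), assuming vad_split() and DSAlign's get_pcm_duration()/DEFAULT_FORMAT helpers are already in scope; 'speech.wav' is a placeholder path to a mono, 16-bit, 16 kHz recording, and read_frames() is a hypothetical helper that chops the PCM data into 30 ms frames:

import wave


def read_frames(path, frame_ms=30):
    # Read a PCM wav file and split it into fixed-size frames; any trailing
    # partial frame is dropped so every frame covers exactly frame_ms.
    with wave.open(path, 'rb') as wav:
        rate, channels, width = wav.getframerate(), wav.getnchannels(), wav.getsampwidth()
        frame_size = int(rate * frame_ms / 1000) * channels * width
        data = wav.readframes(wav.getnframes())
    frames = [data[i:i + frame_size]
              for i in range(0, len(data) - frame_size + 1, frame_size)]
    return frames, (rate, channels, width)


frames, audio_format = read_frames('speech.wav')
for segment, start_ms, end_ms in vad_split(frames, audio_format=audio_format):
    print(f'voiced segment: {start_ms:.0f} ms -> {end_ms:.0f} ms ({len(segment)} bytes)')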
Example No. 29
    def __init__(self):
        # Pre-allocated buffer of 16-bit samples for one block of audio input
        self.input_buf = np.empty((BLOCK_SIZE, 1), dtype=np.int16)
        self.vad = Vad(2)
        # Ring buffer of recent VAD decisions, capped at LAG_TIME entries
        self.vad_q = deque([False], LAG_TIME)
        # Output audio, cycled endlessly from the data returned by open_wav()
        self.output = cycle(open_wav())
Example No. 30
class AudioManager(Manager):

    SAMPLERATE = 16000
    FRAMES_PER_BUFFER = 320

    LAST_USER_SPEECH = 'var/cache/lastUserSpeech_{}_{}.wav'
    SECOND_LAST_USER_SPEECH = 'var/cache/secondLastUserSpeech_{}_{}.wav'

    def __init__(self):
        super().__init__()

        self._stopPlayingFlag: Optional[AliceEvent] = None
        self._playing = False
        self._waves: Dict[str, wave.Wave_write] = dict()
        self._audioInputStream = None

        if self.ConfigManager.getAliceConfigByName('disableSoundAndMic'):
            return

        self._vad = Vad(2)
        self._audioInput = None
        self._audioOutput = None

    def onStart(self):
        super().onStart()

        if not self.ConfigManager.getAliceConfigByName('inputDevice'):
            self.logWarning(
                'Input device not set in config, trying to find default device'
            )
            try:
                self._audioInput = sd.query_devices(kind='input')['name']
            except Exception:
                self.logFatal('Audio input not found, cannot continue')
                return
            self.ConfigManager.updateAliceConfiguration(key='inputDevice',
                                                        value=self._audioInput)
        else:
            self._audioInput = self.ConfigManager.getAliceConfigByName(
                'inputDevice')

        if not self.ConfigManager.getAliceConfigByName('outputDevice'):
            self.logWarning(
                'Output device not set in config, trying to find default device'
            )
            try:
                self._audioOutput = sd.query_devices(kind='output')['name']
            except Exception:
                self.logFatal('Audio output not found, cannot continue')
                return
            self.ConfigManager.updateAliceConfiguration(
                key='outputDevice', value=self._audioOutput)
        else:
            self._audioOutput = self.ConfigManager.getAliceConfigByName(
                'outputDevice')

        self.setDefaults()

        self._stopPlayingFlag = self.ThreadManager.newEvent('stopPlaying')
        self.MqttManager.mqttClient.subscribe(
            constants.TOPIC_AUDIO_FRAME.format(
                self.ConfigManager.getAliceConfigByName('uuid')))

        if not self.ConfigManager.getAliceConfigByName('disableSoundAndMic'):
            self.ThreadManager.newThread(name='audioPublisher',
                                         target=self.publishAudio)

    def setDefaults(self):
        self.logInfo(f'Using **{self._audioInput}** for audio input')
        self.logInfo(f'Using **{self._audioOutput}** for audio output')

        sd.default.device = self._audioInput, self._audioOutput

    def onStop(self):
        super().onStop()
        if self._audioInputStream:
            self._audioInputStream.stop(ignore_errors=True)
            self._audioInputStream.close(ignore_errors=True)
        self.MqttManager.mqttClient.unsubscribe(
            constants.TOPIC_AUDIO_FRAME.format(
                self.ConfigManager.getAliceConfigByName('uuid')))

    def onStartListening(self, session: DialogSession):
        if not self.ConfigManager.getAliceConfigByName(
                'recordAudioAfterWakeword'):
            return

        path = Path(self.LAST_USER_SPEECH.format(session.user, session.siteId))

        if path.exists():
            path.rename(
                Path(
                    self.SECOND_LAST_USER_SPEECH.format(
                        session.user, session.siteId)))

        waveFile = wave.open(str(path), 'wb')
        waveFile.setsampwidth(2)
        waveFile.setframerate(self.AudioServer.SAMPLERATE)
        waveFile.setnchannels(1)
        self._waves[session.siteId] = waveFile

    def onCaptured(self, session: DialogSession):
        wav = self._waves.pop(session.siteId, None)
        if not wav:
            return
        wav.close()

    def recordFrame(self, siteId: str, frame: bytes):
        if siteId not in self._waves:
            return

        self._waves[siteId].writeframes(frame)

    def publishAudio(self):
        self.logInfo('Starting audio publisher')
        self._audioInputStream = sd.RawInputStream(
            dtype='int16',
            channels=1,
            samplerate=self.SAMPLERATE,
            blocksize=self.FRAMES_PER_BUFFER,
        )
        self._audioInputStream.start()

        speech = False
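        # 'silence' starts at roughly one second worth of frames (50 buffers of
        # 320 samples at 16 kHz) and counts down after speech stops before VAD
        # "down" is published; 'minSpeechFrames' (~a third of a second) is the
        # minimum run of voiced frames required before VAD "up" is published.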
        silence = self.SAMPLERATE / self.FRAMES_PER_BUFFER
        speechFrames = 0
        minSpeechFrames = round(silence / 3)

        while True:
            if self.ProjectAlice.shuttingDown:
                break

            try:
                frames = self._audioInputStream.read(
                    frames=self.FRAMES_PER_BUFFER)[0]

                if self._vad.is_speech(frames, self.SAMPLERATE):
                    if not speech and speechFrames < minSpeechFrames:
                        speechFrames += 1
                    elif speechFrames >= minSpeechFrames:
                        speech = True
                        self.MqttManager.publish(
                            topic=constants.TOPIC_VAD_UP.format(
                                self.ConfigManager.getAliceConfigByName(
                                    'uuid')),
                            payload={
                                'siteId':
                                self.ConfigManager.getAliceConfigByName('uuid')
                            })
                        silence = self.SAMPLERATE / self.FRAMES_PER_BUFFER
                        speechFrames = 0
                else:
                    if speech:
                        if silence > 0:
                            silence -= 1
                        else:
                            speech = False
                            self.MqttManager.publish(
                                topic=constants.TOPIC_VAD_DOWN.format(
                                    self.ConfigManager.getAliceConfigByName(
                                        'uuid')),
                                payload={
                                    'siteId':
                                    self.ConfigManager.getAliceConfigByName(
                                        'uuid')
                                })
                    else:
                        speechFrames = 0

                self.publishAudioFrames(frames)
            except Exception as e:
                self.logDebug(f'Error publishing frame: {e}')

    def publishAudioFrames(self, frames: bytes):
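        # Wrap the raw PCM frames in a minimal mono, 16-bit WAV container
        # before publishing the resulting bytes on this site's audio frame topic.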
        with io.BytesIO() as buffer:
            with wave.open(buffer, 'wb') as wav:
                wav.setnchannels(1)
                wav.setsampwidth(2)
                wav.setframerate(self.SAMPLERATE)
                wav.writeframes(frames)

            audioFrames = buffer.getvalue()
            self.MqttManager.publish(topic=constants.TOPIC_AUDIO_FRAME.format(
                self.ConfigManager.getAliceConfigByName('uuid')),
                                     payload=bytearray(audioFrames))

    def onPlayBytes(self,
                    requestId: str,
                    payload: bytearray,
                    siteId: str,
                    sessionId: str = None):
        if siteId != self.ConfigManager.getAliceConfigByName(
                'uuid') or self.ConfigManager.getAliceConfigByName(
                    'disableSoundAndMic'):
            return

        self._playing = True
        with io.BytesIO(payload) as buffer:
            try:
                with wave.open(buffer, 'rb') as wav:
                    channels = wav.getnchannels()
                    framerate = wav.getframerate()

                    def streamCallback(outdata, frameCount, _timeInfo,
                                       _status):
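                        # Copy wav data into the output buffer; once the wav is
                        # exhausted, zero-pad the remaining bytes and stop the stream.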
                        data = wav.readframes(frameCount)
                        if len(data) < len(outdata):
                            outdata[:len(data)] = data
                            outdata[len(data):] = b'\x00' * (len(outdata) -
                                                             len(data))
                            raise sd.CallbackStop
                        else:
                            outdata[:] = data

                    stream = sd.RawOutputStream(dtype='int16',
                                                channels=channels,
                                                samplerate=framerate,
                                                callback=streamCallback)

                    self.logDebug(
                        f'Playing wav stream using **{self._audioOutput}** audio output from site id **{self.DeviceManager.siteIdToDeviceName(siteId)}** (channels: {channels}, rate: {framerate})'
                    )
                    stream.start()
                    while stream.active:
                        if self._stopPlayingFlag.is_set():
                            stream.stop()
                            stream.close()

                            if sessionId:
                                self.MqttManager.publish(
                                    topic=constants.TOPIC_TTS_FINISHED,
                                    payload={
                                        'id': requestId,
                                        'sessionId': sessionId,
                                        'siteId': siteId
                                    })
                                self.DialogManager.onEndSession(
                                    self.DialogManager.getSession(sessionId))

                            raise PlayBytesStopped
                        time.sleep(0.1)

                    stream.stop()
                    stream.close()
            except PlayBytesStopped:
                self.logDebug('Playing bytes stopped')
            except Exception as e:
                self.logError(f'Playing wav failed with error: {e}')
            finally:
                self._stopPlayingFlag.clear()
                self._playing = False

        # Session id support is not part of the official Hermes protocol
        self.MqttManager.publish(
            topic=constants.TOPIC_PLAY_BYTES_FINISHED.format(siteId),
            payload={
                'id': requestId,
                'sessionId': sessionId
            })

    def stopPlaying(self):
        self._stopPlayingFlag.set()

    def updateAudioDevices(self):
        self._audioInput = self.ConfigManager.getAliceConfigByName(
            'inputDevice')
        self._audioOutput = self.ConfigManager.getAliceConfigByName(
            'outputDevice')
        self.setDefaults()

    @property
    def isPlaying(self) -> bool:
        return self._playing