Ejemplo n.º 1
0
class Listener:
    """Capture microphone audio blocks and expose them to consumers.

    Audio blocks are delivered by the input stream callback into the
    class-level queue ``q`` (shared by every ``Listener`` instance).
    ``record()`` drains that queue until the voice activity detector has
    reported no speech for ``speech_timeout`` seconds, while the background
    loop started by ``start()`` forwards blocks to ``on_noise``.

    NOTE(review): ``record()`` and the background loop both consume from
    the same queue, so each block is seen by whichever of them calls
    ``get()`` first, not by both.
    """

    # Shared by all instances; filled by _device_callback.
    q = Queue()

    def __init__(self, samplerate, on_noise=None):
        """Create a listener.

        Args:
            samplerate: Sample rate passed to the input stream and to the
                voice activity detector.
            on_noise: Optional callable invoked with every raw audio block
                read by the background loop.
        """
        self.samplerate = samplerate
        self.speech_timeout = SPEECH_TIMEOUT
        self.on_noise = on_noise
        # Used as a "running" flag: held while the background loop runs.
        self.listening = Lock()
        self.vad = Vad()
        self.vad.set_mode(3)  # very restrictive filtering

    @staticmethod
    def _device_callback(indata, frames, time, status):
        """Queue one audio block.

        This is called (from a separate thread) for each audio block.
        Only ``indata`` is used; it is copied into an immutable ``bytes``
        object before being queued.
        """
        Listener.q.put(bytes(indata))

    def record(self):
        """Collect audio until no speech was detected for ``speech_timeout``.

        Every block detected as speech pushes the deadline
        ``speech_timeout`` seconds into the future. Recording also stops if
        no block arrives before the deadline, so the call cannot block
        forever when the stream is not delivering data.

        Returns:
            bytes: All blocks read during the call, concatenated in order.
        """
        # Local import: only this method needs the exception type.
        from queue import Empty

        chunks = []
        # A monotonic clock keeps the deadline immune to wall-clock changes.
        deadline = time.monotonic() + self.speech_timeout

        # record until no sound is detected or time is over
        while True:
            remaining = deadline - time.monotonic()
            if remaining < 0:
                break
            try:
                data = Listener.q.get(timeout=remaining)
            except Empty:
                break
            chunks.append(data)
            if self.vad.is_speech(data, self.samplerate):
                deadline = time.monotonic() + self.speech_timeout

        # Join once instead of repeatedly re-allocating a growing bytes object.
        return b"".join(chunks)

    def _start(self):
        """Run the capture loop until ``stop()`` releases the lock.

        Opens a mono, 16-bit raw input stream whose block size is 0.03
        seconds of samples and forwards each queued block to ``on_noise``
        when one was supplied. Because the lock is acquired blocking, a
        second concurrent call waits here until the first loop is stopped.
        """
        self.listening.acquire()
        with sd.RawInputStream(
            samplerate=self.samplerate,
            channels=1,
            callback=Listener._device_callback,
            dtype='int16',
            blocksize=int(self.samplerate * 0.03),
        ):
            while self.listening.locked():
                data = Listener.q.get()
                if self.on_noise is not None:
                    self.on_noise(data)

    def start(self):
        """Start the capture loop in a new thread."""
        Thread(target=self._start).start()

    def stop(self):
        """Ask the capture loop to finish; a no-op when it is not running."""
        if self.listening.locked():
            self.listening.release()
Ejemplo n.º 2
0
def get_voice_events(filename, frame_dur, aggressiveness):
    """Evaluate the file for voice events.

    The clip is read through ``downsample`` at 16000 Hz, split into frames
    by ``frame_generator`` and every frame is classified by the voice
    activity detector.

    :param str filename: Path of the audio file to analyse.
    :param int frame_dur: Frame duration; must be 10, 20 or 30.
    :param int aggressiveness: Detector mode; must be 0, 1, 2 or 3.
    :returns: One ``(frame_dur * n, is_speech)`` tuple per frame, where
        ``n`` is the zero-based frame index.
    :rtype: list
    :raises ValueError: If ``frame_dur`` or ``aggressiveness`` is not one
        of the accepted values.

    """
    # Explicit checks instead of ``assert`` so validation survives ``-O``.
    if frame_dur not in (10, 20, 30):
        raise ValueError(
            "frame_dur must be 10, 20 or 30, got {!r}".format(frame_dur)
        )
    if aggressiveness not in range(4):
        raise ValueError(
            "aggressiveness must be 0, 1, 2 or 3, got {!r}".format(aggressiveness)
        )

    vad = Vad()
    # Use the validated arguments rather than a global ``args`` namespace.
    vad.set_mode(aggressiveness)
    sample_rate = 16000

    clip = downsample(filename, sample_rate).read()
    return [
        (frame_dur * n, vad.is_speech(frame.bytes, sample_rate))
        for n, frame in enumerate(frame_generator(clip, frame_dur, sample_rate))
    ]
Ejemplo n.º 3
0
class VAD:
    """Voice Activity Detector.

    A voice activity detector decides whether an audio frame contains human
    speech. It is typically used to switch off parts of a speech pipeline
    during non-speech sections, saving computation and network bandwidth.

    Notes:
        The underlying algorithm comes from the WebRTC project and was
        originally designed for 8 kHz, 16-bit PCM, mono audio.

        Sampling rates of 8000 Hz, 16000 Hz, 32000 Hz and 48000 Hz are
        accepted, but all processing is done at 8000 Hz internally; input
        at a higher rate is down-sampled first.
    """

    def __init__(self, rate: int = 8000, mode: int = 0):
        """Build a detector with the given configuration.

        Args:
            rate (int): Audio sample rate, in Hz.
            mode (int): Operational mode, must be in [0, 3].
        """
        self.__vad = Vad(mode=mode)
        self.__mode = mode
        self.__rate = rate

    @property
    def mode(self) -> int:
        """Operational mode of the detector, as an integer."""
        return self.__mode

    @mode.setter
    def mode(self, mode: int):
        """Change the operational mode of the detector.

        A higher (more aggressive) mode makes the detector more restrictive
        in reporting speech: a positive result is more likely to really be
        speech, at the price of a higher missed-detection rate.

        Valid modes are:
            - 0 ("quality")
            - 1 ("low bitrate")
            - 2 ("aggressive")
            - 3 ("very aggressive")

        The default mode is 0.

        Args:
            mode (int): Operational mode, must be in [0, 3].
        """
        self.__vad.set_mode(mode)
        self.__mode = mode

    @property
    def sample_rate(self) -> int:
        """Sampling rate in Hz."""
        return self.__rate

    @profile
    def process(self, data: np.ndarray) -> bool:
        """Report whether the given audio frame contains human speech.

        The input is averaged along axis 0 as float32, converted to 16-bit
        integers and interleaved by ``Converter``, then handed to the
        underlying detector together with the configured sample rate.

        Args:
            data (np.ndarray): Audio samples, either signed 16-bit integers
                or floating point values in the signed 16-bit range.

        Returns:
            The detector's result: truthy when speech was detected, falsy
            otherwise.

        Raises:
            RuntimeError: If the detector returns a negative value.

        Notes:
            Only frames with a length of 10, 20 or 30 ms are supported; at
            8 kHz that is 80, 160 or 240 samples per channel.

            NOTE(review): the mean is taken over axis 0. For input laid out
            as [samples, channels] that averages over samples rather than
            down-mixing channels - confirm the expected layout with callers.

            NOTE(review): ``size`` is called as a method on the interleaved
            buffer; on a plain ``numpy.ndarray`` it is an attribute -
            confirm what ``Converter.interleave`` returns.
        """
        averaged = np.mean(a=data, axis=0, dtype=np.float32)
        frame = Converter.interleave(Converter.fromFloatToInt16(averaged))
        verdict = self.__vad.is_speech(
            buf=frame,
            sample_rate=self.sample_rate,
            length=frame.size(),
        )
        if verdict < 0:
            raise RuntimeError(
                "Invalid frame length. Only frames with a length of 10, 20 or 30 ms are supported."
            )
        return verdict