Example #1
 def test_valid_rate_and_frame_length(self):
     self.assertTrue(webrtcvad.valid_rate_and_frame_length(8000, 160))
     self.assertTrue(webrtcvad.valid_rate_and_frame_length(16000, 160))
     self.assertFalse(webrtcvad.valid_rate_and_frame_length(32000, 160))
     self.assertRaises(
         ValueError,
         webrtcvad.valid_rate_and_frame_length, 2 ** 35, 10)
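For reference, the VAD only accepts 10, 20, or 30 ms frames at 8000, 16000, 32000, or 48000 Hz. A minimal sketch (not taken from any project above) that enumerates the valid combinations:

import webrtcvad

# A frame length is counted in 16-bit samples: rate * duration_ms / 1000.
for rate in (8000, 16000, 32000, 48000):
    for frame_ms in (10, 20, 30):
        frame_length = rate * frame_ms // 1000
        assert webrtcvad.valid_rate_and_frame_length(rate, frame_length)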
Example #2
def hasSpeech(wav_bytes, sample_rate, num_channels):
    # Use webrtc's VAD with a low level of aggressiveness (1 on the 0-3 scale).
    mono_channel_bytes = wav_bytes
    if num_channels == 2:
        # just take the left channel for simplicity purposes.
        # We're just trying to get a quick sanity check, no need
        # to mix the two channels.
        mono_channel_bytes = b"".join([wav_bytes[i:i+2] for i in range(0, len(wav_bytes), 4)])
    vad = webrtcvad.Vad(1)
    frame_duration = 10  # ms
    bytes_per_sample = 2 # assuming 16-bit PCM.
    samples_per_vaded_chunk = (sample_rate * frame_duration / 1000)
    bytes_per_vaded_chunk = int(samples_per_vaded_chunk*bytes_per_sample)
    num_speech_frames = 0
    num_non_speech_frames = 0
    for i in range(0, len(mono_channel_bytes)-bytes_per_vaded_chunk, bytes_per_vaded_chunk):
        chunk_to_vad = mono_channel_bytes[i:i+bytes_per_vaded_chunk]
        vad_frame_length = int(len(chunk_to_vad) / bytes_per_sample)
        if webrtcvad.valid_rate_and_frame_length(sample_rate, vad_frame_length) and vad.is_speech(chunk_to_vad, sample_rate):
            num_speech_frames += 1
        else:
            num_non_speech_frames += 1
    total_frames = num_speech_frames + num_non_speech_frames
    if total_frames == 0:
        # Audio shorter than one frame: nothing was classified.
        return False
    emptyAudio = (num_speech_frames == 0 or num_non_speech_frames == 0)
    percentage_non_speech = float(num_non_speech_frames) / float(total_frames)
    print("percentage non-speech:", percentage_non_speech,
          "num_speech_frames", num_speech_frames,
          "num_non_speech_frames", num_non_speech_frames)
    return not emptyAudio and percentage_non_speech < 0.5
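A minimal usage sketch for hasSpeech, assuming a 16-bit PCM WAV input (the file name is a placeholder, not from the source):

import wave

with wave.open("speech.wav", "rb") as wf:  # hypothetical input file
    assert wf.getsampwidth() == 2  # hasSpeech assumes 16-bit samples
    wav_bytes = wf.readframes(wf.getnframes())
    print(hasSpeech(wav_bytes, wf.getframerate(), wf.getnchannels()))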
Example #3
 def test_process_zeroes(self):
     frame_len = 160
     self.assertTrue(
         webrtcvad.valid_rate_and_frame_length(8000, frame_len))
     sample = b'\x00' * frame_len * 2
     vad = webrtcvad.Vad()
     self.assertFalse(vad.is_speech(sample, 16000))
Example #4
def is_valid_chunk(chunks, sample_rate11):
    aggressiveness = 3
    vad = webrtcvad.Vad(aggressiveness)
    yes_count = 0
    no_count = 0
    frame_dur_ms = 20
    window_chunks = int(frame_dur_ms * sample_rate11 / 1000)
    curr_ind = 0.0
    has_more = (len(chunks) > window_chunks)
    from_time = 0.0
    to_time = frame_dur_ms / 1000
    while has_more:
        vad_input = chunks[int(curr_ind):int(curr_ind + window_chunks)]
        assert webrtcvad.valid_rate_and_frame_length(sample_rate11,
                                                     len(vad_input))
        try:
            is_speech = vad.is_speech(vad_input, sample_rate11)
        except Exception as e:
            print(e)
            is_speech = False
        if is_speech:
            yes_count += 1
        else:
            no_count += 1
        #print('{} - {} : {}'.format(from_time, to_time, is_speech))
        curr_ind += window_chunks
        from_time += frame_dur_ms / 1000
        to_time += frame_dur_ms / 1000
        has_more = (len(chunks) > curr_ind)
    print('{} : {}'.format(yes_count, no_count))
    total = yes_count + no_count
    if total == 0:
        # Input shorter than one window: nothing was classified.
        return False
    return (yes_count * 100 / total) > 80
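A usage sketch for is_valid_chunk, assuming chunks is a raw buffer of 16-bit mono PCM bytes (the file name is a placeholder). Note that window_chunks counts samples, so slicing a byte buffer yields 10 ms rather than 20 ms frames; both lengths are valid for the VAD:

import wave

with wave.open("clip.wav", "rb") as wf:  # hypothetical input file
    chunks = wf.readframes(wf.getnframes())
    print(is_valid_chunk(chunks, wf.getframerate()))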
Example #5
def has_speech(
        wav_bytes,
        sample_rate_hz,
        num_channels,
        non_speech_threshold_fraction=DEFAULT_NON_SPEECH_THRESHOLD_FRACTION,
        verbose=False):
    """
    Returns true if at least (1 - non_speech_threshold_fraction) percentage of frames contain voice activity.
    Note: webrtc VAD does not currently support 44.1MHz, so we have no way of checking those files for empty audio.
    """

    # Use webrtc's VAD with a low level of aggressiveness (1 on the 0-3 scale).
    mono_channel_bytes = wav_bytes

    if num_channels == 2:
        # just take the left channel for simplicity purposes.
        # We're just trying to get a quick sanity check, no need
        # to mix the two channels.
        mono_channel_bytes = b"".join(
            [wav_bytes[i:i + 2] for i in range(0, len(wav_bytes), 4)])

    vad = webrtcvad.Vad(1)
    frame_duration = 10  # ms
    bytes_per_sample = 2  # assuming 16-bit PCM.
    samples_per_vaded_chunk = (sample_rate_hz * frame_duration / 1000)
    bytes_per_vaded_chunk = int(samples_per_vaded_chunk * bytes_per_sample)
    num_speech_frames = 0
    num_non_speech_frames = 0

    for i in range(0,
                   len(mono_channel_bytes) - bytes_per_vaded_chunk,
                   bytes_per_vaded_chunk):
        chunk_to_vad = mono_channel_bytes[i:i + bytes_per_vaded_chunk]
        vad_frame_length = int(len(chunk_to_vad) / bytes_per_sample)
        if (webrtcvad.valid_rate_and_frame_length(sample_rate_hz,
                                                  vad_frame_length)
                and vad.is_speech(chunk_to_vad, sample_rate_hz)):
            num_speech_frames += 1
        else:
            num_non_speech_frames += 1

    has_frames = (num_speech_frames + num_non_speech_frames > 0)
    emptyAudio = (num_speech_frames == 0 or num_non_speech_frames == 0)

    if has_frames:
        percentage_non_speech = (
            float(num_non_speech_frames) /
            float(num_non_speech_frames + num_speech_frames))
    else:
        # If there are no frames, return a default (positive > 0.5) number.
        percentage_non_speech = NO_FRAMES_VALUE

    if verbose:
        print("percentage non-speech:", percentage_non_speech,
              "num_speech_frames", num_speech_frames, "num_non_speech_frames",
              num_non_speech_frames)

    return not emptyAudio and percentage_non_speech < non_speech_threshold_fraction
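has_speech references two module-level constants that this excerpt does not define; plausible stand-ins (both values are assumptions, not from the source):

# Assumed values for the constants referenced by has_speech above.
DEFAULT_NON_SPEECH_THRESHOLD_FRACTION = 0.5  # assumption: mirrors Example #2's fixed 0.5
NO_FRAMES_VALUE = 1.0  # assumption: any value > 0.5 matches the "no frames" comment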
Example #6
def get_voice_segments(frames, frame_duration_ms, padding_duration_ms,
                       sample_rate, vad):
    num_padding_frames = int(padding_duration_ms / frame_duration_ms)
    # We use a deque for our sliding window/ring buffer.
    ring_buffer = collections.deque(maxlen=num_padding_frames)
    # We have two states: TRIGGERED and NOTTRIGGERED. We start in the
    # NOTTRIGGERED state.
    triggered = False

    voiced_segments = []
    voiced_frames = []
    for frame in frames:
        # len(frame.bytes) is a byte count; the VAD counts 16-bit samples.
        assert webrtcvad.valid_rate_and_frame_length(
            sample_rate, len(frame.bytes) // 2)
        is_speech = vad.is_speech(frame.bytes, sample_rate)

        ring_buffer.append((frame, is_speech))

        if not triggered:
            num_voiced = len([f for f, speech in ring_buffer if speech])
            # If we're NOTTRIGGERED and more than 90% of the frames in
            # the ring buffer are voiced frames, then enter the
            # TRIGGERED state.
            if num_voiced > 0.9 * ring_buffer.maxlen:
                triggered = True
                # We want to yield all the audio we see from now until
                # we are NOTTRIGGERED, but we have to start with the
                # audio that's already in the ring buffer.
                for f, s in ring_buffer:
                    voiced_frames.append(f)
                ring_buffer.clear()
        else:
            # We're in the TRIGGERED state, so collect the audio data
            # and add it to the ring buffer.
            voiced_frames.append(frame)

            num_unvoiced = len([f for f, speech in ring_buffer if not speech])
            # If more than 90% of the frames in the ring buffer are
            # unvoiced, then enter NOTTRIGGERED and yield whatever
            # audio we've collected.
            if num_unvoiced > 0.9 * ring_buffer.maxlen:
                triggered = False
                ring_buffer.clear()
                voiced_segments.append(
                    [voiced_frames[0].timestamp, voiced_frames[-1].timestamp])
                voiced_frames = []
    return voiced_segments
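get_voice_segments consumes frame objects carrying bytes and timestamp attributes. A sketch of a compatible frame source (the Frame type and generator are assumptions modeled on the usual py-webrtcvad example code, not part of this project):

import collections
import webrtcvad

Frame = collections.namedtuple("Frame", ["bytes", "timestamp", "duration"])

def frame_generator(frame_duration_ms, audio, sample_rate):
    n = int(sample_rate * (frame_duration_ms / 1000.0) * 2)  # 2 bytes/sample
    duration = frame_duration_ms / 1000.0
    offset, timestamp = 0, 0.0
    while offset + n <= len(audio):
        yield Frame(audio[offset:offset + n], timestamp, duration)
        timestamp += duration
        offset += n

pcm = b"\x00" * 32000  # one second of 16-bit silence at 16 kHz
frames = frame_generator(30, pcm, 16000)
print(get_voice_segments(frames, 30, 300, 16000, webrtcvad.Vad(2)))  # -> []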
Example #7
 def test_process_file(self):
     with open('test-audio.raw', 'rb') as f:
         data = f.read()
     frame_ms = 30
     n = int(8000 * 2 * frame_ms / 1000.0)  # bytes per 30 ms frame at 8 kHz
     frame_len = int(n / 2)
     self.assertTrue(webrtcvad.valid_rate_and_frame_length(8000, frame_len))
     chunks = list(data[pos:pos + n] for pos in range(0, len(data), n))
     if len(chunks[-1]) != n:
         chunks = chunks[:-1]
     expecteds = [
         '011110111111111111111111111100', '011110111111111111111111111100',
         '000000111111111111111111110000', '000000111111111111111100000000'
     ]
     for mode in (0, 1, 2, 3):
         vad = webrtcvad.Vad(mode)
         result = ''
         for chunk in chunks:
             voiced = vad.is_speech(chunk, 8000)
             result += '1' if voiced else '0'
         self.assertEqual(expecteds[mode], result)
Example #8
 def test_valid_rate_and_frame_length(self):
     self.assertTrue(webrtcvad.valid_rate_and_frame_length(8000, 160))
     self.assertTrue(webrtcvad.valid_rate_and_frame_length(16000, 160))
     self.assertFalse(webrtcvad.valid_rate_and_frame_length(32000, 160))
     self.assertRaises((ValueError, OverflowError),
                       webrtcvad.valid_rate_and_frame_length, 2**35, 10)
Example #9
    def segmenter(
        self,
        q: queue.Queue,
        block_size: int,
        sample_rate: int,
        padding_ms: int = 300,
        ratio: float = 0.75,
    ):
        """

        :param q:
        :param block_size:
        :param sample_rate:
        :param padding_ms:
            Number of milliseconds desired in padding.
            Effective padding duration = (1 - ratio) * padding_ms ?
            TODO: check
        :param ratio:
            Minimum fraction of padding_ms that has to be voiced/non-voice to activate.
        :return:
        """
        frame_duration_ms = 1000 * block_size / sample_rate
        num_padding_frames = int(padding_ms / frame_duration_ms)
        ring_buffer = collections.deque(maxlen=num_padding_frames)
        triggered = False

        while True:
            try:
                data = q.get(timeout=5)
                # data = q.get_nowait()
            except queue.Empty:
                logger.warning('Buffer is empty: increase buffersize?')
                time.sleep(1)
                continue
            frame = data
            if len(frame) < 640:
                # A short block (under 640 bytes, i.e. 20 ms of 16-bit audio
                # at 16 kHz) ends the stream.
                return

            assert webrtcvad.valid_rate_and_frame_length(
                sample_rate, int(len(frame) / 2)
            ), "WebRTC VAD only supports frames that are 10, 20, or 30 ms long"

            is_speech = self.vad.is_speech(frame, sample_rate)

            if not triggered:
                ring_buffer.append((frame, is_speech))
                num_voiced = len(
                    [f for f, is_speech in ring_buffer if is_speech])
                # TODO: replace with sum?
                if num_voiced > ratio * ring_buffer.maxlen:
                    triggered = True
                    for f, s in ring_buffer:
                        yield f
                    ring_buffer.clear()

            else:
                yield frame
                ring_buffer.append((frame, is_speech))
                num_unvoiced = len(
                    [f for f, is_speech in ring_buffer if not is_speech])
                # TODO: replace with sum?
                if num_unvoiced > ratio * ring_buffer.maxlen:
                    triggered = False
                    yield None
                    ring_buffer.clear()
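The trigger arithmetic in segmenter is easiest to see with concrete numbers. A small sketch, assuming 16 kHz audio delivered in 320-sample blocks with the defaults padding_ms=300 and ratio=0.75:

# Worked example of segmenter's ring-buffer sizing (the values are assumptions).
sample_rate = 16000
block_size = 320
frame_duration_ms = 1000 * block_size / sample_rate  # 20.0 ms per frame
num_padding_frames = int(300 / frame_duration_ms)    # deque holds 15 frames
print(0.75 * num_padding_frames)  # 11.25: 12 of the last 15 frames flip state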