def test_valid_rate_and_frame_length(self):
    self.assertTrue(webrtcvad.valid_rate_and_frame_length(8000, 160))
    self.assertTrue(webrtcvad.valid_rate_and_frame_length(16000, 160))
    self.assertFalse(webrtcvad.valid_rate_and_frame_length(32000, 160))
    self.assertRaises(
        ValueError, webrtcvad.valid_rate_and_frame_length, 2 ** 35, 10)
import webrtcvad


def hasSpeech(wav_bytes, sample_rate, num_channels):
    # Use webrtc's VAD with a low level of aggressiveness.
    mono_channel_bytes = wav_bytes
    if num_channels == 2:
        # Just take the left channel for simplicity. We're only doing a
        # quick sanity check, so there is no need to mix the two channels.
        mono_channel_bytes = b"".join(
            [wav_bytes[i:i + 2] for i in range(0, len(wav_bytes), 4)])

    vad = webrtcvad.Vad(1)
    frame_duration = 10  # ms
    bytes_per_sample = 2  # Assuming 16-bit PCM.
    samples_per_vaded_chunk = sample_rate * frame_duration / 1000
    bytes_per_vaded_chunk = int(samples_per_vaded_chunk * bytes_per_sample)

    num_speech_frames = 0
    num_non_speech_frames = 0
    for i in range(0, len(mono_channel_bytes) - bytes_per_vaded_chunk,
                   bytes_per_vaded_chunk):
        chunk_to_vad = mono_channel_bytes[i:i + bytes_per_vaded_chunk]
        vad_frame_length = int(len(chunk_to_vad) / bytes_per_sample)
        if (webrtcvad.valid_rate_and_frame_length(sample_rate,
                                                  vad_frame_length)
                and vad.is_speech(chunk_to_vad, sample_rate)):
            num_speech_frames += 1
        else:
            num_non_speech_frames += 1

    total_frames = num_speech_frames + num_non_speech_frames
    if total_frames == 0:
        # Audio shorter than a single frame: nothing to judge.
        return False
    empty_audio = (num_speech_frames == 0 or
                   (num_speech_frames and num_non_speech_frames == 0))
    percentage_non_speech = float(num_non_speech_frames) / float(total_frames)
    print("percentage non-speech:", percentage_non_speech,
          "num_speech_frames", num_speech_frames,
          "num_non_speech_frames", num_non_speech_frames)
    return not empty_audio and percentage_non_speech < 0.5
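# A quick way to exercise hasSpeech() above, using only the standard-library
# wave module. This harness is illustrative, not part of the original code:
# the file name is a placeholder, and the input must be 16-bit PCM at a rate
# the WebRTC VAD supports (8, 16, 32, or 48 kHz).

import wave

if __name__ == "__main__":
    with wave.open("speech.wav", "rb") as wf:  # placeholder file name
        assert wf.getsampwidth() == 2, "hasSpeech() assumes 16-bit PCM"
        pcm = wf.readframes(wf.getnframes())
        print(hasSpeech(pcm, wf.getframerate(), wf.getnchannels()))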
def test_process_zeroes(self):
    frame_len = 160
    self.assertTrue(
        webrtcvad.valid_rate_and_frame_length(8000, frame_len))
    sample = b'\x00' * frame_len * 2
    vad = webrtcvad.Vad()
    self.assertFalse(vad.is_speech(sample, 16000))
import webrtcvad


def is_valid_chunk(chunks, sample_rate):
    # `chunks` is assumed to be raw 16-bit PCM bytes, as in the other
    # snippets here, so a window of N samples spans N * 2 bytes.
    aggressiveness = 3
    vad = webrtcvad.Vad(aggressiveness)
    yes_count = 0
    no_count = 0
    frame_dur_ms = 20
    samples_per_window = int(frame_dur_ms * sample_rate / 1000)
    window_bytes = samples_per_window * 2
    curr_ind = 0
    has_more = (len(chunks) > window_bytes)
    from_time = 0.0
    to_time = frame_dur_ms / 1000
    while has_more:
        vad_input = chunks[curr_ind:curr_ind + window_bytes]
        # valid_rate_and_frame_length() expects a length in samples.
        assert webrtcvad.valid_rate_and_frame_length(
            sample_rate, len(vad_input) // 2)
        try:
            is_speech = vad.is_speech(vad_input, sample_rate)
        except Exception as e:
            print(e)
            is_speech = False
        if is_speech:
            yes_count += 1
        else:
            no_count += 1
        # print('{} - {} : {}'.format(from_time, to_time, is_speech))
        curr_ind += window_bytes
        from_time += frame_dur_ms / 1000
        to_time += frame_dur_ms / 1000
        has_more = (len(chunks) > curr_ind)
    print('{} : {}'.format(yes_count, no_count))
    if yes_count + no_count == 0:
        # Audio shorter than a single window: nothing was classified.
        return False
    # Treat the chunk as valid when more than 80% of windows are voiced.
    return (yes_count * 100 / (yes_count + no_count)) > 80
import webrtcvad


def has_speech(
        wav_bytes,
        sample_rate_hz,
        num_channels,
        non_speech_threshold_fraction=DEFAULT_NON_SPEECH_THRESHOLD_FRACTION,
        verbose=False):
    """
    Returns True if at least a (1 - non_speech_threshold_fraction)
    fraction of frames contain voice activity.

    Note: the webrtc VAD does not currently support 44.1 kHz, so we
    have no way of checking those files for empty audio.
    """
    # Use webrtc's VAD with a low level of aggressiveness.
    mono_channel_bytes = wav_bytes
    if num_channels == 2:
        # Just take the left channel for simplicity. We're only doing a
        # quick sanity check, so there is no need to mix the two channels.
        mono_channel_bytes = b"".join(
            [wav_bytes[i:i + 2] for i in range(0, len(wav_bytes), 4)])

    vad = webrtcvad.Vad(1)
    frame_duration = 10  # ms
    bytes_per_sample = 2  # Assuming 16-bit PCM.
    samples_per_vaded_chunk = sample_rate_hz * frame_duration / 1000
    bytes_per_vaded_chunk = int(samples_per_vaded_chunk * bytes_per_sample)

    num_speech_frames = 0
    num_non_speech_frames = 0
    for i in range(0, len(mono_channel_bytes) - bytes_per_vaded_chunk,
                   bytes_per_vaded_chunk):
        chunk_to_vad = mono_channel_bytes[i:i + bytes_per_vaded_chunk]
        vad_frame_length = int(len(chunk_to_vad) / bytes_per_sample)
        if (webrtcvad.valid_rate_and_frame_length(sample_rate_hz,
                                                  vad_frame_length)
                and vad.is_speech(chunk_to_vad, sample_rate_hz)):
            num_speech_frames += 1
        else:
            num_non_speech_frames += 1

    has_frames = (num_speech_frames + num_non_speech_frames > 0)
    empty_audio = (num_speech_frames == 0 or
                   (num_speech_frames and num_non_speech_frames == 0))
    if has_frames:
        percentage_non_speech = (
            float(num_non_speech_frames) /
            float(num_non_speech_frames + num_speech_frames))
    else:
        # If there are no frames, return a default (positive, > 0.5) value.
        percentage_non_speech = NO_FRAMES_VALUE
    if verbose:
        print("percentage non-speech:", percentage_non_speech,
              "num_speech_frames", num_speech_frames,
              "num_non_speech_frames", num_non_speech_frames)
    return (not empty_audio and
            percentage_non_speech < non_speech_threshold_fraction)
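# has_speech() references two module-level constants that are not shown in
# the snippet above. A minimal sketch of plausible definitions; the real
# module may choose different values:

# Reject audio when half or more of its frames are non-speech.
DEFAULT_NON_SPEECH_THRESHOLD_FRACTION = 0.5
# Sentinel used when no frames could be analyzed; any value above 0.5 (per
# the docstring) guarantees has_speech() returns False with the default
# threshold.
NO_FRAMES_VALUE = 1.0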
import collections

import webrtcvad


def get_voice_segments(frames, frame_duration_ms, padding_duration_ms,
                       sample_rate, vad):
    num_padding_frames = int(padding_duration_ms / frame_duration_ms)
    # We use a deque for our sliding window/ring buffer.
    ring_buffer = collections.deque(maxlen=num_padding_frames)
    # We have two states: TRIGGERED and NOTTRIGGERED. We start in the
    # NOTTRIGGERED state.
    triggered = False
    voiced_segments = []
    voiced_frames = []
    for frame in frames:
        # valid_rate_and_frame_length() expects a length in samples,
        # i.e. half the byte count for 16-bit PCM.
        assert webrtcvad.valid_rate_and_frame_length(
            sample_rate, len(frame.bytes) // 2)
        is_speech = vad.is_speech(frame.bytes, sample_rate)
        ring_buffer.append((frame, is_speech))
        if not triggered:
            num_voiced = len([f for f, speech in ring_buffer if speech])
            # If we're NOTTRIGGERED and more than 90% of the frames in
            # the ring buffer are voiced frames, then enter the
            # TRIGGERED state.
            if num_voiced > 0.9 * ring_buffer.maxlen:
                triggered = True
                # We want to yield all the audio we see from now until
                # we are NOTTRIGGERED, but we have to start with the
                # audio that's already in the ring buffer.
                for f, s in ring_buffer:
                    voiced_frames.append(f)
                ring_buffer.clear()
        else:
            # We're in the TRIGGERED state, so collect the audio data
            # and add it to the ring buffer.
            voiced_frames.append(frame)
            num_unvoiced = len(
                [f for f, speech in ring_buffer if not speech])
            # If more than 90% of the frames in the ring buffer are
            # unvoiced, then enter NOTTRIGGERED and yield whatever
            # audio we've collected.
            if num_unvoiced > 0.9 * ring_buffer.maxlen:
                triggered = False
                ring_buffer.clear()
                voiced_segments.append(
                    [voiced_frames[0].timestamp,
                     voiced_frames[-1].timestamp])
                voiced_frames = []
    return voiced_segments
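# get_voice_segments() expects frame objects exposing .bytes and .timestamp.
# A minimal sketch of a compatible Frame type and generator, modeled on the
# widely copied py-webrtcvad example; the names here are illustrative:


class Frame(object):
    """A mono 16-bit PCM audio frame with its start time in seconds."""

    def __init__(self, bytes_, timestamp, duration):
        self.bytes = bytes_
        self.timestamp = timestamp
        self.duration = duration


def frame_generator(frame_duration_ms, audio, sample_rate):
    """Slices raw PCM `audio` into fixed-duration Frame objects."""
    n = int(sample_rate * (frame_duration_ms / 1000.0) * 2)  # 2 bytes/sample
    offset = 0
    timestamp = 0.0
    duration = (float(n) / sample_rate) / 2.0
    while offset + n <= len(audio):
        yield Frame(audio[offset:offset + n], timestamp, duration)
        timestamp += duration
        offset += n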
def test_process_file(self):
    with open('test-audio.raw', 'rb') as f:
        data = f.read()
    frame_ms = 30
    n = int(8000 * 2 * frame_ms / 1000.0)
    frame_len = int(n / 2)
    self.assertTrue(webrtcvad.valid_rate_and_frame_length(8000, frame_len))
    chunks = list(data[pos:pos + n]
                  for pos in range(0, len(data), n))
    if len(chunks[-1]) != n:
        chunks = chunks[:-1]
    expecteds = [
        '011110111111111111111111111100',
        '011110111111111111111111111100',
        '000000111111111111111111110000',
        '000000111111111111111100000000',
    ]
    for mode in (0, 1, 2, 3):
        vad = webrtcvad.Vad(mode)
        result = ''
        for chunk in chunks:
            voiced = vad.is_speech(chunk, 8000)
            result += '1' if voiced else '0'
        self.assertEqual(expecteds[mode], result)
def test_valid_rate_and_frame_length(self):
    self.assertTrue(webrtcvad.valid_rate_and_frame_length(8000, 160))
    self.assertTrue(webrtcvad.valid_rate_and_frame_length(16000, 160))
    self.assertFalse(webrtcvad.valid_rate_and_frame_length(32000, 160))
    self.assertRaises((ValueError, OverflowError),
                      webrtcvad.valid_rate_and_frame_length, 2 ** 35, 10)
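# For reference, the WebRTC VAD accepts sample rates of 8000, 16000, 32000,
# and 48000 Hz with frames of 10, 20, or 30 ms, which is what
# valid_rate_and_frame_length() checks. A quick sketch enumerating the
# combinations, assuming those documented constraints:

import webrtcvad

for rate in (8000, 16000, 32000, 48000):
    for ms in (10, 20, 30):
        frame_len = rate * ms // 1000  # frame length in samples
        assert webrtcvad.valid_rate_and_frame_length(rate, frame_len)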
import collections
import logging
import queue
import time

import webrtcvad

logger = logging.getLogger(__name__)  # assumed module-level logger


def segmenter(
    self,
    q: queue.Queue,
    block_size: int,
    sample_rate: int,
    padding_ms: int = 300,
    ratio: float = 0.75,
):
    """
    :param q: queue of raw 16-bit PCM blocks to segment.
    :param block_size: number of samples per block.
    :param sample_rate: sample rate in Hz.
    :param padding_ms: Number of milliseconds desired in padding.
        Effective padding duration = (1 - ratio) * padding_ms ? TODO: check
    :param ratio: Minimum fraction of padding_ms that has to be
        voiced/non-voiced to activate.
    :return: yields voiced frames; a None marks the end of a segment.
    """
    frame_duration_ms = 1000 * block_size / sample_rate
    num_padding_frames = int(padding_ms / frame_duration_ms)
    ring_buffer = collections.deque(maxlen=num_padding_frames)
    triggered = False

    while True:
        try:
            data = q.get(timeout=5)
            # data = q.get_nowait()
        except queue.Empty:
            logger.warning('Buffer is empty: increase buffersize?')
            time.sleep(1)
            continue
        frame = data
        if len(frame) < 640:  # fewer than 320 samples (20 ms at 16 kHz)
            return
        assert webrtcvad.valid_rate_and_frame_length(
            sample_rate, int(len(frame) / 2)
        ), "WebRTC VAD only supports frames that are 10, 20, or 30 ms long"
        is_speech = self.vad.is_speech(frame, sample_rate)

        if not triggered:
            ring_buffer.append((frame, is_speech))
            num_voiced = len(
                [f for f, speech in ring_buffer if speech])
            # TODO: replace with sum?
            if num_voiced > ratio * ring_buffer.maxlen:
                triggered = True
                for f, s in ring_buffer:
                    yield f
                ring_buffer.clear()
        else:
            yield frame
            ring_buffer.append((frame, is_speech))
            num_unvoiced = len(
                [f for f, speech in ring_buffer if not speech])
            # TODO: replace with sum?
            if num_unvoiced > ratio * ring_buffer.maxlen:
                triggered = False
                yield None
                ring_buffer.clear()
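# segmenter() above is written as a method (it reads self.vad), so it needs a
# host class. A hypothetical harness; the Segmenter class and the producer
# wiring are illustrative assumptions, not from the original source:

import queue

import webrtcvad


class Segmenter:
    """Hypothetical host class for segmenter()."""

    def __init__(self, aggressiveness=3):
        self.vad = webrtcvad.Vad(aggressiveness)

    # Bind the generator defined above as a method.
    segmenter = segmenter


sample_rate = 16000
block_size = 320  # 20 ms blocks at 16 kHz -> 640 bytes of 16-bit PCM
q = queue.Queue()
# ... a producer thread would q.put() raw 16-bit PCM blocks here ...
for frame in Segmenter().segmenter(q, block_size, sample_rate):
    if frame is None:
        print("segment boundary")  # None separates voiced segments
    else:
        pass  # handle one voiced block (bytes)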