Example 1
def trim_long_silences(wav):
    """
    Ensures that segments without voice in the waveform remain no longer than a 
    threshold determined by the VAD parameters in params.py.

    :param wav: the raw waveform as a numpy array of floats 
    :return: the same waveform with silences trimmed away (length <= original wav length)
    """
    # Compute the voice detection window size
    samples_per_window = (vad_window_length * sampling_rate) // 1000

    # Trim the end of the audio to have a multiple of the window size
    wav = wav[:len(wav) - (len(wav) % samples_per_window)]

    # Convert the float waveform to 16-bit mono PCM
    pcm_wave = struct.pack("%dh" % len(wav),
                           *(np.round(wav * int16_max)).astype(np.int16))

    # Perform voice activation detection
    voice_flags = []
    vad = webrtcvad.Vad(mode=3)
    for window_start in range(0, len(wav), samples_per_window):
        window_end = window_start + samples_per_window
        voice_flags.append(
            vad.is_speech(pcm_wave[window_start * 2:window_end * 2],
                          sample_rate=sampling_rate))
    voice_flags = np.array(voice_flags)

    # Smooth the voice detection with a moving average
    def moving_average(array, width):
        array_padded = np.concatenate((np.zeros(
            (width - 1) // 2), array, np.zeros(width // 2)))
        ret = np.cumsum(array_padded, dtype=float)
        ret[width:] = ret[width:] - ret[:-width]
        return ret[width - 1:] / width

    audio_mask = moving_average(voice_flags, vad_moving_average_width)
    audio_mask = np.round(audio_mask).astype(bool)

    # Dilate the voiced regions
    audio_mask = binary_dilation(audio_mask,
                                 np.ones(vad_max_silence_length + 1))
    audio_mask = np.repeat(audio_mask, samples_per_window)

    return wav[audio_mask]
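The snippet above depends on imports and module-level constants (from params.py) that are not shown. A minimal setup sketch with illustrative values; the actual values in params.py may differ:

import struct
import numpy as np
import webrtcvad
from scipy.ndimage import binary_dilation

int16_max = (2 ** 15) - 1        # scale factor for float -> 16-bit PCM
sampling_rate = 16000            # Hz; webrtcvad supports 8000, 16000, 32000 and 48000
vad_window_length = 30           # ms per VAD window; only 10, 20 or 30 are valid
vad_moving_average_width = 8     # number of windows averaged when smoothing the flags
vad_max_silence_length = 6       # dilation width (in windows) applied to the voiced mask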
Example 2
def main():
    RATE = 16000
    frame_duration_ms = 30
    CHUNK = int(RATE * (frame_duration_ms / 1000.0))
    FORMAT = pyaudio.paInt16
    CHANNELS = 1

    if not os.path.isdir('wavfile'):
        os.mkdir('wavfile')

    p = pyaudio.PyAudio()

    stream = p.open(format=FORMAT,
                    channels=CHANNELS,
                    rate=RATE,
                    input=True,
                    frames_per_buffer=CHUNK)

    vad = webrtcvad.Vad(3)

    root = Tk()
    root.geometry("800x800")
    root.title('Result')
    lbl = Label(root, text="Name")
    lbl.config(width=50, font=("Courier", 44))
    lbl.place(relx=0.5, rely=0.5, anchor=CENTER)

    t1 = threading.Thread(target=vad_thread,
                          args=(RATE, frame_duration_ms, 300, vad, stream))
    t2 = threading.Thread(target=speaker_recog_thread, args=(lbl, ))
    t1.daemon = True
    t2.daemon = True
    t1.start()
    t2.start()

    try:
        root.mainloop()
    except:
        pass
    finally:
        stream.stop_stream()
        stream.close()
        p.terminate()
Example 3
def main(args):
    if len(args) != 2:
        sys.stderr.write(
            'Usage: example.py <aggressiveness> <path to wav file>\n')
        sys.exit(1)
    audio, sample_rate = read_wave(args[1])
    #audio, sample_rate = read_m4a(args[1])
    #audio, sample_rate = read_libri(args[1])
    vad = webrtcvad.Vad(int(args[0]))
    frames = frame_generator(30, audio, sample_rate)
    frames = list(frames)
    segments = vad_collector(sample_rate, 30, 300, vad, frames)
    total_wav = b""
    for i, segment in enumerate(segments):
        total_wav += segment

    path = 'test.wav'
    write_wave(path, total_wav, sample_rate)
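Several of these examples rely on read_wave, write_wave, frame_generator and vad_collector helpers that are not shown; they follow the helper functions in py-webrtcvad's example.py. A minimal sketch of the two WAV helpers, assuming 16-bit mono input at a rate the VAD supports:

import contextlib
import wave

def read_wave(path):
    """Read a 16-bit mono WAV file and return (PCM bytes, sample rate)."""
    with contextlib.closing(wave.open(path, 'rb')) as wf:
        assert wf.getnchannels() == 1
        assert wf.getsampwidth() == 2
        sample_rate = wf.getframerate()
        assert sample_rate in (8000, 16000, 32000, 48000)
        return wf.readframes(wf.getnframes()), sample_rate

def write_wave(path, audio, sample_rate):
    """Write raw 16-bit mono PCM bytes to a WAV file."""
    with contextlib.closing(wave.open(path, 'wb')) as wf:
        wf.setnchannels(1)
        wf.setsampwidth(2)
        wf.setframerate(sample_rate)
        wf.writeframes(audio)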
Example 4
def detect_audio(audio_file_name):
    return_dict = {}
    audio, sample_rate = read_wave(os.path.join("static", audio_file_name))
    vad = webrtcvad.Vad(2)
    frames = frame_generator(30, audio, sample_rate)
    frames = list(frames)
    segments = vad_collector(sample_rate, 30, 300, vad, frames)
    number = 0
    for i, segment in enumerate(segments):
        number += 1
    return_dict['count'] = number
    if (number > 0):
        return_dict['msg'] = "Voice detected"
        return_dict['code'] = "D"
    else:
        return_dict['msg'] = "Voice not detected"
        return_dict['code'] = "ND"
    return return_dict
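The frame_generator helper splits that PCM byte string into fixed-duration frames; a sketch in the spirit of py-webrtcvad's example.py (vad_collector, which pads and collects voiced frames into segments, is longer and omitted here):

class Frame(object):
    """A fixed-duration slice of PCM audio together with its start time."""
    def __init__(self, bytes, timestamp, duration):
        self.bytes = bytes
        self.timestamp = timestamp
        self.duration = duration

def frame_generator(frame_duration_ms, audio, sample_rate):
    """Yield successive frames of frame_duration_ms from 16-bit PCM bytes."""
    n = int(sample_rate * (frame_duration_ms / 1000.0) * 2)  # bytes per frame
    offset = 0
    timestamp = 0.0
    duration = (float(n) / sample_rate) / 2.0
    while offset + n < len(audio):
        yield Frame(audio[offset:offset + n], timestamp, duration)
        timestamp += duration
        offset += n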
Example 5
def detect_speech(signal,
                  sample_rate,
                  window_size,
                  aggressiveness,
                  window_size_dilate=None):
    vad = webrtcvad.Vad(aggressiveness)
    frame_len = int(window_size * sample_rate)
    speech = torch.as_tensor([[
        len(chunk) == frame_len
        and vad.is_speech(bytearray(chunk.numpy()), sample_rate)
        for chunk in channel.split(frame_len)
    ] for channel in signal])

    #if window_size_dilate is not None:
    #	kernel_size = int(window_size_dilate / window_size)
    #	speech = F.max_pool1d(speech.unsqueeze(1).float(), stride = 1, kernel_size = kernel_size, padding = kernel_size // 2).squeeze(1).to(speech.dtype)

    return speech.repeat_interleave(frame_len, dim=-1)[:, :signal.shape[1]]
Example 6
def apply_webrtc_vad(signal, sample_rate, frame_duration, aggressiveness=3):
    vad = webrtcvad.Vad(aggressiveness)
        
    frame_size = int(sample_rate * frame_duration / 1000)
    nb_frames = len(signal) // frame_size
    signal_clean_vad = []
    no_speech = []
    for i in range(0, nb_frames):
        if vad.is_speech(signal[i*frame_size:(i+1)*frame_size], sample_rate):
            signal_clean_vad = np.append(signal_clean_vad, signal[i*frame_size:(i+1)*frame_size-1])
            no_speech = np.append(no_speech, np.zeros(frame_size))
        else:
            no_speech = np.append(no_speech, np.ones(frame_size))
    signal_clean_vad = np.divide(signal_clean_vad, max(signal_clean_vad))
    max_signal = max(signal)
    for i in range(0, len(no_speech)):
        no_speech[i] = no_speech[i]*max_signal
    return signal_clean_vad, no_speech
Example 7
def main(args):
    if len(args) != 2:
        sys.stderr.write('Usage: vad.py <aggressiveness> <path to wav file>\n')
        sys.exit(1)
    audio, sample_rate = read_wave(args[1])
    vad = webrtcvad.Vad(int(args[0]))
    # frames = frame_generator(30, audio, sample_rate)
    frames = frame_generator(
        10, audio, sample_rate)  #increase resolution from 30 ms to 10 ms
    frames = list(frames)
    # segments = vad_collector(sample_rate, 30, 300, vad, frames)
    segments = vad_collector(sample_rate, 10, 300, vad,
                             frames)  # match the 10 ms frames; padding kept at 300 ms
    for i, segment in enumerate(segments):
        path = 'C:\\Users\\rober\\Documents\\Projects\\vad_dataset\\chunk-%002d.wav' % (
            i, )
        print(' Writing %s' % (path, ))
        write_wave(path, segment, sample_rate)
Example 8
def init(_recording_dir, _callback_func):
    global vad, pa, stream, recording_dir, callback_func

    recording_dir = _recording_dir
    callback_func = _callback_func

    vad = webrtcvad.Vad(0)

    pa = pyaudio.PyAudio()
    stream = pa.open(
        format=FORMAT,
        channels=CHANNELS,
        rate=RATE,
        input=True,
        start=False,
        # input_device_index=2,
        frames_per_buffer=CHUNK_SIZE)
Example 9
    def __init__(self, config, audio_files=None):

        # list of audio files to process (avoid a mutable default argument)
        self.audio_files = audio_files if audio_files is not None else []

        # create voice activity detector with aggressiveness of 3
        self.vad = webrtcvad.Vad(3)

        # config parameters, explained in its respective file
        print(config)
        self._PATH_IN = config.IN_PATH
        self._PATH_OUT = config.OUT_PATH
        self._FORMAT_IN = config.FORMAT_IN
        self._FORMAT_OUT = config.FORMAT_OUT
        self._PREFIX = config.PREFIX
        self._NUM_SAMPLES = config.NUM_SAMPLES
        self._MAX_NUM_SPEAKER = config.MAX_NUM_SPEAKER
        self._MIXTURE_DURATION = config.CLIP_DURATION
Example 10
    def vad_file(self, filename):
        print(f"Starting {filename}")
        t0 = time.time()
        input_path = os.path.join(self.input_folder, filename)
        audio, sample_rate = read_wave(input_path)
        vad = webrtcvad.Vad(self.aggressivity)
        frames = frame_generator(30, audio, sample_rate)
        frames = list(frames)
        segments = vad_collector(sample_rate, 30, 300, vad, frames)

        # Segmenting the Voice audio and save it in list as bytes
        concataudio = [segment for segment in segments]
        joinedaudio = b"".join(concataudio)
        output_path = os.path.join(self.output_folder, filename)

        write_wave(output_path, joinedaudio, sample_rate)
        print(f"{filename} done in {time.time()-t0:.3f}s")
Example 11
    def __init__(self,
                 vad=webrtcvad.Vad(2),
                 filter_order=1,
                 filter_frequency=0.0002,
                 threshold=0.2):
        """
        :param vad: webrtcvad.vad object
        :param a: a array for lowpass filter
        :param b: b array for lowpass filter
        """
        b, a = butter(filter_order, filter_frequency, 'low')

        self.vad = vad
        self.threshold = threshold
        self.filter_a = a
        self.filter_b = b
        self.frame_length_ms = 20
        self.sample_rate = 8000
Example 12
def main(args):
    if len(args) != 2:
        sys.stderr.write(
            'Usage: VAD_tool.py <aggressiveness> <path to wav file>\n')
        sys.exit(1)
    audio, sample_rate = read_wave(args[1])
    vad = webrtcvad.Vad(int(args[0]))
    frames = frame_generator(30, audio, sample_rate)
    frames = list(frames)
    segments = vad_collector(sample_rate, 30, 300, vad, frames)
    temp = b''
    for i, segment in enumerate(segments):
        path = 'chunk-%002d.wav' % (i, )
        #print(' Writing %s' % (path,))
        #print(segment)
        temp += segment

    write_wave(args[1][:len(args[1]) - 4] + '_vad.wav', temp, sample_rate)
Example 13
def main():
    vad = webrtcvad.Vad(3)

    speech_count = 0
    chunks = []
    doa_chunks = int(DOA_FRAMES / VAD_FRAMES)

    try:
        with MicArray(RATE, CHANNELS, RATE * VAD_FRAMES // 1000) as mic:
            audInstance = mic.pyaudio_instance
            for chunk in mic.read_chunks():
                wavframes.append(chunk.tobytes())
                # Use single channel audio to detect voice activity
                if vad.is_speech(chunk[0::CHANNELS].tobytes(), RATE):
                    speech_count += 1

                chunks.append(chunk)
                if len(chunks) == doa_chunks:
                    if speech_count > (doa_chunks / 2):
                        frames = np.concatenate(chunks)
                        direction = mic.get_direction(frames)
                        show(direction)
                        now = datetime.datetime.now()
                        file.write('{},{}\n'.format(
                            now.strftime("%H:%M:%S %d-%m-%Y"), int(direction)))
                        print('\n{},{}'.format(
                            now.strftime("%H:%M:%S %d-%m-%Y"), int(direction)))

                    speech_count = 0
                    chunks = []

    except KeyboardInterrupt:
        file.close()
        wav = wave.open('session.wav', 'wb')
        wav.setnchannels(CHANNELS)
        wav.setsampwidth(audInstance.get_sample_size(pyaudio.paInt16))
        wav.setframerate(RATE)
        wav.writeframes(b''.join(wavframes))
        wav.close()

        print(" Audio recording is saved in file: session.wav")
        print(" Direction of arrival recorded in file: speaking.csv")

        print("Good Bye.....")
Example 14
def vad_split(audio, rate, frame_duration, aggressiveness=1):
    """Splits the audio into audio segments on non-speech frames.

    Args:
        audio: A numpy ndarray, which has 1 dimension and values within
               -1.0 to 1.0 (inclusive)
        rate: An integer, which is the rate at which samples are taken
        frame_duration: A float, which is the duration in seconds of each
                        frame to check
        aggressiveness: An integer between 0 and 3, which is how
                        aggressively non-speech frames are filtered out

    Returns:
        A list of numpy ndarrays, which are 1 dimension each and contain
            int16 samples (one array per detected speech segment; a single
            segment covering the whole signal is returned if no speech is found)
    """
    assert rate in (8000, 16000, 32000,
                    48000), ('Invalid Rate, use 8000, 16000, 32000, or 48000')
    assert frame_duration in (.01, .02,
                              .03), ('Invalid frame_dur, use .01, .02, .03')
    assert 0 <= aggressiveness <= 3, (
        'Invalid aggressiveness, must be between 0 and 3')

    audio = (audio * np.iinfo('int16').max).astype('int16')

    vad = webrtcvad.Vad(aggressiveness)
    frame_size = int(rate * frame_duration)
    offset = 0
    off = True
    voiced_frames = []
    while offset + frame_size < len(audio):
        frame = audio[offset:offset + frame_size]
        if vad.is_speech(frame.tobytes(), rate):
            if off is True:
                off = False
                voiced_frames.append([frame])
            else:
                voiced_frames[-1].append(frame)
        else:
            off = True
        offset += frame_size
    if len(voiced_frames) == 0:
        return np.array([audio])
    for ndx in range(len(voiced_frames)):
        voiced_frames[ndx] = np.hstack(voiced_frames[ndx])
    return voiced_frames
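A hypothetical call to vad_split on one second of synthetic audio (a quiet tone rather than real speech, so a single fallback segment is the typical result):

import numpy as np

rate = 16000
t = np.linspace(0.0, 1.0, rate, endpoint=False)
audio = 0.1 * np.sin(2.0 * np.pi * 220.0 * t)   # values stay within -1.0 to 1.0
segments = vad_split(audio, rate, 0.03, aggressiveness=1)
print(len(segments), [segment.shape for segment in segments])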
Example 15
def record_vad(filePath="speech.wav", speechCount=40):
    """
    测试时发现鸟叫声也会造成影响,可以设定连续200-300ms的时间
    :return:
    """
    audio = pyaudio.PyAudio()
    vad = webrtcvad.Vad()
    vad.set_mode(0)
    frame = []
    framesNum = 16000 * 20 // 1000
    stream = audio.open(format=paInt16,
                        channels=channels,
                        rate=framerate,
                        input=True,
                        frames_per_buffer=framesNum)

    stream.start_stream()
    print("开始录音")
    count = 0
    speechnum = 0
    while not (speechnum > speechCount and count >= 10):
        if count > 60:
            # No speech for a long time; stop recording and give up
            stream.stop_stream()
            audio.terminate()
            return -2
        data = stream.read(framesNum)
        frame.append(data)
        isSpeech = vad.is_speech(data, framerate)
        if (isSpeech):
            count = 0
            speechnum += 1
        else:
            count += 1
        print("current speech:{}".format(isSpeech))
    stream.stop_stream()
    audio.terminate()
    with wave.open(filePath, "wb") as f:
        f.setframerate(framerate)
        f.setnchannels(channels)
        f.setsampwidth(audio.get_sample_size(paInt16))
        f.writeframes(b"".join(frame))
    print("结束录音")
    pcm_path = filePath.split("wav")[0] + "pcm"  #生成pcm文件名
    wav2pcm.wav2pcm(filePath, pcm_path)
    return 0
Example 16
def home():
    global model
    model_path = './flask_app/static/h5_file/CRNN_04_epochs70_adam_CE_batch1_lr1e-05.pth.tar'
    if request.method == 'POST':
        #members = ['one','four','five','two','three','six']
        members = ['five', 'three', 'six', 'four', 'one', 'two']
        real = ['Ryan', 'Rick', 'Yanbo', 'Hsiaoen', 'Kunyu', 'Joyee']
        #real = ['Kunyu','Hsiaoen','Ryan','Joyee','Rick','Yanbo']
        # index = random.randint(0,len(members)-1)
        # name = members[index]
        from flask_app.static.CRNN import CRNN_04 as model
        model = load_model(model, model_path)
        audio, sample_rate = read_wave(
            './flask_app/static/wav_file/predict.wav')
        vad = webrtcvad.Vad(1)
        frames = frame_generator(30, audio, sample_rate)
        frames = list(frames)
        segments = vad_collector(sample_rate, 30, 300, vad, frames)
        for segment in segments:
            path = './flask_app/static/wav_file/after_vad.wav'
            write_wave(path, segment, sample_rate)
        noisy, sr = librosa.load('./flask_app/static/wav_file/after_vad.wav',
                                 sr=16000,
                                 mono=True)
        noisy = noisy / max(abs(noisy))
        MFCC_fea = transforms.MFCC(16000,
                                   melkwargs={
                                       'n_fft': 512,
                                       'hop_length': 160
                                   })(torch.from_numpy(noisy)).squeeze().t()
        MFCC_fea = MFCC_fea.unsqueeze(0)
        pred = model(MFCC_fea).max(1)[1].numpy()

        # noisy,sr = librosa.load('./flask_app/static/wav_file/predict.wav', sr=16000, mono=True)
        #noisy = noisy[13000:26440]
        #mfcc = librosa.feature.mfcc(y=noisy, sr=sr,n_mfcc=40, dct_type=2, hop_length=256,  n_fft=512, center=False)
        #fea = mfcc.transpose()
        #test = np.reshape(fea,(1,51,-1,1))
        #model = load_model('./flask_app/static/h5_file/model.43-0.04.h5')
        #pre = model.predict(test)
        name = members[int(pred)]
        real_name = real[int(pred)]
        return render_template('home.html', name=name, real_name=real_name)
    return render_template('home.html', name='')
Example 17
 def test_leak(self):
     sound, fs = self._load_wave('leak-test.wav')
     frame_ms = 0.010
     frame_len = int(round(fs * frame_ms))
     n = int(len(sound) / (2 * frame_len))
     nrepeats = 1000
     vad = webrtcvad.Vad(3)
     used_memory_before = memory_usage(-1)[0]
     for counter in range(nrepeats):
         find_voice = False
         for frame_ind in range(n):
             slice_start = (frame_ind * 2 * frame_len)
             slice_end = ((frame_ind + 1) * 2 * frame_len)
             if vad.is_speech(sound[slice_start:slice_end], fs):
                 find_voice = True
         self.assertTrue(find_voice)
     used_memory_after = memory_usage(-1)[0]
     self.assertGreaterEqual(used_memory_before / 5.0,
                             used_memory_after - used_memory_before)
Example 18
def main(args):
    if len(args) != 2:
        sys.stderr.write(
            'Usage: example.py <aggressiveness> <path to wav file>\n')
        sys.exit(1)
    audio, sample_rate = read_wave(args[1])
    # print(len(audio))
    vad = webrtcvad.Vad(int(args[0]))
    frames = frame_generator(30, audio, sample_rate)
    frames = list(frames)
    segments = vad_collector(sample_rate, 30, 300, vad, frames)
    # print(len(segments))
    # print(segments[0])
    for i, segment in enumerate(segments):
        print(len(segment))
        # print(segment)
        path = 'chunk-%002d.wav' % (i, )
        print(' Writing %s' % (path, ))
        write_wave(path, segment, sample_rate)
Example 19
    def initialize(self):
        """Initialize a Hermes audio recorder."""
        self.logger.debug('Probing for available input devices...')
        for index in range(self.audio.get_device_count()):
            device = self.audio.get_device_info_by_index(index)
            name = device['name']
            channels = device['maxInputChannels']
            if channels:
                self.logger.debug('[%d] %s', index, name)
        try:
            self.audio_in = self.audio.get_default_input_device_info()['name']
        except OSError:
            raise NoDefaultAudioDeviceError('input')
        self.logger.info('Connected to audio input %s.', self.audio_in)

        if self.config.vad.enabled:
            self.logger.info('Voice Activity Detection enabled with mode %s.',
                             self.config.vad.mode)
            self.vad = webrtcvad.Vad(self.config.vad.mode)
Example 20
def brain(df_list, agg, frame, padding, output):
    Process = os.getpid()
    try:
        for i in range(len(df_list)):
            abs_wav_files = df_list.values[i][0]
            abs_filename = os.path.splitext(os.path.basename(abs_wav_files))[0]
            audio, sample_rate = read_wave(abs_wav_files)
            vad = webrtcvad.Vad(int(agg))
            frames = frame_generator(frame, audio, sample_rate)
            frames = list(frames)
            segments = vad_collector(sample_rate, frame, padding, vad, frames)
            for j, segment in enumerate(segments):
                path = os.path.join(output,
                                    abs_filename) + '-%0002d.wav' % (j + 1, )
                print("Process {}: Created file {}".format(Process, path))
                write_wave(path, segment, sample_rate)
    except EOFError:
        print("Empty file {}".format(abs_wav_files))
Example 21
def vad_process(path, dataset):
    # VAD Process
    if dataset == "vox1":
        audio, sample_rate = read_wave(path)
    elif dataset == "vox2":
        audio, sample_rate = read_m4a(path)
    elif dataset == "librispeech":
        audio, sample_rate = read_libri(path)
    vad = webrtcvad.Vad(1)
    frames = frame_generator(30, audio, sample_rate)
    frames = list(frames)
    segments = vad_collector(sample_rate, 30, 300, vad, frames)
    total_wav = b""
    for i, segment in enumerate(segments):
        total_wav += segment
    # Without writing, unpack total_wav into numpy [N,1] array
    wav_arr = np.frombuffer(total_wav, dtype=np.int16)
    #print("read audio data from byte string. np array of shape:"+str(wav_arr.shape))
    return wav_arr, sample_rate
Example 22
    def __init__(self,
                 aggressiveness=2,
                 sample_rate=SAMPLE_RATE,
                 min_utt_length=MIN_UTT_LENGTH,
                 max_utt_length=MAX_UTT_LENGTH,
                 max_utt_gap=MAX_UTT_GAP):

        self.sample_rate = sample_rate

        self.vad = webrtcvad.Vad()
        self.vad.set_mode(aggressiveness)

        self.state = STATE_IDLE
        self.buf = []
        self.buf_sent = 0

        self.min_buf_entries = int(min_utt_length * 1000) / BUFFER_DURATION
        self.max_buf_entries = int(max_utt_length * 1000) / BUFFER_DURATION
        self.max_gap = int(max_utt_gap * 1000) / BUFFER_DURATION
Example 23
def detect_voices(path, convert=True, agg=3, save_samples=False, voices_folder='voices', int16_folder='int16'):
    """
    path: path for file to detect voices on. If not int16 then it must be converted as well
    convert: if this is True then convert the file to an int16 version and save in a specified folder
    + run the rest of function using that file
    agg: the aggressiveness of the voice detection. 3 is most aggressive. I suggest 3 all the
    time.(https://github.com/wiseman/py-webrtcvad)
    save_voices: if you want to save file of all voices to make sure they are actually voices
    voices_folder: where you want to save the tagged audio(voices)
    int16_folder: where you want to save the int16 version use for detecting voices
    """
    print('detecting voices')

    if convert:
        #change path to path of converted file
        path = convert_file_format(path, int16_folder)
            
    audio, sample_rate = read_wave(path)
    vad = webrtcvad.Vad(int(agg))  # see https://github.com/wiseman/py-webrtcvad
    
    frames = frame_generator(30, audio, sample_rate)
    frames = list(frames)  # list of 30 ms frames
    
    segments = vad_collector(sample_rate, 30, 300, vad, frames)
    
    bytes = b''
    timestamps = []
    for segment in segments:
        timestamps.append([segment[0].timestamp, segment[-1].timestamp])
        if save_samples:
            for s in segment:
                bytes += s.bytes
    
    if save_samples:
        if not os.path.exists(voices_folder):
            os.makedirs(voices_folder)
        
        basename = os.path.basename(path).split('.')[0]
        voices_path = voices_folder + '/' + basename + '_voices.wav'
        write_wave(voices_path, bytes, sample_rate)  # voices can be used to listen to the 'tagged audio'
    
    return timestamps
Example 24
 def __init__(self) -> None:
     self.count = 0  # for interval
     self.is_run = True
     self.stream = None
     with tempfile.NamedTemporaryFile() as f:
         self.path = f.name
     self.vad_mode = 3
     self.vad = webrtcvad.Vad(self.vad_mode)
     self.pipeline = torch.hub.load('pyannote/pyannote-audio',
                                    'sad_ami',
                                    pipeline=True)
     self.rate = 16000
     self.chunk_duration_ms = 30  # supports 10, 20 and 30 (ms)
     self.chunk_size = int(self.rate * self.chunk_duration_ms /
                           1000)  # chunk to read
     self.voiced_frames_rate = 0.9
     self.unvoiced_frames_rate = 0.9
     self.max_voiced_frames = 100
     self.leave = False
Example 25
def main(args):
    if len(args) != 2:
        sys.stderr.write('Usage: example.py <out_dir> <path to wav file>\n')
        sys.exit(1)
    audio, sample_rate = read_wave(args[1])
    out_dir = args[0]
    if os.path.exists(out_dir):
        shutil.rmtree(out_dir)
    os.makedirs(out_dir)
    vad = webrtcvad.Vad(3)
    frames = frame_generator(30, audio, sample_rate)
    frames = list(frames)
    #print(len(frames))
    segments = vad_collector(sample_rate, 30, 900, vad, frames)
    for i, segment in enumerate(segments):
        path = os.path.join(out_dir, 'chunk-%0003d-%d.wav' % (i, segment[1]))
        #print(segment[1])
        #print(' Writing %s' % (path,))
        write_wave(path, segment[0], sample_rate)
Example 26
def main(args):
    if len(args) != 2:
        sys.stderr.write(
            'Usage: example.py <aggressiveness> <path to wav file>\n')
        sys.exit(1)
    audio, sample_rate = read_wave(args[1])
    vad = webrtcvad.Vad(int(args[0]))
    frames = frame_generator(30, audio, sample_rate)
    frames = list(frames)
    segments = vad_collector(sample_rate, 30, 300, vad, frames)

    if not os.path.isdir(args[1]+'_splitted'):
        os.mkdir(args[1]+'_splitted')

    for i, segment in enumerate(segments):
        path = args[1]+'_splitted/chunk-%002d.wav' % (i,)
        print(' Writing %s' % (path,))
        write_wave(path, segment, sample_rate)
        print("\n")
Example 27
def main(args):
    if len(args) != 2:
        sys.stderr.write(
            'Usage: silenceremove.py <aggressiveness> <path to wav file>\n')
        sys.exit(1)
    pcm_data, sample_rate, num_channels, sample_width = read_wave(args[1])
    # Aggressiveness mode
    # An integer between 0 and 3.
    vad = webrtcvad.Vad(int(args[0]))
    frames = frame_generator(30, pcm_data, sample_rate)
    frames = list(frames)
    segments = vad_collector(sample_rate, 30, 300, vad, frames)

    # Segmenting the Voice audio and save it in list as bytes
    concataudio = [segment for segment in segments]

    joinedaudio = b"".join(concataudio)

    write_wave("Non-Silenced-Audio.wav", joinedaudio, sample_rate)
Example 28
def filter_voice(signal, sample_rate, mode=3, threshold_voice=80):

    signal = np.array(signal, dtype=np.int16)
    signal = np.ascontiguousarray(signal)
    vad = webrtcvad.Vad(mode)
    frames = frame_generator(10, signal, sample_rate)
    frames = list(frames)

    if len(frames) == 0:
        return 0

    match = 0
    for frame in frames:
        is_speech = vad.is_speech(frame.bytes, sample_rate)
        if is_speech:
            match += 1

    percentage_voice = match * 100 / len(frames)
    return percentage_voice > threshold_voice
Example 29
def main(args):
    if len(args) != 2:
        sys.stderr.write(
            'Usage: example.py <aggressiveness> <path to wav file>\n')
        sys.exit(1)
    fn = args[1]
    fn_base = os.path.splitext(fn)[0].split('/')[-1]
    audio, sample_rate = read_wave(fn)
    vad = webrtcvad.Vad(int(args[0]))
    frames = frame_generator(30, audio, sample_rate)
    frames = list(frames)
    segments = vad_collector(sample_rate, 30, 300, vad, frames)
    for i, segment in enumerate(segments):
        path = '%s-chunk-%002d.wav' % (
            fn_base,
            i,
        )
        print(' Writing %s' % (path, ))
        write_wave(path, segment, sample_rate, stt=0)
Example 30
def remove_silence(wav, sr=_sr, max_silence_ms=20):
    """
    去除语音中的静音。
    :param wav:
    :param sr:
    :param max_silence_ms: 单位ms
    :return:
    """
    # Compute the voice detection window size
    wav = librosa.resample(wav, orig_sr=sr, target_sr=_sr)

    vad_window_length = 20
    vad_moving_average_width = 10

    samples_per_window = (vad_window_length * _sr) // 1000

    # Trim the end of the audio to have a multiple of the window size
    wav = wav[:len(wav) - (len(wav) % samples_per_window)]

    # Convert the float waveform to 16-bit mono PCM
    pcm_wave = struct.pack("%dh" % len(wav),
                           *(np.round(wav * _int16_max)).astype(np.int16))

    # Perform voice activation detection
    voice_flags = []
    vad = webrtcvad.Vad(mode=3)
    for window_start in range(0, len(wav), samples_per_window):
        window_end = window_start + samples_per_window
        voice_flags.append(
            vad.is_speech(pcm_wave[window_start * 2:window_end * 2],
                          sample_rate=_sr))
    voice_flags = np.array(voice_flags)

    audio_mask = moving_average(voice_flags, vad_moving_average_width)
    audio_mask = np.round(audio_mask).astype(bool)

    # Dilate the voiced regions
    audio_mask = binary_dilation(audio_mask, np.ones(max_silence_ms + 1))
    audio_mask = np.repeat(audio_mask, samples_per_window)
    out = wav[audio_mask]
    out = librosa.resample(out, orig_sr=_sr, target_sr=sr)
    return out
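This variant calls a module-level moving_average helper that is not shown; a sketch equivalent to the inner helper defined in Example 1:

def moving_average(array, width):
    # Zero-pad so the average stays centred, then smooth with a cumulative sum.
    array_padded = np.concatenate((np.zeros((width - 1) // 2),
                                   array, np.zeros(width // 2)))
    ret = np.cumsum(array_padded, dtype=float)
    ret[width:] = ret[width:] - ret[:-width]
    return ret[width - 1:] / width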