def preprocessing(y, sr):
    # NOTE: relies on module-level constants pre_emphasis, Stride, Window_size,
    # Num_mels and Num_Frame (typical values appear in extract_feature below).

    # Resampling to 16 kHz
    if sr != 16000:
        sr_re = 16000  # target sampling rate
        y = librosa.resample(y, sr, sr_re)
        sr = sr_re

    # Denoising
    y[np.argwhere(y == 0)] = 1e-10
    y_denoise = scipy.signal.wiener(y, mysize=None, noise=None)

    # Pre-emphasis filter
    y_Emphasis = np.append(y_denoise[0], y_denoise[1:] - pre_emphasis * y_denoise[:-1])

    # Normalization (peak)
    y_max = max(y_Emphasis)
    y_Emphasis = y_Emphasis / y_max  # normalize for VAD

    # Voice Activity Detection (VAD)
    vad_mode = 2  # VAD mode = 0 ~ 3
    y_vad = trim(y_Emphasis, sr, vad_mode=vad_mode, thr=0.01)  # trim using VAD
    if y_vad is None:
        y_vad = y_Emphasis

    # De-normalization
    y_vad = y_vad * y_max

    # Obtain the mel spectrogram
    S = librosa.feature.melspectrogram(y=y_vad, sr=sr, hop_length=int(sr * Stride),
                                       n_fft=int(sr * Window_size),
                                       n_mels=Num_mels, power=2.0)
    EPS = 1e-8
    S = np.log(S + EPS)
    r, Frame_length = S.shape

    # Obtain the normalized mel spectrogram
    S_norm = (S - np.mean(S)) / np.std(S)

    # Zero padding
    Input_Mels = np.zeros((r, Num_Frame), dtype=float)
    if Frame_length < Num_Frame:
        Input_Mels[:, :Frame_length] = S_norm[:, :Frame_length]
    else:
        Input_Mels[:, :Num_Frame] = S_norm[:, :Num_Frame]

    # Input_Mels = np.expand_dims(Input_Mels, axis=0)
    # Input_Mels = np.transpose(Input_Mels, (0, 2, 1))
    # Input_Mels = np.expand_dims(Input_Mels, axis=-1)
    return Input_Mels, Frame_length
def preprocessing(y, sr, num_frame=1500, stride=0.01, window_size=0.025,
                  num_mels=40, pre_emphasis=0.97, is_mel=False):
    # Resampling to 16 kHz
    if sr != 16000:
        sr_re = 16000  # target sampling rate
        y = librosa.resample(y, sr, sr_re)
        sr = sr_re

    # Denoising
    y[np.argwhere(y == 0)] = 1e-10
    y_denoise = scipy.signal.wiener(y, mysize=None, noise=None)

    # Pre-emphasis filter
    y_emphasis = np.append(y_denoise[0],
                           y_denoise[1:] - pre_emphasis * y_denoise[:-1])

    # Normalization (peak)
    y_max = max(y_emphasis)
    y_emphasis = y_emphasis / y_max

    # Voice Activity Detection (VAD)
    vad_mode = 2  # VAD mode = 0 ~ 3
    y_vad = trim(y_emphasis, sr, vad_mode=vad_mode)
    if y_vad is None:
        y_vad = y_emphasis

    # De-normalization
    y_vad = y_vad * y_max

    # Obtain the mel spectrogram
    S = librosa.feature.melspectrogram(y=y_vad, sr=sr,
                                       hop_length=int(sr * stride),
                                       n_fft=int(sr * window_size),
                                       n_mels=num_mels, power=2.0)

    # Mel or log mel
    EPS = 1e-8
    S = np.log(S + EPS)
    r, frame_length = S.shape
    print('\n** log mel **')
    print('S.shape', S.shape)

    # Obtain the normalized mel spectrogram
    s_norm = (S - np.mean(S)) / np.std(S)

    # Zero padding
    input_mels = np.zeros((r, num_frame), dtype=float)
    if frame_length < num_frame:
        input_mels[:, :frame_length] = s_norm[:, :frame_length]
    else:
        input_mels[:, :num_frame] = s_norm[:, :num_frame]

    return input_mels, frame_length
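# A minimal driver for the parameterized variant above, as a sketch only:
# "speech.wav" is a placeholder path, and the imports mirror what these
# snippets rely on (librosa, numpy, scipy, and pyvad's trim).
import librosa
import numpy as np
import scipy.signal
from pyvad import trim

y, sr = librosa.load("speech.wav", sr=None)  # keep the native sampling rate
input_mels, frame_length = preprocessing(y, sr, num_frame=1500, num_mels=40)
print(input_mels.shape, frame_length)  # (40, 1500) and the number of valid frames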
def _load_wav(self, wav_file):
    sig, sample_rate = librosa.load(wav_file, sr=self.config.sample_rate)
    tmp = sig
    if self.config.use_vad:
        sig = trim(sig, sample_rate, fs_vad=self.config.sample_rate,
                   hoplength=30, thr=0, vad_mode=2)
    if sig is None:
        return tmp
    else:
        return sig
def extract_feature(y, sr):
    ### Pre-processing
    Num_Frame = 1500     # max wave length (15 sec)
    Stride = 0.01        # stride (10 ms)
    Window_size = 0.025  # filter window size (25 ms)
    Num_data = 1
    Num_mels = 40        # number of mel filters
    pre_emphasis = 0.97  # pre-emphasis filter coefficient

    # Resampling to 16 kHz
    if sr != 16000:
        sr_re = 16000  # target sampling rate
        y = librosa.resample(y, sr, sr_re)
        sr = sr_re

    # Denoising
    y[np.argwhere(y == 0)] = 1e-10
    y_denoise = scipy.signal.wiener(y, mysize=None, noise=None)

    # Pre-emphasis filter: boosts the high-frequency components
    y_Emphasis = np.append(y_denoise[0], y_denoise[1:] - pre_emphasis * y_denoise[:-1])

    # Normalization (peak)
    y_max = max(y_Emphasis)
    y_Emphasis = y_Emphasis / y_max  # normalize for VAD

    # Voice Activity Detection (VAD)
    vad_mode = 2  # VAD mode = 0 ~ 3
    y_vad = trim(y_Emphasis, sr, vad_mode=vad_mode, thr=0.01)  # trim using VAD
    if y_vad is None:
        y_vad = y_Emphasis

    # De-normalization
    y_vad = y_vad * y_max

    # Obtain the mel spectrogram
    S = librosa.feature.melspectrogram(y=y_vad, sr=sr, hop_length=int(sr * Stride),
                                       n_fft=int(sr * Window_size),
                                       n_mels=Num_mels, power=2.0)
    r, Frame_length = S.shape
    S = np.log(S + 1e-8)

    # Obtain the normalized mel spectrogram
    S_norm = (S - np.mean(S)) / np.std(S)
    return S_norm
import torchaudio as ta
from pyvad import trim

def load(file, do_vad=True):
    sig, sr = ta.load(file, channels_first=True, normalization=True)
    assert sr == 16000
    if do_vad:
        # This pyvad version returns (start, end) sample indices
        start, end = trim(sig.transpose(0, 1).numpy(), fs=sr, fs_vad=16000,
                          hop_length=30, vad_mode=2)
        if start != 0 and end != 0:
            return sig[:, start:end]
        else:
            return sig
    else:
        return sig
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from itertools import product

import numpy as np
from librosa import load
from pyvad import trim

fs_vads = (8000, 16000, 32000, 48000)
hops = (10, 20, 30)
vad_modes = (0, 1, 2, 3)

name = "voice/arctic_a0007.wav"
data, fs = load(name, sr=None)
# NOTE: this version calls trim with fixed arguments, ignoring the swept
# fs_vad/hop/vad_mode values (compare the variant below).
for fs_vad, hop, vad_mode in product(fs_vads, hops, vad_modes):
    vact = trim(data, fs, fs_vad=16000, hoplength=30, vad_mode=0)

fs = 16000
data = (np.random.rand(fs * 3) - 0.5) * 0.1
for fs_vad, hop, vad_mode in product(fs_vads, hops, vad_modes):
    vact = trim(data, fs, fs_vad=16000, hoplength=30, vad_mode=0)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from itertools import product

import numpy as np
from librosa import load
from pyvad import trim

fs_vads = (8000, 16000, 32000, 48000)
hops = (10, 20, 30)
vad_modes = (0, 1, 2, 3)

# Real speech: trim should find a non-empty voiced region
name = "voice/arctic_a0007.wav"
data, fs = load(name, sr=None)
for fs_vad, hop, vad_mode in product(fs_vads, hops, vad_modes):
    vact = trim(data, fs, fs_vad=fs_vad, hop_length=hop, vad_mode=vad_mode)
    assert vact[1] - vact[0] > 0, vact

# Low-level noise: trim should find no voiced region
data = (np.random.rand(fs * 3) - 0.5) * 0.05
for fs_vad, hop, vad_mode in product(fs_vads, hops, vad_modes):
    vact = trim(data, fs, fs_vad=fs_vad, hop_length=hop, vad_mode=vad_mode)
    assert vact[1] - vact[0] == 0, vact
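# As the asserts above imply, in this pyvad version trim returns the
# (start, end) sample indices of the detected voiced region rather than the
# trimmed waveform, so callers slice the signal themselves. A minimal sketch,
# assuming the same test file:
from librosa import load
from pyvad import trim

data, fs = load("voice/arctic_a0007.wav", sr=None)
start, end = trim(data, fs, fs_vad=16000, hop_length=30, vad_mode=2)
voiced = data[start:end]  # keep only the detected speech segment
print(f"kept {end - start} of {len(data)} samples")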
def preprocessing(y, sr):
    # NOTE: relies on module-level constants (Stride, Window_size, Num_mels,
    # Num_Frame) and on the helpers Feature_DCT_DST, Feature_Hist and
    # Feature_Moments defined elsewhere.

    # Resampling to 16 kHz
    if sr != 16000:
        sr_re = 16000  # target sampling rate
        y = librosa.resample(y, sr, sr_re)
        sr = sr_re

    # Denoising
    y[np.argwhere(y == 0)] = 1e-10
    y_denoise = scipy.signal.wiener(y, mysize=None, noise=None)

    # Pre-emphasis filter
    y_Emphasis = np.append(y_denoise[0], y_denoise[1:] - pre_emphasis * y_denoise[:-1])

    # Normalization (peak)
    y_max = max(y_Emphasis)
    y_Emphasis = y_Emphasis / y_max  # normalize for VAD

    # Voice Activity Detection (VAD)
    vad_mode = 2  # VAD mode = 0 ~ 3
    y_vad = trim(y_Emphasis, sr, vad_mode=vad_mode, thr=0.01)  # trim using VAD module
    if y_vad is None:
        y_vad = y_Emphasis

    # De-normalization
    y_vad = y_vad * y_max

    # Obtain the mel spectrogram
    S = librosa.feature.melspectrogram(y=y_vad, sr=sr, hop_length=int(sr * Stride),
                                       n_fft=int(sr * Window_size),
                                       n_mels=Num_mels, power=2.0)
    r, Frame_length = S.shape

    # Obtain the normalized mel spectrogram
    S_norm = (S - np.mean(S)) / np.std(S)

    # Zero padding
    Input_Mels = np.zeros((r, Num_Frame), dtype=float)
    if Frame_length < Num_Frame:
        Input_Mels[:, :Frame_length] = S_norm[:, :Frame_length]
    else:
        Input_Mels[:, :Num_Frame] = S_norm[:, :Num_Frame]

    # Obtain the log mel spectrogram
    w = 1e+6
    S_mel_log = np.log(1 + w * S)

    # Features
    Input_DCT, Input_DST = Feature_DCT_DST(S_mel_log)
    Input_DCT = np.expand_dims(np.expand_dims(Input_DCT, axis=0), axis=-1)
    Input_DST = np.expand_dims(np.expand_dims(Input_DST, axis=0), axis=-1)
    Input_Hist = np.expand_dims(np.expand_dims(Feature_Hist(S_mel_log), axis=0), axis=-1)
    Input_Moments = np.expand_dims(np.expand_dims(Feature_Moments(S_mel_log), axis=0), axis=-1)

    return Input_Mels, Input_DCT, Input_DST, Input_Hist, Input_Moments, Frame_length
# plt.show()

# (Fragment: assumes data, fs, time and the fs_vads/hops/vad_modes grids from
# the script above, plus matplotlib.pyplot as plt and pyvad's vad and trim.)

# Plot the waveform against the frame-level VAD decisions
for fs_vad, hop, vad_mode in product(fs_vads, hops, vad_modes):
    vact = vad(data, fs, fs_vad=16000, hoplength=30, vad_mode=0)
    fig, ax1 = plt.subplots()
    ax1.plot(time, data, color='b', label='speech waveform')
    ax1.set_xlabel("TIME [s]")
    ax2 = ax1.twinx()
    ax2.plot(time, vact, color="r", label='vad')
    plt.yticks([0, 1], ('unvoice', 'voice'))
    ax2.set_ylim([-0.01, 1.01])
    plt.legend()
    # plt.show()

# Output the trimmed audio
trimed = trim(data, fs, fs_vad=16000, hoplength=30, vad_mode=3)
time = np.linspace(0, len(trimed) / fs, len(trimed))  # time axis
fig, ax1 = plt.subplots()
ax1.plot(time, trimed, color='b', label='speech waveform')
ax1.set_xlabel("TIME [s]")
plt.show()

plt.plot(trimed)
plt.show()
def preprocessing(wav, sampling_rate):
    """
    Args:
        wav: waveform
        sampling_rate: sampling rate

    Returns:
        input_mels
    """
    # Resampling to 16 kHz
    if sampling_rate != 16000:
        sampling_rate_re = 16000  # target sampling rate
        wav = librosa.resample(wav, sampling_rate, sampling_rate_re)
        sampling_rate = sampling_rate_re

    # Denoising
    wav[np.argwhere(wav == 0)] = 1e-10
    wav_denoise = scipy.signal.wiener(wav, mysize=None, noise=None)

    # Pre-emphasis filter
    wav_emphasis = np.append(
        wav_denoise[0], wav_denoise[1:] - PRE_EMPHASIS_COEFF * wav_denoise[:-1])

    # Normalization (peak)
    wav_max = np.abs(wav_emphasis).max() / 0.9
    wav_emphasis = wav_emphasis / wav_max  # normalize for VAD

    # Voice Activity Detection (VAD)
    vad_mode = 2  # VAD mode = 0 ~ 3
    wav_vad = trim(wav_emphasis, sampling_rate, vad_mode=vad_mode, thr=0.01)  # trim
    if wav_vad is None:
        wav_vad = wav_emphasis

    # De-normalization
    wav_vad = wav_vad * wav_max

    # Obtain the spectrogram
    stft_vad = librosa.core.stft(y=wav_vad,
                                 hop_length=int(sampling_rate * STRIDE),
                                 n_fft=int(sampling_rate * WINDOW_SIZE))
    spec = np.abs(stft_vad) ** 2

    # Mel spectrogram
    mel_spec = librosa.feature.melspectrogram(S=spec, n_mels=NUM_MELS)

    # Log-scaled mel spectrogram
    log_weight = 1e+6
    log_mel_spec = np.log(1 + log_weight * mel_spec)
    frame_length = log_mel_spec.shape[1]

    # Zero padding
    input_mels = np.zeros((NUM_MELS, MAX_FRAME_LENGTH), dtype=float)
    if frame_length < MAX_FRAME_LENGTH:
        input_mels[:, :frame_length] = log_mel_spec[:, :frame_length]
    else:
        input_mels[:, :MAX_FRAME_LENGTH] = log_mel_spec[:, :MAX_FRAME_LENGTH]

    return input_mels
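# The variant above relies on module-level constants. A plausible set of
# definitions, mirroring the values hard-coded in extract_feature earlier
# (these are assumptions, not part of the original module):
PRE_EMPHASIS_COEFF = 0.97  # pre-emphasis filter coefficient
STRIDE = 0.01              # hop length (10 ms)
WINDOW_SIZE = 0.025        # analysis window (25 ms)
NUM_MELS = 40              # number of mel filters
MAX_FRAME_LENGTH = 1500    # max number of frames (15 s at a 10 ms hop)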