import os
import uuid

import librosa
import numpy as np
import soundfile as sf
from pyvad import vad


def clean_keyword(audiofile, keyword):
    ''' taken from https://github.com/F-Tag/python-vad/blob/master/example.ipynb '''
    data, fs = librosa.load(audiofile)
    time = np.linspace(0, len(data) / fs, len(data))
    keptfiles = list()
    try:
        vact = vad(data, fs, fs_vad=16000, hop_length=30, vad_mode=3)
        vact = list(vact)
        # pad the activity mask so it matches the sample count
        while len(time) > len(vact):
            vact.append(0.0)

        # collect (start, end) sample indices for each voiced region
        utterances = list()
        start = None
        for i in range(1, len(vact)):
            if vact[i] != vact[i - 1]:  # voice activity changed
                if vact[i] == 1:
                    start = i  # silence -> speech
                elif start is not None:
                    utterances.append([start, i])  # speech -> silence
                    start = None
        print(utterances)

        # write each voiced region to its own temporary file
        tempfiles = list()
        for start, end in utterances:
            trimmed = data[start:end]
            tempfile = str(uuid.uuid4()) + '.wav'
            # librosa.output.write_wav was removed in librosa 0.8; use soundfile
            sf.write(tempfile, trimmed, fs)
            tempfiles.append(tempfile)

        # keep only clips large enough to hold speech whose transcript
        # matches the keyword
        for tempfile in tempfiles:
            if os.path.getsize(tempfile) > 20000:
                transcript = transcribe_audiofile(tempfile)
                print('TRANSCRIPT --> %s' % transcript)
                if transcript == keyword:
                    keptfiles.append(tempfile)
                else:
                    os.remove(tempfile)
            else:
                os.remove(tempfile)
    except ValueError:
        print('ERROR - ValueError: When data.type is float, data must be -1.0 <= data <= 1.0.')

    os.remove(audiofile)
    return keptfiles
def webrtc_segmentation(file_path: str, sample_rate: int, hop_length: int,
                        aggressiveness: int) -> tuple:
    data, fs = librosa.load(file_path, mono=True, sr=sample_rate)
    data /= np.abs(data).max()  # peak-normalise to [-1, 1]
    vact = vad(data, fs, fs_vad=sample_rate, hop_length=hop_length,
               vad_mode=aggressiveness)

    # turn the per-sample 0/1 activity mask into (start, end) sample spans
    segments = list()
    previous = 0
    start = 0
    for idx, pred in enumerate(vact):
        if previous == 0 and pred == 1:
            start = idx
            previous = 1
        elif previous == 1 and pred == 0:
            segments.append((start, idx))
            previous = 0
    if previous == 1:
        # close a segment still open at the end of the file
        # (the original version silently dropped it)
        segments.append((start, len(vact)))
    return segments, data
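# A minimal usage sketch for webrtc_segmentation, assuming 16 kHz mono input
# and the imports above; 'speech.wav' and the output names are placeholders.
segments, data = webrtc_segmentation('speech.wav', sample_rate=16000,
                                     hop_length=30, aggressiveness=2)
for n, (start, end) in enumerate(segments):
    sf.write('segment_%02d.wav' % n, data[start:end], 16000)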
def clean_utterances(audiofile):
    ''' taken from https://github.com/F-Tag/python-vad/blob/master/example.ipynb '''
    data, fs = librosa.load(audiofile)
    time = np.linspace(0, len(data) / fs, len(data))
    vact = vad(data, fs, fs_vad=16000, hop_length=30, vad_mode=3)
    vact = list(vact)
    # pad the activity mask so it matches the sample count
    while len(time) > len(vact):
        vact.append(0.0)

    # collect (start, end) sample indices for each voiced region
    utterances = list()
    start = None
    for i in range(1, len(vact)):
        if vact[i] != vact[i - 1]:  # voice activity changed
            if vact[i] == 1:
                start = i
            elif start is not None:
                utterances.append([start, i])
                start = None
    print(utterances)

    # write each voiced region to its own file
    files = list()
    for start, end in utterances:
        trimmed = data[start:end]
        tempfile = str(uuid.uuid4()) + '.wav'
        # librosa.output.write_wav was removed in librosa 0.8; use soundfile
        sf.write(tempfile, trimmed, fs)
        files.append(tempfile)

    os.remove(audiofile)
    return files
import speechpy
from skimage.util import view_as_windows


def compute_WAPRQ(ref_path, test_path, sr=16000, n_mfcc=12, fmax=5000,
                  patch_size=0.4,
                  sigma=np.array([[1, 1], [3, 2], [1, 3]])):
    # Inputs:
    # ref_path: path of reference speech
    # test_path: path of degraded speech
    # sr: sampling frequency, Hz
    # n_mfcc: number of MFCCs
    # fmax: cutoff frequency
    # patch_size: size of each patch in s
    # sigma: step size condition for DTW
    # Output:
    # WARP-Q quality score between ref_path and test_path

    ####################### Load speech files #################################
    # Load reference speech
    if ref_path[-4:] == '.wav':
        speech_Ref, sr_Ref = librosa.load(ref_path, sr=sr)
    elif ref_path[-4:] == '.SRC':  # for the ITU-T database, if applicable
        speech_Ref, sr_Ref = sf.read(ref_path, format='RAW', channels=1,
                                     samplerate=16000, subtype='PCM_16',
                                     endian='LITTLE')
        if sr_Ref != sr:
            speech_Ref = librosa.resample(speech_Ref, orig_sr=sr_Ref, target_sr=sr)
            sr_Ref = sr

    # Load coded (degraded) speech
    if test_path[-4:] == '.wav':
        speech_Coded, sr_Coded = librosa.load(test_path, sr=sr)
    elif test_path[-4:] == '.OUT':  # for the ITU-T database, if applicable
        speech_Coded, sr_Coded = sf.read(test_path, format='RAW', channels=1,
                                         samplerate=16000, subtype='PCM_16',
                                         endian='LITTLE')
        if sr_Coded != sr:
            speech_Coded = librosa.resample(speech_Coded, orig_sr=sr_Coded, target_sr=sr)
            sr_Coded = sr

    if sr_Ref != sr_Coded:
        raise ValueError(
            "Reference and degraded signals should have the same sampling rate!")

    # Make sure amplitudes are in the range [-1, 1]; otherwise clip to [-1, 1]
    # after resampling (if applicable). We experienced this issue with the
    # TCD-VOIP database only.
    speech_Ref = np.clip(speech_Ref, -1.0, 1.0)
    speech_Coded = np.clip(speech_Coded, -1.0, 1.0)

    ###########################################################################

    win_length = int(0.032 * sr)  # 32 ms frame
    hop_length = int(0.004 * sr)  # 4 ms overlap
    #hop_length = int(0.016*sr)
    n_fft = 2 * win_length
    lifter = 3

    # DTW parameters
    Metric = 'euclidean'

    # VAD parameters
    hop_size_vad = 30
    sr_vad = sr
    aggresive = 0

    # VAD for reference speech
    vact1 = vad(speech_Ref, sr, fs_vad=sr_vad, hop_length=hop_size_vad,
                vad_mode=aggresive)
    speech_Ref_vad = speech_Ref[vact1 == 1]

    # VAD for coded speech
    vact2 = vad(speech_Coded, sr, fs_vad=sr_vad, hop_length=hop_size_vad,
                vad_mode=aggresive)
    speech_Coded_vad = speech_Coded[vact2 == 1]

    # Compute MFCC features for the two signals
    mfcc_Ref = librosa.feature.mfcc(y=speech_Ref_vad, sr=sr, n_mfcc=n_mfcc,
                                    fmax=fmax, n_fft=n_fft,
                                    win_length=win_length,
                                    hop_length=hop_length, lifter=lifter)
    mfcc_Coded = librosa.feature.mfcc(y=speech_Coded_vad, sr=sr, n_mfcc=n_mfcc,
                                      fmax=fmax, n_fft=n_fft,
                                      win_length=win_length,
                                      hop_length=hop_length, lifter=lifter)

    # Feature normalisation using the CMVNW method
    mfcc_Ref = speechpy.processing.cmvnw(mfcc_Ref.T, win_size=201,
                                         variance_normalization=True).T
    mfcc_Coded = speechpy.processing.cmvnw(mfcc_Coded.T, win_size=201,
                                           variance_normalization=True).T

    # Divide MFCC features of coded speech into patches
    cols = int(patch_size / (hop_length / sr))
    window_shape = (np.size(mfcc_Ref, 0), cols)
    step = int(cols / 2)
    mfcc_Coded_patch = view_as_windows(mfcc_Coded, window_shape, step)

    Acc = []
    band_rad = 0.25
    weights_mul = np.array([1, 1, 1])

    # Compute alignment cost between each patch and the reference MFCCs
    for i in range(mfcc_Coded_patch.shape[1]):
        patch = mfcc_Coded_patch[0][i]
        D, P = librosa.sequence.dtw(X=patch, Y=mfcc_Ref, metric=Metric,
                                    step_sizes_sigma=sigma,
                                    weights_mul=weights_mul,
                                    band_rad=band_rad, subseq=True,
                                    backtrack=True)
        P_librosa = P[::-1, :]
        b_ast = P_librosa[-1, 1]
        Acc.append(D[-1, b_ast] / D.shape[0])

    # Final score
    return np.median(Acc)
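# A minimal call sketch for compute_WAPRQ; 'ref.wav' and 'degraded.wav' are
# placeholder paths to two 16 kHz speech files. A lower median DTW alignment
# cost means the degraded signal tracks the reference more closely.
score = compute_WAPRQ('ref.wav', 'degraded.wav')
print('WARP-Q score: %.3f' % score)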
def detect_voice(audio, sr):
    # the original call passed a misspelled 'hoplength' keyword, which raises
    # a TypeError; pyvad expects hop_length
    vact = vad(audio, sr, hop_length=30, vad_mode=3)
    return vact
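# pyvad returns a per-sample 0/1 activity mask the same length as the input,
# so speech-only audio falls out of a boolean index. A small sketch, assuming
# a placeholder file 'speech.wav':
audio, sr = librosa.load('speech.wav', sr=16000)
vact = detect_voice(audio, sr)
voiced_only = audio[vact == 1]  # drop all unvoiced samples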
import sys

import librosa
import numpy as np
import pyvad

# Fragment: `target_path`, `file`, and the reference vocals `yr_vocals` are
# defined earlier in the source script.
temp = np.load(target_path + '/vocals/' + file[:-9] + '.npz')
ye_vocals = temp['arr_0']
temp = np.load(target_path + '/accompaniment/' + file[:-9] + '.npz')
ye_accomp = temp['arr_0']

# Calculation of PES (predicted energy at silence) per track (input / 4 segments)
print("Estimating predicted energy at silence...")
ln = len(ye_vocals)
seglen = int(sys.argv[3])
for i in range(0, ln - seglen, seglen):
    y = librosa.resample(yr_vocals[i:i + seglen], orig_sr=22050, target_sr=16000)
    vact_labels = pyvad.vad(y, 16000, hop_length=20, vad_mode=3)
    # vocal activity labels of the reference track, one label per 320 samples
    vact_labels = vact_labels[0:-1:320]
    ye = librosa.resample(ye_vocals[i:i + seglen], orig_sr=22050, target_sr=16000)
    veact_labels = pyvad.vad(ye, 16000, hop_length=20, vad_mode=3)
    # vocal activity labels of the estimated track
    veact_labels = veact_labels[0:-1:320]
    l = len(vact_labels)
    # True when the reference segment is mostly voiced
    vact_temp = np.mean(vact_labels) > 0.5
    if (vact_temp == 0):
import threading
import time

import numpy as np
from pyvad import vad

# Fragment: `listen` (the microphone producer), `buffer` (a bounded deque of
# audio chunks), `RATE`, `embedder`, and `cluster_obj` are defined elsewhere
# in the source script.
if __name__ == '__main__':
    listen_th = threading.Thread(target=listen)
    listen_th.daemon = True
    listen_th.start()

    times = list()  # profile the prediction inference speed
    try:
        print("Listening...")
        while True:
            if len(buffer) < buffer.maxlen:
                continue  # wait until the ring buffer has filled
            audio = np.hstack(list(buffer))
            try:
                vact = vad(audio, RATE, fs_vad=RATE, hop_length=30, vad_mode=1)
            except Exception:
                continue  # skip when the audio clips outside [-1, 1]
            if np.mean(vact) > 0.5:  # mostly voiced: identify the speaker
                audio /= np.max(np.abs(audio))
                audio = audio[None, :]
                now = time.time()
                embeddings = embedder.run(audio)
                prediction = cluster_obj.update_predict(np.squeeze(embeddings))
                times.append(time.time() - now)
                print(" ", end="\r")
                print(f"Detected speaker {prediction}", end="\r")
            else:
                print(" ", end="\r")  # clear the status line while silent
    except KeyboardInterrupt:
        pass  # the source excerpt ends here
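# The loop above assumes a `listen` thread that keeps `buffer` filled with
# float32 microphone chunks. A minimal sketch of such a producer, assuming
# PyAudio and a 16 kHz mono stream; `RATE`, `CHUNK`, and the deque size are
# illustrative values, not taken from the source.
import collections

import numpy as np
import pyaudio

RATE = 16000   # assumed sample rate matching the consumer loop
CHUNK = 1024   # frames per read
buffer = collections.deque(maxlen=16)  # roughly 1 s of audio at 16 kHz


def listen():
    """Continuously push float32 microphone chunks into the ring buffer."""
    pa = pyaudio.PyAudio()
    stream = pa.open(format=pyaudio.paFloat32, channels=1, rate=RATE,
                     input=True, frames_per_buffer=CHUNK)
    while True:
        raw = stream.read(CHUNK, exception_on_overflow=False)
        buffer.append(np.frombuffer(raw, dtype=np.float32))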
import numpy as np
import matplotlib.pyplot as plt
from itertools import product
from librosa import load
from pyvad import vad

fs_vads = (8000, 16000, 32000, 48000)
hops = (10, 20, 30)
vad_modes = (0, 1, 2, 3)

#name = 'E:/项目/ASRInLesson/lessons/test05mi.wav'
name = 'E:/项目/ASRInLesson/same_segment/audio77_T.wav'
data, fs = load(name, sr=None)

time = np.linspace(0, len(data) / fs, len(data))  # time axis
fig, ax0 = plt.subplots()
plt.plot(time, data)
#plt.show()

for fs_vad, hop, vad_mode in product(fs_vads, hops, vad_modes):
    # use the loop parameters (the original called vad() with fixed values
    # and a misspelled 'hoplength' keyword)
    vact = vad(data, fs, fs_vad=fs_vad, hop_length=hop, vad_mode=vad_mode)
    fig, ax1 = plt.subplots()
    ax1.plot(time, data, color='b', label='speech waveform')
    ax1.set_xlabel("TIME [s]")
    ax2 = ax1.twinx()
    ax2.plot(time, vact, color="r", label='vad')
    plt.yticks([0, 1], ('unvoice', 'voice'))
    ax2.set_ylim([-0.01, 1.01])
    plt.legend()
    #plt.show()

# Output the trimmed audio
from itertools import product

from librosa import load
from pyvad import vad

fs_vads = (8000, 16000, 32000, 48000)
hops = (10, 20, 30)
vad_modes = (0, 1, 2, 3)
fss = [16000, 22050]

name = "voice/arctic_a0007.wav"
for fs in fss:
    data, fs_r = load(name, sr=fs)
    for fs_vad, hop, vad_mode in product(fs_vads, hops, vad_modes):
        # print(fs, fs_vad, hop, vad_mode)
        vact = vad(data, fs_r, fs_vad=fs_vad, hop_length=hop, vad_mode=vad_mode)
        # most of this clean speech file should be flagged as voiced
        assert vact.sum() > data.size // 2, vact.sum()

"""
import matplotlib.pyplot as plt
plt.plot(data)
plt.plot(vact)
plt.savefig(("voice_"+str(fs_r)+str(fs_vad)+str(hop)+str(vad_mode)+".png"))
plt.close()
"""

"""
data = (np.random.rand(fs*3)-0.5)*0.1
for fs_vad, hop, vad_mode in product(fs_vads, hops, vad_modes):
    print(fs, fs_vad, hop, vad_mode)
    vact = vad(data, fs, fs_vad=fs_vad, hop_length=hop, vad_mode=vad_mode)
"""