def apriori_SNR(Noisy, Clean, mask=True):
    """Compute the a-priori SNR between clean speech and estimated noise.

    Parameters
    ----------
    Noisy : ndarray
        Noisy spectrogram in dB (converted to power internally).
    Clean : ndarray
        Clean spectrogram in dB (converted to power internally).
    mask : bool
        If True, return a sigmoidal mapping of the mean-shifted SNR in
        (0, 1); otherwise return the mean-shifted SNR itself.

    Returns
    -------
    ndarray
        Sigmoidal mask or mean-shifted a-priori SNR, depending on ``mask``.
    """
    m_ibm = []
    Noisy = librosa.db_to_power(Noisy)
    Clean = librosa.db_to_power(Clean)
    # Estimate the noise power as (noisy - clean).
    N = np.subtract(Noisy, Clean)
    # Small offsets to avoid divide-by-zero / log(0).
    N[N == 0] += 0.000001
    Clean[Clean == 0] += 0.000001
    apisnr = 20 * np.log10(
        np.divide(Clean, N, out=np.zeros_like(Noisy), where=N != 0))
    # Shift towards zero mean; NaNs (from invalid divisions) become 100.
    apisnr = np.nan_to_num(apisnr, nan=100)
    me = np.mean(apisnr[apisnr <= 50])
    # FIX: the message claimed the threshold was 80 while the code averages
    # values <= 50 — report the threshold actually used.
    print("MEAN OF A PRIORI SNR <= 50: " + str(me))
    apisnr = np.subtract(apisnr, me)
    # Sigmoidal mapping function squashes the SNR into (0, 1).
    if mask:
        m_ibm = np.divide(1, (1 + np.exp(-0.1 * apisnr)))
        return m_ibm
    else:
        return apisnr
def create_scale_mask(vocal_spec, bg_spec):
    """Build a soft ratio mask from log (dB) spectrograms.

    Each TF bin holds the fraction of power attributed to the vocal
    source: close to 1 where vocals dominate, close to 0 otherwise.
    """
    vocal_power = librosa.db_to_power(vocal_spec.numpy())
    bg_power = librosa.db_to_power(bg_spec.numpy())
    ratio = vocal_power / (vocal_power + bg_power)
    return np.array(ratio, dtype=np.float32)
def IBM2(Clean, Noisy, mask=True):
    """Ideal binary mask computed in the power domain (no log conversion).

    Parameters
    ----------
    Clean, Noisy : ndarray
        Spectrograms in dB; converted to power internally.
    mask : bool
        If True, binarize the ratio into {0, 1}; otherwise return the
        raw clean/noise ratio (with -80 where the noise estimate is 0).

    Returns
    -------
    ndarray
        Binary mask (int) or raw ratio, depending on ``mask``.
    """
    # FIX: removed the unused local `M = []`.
    Noisy = librosa.db_to_power(Noisy)
    Clean = librosa.db_to_power(Clean)
    N = np.subtract(Noisy, Clean)  # estimated noise power
    # Bins where N == 0 fall back to -80 so they binarize to 0 below.
    m_ibm = np.divide(Clean, N, out=(np.ones_like(Noisy) * -80), where=N != 0)
    if mask:
        # 1 where clean power >= noise power, else 0.
        m_ibm = (m_ibm >= 0).astype(int)
    return m_ibm
def IRM2_noisemask(N, S):
    """Per-frame noise mask: noise power divided by noisy power.

    N and S are dB spectrograms (noisy and clean); returns a list of
    per-frame arrays, with 1.0 wherever the noise estimate is zero.
    """
    N = librosa.db_to_power(N)
    S = librosa.db_to_power(S)
    noise = N - S
    return [
        np.divide(noise[k], N[k], out=np.ones_like(N[k]), where=noise[k] != 0)
        for k in range(len(S))
    ]
def IRM_lit(S, N):
    """Ideal ratio mask with exponent beta = 0.5, on power spectra."""
    beta = 0.5
    S = librosa.db_to_power(S)
    N = librosa.db_to_power(N)
    # Nudge exact zeros so the per-frame ratio stays finite.
    N[N == 0] += 0.00000001
    frames = [
        np.divide(S[k], N[k], out=np.zeros_like(S[k]), where=N[k] != 0)
        for k in range(len(S))
    ]
    return np.power(np.array(frames), beta)
def istft(spect, win_length, hop_length, window):
    """Invert a dB-scaled spectrogram back to a time-domain signal."""
    ref = np.max(spect)
    # NOTE(review): `ref` is the maximum of the dB-scaled spectrogram, but
    # librosa.db_to_power expects a linear power reference — confirm this is
    # the intended inverse of the forward power_to_db(..., ref=...) call.
    spect = librosa.db_to_power(spect, ref=ref)
    # NOTE(review): librosa.istft expects a complex STFT matrix; passing a
    # real power spectrogram discards phase entirely — verify the caller
    # supplies phase or intends this. `windows` is a module-level mapping
    # from window name to window function, defined elsewhere in the file.
    return librosa.istft(spect,
                         hop_length=hop_length,
                         win_length=win_length,
                         window=windows[window])
def save_feature(num_snr, i_speech: int, s_path_speech: str,
                 speech: ndarray) -> tuple:
    """Create `num_snr` noisy variants of `speech` at random SNRs in (-6, 0] dB.

    Parameters
    ----------
    num_snr : int
        Number of noisy copies to generate.
    i_speech : int
        Speech index (kept for interface compatibility; unused here).
    s_path_speech : str
        Path of the source speech file, stored in each output dict.
    speech : ndarray
        Time-domain speech signal.

    Returns
    -------
    tuple
        (list of SNR values in dB, list of feature dicts).
    """
    spec_clean = np.ascontiguousarray(librosa.stft(speech, **hp.kwargs_stft))
    mag_clean = np.ascontiguousarray(np.abs(spec_clean)[..., np.newaxis])
    signal_power = np.mean(np.abs(speech)**2)
    list_dict = []
    list_snr_db = []
    # FIX: iterate the range directly; enumerate() only added an unused index.
    for _ in range(num_snr):
        snr_db = -6 * np.random.rand()  # random SNR in (-6, 0] dB
        list_snr_db.append(snr_db)
        snr = librosa.db_to_power(snr_db)
        noise_power = signal_power / snr
        # Additive white Gaussian noise scaled to the target SNR.
        noisy = speech + np.sqrt(noise_power) * np.random.randn(len(speech))
        spec_noisy = librosa.stft(noisy, **hp.kwargs_stft)
        spec_noisy = np.ascontiguousarray(spec_noisy)
        list_dict.append(
            dict(spec_noisy=spec_noisy,
                 speech=speech,
                 spec_clean=spec_clean,
                 mag_clean=mag_clean,
                 path_speech=s_path_speech,
                 length=len(speech),
                 )
        )
    return list_snr_db, list_dict
def intensify(self, spectrogram, inverse=False):
    """Convert a power spectrogram to dB, or back when `inverse` is set."""
    if inverse:
        return librosa.db_to_power(spectrogram)
    return librosa.power_to_db(spectrogram)
def griffin_lim_aud(self, spec, save_audio=False):
    """Invert a mel spectrogram to audio via Griffin-Lim; optionally save a wav.

    Parameters
    ----------
    spec : ndarray
        Mel spectrogram; in dB when `self.config['use_logMel']` is set.
    save_audio : bool
        When True, write the waveform under the visualization directory.

    Returns
    -------
    ndarray
        Reconstructed time-domain signal.
    """
    # FIX: dropped the no-op `else: spec = spec` branch.
    if self.config['use_logMel']:
        spec = librosa.db_to_power(spec)
    y = librosa.feature.inverse.mel_to_audio(
        spec,
        sr=self.config['resampled_rate'],
        n_fft=self.config['n_fft'],
        hop_length=self.config['hop_length'],
        win_length=self.config['win_length'])
    if save_audio:
        savepath = os.path.join(self.config['vis_dir'],
                                'Mel_{}'.format(str(self.n_mels)))
        os.makedirs(savepath, exist_ok=True)
        savepath = os.path.join(savepath, 'epoch_{}.wav'.format(self.epoch))
        soundfile.write(savepath, y,
                        samplerate=self.config['resampled_rate'])
    return y
def _mel_2_audio(mel, sr=44100, n_fft=2048, hop_length=512, do_power=True):
    """Invert a (optionally dB-scaled) mel spectrogram to a flat waveform."""
    spec = librosa.db_to_power(mel) if do_power else mel
    audio = librosa.feature.inverse.mel_to_audio(spec,
                                                 sr=sr,
                                                 n_fft=n_fft,
                                                 hop_length=hop_length)
    # Peak-normalize, then hard-limit to the valid [-1, 1] range.
    audio = normalize([audio], norm="max")
    return np.clip(audio, -1, 1).flatten()
def feature_to_audio(cfg, mel_spec, phs):
    """Reconstruct a waveform from a dB mel spectrogram and a phase matrix.

    Parameters
    ----------
    cfg : dict
        Must provide 'SR', 'FFT_LEN', 'HOP_LEN' and 'PREEMPH'.
    mel_spec : ndarray
        Mel spectrogram in dB.
    phs : ndarray
        Phase matrix matching the full STFT shape.

    Returns
    -------
    ndarray
        Reconstructed, de-emphasized time-domain signal.
    """
    # FIX: removed the commented-out loading/MFCC code and the needless
    # `j = 1j` temporary.
    # Mel-spectrogram (dB) -> linear power -> full magnitude spectrogram.
    mag = librosa.feature.inverse.mel_to_stft(
        M=librosa.db_to_power(mel_spec), sr=cfg['SR'], n_fft=cfg['FFT_LEN'])
    # Combine magnitude with the supplied phase and invert via ISTFT.
    D = mag * np.cos(phs) + 1j * mag * np.sin(phs)
    audio = librosa.core.istft(stft_matrix=D,
                               win_length=cfg['FFT_LEN'],
                               hop_length=cfg['HOP_LEN'],
                               window='hamming')
    # Undo the pre-emphasis filter applied during feature extraction.
    audio = signal.lfilter([1], [1, -cfg['PREEMPH']], audio)
    return audio
def melspec_to_audio(self, mel_spectrogram, log=True, phase=None,
                     transpose=True, audio_out=True):
    """Invert a mel spectrogram to audio via the pseudo-inverse mel filter.

    Reuses the given `phase` if available, otherwise estimates phase
    with Griffin-Lim; wraps the result in an Audio widget by default.
    """
    if transpose:
        mel_spectrogram = mel_spectrogram.T
    if log:
        mel_spectrogram = librosa.db_to_power(mel_spectrogram)
    # Power -> magnitude.
    mel_spectrogram = mel_spectrogram**0.5
    # Undo the mel projection with the pseudo-inverse filterbank.
    magnitude = np.dot(np.linalg.pinv(self._MEL_FILTER), mel_spectrogram)
    if phase is None:
        # No phase available: estimate it with Griffin-Lim.
        inverted_signal = griffin_lim(magnitude, self._N_FFT,
                                      self._HOP_LENGTH, n_iterations=10)
    else:
        inverted_signal = librosa.istft(magnitude * phase,
                                        hop_length=self._HOP_LENGTH)
    if audio_out:
        return Audio(inverted_signal, rate=self._SAMPLE_RATE)
    return inverted_signal
def spectrogram_inversion(melspec, sr, fmin, fmax, use_db=True):
    """Turn a mel spectrogram (optionally in dB) back into audio."""
    power_spec = librosa.db_to_power(melspec) if use_db else melspec
    return librosa.feature.inverse.mel_to_audio(power_spec,
                                                sr=sr,
                                                fmin=fmin,
                                                fmax=fmax)
def res(train_loader, validation_loader, test_loader, num):
    """Run the trained U-Net mask model on one test batch, plot the
    mixture / vocal / prediction spectrograms, and return reconstructed
    waveforms for the mixture, ground truth and prediction.

    NOTE(review): relies on module-level `Model`, `inputs`, `outputs`,
    `plt` and `mel_converter` defined elsewhere; `train_loader` and
    `validation_loader` are accepted but unused here — confirm intent.
    """
    ### generator for model
    def data_generator(data_loader):
        # Endless generator yielding (mix, target) pairs with a trailing
        # channel axis added for the network input.
        while True:
            for index, data_item in enumerate(data_loader):
                yield np.expand_dims(np.array(data_item['mix']),
                                     -1), np.expand_dims(
                                         np.array(data_item['target']), -1)

    test_generator = data_generator(test_loader)
    X_test, y_test = next(test_generator)
    ### origin dataset
    # Grab only the first batch; the loop breaks immediately.
    for index, data_item in enumerate(test_loader):
        if index == 0:
            break
    vocal = data_item['vocal'][num]
    mix = data_item['mix'][num]
    bg = data_item['bg'][num]
    target = data_item['target'][num]
    predict_model = Model(inputs=[inputs], outputs=[outputs])
    predict_model.load_weights('./model/unet_mask.h5')
    pre_mask = predict_model.predict(X_test)
    # Convert the dB-scaled network input back to linear power.
    mix_amplitude = librosa.db_to_power(X_test[num, :, :, 0])
    plt.figure()
    plt.imshow(mix, aspect='auto', origin='lower')
    plt.tight_layout()
    plt.show()
    plt.figure()
    plt.imshow(vocal, aspect='auto', origin='lower')
    plt.tight_layout()
    plt.show()
    # Apply the predicted mask to the mixture power spectrogram.
    pre_spec = np.array(mix_amplitude * pre_mask[num, :, :, 0],
                        dtype=np.float32)
    plt.figure()
    plt.imshow(librosa.power_to_db(pre_spec), aspect='auto', origin='lower')
    plt.tight_layout()
    plt.show()
    plt.figure()
    plt.imshow(pre_mask[num, :, :, 0], aspect='auto', origin='lower')
    plt.tight_layout()
    plt.show()
    # Reconstruct waveforms for mixture, ground truth and prediction.
    mix_signal = mel_converter.m(mix.numpy(), log=True, audio_out=True)
    groudtruth_signal = mel_converter.m(vocal.numpy(), log=True,
                                        audio_out=True)
    pre_signal = mel_converter.m(librosa.power_to_db(pre_spec), log=True,
                                 audio_out=True)
    return mix_signal, groudtruth_signal, pre_signal
def deprep(S):
    """Undo normalization/dB scaling on a spectrogram and invert it to audio.

    Uses the module-level GRAD inverter against `melspecfunc`.
    """
    spec = denormalize(S) + ref_level_db
    spec = librosa.db_to_power(spec)
    # Gradient-based inversion against the mel-spectrogram objective.
    wav = GRAD(np.expand_dims(spec, 0), melspecfunc,
               maxiter=2000, evaiter=10, tol=1e-8)
    return np.array(np.squeeze(wav))
def stft_inversion(inputs):
    """
    Inverse melspectrograms by reusing the phase
    Parameters:
        inputs: tuple (melspecs, stft_mixture)
            melspecs: list of ndarray
                MelSpectrograms to invert
            stft_mixture: ndarray
                STFT of the mixture to separate
    Returns:
        i_melspecs: list of ndarray
    """
    # NOTE(review): `wiener_filter`, `args`, `sr`, `fmin`, `fmax`, `n_fft`,
    # `hop_length`, `complex_array` and `single_channel_wiener_filter` are
    # module-level names defined elsewhere in the file.
    melspecs, stft_mixture = inputs
    n_src = len(melspecs)
    # Wiener filtering only makes sense with more than one source.
    use_wiener_filter = wiener_filter and (n_src > 1)
    melspecs, stft_mixture = np.array(melspecs), np.array(stft_mixture)
    mel_stfts = []
    i_melspecs = []
    if args.scale == "dB":
        # Undo the dB scaling before the mel inversion.
        melspecs = librosa.db_to_power(melspecs)
    for i in range(len(melspecs)):
        mel_stft = librosa.feature.inverse.mel_to_stft(melspecs[i],
                                                       sr=sr,
                                                       fmin=fmin,
                                                       fmax=fmax,
                                                       n_fft=n_fft)
        if use_wiener_filter:
            # The Wiener filter operates on power spectrograms.
            mel_stft = mel_stft**2
        mel_stfts.append(mel_stft)
    mel_stfts = np.array(mel_stfts)
    if use_wiener_filter:
        stft_complexs = single_channel_wiener_filter(
            mel_stfts, stft_mixture)
    for i in range(len(melspecs)):
        if use_wiener_filter:
            stft_complex = stft_complexs[i]
        else:
            # Reuse the mixture phase with the estimated magnitude.
            stft_complex = complex_array(mel_stfts[i],
                                         np.angle(stft_mixture))
        istft = librosa.istft(stft_complex, hop_length=hop_length)
        i_melspecs.append(istft)
    return i_melspecs
def mlsp2wav(sound, sr, fft_size, hop_length):
    """Convert a normalized log-mel spectrogram back to a waveform.

    Returns (waveform, linear-power mel spectrogram).
    """
    import librosa
    if torch.is_tensor(sound):
        sound = to_np(sound)
    # Rescale the normalized values back to dB, then to linear power.
    sound_mel = librosa.db_to_power(np.multiply(sound, -80))
    sound_wav = librosa.feature.inverse.mel_to_audio(sound_mel,
                                                     sr=sr,
                                                     n_fft=fft_size,
                                                     hop_length=hop_length)
    return sound_wav, sound_mel
def extract_audio(Z, feature, params):
    """Invert a feature matrix back to audio.

    Parameters
    ----------
    Z : ndarray
        Feature matrix (dB-scaled for Stft/Mel/Cqt; raw MFCCs for Mfcc).
        If normalized, un-normalize before calling this function.
    feature : str
        One of "Stft", "Mel", "Cqt", "Mfcc".
    params : dict
        Extraction parameters (sample rate, window/hop lengths, ...).

    Returns
    -------
    tuple
        (waveform, sample rate).

    Raises
    ------
    ValueError
        If `feature` is not one of the supported names.
    """
    # convert to audio
    if feature == "Stft":
        # undo log-magnitude scaling
        S = librosa.db_to_amplitude(Z)
        # upsample
        S = _upsample_fft(S, params["fft_sample_rate"],
                          params["stft_window_length"])
        yhat = librosa.griffinlim(S, hop_length=params["stft_hop_length"])
    elif feature == "Mel":
        # undo log-power scaling
        S = librosa.db_to_power(Z)
        yhat = librosa.feature.inverse.mel_to_audio(
            S,
            sr=params["fft_sample_rate"],
            n_fft=params["stft_window_length"],
            hop_length=params["stft_hop_length"],
        )
    elif feature == "Cqt":
        # undo log-amplitude scaling
        S = librosa.db_to_amplitude(Z)
        yhat = librosa.griffinlim_cqt(
            S,
            sr=params["fft_sample_rate"],
            hop_length=params["stft_hop_length"],
            fmin=librosa.note_to_hz(params["cqt_min_frequency"]),
        )
    elif feature == "Mfcc":
        yhat = librosa.feature.inverse.mfcc_to_audio(
            Z,
            n_mels=params["frequency_bins"],
            sr=params["fft_sample_rate"],
            n_fft=params["stft_window_length"],
            hop_length=params["stft_hop_length"],
        )
    else:
        # FIX: raise instead of printing and returning -1 — the original
        # comment asked to "throw/raise something", and callers unpacking
        # the (audio, rate) tuple would have crashed on -1 anyway.
        raise ValueError("Error: feature invalid: {!r}".format(feature))
    return yhat, params["fft_sample_rate"]
def reconstruct_wave(spec, rate=16000, normalize_data=False):
    """
    Reconstruct waveform from a mel spectrogram.

    spec: spectrogram generated using Librosa (dB, ref=5.0)
    rate: sampling rate
    normalize_data: if True, peak-normalize the output
    """
    power = librosa.db_to_power(spec, ref=5.0)
    audio = librosa.feature.inverse.mel_to_audio(power,
                                                 sr=rate,
                                                 n_fft=2048,
                                                 hop_length=512)
    if normalize_data:
        # FIX: normalize by the peak magnitude rather than the signed
        # maximum — np.max(audio) can be negative or smaller than the
        # negative peak, flipping or clipping the signal. Also guard
        # against an all-zero signal (division by zero).
        peak = np.max(np.abs(audio))
        if peak > 0:
            audio = audio / peak
    return audio
def play_spectrogram(self, spectrogram, sr=22050, n_fft=1024,
                     hop_length=256):
    """Render a dB mel spectrogram as playable audio (IPython Audio).

    NOTE(review): the `n_fft` and `hop_length` parameters are accepted but
    ignored — the inversion uses `self.hop_length * 4` and
    `self.hop_length` instead. Confirm whether the parameters should be
    honored or removed from the signature.
    """
    array = np.asarray(spectrogram)
    # dB -> linear power with a unit reference.
    mels = librosa.db_to_power(array, ref=1)
    return Audio(
        librosa.feature.inverse.mel_to_audio(mels,
                                             sr=sr,
                                             n_fft=self.hop_length * 4,
                                             hop_length=self.hop_length),
        rate=sr)
def writeAudio(spectrogram, sample_rate, mean, sigma, output_file_name):
    '''
    For given normalized mel spectrogram, sample_rate, mean and standard
    deviation of the original recording, write .wav audiofile with the
    name output_file_name + '.wav'
    '''
    # Undo the z-score normalization, then the dB scaling.
    denorm = spectrogram * sigma + mean
    power_spec = librosa.db_to_power(denorm)
    audio_signal = librosa.feature.inverse.mel_to_audio(power_spec,
                                                        sr=sample_rate)
    sf.write(output_file_name + '.wav', audio_signal, sample_rate)
def apriori_SNR(Noisy, Clean, mask=True):
    """A-priori SNR between clean speech and estimated noise.

    With mask=True the mean-shifted SNR is squashed through a sigmoid
    into (0, 1); otherwise the mean-shifted SNR itself is returned.
    """
    Noisy = librosa.db_to_power(Noisy)
    Clean = librosa.db_to_power(Clean)
    # Noise power estimated as (noisy - clean).
    noise = np.subtract(Noisy, Clean)
    # Small offsets keep the division and log finite.
    noise[noise == 0] += 0.000001
    Clean[Clean == 0] += 0.000001
    ratio = np.divide(Clean, noise, out=np.zeros_like(Noisy),
                      where=noise != 0)
    apisnr = 20 * np.log10(ratio)
    # Replace NaNs from invalid divisions, then shift toward zero mean.
    apisnr = np.nan_to_num(apisnr, nan=100)
    me = np.mean(apisnr[apisnr <= 80])
    print("MEAN OF A PRIORI SNR <= 80: " + str(me))
    apisnr = np.subtract(apisnr, me)
    if mask == True:
        # Sigmoid maps the shifted SNR into (0, 1).
        return np.divide(1, (1 + np.exp(-0.1 * apisnr)))
    return apisnr
def mel_to_audio(self, mel):
    """Invert a normalized mel spectrogram (torch tensor) to a trimmed waveform."""
    spec_db = mel.T.numpy()
    # Undo the [0, 1] normalization back to dB.
    spec_db = np.clip(spec_db, 0, 1) * self.max_db - self.max_db + self.ref_db
    linear = librosa.db_to_power(spec_db, ref=self.max_db)
    audio = librosa.feature.inverse.mel_to_audio(
        linear,
        sr=self.sampling_rate,
        n_fft=self.n_fft,
        hop_length=self.hop_length,
        win_length=self.win_length,
    )
    # Undo pre-emphasis, then strip leading/trailing silence.
    audio = lfilter([1], [1, -self.preemphasis], audio)
    trimmed, _ = librosa.effects.trim(audio)
    return trimmed
def setup_noise_augmented_dataset(files_list, num_snr, kwargs_stft, dest,
                                  desc):
    """Create `num_snr` noisy copies of each listed utterance and save the
    (noisy, clean) STFT pairs as .npz files under `dest`.

    Parameters
    ----------
    files_list : str
        Path to a text file; each line is "audio_path|..." .
    num_snr : int
        Noisy copies per utterance, each at a random SNR in (-6, 0] dB.
    kwargs_stft : dict
        Keyword arguments forwarded to librosa.stft.
    dest : str
        Output directory (must not already exist).
    desc : str
        Progress-bar description.

    Returns
    -------
    int
        Number of speech files processed.
    """
    # FIX: removed the commented-out torch.from_numpy lines and replaced
    # `i_speech = i_speech + 1` with the augmented assignment.
    os.makedirs(dest)
    with open(files_list, 'r') as list_file:
        all_lines = [line for line in list_file]
    list_file_pbar = tqdm(all_lines, desc=desc, dynamic_ncols=True)
    i_speech = 0
    for line in list_file_pbar:
        audio_file = line.split('|')[0]
        speech = sf.read(audio_file)[0].astype(np.float32)
        spec_clean = np.ascontiguousarray(
            librosa.stft(speech, **kwargs_stft))
        mag_clean = np.ascontiguousarray(
            np.abs(spec_clean)[..., np.newaxis])
        signal_power = np.mean(np.abs(speech)**2)
        # View the complex STFT as interleaved (real, imag) float pairs.
        y = spec_clean.view(dtype=np.float32).reshape(
            (*spec_clean.shape, 2))
        T_y = spec_clean.shape[1]
        for k in range(num_snr):
            snr_db = -6 * np.random.rand()  # random SNR in (-6, 0] dB
            snr = librosa.db_to_power(snr_db)
            noise_power = signal_power / snr
            # Additive white Gaussian noise scaled to the target SNR.
            noisy = speech + np.sqrt(noise_power) * np.random.randn(
                len(speech))
            spec_noisy = librosa.stft(noisy, **kwargs_stft)
            spec_noisy = np.ascontiguousarray(spec_noisy)
            T_x = spec_noisy.shape[1]
            x = spec_noisy.view(dtype=np.float32).reshape(
                (*spec_noisy.shape, 2))
            mdict = dict(x=x,
                         y=y,
                         y_mag=mag_clean,
                         path_speech=audio_file,
                         length=len(speech),
                         T_x=T_x,
                         T_y=T_y)
            np.savez(
                f"{dest}/audio_{i_speech}_{k}.npz",
                **mdict,
            )
        i_speech += 1
    return i_speech
def spectrogram_to_audio(data, n_mels, n_frames, sr, n_fft, hop_length,
                         fmin, fmax):
    """Reshape a flat, [0, 1]-normalized mel spectrogram and invert it to audio.

    Returns (linear-power mel spectrogram, waveform).
    """
    mel = np.reshape(data, (n_mels, n_frames))
    # Map the [0, 1] normalization back to dB in [-80, 0], then to power.
    mel = -(1 - mel) * 80
    mel = librosa.db_to_power(mel)
    waveform = librosa.feature.inverse.mel_to_audio(
        mel,
        sr=sr,
        n_fft=n_fft,
        hop_length=hop_length,
        window=scipy.signal.hamming,
        fmin=fmin,
        fmax=fmax)
    return mel, waveform
def griffin_lim_aud(self, spec):
    """Invert a mel spectrogram to audio and save it as '<epoch>.wav'.

    Parameters
    ----------
    spec : ndarray
        Mel spectrogram; in dB when `self.config['use_logMel']` is set.

    Returns
    -------
    ndarray
        Reconstructed time-domain signal.
    """
    # FIX: dropped the no-op `else: spec = spec` branch.
    if self.config['use_logMel']:
        spec = librosa.db_to_power(spec)
    y = librosa.feature.inverse.mel_to_audio(
        spec,
        sr=self.config['resampled_rate'],
        n_fft=self.config['n_fft'],
        hop_length=self.config['hop_length'],
        win_length=self.config['win_length'])
    soundfile.write(os.path.join(self.config['vis_dir'],
                                 '{}.wav'.format(self.epoch)),
                    y,
                    samplerate=self.config['resampled_rate'])
    return y
def griffin_lim_aud(self, spec, emotion, save_audio=False):
    """
    Generate audio samples from mel spectrograms using the Griffin-Lim
    approach; optionally save a per-emotion wav in the working directory.

    Parameters
    ----------
    spec : torch.Tensor
        Mel spectrogram; in dB when config['use_logMel'] is set.
    emotion : str
        Label used in the output filename when saving.
    save_audio : bool
        When True, write 'emotion_<emotion>.wav' to the cwd.

    Returns
    -------
    ndarray
        Reconstructed time-domain signal.
    """
    if config['use_logMel']:
        spec = librosa.db_to_power(spec.detach().numpy())
    else:
        spec = spec.detach().numpy()
    audio = librosa.feature.inverse.mel_to_audio(
        spec,
        sr=config['resampled_rate'],
        n_fft=config['n_fft'],
        hop_length=config['hop_length'],
        win_length=config['win_length'])
    if save_audio:
        savepath = os.path.join(os.getcwd(),
                                'emotion_{}.wav'.format(emotion))
        soundfile.write(savepath, audio,
                        samplerate=config['resampled_rate'])
    # FIX: return the reconstructed audio — previously it was computed and
    # then discarded when save_audio was False. Backward-compatible:
    # callers that ignored the implicit None return are unaffected.
    return audio
def reconstruct_signal_from_mel_spectrogram(self, mel_spectrogram, log=True,
                                            phase=None):
    """Invert a mel spectrogram to an AudioSignal via the pseudo-inverse
    mel filterbank, reusing `phase` when given (Griffin-Lim otherwise)."""
    if log:
        mel_spectrogram = librosa.db_to_power(mel_spectrogram)
    # Power -> magnitude.
    mel_spectrogram = mel_spectrogram**0.5
    magnitude = np.dot(np.linalg.pinv(self._MEL_FILTER), mel_spectrogram)
    if phase is None:
        # No phase given: estimate it with Griffin-Lim.
        inverted_signal = griffin_lim(magnitude, self._N_FFT,
                                      self._HOP_LENGTH, n_iterations=10)
    else:
        inverted_signal = librosa.istft(magnitude * phase,
                                        hop_length=self._HOP_LENGTH)
    return AudioSignal(inverted_signal, self._SAMPLE_RATE)
def spectrogram2wav(mag):
    '''# Generate wave file from spectrogram'''
    # Transpose to (freq, time), undo the [0, 1] normalization back to dB,
    # then convert dB to linear power.
    denorm = (np.clip(mag.T, 0, 1) * hp.max_db) - hp.max_db + hp.ref_db
    power = librosa.db_to_power(denorm)
    # Phase reconstruction via Griffin-Lim, then undo pre-emphasis.
    wav = signal.lfilter([1], [1, -hp.preemphasis], griffin_lim(power))
    # Strip leading/trailing silence.
    trimmed, _ = librosa.effects.trim(wav)
    return trimmed
def save_sound(data, path, filename, sound_norm):
    """Un-normalize a batch of mel spectrograms, invert each one with
    Griffin-Lim and write the results as 16 kHz .wav files.

    NOTE(review): assumes `data` is a 4-D torch tensor indexed as
    (batch, channel, time, mel) — confirm against the caller.
    `sound_norm` provides the 'min'/'max' used for normalization.
    """
    for i in range(data.size(0)):
        # Unormalize data
        wave = data[i].cpu() * (sound_norm['max'] - sound_norm['min']) + sound_norm['min']
        # Permute channels and remove channel
        wave = wave.permute(0, 2, 1).squeeze(0)
        # DB to Power
        # NOTE(review): librosa.db_to_power is applied to a torch tensor
        # here; the later .data.numpy() call suggests it stays a tensor —
        # verify this works with the installed librosa version.
        wave = librosa.db_to_power(wave)
        # Generate wave using Griffin-Lim algorithm
        sound_wav = librosa.feature.inverse.mel_to_audio(
            wave.squeeze(0).data.numpy(), sr=16000, n_iter=60)
        # Save data
        f_filename = filename + "_" + str(i) + ".wav"
        # Scale float samples to the int16 range before saving.
        torchaudio.save(os.path.join(path, f_filename),
                        torch.from_numpy(sound_wav) * np.iinfo(np.int16).max,
                        16000)
    return
def __test(y, rp, x_true):
    """Check that db_to_power(y, ref=rp) matches the expected value."""
    result = librosa.db_to_power(y, ref=rp)
    assert np.isclose(result, x_true), (result, x_true, y, rp)
def __test(ref):
    """Round-trip check: power -> dB -> power must reproduce the input."""
    decibels = librosa.power_to_db(xp, ref=ref, top_db=None)
    recovered = librosa.db_to_power(decibels, ref=ref)
    assert np.allclose(xp, recovered)