Example #1
def apriori_SNR(Noisy, Clean, mask=True):
    """Function to Calculate a-priori SNR
    mask=True puts out sigmoidal mapping function"""

    m_ibm = []

    Noisy = librosa.db_to_power(Noisy)
    Clean = librosa.db_to_power(Clean)

    N = np.subtract(Noisy, Clean)
    # small offsets to avoid division by zero
    N[N == 0] += 0.000001
    Clean[Clean == 0] += 0.000001

    apisnr = 20 * np.log10(
        np.divide(Clean, N, out=np.zeros_like(Noisy), where=N != 0))
    """shifting towards zero mean"""
    apisnr = np.nan_to_num(apisnr, nan=100)
    me = np.mean(apisnr[apisnr <= 50])
    print("MEAN OF A PRIORI SNR <= 80: " + str(me))
    apisnr = np.subtract(apisnr, me)
    """sigmoidal mapping function"""
    if mask == True:
        m_ibm = np.divide(1, (1 + np.exp(-0.1 * apisnr)))
        return m_ibm
    else:
        return apisnr
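A minimal usage sketch for the function above (synthetic data and hypothetical shapes; it assumes Noisy and Clean are dB-scaled power spectrograms of equal shape, with the noisy power at least as large as the clean power):

import numpy as np
import librosa

rng = np.random.default_rng(0)
clean_power = rng.random((257, 100)) + 1e-3
noise_power = rng.random((257, 100))
Clean = librosa.power_to_db(clean_power)
Noisy = librosa.power_to_db(clean_power + noise_power)

m = apriori_SNR(Noisy, Clean, mask=True)     # sigmoidal mask, values in (0, 1)
snr = apriori_SNR(Noisy, Clean, mask=False)  # zero-mean a priori SNR in dB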
Example #2
def create_scale_mask(vocal_spec, bg_spec):
    """
    Take in log spectrogram and return a mask map for TF bins
    1 if the vocal sound is dominated in the TF-bin, while 0 for not
    """
    vocal_spec = librosa.db_to_power(vocal_spec.numpy())
    bg_spec = librosa.db_to_power(bg_spec.numpy())
    return np.array(vocal_spec / (vocal_spec + bg_spec), dtype=np.float32)
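The returned mask is a soft ratio mask rather than a strictly binary one; a sketch of how it might be applied to a mixture (synthetic data; assumes torch tensors holding dB-scaled power spectrograms, since the function calls .numpy() on its inputs):

import numpy as np
import torch
import librosa

rng = np.random.default_rng(0)
vocal_db = torch.tensor(librosa.power_to_db(rng.random((128, 50)) + 1e-6))
bg_db = torch.tensor(librosa.power_to_db(rng.random((128, 50)) + 1e-6))

mask = create_scale_mask(vocal_db, bg_db)  # float32 values in [0, 1]
mix_power = librosa.db_to_power(vocal_db.numpy()) + librosa.db_to_power(bg_db.numpy())
vocal_est = mask * mix_power               # masked power spectrogram of the mixture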
Example #3
def IBM2(Clean, Noisy, mask=True):
    """IBM without log conversion"""
    M = []
    Noisy = librosa.db_to_power(Noisy)
    Clean = librosa.db_to_power(Clean)
    N = np.subtract(Noisy, Clean)
    m_ibm = np.divide(Clean, N, out=(np.ones_like(Noisy) * -80), where=N != 0)

    if mask:
        m_ibm = (m_ibm >= 0).astype(int)

    return m_ibm
Example #4
def IRM2_noisemask(N, S):
    """Noise mask: per-frame ratio of estimated noise power to noisy power."""
    M = []
    N = librosa.db_to_power(N)
    S = librosa.db_to_power(S)
    noise = N - S
    for i in range(len(S)):
        c = np.divide(noise[i],
                      N[i],
                      out=np.ones_like(N[i]),
                      where=noise[i] != 0)
        M.append(c)

    return M
Example #5
def IRM_lit(S, N):
    """IRM with parameter beta, using power spectrum"""
    M = []
    b = 0.5
    S = librosa.db_to_power(S)
    N = librosa.db_to_power(N)
    N[N == 0] += 0.00000001

    for i in range(len(S)):
        c = np.divide(S[i], N[i], out=np.zeros_like(S[i]), where=N[i] != 0)
        M.append(c)

    M = np.array(M)
    M = np.power(M, b)
    return M
Example #6
def istft(spect, win_length, hop_length, window):
    ref = np.max(spect)
    spect = librosa.db_to_power(spect, ref=ref)
    return librosa.istft(spect,
                         hop_length=hop_length,
                         win_length=win_length,
                         window=windows[window])
Example #7
    def save_feature(num_snr, i_speech: int, s_path_speech: str, speech: ndarray) -> tuple:
        spec_clean = np.ascontiguousarray(librosa.stft(speech, **hp.kwargs_stft))
        mag_clean = np.ascontiguousarray(np.abs(spec_clean)[..., np.newaxis])

        signal_power = np.mean(np.abs(speech)**2)
        list_dict = []
        list_snr_db = []
        for _ in range(num_snr):
            snr_db = -6*np.random.rand()
            list_snr_db.append(snr_db)
            snr = librosa.db_to_power(snr_db)
            noise_power = signal_power / snr
            noisy = speech + np.sqrt(noise_power) * np.random.randn(len(speech))
            spec_noisy = librosa.stft(noisy, **hp.kwargs_stft)
            spec_noisy = np.ascontiguousarray(spec_noisy)

            list_dict.append(
                dict(spec_noisy=spec_noisy,
                     speech=speech,
                     spec_clean=spec_clean,
                     mag_clean=mag_clean,
                     path_speech=s_path_speech,
                     length=len(speech),
                     )
            )
        return list_snr_db, list_dict
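Note that librosa.db_to_power is used here on a scalar SNR in dB rather than on a spectrogram: it computes ref * 10**(S_db / 10), so it doubles as a dB-to-linear power-ratio converter. A quick check of that identity:

import numpy as np
import librosa

snr_db = -6.0
snr = librosa.db_to_power(snr_db)  # 10 ** (-6 / 10)
assert np.isclose(snr, 10.0 ** (snr_db / 10.0))
print(snr)  # ~0.251: -6 dB is roughly a quarter of the power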
Example #8
    def intensify(self, spectrogram, inverse=False):

        # Convert between dB and power with power_to_db / db_to_power
        if not inverse:
            return librosa.power_to_db(spectrogram)
        else:
            return librosa.db_to_power(spectrogram)
Example #9
    def griffin_lim_aud(self, spec, save_audio=False):
        if self.config['use_logMel']:
            spec = librosa.db_to_power(spec)

        y = librosa.feature.inverse.mel_to_audio(
            spec,
            sr=self.config['resampled_rate'],
            n_fft=self.config['n_fft'],
            hop_length=self.config['hop_length'],
            win_length=self.config['win_length'])

        if save_audio:
            savepath = os.path.join(self.config['vis_dir'],
                                    'Mel_{}'.format(str(self.n_mels)))
            os.makedirs(savepath, exist_ok=True)

            savepath = os.path.join(savepath,
                                    'epoch_{}.wav'.format(self.epoch))

            soundfile.write(savepath,
                            y,
                            samplerate=self.config['resampled_rate'])
        return y
Example #10
def _mel_2_audio(mel, sr=44100, n_fft=2048, hop_length=512, do_power=True):
    if do_power:
        mel = librosa.db_to_power(mel)
    audio = librosa.feature.inverse.mel_to_audio(mel, sr=sr, n_fft=n_fft, hop_length=hop_length)
    audio = normalize([audio], norm="max")
    audio = np.clip(audio, -1, 1)
    return audio.flatten()
Example #11
def feature_to_audio(cfg, mel_spec, phs):

    # Load
    # mfcc_max = np.load()
    # mfcc = np.load()
    # phs = np.load()

    # MFCC to Mel-spectrogram
    # mel_spec = librosa.feature.inverse.mfcc_to_mel(mfcc=mfcc, n_mels=cfg['MEL_DIM'])

    # Mel-spectrogram to magnitude(power)
    mag = librosa.feature.inverse.mel_to_stft(M=librosa.db_to_power(mel_spec),
                                              sr=cfg['SR'],
                                              n_fft=cfg['FFT_LEN'])

    # ISTFT
    D = mag * np.exp(1j * phs)  # mag * (cos(phs) + 1j*sin(phs))
    audio = librosa.core.istft(stft_matrix=D,
                               win_length=cfg['FFT_LEN'],
                               hop_length=cfg['HOP_LEN'],
                               window='hamming')
    audio = signal.lfilter([1], [1, -cfg['PREEMPH']], audio)

    return audio
Example #12
    def melspec_to_audio(self,
                         mel_spectrogram,
                         log=True,
                         phase=None,
                         transpose=True,
                         audio_out=True):
        if transpose:
            mel_spectrogram = mel_spectrogram.T
        if log:
            mel_spectrogram = librosa.db_to_power(mel_spectrogram)

        mel_spectrogram = mel_spectrogram**0.5  # power -> magnitude

        magnitude = np.dot(np.linalg.pinv(self._MEL_FILTER), mel_spectrogram)

        if phase is not None:
            inverted_signal = librosa.istft(magnitude * phase,
                                            hop_length=self._HOP_LENGTH)
        else:
            inverted_signal = griffin_lim(magnitude,
                                          self._N_FFT,
                                          self._HOP_LENGTH,
                                          n_iterations=10)

        if audio_out:
            return Audio(inverted_signal, rate=self._SAMPLE_RATE)
        else:
            return inverted_signal
Example #13
def spectrogram_inversion(melspec, sr, fmin, fmax, use_db=True):
    if use_db:
        melspec = librosa.db_to_power(melspec)
    inv_melspec = librosa.feature.inverse.mel_to_audio(melspec,
                                                       sr=sr,
                                                       fmin=fmin,
                                                       fmax=fmax)
    return inv_melspec
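A minimal round trip with this helper, assuming librosa's default n_fft and hop_length on both sides and matching fmin/fmax (librosa.example requires the downloadable example data):

import librosa

y, sr = librosa.load(librosa.example('trumpet'))
mel = librosa.feature.melspectrogram(y=y, sr=sr, fmin=50, fmax=8000)
mel_db = librosa.power_to_db(mel)
y_hat = spectrogram_inversion(mel_db, sr=sr, fmin=50, fmax=8000, use_db=True)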
Example #14
def res(train_loader, validation_loader, test_loader, num):
    ### generator for model
    def data_generator(data_loader):
        while True:
            for index, data_item in enumerate(data_loader):
                yield np.expand_dims(np.array(data_item['mix']),
                                     -1), np.expand_dims(
                                         np.array(data_item['target']), -1)

    test_generator = data_generator(test_loader)
    X_test, y_test = next(test_generator)
    ### original dataset
    for index, data_item in enumerate(test_loader):
        if index == 0:
            break

    vocal = data_item['vocal'][num]
    mix = data_item['mix'][num]
    bg = data_item['bg'][num]
    target = data_item['target'][num]

    predict_model = Model(inputs=[inputs], outputs=[outputs])
    predict_model.load_weights('./model/unet_mask.h5')
    pre_mask = predict_model.predict(X_test)
    mix_amplitude = librosa.db_to_power(X_test[num, :, :, 0])

    plt.figure()
    plt.imshow(mix, aspect='auto', origin='lower')
    plt.tight_layout()
    plt.show()

    plt.figure()
    plt.imshow(vocal, aspect='auto', origin='lower')
    plt.tight_layout()
    plt.show()

    pre_spec = np.array(mix_amplitude * pre_mask[num, :, :, 0],
                        dtype=np.float32)
    plt.figure()
    plt.imshow(librosa.power_to_db(pre_spec), aspect='auto', origin='lower')
    plt.tight_layout()
    plt.show()

    plt.figure()
    plt.imshow(pre_mask[num, :, :, 0], aspect='auto', origin='lower')
    plt.tight_layout()
    plt.show()

    mix_signal = mel_converter.m(mix.numpy(), log=True, audio_out=True)
    groundtruth_signal = mel_converter.m(vocal.numpy(),
                                        log=True,
                                        audio_out=True)
    pre_signal = mel_converter.m(librosa.power_to_db(pre_spec),
                                 log=True,
                                 audio_out=True)

    return mix_signal, groundtruth_signal, pre_signal
Example #15
def deprep(S):
    S = denormalize(S) + ref_level_db
    S = librosa.db_to_power(S)
    wv = GRAD(np.expand_dims(S, 0),
              melspecfunc,
              maxiter=2000,
              evaiter=10,
              tol=1e-8)
    return np.array(np.squeeze(wv))
Example #16
    def stft_inversion(inputs):
        """
        Invert mel spectrograms by reusing the phase of the mixture

        Parameters:
            inputs: tuple
                (melspecs, stft_mixture)
                melspecs: list of ndarray
                    MelSpectrograms to invert
                stft_mixture: ndarray
                    STFT of the mixture to separate

        Returns:
            i_melspecs: list of ndarray
        """
        melspecs, stft_mixture = inputs
        n_src = len(melspecs)
        use_wiener_filter = wiener_filter and (n_src > 1)
        melspecs, stft_mixture = np.array(melspecs), np.array(stft_mixture)

        mel_stfts = []
        i_melspecs = []
        if args.scale == "dB":
            melspecs = librosa.db_to_power(melspecs)

        for i in range(len(melspecs)):
            mel_stft = librosa.feature.inverse.mel_to_stft(melspecs[i],
                                                           sr=sr,
                                                           fmin=fmin,
                                                           fmax=fmax,
                                                           n_fft=n_fft)
            if use_wiener_filter:
                mel_stft = mel_stft**2

            mel_stfts.append(mel_stft)

        mel_stfts = np.array(mel_stfts)

        if use_wiener_filter:
            stft_complexs = single_channel_wiener_filter(
                mel_stfts, stft_mixture)

        for i in range(len(melspecs)):

            if use_wiener_filter:
                stft_complex = stft_complexs[i]
            else:
                stft_complex = complex_array(mel_stfts[i],
                                             np.angle(stft_mixture))

            istft = librosa.istft(stft_complex, hop_length=hop_length)
            i_melspecs.append(istft)

        return i_melspecs
Example #17
def mlsp2wav(sound, sr, fft_size, hop_length):
    import librosa

    if torch.is_tensor(sound):
        sound = to_np(sound)
    sound_mel = np.multiply(sound, -80)  # map normalized [0, 1] values to [0, -80] dB
    sound_mel = librosa.db_to_power(sound_mel)
    sound_wav = librosa.feature.inverse.mel_to_audio(sound_mel,
                                                     sr=sr,
                                                     n_fft=fft_size,
                                                     hop_length=hop_length)
    return sound_wav, sound_mel
Example #18
def extract_audio(
        Z, feature,
        params):  # if Z is normalized, un-normalize it first, then pass it here

    # convert to audio
    if feature == "Stft":
        # undo log-magnitude scaling
        S = librosa.db_to_amplitude(Z)

        # upsample
        S = _upsample_fft(S, params["fft_sample_rate"],
                          params["stft_window_length"])

        yhat = librosa.griffinlim(S, hop_length=params["stft_hop_length"])

    elif feature == "Mel":
        # undo log-power scaling
        S = librosa.db_to_power(Z)

        yhat = librosa.feature.inverse.mel_to_audio(
            S,
            sr=params["fft_sample_rate"],
            n_fft=params["stft_window_length"],
            hop_length=params["stft_hop_length"],
        )

    elif feature == "Cqt":
        # undo log-amplitude scaling
        S = librosa.db_to_amplitude(Z)

        yhat = librosa.griffinlim_cqt(
            S,
            sr=params["fft_sample_rate"],
            hop_length=params["stft_hop_length"],
            fmin=librosa.note_to_hz(params["cqt_min_frequency"]),
        )

    elif feature == "Mfcc":

        yhat = librosa.feature.inverse.mfcc_to_audio(
            Z,
            n_mels=params["frequency_bins"],
            sr=params["fft_sample_rate"],
            n_fft=params["stft_window_length"],
            hop_length=params["stft_hop_length"],
        )

    else:
        raise ValueError("invalid feature: {}".format(feature))

    return yhat, params["fft_sample_rate"]
Example #19
def reconstruct_wave(spec, rate=16000, normalize_data=False):
    """
    Reconstruct waveform
    spec: spectrogram generated using Librosa
    rate: sampling rate
    """
    power = librosa.db_to_power(spec, ref=5.0)
    audio = librosa.feature.inverse.mel_to_audio(power,
                                                 sr=rate,
                                                 n_fft=2048,
                                                 hop_length=512)
    out_audio = audio / np.max(audio) if normalize_data else audio
    return out_audio
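A hedged usage sketch (synthetic input; the ref=5.0 hard-coded above must match the ref used when the dB spectrogram was created, and the forward transform must use the same n_fft=2048 and hop_length=512):

import librosa

y, sr = librosa.load(librosa.example('trumpet'), sr=16000)
mel = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=2048, hop_length=512)
spec_db = librosa.power_to_db(mel, ref=5.0)
y_hat = reconstruct_wave(spec_db, rate=sr, normalize_data=True)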
Example #20
    def play_spectrogram(self,
                         spectrogram,
                         sr=22050,
                         n_fft=1024,
                         hop_length=256):
        array = np.asarray(spectrogram)
        mels = librosa.db_to_power(array, ref=1)
        return Audio(
            librosa.feature.inverse.mel_to_audio(mels,
                                                 sr=sr,
                                                 n_fft=self.hop_length * 4,
                                                 hop_length=self.hop_length),
            rate=sr)
Example #21
def writeAudio(spectrogram, sample_rate, mean, sigma, output_file_name):
    '''
    For a given normalized mel spectrogram, sample rate, and the mean and
    standard deviation of the original recording, write a .wav audio file
    named output_file_name + '.wav'.
    '''
    spectrogram = spectrogram * sigma + mean
    spectrogram = librosa.db_to_power(spectrogram)

    file_name = output_file_name + '.wav'

    audio_signal = librosa.feature.inverse.mel_to_audio(spectrogram,
                                                        sr=sample_rate)
    sf.write(file_name, audio_signal, sample_rate)
Example #22
def apriori_SNR(Noisy, Clean, mask=True):
    m_ibm = []

    Noisy = librosa.db_to_power(Noisy)
    Clean = librosa.db_to_power(Clean)

    N = np.subtract(Noisy, Clean)
    N[N == 0] += 0.000001
    Clean[Clean == 0] += 0.000001

    apisnr = 20 * np.log10(np.divide(Clean, N, out=np.zeros_like(Noisy), where=N != 0))

    apisnr = np.nan_to_num(apisnr, nan=100)
    me = np.mean(apisnr[apisnr <= 80])
    print("MEAN OF A PRIORI SNR <= 80: " + str(me))
    apisnr = np.subtract(apisnr, me)

    if mask:
        m_ibm = np.divide(1, (1 + np.exp(-0.1 * apisnr)))
        return m_ibm
    else:
        return apisnr
Example #23
    def mel_to_audio(self, mel):
        mag = mel.T.numpy()
        mel_db = np.clip(mag, 0, 1) * self.max_db - self.max_db + self.ref_db
        mel_abs = librosa.db_to_power(mel_db, ref=self.max_db)
        audio = librosa.feature.inverse.mel_to_audio(
            mel_abs,
            sr=self.sampling_rate,
            n_fft=self.n_fft,
            hop_length=self.hop_length,
            win_length=self.win_length,
        )
        audio = lfilter([1], [1, -self.preemphasis], audio)
        audio, _ = librosa.effects.trim(audio)
        return audio
Example #24
def setup_noise_augmented_dataset(files_list, num_snr, kwargs_stft, dest,
                                  desc):

    os.makedirs(dest)
    with open(files_list, 'r') as list_file:
        all_lines = [line for line in list_file]
        list_file_pbar = tqdm(all_lines, desc=desc, dynamic_ncols=True)

        i_speech = 0
        for line in list_file_pbar:
            audio_file = line.split('|')[0]
            speech = sf.read(audio_file)[0].astype(np.float32)
            spec_clean = np.ascontiguousarray(
                librosa.stft(speech, **kwargs_stft))
            mag_clean = np.ascontiguousarray(
                np.abs(spec_clean)[..., np.newaxis])
            signal_power = np.mean(np.abs(speech)**2)

            y = spec_clean.view(dtype=np.float32).reshape(
                (*spec_clean.shape, 2))
            ##y = torch.from_numpy(y)
            T_y = spec_clean.shape[1]
            ##mag_clean = torch.from_numpy(mag_clean)
            for k in range(num_snr):
                snr_db = -6 * np.random.rand()
                snr = librosa.db_to_power(snr_db)
                noise_power = signal_power / snr
                noisy = speech + np.sqrt(noise_power) * np.random.randn(
                    len(speech))
                spec_noisy = librosa.stft(noisy, **kwargs_stft)
                spec_noisy = np.ascontiguousarray(spec_noisy)
                T_x = spec_noisy.shape[1]
                x = spec_noisy.view(dtype=np.float32).reshape(
                    (*spec_noisy.shape, 2))
                ##x = torch.from_numpy(x)
                mdict = dict(x=x,
                             y=y,
                             y_mag=mag_clean,
                             path_speech=audio_file,
                             length=len(speech),
                             T_x=T_x,
                             T_y=T_y)
                np.savez(
                    f"{dest}/audio_{i_speech}_{k}.npz",
                    **mdict,
                )
                i_speech = i_speech + 1

    return i_speech
Example #25
def spectrogram_to_audio(data, n_mels, n_frames, sr, n_fft, hop_length, fmin,
                         fmax):

    mel = np.reshape(data, (n_mels, n_frames))
    mel = -(1 - mel) * 80  # map normalized [0, 1] back to [-80, 0] dB
    mel = librosa.db_to_power(mel)

    y = librosa.feature.inverse.mel_to_audio(mel,
                                             sr=sr,
                                             n_fft=n_fft,
                                             hop_length=hop_length,
                                             window=scipy.signal.hamming,
                                             fmin=fmin,
                                             fmax=fmax)

    return mel, y
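The line mel = -(1 - mel) * 80 inverts a presumed forward normalization mel_norm = (mel_db + 80) / 80 that maps [-80, 0] dB onto [0, 1]; a quick check of that round trip:

import numpy as np

mel_db = np.array([-80.0, -40.0, 0.0])
mel_norm = (mel_db + 80) / 80  # assumed forward normalization
assert np.allclose(-(1 - mel_norm) * 80, mel_db)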
Example #26
    def griffin_lim_aud(self, spec):
        if self.config['use_logMel']:
            spec = librosa.db_to_power(spec)

        y = librosa.feature.inverse.mel_to_audio(
            spec,
            sr=self.config['resampled_rate'],
            n_fft=self.config['n_fft'],
            hop_length=self.config['hop_length'],
            win_length=self.config['win_length'])

        soundfile.write(os.path.join(self.config['vis_dir'],
                                     '{}.wav'.format(self.epoch)),
                        y,
                        samplerate=self.config['resampled_rate'])
        return y
Example #27
def griffin_lim_aud(self, spec, emotion, save_audio=False):
    """ Generate an audio waveform from a mel spectrogram using the Griffin-Lim approach
    """
    if config['use_logMel']:
        spec = librosa.db_to_power(spec.detach().numpy())
    else:
        spec = spec.detach().numpy()

    audio = librosa.feature.inverse.mel_to_audio(
        spec,
        sr=config['resampled_rate'],
        n_fft=config['n_fft'],
        hop_length=config['hop_length'],
        win_length=config['win_length'])

    if save_audio:
        savepath = os.path.join(os.getcwd(), 'emotion_{}.wav'.format(emotion))
        soundfile.write(savepath, audio, samplerate=config['resampled_rate'])
Example #28
    def reconstruct_signal_from_mel_spectrogram(self,
                                                mel_spectrogram,
                                                log=True,
                                                phase=None):
        if log:
            mel_spectrogram = librosa.db_to_power(mel_spectrogram)

        mel_spectrogram = mel_spectrogram**0.5  # power -> magnitude

        magnitude = np.dot(np.linalg.pinv(self._MEL_FILTER), mel_spectrogram)

        if phase is not None:
            inverted_signal = librosa.istft(magnitude * phase,
                                            hop_length=self._HOP_LENGTH)
        else:
            inverted_signal = griffin_lim(magnitude,
                                          self._N_FFT,
                                          self._HOP_LENGTH,
                                          n_iterations=10)

        return AudioSignal(inverted_signal, self._SAMPLE_RATE)
Example #29
def spectrogram2wav(mag):
    '''Generate a waveform from a spectrogram.'''
    # transpose
    mag = mag.T

    # de-normalize
    mag = (np.clip(mag, 0, 1) * hp.max_db) - hp.max_db + hp.ref_db

    # dB to linear power scale
    mag = librosa.db_to_power(mag)

    # wav reconstruction
    wav = griffin_lim(mag)

    # de-preemphasis
    wav = signal.lfilter([1], [1, -hp.preemphasis], wav)

    # trim
    wav, _ = librosa.effects.trim(wav)

    return wav
Example #30
def save_sound(data, path, filename, sound_norm):

    for i in range(data.size(0)):
        # Denormalize data
        wave = data[i].cpu() * (sound_norm['max'] -
                                sound_norm['min']) + sound_norm['min']

        # Permute channels and remove channel
        wave = wave.permute(0, 2, 1).squeeze(0)

        # DB to Power
        wave = librosa.db_to_power(wave)

        # Generate wave using Griffin-Lim algorithm
        sound_wav = librosa.feature.inverse.mel_to_audio(
            wave.squeeze(0).data.numpy(), sr=16000, n_iter=60)
        # Save data
        f_filename = filename + "_" + str(i) + ".wav"
        torchaudio.save(os.path.join(path, f_filename),
                        torch.from_numpy(sound_wav) * np.iinfo(np.int16).max,
                        16000)

    return
Example #31
    def __test(y, rp, x_true):

        x = librosa.db_to_power(y, ref=rp)

        assert np.isclose(x, x_true), (x, x_true, y, rp)
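The ref argument simply rescales the result: librosa.db_to_power(y, ref=r) equals r * 10**(y / 10). A standalone check of the property this test asserts:

import numpy as np
import librosa

y_db, ref = -3.0, 2.0
assert np.isclose(librosa.db_to_power(y_db, ref=ref), ref * 10.0 ** (y_db / 10.0))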
Example #32
    def __test(ref):

        db = librosa.power_to_db(xp, ref=ref, top_db=None)
        xp2 = librosa.db_to_power(db, ref=ref)

        assert np.allclose(xp, xp2)