Ejemplo n.º 1
0
def prep_dataset(params):
    """Create the noisy speech mixtures of the dataset at several input SNRs.

    The noise recordings whose path contains 'ch01' are loaded and
    concatenated into one long signal. For each of the first 'n_mix' entries
    of the shuffled clean speech list, a noise excerpt of the same length is
    drawn at a random position, rescaled for every requested input SNR, and
    the (clean, noise) pair goes through an STFT / inverse STFT round trip
    before being recorded under 'data/SNR_<iSNR>/<mixture index>/'.

    The noise start indices of all mixtures but the first 50 are finally split
    into three groups (first, middle and last third of the concatenated noise)
    and saved to 'data/noise_ind.npz'.

    Args:
        params: dictionary with fields:
            'noise_data_dir': string - the path to the noise data
            'speech_data_dir': string - the path to the clean speech data
            'sample_rate': int - the sampling frequency
            'n_mix': int - the number of mixtures to create
            'input_SNR_list': list - the list of input SNRs to consider
            'n_fft': int - the number of FFT points
            'hop_length': int - the hop size of the STFT
            'win_length': int - the window length
            'win_type': string - the STFT window type

    Raises:
        ValueError: if the concatenated noise is shorter than a speech file,
            so that no noise excerpt of matching length can be extracted.
    """

    # Load the noises, keeping only the 'ch01' files (per the original note,
    # the 1st of the 16 channels), and concatenate them in a single pass.
    noise_data_list = librosa.util.find_files(params['noise_data_dir'],
                                              ext='wav')
    noise_chunks = [
        librosa.core.load(n, sr=params['sample_rate'])[0]
        for n in noise_data_list if 'ch01' in n
    ]
    # The leading empty float64 array keeps the result dtype unchanged and
    # makes the call valid even when no noise file was found.
    noise_total = np.concatenate([np.array([])] + noise_chunks)
    noise_beg_ind = []
    noise_total_len = noise_total.shape[0]

    # Load the list of clean speech files and shuffle it
    speech_data_list = librosa.util.find_files(params['speech_data_dir'],
                                               ext='wav')
    np.random.shuffle(speech_data_list)

    # Create the mixtures
    for n in range(params['n_mix']):
        print(f"Creating mix {n + 1} / {params['n_mix']}")

        # Load the speech
        speech_data = speech_data_list[n]
        clean = librosa.core.load(speech_data, sr=params['sample_rate'])[0]
        len_clean = clean.shape[0]

        # Take a piece of the noise of same length. The start index is bounded
        # so that the excerpt never runs past the end of the noise signal
        # (a truncated excerpt could not be stacked with the clean speech).
        if noise_total_len < len_clean:
            raise ValueError(
                'The noise signal (' + str(noise_total_len) +
                ' samples) is shorter than the speech file ' +
                str(speech_data) + ' (' + str(len_clean) + ' samples)')
        rand_sample_noise_beg = np.random.randint(noise_total_len -
                                                  len_clean + 1)
        noise = noise_total[rand_sample_noise_beg:rand_sample_noise_beg +
                            len_clean]

        # Collect the noise index (to further study the results as a function of the noise type)
        noise_beg_ind.append(rand_sample_noise_beg)

        # Adjust the input SNR and record audio
        for iSNR in params['input_SNR_list']:

            # Adjust the noise at target input SNR
            noise_adj = adjust_noise_at_isnr(clean, noise, input_snr=iSNR)
            src_ref = np.concatenate(
                (clean[:, np.newaxis], noise_adj[:, np.newaxis]), axis=1)

            # Take the STFT and iSTFT to ensure the length is fixed
            src_ref_stft = my_stft(src_ref,
                                   n_fft=params['n_fft'],
                                   hop_length=params['hop_length'],
                                   win_length=params['win_length'],
                                   win_type=params['win_type'])
            src_ref = my_istft(src_ref_stft,
                               hop_length=params['hop_length'],
                               win_length=params['win_length'],
                               win_type=params['win_type'])

            # Create the folder to record the wav (if necessary)
            rec_dir = 'data/SNR_' + str(iSNR) + '/' + str(n)
            os.makedirs(rec_dir, exist_ok=True)

            # Record wav
            record_src(rec_dir + '/',
                       src_ref,
                       params['sample_rate'],
                       rec_mix=True)

    # Get the indices of noise type for each mixture in the test set and record.
    # The first 50 mixtures are skipped (hard-coded size of the validation
    # subset). Note that ind_noise_1 and ind_noise_3 are boolean arrays while
    # ind_noise_2 is a 0/1 integer array; this is kept for compatibility with
    # whatever reads the file.
    noise_beg_ind = np.array(noise_beg_ind)
    noise_beg_ind = noise_beg_ind[50:]
    ind_noise_1 = noise_beg_ind < noise_total_len // 3
    ind_noise_3 = noise_beg_ind > 2 * noise_total_len // 3
    ind_noise_2 = 1 - (ind_noise_1 + ind_noise_3)
    # Make sure the output folder exists even if no mixture was created
    os.makedirs('data', exist_ok=True)
    np.savez('data/noise_ind.npz',
             ind_noise_1=ind_noise_1,
             ind_noise_2=ind_noise_2,
             ind_noise_3=ind_noise_3)
Ejemplo n.º 2
0
def validation(params, val_sdr_path='outputs/val_sdr.npz'):
    """Grid-search the proposed algorithm on the validation mixtures.

    For every input SNR and every mixture (read from
    'data/SNR_<iSNR>/<mixture index>/'), the gradient descent algorithm is run
    for each (beta, step size) pair of the grids, and the SDR over iterations
    returned for the four variants ('1r', '2r', '1l', '2l') is stored. The
    resulting array is saved under the key 'sdr' and is indexed as
    [iteration, step size, beta, d - 1, right/left, iSNR, mixture].

    Args:
        params: dictionary with fields:
            'sample_rate': int - the sampling frequency
            'n_mix': int - the number of mixtures to process
            'max_iter': int - the number of iterations of the proposed algorithm
            'input_SNR_list': list - the list of input SNRs to consider
            'grad_step_range': numpy array - the step size grid
            'beta_range': numpy array - the beta-divergence parameter grid
            'hop_length': int - the hop size of the STFT
            'win_length': int - the window length
            'n_fft': int - the number of FFT points
            'win_type': string - the STFT window type (e.g., Hann, Hamming, Blackman...)
        val_sdr_path: string - the path where to store the validation SDR
    """

    # Grids and their sizes
    snr_values = params['input_SNR_list']
    step_grid = params['grad_step_range']
    beta_grid = params['beta_range']
    n_isnr = len(snr_values)
    n_grad = step_grid.shape[0]
    n_beta = beta_grid.shape[0]
    n_iter = params['max_iter']
    n_mix = params['n_mix']

    # SDR array: one value per iteration (plus the initial one) and setting
    sdr_val = np.zeros((n_iter + 1, n_grad, n_beta, 2, 2, n_isnr, n_mix))

    # Keyword arguments shared by all the calls below
    stft_kwargs = {
        'n_fft': params['n_fft'],
        'hop_length': params['hop_length'],
        'win_length': params['win_length'],
        'win_type': params['win_type'],
    }
    algo_kwargs = {
        'win_length': params['win_length'],
        'hop_length': params['hop_length'],
        'win_type': params['win_type'],
        'max_iter': n_iter,
    }

    # Where each returned SDR curve goes: (output key, d index, direction index)
    sdr_slots = (('sdr_1r', 0, 0), ('sdr_2r', 1, 0), ('sdr_1l', 0, 1),
                 ('sdr_2l', 1, 1))

    for i_snr, snr in enumerate(snr_values):
        for i_mix in range(n_mix):

            # Time-domain signals and STFT of the mixture
            mix_dir = f'data/SNR_{snr}/{i_mix}/'
            src_ref, mix = load_src(mix_dir, params['sample_rate'])
            mix_stft = my_stft(mix, **stft_kwargs)[:, :, 0]

            # Magnitude spectrogram estimates
            spectro_mag = estim_spectro_from_mix(mix)

            # Sweep the (beta, step size) grid
            for i_beta, beta in enumerate(beta_grid):
                for i_step, step in enumerate(step_grid):
                    print(f'iSNR {i_snr + 1} / {n_isnr}'
                          f' -- Mix {i_mix + 1} / {n_mix}'
                          f' -- Beta {i_beta + 1} / {n_beta}'
                          f' -- Step size {i_step + 1} / {n_grad}')

                    # Same step size for d=1,2 and the "right" / "left" problems
                    out = bregmisi_all(mix_stft,
                                       spectro_mag,
                                       src_ref=src_ref,
                                       beta=beta,
                                       grad_step=step * np.ones((2, 2)),
                                       **algo_kwargs)

                    # Keep the SDR over iterations of each variant
                    for key, i_d, i_dir in sdr_slots:
                        sdr_val[:, i_step, i_beta, i_d, i_dir, i_snr,
                                i_mix] = out[key]

    # Save results
    np.savez(val_sdr_path, sdr=sdr_val)

    return
Ejemplo n.º 3
0
def misi(mix_stft,
         spectro_mag,
         win_length=None,
         hop_length=None,
         src_ref=None,
         max_iter=20,
         win_type='hann'):
    """Multiple input spectrogram inversion (MISI) for source separation.

    Starting from the amplitude mask estimate, each iteration takes the STFT
    of the current sources, imposes the target magnitudes, spreads the mixing
    error evenly over the sources and goes back to the time domain.

    Args:
        mix_stft: numpy.ndarray (nfreqs, nframes) - input mixture STFT
        spectro_mag: numpy.ndarray (nfreqs, nframes, nrsc) - the target sources' magnitude spectrograms
        win_length: int - the window length (defaults to the FFT size)
        hop_length: int - the hop size of the STFT (defaults to half the window)
        src_ref: numpy.ndarray (nsamples, nrsc) - reference sources for computing the SDR over iterations
        max_iter: int - number of iterations
        win_type: string - window type
    Returns:
        src_est: numpy.ndarray (nsamples, nrsc) - the time-domain estimated sources
        error: list (max_iter) - magnitude mismatch measured at the start of each iteration
        sdr: list - score over iterations (initial estimate plus one per
            iteration, i.e. max_iter + 1 values); empty if src_ref is None
    """

    # Sizes deduced from the target spectrograms
    n_freqs, _, n_src = spectro_mag.shape[:3]
    n_fft = 2 * (n_freqs - 1)
    if win_length is None:
        win_length = n_fft
    if hop_length is None:
        hop_length = win_length // 2

    track_sdr = src_ref is not None
    sdr = []
    error = []

    # Initial estimate: amplitude mask
    src_est = amplitude_mask(spectro_mag,
                             mix_stft,
                             win_length=win_length,
                             hop_length=hop_length,
                             win_type=win_type)
    if track_sdr:
        sdr.append(get_score(src_ref, src_est))

    for _ in range(max_iter):
        # Analysis of the current estimates
        stft_est = my_stft(src_est,
                           n_fft=n_fft,
                           hop_length=hop_length,
                           win_length=win_length,
                           win_type=win_type)
        current_magnitude = np.abs(stft_est)

        # Magnitude mismatch before the targets are imposed
        error.append(np.linalg.norm(current_magnitude - spectro_mag))

        # Impose the target magnitudes while keeping the current phases
        stft_est = stft_est * spectro_mag / (current_magnitude +
                                             sys.float_info.epsilon)

        # Share the mixing error equally between the sources
        error_share = (mix_stft - np.sum(stft_est, axis=2)) / n_src
        stft_est += np.repeat(error_share[:, :, np.newaxis], n_src, axis=2)

        # Synthesis
        src_est = my_istft(stft_est,
                           win_length=win_length,
                           hop_length=hop_length,
                           win_type=win_type)

        if track_sdr:
            sdr.append(get_score(src_ref, src_est))

    return src_est, error, sdr
Ejemplo n.º 4
0
def testing(params,
            test_sdr_path='outputs/test_sdr.npz',
            val_gd_step_path='outputs/val_gd_step.npz'):
    """ Run the proposed algorithm on the test subset and the MISI and AM baselines

    For every input SNR and test mixture, the amplitude mask (AM), MISI and
    gradient descent estimates are computed, scored, and the AM, MISI and one
    gradient descent estimate (beta=1.25, d=2, "left" problem) are recorded
    next to the input audio with the prefixes 'am_', 'misi_' and 'gd_'.
    The SDRs are saved under the keys 'sdr_am', 'sdr_misi' (both indexed as
    [iSNR, mixture]) and 'sdr_gd' (indexed as
    [beta, d - 1, right/left, iSNR, mixture]).

    Args:
        params: dictionary with fields:
            'sample_rate': int - the sampling frequency
            'n_mix': int - the number of mixtures to process
            'max_iter': int - the number of iterations of the proposed algorithm
            'input_SNR_list': list - the list of input SNRs to consider
            'beta_range': numpy array - the beta-divergence parameter grid
            'hop_length': int - the hop size of the STFT
            'win_length': int - the window length
            'n_fft': int - the number of FFT points
            'win_type': string - the STFT window type (e.g., Hann, Hamming, Blackman...)
        test_sdr_path: string - the path where to store the test SDR
        val_gd_step_path: string - the path of the file holding the optimal
            step sizes found on the validation subset (array 'gd_step',
            read here as [beta, :, :, iSNR])
    """

    # Define some parameters and initialize the SDR arrays
    n_isnr = len(params['input_SNR_list'])
    n_beta = params['beta_range'].shape[0]
    sdr_am = np.zeros((n_isnr, params['n_mix']))
    sdr_misi = np.zeros((n_isnr, params['n_mix']))
    sdr_gd = np.zeros((n_beta, 2, 2, n_isnr, params['n_mix']))

    # Load the optimal step sizes from validation
    gd_step_opt = np.load(val_gd_step_path)['gd_step']

    # Loop over iSNRs, mixtures and parameters
    for index_isnr, isnr in enumerate(params['input_SNR_list']):
        for index_mix in range(params['n_mix']):

            # Load data. The test mixtures are stored after the validation
            # ones: the offset is 'n_mix', which matches the 50 validation
            # mixtures of the original setup when n_mix is 50.
            audio_path = 'data/SNR_' + str(isnr) + '/' + str(
                index_mix + params['n_mix']) + '/'
            src_ref, mix = load_src(audio_path, params['sample_rate'])
            mix_stft = my_stft(mix,
                               n_fft=params['n_fft'],
                               hop_length=params['hop_length'],
                               win_length=params['win_length'],
                               win_type=params['win_type'])[:, :, 0]

            # Estimate the magnitude spectrograms
            spectro_mag = estim_spectro_from_mix(mix)

            # Amplitude mask
            src_est_am = amplitude_mask(spectro_mag,
                                        mix_stft,
                                        win_length=params['win_length'],
                                        hop_length=params['hop_length'],
                                        win_type=params['win_type'])
            sdr_am[index_isnr, index_mix] = get_score(src_ref, src_est_am)
            record_src(audio_path + 'am_', src_est_am, params['sample_rate'])

            # MISI (with the same analysis window as the other methods, so
            # that the comparison is consistent when win_type is not 'hann')
            src_est_misi = misi(mix_stft,
                                spectro_mag,
                                win_length=params['win_length'],
                                hop_length=params['hop_length'],
                                max_iter=params['max_iter'],
                                win_type=params['win_type'])[0]
            sdr_misi[index_isnr, index_mix] = get_score(src_ref, src_est_misi)
            record_src(audio_path + 'misi_', src_est_misi,
                       params['sample_rate'])

            # Gradient descent
            for index_b, b in enumerate(params['beta_range']):
                print('iSNR ' + str(index_isnr + 1) + ' / ' + str(n_isnr) +
                      ' -- Mix ' + str(index_mix + 1) + ' / ' +
                      str(params['n_mix']) + ' -- Beta ' + str(index_b + 1) +
                      ' / ' + str(n_beta))

                # Get the optimal step size(s) for this beta / iSNR
                my_steps = gd_step_opt[index_b, :, :, index_isnr]

                # Run the gradient descent algorithm for d=1,2 and for the "right" and "left" problems
                out = bregmisi_all(mix_stft,
                                   spectro_mag,
                                   src_ref=src_ref,
                                   win_length=params['win_length'],
                                   hop_length=params['hop_length'],
                                   win_type=params['win_type'],
                                   beta=b,
                                   grad_step=my_steps,
                                   max_iter=params['max_iter'])

                # Store the SDR
                sdr_gd[index_b, 0, 0, index_isnr,
                       index_mix] = get_score(src_ref, out['src_est_1r'])
                sdr_gd[index_b, 1, 0, index_isnr,
                       index_mix] = get_score(src_ref, out['src_est_2r'])
                sdr_gd[index_b, 0, 1, index_isnr,
                       index_mix] = get_score(src_ref, out['src_est_1l'])
                sdr_gd[index_b, 1, 1, index_isnr,
                       index_mix] = get_score(src_ref, out['src_est_2l'])

                # Record in the nice setting (beta=1.25 d=2, left). A
                # tolerance-based test is used since the grid values are
                # floats that may not be exactly representable.
                if np.isclose(b, 1.25):
                    record_src(audio_path + 'gd_', out['src_est_2l'],
                               params['sample_rate'])

    # Save results
    np.savez(test_sdr_path, sdr_am=sdr_am, sdr_misi=sdr_misi, sdr_gd=sdr_gd)
Ejemplo n.º 5
0
def bregmisi(mix_stft,
             spectro,
             win_length=None,
             hop_length=None,
             win_type='hann',
             src_ref=None,
             beta=2.,
             d=1,
             grad_step=1e-3,
             direc='right',
             max_iter=20,
             eps=1e-8):
    """Gradient descent algorithm for phase recovery in audio source separation.

    Starting from the amplitude mask estimate, each iteration takes the STFT
    of the current sources, performs one gradient step in the time-frequency
    domain, spreads the mixing error evenly over the sources and goes back to
    the time domain.

    Args:
        mix_stft: numpy.ndarray (nfreqs, nframes) - input mixture STFT
        spectro: numpy.ndarray (nfreqs, nframes, nrsc) - the target sources' magnitude or power spectrograms
        win_length: int - the window length (defaults to the FFT size)
        hop_length: int - the hop size of the STFT (defaults to half the window)
        win_type: string - window type
        src_ref: numpy.ndarray (nsamples, nrsc) - reference sources for computing the SDR over iterations
        beta: float - parameter of the beta-divergence
        d: int - magnitude (1) or power (2) measurements
        grad_step: float - step size for the gradient descent
        direc: string - the problem formulation, forwarded to the gradient
            routine (the default is 'right')
        max_iter: int - number of iterations
        eps: float - small ridge added to the loss for avoiding numerical issues
    Returns:
        src_est: numpy.ndarray (nsamples, nrsc) - the time-domain estimated sources
        sdr: list - score over iterations (initial estimate plus one per
            iteration, i.e. max_iter + 1 values); empty if src_ref is None
    """

    # Sizes deduced from the target spectrograms
    n_freqs, _, n_src = spectro.shape[:3]
    n_fft = 2 * (n_freqs - 1)
    if win_length is None:
        win_length = n_fft
    if hop_length is None:
        hop_length = win_length // 2

    track_sdr = src_ref is not None
    sdr = []

    # Initial estimate: amplitude mask on the magnitudes (d-th root of spectro)
    target_mag = np.power(spectro, 1 / d)
    src_est = amplitude_mask(target_mag,
                             mix_stft,
                             win_length=win_length,
                             hop_length=hop_length,
                             win_type=win_type)
    if track_sdr:
        sdr.append(get_score(src_ref, src_est))

    for _ in range(max_iter):

        # Analysis of the current estimates
        stft_est = my_stft(src_est,
                           n_fft=n_fft,
                           hop_length=hop_length,
                           win_length=win_length,
                           win_type=win_type)

        # One gradient step in the TF domain; eps smooths the modulus so that
        # the power (d / 2 - 1) stays finite where the STFT vanishes
        G = grad_beta_eps(stft_est, spectro, d, beta, direc, eps)
        smoothed_power = np.abs(stft_est)**2 + eps
        breg_grad = d * (stft_est * (smoothed_power**(d / 2 - 1)) * G)
        stft_est -= grad_step * breg_grad

        # Share the mixing error equally between the sources
        error_share = (mix_stft - np.sum(stft_est, axis=2)) / n_src
        corrected_stft = stft_est + np.repeat(
            error_share[:, :, np.newaxis], n_src, axis=2)

        # Synthesis
        src_est = my_istft(corrected_stft,
                           win_length=win_length,
                           hop_length=hop_length,
                           win_type=win_type)

        if track_sdr:
            sdr.append(get_score(src_ref, src_est))

    return src_est, sdr