Example #1
def main():
    # Load input SNR
    all_snr_db = read_dataset(processed_data_dir, dataset_type, 'snr_db')
    all_snr_db = np.array(all_snr_db)

    # Create file list
    file_paths = speech_list(input_speech_dir=input_speech_dir,
                             dataset_type=dataset_type)

    # Fuse both lists
    args = [[file_path, snr_db]
            for file_path, snr_db in zip(file_paths, all_snr_db)]

    t1 = time.perf_counter()

    with concurrent.futures.ProcessPoolExecutor(max_workers=None) as executor:
        all_metrics = executor.map(compute_metrics_utt, args)

    t2 = time.perf_counter()
    print(f'Finished in {t2 - t1:.2f} seconds')

    # Transform generator to list
    all_metrics = list(all_metrics)
    metrics_keys = ['SI-SDR', 'SI-SIR', 'SI-SAR', 'STOI', 'PESQ']

    # Compute & save stats
    compute_stats(metrics_keys=metrics_keys,
                  all_metrics=all_metrics,
                  all_snr_db=all_snr_db,
                  model_data_dir=model_data_dir,
                  confidence=confidence)
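
The worker compute_metrics_utt is not shown. A minimal sketch follows, assuming each worker reads the clean/mixture pair for one utterance and scores it; the file-naming scheme mirrors Example #2, and the pystoi/pesq calls match those packages' public APIs (the real worker presumably also computes SI-SDR, SI-SIR and SI-SAR). Note that ProcessPoolExecutor requires the usual if __name__ == '__main__': guard when this module is run as a script.

import os

import soundfile as sf
from pesq import pesq
from pystoi import stoi


def compute_metrics_utt(args):
    # One [file_path, snr_db] pair from the fused list built in main()
    file_path, snr_db = args
    base = processed_data_dir + os.path.splitext(file_path)[0]
    s_t, fs = sf.read(base + '_s.wav')  # clean speech (reference)
    x_t, _ = sf.read(base + '_x.wav')   # noisy mixture (degraded)
    return {'STOI': stoi(s_t, x_t, fs, extended=True),
            'PESQ': pesq(fs, s_t, x_t, 'wb')}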
Example #2
def main():
    # Load input SNR
    all_snr_db = read_dataset(processed_data_dir, dataset_type, 'snr_db')
    all_snr_db = np.array(all_snr_db)

    # Create file list
    file_paths = speech_list(input_speech_dir=input_speech_dir,
                             dataset_type=dataset_type)

    # 1 list per metric
    all_stoi = []
    all_pesq = []
    all_polqa = []
    all_f1score = []

    for i, file_path in tqdm(enumerate(file_paths), total=len(file_paths)):

        # Read files
        s_t, fs_s = sf.read(processed_data_dir +
                            os.path.splitext(file_path)[0] +
                            '_s.wav')  # clean speech
        n_t, fs_n = sf.read(processed_data_dir +
                            os.path.splitext(file_path)[0] + '_n.wav')  # noise
        x_t, fs_x = sf.read(processed_data_dir +
                            os.path.splitext(file_path)[0] +
                            '_x.wav')  # mixture

        # compute metrics

        ## ESTOI (extended STOI, selected via extended=True)
        stoi_s_hat = stoi(s_t, x_t, fs, extended=True)
        all_stoi.append(stoi_s_hat)

        ## PESQ
        pesq_s_hat = pesq(fs, s_t, x_t, 'wb')  # wb = wideband
        all_pesq.append(pesq_s_hat)

        ## POLQA
        # polqa_s_hat = polqa(s, s_t, fs)
        # all_polqa.append(polqa_s_hat)

        # TF representation
        n_tf = stft(n_t,
                    fs=fs,
                    wlen_sec=wlen_sec,
                    win=win,
                    hop_percent=hop_percent,
                    dtype=dtype)  # shape = (freq_bins, frames)

        s_tf = stft(s_t,
                    fs=fs,
                    wlen_sec=wlen_sec,
                    win=win,
                    hop_percent=hop_percent,
                    dtype=dtype)  # shape = (freq_bins, frames)

        # plots of target / estimation
        # TF representation
        x_tf = stft(x_t,
                    fs=fs,
                    wlen_sec=wlen_sec,
                    win=win,
                    hop_percent=hop_percent,
                    dtype=dtype)  # shape = (freq_bins, frames)

        # ## mixture signal (wav + spectro)
        # ## target signal (wav + spectro + mask)
        # ## estimated signal (wav + spectro + mask)
        # signal_list = [
        #     [x_t, x_tf, None], # mixture: (waveform, tf_signal, no mask)
        #     [s_t, s_tf, None], # clean speech
        #     [n_t, n_tf, None]
        # ]
        # fig = display_multiple_signals(signal_list,
        #                     fs=fs, vmin=vmin, vmax=vmax,
        #                     wlen_sec=wlen_sec, hop_percent=hop_percent,
        #                     xticks_sec=xticks_sec, fontsize=fontsize)

        # # put all metrics in the title of the figure
        # title = "Input SNR = {:.1f} dB \n" \
        #     "STOI = {:.2f}, " \
        #     "PESQ = {:.2f} \n" \
        #     "".format(all_snr_db[i], stoi_s_hat, pesq_s_hat)

        # fig.suptitle(title, fontsize=40)

        # # Save figure
        # fig.savefig(processed_data_dir + os.path.splitext(file_path)[0] + '_fig.png')

        # # Clear figure
        # plt.close()

    # Confidence interval
    metrics = {'SNR': all_snr_db, 'STOI': all_stoi, 'PESQ': all_pesq}

    stats = {}

    # Print the names of the columns.
    print("{:<10} {:<10} {:<10}".format('METRIC', 'AVERAGE', 'CONF. INT.'))
    for key, metric in metrics.items():
        m, h = mean_confidence_interval(metric, confidence=confidence)
        stats[key] = {'avg': m, '+/-': h}
        print("{:<10} {:<10} {:<10}".format(key, m, h))
    print('\n')

    # Save stats (SNR, STOI, PESQ, etc.)
    # Note: file_path here is the last path from the loop above
    with open(
            processed_data_dir + os.path.dirname(os.path.dirname(file_path)) +
            '/stats.json', 'w') as f:
        json.dump(stats, f)

    # Metrics by input SNR
    for snr_db in np.unique(all_snr_db):
        stats = {}

        print('Input SNR = {:.2f}'.format(snr_db))
        # Print the names of the columns.
        print("{:<10} {:<10} {:<10}".format('METRIC', 'AVERAGE', 'CONF. INT.'))
        for key, metric in metrics.items():
            subset_metric = np.array(metric)[np.where(all_snr_db == snr_db)]
            m, h = mean_confidence_interval(subset_metric,
                                            confidence=confidence)
            stats[key] = {'avg': m, '+/-': h}
            print("{:<10} {:<10} {:<10}".format(key, m, h))
        print('\n')

        # Save stats (SNR, STOI, PESQ, etc.)
        with open(
                processed_data_dir +
                os.path.dirname(os.path.dirname(file_path)) +
                '/stats_{:g}.json'.format(snr_db), 'w') as f:
            json.dump(stats, f)
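
mean_confidence_interval is likewise assumed; the standard SciPy recipe for a Student-t interval over per-file scores would be:

import numpy as np
import scipy.stats


def mean_confidence_interval(data, confidence=0.95):
    # Mean and half-width of the Student-t confidence interval
    a = np.asarray(data, dtype=float)
    n = len(a)
    m = np.mean(a)
    se = scipy.stats.sem(a)  # standard error of the mean
    h = se * scipy.stats.t.ppf((1 + confidence) / 2., n - 1)
    return m, h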
Example #3
def main():
    # Load input SNR
    all_snr_db = read_dataset(processed_data_dir, dataset_type, 'snr_db')
    all_snr_db = np.array(all_snr_db)

    device = torch.device("cuda" if cuda else "cpu")
    file = open('output.log', 'w')

    print('Torch version: {}'.format(torch.__version__))
    print('Device: %s' % (device))
    if torch.cuda.device_count() >= 1:
        print("Number GPUs: ", torch.cuda.device_count())

    # Create file list
    file_paths = speech_list(input_speech_dir=input_speech_dir,
                             dataset_type=dataset_type)

    model = VariationalAutoencoder([x_dim, z_dim, h_dim])
    model.load_state_dict(torch.load(model_data_path))
    if cuda: model = model.cuda()

    model.eval()
    for param in model.parameters():
        param.requires_grad = False

    for i, file_path in tqdm(enumerate(file_paths), total=len(file_paths)):
        
        # Read files
        s_t, fs_s = sf.read(processed_data_dir +
                            os.path.splitext(file_path)[0] +
                            '_s.wav')  # clean speech
        x_t, fs_x = sf.read(processed_data_dir +
                            os.path.splitext(file_path)[0] +
                            '_x.wav')  # mixture
        
        # x = x/np.max(x)
        T_orig = len(x_t)
        
        # TF representation
        # Input should be (frames, freq_bins)
        x_tf = stft(x_t,
                    fs=fs,
                    wlen_sec=wlen_sec,
                    win=win,
                    hop_percent=hop_percent,
                    dtype=dtype)
                        
        # Transpose to match PyTorch
        x_tf = x_tf.T # (frames, freq_bins)
        
        # Power spectrogram (transpose)
        x = torch.tensor(np.power(np.abs(x_tf), 2)).to(device)

        # Encode-decode
        reconstruction, _, _ = model(x)
        reconstruction = reconstruction.cpu().numpy()

        # plots of target / estimation
        s_tf = stft(s_t,
                    fs=fs,
                    wlen_sec=wlen_sec,
                    win=win,
                    hop_percent=hop_percent,
                    dtype=dtype)  # shape = (freq_bins, frames)

        # Transpose to match librosa.display
        reconstruction = reconstruction.T

        # Transform to dB
        x_psd = x.cpu().numpy().T
        x_psd = librosa.core.power_to_db(x_psd)

        s_psd = np.power(np.abs(s_tf), 2)
        s_psd = librosa.core.power_to_db(s_psd)

        reconstruction = librosa.core.power_to_db(reconstruction)

        ## mixture signal (wav + spectro)
        ## target signal (wav + spectro)
        ## estimated signal (spectro only)
        signal_list = [
            [x_t, x_psd],  # mixture: (waveform, spectrogram)
            [s_t, s_psd],  # clean speech
            [None, reconstruction]  # reconstruction (no waveform)
        ]
        #TODO: modify
        fig = display_multiple_spectro(signal_list,
                            fs=fs, vmin=vmin, vmax=vmax,
                            wlen_sec=wlen_sec, hop_percent=hop_percent,
                            xticks_sec=xticks_sec, fontsize=fontsize)
        
        # put all metrics in the title of the figure
        title = "Input SNR = {:.1f} dB \n" \
            "".format(all_snr_db[i])

        fig.suptitle(title, fontsize=40)

        # Save figure
        fig.savefig(output_data_dir + os.path.splitext(file_path)[0] + '_recon.png')
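
The stft helper shared by all of these examples is also not shown. A sketch with librosa, assuming wlen_sec and hop_percent translate to sample counts the way the surrounding comments suggest (64 ms at 16 kHz gives a 1024-sample window):

import librosa


def stft(x, fs=16000, wlen_sec=64e-3, win='hann',
         hop_percent=0.25, dtype='complex64'):
    nfft = int(wlen_sec * fs)      # window length in samples
    hop = int(hop_percent * nfft)  # hop size in samples
    # librosa returns shape (freq_bins, frames), matching the comments above
    return librosa.stft(x, n_fft=nfft, hop_length=hop,
                        win_length=nfft, window=win).astype(dtype)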
Example #4
def main():
    # Load input SNR
    all_snr_db = read_dataset(processed_data_dir, dataset_type, 'snr_db')
    all_snr_db = np.array(all_snr_db)

    device = torch.device("cuda" if cuda else "cpu")
    file = open('output.log', 'w')

    print('Torch version: {}'.format(torch.__version__))
    print('Device: %s' % (device))
    if torch.cuda.device_count() >= 1:
        print("Number GPUs: ", torch.cuda.device_count())

    # Create file list
    file_paths = speech_list(input_speech_dir=input_speech_dir,
                             dataset_type=dataset_type)

    classifier = Classifier([x_dim, h_dim_cl, y_dim], batch_norm=batch_norm)
    classifier.load_state_dict(
        torch.load(classif_dir, map_location=cuda_device))
    if cuda: classifier = classifier.cuda()

    classifier.eval()
    for param in classifier.parameters():
        param.requires_grad = False

    for i, file_path in tqdm(enumerate(file_paths), total=len(file_paths)):

        # Read files
        s_t, fs_s = sf.read(processed_data_dir +
                            os.path.splitext(file_path)[0] +
                            '_s.wav')  # clean speech
        x_t, fs_x = sf.read(processed_data_dir +
                            os.path.splitext(file_path)[0] +
                            '_x.wav')  # mixture

        # x = x/np.max(x)
        T_orig = len(x_t)

        # TF representation
        # Input should be (frames, freq_bins)
        x_tf = stft(x_t,
                    fs=fs,
                    wlen_sec=wlen_sec,
                    win=win,
                    hop_percent=hop_percent,
                    dtype=dtype)

        # Transpose to match PyTorch
        x_tf = x_tf.T  # (frames, freq_bins)

        # Power spectrogram (transpose)
        x = torch.tensor(np.power(np.abs(x_tf), 2)).to(device)

        # Normalize power spectrogram
        if std_norm:
            x -= mean.T
            x /= (std + eps).T

        # Classify
        y_hat_soft = classifier(x)
        y_hat_hard = (y_hat_soft > 0.5).int()
        y_hat_hard = y_hat_hard.cpu().numpy()
        y_hat_hard = y_hat_hard.T  # Transpose to match librosa.display

        # plots of target / estimation
        s_tf = stft(s_t,
                    fs=fs,
                    wlen_sec=wlen_sec,
                    win=win,
                    hop_percent=hop_percent,
                    dtype=dtype)  # shape = (freq_bins, frames)

        if labels == 'labels':
            # binary mask
            target = clean_speech_IBM(s_tf,
                                      quantile_fraction=quantile_fraction,
                                      quantile_weight=quantile_weight)

        elif labels == 'vad_labels':
            # vad
            target = clean_speech_VAD(s_tf,
                                      quantile_fraction=quantile_fraction,
                                      quantile_weight=quantile_weight)

        # Transpose to match librosa.display
        x_tf = x_tf.T

        # F1-score
        f1score_s_hat = f1_score(target.flatten(),
                                 y_hat_hard.flatten(),
                                 average="binary")

        ## mixture signal (wav + spectro)
        ## target signal (wav + spectro + mask)
        ## estimated signal (wav + spectro + mask)
        signal_list = [
            [x_t, x_tf, None],  # mixture: (waveform, tf_signal, no mask)
            [s_t, s_tf, target],  # clean speech
            [None, None, y_hat_hard]
            #[None, None, y_hat_soft]
        ]
        #TODO: modify
        fig = display_multiple_signals(signal_list,
                                       fs=fs,
                                       vmin=vmin,
                                       vmax=vmax,
                                       wlen_sec=wlen_sec,
                                       hop_percent=hop_percent,
                                       xticks_sec=xticks_sec,
                                       fontsize=fontsize)

        # put all metrics in the title of the figure
        title = "Input SNR = {:.1f} dB \n" \
            "F1-score = {:.3f} \n".format(all_snr_db[i], f1score_s_hat)

        fig.suptitle(title, fontsize=40)

        # Save figure
        output_path = classif_data_dir + file_path
        output_path = os.path.splitext(output_path)[0]

        if not os.path.exists(os.path.dirname(output_path)):
            os.makedirs(os.path.dirname(output_path))

        fig.savefig(output_path + '_hard_mask.png')
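
clean_speech_IBM and clean_speech_VAD are assumed helpers; the quantile_fraction/quantile_weight parameter names suggest a Lorenz-function quantile mask in the style of Heymann et al.'s nn-gev code, sketched here:

import numpy as np


def clean_speech_IBM(s_tf, quantile_fraction=0.98, quantile_weight=0.999):
    # Keep the loudest TF bins that carry `quantile_fraction` of the power
    power = np.abs(s_tf) ** 2
    sorted_power = np.sort(power, axis=None)[::-1]
    lorenz = np.cumsum(sorted_power) / np.sum(sorted_power)
    threshold = np.min(sorted_power[lorenz < quantile_fraction])
    mask = (power > threshold).astype(np.float32)
    # nn-gev softens this as 0.5 + quantile_weight * (mask - 0.5) for training
    # targets; the hard {0, 1} mask is returned here so it can be scored
    # directly with sklearn's f1_score as in the examples.
    return mask

A clean_speech_VAD variant could reduce the same mask over the frequency axis, e.g. mask.any(axis=0, keepdims=True).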
Example #5
def test_write_read_labels(dataset_type):
    # Parameters
    ## Dataset
    input_speech_dir = 'data/subset/raw/'
    output_speech_dir = 'data/subset/processed/'

    output_data_dir = 'data/subset/pickle/'
    fs = int(16e3)  # Sampling rate

    ## STFT
    wlen_sec = 64e-3  # window length in seconds
    hop_percent = 0.25  # hop size as a percentage of the window length
    win = 'hann'  # type of window
    dtype = 'complex64'

    ## Ideal binary mask
    quantile_fraction = 0.98
    quantile_weight = 0.999

    # Create file list
    file_paths = speech_list(input_speech_dir=input_speech_dir,
                             dataset_type=dataset_type)

    labels = []

    for path in file_paths:

        x, fs_x = sf.read(input_speech_dir + path, samplerate=None)

        # Cut burst at beginning of file
        x[:int(0.1 * fs)] = x[int(0.1 * fs):int(0.2 * fs)]

        # Normalize audio
        x = x / (np.max(np.abs(x)))
        #x = x/(np.max(np.abs(x)) + 2)
        #x = x/np.linalg.norm(x)

        if fs != fs_x:
            raise ValueError('Unexpected sampling rate')

        # TF representation
        x_tf = stft(x,
                    fs=fs,
                    wlen_sec=wlen_sec,
                    win=win,
                    hop_percent=hop_percent,
                    dtype=dtype)

        # binary mask
        x_ibm = clean_speech_IBM(x_tf,
                                 quantile_fraction=quantile_fraction,
                                 quantile_weight=quantile_weight)

        labels.append(x_ibm)

    labels = np.concatenate(labels, axis=1)
    #labels = labels[1]

    # write spectrograms
    write_dataset(labels,
                  output_data_dir=output_data_dir,
                  dataset_type=dataset_type,
                  suffix='labels')

    # Read pickle
    pickle_labels = read_dataset(data_dir=output_data_dir,
                                 dataset_type=dataset_type,
                                 suffix='labels')

    # Assert stored data is same as spectrograms
    assert_array_equal(labels, pickle_labels)
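
write_dataset/read_dataset are assumed to be a thin pickle round-trip keyed by dataset type and suffix; a sketch (the file-naming convention is a guess):

import os
import pickle


def write_dataset(data, output_data_dir, dataset_type, suffix):
    path = output_data_dir + dataset_type + '_' + suffix + '.p'
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, 'wb') as f:
        pickle.dump(data, f)


def read_dataset(data_dir, dataset_type, suffix):
    with open(data_dir + dataset_type + '_' + suffix + '.p', 'rb') as f:
        return pickle.load(f)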
Example #6
def test_write_read_frames(dataset_type):
    # Parameters
    ## Dataset
    input_speech_dir = 'data/subset/raw/'
    output_speech_dir = 'data/subset/processed/'

    output_data_dir = 'data/subset/pickle/'
    fs = int(16e3)  # Sampling rate

    ## STFT
    wlen_sec = 64e-3  # window length in seconds
    hop_percent = 0.25  # hop size as a percentage of the window length
    win = 'hann'  # type of window
    dtype = 'complex64'

    # Create file list
    file_paths = speech_list(input_speech_dir=input_speech_dir,
                             dataset_type=dataset_type)

    spectrograms = []

    for path in file_paths:

        x, fs_x = sf.read(input_speech_dir + path, samplerate=None)

        # Cut burst at beginning of file
        #x[:int(0.1*fs)] = x[int(0.1*fs):int(0.2*fs)]
        x = x[int(0.1 * fs):]

        # Normalize audio
        x = x / (np.max(np.abs(x)))
        #x = x/(np.max(np.abs(x)) + 2)
        #x = x/np.linalg.norm(x)

        if not os.path.exists(os.path.dirname(output_speech_dir + path)):
            os.makedirs(os.path.dirname(output_speech_dir + path))
        sf.write(output_speech_dir + path, x, fs_x)

        if fs != fs_x:
            raise ValueError('Unexpected sampling rate')

        # TF representation
        x_tf = stft(x,
                    fs=fs,
                    wlen_sec=wlen_sec,
                    win=win,
                    hop_percent=hop_percent,
                    dtype=dtype)

        spectrograms.append(np.power(abs(x_tf), 2))

    spectrograms = np.concatenate(spectrograms, axis=1)
    #spectrograms = spectrograms[1]

    # write spectrograms
    write_dataset(spectrograms,
                  output_data_dir=output_data_dir,
                  dataset_type=dataset_type,
                  suffix='frames')

    # Read pickle
    pickle_spectrograms = read_dataset(data_dir=output_data_dir,
                                       dataset_type=dataset_type,
                                       suffix='frames')

    # Assert stored data is same as spectrograms
    assert_array_equal(spectrograms, pickle_spectrograms)
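
speech_list is assumed to walk the raw-speech directory and return .wav paths relative to it (the directory layout is a guess):

import os


def speech_list(input_speech_dir, dataset_type='test'):
    # Collect .wav paths under e.g. data/subset/raw/test/, relative to the root
    file_paths = []
    for dirpath, _, filenames in os.walk(os.path.join(input_speech_dir,
                                                      dataset_type)):
        for name in filenames:
            if name.endswith('.wav'):
                file_paths.append(
                    os.path.relpath(os.path.join(dirpath, name),
                                    input_speech_dir))
    return sorted(file_paths)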
Example #7
def main():
    # Load input SNR
    all_snr_db = read_dataset(processed_data_dir, dataset_type, 'snr_db')
    all_snr_db = np.array(all_snr_db)

    # Create file list
    file_paths = speech_list(input_speech_dir=input_speech_dir,
                             dataset_type=dataset_type)

    for i, file_path in tqdm(enumerate(file_paths), total=len(file_paths)):

        # Read files
        s_t, fs_s = sf.read(processed_data_dir +
                            os.path.splitext(file_path)[0] +
                            '_s.wav')  # clean speech
        x_t, fs_x = sf.read(processed_data_dir +
                            os.path.splitext(file_path)[0] +
                            '_x.wav')  # mixture

        # x = x/np.max(x)
        T_orig = len(x_t)

        # TF representation
        # Input should be (frames, freq_bins)
        x_tf = stft(x_t,
                    fs=fs,
                    wlen_sec=wlen_sec,
                    win=win,
                    hop_percent=hop_percent,
                    dtype=dtype)

        # Power spectrogram
        x = np.power(np.abs(x_tf), 2)

        # Estimate mask
        y_hat_soft = timo_mask_estimation(x)
        y_hat_hard = (y_hat_soft > 0.5).astype(int)

        # plots of target / estimation
        s_tf = stft(s_t,
                    fs=fs,
                    wlen_sec=wlen_sec,
                    win=win,
                    hop_percent=hop_percent,
                    dtype=dtype)  # shape = (freq_bins, frames)

        # binary mask
        s_ibm = clean_speech_IBM(s_tf,
                                 quantile_fraction=quantile_fraction,
                                 quantile_weight=quantile_weight)

        # F1-score
        f1score_s_hat = f1_score(s_ibm.flatten(),
                                 y_hat_hard.flatten(),
                                 average="binary")

        ## mixture signal (wav + spectro)
        ## target signal (wav + spectro + mask)
        ## estimated signal (wav + spectro + mask)
        signal_list = [
            [x_t, x_tf, None],  # mixture: (waveform, tf_signal, no mask)
            [s_t, s_tf, s_ibm],  # clean speech
            #[None, None, y_hat_hard]
            [None, None, y_hat_soft]
        ]
        #TODO: modify
        fig = display_multiple_signals(signal_list,
                                       fs=fs,
                                       vmin=vmin,
                                       vmax=vmax,
                                       wlen_sec=wlen_sec,
                                       hop_percent=hop_percent,
                                       xticks_sec=xticks_sec,
                                       fontsize=fontsize)

        # put all metrics in the title of the figure
        title = "Input SNR = {:.1f} dB \n" \
            "".format(all_snr_db[i])

        fig.suptitle(title, fontsize=40)

        # Save figure
        output_path = model_data_dir + file_path
        output_path = os.path.splitext(output_path)[0]

        if not os.path.exists(os.path.dirname(output_path)):
            os.makedirs(os.path.dirname(output_path))

        fig.savefig(output_path + '_soft_mask.png')

        ## mixture signal (wav + spectro)
        ## target signal (wav + spectro + mask)
        ## estimated signal (wav + spectro + mask)
        signal_list = [
            [x_t, x_tf, None],  # mixture: (waveform, tf_signal, no mask)
            [s_t, s_tf, s_ibm],  # clean speech
            #[None, None, y_hat_hard]
            [None, None, y_hat_hard]
        ]
        #TODO: modify
        fig = display_multiple_signals(signal_list,
                                       fs=fs,
                                       vmin=vmin,
                                       vmax=vmax,
                                       wlen_sec=wlen_sec,
                                       hop_percent=hop_percent,
                                       xticks_sec=xticks_sec,
                                       fontsize=fontsize)

        # put all metrics in the title of the figure
        title = "Input SNR = {:.1f} dB \n" \
            "F1-score = {:.3f} \n".format(all_snr_db[i], f1score_s_hat)

        fig.suptitle(title, fontsize=40)

        # Save figure
        output_path = model_data_dir + file_path
        output_path = os.path.splitext(output_path)[0]

        if not os.path.exists(os.path.dirname(output_path)):
            os.makedirs(os.path.dirname(output_path))

        fig.savefig(output_path + '_hard_mask.png')
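
Example #7 binarizes the soft mask at a fixed 0.5 before scoring. A hypothetical helper for sweeping that threshold against the IBM target, built on the same sklearn f1_score call used above:

import numpy as np
from sklearn.metrics import f1_score


def best_threshold(y_soft, target, thresholds=np.linspace(0.1, 0.9, 17)):
    # Return the binarization threshold that maximizes F1 against the target
    scores = [f1_score(target.flatten(),
                       (y_soft > t).astype(int).flatten(),
                       average="binary")
              for t in thresholds]
    best = int(np.argmax(scores))
    return thresholds[best], scores[best]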