Example #1
def test_clean_speech_IBM():
    """
    Check that the IBM computed from a 'complex64' spectrogram is a 'float32' mask.
    """
    ## STFT parameters
    wlen_sec = 80e-3 # window length in seconds
    hop_percent = 0.25  # hop size as a percentage of the window length
    win = 'hann' # type of window
    dtype = 'complex64'

    # Librosa examples
    # # AVAILABLE EXAMPLES
    # # --------------------------------------------------------------------
    # # brahms    	Brahms - Hungarian Dance #5
    # # choice    	Admiral Bob - Choice (drum+bass)
    # # fishin    	Karissa Hobbs - Let's Go Fishin'
    # # nutcracker	Tchaikovsky - Dance of the Sugar Plum Fairy
    # # trumpet   	Mihai Sorohan - Trumpet loop
    # # vibeace   	Kevin MacLeod - Vibe Ace

    # Take example signal from Librosa
    audio_path = example('brahms')
    x, fs_x = sf.read(audio_path)
    x_len = len(x)

    ## Ideal binary mask
    quantile_fraction = 0.98
    quantile_weight = 0.999

    # STFT
    x_tf = stft(x,
                fs=fs_x,
                wlen_sec=wlen_sec,
                win=win,
                hop_percent=hop_percent,
                dtype=dtype)
    
    # binary mask
    x_ibm = clean_speech_IBM(x_tf,
                             quantile_fraction=quantile_fraction,
                             quantile_weight=quantile_weight)
    
    assert x_ibm.dtype == 'float32'
    assert np.unique(x_ibm).tolist() == [0., 1.]

#TODO: take masks from Heymann GitHub
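
The test only pins down the contract of clean_speech_IBM: it takes a 'complex64' STFT and returns a 'float32' mask whose values are exactly 0 and 1. A rough, hypothetical sketch of a quantile-thresholded IBM with that contract (the thresholding below is a guess for illustration; the repository's implementation, and the role of quantile_weight in it, may differ):

import numpy as np

def quantile_ibm_sketch(s_tf, quantile_fraction=0.98):
    """Hypothetical quantile-based ideal binary mask (illustration only)."""
    power = np.abs(s_tf) ** 2                          # (freq_bins, frames)
    threshold = np.quantile(power, quantile_fraction)  # keep only the loudest TF bins
    return (power > threshold).astype('float32')       # hard 0/1 mask, float32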
Example #2
def main():

    if not os.path.exists(os.path.dirname(output_dataset_file)):
        os.makedirs(os.path.dirname(output_dataset_file))

    with h5.File(output_dataset_file,
                 'a',
                 rdcc_nbytes=rdcc_nbytes,
                 rdcc_nslots=rdcc_nslots) as f:

        # STFT attributes
        f.attrs['fs'] = fs
        f.attrs['wlen_sec'] = wlen_sec
        f.attrs['hop_percent'] = hop_percent
        f.attrs['win'] = win
        f.attrs['dtype'] = dtype

        # label attributes
        f.attrs['quantile_fraction'] = quantile_fraction
        f.attrs['quantile_weight'] = quantile_weight

        # HDF5 attributes
        f.attrs['X_chunks'] = X_chunks
        f.attrs['Y_chunks'] = Y_chunks
        f.attrs['compression'] = compression

        for dataset_type in dataset_types:

            # Create file list
            file_paths = speech_list(input_speech_dir=input_speech_dir,
                                     dataset_type=dataset_type)

            if dataset_type == 'train':
                noise_types = [
                    'domestic', 'nature', 'office', 'transportation'
                ]
            if dataset_type == 'validation':
                noise_types = ['nature', 'office', 'public', 'transportation']

            # Create SNR list
            np.random.seed(0)
            noise_index = np.random.randint(len(noise_types),
                                            size=len(file_paths))
            snrs = [-5, -2.5, 0, 2.5, 5.0]
            snrs_index = np.random.randint(len(snrs), size=len(file_paths))

            # Create noise_audios from processed noise files
            preprocessed_noise_paths = noise_list_preprocessed(
                preprocessed_noise_dir=output_noise_dir,
                dataset_type=dataset_type)
            noise_audios = {}

            # Load the noise files
            for noise_type, preprocessed_noise_path in preprocessed_noise_paths.items():

                # If the noise is already preprocessed, read the file directly
                if os.path.exists(preprocessed_noise_path):

                    noise_audio, fs_noise = sf.read(preprocessed_noise_path)

                    if fs != fs_noise:
                        raise ValueError(
                            'Unexpected sampling rate. Did you preprocess the 16kHz version of the DEMAND database?'
                        )

                    noise_audios[noise_type] = noise_audio

            # Init list of SNR
            all_snr_dB = []

            # Delete datasets if they already exist
            if 'X_' + dataset_type in f:
                del f['X_' + dataset_type]
                del f['Y_' + dataset_type]

            # The exact shape of the dataset is unfortunately unknown in advance
            # (writing is faster when the shape is known)
            # The chunk size corresponds to one spectrogram frame
            f.create_dataset('X_' + dataset_type,
                             shape=X_shape,
                             dtype='float32',
                             maxshape=X_maxshape,
                             chunks=X_chunks,
                             compression=compression,
                             shuffle=shuffle)
            f.create_dataset('Y_' + dataset_type,
                             shape=Y_shape,
                             dtype='float32',
                             maxshape=Y_maxshape,
                             chunks=Y_chunks,
                             compression=compression,
                             shuffle=shuffle)

            # Store dataset in variables for faster I/O
            fx = f['X_' + dataset_type]
            fy = f['Y_' + dataset_type]

            # Compute mean, std of the train set
            if dataset_type == 'train':
                # VAR = E[X**2] - E[X]**2
                channels_sum, channels_squared_sum = 0., 0.

            # Loop over the speech files
            for i, file_path in tqdm(enumerate(file_paths)):

                speech, fs_speech = sf.read(input_speech_dir + file_path,
                                            samplerate=None)

                # Cut burst at beginning of file
                speech = speech[int(0.1 * fs):]

                # Normalize audio
                speech = speech / (np.max(np.abs(speech)))

                if fs != fs_speech:
                    raise ValueError('Unexpected sampling rate')

                # Select noise_type
                noise_type = noise_types[noise_index[i]]

                # Extract noise segment
                noise = noise_segment(noise_audios, noise_type, speech)

                # Select SNR
                snr_dB = snrs[snrs_index[i]]
                all_snr_dB.append(snr_dB)

                # Compute noise gain
                speech_power = np.sum(np.power(speech, 2))
                noise_power = np.sum(np.power(noise, 2))
                noise_power_target = speech_power * np.power(10, -snr_dB / 10)
                k = noise_power_target / noise_power
                noise = noise * np.sqrt(k)

                mixture = speech + noise

                # # Normalize by max of speech, noise, speech+noise
                # norm = np.max(abs(np.concatenate([speech, noise, speech+noise])))
                # mixture = (speech+noise) / norm
                # speech /= norm
                # noise /= norm

                if dataset_size == 'subset':
                    # Save .wav files, just to check that it is working
                    output_path = output_wav_dir + file_path
                    output_path = os.path.splitext(output_path)[0]

                    if not os.path.exists(os.path.dirname(output_path)):
                        os.makedirs(os.path.dirname(output_path))

                    sf.write(output_path + '_s.wav', speech, fs)
                    sf.write(output_path + '_n.wav', noise, fs)
                    sf.write(output_path + '_x.wav', mixture, fs)

                # TF representation
                mixture_tf = stft(mixture,
                                  fs=fs,
                                  wlen_sec=wlen_sec,
                                  win=win,
                                  hop_percent=hop_percent,
                                  dtype=dtype)

                noisy_spectrogram = np.power(abs(mixture_tf), 2)

                # TF representation
                speech_tf = stft(speech,
                                 fs=fs,
                                 wlen_sec=wlen_sec,
                                 win=win,
                                 hop_percent=hop_percent,
                                 dtype=dtype)

                if labels == 'noisy_wiener_labels':
                    # TF representation
                    noise_tf = stft(noise,
                                    fs=fs,
                                    wlen_sec=wlen_sec,
                                    win=win,
                                    hop_percent=hop_percent,
                                    dtype=dtype)

                    # wiener mask
                    speech_wiener_mask = ideal_wiener_mask(
                        speech_tf, noise_tf, eps)
                    label = speech_wiener_mask

                if labels == 'noisy_labels':
                    # binary mask
                    speech_ibm = clean_speech_IBM(
                        speech_tf,
                        quantile_fraction=quantile_fraction,
                        quantile_weight=quantile_weight)
                    label = speech_ibm

                if labels == 'noisy_vad_labels':
                    # binary mask
                    speech_vad = clean_speech_VAD(
                        speech_tf,
                        quantile_fraction=quantile_fraction,
                        quantile_weight=quantile_weight)
                    label = speech_vad

                # Compute mean, std
                if dataset_type == 'train':
                    # VAR = E[X**2] - E[X]**2
                    channels_sum += np.sum(noisy_spectrogram, axis=-1)
                    channels_squared_sum += np.sum(noisy_spectrogram**2,
                                                   axis=-1)

                # Store spectrogram in dataset
                fx.resize((fx.shape[1] + noisy_spectrogram.shape[1]), axis=1)
                fx[:, -noisy_spectrogram.shape[1]:] = noisy_spectrogram

                # Store label in dataset
                fy.resize((fy.shape[1] + label.shape[1]), axis=1)
                fy[:, -label.shape[1]:] = label

            # Compute and save mean, std
            if dataset_type == 'train':
                print('Compute mean and std')
                # NB: compute the unbiased sample std (not the population std)
                n_samples = fx.shape[1]
                mean = channels_sum / n_samples
                std = np.sqrt((1 / (n_samples - 1)) *
                              (channels_squared_sum - n_samples * mean**2))

                # Delete datasets if they already exist
                if 'X_' + dataset_type + '_mean' in f:
                    del f['X_' + dataset_type + '_mean']
                    del f['X_' + dataset_type + '_std']

                f.create_dataset('X_' + dataset_type + '_mean',
                                 shape=X_chunks,
                                 dtype='float32',
                                 maxshape=X_chunks,
                                 chunks=None,
                                 compression=compression,
                                 shuffle=shuffle)
                f.create_dataset('X_' + dataset_type + '_std',
                                 shape=X_chunks,
                                 dtype='float32',
                                 maxshape=X_chunks,
                                 chunks=None,
                                 compression=compression,
                                 shuffle=shuffle)

                f['X_' + dataset_type +
                  '_mean'][:] = mean[..., None]  # Add axis to fit chunks shape
                f['X_' + dataset_type +
                  '_std'][:] = std[..., None]  # Add axis to fit chunks shape
                print('Mean and std saved in HDF5.')

        # TODO: save SNR, level_s, level_n in 1 big csv
        write_dataset(all_snr_dB, output_wav_dir, dataset_type, 'snr_db')
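
The noise gain above is chosen so that the mixture hits the sampled SNR: the noise power is scaled to speech_power * 10^(-SNR/10). A small standalone check of that formula on toy signals (mix_at_snr is a hypothetical helper, not part of this script):

import numpy as np

def mix_at_snr(speech, noise, snr_dB):
    # Scale the noise so that 10*log10(speech_power / noise_power) == snr_dB
    speech_power = np.sum(speech ** 2)
    noise_power = np.sum(noise ** 2)
    k = (speech_power * 10 ** (-snr_dB / 10)) / noise_power
    return speech + np.sqrt(k) * noise

rng = np.random.default_rng(0)
speech = rng.standard_normal(16000)
noise = rng.standard_normal(16000)
mixture = mix_at_snr(speech, noise, snr_dB=5.0)
scaled_noise = mixture - speech
print(10 * np.log10(np.sum(speech ** 2) / np.sum(scaled_noise ** 2)))  # ~5.0 dB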
Example #3
def process_utt(mcem, model, classifier, mean, std, file_path, device):
    
    # Input
    x_t, fs_x = sf.read(processed_data_dir + os.path.splitext(file_path)[0] + '_x.wav') # mixture
    T_orig = len(x_t)
    x_tf = stft(x_t,
                fs=fs,
                wlen_sec=wlen_sec,
                win=win,
                hop_percent=hop_percent,
                dtype=dtype) # (freq_bins, frames)
    
    # Transpose to match PyTorch
    x_tf = x_tf.T # (frames, freq_bins)

    x = torch.tensor(np.power(np.abs(x_tf), 2), device=device)

    # Target
    s_t, fs_s = sf.read(processed_data_dir + os.path.splitext(file_path)[0] + '_s.wav') # clean speech
    s_tf = stft(s_t,
                fs=fs,
                wlen_sec=wlen_sec,
                win=win,
                hop_percent=hop_percent,
                dtype=dtype) # (freq_bins, frames)

    if classif_type == 'dnn':    
        # Normalize power spectrogram
        if std_norm:
            x_norm = x - mean.T
            x_norm /= (std + eps).T

            y_hat_soft = classifier(x_norm) 
        else:
            y_hat_soft = classifier(x)   
        y_hat_hard = (y_hat_soft > 0.5).float()

    if classif_type == 'oracle':
        y_hat_soft = clean_speech_IBM(s_tf, quantile_fraction=quantile_fraction, quantile_weight=quantile_weight)
        y_hat_hard = torch.from_numpy(y_hat_soft.T).to(device)

    if classif_type == 'timo':
        x_numpy = np.power(np.abs(x_tf), 2)
        y_hat_soft = timo_mask_estimation(x_numpy.T)
        y_hat_hard = (y_hat_soft > 0.5).astype(int)
        y_hat_hard = y_hat_hard.T # (frames, freq_bins)
        y_hat_hard = torch.tensor(y_hat_hard).to(device)
    
    # Init MCEM
    mcem.init_parameters(X=x_tf,
                         y=y_hat_hard,
                         vae=model,
                         nmf_rank=nmf_rank,
                         eps=eps,
                         device=device)

    cost = mcem.run()

    S_hat = mcem.S_hat #+ np.finfo(np.float32).eps
    N_hat = mcem.N_hat #+ np.finfo(np.float32).eps

    s_hat = istft(S_hat, fs=fs, wlen_sec=wlen_sec, win=win, hop_percent=hop_percent, max_len=T_orig)
    n_hat = istft(N_hat, fs=fs, wlen_sec=wlen_sec, win=win, hop_percent=hop_percent, max_len=T_orig)

    # Save .wav files
    output_path = output_data_dir + file_path
    output_path = os.path.splitext(output_path)[0]

    if not os.path.exists(os.path.dirname(output_path)):
        os.makedirs(os.path.dirname(output_path))
    
    sf.write(output_path + '_s_est.wav', s_hat, fs)
    sf.write(output_path + '_n_est.wav', n_hat, fs)
    
    # Save binary mask
    torch.save(y_hat_soft, output_path + '_ibm_soft_est.pt')
    torch.save(y_hat_hard, output_path + '_ibm_hard_est.pt')
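
All of these scripts share the same STFT parameterization (wlen_sec, hop_percent, win, dtype). As a back-of-the-envelope illustration of what those settings mean in samples, assuming fs = 16 kHz and wlen_sec = 64e-3 as in the test scripts (values chosen only for illustration):

fs = 16000          # sampling rate in Hz
wlen_sec = 64e-3    # window length in seconds
hop_percent = 0.25  # hop size as a fraction of the window length

nfft = int(wlen_sec * fs)      # 1024 samples per analysis window
hop = int(hop_percent * nfft)  # 256 samples between consecutive frames
print(nfft, hop)               # -> 1024 256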
Example #4
def main():
    # Load input SNR
    all_snr_db = read_dataset(processed_data_dir, dataset_type, 'snr_db')
    all_snr_db = np.array(all_snr_db)

    device = torch.device("cuda" if cuda else "cpu")
    file = open('output.log', 'w')

    print('Torch version: {}'.format(torch.__version__))
    print('Device: %s' % (device))
    if torch.cuda.device_count() >= 1:
        print("Number GPUs: ", torch.cuda.device_count())

    # Create file list
    file_paths = speech_list(input_speech_dir=input_speech_dir,
                             dataset_type=dataset_type)

    classifier = Classifier([x_dim, h_dim_cl, y_dim], batch_norm=batch_norm)
    classifier.load_state_dict(
        torch.load(classif_dir, map_location=cuda_device))
    if cuda: classifier = classifier.cuda()

    classifier.eval()
    for param in classifier.parameters():
        param.requires_grad = False

    for i, file_path in tqdm(enumerate(file_paths)):

        # Read files
        s_t, fs_s = sf.read(processed_data_dir +
                            os.path.splitext(file_path)[0] +
                            '_s.wav')  # clean speech
        x_t, fs_x = sf.read(processed_data_dir +
                            os.path.splitext(file_path)[0] +
                            '_x.wav')  # mixture

        # x = x/np.max(x)
        T_orig = len(x_t)

        # TF representation
        # Input should be (frames, freq_bins)
        x_tf = stft(x_t,
                    fs=fs,
                    wlen_sec=wlen_sec,
                    win=win,
                    hop_percent=hop_percent,
                    dtype=dtype)

        # Transpose to match PyTorch
        x_tf = x_tf.T  # (frames, freq_bins)

        # Power spectrogram (transpose)
        x = torch.tensor(np.power(np.abs(x_tf), 2)).to(device)

        # Normalize power spectrogram
        if std_norm:
            x -= mean.T
            x /= (std + eps).T

        # Classify
        y_hat_soft = classifier(x)
        y_hat_hard = (y_hat_soft > 0.5).int()
        y_hat_hard = y_hat_hard.cpu().numpy()
        y_hat_hard = y_hat_hard.T  # Transpose to match librosa.display

        # plots of target / estimation
        s_tf = stft(s_t,
                    fs=fs,
                    wlen_sec=wlen_sec,
                    win=win,
                    hop_percent=hop_percent,
                    dtype=dtype)  # shape = (freq_bins, frames)

        if labels == 'labels':
            # binary mask
            target = clean_speech_IBM(s_tf,
                                      quantile_fraction=quantile_fraction,
                                      quantile_weight=quantile_weight)

        if labels == 'vad_labels':
            # vad
            target = clean_speech_VAD(s_tf,
                                      quantile_fraction=quantile_fraction,
                                      quantile_weight=quantile_weight)

        # Transpose to match librosa.display
        x_tf = x_tf.T

        # F1-score
        f1score_s_hat = f1_score(target.flatten(),
                                 y_hat_hard.flatten(),
                                 average="binary")

        ## mixture signal (wav + spectro)
        ## target signal (wav + spectro + mask)
        ## estimated signal (wav + spectro + mask)
        signal_list = [
            [x_t, x_tf, None],  # mixture: (waveform, tf_signal, no mask)
            [s_t, s_tf, target],  # clean speech
            [None, None, y_hat_hard]
            #[None, None, y_hat_soft]
        ]
        #TODO: modify
        fig = display_multiple_signals(signal_list,
                                       fs=fs,
                                       vmin=vmin,
                                       vmax=vmax,
                                       wlen_sec=wlen_sec,
                                       hop_percent=hop_percent,
                                       xticks_sec=xticks_sec,
                                       fontsize=fontsize)

        # put all metrics in the title of the figure
        title = "Input SNR = {:.1f} dB \n" \
            "F1-score = {:.3f} \n".format(all_snr_db[i], f1score_s_hat)

        fig.suptitle(title, fontsize=40)

        # Save figure
        output_path = classif_data_dir + file_path
        output_path = os.path.splitext(output_path)[0]

        if not os.path.exists(os.path.dirname(output_path)):
            os.makedirs(os.path.dirname(output_path))

        fig.savefig(output_path + '_hard_mask.png')
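
The F1-score above compares the flattened oracle mask against the flattened hard estimate. A toy example of the same sklearn call, just to make the expected input format explicit (the arrays here are made up):

import numpy as np
from sklearn.metrics import f1_score

target = np.array([[1, 0, 1],
                   [0, 0, 1]])    # oracle binary mask (freq_bins, frames)
estimate = np.array([[1, 0, 0],
                     [0, 1, 1]])  # estimated binary mask

print(f1_score(target.flatten(), estimate.flatten(), average="binary"))  # ~0.67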
Example #5
def compute_metrics_utt(args):
    # Separate args
    file_path, snr_db = args[0], args[1]

    # Read files
    s_t, fs_s = sf.read(processed_data_dir + os.path.splitext(file_path)[0] + '_s.wav') # clean speech
    n_t, fs_n = sf.read(processed_data_dir + os.path.splitext(file_path)[0] + '_n.wav') # noise
    x_t, fs_x = sf.read(processed_data_dir + os.path.splitext(file_path)[0] + '_x.wav') # mixture
    s_hat_t, fs_s_hat = sf.read(model_data_dir + os.path.splitext(file_path)[0] + '_s_est.wav') # est. speech

    # compute metrics
    ## SI-SDR, SI-SAR, SI-SNR
    si_sdr, si_sir, si_sar = energy_ratios(s_hat=s_hat_t, s=s_t, n=n_t)

    ## STOI (or ESTOI?)
    stoi_s_hat = stoi(s_t, s_hat_t, fs, extended=True)

    ## PESQ
    pesq_s_hat = pesq(fs, s_t, s_hat_t, 'wb') # wb = wideband
    
    ## POLQA
    # polqa_s_hat = polqa(s, s_t, fs)
    # all_polqa.append(polqa_s_hat)

    ## F1 score
    # ideal binary mask
    y_hat_hard = torch.load(model_data_dir + os.path.splitext(file_path)[0] + '_ibm_hard_est.pt', map_location=lambda storage, location: storage) # shape = (frames, freq_bins)
    # y_hat_hard = torch.load(model_data_dir + os.path.splitext(file_path)[0] + '_ibm_soft_est.pt', map_location=lambda storage, location: storage) # shape = (frames, freq_bins)
    y_hat_hard = y_hat_hard.T # Transpose to match target y, shape = (freq_bins, frames)

    # TF representation
    s_tf = stft(s_t,
                fs=fs,
                wlen_sec=wlen_sec,
                win=win,
                hop_percent=hop_percent,
                dtype=dtype) # shape = (freq_bins, frames)

    if labels == 'labels':
        y = clean_speech_IBM(s_tf,
                             quantile_fraction=quantile_fraction,
                             quantile_weight=quantile_weight)
    if labels == 'vad_labels':
        y = clean_speech_VAD(s_tf,
                             quantile_fraction=quantile_fraction,
                             quantile_weight=quantile_weight)

    # Convert y to Tensor for f1-score
    y_hat_hard = y_hat_hard.int()
    y = torch.LongTensor(y)

    accuracy, precision, recall, f1score_s_hat = f1_loss(y.flatten(), y_hat_hard.flatten(), epsilon=1e-12)

    # plots of target / estimation
    # TF representation
    x_tf = stft(x_t,
                fs=fs,
                wlen_sec=wlen_sec,
                win=win,
                hop_percent=hop_percent,
                dtype=dtype) # shape = (freq_bins, frames)

    s_hat_tf = stft(s_hat_t,
                    fs=fs,
                    wlen_sec=wlen_sec,
                    win=win,
                    hop_percent=hop_percent,
                    dtype=dtype) # shape = (freq_bins, frames)

    ## mixture signal (wav + spectro)
    ## target signal (wav + spectro + mask)
    ## estimated signal (wav + spectro + mask)
    signal_list = [
        [x_t, x_tf, None], # mixture: (waveform, tf_signal, no mask)
        [s_t, s_tf, y.numpy()], # clean speech
        [s_hat_t, s_hat_tf, y_hat_hard.numpy()]
    ]
    fig = display_multiple_signals(signal_list,
                        fs=fs, vmin=vmin, vmax=vmax,
                        wlen_sec=wlen_sec, hop_percent=hop_percent,
                        xticks_sec=xticks_sec, fontsize=fontsize)
    
    # put all metrics in the title of the figure
    title = "Input SNR = {:.1f} dB \n" \
        "SI-SDR = {:.1f} dB,  " \
        "SI-SIR = {:.1f} dB,  " \
        "SI-SAR = {:.1f} dB\n" \
        "STOI = {:.2f},  " \
        "PESQ = {:.2f} \n" \
        "Accuracy = {:.3f},  "\
        "Precision = {:.3f},  "\
        "Recall = {:.3f},  "\
        "F1-score = {:.3f}\n".format(snr_db, si_sdr, si_sir, si_sar, stoi_s_hat, pesq_s_hat,\
            accuracy, precision, recall, f1score_s_hat)

    fig.suptitle(title, fontsize=40)

    # Save figure
    fig.savefig(model_data_dir + os.path.splitext(file_path)[0] + '_fig.png')

    # Clear figure
    plt.close()

    metrics = [si_sdr, si_sir, si_sar, stoi_s_hat, pesq_s_hat,\
        accuracy, precision, recall, f1score_s_hat]
    return metrics
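
energy_ratios returns the scale-invariant SDR/SIR/SAR of the estimate against the clean speech and the noise. A minimal sketch of the SI-SDR part alone, assuming the usual definition where the target is the projection of the estimate onto the clean reference (the repository's energy_ratios may be implemented differently):

import numpy as np

def si_sdr(s_hat, s):
    # Project the estimate onto the reference to obtain the scaled target
    alpha = np.dot(s_hat, s) / np.dot(s, s)
    s_target = alpha * s
    e = s_hat - s_target
    return 10 * np.log10(np.sum(s_target ** 2) / np.sum(e ** 2))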
Example #6
def test_write_read_labels(dataset_type):
    # Parameters
    ## Dataset
    input_speech_dir = 'data/subset/raw/'
    output_speech_dir = 'data/subset/processed/'

    output_data_dir = 'data/subset/pickle/'
    fs = int(16e3)  # Sampling rate

    ## STFT
    wlen_sec = 64e-3  # window length in seconds
    hop_percent = 0.25  # hop size as a percentage of the window length
    win = 'hann'  # type of window
    dtype = 'complex64'

    ## Ideal binary mask
    quantile_fraction = 0.98
    quantile_weight = 0.999

    # Create file list
    file_paths = speech_list(input_speech_dir=input_speech_dir,
                             dataset_type=dataset_type)

    labels = []

    for path in file_paths:

        x, fs_x = sf.read(input_speech_dir + path, samplerate=None)

        # Cut burst at beginning of file
        x[:int(0.1 * fs)] = x[int(0.1 * fs):int(0.2 * fs)]

        # Normalize audio
        x = x / (np.max(np.abs(x)))
        #x = x/(np.max(np.abs(x)) + 2)
        #x = x/np.linalg.norm(x)

        if fs != fs_x:
            raise ValueError('Unexpected sampling rate')

        # TF representation
        x_tf = stft(x,
                    fs=fs,
                    wlen_sec=wlen_sec,
                    win=win,
                    hop_percent=hop_percent,
                    dtype=dtype)

        # binary mask
        x_ibm = clean_speech_IBM(x_tf,
                                 quantile_fraction=quantile_fraction,
                                 quantile_weight=quantile_weight)

        labels.append(x_ibm)

    labels = np.concatenate(labels, axis=1)
    #labels = labels[1]

    # Write labels
    write_dataset(labels,
                  output_data_dir=output_data_dir,
                  dataset_type=dataset_type,
                  suffix='labels')

    # Read pickle
    pickle_labels = read_dataset(data_dir=output_data_dir,
                                 dataset_type=dataset_type,
                                 suffix='labels')

    # Assert stored data is the same as the labels
    assert_array_equal(labels, pickle_labels)
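
write_dataset and read_dataset are not shown here; a minimal sketch of what such a pickle round-trip could look like (hypothetical implementations, assuming the suffix is simply part of the file name; the repository's actual helpers may differ):

import os
import pickle

def write_dataset(data, output_data_dir, dataset_type, suffix):
    # Hypothetical: e.g. data/subset/pickle/train_labels.p
    os.makedirs(output_data_dir, exist_ok=True)
    with open(os.path.join(output_data_dir, dataset_type + '_' + suffix + '.p'), 'wb') as f:
        pickle.dump(data, f)

def read_dataset(data_dir, dataset_type, suffix):
    # Hypothetical counterpart: load the pickled array back
    with open(os.path.join(data_dir, dataset_type + '_' + suffix + '.p'), 'rb') as f:
        return pickle.load(f)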
Example #7
def main():
    # Load input SNR
    all_snr_db = read_dataset(processed_data_dir, dataset_type, 'snr_db')
    all_snr_db = np.array(all_snr_db)

    # Create file list
    file_paths = speech_list(input_speech_dir=input_speech_dir,
                             dataset_type=dataset_type)

    for i, file_path in tqdm(enumerate(file_paths)):

        # Read files
        s_t, fs_s = sf.read(processed_data_dir +
                            os.path.splitext(file_path)[0] +
                            '_s.wav')  # clean speech
        x_t, fs_x = sf.read(processed_data_dir +
                            os.path.splitext(file_path)[0] +
                            '_x.wav')  # mixture

        # x = x/np.max(x)
        T_orig = len(x_t)

        # TF representation
        # Input should be (frames, freq_bins)
        x_tf = stft(x_t,
                    fs=fs,
                    wlen_sec=wlen_sec,
                    win=win,
                    hop_percent=hop_percent,
                    dtype=dtype)

        # Power spectrogram
        x = np.power(np.abs(x_tf), 2)

        # Estimate mask
        y_hat_soft = timo_mask_estimation(x)
        y_hat_hard = (y_hat_soft > 0.5).astype(int)

        # plots of target / estimation
        s_tf = stft(s_t,
                    fs=fs,
                    wlen_sec=wlen_sec,
                    win=win,
                    hop_percent=hop_percent,
                    dtype=dtype)  # shape = (freq_bins, frames)

        # binary mask
        s_ibm = clean_speech_IBM(s_tf,
                                 quantile_fraction=quantile_fraction,
                                 quantile_weight=quantile_weight)

        # F1-score
        f1score_s_hat = f1_score(s_ibm.flatten(),
                                 y_hat_hard.flatten(),
                                 average="binary")

        ## mixture signal (wav + spectro)
        ## target signal (wav + spectro + mask)
        ## estimated signal (wav + spectro + mask)
        signal_list = [
            [x_t, x_tf, None],  # mixture: (waveform, tf_signal, no mask)
            [s_t, s_tf, s_ibm],  # clean speech
            #[None, None, y_hat_hard]
            [None, None, y_hat_soft]
        ]
        #TODO: modify
        fig = display_multiple_signals(signal_list,
                                       fs=fs,
                                       vmin=vmin,
                                       vmax=vmax,
                                       wlen_sec=wlen_sec,
                                       hop_percent=hop_percent,
                                       xticks_sec=xticks_sec,
                                       fontsize=fontsize)

        # put all metrics in the title of the figure
        title = "Input SNR = {:.1f} dB \n" \
            "".format(all_snr_db[i])

        fig.suptitle(title, fontsize=40)

        # Save figure
        output_path = model_data_dir + file_path
        output_path = os.path.splitext(output_path)[0]

        if not os.path.exists(os.path.dirname(output_path)):
            os.makedirs(os.path.dirname(output_path))

        fig.savefig(output_path + '_soft_mask.png')

        ## mixture signal (wav + spectro)
        ## target signal (wav + spectro + mask)
        ## estimated signal (wav + spectro + mask)
        signal_list = [
            [x_t, x_tf, None],  # mixture: (waveform, tf_signal, no mask)
            [s_t, s_tf, s_ibm],  # clean speech
            #[None, None, y_hat_hard]
            [None, None, y_hat_hard]
        ]
        #TODO: modify
        fig = display_multiple_signals(signal_list,
                                       fs=fs,
                                       vmin=vmin,
                                       vmax=vmax,
                                       wlen_sec=wlen_sec,
                                       hop_percent=hop_percent,
                                       xticks_sec=xticks_sec,
                                       fontsize=fontsize)

        # put all metrics in the title of the figure
        title = "Input SNR = {:.1f} dB \n" \
            "F1-score = {:.3f} \n".format(all_snr_db[i], f1score_s_hat)

        fig.suptitle(title, fontsize=40)

        # Save figure
        output_path = model_data_dir + file_path
        output_path = os.path.splitext(output_path)[0]

        if not os.path.exists(os.path.dirname(output_path)):
            os.makedirs(os.path.dirname(output_path))

        fig.savefig(output_path + '_hard_mask.png')
Example #8
def main():

    if not os.path.exists(os.path.dirname(output_dataset_file)):
        os.makedirs(os.path.dirname(output_dataset_file))

    with h5.File(output_dataset_file,
                 'a',
                 rdcc_nbytes=rdcc_nbytes,
                 rdcc_nslots=rdcc_nslots) as f:

        for dataset_type in dataset_types:

            # Create file list
            file_paths = speech_list(input_speech_dir=input_speech_dir,
                                     dataset_type=dataset_type)

            # Delete datasets if they already exist
            if 'X_' + dataset_type in f:
                del f['X_' + dataset_type]
                del f['Y_' + dataset_type]

            # The exact shape of the dataset is unfortunately unknown in advance
            # (writing is faster when the shape is known)
            # The chunk size corresponds to one spectrogram frame
            f.create_dataset('X_' + dataset_type,
                             shape=X_shape,
                             dtype='float32',
                             maxshape=X_maxshape,
                             chunks=X_chunks,
                             compression=compression,
                             shuffle=shuffle)
            f.create_dataset('Y_' + dataset_type,
                             shape=Y_shape,
                             dtype='float32',
                             maxshape=Y_maxshape,
                             chunks=Y_chunks,
                             compression=compression,
                             shuffle=shuffle)

            # STFT attributes
            f.attrs['fs'] = fs
            f.attrs['wlen_sec'] = wlen_sec
            f.attrs['hop_percent'] = hop_percent
            f.attrs['win'] = win
            f.attrs['dtype'] = dtype

            # label attributes
            f.attrs['quantile_fraction'] = quantile_fraction
            f.attrs['quantile_weight'] = quantile_weight

            # HDF5 attributes
            f.attrs['X_chunks'] = X_chunks
            f.attrs['Y_chunks'] = Y_chunks
            f.attrs['compression'] = compression

            # Store dataset in variables for faster I/O
            fx = f['X_' + dataset_type]
            fy = f['Y_' + dataset_type]

            for i, file_path in tqdm(enumerate(file_paths)):

                speech, fs_speech = sf.read(input_speech_dir + file_path,
                                            samplerate=None)

                # Cut burst at beginning of file
                speech = speech[int(0.1 * fs):]

                # Normalize audio
                speech = speech / (np.max(np.abs(speech)))

                if fs != fs_speech:
                    raise ValueError('Unexpected sampling rate')

                # TF representation
                speech_tf = stft(speech,
                                 fs=fs,
                                 wlen_sec=wlen_sec,
                                 win=win,
                                 hop_percent=hop_percent,
                                 dtype=dtype)

                spectrogram = np.power(abs(speech_tf), 2)

                if labels == 'vad_labels':
                    # vad
                    speech_vad = clean_speech_VAD(
                        speech_tf,
                        quantile_fraction=quantile_fraction,
                        quantile_weight=quantile_weight)

                    label = speech_vad

                if labels == 'labels':
                    # binary mask
                    speech_ibm = clean_speech_IBM(
                        speech_tf,
                        quantile_fraction=quantile_fraction,
                        quantile_weight=quantile_weight)

                    label = speech_ibm

                # Store spectrogram in dataset
                fx.resize((fx.shape[1] + spectrogram.shape[1]), axis=1)
                fx[:, -spectrogram.shape[1]:] = spectrogram

                # Store label in dataset
                fy.resize((fy.shape[1] + label.shape[1]), axis=1)
                fy[:, -label.shape[1]:] = label
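
The resize-then-write pattern above is how new frames are appended to a resizable HDF5 dataset. A small standalone sketch of the same pattern (toy shapes, assuming 513 frequency bins; not tied to this repository's files):

import h5py as h5
import numpy as np

freq_bins = 513
with h5.File('toy.h5', 'w') as f:
    dset = f.create_dataset('X_train',
                            shape=(freq_bins, 0),
                            maxshape=(freq_bins, None),  # unlimited along the frame axis
                            chunks=(freq_bins, 1),       # one chunk per spectrogram frame
                            dtype='float32')
    for _ in range(3):
        frames = np.random.rand(freq_bins, 10).astype('float32')
        dset.resize(dset.shape[1] + frames.shape[1], axis=1)  # grow the frame axis
        dset[:, -frames.shape[1]:] = frames                   # write the new block at the end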
Example #9
def test_write_read_frames(dataset_type):
    # Parameters
    ## Dataset
    input_speech_dir = 'data/subset/raw/'
    output_speech_dir = 'data/subset/processed/'
    output_data_dir = 'data/subset/h5/'
    data_dir = 'CSR-1-WSJ-0'
    suffix = 'lzf'
    output_h5_dir = output_data_dir + data_dir + '_' + suffix + '.h5'

    # Create file list
    file_paths = speech_list(input_speech_dir=input_speech_dir,
                             dataset_type=dataset_type)

    # Open hdf5 file
    # We are using 400 MB of chunk cache here ('rdcc_nbytes' and 'rdcc_nslots')
    with h5.File(output_h5_dir, 'r', rdcc_nbytes=1024**2*400, rdcc_nslots=10e6) as f:

        dx = f['X_' + dataset_type]
        dy = f['Y_' + dataset_type]

        ## STFT
        fs = f.attrs['fs'] # Sampling rate
        wlen_sec = f.attrs['wlen_sec'] # window length in seconds
        hop_percent = f.attrs['hop_percent'] # hop size as a percentage of the window length
        win = f.attrs['win'] # type of window
        dtype = f.attrs['dtype']

        ## Ideal binary mask
        quantile_fraction = f.attrs['quantile_fraction']
        quantile_weight = f.attrs['quantile_weight']
        frame_begin = 0
        frame_end = 0

        for path in file_paths:

            x, fs_x = sf.read(input_speech_dir + path, samplerate=None)

            # Cut burst at beginning of file
            #x[:int(0.1*fs)] = x[int(0.1*fs):int(0.2*fs)]
            x = x[int(0.1*fs):]

            # Normalize audio
            x = x/(np.max(np.abs(x)))
            
            if fs != fs_x:
                raise ValueError('Unexpected sampling rate')

            # TF representation
            x_tf = stft(x,
                        fs=fs,
                        wlen_sec=wlen_sec,
                        win=win,
                        hop_percent=hop_percent,
                        dtype=dtype)

            spectrogram = np.power(abs(x_tf), 2)

            # binary mask
            label = clean_speech_IBM(x_tf,
                                     quantile_fraction=quantile_fraction,
                                     quantile_weight=quantile_weight)


            # Read h5 spectrogram
            frame_end += spectrogram.shape[1]
            h5_spectrogram = dx[:,frame_begin:frame_end]
            h5_label = dy[:,frame_begin:frame_end]
    
            # Assert stored data is same as spectrograms
            assert_array_equal(spectrogram, h5_spectrogram)
            assert_array_equal(label, h5_label)

            # Next iteration
            frame_begin += spectrogram.shape[1]