def test_concat_spectrograms():
    """
    Check that two spectrograms can be concatenated along axis=1,
    i.e. spectrogram.shape = (frequency bins, frames),
    not (frames, frequency bins).
    """
    ## STFT parameters
    wlen_sec = 80e-3  # window length in seconds
    hop_percent = 0.25  # hop size as a percentage of the window length
    win = 'hann'  # type of window

    # Librosa examples
    #
    # AVAILABLE EXAMPLES
    # --------------------------------------------------------------------
    # brahms        Brahms - Hungarian Dance #5
    # choice        Admiral Bob - Choice (drum+bass)
    # fishin        Karissa Hobbs - Let's Go Fishin'
    # nutcracker    Tchaikovsky - Dance of the Sugar Plum Fairy
    # trumpet       Mihai Sorohan - Trumpet loop
    # vibeace       Kevin MacLeod - Vibe Ace

    # Take example signals from librosa
    audio_path = example('brahms')
    x1, fs_x1 = sf.read(audio_path)
    x1_len = len(x1)

    audio_path = example('choice')
    x2, fs_x2 = sf.read(audio_path)
    x2_len = len(x2)

    assert fs_x1 == fs_x2

    # STFT
    x1_tf = stft(x1, fs=fs_x1, wlen_sec=wlen_sec, win=win, hop_percent=hop_percent)
    x2_tf = stft(x2, fs=fs_x2, wlen_sec=wlen_sec, win=win, hop_percent=hop_percent)

    # Concatenate along axis=1 (time axis)
    np.concatenate([x1_tf, x2_tf], axis=1)

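# The shape convention matters for the concatenation tested above: with
# spectrograms stored as (freq_bins, frames), axis=1 is the time axis, so the
# concatenation stacks the two recordings back to back in time. Tiny
# illustration with dummy arrays (sizes are arbitrary):
def _concat_axis_demo():
    a = np.zeros((513, 100), dtype='complex64')  # (freq_bins, frames)
    b = np.zeros((513, 250), dtype='complex64')
    c = np.concatenate([a, b], axis=1)
    assert c.shape == (513, 350)  # same frequency bins, frames appended in time
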
def process_utt(mcem, model, file_path, device):
    # Input
    x_t, fs_x = sf.read(processed_data_dir + os.path.splitext(file_path)[0] + '_x.wav')  # mixture

    T_orig = len(x_t)

    # TF representation
    # Input should be (frames, freq_bins)
    x_tf = stft(x_t, fs=fs, wlen_sec=wlen_sec, win=win,
                hop_percent=hop_percent, dtype=dtype)

    # Transpose to match PyTorch
    x_tf = x_tf.T  # (frames, freq_bins)

    # Init MCEM
    mcem.init_parameters(X=x_tf, vae=model, nmf_rank=nmf_rank, eps=eps, device=device)

    #%% Run speech enhancement algorithm
    cost = mcem.run()

    # Estimated sources
    S_hat = mcem.S_hat  # + np.finfo(np.float32).eps
    N_hat = mcem.N_hat  # + np.finfo(np.float32).eps

    # iSTFT
    s_hat = istft(S_hat, fs=fs, wlen_sec=wlen_sec, win=win,
                  hop_percent=hop_percent, max_len=T_orig)
    n_hat = istft(N_hat, fs=fs, wlen_sec=wlen_sec, win=win,
                  hop_percent=hop_percent, max_len=T_orig)

    # Save .wav files
    output_path = output_data_dir + file_path
    output_path = os.path.splitext(output_path)[0]

    if not os.path.exists(os.path.dirname(output_path)):
        os.makedirs(os.path.dirname(output_path))

    sf.write(output_path + '_s_est.wav', s_hat, fs)
    sf.write(output_path + '_n_est.wav', n_hat, fs)

def test_clean_speech_IBM():
    """
    Check that the mask computed from a 'complex64' spectrogram is a 'float32' mask.
    """
    ## STFT parameters
    wlen_sec = 80e-3  # window length in seconds
    hop_percent = 0.25  # hop size as a percentage of the window length
    win = 'hann'  # type of window
    dtype = 'complex64'

    # Librosa examples (available: brahms, choice, fishin, nutcracker, trumpet, vibeace)

    # Take example signal from librosa
    audio_path = example('brahms')
    x, fs_x = sf.read(audio_path)
    x_len = len(x)

    ## Ideal binary mask
    quantile_fraction = 0.98
    quantile_weight = 0.999

    # STFT
    x_tf = stft(x, fs=fs_x, wlen_sec=wlen_sec, win=win,
                hop_percent=hop_percent, dtype=dtype)

    # Binary mask
    x_ibm = clean_speech_IBM(x_tf,
                             quantile_fraction=quantile_fraction,
                             quantile_weight=quantile_weight)

    assert x_ibm.dtype == 'float32'
    assert np.unique(x_ibm).tolist() == [0., 1.]

    #TODO: take masks from Heymann GitHub

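# clean_speech_IBM itself is not shown in this listing. The sketch below is a
# rough guess at a quantile-based ideal binary mask in the spirit of the
# Heymann et al. code the TODO above refers to; the actual thresholding in the
# project may differ, so treat this purely as an illustration of the idea.
def _clean_speech_IBM_sketch(speech_tf, quantile_fraction=0.98, quantile_weight=0.999):
    """Binary mask: 1 where speech power exceeds a quantile-based threshold."""
    power = np.abs(speech_tf) ** 2
    sorted_power = np.sort(power, axis=None)  # sort all TF bins globally
    threshold = quantile_weight * sorted_power[int(quantile_fraction * (sorted_power.size - 1))]
    return (power > threshold).astype('float32')  # same dtype contract as the test above
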
def test_stft_istft():
    """
    Check that stft followed by istft gives back the input signal.
    """
    ## STFT parameters
    wlen_sec = 80e-3  # window length in seconds
    hop_percent = 0.25  # hop size as a percentage of the window length
    win = 'hann'  # type of window

    # Librosa examples (available: brahms, choice, fishin, nutcracker, trumpet, vibeace)

    # Take example signal from librosa
    audio_path = example('brahms')
    x, fs_x = sf.read(audio_path)
    x_len = len(x)

    # STFT
    x_tf = stft(x, fs=fs_x, wlen_sec=wlen_sec, win=win, hop_percent=hop_percent)

    # iSTFT
    x_hat = istft(x_tf, fs=fs_x, wlen_sec=wlen_sec, win=win,
                  hop_percent=hop_percent, max_len=x_len)

    # Assert
    assert_array_almost_equal(x, x_hat)

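# The stft / istft wrappers exercised above are project code and are not shown
# here. The same round-trip property can be illustrated with scipy.signal,
# using a Hann window and a 25% hop (75% overlap) as in the parameters above:
def _scipy_roundtrip_demo():
    from scipy import signal

    fs = 16000
    wlen = int(80e-3 * fs)       # 80 ms window
    hop = int(0.25 * wlen)       # 25% hop
    x = np.random.randn(fs)      # 1 s of noise as a stand-in signal

    _, _, X = signal.stft(x, fs=fs, window='hann', nperseg=wlen, noverlap=wlen - hop)
    _, x_hat = signal.istft(X, fs=fs, window='hann', nperseg=wlen, noverlap=wlen - hop)

    assert_array_almost_equal(x, x_hat[:len(x)])  # reconstruction up to numerical error
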
def process_utt(mcem, model, classifier, mean, std, file_path, device):
    # Input
    x_t, fs_x = sf.read(processed_data_dir + os.path.splitext(file_path)[0] + '_x.wav')  # mixture

    T_orig = len(x_t)

    # TF representation
    x_tf = stft(x_t, fs=fs, wlen_sec=wlen_sec, win=win,
                hop_percent=hop_percent, dtype=dtype)  # (freq_bins, frames)

    # Transpose to match PyTorch
    x_tf = x_tf.T  # (frames, freq_bins)

    x = torch.tensor(np.power(np.abs(x_tf), 2), device=device)

    # Target
    s_t, fs_s = sf.read(processed_data_dir + os.path.splitext(file_path)[0] + '_s.wav')  # clean speech

    s_tf = stft(s_t, fs=fs, wlen_sec=wlen_sec, win=win,
                hop_percent=hop_percent, dtype=dtype)  # (freq_bins, frames)

    if classif_type == 'dnn':
        # Normalize power spectrogram
        if std_norm:
            x_norm = x - mean.T
            x_norm /= (std + eps).T
            y_hat_soft = classifier(x_norm)
        else:
            y_hat_soft = classifier(x)
        y_hat_hard = (y_hat_soft > 0.5).float()

    if classif_type == 'oracle':
        y_hat_soft = clean_speech_IBM(s_tf,
                                      quantile_fraction=quantile_fraction,
                                      quantile_weight=quantile_weight)
        y_hat_hard = torch.from_numpy(y_hat_soft.T).to(device)

    if classif_type == 'timo':
        x_numpy = np.power(np.abs(x_tf), 2)
        y_hat_soft = timo_mask_estimation(x_numpy.T)
        y_hat_hard = (y_hat_soft > 0.5).astype(int)
        y_hat_hard = y_hat_hard.T  # (frames, freq_bins)
        y_hat_hard = torch.tensor(y_hat_hard).to(device)

    # Init MCEM
    mcem.init_parameters(X=x_tf, y=y_hat_hard, vae=model,
                         nmf_rank=nmf_rank, eps=eps, device=device)

    # Run speech enhancement algorithm
    cost = mcem.run()

    # Estimated sources
    S_hat = mcem.S_hat  # + np.finfo(np.float32).eps
    N_hat = mcem.N_hat  # + np.finfo(np.float32).eps

    # iSTFT
    s_hat = istft(S_hat, fs=fs, wlen_sec=wlen_sec, win=win,
                  hop_percent=hop_percent, max_len=T_orig)
    n_hat = istft(N_hat, fs=fs, wlen_sec=wlen_sec, win=win,
                  hop_percent=hop_percent, max_len=T_orig)

    # Save .wav files
    output_path = output_data_dir + file_path
    output_path = os.path.splitext(output_path)[0]

    if not os.path.exists(os.path.dirname(output_path)):
        os.makedirs(os.path.dirname(output_path))

    sf.write(output_path + '_s_est.wav', s_hat, fs)
    sf.write(output_path + '_n_est.wav', n_hat, fs)

    # Save binary mask
    torch.save(y_hat_soft, output_path + '_ibm_soft_est.pt')
    torch.save(y_hat_hard, output_path + '_ibm_hard_est.pt')

def main():
    # Load input SNR
    all_snr_db = read_dataset(processed_data_dir, dataset_type, 'snr_db')
    all_snr_db = np.array(all_snr_db)

    # Create file list
    file_paths = speech_list(input_speech_dir=input_speech_dir,
                             dataset_type=dataset_type)

    # 1 list per metric
    all_stoi = []
    all_pesq = []
    all_polqa = []
    all_f1score = []

    for i, file_path in tqdm(enumerate(file_paths)):
        # Read files
        s_t, fs_s = sf.read(processed_data_dir + os.path.splitext(file_path)[0] + '_s.wav')  # clean speech
        n_t, fs_n = sf.read(processed_data_dir + os.path.splitext(file_path)[0] + '_n.wav')  # noise
        x_t, fs_x = sf.read(processed_data_dir + os.path.splitext(file_path)[0] + '_x.wav')  # mixture

        # Compute metrics

        ## STOI (or ESTOI?)
        stoi_s_hat = stoi(s_t, x_t, fs, extended=True)
        all_stoi.append(stoi_s_hat)

        ## PESQ
        pesq_s_hat = pesq(fs, s_t, x_t, 'wb')  # wb = wideband
        all_pesq.append(pesq_s_hat)

        ## POLQA
        # polqa_s_hat = polqa(s, s_t, fs)
        # all_polqa.append(polqa_s_hat)

        # TF representation
        n_tf = stft(n_t, fs=fs, wlen_sec=wlen_sec, win=win,
                    hop_percent=hop_percent, dtype=dtype)  # shape = (freq_bins, frames)
        s_tf = stft(s_t, fs=fs, wlen_sec=wlen_sec, win=win,
                    hop_percent=hop_percent, dtype=dtype)  # shape = (freq_bins, frames)

        # Plots of target / estimation
        # TF representation
        x_tf = stft(x_t, fs=fs, wlen_sec=wlen_sec, win=win,
                    hop_percent=hop_percent, dtype=dtype)  # shape = (freq_bins, frames)

        # ## mixture signal (wav + spectro)
        # ## target signal (wav + spectro + mask)
        # ## estimated signal (wav + spectro + mask)
        # signal_list = [
        #     [x_t, x_tf, None],  # mixture: (waveform, tf_signal, no mask)
        #     [s_t, s_tf, None],  # clean speech
        #     [n_t, n_tf, None]
        # ]

        # fig = display_multiple_signals(signal_list,
        #                                fs=fs, vmin=vmin, vmax=vmax,
        #                                wlen_sec=wlen_sec, hop_percent=hop_percent,
        #                                xticks_sec=xticks_sec, fontsize=fontsize)

        # # Put all metrics in the title of the figure
        # title = "Input SNR = {:.1f} dB \n" \
        #         "STOI = {:.2f}, " \
        #         "PESQ = {:.2f} \n" \
        #         "".format(all_snr_db[i], stoi_s_hat, pesq_s_hat)
        # fig.suptitle(title, fontsize=40)

        # # Save figure
        # fig.savefig(processed_data_dir + os.path.splitext(file_path)[0] + '_fig.png')

        # # Clear figure
        # plt.close()

    # Confidence interval
    metrics = {'SNR': all_snr_db, 'STOI': all_stoi, 'PESQ': all_pesq}

    stats = {}

    # Print the names of the columns
    print("{:<10} {:<10} {:<10}".format('METRIC', 'AVERAGE', 'CONF. INT.'))
    for key, metric in metrics.items():
        m, h = mean_confidence_interval(metric, confidence=confidence)
        stats[key] = {'avg': m, '+/-': h}
        print("{:<10} {:<10} {:<10}".format(key, m, h))
    print('\n')

    # Save stats (si_sdr, si_sar, etc.)
    with open(processed_data_dir + os.path.dirname(os.path.dirname(file_path)) + 'stats.json', 'w') as f:
        json.dump(stats, f)

    # Metrics by input SNR
    for snr_db in np.unique(all_snr_db):
        stats = {}

        print('Input SNR = {:.2f}'.format(snr_db))
        # Print the names of the columns
        print("{:<10} {:<10} {:<10}".format('METRIC', 'AVERAGE', 'CONF. INT.'))
        for key, metric in metrics.items():
            subset_metric = np.array(metric)[np.where(all_snr_db == snr_db)]
            m, h = mean_confidence_interval(subset_metric, confidence=confidence)
            stats[key] = {'avg': m, '+/-': h}
            print("{:<10} {:<10} {:<10}".format(key, m, h))
        print('\n')

        # Save stats (si_sdr, si_sar, etc.)
        with open(processed_data_dir + os.path.dirname(os.path.dirname(file_path)) + 'stats_{:g}.json'.format(snr_db), 'w') as f:
            json.dump(stats, f)

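# mean_confidence_interval is not shown in this listing. A minimal sketch of
# what such a helper typically computes (sample mean plus a Student-t
# half-width) is given below; the project's actual implementation may differ.
def _mean_confidence_interval_sketch(data, confidence=0.95):
    """Return (mean, half-width) of a Student-t confidence interval."""
    import scipy.stats

    a = np.asarray(data, dtype=float)
    n = len(a)
    m = a.mean()
    se = scipy.stats.sem(a)  # standard error of the mean
    h = se * scipy.stats.t.ppf((1 + confidence) / 2., n - 1)
    return m, h

# Example: _mean_confidence_interval_sketch([2.1, 2.4, 1.9, 2.2]) -> (2.15, ~0.33)
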
def main():
    # Load input SNR
    all_snr_db = read_dataset(processed_data_dir, dataset_type, 'snr_db')
    all_snr_db = np.array(all_snr_db)

    device = torch.device("cuda" if cuda else "cpu")
    file = open('output.log', 'w')

    print('Torch version: {}'.format(torch.__version__))
    print('Device: %s' % (device))
    if torch.cuda.device_count() >= 1:
        print("Number GPUs: ", torch.cuda.device_count())

    # Create file list
    file_paths = speech_list(input_speech_dir=input_speech_dir,
                             dataset_type=dataset_type)

    model = VariationalAutoencoder([x_dim, z_dim, h_dim])
    model.load_state_dict(torch.load(model_data_path))
    if cuda:
        model = model.cuda()

    model.eval()
    for param in model.parameters():
        param.requires_grad = False

    for i, file_path in tqdm(enumerate(file_paths)):
        # Read files
        s_t, fs_s = sf.read(processed_data_dir + os.path.splitext(file_path)[0] + '_s.wav')  # clean speech
        x_t, fs_x = sf.read(processed_data_dir + os.path.splitext(file_path)[0] + '_x.wav')  # mixture

        # x = x/np.max(x)
        T_orig = len(x_t)

        # TF representation
        # Input should be (frames, freq_bins)
        x_tf = stft(x_t, fs=fs, wlen_sec=wlen_sec, win=win,
                    hop_percent=hop_percent, dtype=dtype)

        # Transpose to match PyTorch
        x_tf = x_tf.T  # (frames, freq_bins)

        # Power spectrogram (transpose)
        x = torch.tensor(np.power(np.abs(x_tf), 2)).to(device)

        # Encode-decode
        reconstruction, _, _ = model(x)
        reconstruction = reconstruction.cpu().numpy()

        # Plots of target / estimation
        s_tf = stft(s_t, fs=fs, wlen_sec=wlen_sec, win=win,
                    hop_percent=hop_percent, dtype=dtype)  # shape = (freq_bins, frames)

        # Transpose to match librosa.display
        reconstruction = reconstruction.T

        # Transform to dB
        x_psd = x.cpu().numpy().T
        x_psd = librosa.core.power_to_db(x_psd)

        s_psd = np.power(abs(s_tf), 2)
        s_psd = librosa.core.power_to_db(s_psd)

        reconstruction = librosa.core.power_to_db(reconstruction)

        ## mixture signal (wav + spectro)
        ## target signal (wav + spectro + mask)
        ## estimated signal (wav + spectro + mask)
        signal_list = [
            [x_t, x_psd],  # mixture: (waveform, tf_signal, no mask)
            [s_t, s_psd],  # clean speech
            [None, reconstruction]
        ]
        #TODO: modify
        fig = display_multiple_spectro(signal_list,
                                       fs=fs, vmin=vmin, vmax=vmax,
                                       wlen_sec=wlen_sec, hop_percent=hop_percent,
                                       xticks_sec=xticks_sec, fontsize=fontsize)

        # Put all metrics in the title of the figure
        title = "Input SNR = {:.1f} dB \n" \
                "".format(all_snr_db[i])
        fig.suptitle(title, fontsize=40)

        # Save figure
        fig.savefig(output_data_dir + os.path.splitext(file_path)[0] + '_recon.png')

def main():
    # Load input SNR
    all_snr_db = read_dataset(processed_data_dir, dataset_type, 'snr_db')
    all_snr_db = np.array(all_snr_db)

    device = torch.device("cuda" if cuda else "cpu")
    file = open('output.log', 'w')

    print('Torch version: {}'.format(torch.__version__))
    print('Device: %s' % (device))
    if torch.cuda.device_count() >= 1:
        print("Number GPUs: ", torch.cuda.device_count())

    # Create file list
    file_paths = speech_list(input_speech_dir=input_speech_dir,
                             dataset_type=dataset_type)

    classifier = Classifier([x_dim, h_dim_cl, y_dim], batch_norm=batch_norm)
    classifier.load_state_dict(torch.load(classif_dir, map_location=cuda_device))
    if cuda:
        classifier = classifier.cuda()

    classifier.eval()
    for param in classifier.parameters():
        param.requires_grad = False

    for i, file_path in tqdm(enumerate(file_paths)):
        # Read files
        s_t, fs_s = sf.read(processed_data_dir + os.path.splitext(file_path)[0] + '_s.wav')  # clean speech
        x_t, fs_x = sf.read(processed_data_dir + os.path.splitext(file_path)[0] + '_x.wav')  # mixture

        # x = x/np.max(x)
        T_orig = len(x_t)

        # TF representation
        # Input should be (frames, freq_bins)
        x_tf = stft(x_t, fs=fs, wlen_sec=wlen_sec, win=win,
                    hop_percent=hop_percent, dtype=dtype)

        # Transpose to match PyTorch
        x_tf = x_tf.T  # (frames, freq_bins)

        # Power spectrogram (transpose)
        x = torch.tensor(np.power(np.abs(x_tf), 2)).to(device)

        # Normalize power spectrogram
        if std_norm:
            x -= mean.T
            x /= (std + eps).T

        # Classify
        y_hat_soft = classifier(x)
        y_hat_hard = (y_hat_soft > 0.5).int()
        y_hat_hard = y_hat_hard.cpu().numpy()
        y_hat_hard = y_hat_hard.T  # Transpose to match librosa.display

        # Plots of target / estimation
        s_tf = stft(s_t, fs=fs, wlen_sec=wlen_sec, win=win,
                    hop_percent=hop_percent, dtype=dtype)  # shape = (freq_bins, frames)

        if labels == 'labels':
            # Binary mask
            target = clean_speech_IBM(s_tf,
                                      quantile_fraction=quantile_fraction,
                                      quantile_weight=quantile_weight)
        if labels == 'vad_labels':
            # VAD
            target = clean_speech_VAD(s_tf,
                                      quantile_fraction=quantile_fraction,
                                      quantile_weight=quantile_weight)

        # Transpose to match librosa.display
        x_tf = x_tf.T

        # F1-score
        f1score_s_hat = f1_score(target.flatten(), y_hat_hard.flatten(), average="binary")

        ## mixture signal (wav + spectro)
        ## target signal (wav + spectro + mask)
        ## estimated signal (wav + spectro + mask)
        signal_list = [
            [x_t, x_tf, None],  # mixture: (waveform, tf_signal, no mask)
            [s_t, s_tf, target],  # clean speech
            [None, None, y_hat_hard]
            # [None, None, y_hat_soft]
        ]
        #TODO: modify
        fig = display_multiple_signals(signal_list,
                                       fs=fs, vmin=vmin, vmax=vmax,
                                       wlen_sec=wlen_sec, hop_percent=hop_percent,
                                       xticks_sec=xticks_sec, fontsize=fontsize)

        # Put all metrics in the title of the figure
        title = "Input SNR = {:.1f} dB \n" \
                "F1-score = {:.3f} \n".format(all_snr_db[i], f1score_s_hat)
        fig.suptitle(title, fontsize=40)

        # Save figure
        output_path = classif_data_dir + file_path
        output_path = os.path.splitext(output_path)[0]

        if not os.path.exists(os.path.dirname(output_path)):
            os.makedirs(os.path.dirname(output_path))

        fig.savefig(output_path + '_hard_mask.png')

def compute_metrics_utt(args):
    # Separate args
    file_path, snr_db = args[0], args[1]

    # Read files
    s_t, fs_s = sf.read(processed_data_dir + os.path.splitext(file_path)[0] + '_s.wav')  # clean speech
    n_t, fs_n = sf.read(processed_data_dir + os.path.splitext(file_path)[0] + '_n.wav')  # noise
    x_t, fs_x = sf.read(processed_data_dir + os.path.splitext(file_path)[0] + '_x.wav')  # mixture
    s_hat_t, fs_s_hat = sf.read(model_data_dir + os.path.splitext(file_path)[0] + '_s_est.wav')  # est. speech

    # Compute metrics

    ## SI-SDR, SI-SIR, SI-SAR
    si_sdr, si_sir, si_sar = energy_ratios(s_hat=s_hat_t, s=s_t, n=n_t)

    ## STOI (or ESTOI?)
    stoi_s_hat = stoi(s_t, s_hat_t, fs, extended=True)

    ## PESQ
    pesq_s_hat = pesq(fs, s_t, s_hat_t, 'wb')  # wb = wideband

    ## POLQA
    # polqa_s_hat = polqa(s, s_t, fs)
    # all_polqa.append(polqa_s_hat)

    ## F1-score
    # Ideal binary mask
    y_hat_hard = torch.load(model_data_dir + os.path.splitext(file_path)[0] + '_ibm_hard_est.pt',
                            map_location=lambda storage, location: storage)  # shape = (frames, freq_bins)
    # y_hat_hard = torch.load(model_data_dir + os.path.splitext(file_path)[0] + '_ibm_soft_est.pt',
    #                         map_location=lambda storage, location: storage)  # shape = (frames, freq_bins)

    y_hat_hard = y_hat_hard.T  # Transpose to match target y, shape = (freq_bins, frames)

    # TF representation
    s_tf = stft(s_t, fs=fs, wlen_sec=wlen_sec, win=win,
                hop_percent=hop_percent, dtype=dtype)  # shape = (freq_bins, frames)

    if labels == 'labels':
        y = clean_speech_IBM(s_tf,
                             quantile_fraction=quantile_fraction,
                             quantile_weight=quantile_weight)
    if labels == 'vad_labels':
        y = clean_speech_VAD(s_tf,
                             quantile_fraction=quantile_fraction,
                             quantile_weight=quantile_weight)

    # Convert y to Tensor for F1-score
    y_hat_hard = y_hat_hard.int()
    y = torch.LongTensor(y)

    accuracy, precision, recall, f1score_s_hat = f1_loss(y.flatten(), y_hat_hard.flatten(), epsilon=1e-12)

    # Plots of target / estimation
    # TF representation
    x_tf = stft(x_t, fs=fs, wlen_sec=wlen_sec, win=win,
                hop_percent=hop_percent, dtype=dtype)  # shape = (freq_bins, frames)
    s_hat_tf = stft(s_hat_t, fs=fs, wlen_sec=wlen_sec, win=win,
                    hop_percent=hop_percent, dtype=dtype)  # shape = (freq_bins, frames)

    ## mixture signal (wav + spectro)
    ## target signal (wav + spectro + mask)
    ## estimated signal (wav + spectro + mask)
    signal_list = [
        [x_t, x_tf, None],  # mixture: (waveform, tf_signal, no mask)
        [s_t, s_tf, y.numpy()],  # clean speech
        [s_hat_t, s_hat_tf, y_hat_hard.numpy()]
    ]

    fig = display_multiple_signals(signal_list,
                                   fs=fs, vmin=vmin, vmax=vmax,
                                   wlen_sec=wlen_sec, hop_percent=hop_percent,
                                   xticks_sec=xticks_sec, fontsize=fontsize)

    # Put all metrics in the title of the figure
    title = "Input SNR = {:.1f} dB \n" \
            "SI-SDR = {:.1f} dB, " \
            "SI-SIR = {:.1f} dB, " \
            "SI-SAR = {:.1f} dB\n" \
            "STOI = {:.2f}, " \
            "PESQ = {:.2f} \n" \
            "Accuracy = {:.3f}, " \
            "Precision = {:.3f}, " \
            "Recall = {:.3f}, " \
            "F1-score = {:.3f}\n".format(snr_db, si_sdr, si_sir, si_sar,
                                         stoi_s_hat, pesq_s_hat,
                                         accuracy, precision, recall, f1score_s_hat)
    fig.suptitle(title, fontsize=40)

    # Save figure
    fig.savefig(model_data_dir + os.path.splitext(file_path)[0] + '_fig.png')

    # Clear figure
    plt.close()

    metrics = [si_sdr, si_sir, si_sar, stoi_s_hat, pesq_s_hat,
               accuracy, precision, recall, f1score_s_hat]

    return metrics

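# energy_ratios is not defined in this listing. The sketch below shows the
# usual scale-invariant decomposition such a helper could implement
# (projection of the estimate onto the clean-speech and noise signals); it is
# an illustration under that assumption, not the project's actual code.
def _energy_ratios_sketch(s_hat, s, n):
    """Return (SI-SDR, SI-SIR, SI-SAR) in dB for a single-channel estimate."""
    s_target = (np.dot(s_hat, s) / np.dot(s, s)) * s   # component aligned with speech
    e_noise = (np.dot(s_hat, n) / np.dot(n, n)) * n    # component aligned with the noise
    e_artif = s_hat - s_target - e_noise               # remaining artifacts

    si_sdr = 10 * np.log10(np.sum(s_target**2) / np.sum((e_noise + e_artif)**2))
    si_sir = 10 * np.log10(np.sum(s_target**2) / np.sum(e_noise**2))
    si_sar = 10 * np.log10(np.sum(s_target**2) / np.sum(e_artif**2))
    return si_sdr, si_sir, si_sar
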
def test_write_read_labels(dataset_type):
    # Parameters
    ## Dataset
    input_speech_dir = 'data/subset/raw/'
    output_speech_dir = 'data/subset/processed/'
    output_data_dir = 'data/subset/pickle/'
    fs = int(16e3)  # Sampling rate

    ## STFT
    wlen_sec = 64e-3  # window length in seconds
    hop_percent = 0.25  # hop size as a percentage of the window length
    win = 'hann'  # type of window
    dtype = 'complex64'

    ## Ideal binary mask
    quantile_fraction = 0.98
    quantile_weight = 0.999

    # Create file list
    file_paths = speech_list(input_speech_dir=input_speech_dir,
                             dataset_type=dataset_type)

    labels = []

    for path in file_paths:
        x, fs_x = sf.read(input_speech_dir + path, samplerate=None)

        # Cut burst at beginning of file
        x[:int(0.1 * fs)] = x[int(0.1 * fs):int(0.2 * fs)]

        # Normalize audio
        x = x / (np.max(np.abs(x)))
        # x = x/(np.max(np.abs(x)) + 2)
        # x = x/np.linalg.norm(x)

        if fs != fs_x:
            raise ValueError('Unexpected sampling rate')

        # TF representation
        x_tf = stft(x, fs=fs, wlen_sec=wlen_sec, win=win,
                    hop_percent=hop_percent, dtype=dtype)

        # Binary mask
        x_ibm = clean_speech_IBM(x_tf,
                                 quantile_fraction=quantile_fraction,
                                 quantile_weight=quantile_weight)
        labels.append(x_ibm)

    labels = np.concatenate(labels, axis=1)
    # labels = labels[1]

    # Write labels
    write_dataset(labels,
                  output_data_dir=output_data_dir,
                  dataset_type=dataset_type,
                  suffix='labels')

    # Read pickle
    pickle_labels = read_dataset(data_dir=output_data_dir,
                                 dataset_type=dataset_type,
                                 suffix='labels')

    # Assert stored data is the same as the labels
    assert_array_equal(labels, pickle_labels)

def main():
    # Load input SNR
    all_snr_db = read_dataset(processed_data_dir, dataset_type, 'snr_db')
    all_snr_db = np.array(all_snr_db)

    # Create file list
    file_paths = speech_list(input_speech_dir=input_speech_dir,
                             dataset_type=dataset_type)

    for i, file_path in tqdm(enumerate(file_paths)):
        # Read files
        s_t, fs_s = sf.read(processed_data_dir + os.path.splitext(file_path)[0] + '_s.wav')  # clean speech
        x_t, fs_x = sf.read(processed_data_dir + os.path.splitext(file_path)[0] + '_x.wav')  # mixture

        # x = x/np.max(x)
        T_orig = len(x_t)

        # TF representation
        # Input should be (frames, freq_bins)
        x_tf = stft(x_t, fs=fs, wlen_sec=wlen_sec, win=win,
                    hop_percent=hop_percent, dtype=dtype)

        # Power spectrogram (transpose)
        x = np.power(np.abs(x_tf), 2)

        # Estimate mask
        y_hat_soft = timo_mask_estimation(x)
        y_hat_hard = (y_hat_soft > 0.5).astype(int)

        # Plots of target / estimation
        s_tf = stft(s_t, fs=fs, wlen_sec=wlen_sec, win=win,
                    hop_percent=hop_percent, dtype=dtype)  # shape = (freq_bins, frames)

        # Binary mask
        s_ibm = clean_speech_IBM(s_tf,
                                 quantile_fraction=quantile_fraction,
                                 quantile_weight=quantile_weight)

        # F1-score
        f1score_s_hat = f1_score(s_ibm.flatten(), y_hat_hard.flatten(), average="binary")

        ## mixture signal (wav + spectro)
        ## target signal (wav + spectro + mask)
        ## estimated signal (wav + spectro + mask)
        signal_list = [
            [x_t, x_tf, None],  # mixture: (waveform, tf_signal, no mask)
            [s_t, s_tf, s_ibm],  # clean speech
            # [None, None, y_hat_hard]
            [None, None, y_hat_soft]
        ]
        #TODO: modify
        fig = display_multiple_signals(signal_list,
                                       fs=fs, vmin=vmin, vmax=vmax,
                                       wlen_sec=wlen_sec, hop_percent=hop_percent,
                                       xticks_sec=xticks_sec, fontsize=fontsize)

        # Put all metrics in the title of the figure
        title = "Input SNR = {:.1f} dB \n" \
                "".format(all_snr_db[i])
        fig.suptitle(title, fontsize=40)

        # Save figure
        output_path = model_data_dir + file_path
        output_path = os.path.splitext(output_path)[0]

        if not os.path.exists(os.path.dirname(output_path)):
            os.makedirs(os.path.dirname(output_path))

        fig.savefig(output_path + '_soft_mask.png')

        ## mixture signal (wav + spectro)
        ## target signal (wav + spectro + mask)
        ## estimated signal (wav + spectro + mask)
        signal_list = [
            [x_t, x_tf, None],  # mixture: (waveform, tf_signal, no mask)
            [s_t, s_tf, s_ibm],  # clean speech
            [None, None, y_hat_hard]
        ]
        #TODO: modify
        fig = display_multiple_signals(signal_list,
                                       fs=fs, vmin=vmin, vmax=vmax,
                                       wlen_sec=wlen_sec, hop_percent=hop_percent,
                                       xticks_sec=xticks_sec, fontsize=fontsize)

        # Put all metrics in the title of the figure
        title = "Input SNR = {:.1f} dB \n" \
                "F1-score = {:.3f} \n".format(all_snr_db[i], f1score_s_hat)
        fig.suptitle(title, fontsize=40)

        # Save figure
        output_path = model_data_dir + file_path
        output_path = os.path.splitext(output_path)[0]

        if not os.path.exists(os.path.dirname(output_path)):
            os.makedirs(os.path.dirname(output_path))

        fig.savefig(output_path + '_hard_mask.png')

def main():
    # Create file list
    file_paths = speech_list(input_speech_dir=input_speech_dir,
                             dataset_type=dataset_type)

    for path in file_paths:
        x, fs_x = sf.read(input_speech_dir + path, samplerate=None)

        # Cut burst at beginning of file
        x = x[int(0.1 * fs):]

        # Normalize audio
        x = x / np.max(np.abs(x))

        if fs != fs_x:
            raise ValueError('Unexpected sampling rate')

        # TF representation
        x_tf = stft(x, fs=fs, wlen_sec=wlen_sec, win=win,
                    hop_percent=hop_percent)  # shape = (freq_bins, frames)

        # # Binary mask
        # x_ibm = clean_speech_IBM(x_tf,
        #                          quantile_fraction=quantile_fraction,
        #                          quantile_weight=quantile_weight)

        # Compute only VAD
        x_vad = clean_speech_VAD(x_tf,
                                 quantile_fraction=quantile_fraction,
                                 quantile_weight=quantile_weight)
        x_vad = x_vad[0]  # shape = (frames)

        # # Plot waveplot + spectrogram + binary mask
        # fig = display_wav_spectro_mask(x, x_tf, x_ibm,
        #                                fs=fs, vmin=vmin, vmax=vmax,
        #                                wlen_sec=wlen_sec, hop_percent=hop_percent,
        #                                xticks_sec=xticks_sec, fontsize=fontsize)

        # Plot waveplot + spectrogram + VAD
        fig = display_wav_spectro_mask(x, x_tf, x_vad,
                                       fs=fs, vmin=vmin, vmax=vmax,
                                       wlen_sec=wlen_sec, hop_percent=hop_percent,
                                       xticks_sec=xticks_sec, fontsize=fontsize)

        # signal_list = [
        #     [x, x_tf, x_ibm],  # mixture: (waveform, tf_signal, no mask)
        #     [x, x_tf, x_ibm],  # clean speech
        #     [x, x_tf, x_ibm]
        # ]
        # fig = display_multiple_signals(signal_list,
        #                                fs=fs, vmin=vmin, vmax=vmax,
        #                                wlen_sec=wlen_sec, hop_percent=hop_percent,
        #                                xticks_sec=xticks_sec, fontsize=fontsize)

        title = "quantile_fraction = {:.4f} \n" \
                "quantile_weight = {:.4f} \n".format(quantile_fraction, quantile_weight)
        fig.suptitle(title, fontsize=40)

        # Save figure
        # output_path = output_data_dir + os.path.splitext(path)[0] + '_fig.png'
        output_path = output_data_dir + os.path.splitext(path)[0] + '_fig_vad.png'

        if not os.path.exists(os.path.dirname(output_path)):
            os.makedirs(os.path.dirname(output_path))

        fig.savefig(output_path)

    print("data is stored in " + output_data_dir)

def main():
    if not os.path.exists(os.path.dirname(output_dataset_file)):
        os.makedirs(os.path.dirname(output_dataset_file))

    with h5.File(output_dataset_file, 'a', rdcc_nbytes=rdcc_nbytes, rdcc_nslots=rdcc_nslots) as f:

        for dataset_type in dataset_types:

            # Create file list
            file_paths = speech_list(input_speech_dir=input_speech_dir,
                                     dataset_type=dataset_type)

            # Delete datasets if they already exist
            if 'X_' + dataset_type in f:
                del f['X_' + dataset_type]
                del f['Y_' + dataset_type]

            # Exact shape of the dataset is unfortunately unknown in advance
            # (writing is faster when the shape is known in advance).
            # The chunk size corresponds to one spectrogram frame.
            f.create_dataset('X_' + dataset_type,
                             shape=X_shape,
                             dtype='float32',
                             maxshape=X_maxshape,
                             chunks=X_chunks,
                             compression=compression,
                             shuffle=shuffle)
            f.create_dataset('Y_' + dataset_type,
                             shape=Y_shape,
                             dtype='float32',
                             maxshape=Y_maxshape,
                             chunks=Y_chunks,
                             compression=compression,
                             shuffle=shuffle)

            # STFT attributes
            f.attrs['fs'] = fs
            f.attrs['wlen_sec'] = wlen_sec
            f.attrs['hop_percent'] = hop_percent
            f.attrs['win'] = win
            f.attrs['dtype'] = dtype

            # Label attributes
            f.attrs['quantile_fraction'] = quantile_fraction
            f.attrs['quantile_weight'] = quantile_weight

            # HDF5 attributes
            f.attrs['X_chunks'] = X_chunks
            f.attrs['Y_chunks'] = Y_chunks
            f.attrs['compression'] = compression

            # Store datasets in variables for faster I/O
            fx = f['X_' + dataset_type]
            fy = f['Y_' + dataset_type]

            for i, file_path in tqdm(enumerate(file_paths)):

                speech, fs_speech = sf.read(input_speech_dir + file_path, samplerate=None)

                # Cut burst at beginning of file
                speech = speech[int(0.1 * fs):]

                # Normalize audio
                speech = speech / (np.max(np.abs(speech)))

                if fs != fs_speech:
                    raise ValueError('Unexpected sampling rate')

                # TF representation
                speech_tf = stft(speech, fs=fs, wlen_sec=wlen_sec, win=win,
                                 hop_percent=hop_percent, dtype=dtype)

                spectrogram = np.power(abs(speech_tf), 2)

                if labels == 'vad_labels':
                    # VAD
                    speech_vad = clean_speech_VAD(speech_tf,
                                                  quantile_fraction=quantile_fraction,
                                                  quantile_weight=quantile_weight)
                    label = speech_vad

                if labels == 'labels':
                    # Binary mask
                    speech_ibm = clean_speech_IBM(speech_tf,
                                                  quantile_fraction=quantile_fraction,
                                                  quantile_weight=quantile_weight)
                    label = speech_ibm

                # Append spectrogram to dataset
                fx.resize((fx.shape[1] + spectrogram.shape[1]), axis=1)
                fx[:, -spectrogram.shape[1]:] = spectrogram

                # Append label to dataset
                fy.resize((fy.shape[1] + label.shape[1]), axis=1)
                fy[:, -label.shape[1]:] = label

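# The append pattern above relies on HDF5 datasets created with an unlimited
# time axis (maxshape=(F, None)) and per-frame chunks. Minimal standalone
# sketch with made-up sizes (F = 513 frequency bins), independent of the
# project's actual X_shape / X_chunks settings:
def _h5_append_demo(path='demo_append.h5'):
    F = 513  # assumed number of frequency bins, illustration only

    with h5.File(path, 'w') as f:
        dset = f.create_dataset('X_train',
                                shape=(F, 0),        # start empty along the time axis
                                maxshape=(F, None),  # time axis can grow without bound
                                chunks=(F, 1),       # one chunk per spectrogram frame
                                dtype='float32')

        for _ in range(3):  # pretend we process three utterances
            spectrogram = np.random.rand(F, 100).astype('float32')
            dset.resize(dset.shape[1] + spectrogram.shape[1], axis=1)
            dset[:, -spectrogram.shape[1]:] = spectrogram

        assert dset.shape == (F, 300)
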
def main():
    if not os.path.exists(os.path.dirname(output_dataset_file)):
        os.makedirs(os.path.dirname(output_dataset_file))

    with h5.File(output_dataset_file, 'a', rdcc_nbytes=rdcc_nbytes, rdcc_nslots=rdcc_nslots) as f:

        # STFT attributes
        f.attrs['fs'] = fs
        f.attrs['wlen_sec'] = wlen_sec
        f.attrs['hop_percent'] = hop_percent
        f.attrs['win'] = win
        f.attrs['dtype'] = dtype

        # Label attributes
        f.attrs['quantile_fraction'] = quantile_fraction
        f.attrs['quantile_weight'] = quantile_weight

        # HDF5 attributes
        f.attrs['X_chunks'] = X_chunks
        f.attrs['Y_chunks'] = Y_chunks
        f.attrs['compression'] = compression

        for dataset_type in dataset_types:

            # Create file list
            file_paths = speech_list(input_speech_dir=input_speech_dir,
                                     dataset_type=dataset_type)

            if dataset_type == 'train':
                noise_types = ['domestic', 'nature', 'office', 'transportation']
            if dataset_type == 'validation':
                noise_types = ['nature', 'office', 'public', 'transportation']

            # Create SNR list
            np.random.seed(0)
            noise_index = np.random.randint(len(noise_types), size=len(file_paths))
            snrs = [-5, -2.5, 0, 2.5, 5.0]
            snrs_index = np.random.randint(len(snrs), size=len(file_paths))

            # Create noise_audios from processed noise files
            preprocessed_noise_paths = noise_list_preprocessed(
                preprocessed_noise_dir=output_noise_dir,
                dataset_type=dataset_type)
            noise_audios = {}

            # Load the noise files
            for noise_type, preprocessed_noise_path in preprocessed_noise_paths.items():

                # If noise is already preprocessed, read the files directly
                if os.path.exists(preprocessed_noise_path):
                    noise_audio, fs_noise = sf.read(preprocessed_noise_path)

                    if fs != fs_noise:
                        raise ValueError('Unexpected sampling rate. Did you preprocess '
                                         'the 16kHz version of the DEMAND database?')

                    noise_audios[noise_type] = noise_audio

            # Init list of SNRs
            all_snr_dB = []

            # Delete datasets if they already exist
            if 'X_' + dataset_type in f:
                del f['X_' + dataset_type]
                del f['Y_' + dataset_type]

            # Exact shape of the dataset is unfortunately unknown in advance
            # (writing is faster when the shape is known in advance).
            # The chunk size corresponds to one spectrogram frame.
            f.create_dataset('X_' + dataset_type,
                             shape=X_shape,
                             dtype='float32',
                             maxshape=X_maxshape,
                             chunks=X_chunks,
                             compression=compression,
                             shuffle=shuffle)
            f.create_dataset('Y_' + dataset_type,
                             shape=Y_shape,
                             dtype='float32',
                             maxshape=Y_maxshape,
                             chunks=Y_chunks,
                             compression=compression,
                             shuffle=shuffle)

            # Store datasets in variables for faster I/O
            fx = f['X_' + dataset_type]
            fy = f['Y_' + dataset_type]

            # Compute mean and std over the train set
            if dataset_type == 'train':
                # VAR = E[X**2] - E[X]**2
                channels_sum, channels_squared_sum = 0., 0.

            # Loop over the speech files
            for i, file_path in tqdm(enumerate(file_paths)):

                speech, fs_speech = sf.read(input_speech_dir + file_path, samplerate=None)

                # Cut burst at beginning of file
                speech = speech[int(0.1 * fs):]

                # Normalize audio
                speech = speech / (np.max(np.abs(speech)))

                if fs != fs_speech:
                    raise ValueError('Unexpected sampling rate')

                # Select noise_type
                noise_type = noise_types[noise_index[i]]

                # Extract noise segment
                noise = noise_segment(noise_audios, noise_type, speech)

                # Select SNR
                snr_dB = snrs[snrs_index[i]]
                all_snr_dB.append(snr_dB)

                # Compute noise gain
                speech_power = np.sum(np.power(speech, 2))
                noise_power = np.sum(np.power(noise, 2))
                noise_power_target = speech_power * np.power(10, -snr_dB / 10)
                k = noise_power_target / noise_power
                noise = noise * np.sqrt(k)

                mixture = speech + noise

                # # Normalize by max of speech, noise, speech+noise
                # norm = np.max(abs(np.concatenate([speech, noise, speech + noise])))
                # mixture = (speech + noise) / norm
                # speech /= norm
                # noise /= norm

                if dataset_size == 'subset':
                    # Save .wav files, just to check that it is working
                    output_path = output_wav_dir + file_path
                    output_path = os.path.splitext(output_path)[0]

                    if not os.path.exists(os.path.dirname(output_path)):
                        os.makedirs(os.path.dirname(output_path))

                    sf.write(output_path + '_s.wav', speech, fs)
                    sf.write(output_path + '_n.wav', noise, fs)
                    sf.write(output_path + '_x.wav', mixture, fs)

                # TF representation
                mixture_tf = stft(mixture, fs=fs, wlen_sec=wlen_sec, win=win,
                                  hop_percent=hop_percent, dtype=dtype)

                noisy_spectrogram = np.power(abs(mixture_tf), 2)

                # TF representation
                speech_tf = stft(speech, fs=fs, wlen_sec=wlen_sec, win=win,
                                 hop_percent=hop_percent, dtype=dtype)

                if labels == 'noisy_wiener_labels':
                    # TF representation
                    noise_tf = stft(noise, fs=fs, wlen_sec=wlen_sec, win=win,
                                    hop_percent=hop_percent, dtype=dtype)

                    # Wiener mask
                    speech_wiener_mask = ideal_wiener_mask(speech_tf, noise_tf, eps)
                    label = speech_wiener_mask

                if labels == 'noisy_labels':
                    # Binary mask
                    speech_ibm = clean_speech_IBM(speech_tf,
                                                  quantile_fraction=quantile_fraction,
                                                  quantile_weight=quantile_weight)
                    label = speech_ibm

                if labels == 'noisy_vad_labels':
                    # VAD
                    speech_vad = clean_speech_VAD(speech_tf,
                                                  quantile_fraction=quantile_fraction,
                                                  quantile_weight=quantile_weight)
                    label = speech_vad

                # Accumulate sums for mean and std
                if dataset_type == 'train':
                    # VAR = E[X**2] - E[X]**2
                    channels_sum += np.sum(noisy_spectrogram, axis=-1)
                    channels_squared_sum += np.sum(noisy_spectrogram**2, axis=-1)

                # Append spectrogram to dataset
                fx.resize((fx.shape[1] + noisy_spectrogram.shape[1]), axis=1)
                fx[:, -noisy_spectrogram.shape[1]:] = noisy_spectrogram

                # Append label to dataset
                fy.resize((fy.shape[1] + label.shape[1]), axis=1)
                fy[:, -label.shape[1]:] = label

            # Compute and save mean, std
            if dataset_type == 'train':
                print('Compute mean and std')
                # NB: compute the empirical std (!= regular std)
                n_samples = fx.shape[1]
                mean = channels_sum / n_samples
                std = np.sqrt((1 / (n_samples - 1)) * (channels_squared_sum - n_samples * mean**2))

                # Delete datasets if they already exist
                if 'X_' + dataset_type + '_mean' in f:
                    del f['X_' + dataset_type + '_mean']
                    del f['X_' + dataset_type + '_std']

                f.create_dataset('X_' + dataset_type + '_mean',
                                 shape=X_chunks,
                                 dtype='float32',
                                 maxshape=X_chunks,
                                 chunks=None,
                                 compression=compression,
                                 shuffle=shuffle)
                f.create_dataset('X_' + dataset_type + '_std',
                                 shape=X_chunks,
                                 dtype='float32',
                                 maxshape=X_chunks,
                                 chunks=None,
                                 compression=compression,
                                 shuffle=shuffle)

                f['X_' + dataset_type + '_mean'][:] = mean[..., None]  # Add axis to fit chunks shape
                f['X_' + dataset_type + '_std'][:] = std[..., None]  # Add axis to fit chunks shape

                print('Mean and std saved in HDF5.')

            # TODO: save SNR, level_s, level_n in 1 big csv
            write_dataset(all_snr_dB, output_wav_dir, dataset_type, 'snr_db')

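# Quick standalone check of the noise-gain computation used above: scaling the
# noise by sqrt(k) makes the speech-to-noise power ratio match the drawn SNR.
# The signals here are random and purely illustrative.
def _noise_gain_demo(snr_dB=2.5):
    rng = np.random.default_rng(0)
    speech = rng.standard_normal(16000)
    noise = 0.3 * rng.standard_normal(16000)

    speech_power = np.sum(speech**2)
    noise_power = np.sum(noise**2)
    noise_power_target = speech_power * 10 ** (-snr_dB / 10)
    k = noise_power_target / noise_power
    noise *= np.sqrt(k)

    achieved_snr_dB = 10 * np.log10(speech_power / np.sum(noise**2))
    assert abs(achieved_snr_dB - snr_dB) < 1e-10
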
def test_write_read_frames(dataset_type):
    # Parameters
    ## Dataset
    input_speech_dir = 'data/subset/raw/'
    output_speech_dir = 'data/subset/processed/'
    output_data_dir = 'data/subset/pickle/'
    fs = int(16e3)  # Sampling rate

    ## STFT
    wlen_sec = 64e-3  # window length in seconds
    hop_percent = 0.25  # hop size as a percentage of the window length
    win = 'hann'  # type of window
    dtype = 'complex64'

    # Create file list
    file_paths = speech_list(input_speech_dir=input_speech_dir,
                             dataset_type=dataset_type)

    spectrograms = []

    for path in file_paths:
        x, fs_x = sf.read(input_speech_dir + path, samplerate=None)

        # Cut burst at beginning of file
        # x[:int(0.1*fs)] = x[int(0.1*fs):int(0.2*fs)]
        x = x[int(0.1 * fs):]

        # Normalize audio
        x = x / (np.max(np.abs(x)))
        # x = x/(np.max(np.abs(x)) + 2)
        # x = x/np.linalg.norm(x)

        if not os.path.exists(os.path.dirname(output_speech_dir + path)):
            os.makedirs(os.path.dirname(output_speech_dir + path))
        sf.write(output_speech_dir + path, x, fs_x)

        if fs != fs_x:
            raise ValueError('Unexpected sampling rate')

        # TF representation
        x_tf = stft(x, fs=fs, wlen_sec=wlen_sec, win=win,
                    hop_percent=hop_percent, dtype=dtype)

        spectrograms.append(np.power(abs(x_tf), 2))

    spectrograms = np.concatenate(spectrograms, axis=1)
    # spectrograms = spectrograms[1]

    # Write spectrograms
    write_dataset(spectrograms,
                  output_data_dir=output_data_dir,
                  dataset_type=dataset_type,
                  suffix='frames')

    # Read pickle
    pickle_spectrograms = read_dataset(data_dir=output_data_dir,
                                       dataset_type=dataset_type,
                                       suffix='frames')

    # Assert stored data is the same as the spectrograms
    assert_array_equal(spectrograms, pickle_spectrograms)

def compute_metrics_utt(args):
    # Separate args
    file_path, snr_db = args[0], args[1]
    # print(file_path)

    # Read files
    s_t, fs_s = sf.read(processed_data_dir + os.path.splitext(file_path)[0] + '_s.wav')  # clean speech
    n_t, fs_n = sf.read(processed_data_dir + os.path.splitext(file_path)[0] + '_n.wav')  # noise
    x_t, fs_x = sf.read(processed_data_dir + os.path.splitext(file_path)[0] + '_x.wav')  # mixture
    s_hat_t, fs_s_hat = sf.read(model_data_dir + os.path.splitext(file_path)[0] + '_s_est.wav')  # est. speech

    # Compute metrics

    ## SI-SDR, SI-SIR, SI-SAR
    si_sdr, si_sir, si_sar = energy_ratios(s_hat=s_hat_t, s=s_t, n=n_t)

    ## STOI (or ESTOI?)
    stoi_s_hat = stoi(s_t, s_hat_t, fs, extended=True)
    # all_stoi.append(stoi_s_hat)

    ## PESQ
    pesq_s_hat = pesq(fs, s_t, s_hat_t, 'wb')  # wb = wideband
    # all_pesq.append(pesq_s_hat)

    ## POLQA
    # polqa_s_hat = polqa(s, s_t, fs)
    # all_polqa.append(polqa_s_hat)

    # TF representation
    s_tf = stft(s_t, fs=fs, wlen_sec=wlen_sec, win=win,
                hop_percent=hop_percent, dtype=dtype)  # shape = (freq_bins, frames)

    # Plots of target / estimation
    # TF representation
    x_tf = stft(x_t, fs=fs, wlen_sec=wlen_sec, win=win,
                hop_percent=hop_percent, dtype=dtype)  # shape = (freq_bins, frames)
    s_hat_tf = stft(s_hat_t, fs=fs, wlen_sec=wlen_sec, win=win,
                    hop_percent=hop_percent, dtype=dtype)  # shape = (freq_bins, frames)

    ## mixture signal (wav + spectro)
    ## target signal (wav + spectro + mask)
    ## estimated signal (wav + spectro + mask)
    signal_list = [
        [x_t, x_tf, None],  # mixture: (waveform, tf_signal, no mask)
        [s_t, s_tf, None],  # clean speech
        [s_hat_t, s_hat_tf, None]
    ]

    fig = display_multiple_signals(signal_list,
                                   fs=fs, vmin=vmin, vmax=vmax,
                                   wlen_sec=wlen_sec, hop_percent=hop_percent,
                                   xticks_sec=xticks_sec, fontsize=fontsize)

    # Put all metrics in the title of the figure
    title = "Input SNR = {:.1f} dB \n" \
            "SI-SDR = {:.1f} dB, " \
            "SI-SIR = {:.1f} dB, " \
            "SI-SAR = {:.1f} dB \n" \
            "STOI = {:.2f}, " \
            "PESQ = {:.2f} \n" \
            "".format(snr_db, si_sdr, si_sir, si_sar, stoi_s_hat, pesq_s_hat)
    fig.suptitle(title, fontsize=40)

    # Save figure
    fig.savefig(model_data_dir + os.path.splitext(file_path)[0] + '_fig.png')

    # Clear figure
    plt.close()

    metrics = [si_sdr, si_sir, si_sar, stoi_s_hat, pesq_s_hat]

    return metrics

def test_write_read_frames(dataset_type):
    # Parameters
    ## Dataset
    input_speech_dir = 'data/subset/raw/'
    output_speech_dir = 'data/subset/processed/'
    output_data_dir = 'data/subset/h5/'
    data_dir = 'CSR-1-WSJ-0'
    suffix = 'lzf'

    output_h5_dir = output_data_dir + data_dir + '_' + suffix + '.h5'

    # Create file list
    file_paths = speech_list(input_speech_dir=input_speech_dir,
                             dataset_type=dataset_type)

    # Open HDF5 file
    # We are using 400 MB of chunk cache memory here ("rdcc_nbytes" and "rdcc_nslots")
    with h5.File(output_h5_dir, 'r', rdcc_nbytes=1024**2 * 400, rdcc_nslots=int(10e6)) as f:

        dx = f['X_' + dataset_type]
        dy = f['Y_' + dataset_type]

        ## STFT
        fs = f.attrs['fs']  # Sampling rate
        wlen_sec = f.attrs['wlen_sec']  # window length in seconds
        hop_percent = f.attrs['hop_percent']  # hop size as a percentage of the window length
        win = f.attrs['win']  # type of window
        dtype = f.attrs['dtype']

        ## Ideal binary mask
        quantile_fraction = f.attrs['quantile_fraction']
        quantile_weight = f.attrs['quantile_weight']

        frame_begin = 0
        frame_end = 0

        for path in file_paths:
            x, fs_x = sf.read(input_speech_dir + path, samplerate=None)

            # Cut burst at beginning of file
            # x[:int(0.1*fs)] = x[int(0.1*fs):int(0.2*fs)]
            x = x[int(0.1 * fs):]

            # Normalize audio
            x = x / (np.max(np.abs(x)))

            if fs != fs_x:
                raise ValueError('Unexpected sampling rate')

            # TF representation
            x_tf = stft(x, fs=fs, wlen_sec=wlen_sec, win=win,
                        hop_percent=hop_percent, dtype=dtype)

            spectrogram = np.power(abs(x_tf), 2)

            # Binary mask
            label = clean_speech_IBM(x_tf,
                                     quantile_fraction=quantile_fraction,
                                     quantile_weight=quantile_weight)

            # Read the corresponding slice of the HDF5 datasets
            frame_end += spectrogram.shape[1]
            h5_spectrogram = dx[:, frame_begin:frame_end]
            h5_label = dy[:, frame_begin:frame_end]

            # Assert stored data is the same as the spectrograms
            assert_array_equal(spectrogram, h5_spectrogram)
            assert_array_equal(label, h5_label)

            # Next iteration
            frame_begin += spectrogram.shape[1]