def main():
    # Load input SNR
    all_snr_db = read_dataset(processed_data_dir, dataset_type, 'snr_db')
    all_snr_db = np.array(all_snr_db)

    # Create file list
    file_paths = speech_list(input_speech_dir=input_speech_dir,
                             dataset_type=dataset_type)

    # Fuse both lists
    args = [[file_path, snr_db] for file_path, snr_db in zip(file_paths, all_snr_db)]

    t1 = time.perf_counter()

    with concurrent.futures.ProcessPoolExecutor(max_workers=None) as executor:
        all_metrics = executor.map(compute_metrics_utt, args)

    t2 = time.perf_counter()
    print(f'Finished in {t2 - t1} seconds')

    # Transform generator to list
    all_metrics = list(all_metrics)
    metrics_keys = ['SI-SDR', 'SI-SIR', 'SI-SAR', 'STOI', 'PESQ']

    # Compute & save stats
    compute_stats(metrics_keys=metrics_keys,
                  all_metrics=all_metrics,
                  all_snr_db=all_snr_db,
                  model_data_dir=model_data_dir,
                  confidence=confidence)
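# --- Hypothetical sketch (assumption, not the repo's actual implementation) ---
# compute_metrics_utt, as used with executor.map above, presumably takes one
# [file_path, snr_db] pair and returns one value per entry in metrics_keys.
# The sketch below only covers SI-SDR, STOI and PESQ (SI-SIR and SI-SAR would
# need the separated noise component); the '_s.wav'/'_x.wav' suffixes and the
# processed_data_dir global follow the other scripts in this repo.
import os
import numpy as np
import soundfile as sf
from pystoi import stoi
from pesq import pesq

def compute_metrics_utt_sketch(args):
    file_path, snr_db = args
    base = processed_data_dir + os.path.splitext(file_path)[0]
    s_t, fs_s = sf.read(base + '_s.wav')  # clean reference
    x_t, fs_x = sf.read(base + '_x.wav')  # estimate (here: the noisy mixture)

    # Scale-invariant SDR, computed directly from its definition
    alpha = np.dot(x_t, s_t) / np.dot(s_t, s_t)
    e_target = alpha * s_t
    e_residual = x_t - e_target
    si_sdr = 10 * np.log10(np.sum(e_target ** 2) / np.sum(e_residual ** 2))

    return [si_sdr,
            stoi(s_t, x_t, fs_s, extended=True),
            pesq(fs_s, s_t, x_t, 'wb')]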
def main():
    # Load input SNR
    all_snr_db = read_dataset(processed_data_dir, dataset_type, 'snr_db')
    all_snr_db = np.array(all_snr_db)

    # Create file list
    file_paths = speech_list(input_speech_dir=input_speech_dir,
                             dataset_type=dataset_type)

    # 1 list per metric
    all_stoi = []
    all_pesq = []
    all_polqa = []
    all_f1score = []

    for i, file_path in tqdm(enumerate(file_paths)):
        # Read files
        s_t, fs_s = sf.read(processed_data_dir + os.path.splitext(file_path)[0] + '_s.wav')  # clean speech
        n_t, fs_n = sf.read(processed_data_dir + os.path.splitext(file_path)[0] + '_n.wav')  # noise
        x_t, fs_x = sf.read(processed_data_dir + os.path.splitext(file_path)[0] + '_x.wav')  # mixture

        # Compute metrics
        ## STOI (or ESTOI?)
        stoi_s_hat = stoi(s_t, x_t, fs, extended=True)
        all_stoi.append(stoi_s_hat)

        ## PESQ
        pesq_s_hat = pesq(fs, s_t, x_t, 'wb')  # wb = wideband
        all_pesq.append(pesq_s_hat)

        ## POLQA
        # polqa_s_hat = polqa(s, s_t, fs)
        # all_polqa.append(polqa_s_hat)

        # TF representation
        n_tf = stft(n_t, fs=fs, wlen_sec=wlen_sec, win=win,
                    hop_percent=hop_percent, dtype=dtype)  # shape = (freq_bins, frames)
        s_tf = stft(s_t, fs=fs, wlen_sec=wlen_sec, win=win,
                    hop_percent=hop_percent, dtype=dtype)  # shape = (freq_bins, frames)

        # Plots of target / estimation
        # TF representation
        x_tf = stft(x_t, fs=fs, wlen_sec=wlen_sec, win=win,
                    hop_percent=hop_percent, dtype=dtype)  # shape = (freq_bins, frames)

        # ## mixture signal (wav + spectro)
        # ## target signal (wav + spectro + mask)
        # ## estimated signal (wav + spectro + mask)
        # signal_list = [
        #     [x_t, x_tf, None],  # mixture: (waveform, tf_signal, no mask)
        #     [s_t, s_tf, None],  # clean speech
        #     [n_t, n_tf, None]
        # ]
        # fig = display_multiple_signals(signal_list,
        #                                fs=fs, vmin=vmin, vmax=vmax,
        #                                wlen_sec=wlen_sec, hop_percent=hop_percent,
        #                                xticks_sec=xticks_sec, fontsize=fontsize)
        #
        # # Put all metrics in the title of the figure
        # title = "Input SNR = {:.1f} dB \n" \
        #         "STOI = {:.2f}, " \
        #         "PESQ = {:.2f} \n" \
        #         "".format(all_snr_db[i], stoi_s_hat, pesq_s_hat)
        # fig.suptitle(title, fontsize=40)
        #
        # # Save figure
        # fig.savefig(processed_data_dir + os.path.splitext(file_path)[0] + '_fig.png')
        #
        # # Clear figure
        # plt.close()

    # Confidence interval
    metrics = {
        'SNR': all_snr_db,
        'STOI': all_stoi,
        'PESQ': all_pesq
    }

    stats = {}

    # Print the names of the columns.
    print("{:<10} {:<10} {:<10}".format('METRIC', 'AVERAGE', 'CONF. INT.'))
    for key, metric in metrics.items():
        m, h = mean_confidence_interval(metric, confidence=confidence)
        stats[key] = {'avg': m, '+/-': h}
        print("{:<10} {:<10} {:<10}".format(key, m, h))
    print('\n')

    # Save stats (si_sdr, si_sar, etc.)
    # NB: file_path is whatever the loop above left it at; only its directory is used here.
    with open(processed_data_dir + os.path.dirname(os.path.dirname(file_path)) + 'stats.json', 'w') as f:
        json.dump(stats, f)

    # Metrics by input SNR
    for snr_db in np.unique(all_snr_db):
        stats = {}

        print('Input SNR = {:.2f}'.format(snr_db))
        # Print the names of the columns.
        print("{:<10} {:<10} {:<10}".format('METRIC', 'AVERAGE', 'CONF. INT.'))
        for key, metric in metrics.items():
            subset_metric = np.array(metric)[np.where(all_snr_db == snr_db)]
            m, h = mean_confidence_interval(subset_metric, confidence=confidence)
            stats[key] = {'avg': m, '+/-': h}
            print("{:<10} {:<10} {:<10}".format(key, m, h))
        print('\n')

        # Save stats (si_sdr, si_sar, etc.)
        with open(processed_data_dir + os.path.dirname(os.path.dirname(file_path)) + 'stats_{:g}.json'.format(snr_db), 'w') as f:
            json.dump(stats, f)
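# --- Hypothetical sketch (assumption, not necessarily the repo's helper) ------
# mean_confidence_interval above is assumed to return the sample mean and the
# half-width of a Student-t confidence interval, the usual meaning of this
# helper name.
import numpy as np
import scipy.stats

def mean_confidence_interval_sketch(data, confidence=0.95):
    a = np.asarray(data, dtype=float)
    n = a.size
    m = np.mean(a)
    se = scipy.stats.sem(a)  # standard error of the mean
    h = se * scipy.stats.t.ppf((1 + confidence) / 2.0, n - 1)  # interval half-width
    return m, h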
def main():
    # Load input SNR
    all_snr_db = read_dataset(processed_data_dir, dataset_type, 'snr_db')
    all_snr_db = np.array(all_snr_db)

    device = torch.device("cuda" if cuda else "cpu")
    file = open('output.log', 'w')

    print('Torch version: {}'.format(torch.__version__))
    print('Device: %s' % (device))
    if torch.cuda.device_count() >= 1:
        print("Number GPUs: ", torch.cuda.device_count())

    # Create file list
    file_paths = speech_list(input_speech_dir=input_speech_dir,
                             dataset_type=dataset_type)

    model = VariationalAutoencoder([x_dim, z_dim, h_dim])
    model.load_state_dict(torch.load(model_data_path))
    if cuda:
        model = model.cuda()

    model.eval()
    for param in model.parameters():
        param.requires_grad = False

    for i, file_path in tqdm(enumerate(file_paths)):
        # Read files
        s_t, fs_s = sf.read(processed_data_dir + os.path.splitext(file_path)[0] + '_s.wav')  # clean speech
        x_t, fs_x = sf.read(processed_data_dir + os.path.splitext(file_path)[0] + '_x.wav')  # mixture
        # x = x/np.max(x)
        T_orig = len(x_t)

        # TF representation
        # Input should be (frames, freq_bins)
        x_tf = stft(x_t, fs=fs, wlen_sec=wlen_sec, win=win,
                    hop_percent=hop_percent, dtype=dtype)

        # Transpose to match PyTorch
        x_tf = x_tf.T  # (frames, freq_bins)

        # Power spectrogram (transpose)
        x = torch.tensor(np.power(np.abs(x_tf), 2)).to(device)

        # Encode-decode
        reconstruction, _, _ = model(x)
        reconstruction = reconstruction.cpu().numpy()

        # Plots of target / estimation
        s_tf = stft(s_t, fs=fs, wlen_sec=wlen_sec, win=win,
                    hop_percent=hop_percent, dtype=dtype)  # shape = (freq_bins, frames)

        # Transpose to match librosa.display
        reconstruction = reconstruction.T

        # Transform to dB
        x_psd = x.cpu().numpy().T
        x_psd = librosa.core.power_to_db(x_psd)

        s_psd = np.power(abs(s_tf), 2)
        s_psd = librosa.core.power_to_db(s_psd)

        reconstruction = librosa.core.power_to_db(reconstruction)

        ## mixture signal (wav + spectro)
        ## target signal (wav + spectro + mask)
        ## estimated signal (wav + spectro + mask)
        signal_list = [
            [x_t, x_psd],  # mixture: (waveform, tf_signal)
            [s_t, s_psd],  # clean speech
            [None, reconstruction]  # reconstruction (no waveform)
        ]
        #TODO: modify
        fig = display_multiple_spectro(signal_list,
                                       fs=fs, vmin=vmin, vmax=vmax,
                                       wlen_sec=wlen_sec, hop_percent=hop_percent,
                                       xticks_sec=xticks_sec, fontsize=fontsize)

        # Put all metrics in the title of the figure
        title = "Input SNR = {:.1f} dB \n".format(all_snr_db[i])
        fig.suptitle(title, fontsize=40)

        # Save figure
        fig.savefig(output_data_dir + os.path.splitext(file_path)[0] + '_recon.png')

        # Close figure to free memory between iterations
        plt.close(fig)
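# --- Hypothetical sketch (assumption about the VariationalAutoencoder API) ----
# The loop above only assumes that the model is built from an [x_dim, z_dim,
# h_dim] spec and that model(x) returns (reconstruction, mu, logvar) on power
# spectrogram frames. A minimal module with that interface could look like
# this; the real architecture (layer count, activations, non-negative output)
# almost certainly differs.
import torch
import torch.nn as nn

class VAESketch(nn.Module):
    def __init__(self, dims):
        super().__init__()
        x_dim, z_dim, h_dim = dims  # assumes a single hidden-layer size
        self.encoder = nn.Sequential(nn.Linear(x_dim, h_dim), nn.Tanh())
        self.mu = nn.Linear(h_dim, z_dim)
        self.logvar = nn.Linear(h_dim, z_dim)
        # Real decoder would likely end with exp/softplus to stay non-negative
        self.decoder = nn.Sequential(nn.Linear(z_dim, h_dim), nn.Tanh(),
                                     nn.Linear(h_dim, x_dim))

    def forward(self, x):
        h = self.encoder(x)
        mu, logvar = self.mu(h), self.logvar(h)
        z = mu + torch.exp(0.5 * logvar) * torch.randn_like(mu)  # reparameterization
        return self.decoder(z), mu, logvar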
def main():
    # Load input SNR
    all_snr_db = read_dataset(processed_data_dir, dataset_type, 'snr_db')
    all_snr_db = np.array(all_snr_db)

    device = torch.device("cuda" if cuda else "cpu")
    file = open('output.log', 'w')

    print('Torch version: {}'.format(torch.__version__))
    print('Device: %s' % (device))
    if torch.cuda.device_count() >= 1:
        print("Number GPUs: ", torch.cuda.device_count())

    # Create file list
    file_paths = speech_list(input_speech_dir=input_speech_dir,
                             dataset_type=dataset_type)

    classifier = Classifier([x_dim, h_dim_cl, y_dim], batch_norm=batch_norm)
    classifier.load_state_dict(torch.load(classif_dir, map_location=cuda_device))
    if cuda:
        classifier = classifier.cuda()

    classifier.eval()
    for param in classifier.parameters():
        param.requires_grad = False

    for i, file_path in tqdm(enumerate(file_paths)):
        # Read files
        s_t, fs_s = sf.read(processed_data_dir + os.path.splitext(file_path)[0] + '_s.wav')  # clean speech
        x_t, fs_x = sf.read(processed_data_dir + os.path.splitext(file_path)[0] + '_x.wav')  # mixture
        # x = x/np.max(x)
        T_orig = len(x_t)

        # TF representation
        # Input should be (frames, freq_bins)
        x_tf = stft(x_t, fs=fs, wlen_sec=wlen_sec, win=win,
                    hop_percent=hop_percent, dtype=dtype)

        # Transpose to match PyTorch
        x_tf = x_tf.T  # (frames, freq_bins)

        # Power spectrogram (transpose)
        x = torch.tensor(np.power(np.abs(x_tf), 2)).to(device)

        # Normalize power spectrogram
        if std_norm:
            x -= mean.T
            x /= (std + eps).T

        # Classify
        y_hat_soft = classifier(x)
        y_hat_hard = (y_hat_soft > 0.5).int()
        y_hat_hard = y_hat_hard.cpu().numpy()
        y_hat_hard = y_hat_hard.T  # Transpose to match librosa.display

        # Plots of target / estimation
        s_tf = stft(s_t, fs=fs, wlen_sec=wlen_sec, win=win,
                    hop_percent=hop_percent, dtype=dtype)  # shape = (freq_bins, frames)

        if labels == 'labels':  # binary mask
            target = clean_speech_IBM(s_tf,
                                      quantile_fraction=quantile_fraction,
                                      quantile_weight=quantile_weight)
        elif labels == 'vad_labels':  # vad
            target = clean_speech_VAD(s_tf,
                                      quantile_fraction=quantile_fraction,
                                      quantile_weight=quantile_weight)

        # Transpose to match librosa.display
        x_tf = x_tf.T

        # F1-score
        f1score_s_hat = f1_score(target.flatten(), y_hat_hard.flatten(), average="binary")

        ## mixture signal (wav + spectro)
        ## target signal (wav + spectro + mask)
        ## estimated signal (wav + spectro + mask)
        signal_list = [
            [x_t, x_tf, None],  # mixture: (waveform, tf_signal, no mask)
            [s_t, s_tf, target],  # clean speech
            [None, None, y_hat_hard]
            #[None, None, y_hat_soft]
        ]
        #TODO: modify
        fig = display_multiple_signals(signal_list,
                                       fs=fs, vmin=vmin, vmax=vmax,
                                       wlen_sec=wlen_sec, hop_percent=hop_percent,
                                       xticks_sec=xticks_sec, fontsize=fontsize)

        # Put all metrics in the title of the figure
        title = "Input SNR = {:.1f} dB \n" \
                "F1-score = {:.3f} \n".format(all_snr_db[i], f1score_s_hat)
        fig.suptitle(title, fontsize=40)

        # Save figure
        output_path = classif_data_dir + file_path
        output_path = os.path.splitext(output_path)[0]

        if not os.path.exists(os.path.dirname(output_path)):
            os.makedirs(os.path.dirname(output_path))

        fig.savefig(output_path + '_hard_mask.png')

        # Close figure to free memory between iterations
        plt.close(fig)
def test_write_read_labels(dataset_type):
    # Parameters
    ## Dataset
    input_speech_dir = 'data/subset/raw/'
    output_speech_dir = 'data/subset/processed/'
    output_data_dir = 'data/subset/pickle/'
    fs = int(16e3)  # Sampling rate

    ## STFT
    wlen_sec = 64e-3  # window length in seconds
    hop_percent = 0.25  # hop size as a percentage of the window length
    win = 'hann'  # type of window
    dtype = 'complex64'

    ## Ideal binary mask
    quantile_fraction = 0.98
    quantile_weight = 0.999

    # Create file list
    file_paths = speech_list(input_speech_dir=input_speech_dir,
                             dataset_type=dataset_type)

    labels = []

    for path in file_paths:
        x, fs_x = sf.read(input_speech_dir + path, samplerate=None)

        # Cut burst at beginning of file
        x[:int(0.1 * fs)] = x[int(0.1 * fs):int(0.2 * fs)]

        # Normalize audio
        x = x / np.max(np.abs(x))
        #x = x/(np.max(np.abs(x)) + 2)
        #x = x/np.linalg.norm(x)

        if fs != fs_x:
            raise ValueError('Unexpected sampling rate')

        # TF representation
        x_tf = stft(x, fs=fs, wlen_sec=wlen_sec, win=win,
                    hop_percent=hop_percent, dtype=dtype)

        # Binary mask
        x_ibm = clean_speech_IBM(x_tf,
                                 quantile_fraction=quantile_fraction,
                                 quantile_weight=quantile_weight)
        labels.append(x_ibm)

    labels = np.concatenate(labels, axis=1)
    #labels = labels[1]

    # Write labels
    write_dataset(labels,
                  output_data_dir=output_data_dir,
                  dataset_type=dataset_type,
                  suffix='labels')

    # Read pickle
    pickle_labels = read_dataset(data_dir=output_data_dir,
                                 dataset_type=dataset_type,
                                 suffix='labels')

    # Assert stored data is the same as the labels
    assert_array_equal(labels, pickle_labels)
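# --- Hypothetical sketch (assumption about clean_speech_IBM's behaviour) ------
# A quantile-based ideal binary mask: keep the time-frequency bins whose power
# lies above a threshold chosen from the sorted power values at
# quantile_fraction. The repo's clean_speech_IBM (and its quantile_weight
# parameter) may implement this differently, e.g. with per-frequency
# thresholds or mask smoothing.
import numpy as np

def quantile_ibm_sketch(x_tf, quantile_fraction=0.98):
    power = np.abs(x_tf) ** 2
    sorted_power = np.sort(power, axis=None)
    threshold = sorted_power[int(quantile_fraction * (sorted_power.size - 1))]
    return (power > threshold).astype(np.float32)  # shape = (freq_bins, frames)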
def test_write_read_frames(dataset_type):
    # Parameters
    ## Dataset
    input_speech_dir = 'data/subset/raw/'
    output_speech_dir = 'data/subset/processed/'
    output_data_dir = 'data/subset/pickle/'
    fs = int(16e3)  # Sampling rate

    ## STFT
    wlen_sec = 64e-3  # window length in seconds
    hop_percent = 0.25  # hop size as a percentage of the window length
    win = 'hann'  # type of window
    dtype = 'complex64'

    # Create file list
    file_paths = speech_list(input_speech_dir=input_speech_dir,
                             dataset_type=dataset_type)

    spectrograms = []

    for path in file_paths:
        x, fs_x = sf.read(input_speech_dir + path, samplerate=None)

        # Cut burst at beginning of file
        #x[:int(0.1*fs)] = x[int(0.1*fs):int(0.2*fs)]
        x = x[int(0.1 * fs):]

        # Normalize audio
        x = x / np.max(np.abs(x))
        #x = x/(np.max(np.abs(x)) + 2)
        #x = x/np.linalg.norm(x)

        if not os.path.exists(os.path.dirname(output_speech_dir + path)):
            os.makedirs(os.path.dirname(output_speech_dir + path))

        sf.write(output_speech_dir + path, x, fs_x)

        if fs != fs_x:
            raise ValueError('Unexpected sampling rate')

        # TF representation
        x_tf = stft(x, fs=fs, wlen_sec=wlen_sec, win=win,
                    hop_percent=hop_percent, dtype=dtype)

        spectrograms.append(np.power(abs(x_tf), 2))

    spectrograms = np.concatenate(spectrograms, axis=1)
    #spectrograms = spectrograms[1]

    # Write spectrograms
    write_dataset(spectrograms,
                  output_data_dir=output_data_dir,
                  dataset_type=dataset_type,
                  suffix='frames')

    # Read pickle
    pickle_spectrograms = read_dataset(data_dir=output_data_dir,
                                       dataset_type=dataset_type,
                                       suffix='frames')

    # Assert stored data is the same as the spectrograms
    assert_array_equal(spectrograms, pickle_spectrograms)
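# --- Hypothetical sketch (assumption about the write_dataset/read_dataset API) ---
# The tests above only require that whatever write_dataset stores, read_dataset
# returns unchanged. A minimal pickle-based pair with the same keyword
# arguments could look like this; the actual helpers may use a different file
# layout or serialization format.
import os
import pickle

def write_dataset_sketch(data, output_data_dir, dataset_type, suffix):
    os.makedirs(output_data_dir, exist_ok=True)
    with open(os.path.join(output_data_dir, '{}_{}.p'.format(dataset_type, suffix)), 'wb') as f:
        pickle.dump(data, f)

def read_dataset_sketch(data_dir, dataset_type, suffix):
    with open(os.path.join(data_dir, '{}_{}.p'.format(dataset_type, suffix)), 'rb') as f:
        return pickle.load(f)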
def main():
    # Load input SNR
    all_snr_db = read_dataset(processed_data_dir, dataset_type, 'snr_db')
    all_snr_db = np.array(all_snr_db)

    # Create file list
    file_paths = speech_list(input_speech_dir=input_speech_dir,
                             dataset_type=dataset_type)

    for i, file_path in tqdm(enumerate(file_paths)):
        # Read files
        s_t, fs_s = sf.read(processed_data_dir + os.path.splitext(file_path)[0] + '_s.wav')  # clean speech
        x_t, fs_x = sf.read(processed_data_dir + os.path.splitext(file_path)[0] + '_x.wav')  # mixture
        # x = x/np.max(x)
        T_orig = len(x_t)

        # TF representation
        # Input should be (frames, freq_bins)
        x_tf = stft(x_t, fs=fs, wlen_sec=wlen_sec, win=win,
                    hop_percent=hop_percent, dtype=dtype)

        # Power spectrogram (transpose)
        x = np.power(np.abs(x_tf), 2)

        # Estimate mask
        y_hat_soft = timo_mask_estimation(x)
        y_hat_hard = (y_hat_soft > 0.5).astype(int)

        # Plots of target / estimation
        s_tf = stft(s_t, fs=fs, wlen_sec=wlen_sec, win=win,
                    hop_percent=hop_percent, dtype=dtype)  # shape = (freq_bins, frames)

        # Binary mask
        s_ibm = clean_speech_IBM(s_tf,
                                 quantile_fraction=quantile_fraction,
                                 quantile_weight=quantile_weight)

        # F1-score
        f1score_s_hat = f1_score(s_ibm.flatten(), y_hat_hard.flatten(), average="binary")

        ## mixture signal (wav + spectro)
        ## target signal (wav + spectro + mask)
        ## estimated signal (wav + spectro + mask)
        signal_list = [
            [x_t, x_tf, None],  # mixture: (waveform, tf_signal, no mask)
            [s_t, s_tf, s_ibm],  # clean speech
            [None, None, y_hat_soft]  # estimated soft mask
        ]
        #TODO: modify
        fig = display_multiple_signals(signal_list,
                                       fs=fs, vmin=vmin, vmax=vmax,
                                       wlen_sec=wlen_sec, hop_percent=hop_percent,
                                       xticks_sec=xticks_sec, fontsize=fontsize)

        # Put all metrics in the title of the figure
        title = "Input SNR = {:.1f} dB \n".format(all_snr_db[i])
        fig.suptitle(title, fontsize=40)

        # Save figure
        output_path = model_data_dir + file_path
        output_path = os.path.splitext(output_path)[0]

        if not os.path.exists(os.path.dirname(output_path)):
            os.makedirs(os.path.dirname(output_path))

        fig.savefig(output_path + '_soft_mask.png')
        plt.close(fig)  # free figure memory

        # Same signals, this time with the estimated hard mask
        signal_list = [
            [x_t, x_tf, None],  # mixture: (waveform, tf_signal, no mask)
            [s_t, s_tf, s_ibm],  # clean speech
            [None, None, y_hat_hard]  # estimated hard mask
        ]
        fig = display_multiple_signals(signal_list,
                                       fs=fs, vmin=vmin, vmax=vmax,
                                       wlen_sec=wlen_sec, hop_percent=hop_percent,
                                       xticks_sec=xticks_sec, fontsize=fontsize)

        # Put all metrics in the title of the figure
        title = "Input SNR = {:.1f} dB \n" \
                "F1-score = {:.3f} \n".format(all_snr_db[i], f1score_s_hat)
        fig.suptitle(title, fontsize=40)

        # Save figure (same output_path as above)
        fig.savefig(output_path + '_hard_mask.png')
        plt.close(fig)  # free figure memory