def evaluation():
    """Evaluate the pre-trained EnsembleNetwork on the held-out test set.

    For each latent-space cluster (gender or nominal mixture SNR,
    selected by ``args.latent_space``), mixes test utterances with noise,
    runs the ensemble to predict a TF mask, reconstructs the denoised
    signal, and records the duration-weighted SI-SDR improvement.
    Results are logged as JSON and written to ``test_results.txt`` under
    ``output_directory``.
    """
    with torch.no_grad():
        #
        # initialize network (fixed seeds so test-file selection is reproducible)
        #
        np.random.seed(0)
        torch.manual_seed(0)
        network = EnsembleNetwork(
            filepath_gating=file_gating,
            filepaths_denoising=files_specialists,
            g_hs=hidden_size_gating,
            g_nl=num_layers_gating,
            s_hs=hidden_size_specialist,
            s_nl=num_layers_specialist,
            ct=args.latent_space,
        ).to(device=args.device_id)

        # record which checkpoint files built this ensemble
        F.write_data(filename=os.path.join(output_directory, 'files_gating.txt'),
                     data=str(file_gating))
        F.write_data(filename=os.path.join(output_directory, 'files_specialist.txt'),
                     data=str(files_specialists))

        with torch.cuda.device(args.device_id):
            torch.cuda.empty_cache()

        def _weighted_batch_sisdr(files_speech, files_noise, snr_db):
            """Duration-weighted mean SI-SDR improvement over one test batch."""
            durations = list()
            sisdr_scores = list()
            for fs, fn in zip(files_speech, files_noise):
                source = F.load_audio(fs, duration=None,
                                      random_offset=False,
                                      device=args.device_id)
                noise = F.load_audio(fn, duration=None,
                                     random_offset=False,
                                     device=args.device_id)
                # trim both signals to the shorter one before mixing
                min_length = min(len(source), len(noise))
                stft_frames = ceil(min_length / C.hop_size)
                source = source[:min_length]
                noise = noise[:min_length]
                (x, s, n) = F.mix_signals(source, noise, snr_db=snr_db)
                (X, X_mag) = F.stft(x)
                X = X.permute(1, 0, 2)[:stft_frames]       # (seq_len, num_features, channel)
                X_mag = X_mag.permute(1, 0)[:stft_frames]  # (seq_len, num_features)
                # add a fake batch axis to everything the network/metrics see
                X = torch.unsqueeze(X, dim=0)
                X_mag = torch.unsqueeze(X_mag, dim=0)
                s = torch.unsqueeze(s, dim=0)
                x = torch.unsqueeze(x, dim=0)
                # SI-SDR of the raw mixture (baseline for the improvement score)
                actual_sisdr = float(F.calculate_sisdr(s, x).item())
                # feed-forward
                M_hat = network(X_mag)
                s_hat = F.istft(X, mask=M_hat)
                sisdr_scores.append(
                    F.calculate_sisdr(s, s_hat, offset=actual_sisdr).mean().item())
                durations.append(min_length)
            # weight each mixture's score by its duration
            return np.average(sisdr_scores, weights=durations)

        if args.latent_space == 'gender':
            te_sisdr = {str(k): 0 for k in C.gender_all}
            for te_gender in C.gender_all:
                files_speech = np.random.choice(
                    F.filter_by_gender(te_utterances, te_gender),
                    size=C.te_batch_size)
                files_noise = np.random.choice(te_noises, size=C.te_batch_size)
                # gender clusters draw mixtures across the full SNR set
                te_sisdr[str(te_gender)] = _weighted_batch_sisdr(
                    files_speech, files_noise, C.snr_all)
        elif args.latent_space == 'snr':
            te_sisdr = {str(k): 0 for k in C.snr_all}
            for te_snr in C.snr_all:
                files_speech = np.random.choice(te_utterances,
                                                size=C.te_batch_size)
                files_noise = np.random.choice(te_noises, size=C.te_batch_size)
                te_sisdr[str(te_snr)] = _weighted_batch_sisdr(
                    files_speech, files_noise, te_snr)
        else:
            # previously an unrecognized latent space fell through to a
            # NameError on te_sisdr; fail loudly and explicitly instead
            raise ValueError(
                'unsupported latent space: {!r}'.format(args.latent_space))

        te_sisdr['mean'] = np.mean(list(te_sisdr.values()))
        logging.info(json.dumps(te_sisdr, sort_keys=True, indent=4))
        F.write_data(filename=os.path.join(output_directory, 'test_results.txt'),
                     data=te_sisdr)
        return
def evaluation():
    """Evaluate the gating + specialist two-stage denoising system.

    Loads a gating (gender classifier) network and one specialist
    denoiser per gender from checkpoints, then runs two phases:

    1. validation: per-gender pre-built batches, reporting SDR, SI-SDR,
       MSE, BCE, and gating accuracy;
    2. testing: per-gender random test mixtures, reporting the same
       metrics as duration-weighted averages.

    Both phases pick ONE specialist per batch (the gating network's
    summed-probability argmax) and apply it to the whole batch.
    Tables are written to ``validation_results.txt`` and
    ``test_results.txt`` under ``output_directory``.
    """
    with torch.no_grad():
        sum_num_params = 0
        #
        # initialize gating network
        #
        gating = GatingNetwork(hidden_size_gating, num_layers_gating,
                               len(gender_all)).to(device=args.device_id)
        gating.load_state_dict(torch.load(
            args.state_dict_file_gating,
            map_location=torch.device(args.device_id))
        )
        gating.eval()
        sum_num_params += F.count_parameters(gating)
        #
        # initialize specialist networks (as a hashed list of networks)
        #
        specialists = {
            i: SpecialistNetwork(hidden_size_specialist,
                                 num_layers_specialist).to(device=args.device_id)
            for i in range(len(gender_all))
        }
        for i in range(len(gender_all)):
            # the provided checkpoint path must contain a 'gender_M' or
            # 'gender_F' token, which is rewritten per specialist below
            assert (re.search(r'gender\_[MF]',
                              args.state_dict_file_specialist) is not None)
            filepath = re.sub(r'gender\_[MF]',
                              F.fmt_gender(gender_all[i]),
                              args.state_dict_file_specialist)
            specialists[i].load_state_dict(torch.load(
                filepath,
                map_location=torch.device(args.device_id))
            )
            specialists[i].eval()
            sum_num_params += F.count_parameters(specialists[i])
        # total parameter count across gating + all specialists
        F.write_data(filename=os.path.join(output_directory,
                                           'num_parameters.txt'),
                     data=sum_num_params)
        with torch.cuda.device(args.device_id):
            torch.cuda.empty_cache()
        #
        # log experiment configuration
        #
        logging.info('All results will be stored in "{}".'.format(
            output_directory))
        logging.info('Testing {} model (with Gating architecture {} and Specialist architecture {}) to denoise {} gendered mixtures...'.format(
            model_name, architecture_gating, architecture_specialist,
            gender_all))
        logging.info('Using GPU device {}...'.format(
            args.device_id))
        # column headers shared by both result tables
        fields = ['snr_val','num_mixtures','sdr','sisdr','mse','bce','accuracy']
        #
        # validation
        #
        results_validation = []
        np.random.seed(0)
        torch.manual_seed(0)
        for gender_val in gender_all:
            # construct a batch
            batch = F.generate_batch(
                vl_utterances,
                vl_noises,
                batch_size=vl_batch_size,
                gender=gender_val,
                device=args.device_id,
            )
            Y = batch.index_gender
            # compute batch-wise specialist probabilities
            Y_hat = gating(batch.X_mag)
            # pick the best specialist to apply to the whole batch (based
            # on batch probabilities sum)
            k = int(Y_hat.sum(dim=0).argmax().item())
            # apply the best specialist to the entire batch
            M_hat = specialists[k](batch.X_mag)
            s_hat = F.istft(batch.X, mask=M_hat)
            # row layout matches `fields` above
            results_validation.append([
                gender_val,
                vl_batch_size,
                float(F.calculate_sdr(batch.s, s_hat,
                                      offset=batch.actual_sdr).mean().item()),
                float(F.calculate_sisdr(batch.s, s_hat,
                                        offset=batch.actual_sisdr).mean().item()),
                float(F.calculate_mse(batch.M, M_hat).item()),
                float(F.calculate_bce(batch.M, M_hat).item()),
                float(F.calculate_accuracy(Y, Y_hat)),
            ])
            # ANSI escape codes highlight the SI-SDR figure in yellow
            status = (
                f'Validation Data (Gender={gender_val}) -- ' +
                f'SDR: {results_validation[-1][2]:>6.3f} dB, ' +
                f'\033[33mSISDR: {results_validation[-1][3]:>6.3f} dB\033[39m, ' +
                f'MSE: {results_validation[-1][4]:>6.3f}, ' +
                f'BCE: {results_validation[-1][5]:>6.3f}, ' +
                f'Accuracy: {results_validation[-1][6]:>6.3f}'
            )
            logging.info(status)
        F.write_table(filename=os.path.join(output_directory,
                                            f'validation_results.txt'),
                      table_data=results_validation, headers=fields)
        #
        # testing
        #
        results_testing = []
        for gender_val in gender_all:
            # reseed per gender so every gender sees the same noise draw
            np.random.seed(0)
            torch.manual_seed(0)
            # NOTE(review): `in gender_val` is a membership test, not
            # equality -- equivalent only if genders are single characters;
            # confirm against F.get_gender's return values.
            te_utterances_filtered = te_utterances[np.array([(F.get_gender(row) in gender_val) for row in te_utterances])]
            files_speech = np.random.choice(te_utterances_filtered,
                                            size=te_batch_size)
            files_noise = np.random.choice(te_noises, size=te_batch_size)
            te_m_durations = list()
            te_m_sdr = list()
            te_m_sisdr = list()
            te_m_mse = list()
            te_m_bce = list()
            te_m_accuracy = list()
            for (i, fs, fn) in zip(range(te_batch_size), files_speech,
                                   files_noise):
                source = F.load_audio(fs, duration=None,
                                      random_offset=False,
                                      device=args.device_id)
                noise = F.load_audio(fn, duration=None,
                                     random_offset=False,
                                     device=args.device_id)
                # trim both signals to the shorter one before mixing
                min_length = min(len(source), len(noise))
                stft_frames = ceil(min_length/hop_size)
                source = source[:min_length]
                noise = noise[:min_length]
                (x, s, n) = F.mix_signals(source, noise, snr_db=snr_all)
                (S, S_mag) = F.stft(s)
                (N, N_mag) = F.stft(n)
                (X, X_mag) = F.stft(x)
                # ideal mask target from clean/noise magnitudes
                (M) = F.calculate_masking_target(S_mag, N_mag)
                X = X.permute(1, 0, 2)[:stft_frames]       # (seq_len, num_features, channel)
                S_mag = S_mag.permute(1, 0)[:stft_frames]  # (seq_len, num_features)
                N_mag = N_mag.permute(1, 0)[:stft_frames]  # (seq_len, num_features)
                X_mag = X_mag.permute(1, 0)[:stft_frames]  # (seq_len, num_features)
                M = M.permute(1, 0)[:stft_frames]          # (seq_len, num_features)
                # mixture baselines for the improvement scores below
                actual_sdr = float(F.calculate_sdr(s, x).item())
                actual_sisdr = float(F.calculate_sisdr(s, x).item())
                # one-hot gender target; assumes index 1 means female
                # (i.e. gender_all orders males first) -- TODO confirm
                gender_index = int(F.get_gender(fs)=='F')
                Y = torch.zeros(1, len(gender_all), device=args.device_id)
                Y[..., gender_index] = 1
                # add a fake batch axis to everything
                x = torch.unsqueeze(x, dim=0)
                s = torch.unsqueeze(s, dim=0)
                n = torch.unsqueeze(n, dim=0)
                S = torch.unsqueeze(S, dim=0)
                S_mag = torch.unsqueeze(S_mag, dim=0)
                N = torch.unsqueeze(N, dim=0)
                N_mag = torch.unsqueeze(N_mag, dim=0)
                X = torch.unsqueeze(X, dim=0)
                X_mag = torch.unsqueeze(X_mag, dim=0)
                M = torch.unsqueeze(M, dim=0)
                # compute batch-wise specialist probabilities
                Y_hat = gating(X_mag)
                # pick the best specialist to apply to the whole batch
                # (based on batch probabilities sum)
                k = int(Y_hat.sum(dim=0).argmax().item())
                # apply the best specialist to the entire batch
                M_hat = specialists[k](X_mag)
                s_hat = F.istft(X, mask=M_hat)
                te_m_sdr.append(F.calculate_sdr(s, s_hat,
                                                offset=actual_sdr).mean().item())
                te_m_sisdr.append(F.calculate_sisdr(s, s_hat,
                                                    offset=actual_sisdr).mean().item())
                te_m_mse.append(F.calculate_mse(M, M_hat).item())
                te_m_bce.append(F.calculate_bce(M, M_hat).item())
                # exact-match accuracy of the rounded gating output
                te_m_accuracy.append(float(torch.prod(Y==torch.round(Y_hat), dim=-1).sum().item()/float(len(Y))))
                te_m_durations.append(min_length)
            # store the weighted average results
            results_testing.append([
                gender_val,
                te_batch_size,
                np.average(te_m_sdr, weights=te_m_durations),
                np.average(te_m_sisdr, weights=te_m_durations),
                np.average(te_m_mse, weights=te_m_durations),
                np.average(te_m_bce, weights=te_m_durations),
                np.average(te_m_accuracy, weights=te_m_durations),
            ])
            status = (
                f'Test Data (Gender={gender_val}) -- ' +
                f'SDR: {results_testing[-1][2]:>6.3f} dB, ' +
                f'\033[33mSISDR: {results_testing[-1][3]:>6.3f} dB\033[39m, ' +
                f'MSE: {results_testing[-1][4]:>6.3f}, ' +
                f'BCE: {results_testing[-1][5]:>6.3f}, ' +
                f'Accuracy: {results_testing[-1][6]:>6.3f}'
            )
            logging.info(status)
        F.write_table(filename=os.path.join(output_directory,
                                            f'test_results.txt'),
                      table_data=results_testing, headers=fields)
        return
def evaluation():
    """Evaluate the pre-trained GatingNetwork cluster classifier.

    For each latent-space cluster (gender or nominal mixture SNR,
    selected by ``args.latent_space``), builds test mixtures, runs the
    gating network on the batched magnitude spectrogram, and records the
    duration-weighted classification accuracy against a one-hot target.
    Results are logged as JSON and written to ``test_results.txt``.
    """
    with torch.no_grad():
        #
        # initialize network (fixed seeds so test-file selection is reproducible)
        #
        np.random.seed(0)
        torch.manual_seed(0)
        network = GatingNetwork(
            args.hidden_size,
            args.num_layers,
            args.num_clusters,
        ).to(device=args.device_id)
        network_params = F.count_parameters(network)
        # NOTE(review): strict=False silently ignores missing/unexpected
        # checkpoint keys -- confirm this is intentional.
        network.load_state_dict(torch.load(
            args.state_dict_file,
            map_location=torch.device(args.device_id),
        ), strict=False)
        network.eval()
        F.write_data(filename=os.path.join(output_directory, 'num_parameters.txt'),
                     data=network_params)

        with torch.cuda.device(args.device_id):
            torch.cuda.empty_cache()

        def _network_input(fs, fn, snr_db):
            """Load, trim, and mix one (speech, noise) pair.

            Returns ``(X_mag, s, x, min_length)`` where X_mag is the
            batched (1, seq_len, num_features) mixture magnitude.
            """
            source = F.load_audio(fs, duration=None, random_offset=False,
                                  device=args.device_id)
            noise = F.load_audio(fn, duration=None, random_offset=False,
                                 device=args.device_id)
            # trim both signals to the shorter one before mixing
            min_length = min(len(source), len(noise))
            stft_frames = ceil(min_length / C.hop_size)
            source = source[:min_length]
            noise = noise[:min_length]
            (x, s, n) = F.mix_signals(source, noise, snr_db=snr_db)
            (X, X_mag) = F.stft(x)
            X_mag = X_mag.permute(1, 0)[:stft_frames]  # (seq_len, num_features)
            X_mag = torch.unsqueeze(X_mag, dim=0)      # fake batch axis
            return (X_mag, s, x, min_length)

        if args.latent_space == 'gender':
            te_accuracy = {str(k): 0 for k in C.gender_all}
            for te_gender in C.gender_all:
                te_batch_durations = list()
                te_batch_accuracy = list()
                files_speech = np.random.choice(
                    F.filter_by_gender(te_utterances, te_gender),
                    size=C.te_batch_size)
                files_noise = np.random.choice(te_noises, size=C.te_batch_size)
                for (fs, fn) in zip(files_speech, files_noise):
                    # gender clusters draw mixtures across the full SNR set
                    (X_mag, s, x, min_length) = _network_input(fs, fn, C.snr_all)
                    # one-hot gender target; index 1 == female
                    Y = torch.zeros(1, len(C.gender_all), device=args.device_id)
                    gender_index = int(te_gender == 'F')
                    Y[..., gender_index] = 1
                    # forward pass
                    Y_hat = network(X_mag)
                    te_batch_accuracy.append(F.calculate_accuracy(Y, Y_hat))
                    te_batch_durations.append(min_length)
                # store the weighted average results
                te_accuracy[str(te_gender)] = np.average(
                    te_batch_accuracy, weights=te_batch_durations)
        elif args.latent_space == 'snr':
            te_accuracy = {str(k): 0 for k in C.snr_all}
            for te_snr in C.snr_all:
                te_batch_durations = list()
                te_batch_accuracy = list()
                files_speech = np.random.choice(te_utterances,
                                                size=C.te_batch_size)
                files_noise = np.random.choice(te_noises, size=C.te_batch_size)
                for (fs, fn) in zip(files_speech, files_noise):
                    (X_mag, s, x, min_length) = _network_input(fs, fn, te_snr)
                    # label = nominal SNR nearest the measured mixture SDR
                    actual_sdr = float(F.calculate_sdr(s, x).item())
                    sdr_index = int(np.abs(C.snr_all - actual_sdr).argmin())
                    Y = torch.zeros(1, len(C.snr_all), device=args.device_id)
                    Y[..., sdr_index] = 1
                    # forward pass
                    Y_hat = network(X_mag)
                    te_batch_accuracy.append(F.calculate_accuracy(Y, Y_hat))
                    te_batch_durations.append(min_length)
                # store the weighted average results
                te_accuracy[str(te_snr)] = np.average(
                    te_batch_accuracy, weights=te_batch_durations)
        else:
            # previously an unrecognized latent space fell through to a
            # NameError on te_accuracy; fail loudly and explicitly instead
            raise ValueError(
                'unsupported latent space: {!r}'.format(args.latent_space))

        te_accuracy['mean'] = np.mean(list(te_accuracy.values()))
        logging.info(json.dumps(te_accuracy, sort_keys=True, indent=4))
        F.write_data(filename=os.path.join(output_directory, 'test_results.txt'),
                     data=te_accuracy)
        return
# NOTE(review): free-standing fragment -- relies on fs, fn, network,
# te_snrs, args, and F being defined by surrounding (unseen) code.
for mixture_snr in te_snrs:
    # only evaluate the -5 dB condition; every other SNR is skipped
    if not (mixture_snr == -5):
        continue
    # mix the signals up
    source = F.load_audio(fs, device=args.device_id, random_offset=False,
                          duration=None)
    noise = F.load_audio(fn, device=args.device_id, random_offset=False,
                         duration=None)
    # trim both signals to the shorter one before mixing
    min_length = min(len(source), len(noise))
    (x, s, n) = F.mix_signals(source[:min_length], noise[:min_length],
                              snr_db=mixture_snr)
    (S, S_mag) = F.stft(s)
    (N, N_mag) = F.stft(n)
    (X, X_mag) = F.stft(x)
    # ideal mask target from clean/noise magnitudes
    (M) = F.calculate_masking_target(S_mag, N_mag)
    # reorder to time-major: (seq_len, num_features[, channel])
    X = X.permute(1, 0, 2)
    S_mag = S_mag.permute(1, 0)
    N_mag = N_mag.permute(1, 0)
    X_mag = X_mag.permute(1, 0)
    M = M.permute(1, 0)
    # mixture baselines (pre-enhancement SDR / SI-SDR)
    actual_sdr = float(F.calculate_sdr(s, x).item())
    actual_sisdr = float(F.calculate_sisdr(s, x).item())
    # inference
    # NOTE(review): this overwrites the mask TARGET M computed above with
    # the network's prediction -- presumably intended to be M_hat; confirm
    # against the continuation of this script.
    M = network(X_mag.unsqueeze(0))
def evaluation():
    """Evaluate the pre-trained DenoisingNetwork on the test set.

    Depending on ``args.latent_space`` ('gender', 'snr', or 'all'),
    reports the duration-weighted SI-SDR improvement per gender cluster
    and/or per nominal mixture SNR, plus a per-branch mean. Results are
    logged as JSON and written to ``test_results.txt`` under
    ``output_directory``.
    """
    with torch.no_grad():
        #
        # initialize network (fixed seeds so test-file selection is reproducible)
        #
        np.random.seed(0)
        torch.manual_seed(0)
        network = DenoisingNetwork(args.hidden_size,
                                   args.num_layers).to(device=args.device_id)
        network_params = F.count_parameters(network)
        network.load_state_dict(torch.load(
            args.state_dict_file,
            map_location=torch.device(args.device_id),
        ), strict=True)
        network.eval()
        F.write_data(filename=os.path.join(output_directory, 'num_parameters.txt'),
                     data=network_params)

        with torch.cuda.device(args.device_id):
            torch.cuda.empty_cache()

        def _weighted_batch_sisdr(files_speech, files_noise, snr_db):
            """Duration-weighted mean SI-SDR improvement over one test batch."""
            durations = list()
            sisdr_scores = list()
            for fs, fn in zip(files_speech, files_noise):
                source = F.load_audio(fs, duration=None,
                                      random_offset=False,
                                      device=args.device_id)
                noise = F.load_audio(fn, duration=None,
                                     random_offset=False,
                                     device=args.device_id)
                # trim both signals to the shorter one before mixing
                min_length = min(len(source), len(noise))
                stft_frames = ceil(min_length / C.hop_size)
                source = source[:min_length]
                noise = noise[:min_length]
                (x, s, n) = F.mix_signals(source, noise, snr_db=snr_db)
                (X, X_mag) = F.stft(x)
                X = X.permute(1, 0, 2)[:stft_frames]       # (seq_len, num_features, channel)
                X_mag = X_mag.permute(1, 0)[:stft_frames]  # (seq_len, num_features)
                # add a fake batch axis to everything the network/metrics see
                X = torch.unsqueeze(X, dim=0)
                X_mag = torch.unsqueeze(X_mag, dim=0)
                s = torch.unsqueeze(s, dim=0)
                x = torch.unsqueeze(x, dim=0)
                # SI-SDR of the raw mixture (baseline for the improvement score)
                actual_sisdr = float(F.calculate_sisdr(s, x).item())
                # feed-forward
                M_hat = network(X_mag)
                s_hat = F.istft(X, mask=M_hat)
                sisdr_scores.append(
                    F.calculate_sisdr(s, s_hat, offset=actual_sisdr).mean().item())
                durations.append(min_length)
            # weight each mixture's score by its duration
            return np.average(sisdr_scores, weights=durations)

        te_sisdr = dict()
        if args.latent_space in ('gender', 'all'):
            # reseed so the gender sweep is reproducible (and independent
            # of whether the SNR sweep also runs)
            np.random.seed(0)
            torch.manual_seed(0)
            for te_gender in C.gender_all:
                logging.info(
                    f'Now testing model with {te_gender}-gender inputs...')
                files_speech = np.random.choice(
                    F.filter_by_gender(te_utterances, te_gender),
                    size=C.te_batch_size)
                files_noise = np.random.choice(te_noises, size=C.te_batch_size)
                # gender clusters draw mixtures across the full SNR set
                te_sisdr[str(te_gender)] = _weighted_batch_sisdr(
                    files_speech, files_noise, C.snr_all)
            te_sisdr['mean_gender'] = np.mean(
                [te_sisdr[str(k)] for k in C.gender_all])
        if args.latent_space in ('snr', 'all'):
            np.random.seed(0)
            torch.manual_seed(0)
            for te_snr in C.snr_all:
                logging.info(
                    f'Now testing model with {te_snr} dB mixture SDR inputs...'
                )
                files_speech = np.random.choice(te_utterances,
                                                size=C.te_batch_size)
                files_noise = np.random.choice(te_noises, size=C.te_batch_size)
                te_sisdr[str(te_snr)] = _weighted_batch_sisdr(
                    files_speech, files_noise, te_snr)
            # NOTE(review): key reads 'mean_sisdr' but this is the mean over
            # SNR clusters -- kept as-is since readers of test_results.txt
            # may rely on the existing key name.
            te_sisdr['mean_sisdr'] = np.mean(
                [te_sisdr[str(k)] for k in C.snr_all])

        logging.info(json.dumps(te_sisdr, sort_keys=True, indent=4))
        F.write_data(filename=os.path.join(output_directory, 'test_results.txt'),
                     data=te_sisdr)
        return