def gev_wrapper_on_masks(mix, noise_mask=None, target_mask=None, normalization=True):
    """Beamform a multi-channel STFT mixture with a GEV beamformer.

    At least one of the two time-frequency masks must be supplied; the
    missing one is derived as the (floored) complement of the other. The
    routine estimates target and noise PSD matrices, solves for the GEV
    beamforming vector, optionally applies blind analytic normalization,
    and prints per-stage timings.

    Parameters
    ----------
    mix : ndarray
        Multi-channel mixture spectrogram; transposed internally to the
        layout the PSD helpers expect, and the result is transposed back.
    noise_mask, target_mask : ndarray or None
        Time-frequency masks for noise / target (same layout as ``mix``).
    normalization : bool
        Apply blind analytic normalization to the beamforming vector.

    Returns
    -------
    ndarray
        Beamformed single-channel spectrogram in the caller's layout.

    Raises
    ------
    ValueError
        If both masks are ``None``.
    """
    if noise_mask is None and target_mask is None:
        raise ValueError('At least one mask needs to be present.')

    # Work in the transposed layout expected by the PSD helpers.
    mix = mix.T
    noise_mask = None if noise_mask is None else noise_mask.T
    target_mask = None if target_mask is None else target_mask.T

    # A missing mask is the complement of the provided one; the clip keeps
    # a small floor so PSD estimates never collapse to exactly zero.
    if target_mask is None:
        target_mask = np.clip(1 - noise_mask, 1e-6, 1)
    elif noise_mask is None:
        noise_mask = np.clip(1 - target_mask, 1e-6, 1)

    timings = {'psd': 0, 'vector': 0, 'beam': 0}

    # Stage 1: spatial covariance (PSD) matrices for target and noise.
    with Timer() as stopwatch:
        target_psd_matrix = get_power_spectral_density_matrix(mix, target_mask)
        noise_psd_matrix = get_power_spectral_density_matrix(mix, noise_mask)
    timings['psd'] += stopwatch.msecs

    # Stage 2: GEV beamforming vector.
    with Timer() as stopwatch:
        W_gev = get_gev_vector(target_psd_matrix, noise_psd_matrix)
    timings['vector'] += stopwatch.msecs

    # Stage 3: optional BAN post-filter, then apply the beamformer.
    with Timer() as stopwatch:
        if normalization:
            W_gev = blind_analytic_normalization(W_gev, noise_psd_matrix)
        output = apply_beamforming_vector(W_gev, mix)
    timings['beam'] += stopwatch.msecs

    total = timings['psd'] + timings['vector'] + timings['beam']
    print(
        'Timings: PSD: {:.2f}s | Vector: {:.2f}s | Beam: {:.2f}s | Total: {:.2f}s'
        .format(timings['psd'] / 1000, timings['vector'] / 1000,
                timings['beam'] / 1000, total / 1000))

    return output.T
def single_normal():
    """Run the full enhancement pipeline on a single recording.

    Loads a 2-channel recording (directory taken from the module-level
    ``args``), estimates noise/target masks with the module-level ``model``,
    GEV-beamforms the mixture, and writes both the extracted noise and the
    enhanced signal to hard-coded output paths. Prints I/O, network and
    beamforming timings.

    NOTE(review): depends on module-level globals ``args`` and ``model``
    being initialized before the call — confirm against the script's setup.
    """
    # Per-stage timing accumulators (milliseconds from Timer).
    t_io = 0
    t_net = 0
    t_beamform = 0
    # --- I/O: load the multi-channel audio -------------------------------
    with Timer() as t:
        audio_data = get_audio_nochime(args.data_directory, ch_range=range(1, 3), fs=16000)
        context_samples = 0
        print("audio_data: ", audio_data.shape, end="\n")
    t_io += t.msecs
    # STFT of all channels; transposed so the channel axis is second
    # (presumably (frames, channels, freq) — TODO confirm stft layout).
    Y = stft(audio_data, time_dim=1).transpose((1, 0, 2))
    # Unit-magnitude phase term of the mixture spectrogram.
    Y_phase = np.divide(Y, abs(Y))
    print("Y: ", Y.shape, "Y_phase: ", Y_phase.shape, end="\n")
    # Magnitude spectrogram as network input, wrapped as a chainer Variable.
    Y_var = Variable(np.abs(Y).astype(np.float32))
    # --- network: per-channel noise/target mask estimation ---------------
    with Timer() as t:
        N_masks, X_masks = model.calc_masks(Y_var)
        print("N_masks: ", N_masks.shape, end="\n")
        N_masks.to_cpu()
        X_masks.to_cpu()
    t_net += t.msecs
    # --- beamforming -----------------------------------------------------
    with Timer() as t:
        # Collapse the channel axis of each mask with a median.
        N_mask = np.median(N_masks.data, axis=1)
        X_mask = np.median(X_masks.data, axis=1)
        # Masked noise estimate: apply the noise mask to the mixture,
        # re-apply the mixture phase, then median-pool over channels.
        Noise = np.multiply(N_masks.data, Y)
        Noise = np.multiply(Noise, Y_phase)
        Noise = np.median(Noise, axis=1)
        # GEV beamformer on the channel-pooled masks.
        Y_hat = gev_wrapper_on_masks(Y, N_mask, X_mask)
    t_beamform += t.msecs
    # --- I/O: write the noise estimate and the enhanced signal -----------
    with Timer() as t:
        audiowrite(
            istft(Noise)[context_samples:],
            "/media/hipo/lento/workspace/BeamSaber/tools/enhancement/gev/PublicFOMLSA/sample/{}_noise.wav"
            .format(args.exNum), 16000, True, True)
        audiowrite(
            istft(Y_hat)[context_samples:],
            "/media/hipo/lento/workspace/BeamSaber/tools/enhancement/gev/PublicFOMLSA/sample/{}_gev.wav"
            .format(args.exNum), 16000, True, True)
    t_io += t.msecs
    print(
        'Timings: I/O: {:.2f}s | Net: {:.2f}s | Beamformer: {:.2f}s | Total: {:.2f}s'
        .format(t_io / 1000, t_net / 1000, t_beamform / 1000,
                ((t_io + t_net + t_beamform) / 1000)))
flist = gen_flist_real(args.chime_dir, stage) else: raise ValueError('Unknown flist {}'.format(args.flist)) for env in ['caf', 'bus', 'str', 'ped']: for beamformer in beamformers: mkdir_p(os.path.join(args.output_dir, beamformer,'{}05_{}_{}'.format( stage, env, scenario ))) t_io = 0 t_net = 0 t_beamform = 0 # Beamform loop for cur_line in tqdm(flist): with Timer() as t: if scenario == 'simu': audio_data = get_audio_data(cur_line) context_samples = 0 elif scenario == 'real': audio_data, context_samples = get_audio_data_with_context( cur_line[0], cur_line[1], cur_line[2]) t_io += t.msecs Y = stft(audio_data, time_dim=1).transpose((1, 0, 2)) Y_var = Variable(np.abs(Y).astype(np.float32)) if args.gpu >= 0: Y_var.to_gpu(args.gpu) with Timer() as t: N_masks, X_masks = model.calc_masks(Y_var) N_masks.to_cpu() X_masks.to_cpu()