def apply_beamfomer(args):
    # Build the mask-estimation network and load the trained model.
    estimator = MaskEstimator(num_bins)
    mask_computer = MaskComputer(estimator, args.model)
    # Name the dump directory after the file list (e.g. 'dev.flist' -> 'dev').
    flist_name = os.path.basename(args.flist)
    sub_dir = flist_name.split('.')[0]
    dumps_dir = os.path.join(args.dumps_dir, sub_dir)
    # Select the beamformer: MVDR by default, GEV when --gev is set.
    func_bf = mvdr_wrapper_on_masks if not args.gev else \
        gev_wrapper_on_masks
    if not os.path.exists(dumps_dir):
        os.makedirs(dumps_dir)
    with open(args.flist, 'r') as fd:
        flist = fd.readlines()
    for line in flist:
        line = line.strip()
        tokens = line.split('/')
        # STFT of the multi-channel input: (frames, channels, bins).
        noisy_samples = load_multichannel_data(line)
        noisy_specs = stft(noisy_samples, time_dim=1).transpose((1, 0, 2))
        mask_n, mask_x = mask_computer.compute_masks(
            np.abs(noisy_specs).astype(np.float32))
        # Pool the per-channel masks with a median over the channel axis.
        mask_n = np.median(mask_n, axis=1)
        mask_x = np.median(mask_x, axis=1)
        clean_specs = func_bf(noisy_specs, mask_n, mask_x)
        clean_samples = istft(clean_specs)
        print('dumps to {}/{}.wav'.format(dumps_dir, tokens[-1]))
        audiowrite(clean_samples,
                   '{}/{}.wav'.format(dumps_dir, tokens[-1]),
                   16000, True, True)
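# A minimal sketch of what a mask-driven GEV beamformer such as
# gev_wrapper_on_masks computes, assuming `specs` of shape
# (frames, channels, bins) and masks of shape (frames, bins). This is an
# illustration of the technique, not this project's implementation.
import numpy as np
from scipy.linalg import eigh


def gev_beamform_sketch(specs, mask_n, mask_x):
    frames, channels, bins = specs.shape
    out = np.zeros((frames, bins), dtype=specs.dtype)
    eye = np.eye(channels)
    for f in range(bins):
        Y = specs[:, :, f]  # (frames, channels)
        # Mask-weighted spatial covariances: Phi = sum_t m(t) y(t) y(t)^H.
        phi_nn = (mask_n[:, f, None] * Y).T @ Y.conj()
        phi_xx = (mask_x[:, f, None] * Y).T @ Y.conj()
        # Diagonal loading keeps the noise covariance invertible.
        phi_nn += 1e-10 * eye
        # The GEV filter is the principal generalized eigenvector; it
        # maximizes the output SNR  w^H Phi_xx w / w^H Phi_nn w.
        _, vecs = eigh(phi_xx, phi_nn)
        w = vecs[:, -1]
        out[:, f] = Y @ w.conj()
    return out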
def audio_manipulation(self):
    audio_file = audioread('new_dataset/chime_ex.wav', sample_rate=16000)
    babble_file = audioread('new_dataset/babble_16.wav', sample_rate=16000)
    print("len chime: ", audio_file.shape)
    print("len babble: ", babble_file.shape)
    audio_shape = audio_file.shape[0]
    babble_shape = babble_file.shape[0]
    split = babble_shape // audio_shape
    # Cut the long babble recording into six chime-length segments and
    # write each out as a separate channel file.
    start = 0
    end = audio_shape
    for i in range(1, 7):
        print("start = ", start, "end = ", end)
        y = babble_file[start:end]
        # Slices exclude `end`, so the next segment starts exactly at `end`
        # (the original `end + 1` silently dropped one sample per segment).
        start = end
        end = end + audio_shape
        audiowrite(y, "new_dataset/babble_noise/babble.CH{}.wav".format(i))
    print("split into: ", split, "babble shape: ", babble_file.shape,
          "y: ", sys.getsizeof(y))
    # STFTs of the chime example and of the last babble segment.
    audio_stft = stft(audio_file)
    babble_stft = stft(y)
    print(audio_stft.shape)
    print(babble_stft.shape)
def prepare_custom_audio(noise_data, chime_data):
    print("new shape: ", chime_data.shape)
    # Cut the noise recording into six segments, each as long as the chime
    # signal, and write them out as channels CH1..CH6.
    start = 0
    end = chime_data.shape[0]
    for i in range(1, 7):
        y = noise_data[start:end]
        print("start: ", start, "end: ", end)
        start = end
        end = end + chime_data.shape[0]
        audiowrite(y, "new_dataset/babble_noise/babble.CH{}.wav".format(i))
        sleep(0.01)
    print("last_shape: ", chime_data.shape)
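# A compact alternative sketch for prepare_custom_audio's segmentation,
# assuming `noise_data` is a 1-D numpy array at least n_channels times the
# segment length: trim and reshape instead of slicing in a loop.
# `split_noise_sketch` is a hypothetical helper, not part of this code base.
import numpy as np


def split_noise_sketch(noise_data, segment_len, n_channels=6):
    usable = noise_data[:segment_len * n_channels]
    # One row per channel: shape (n_channels, segment_len).
    return usable.reshape(n_channels, segment_len)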
def apply_beamfomer(args):
    # Single-file variant: beamform the one utterance given in args.flist.
    estimator = MaskEstimator(num_bins)
    mask_computer = MaskComputer(estimator, args.model)
    # Select the beamformer: MVDR by default, GEV when --gev is set.
    func_bf = mvdr_wrapper_on_masks if not args.gev else \
        gev_wrapper_on_masks
    f = args.flist.strip()
    tokens = f.split('/')
    noisy_samples = load_multichannel_data(args.flist)
    noisy_specs = stft(noisy_samples, time_dim=1).transpose((1, 0, 2))
    mask_n, mask_x = mask_computer.compute_masks(
        np.abs(noisy_specs).astype(np.float32))
    mask_n = np.median(mask_n, axis=1)
    mask_x = np.median(mask_x, axis=1)
    clean_specs = func_bf(noisy_specs, mask_n, mask_x)
    clean_samples = istft(clean_specs)
    audiowrite(clean_samples,
               '{}/{}'.format(args.dump, tokens[-1]),
               16000, True, True)
def write_wav(magnitude, phase, filename, exponentiate=True,
              griffin_lim=False):
    # Magnitudes are assumed log-compressed unless told otherwise.
    if exponentiate:
        magnitude = np.exp(magnitude)
    complex_spec = magnitude * np.exp(1j * phase)
    kwargs = {'size': 512, 'shift': 64, 'window_length': 512}
    resynth = istft(complex_spec, **kwargs)
    if griffin_lim:
        # Griffin-Lim: alternate between the given magnitude and the phase
        # of the current resynthesis for a few iterations.
        for i in range(10):
            complex_spec = magnitude * np.exp(
                1j * np.angle(stft(resynth, **kwargs)))
            resynth = istft(complex_spec, **kwargs)
    audiowrite(resynth, filename)
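# Hedged usage sketch for write_wav, assuming `log_mag` and `mix_phase` are
# (frames, bins) arrays from a separation model and the mixture STFT; the
# variable and file names here are illustrative only.
#
#   write_wav(log_mag, mix_phase, 'resynth_gl.wav', exponentiate=True,
#             griffin_lim=True)
#
# With griffin_lim=True, the ten extra STFT/iSTFT round trips refine the
# borrowed mixture phase toward one consistent with the given magnitude.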
def audio_joiner(path):
    # Mix every file under `path` into one signal by sample-wise summation,
    # extending the running mix whenever a file is longer than it.
    chime_data_dir = path
    print(path)
    flist = [
        f for f in listdir(chime_data_dir)
        if isfile(join(chime_data_dir, f))
    ]
    y = np.zeros(0)
    for item in flist:
        audio_file = audioread('{}/{}'.format(path, item), sample_rate=16000)
        print(item)
        if len(audio_file) < len(y):
            c = y.copy()
            c[:len(audio_file)] += audio_file
        else:
            c = audio_file.copy()
            c[:len(y)] += y
        # Keep the running mix (the original never updated y, so only the
        # last file ended up in the output).
        y = c
    audiowrite(y, '/media/hipo/lento/Dataset/LibriSpeech/test/com.flac',
               samplerate=16000)
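# A minimal sketch, assuming 1-D float arrays: peak-normalizing the summed
# mix before audiowrite prevents clipping when many files overlap.
# `normalize_mix` is a hypothetical helper, not part of audio_joiner.
import numpy as np


def normalize_mix(mix, peak=0.95):
    m = np.max(np.abs(mix))
    return mix if m == 0 else mix * (peak / m)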
def single_normal():
    t_io = 0
    t_net = 0
    t_beamform = 0
    # Time the multi-channel read.
    with Timer() as t:
        audio_data = get_audio_nochime(args.data_directory,
                                       ch_range=range(1, 3), fs=16000)
        context_samples = 0
        print("audio_data: ", audio_data.shape)
    t_io += t.msecs

    # STFT of the input, (frames, channels, bins); keep the unit phase so
    # the masked noise estimate can be resynthesized later (see the
    # unit_phase() sketch below for a division-safe variant).
    Y = stft(audio_data, time_dim=1).transpose((1, 0, 2))
    Y_phase = np.divide(Y, abs(Y))
    print("Y: ", Y.shape, "Y_phase: ", Y_phase.shape)
    # Wrapping in a chainer Variable; a plain array behaves the same here.
    Y_var = Variable(np.abs(Y).astype(np.float32))

    # Mask estimation with the trained model.
    with Timer() as t:
        N_masks, X_masks = model.calc_masks(Y_var)
        print("N_masks: ", N_masks.shape)
        N_masks.to_cpu()
        X_masks.to_cpu()
    t_net += t.msecs

    with Timer() as t:
        # Median-pool the masks over the channel axis.
        N_mask = np.median(N_masks.data, axis=1)
        X_mask = np.median(X_masks.data, axis=1)
        # Noise estimate: mask times magnitude, with the mixture phase
        # applied once (the original multiplied the complex Y by its phase
        # a second time).
        Noise = np.multiply(N_masks.data, np.abs(Y))
        Noise = np.multiply(Noise, Y_phase)
        Noise = np.median(Noise, axis=1)
        Y_hat = gev_wrapper_on_masks(Y, N_mask, X_mask)
    t_beamform += t.msecs

    with Timer() as t:
        audiowrite(
            istft(Noise)[context_samples:],
            "/media/hipo/lento/workspace/BeamSaber/tools/enhancement/gev/PublicFOMLSA/sample/{}_noise.wav"
            .format(args.exNum), 16000, True, True)
        audiowrite(
            istft(Y_hat)[context_samples:],
            "/media/hipo/lento/workspace/BeamSaber/tools/enhancement/gev/PublicFOMLSA/sample/{}_gev.wav"
            .format(args.exNum), 16000, True, True)
    t_io += t.msecs

    print('Timings: I/O: {:.2f}s | Net: {:.2f}s | Beamformer: {:.2f}s | '
          'Total: {:.2f}s'.format(t_io / 1000, t_net / 1000,
                                  t_beamform / 1000,
                                  (t_io + t_net + t_beamform) / 1000))
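# A small sketch, assuming a complex STFT array: a numerically safe variant
# of the Y / |Y| unit-phase term used above, guarding the empty bins where
# |Y| is zero. `unit_phase` is a hypothetical helper.
import numpy as np


def unit_phase(Y, eps=1e-12):
    return Y / np.maximum(np.abs(Y), eps)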
with Timer() as t:
    N_mask = np.median(N_masks.data, axis=1)
    X_mask = np.median(X_masks.data, axis=1)
    # Run every requested beamformer on the same masks; the wrapper returns
    # a dict mapping beamformer name -> enhanced spectrum.
    Y_hat_dicts = bf_wrapper_on_masks(Y, N_mask, X_mask,
                                      beamformers=beamformers)
t_beamform += t.msecs

# Recover speaker / utterance / environment IDs from the CHiME file name
# (see the parse_chime_id sketch below).
if scenario == 'simu':
    wsj_name = cur_line.split('/')[-1].split('_')[1]
    spk = cur_line.split('/')[-1].split('_')[0]
    env = cur_line.split('/')[-1].split('_')[-1]
elif scenario == 'real':
    wsj_name = cur_line[3]
    spk = cur_line[0].split('/')[-1].split('_')[0]
    env = cur_line[0].split('/')[-1].split('_')[-1]

for beamformer, Y_hat in Y_hat_dicts.items():
    filename = os.path.join(
        args.output_dir, beamformer,
        '{}05_{}_{}'.format(stage, env.lower(), scenario),
        '{}_{}_{}.wav'.format(spk, wsj_name, env.upper()))
    with Timer() as t:
        audiowrite(istft(Y_hat)[context_samples:], filename, 16000,
                   True, True)
    t_io += t.msecs

print('Finished')
print('Timings: I/O: {:.2f}s | Net: {:.2f}s | Beamformer: {:.2f}s'.format(
    t_io / 1000, t_net / 1000, t_beamform / 1000))
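# Hedged helper sketch: the simu/real file-name parsing above recurs across
# these scripts. A hypothetical helper like this could factor it out;
# `cur_line` is a path string for 'simu' and a list of fields (path first,
# utterance name fourth) for 'real', matching the usage above.
def parse_chime_id(cur_line, scenario):
    if scenario == 'simu':
        parts = cur_line.split('/')[-1].split('_')
        spk, wsj_name, env = parts[0], parts[1], parts[-1]
    else:  # 'real'
        wsj_name = cur_line[3]
        parts = cur_line[0].split('/')[-1].split('_')
        spk, env = parts[0], parts[-1]
    return spk, wsj_name, env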
NN_masks.to_cpu()
XX_masks.to_cpu()

with Timer() as t:
    # Second-pass masks, median-pooled over channels, then a second GEV
    # beamforming step applied to the first-pass output.
    NN_mask = np.median(NN_masks.data, axis=1)
    XX_mask = np.median(XX_masks.data, axis=1)
    print("Y: ", Y_hat.shape, "N_mask: ", NN_mask.shape,
          "X_mask: ", XX_mask.shape)
    YY_hat = gev_wrapper_on_masks(Y_hat, NN_mask, XX_mask)
# The original discarded this timer; accumulate it like the other scripts do.
t_beamform += t.msecs

with Timer() as t:
    audiowrite(
        istft(YY_hat),
        "new_dataset_result/2m_feedback_{}.wav".format(args.experiments),
        48000, True, True)
t_io += t.msecs

print('Finished')
print('Timings: I/O: {:.2f}s | Net: {:.2f}s | Beamformer: {:.2f}s | '
      'Total Time: {:.2f}s'.format(t_io / 1000, t_net / 1000,
                                   t_beamform / 1000,
                                   (t_io + t_net + t_beamform) / 1000))
X_masks.to_cpu()
t_net += t.msecs

with Timer() as t:
    N_mask = np.median(N_masks.data, axis=1)
    X_mask = np.median(X_masks.data, axis=1)
    Y_hat = gev_wrapper_on_masks(Y, N_mask, X_mask)
t_beamform += t.msecs

if scenario == 'simu':
    wsj_name = cur_line.split('/')[-1].split('_')[1]
    spk = cur_line.split('/')[-1].split('_')[0]
    env = cur_line.split('/')[-1].split('_')[-1]
elif scenario == 'real':
    wsj_name = cur_line[3]
    spk = cur_line[0].split('/')[-1].split('_')[0]
    env = cur_line[0].split('/')[-1].split('_')[-1]

filename = os.path.join(args.output_dir,
                        '{}05_{}_{}'.format(stage, env.lower(), scenario),
                        '{}_{}_{}.wav'.format(spk, wsj_name, env.upper()))
with Timer() as t:
    audiowrite(istft(Y_hat, audio_data.shape[1])[context_samples:],
               filename, 16000, True, True)
t_io += t.msecs

print('Finished')
print('Timings: I/O: {:.2f}s | Net: {:.2f}s | Beamformer: {:.2f}s'.format(
    t_io / 1000, t_net / 1000, t_beamform / 1000))
t_net += t.msecs

with Timer() as t:
    N_mask = np.median(N_masks.data, axis=1)
    X_mask = np.median(X_masks.data, axis=1)
    Y_hat = gev_wrapper_on_masks(Y, N_mask, X_mask)
t_beamform += t.msecs

if scenario == 'simu':
    wsj_name = cur_line.split('/')[-1].split('_')[1]
    spk = cur_line.split('/')[-1].split('_')[0]
    env = cur_line.split('/')[-1].split('_')[-1]
elif scenario == 'real':
    wsj_name = cur_line[3]
    spk = cur_line[0].split('/')[-1].split('_')[0]
    env = cur_line[0].split('/')[-1].split('_')[-1]

filename = os.path.join(
    args.output_dir,
    '{}05_{}_{}'.format(stage, env.lower(), scenario),
    '{}_{}_{}.wav'.format(spk, wsj_name, env.upper()))

with Timer() as t:
    audiowrite(istft(Y_hat)[context_samples:], filename, 16000, True, True)
t_io += t.msecs

print('Finished')
print('Timings: I/O: {:.2f}s | Net: {:.2f}s | Beamformer: {:.2f}s'.format(
    t_io / 1000, t_net / 1000, t_beamform / 1000))
# Y_hat = mcmf_wrapper_on_masks(Y, N_mask, X_mask, output_setup, corr_info)
t_beamform += t.msecs

# The path separator is '\' on Windows and '/' on Linux; this script splits
# on the Windows form.
if scenario == 'simu':
    wsj_name = cur_line.split('\\')[-1].split('_')[1]
    spk = cur_line.split('\\')[-1].split('_')[0]
    env = cur_line.split('\\')[-1].split('_')[-1]
elif scenario == 'real':
    wsj_name = cur_line[3]
    spk = cur_line[0].split('\\')[-1].split('_')[0]
    env = cur_line[0].split('\\')[-1].split('_')[-1]

filename = os.path.join(args.output_dir,
                        '{}05_{}_{}'.format(stage, env.lower(), scenario),
                        '{}_{}_{}.wav'.format(spk, wsj_name, env.upper()))
audiowrite(istft(Y_hat, audio_data.shape[1])[context_samples:],
           filename[:-4] + outfile_postfix + '.wav', 16000, True, True)

'''
# Alternative kept for reference: apply the mask directly to a
# delay-and-sum signal instead of beamforming.
Y_ds_hat = np.sum(Y, axis=1) * X_mask
audiowrite(istft(Y_ds_hat, audio_data.shape[1])[context_samples:],
           filename[:-4] + '_X.wav', 16000, True, True)
Y_ds_hat = np.sum(Y, axis=1) * X_mask / (X_mask + N_mask)
audiowrite(istft(Y_ds_hat, audio_data.shape[1])[context_samples:],
           filename[:-4] + '_W.wav', 16000, True, True)
'''

print('Finished')
print('Timings: I/O: {:.2f}s | Net: {:.2f}s | Beamformer: {:.2f}s'.format(
    t_io / 1000, t_net / 1000, t_beamform / 1000))
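# Portability sketch: instead of hard-coding the '\\' separator, ntpath's
# basename accepts both Windows and POSIX separators, so one call covers
# paths from either platform. `basename_any` is a hypothetical helper.
import ntpath


def basename_any(path):
    # ntpath treats '/' as an alternate separator, so 'a/b/c' and
    # 'a\\b\\c' both yield 'c'.
    return ntpath.basename(path)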
# Recover speaker / utterance / environment IDs from the file name.
if scenario == 'simu' or args.track == 2:
    wsj_name = cur_line.split('/')[-1].split('_')[1]
    spk = cur_line.split('/')[-1].split('_')[0]
    env = cur_line.split('/')[-1].split('_')[-1]
elif scenario == 'real':
    wsj_name = cur_line[3]
    spk = cur_line[0].split('/')[-1].split('_')[0]
    env = cur_line[0].split('/')[-1].split('_')[-1]

filename = os.path.join(
    args.output_dir,
    '{}05_{}_{}'.format(stage, env.lower(), scenario),
    '{}_{}_{}.wav'.format(spk, wsj_name, env.upper()))

if args.track == 1:
    # Track 1: write the first (only) channel.
    with Timer() as t:
        audiowrite(istft(Y_hat[:, 0, :])[int(context_samples):],
                   filename, 16000, True, True)
    t_io += t.msecs
elif args.single == 0:
    # The beamformer already collapsed the channel axis.
    with Timer() as t:
        audiowrite(istft(Y_hat)[int(context_samples):],
                   filename, 16000, True, True)
    t_io += t.msecs
elif args.single >= 1:
    # Write only the requested channel (1-based index).
    ch = args.single
    with Timer() as t:
        audiowrite(istft(Y_hat[:, ch - 1, :])[int(context_samples):],
                   filename, 16000, True, True)
    t_io += t.msecs