def no_separation(args):
    """Write out un-separated mixture as baseline."""
    workspace = args.workspace
    out_dir = os.path.join(workspace, "separated_wavs", "no_separation")
    pp_data.create_folder(out_dir)

    audio_dir = os.path.join(workspace, "mixed_audio", "testing")
    names = os.listdir(audio_dir)

    for na in names:
        if '.mix_0db.wav' in na:
            print(na)
            audio_path = os.path.join(audio_dir, na)
            (bg_audio, event_audio, fs) = pp_data.read_audio_stereo(audio_path)
            mixed_audio = bg_audio + event_audio

            bare_na = os.path.splitext(os.path.splitext(na)[0])[0]
            pp_data.write_audio(
                os.path.join(out_dir, bare_na + ".sep_bg.wav"), mixed_audio, fs)
            pp_data.write_audio(
                os.path.join(out_dir, bare_na + ".sep_event.wav"), mixed_audio, fs)

    print("Write out finished!")

def create_room(source_file, noise_file, dist):
    (clean, fs) = pp.read_audio(source_file)
    (noise, _) = pp.read_audio(noise_file)

    # Clear previous DNN1 inputs.
    for file in os.listdir(os.path.join("data_eval", "dnn1_in")):
        file_path = os.path.join("data_eval", "dnn1_in", file)
        os.remove(file_path)

    for n in range(len(dist)):
        mixed, noise_new, clean_new, s2nr = set_microphone_at_distance(
            clean, noise, fs, dist[n])
        # s2nr = 1 / (1 + (1 / float(snr)))
        mixed_name = "mix_%s_%s" % (str(dist[n]), os.path.basename(source_file))
        clean_name = "clean_%s_%s" % (str(dist[n]), os.path.basename(source_file))
        mixed_path = os.path.join('data_eval/dnn1_in', mixed_name)
        clean_path = os.path.join('data_eval/dnn1_in', clean_name)
        pp.write_audio(mixed_path, mixed, fs)
        # Also write the clean reference (clean_path was computed but never
        # used in the original code).
        pp.write_audio(clean_path, clean_new, fs)

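
# `set_microphone_at_distance` is defined elsewhere in this project (it is
# also used by the prepare_database functions below). A minimal sketch of
# the assumed behaviour, for reference only: attenuate the clean source with
# distance, mix in the noise, and report the resulting speech-to-noise
# ratio. The 1/dist attenuation law and the sketch's name are illustrative
# assumptions, not the repo's actual implementation.
def _sketch_set_microphone_at_distance(clean, noise, fs, dist):
    clean_new = clean / float(dist)                  # assumed 1/r attenuation
    noise_new = noise[:len(clean_new)]               # trim noise to length
    mixed = clean_new + noise_new
    snr = np.sum(clean_new ** 2) / max(np.sum(noise_new ** 2), 1e-12)
    return mixed, noise_new, clean_new, snr
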
def demo(args):
    """Inference all test data, write out recovered wavs to disk.

    Args:
      workspace: str, path of workspace.
      tr_snr: float, training SNR.
      te_snr: float, testing SNR.
      n_concat: int, number of frames to concatenate, should equal to
          n_concat in the training stage.
      iter: int, iteration of model to load.
      visualize: bool, plot enhanced spectrogram for debug.
    """
    print(args)
    workspace = args.workspace
    tr_snr = args.tr_snr
    te_snr = args.te_snr
    n_concat = args.n_concat
    iter = args.iteration

    n_window = cfg.n_window
    n_overlap = cfg.n_overlap
    fs = cfg.sample_rate
    scale = True

    # Load model.
    model_path = os.path.join(workspace, "models", "%ddb" % int(tr_snr),
                              "FullyCNN.h5")
    model = load_model(model_path)

    # Load test data.
    if args.online:
        print('recording....')
        recordfile = 'record.wav'
        my_record(recordfile, 16000, 2)
        print('recording end')
        (data, _) = pp_data.read_audio(recordfile, 16000)
    else:
        testfile = 'data_cache/test_speech/1568253725.587787.wav'
        (data, _) = pp_data.read_audio(testfile, 16000)

    mixed_complx_x = pp_data.calc_sp(data, mode='complex')
    mixed_x, mixed_phase = divide_magphase(mixed_complx_x, power=1)

    # Predict.
    pred = model.predict(mixed_x)

    # Recover enhanced wav.
    pred_sp = pred  # np.exp(pred)
    hop_size = n_window - n_overlap
    ham_win = np.sqrt(np.hanning(n_window))
    stft_reconstructed_clean = merge_magphase(pred_sp, mixed_phase)
    stft_reconstructed_clean = stft_reconstructed_clean.T
    signal_reconstructed_clean = librosa.istft(stft_reconstructed_clean,
                                               hop_length=hop_size,
                                               window=ham_win)
    signal_reconstructed_clean = signal_reconstructed_clean * 32768
    s = signal_reconstructed_clean.astype('int16')

    # Write out enhanced wav.
    # out_path = os.path.join(workspace, "enh_wavs", "test", "%ddb" % int(te_snr), "%s.enh.wav" % na)
    # pp_data.create_folder(os.path.dirname(out_path))
    pp_data.write_audio('1568253725.587787ehs.wav', s, fs)

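
# `divide_magphase` / `merge_magphase` are helpers from this project. A
# minimal sketch of the assumed behaviour (the same decomposition as
# librosa.magphase): split a complex spectrogram into a magnitude term and
# a unit-modulus phase term, and recombine them by multiplication.
def _sketch_divide_magphase(D, power=1):
    mag = np.abs(D) ** power          # magnitude (or power) spectrogram
    phase = np.exp(1j * np.angle(D))  # unit-modulus phase
    return mag, phase


def _sketch_merge_magphase(mag, phase):
    return mag * phase                # complex spectrogram again
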
def main(from_dir, to_dir, sr):
    from_paths = wav_paths(from_dir)
    for from_p in tqdm(from_paths, 'Resampling audio'):
        rel_p = PurePath(from_p).relative_to(from_dir)
        to_p = PurePath(to_dir) / rel_p  # ensure a path object even if to_dir is a str
        os.makedirs(to_p.parent, exist_ok=True)
        wav, _ = read_audio(from_p, sr)
        write_audio(to_p, wav, sr)

def DAB_generate(source_audio, out_folder, name):
    shoebox = pra.ShoeBox(
        room_dimensions,
        absorption=wall_absorption,
        fs=fs,
        max_order=15,
    )

    # Number of microphones.
    M = 4
    source_position = np.array([
        random.uniform(0, room_dimensions[0]),
        random.uniform(0, room_dimensions[1])
    ])
    distances = np.random.randint(1, 20, M)
    mic_pos = []
    for m in range(M):
        mic_distance = distances[m]
        # Random way: guess microphone positions until they fall inside the
        # room; this can take a very long time for small rooms.
        mic_m = guess_microphone(source_position, mic_distance)
        mic_pos.append(mic_m)

    # Log microphone positions and distances.
    out_mic_file = os.path.join(out_folder, 'log_%s.txt' % name)
    if os.path.exists(out_mic_file):
        os.remove(out_mic_file)
    with open(out_mic_file, 'w') as f1:
        for l in range(M):
            f1.write("%s, %f\n" % (str(mic_pos[l]), distances[l]))

    Lg_t = 0.100             # filter size in seconds
    Lg = np.ceil(Lg_t * fs)  # in samples
    fft_len = 512
    mics = pra.Beamformer(np.asarray(mic_pos).T, shoebox.fs, N=fft_len, Lg=Lg)
    shoebox.add_source(source_position, signal=source_audio)
    shoebox.add_microphone_array(mics)
    shoebox.compute_rir()
    shoebox.simulate()

    # Add noise and save each channel.
    for n in range(M):
        signal = np.asarray(shoebox.mic_array.signals[n, :], dtype=float)
        signal = pra.utilities.normalize(signal, bits=16)
        mixed_signal = add_noise(source_audio, signal)
        mixed_signal = np.array(mixed_signal, dtype=np.int16)
        mixed_file = os.path.join(out_folder, 'mix%d_%s' % (n, name))
        pp.write_audio(mixed_file, mixed_signal, fs)

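
# `guess_microphone` is defined elsewhere in this project. Judging from the
# identical inline loop in DS_generate below, it rejection-samples a random
# direction until the microphone lands inside the room. A minimal sketch
# under that assumption:
def _sketch_guess_microphone(source_position, mic_distance):
    while True:
        theta = random.uniform(0, 2 * math.pi)
        mic_position = source_position - mic_distance * np.array(
            [math.cos(theta), math.sin(theta)])
        if (0 <= mic_position[0] <= room_dimensions[0]) and \
           (0 <= mic_position[1] <= room_dimensions[1]):
            return mic_position
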
def DS_generate(source_audio, out_folder, name):
    # Create the shoebox.
    shoebox = pra.ShoeBox(
        room_dimensions,
        absorption=wall_absorption,
        fs=fs,
        max_order=15,
    )

    mic_distance = random.randint(1, 20)  # distance from source to microphone
    source_position = np.array([
        random.uniform(0, room_dimensions[0]),
        random.uniform(0, room_dimensions[1])
    ])

    # Random way: guess microphone position until it's in the room:
    # very long time for small rooms.
    mic_in_room = False
    while not mic_in_room:
        theta = random.uniform(0, 2 * math.pi)
        mic_position = source_position - mic_distance * np.array(
            [math.cos(theta), math.sin(theta)])
        print(mic_position)
        if (0 <= mic_position[0] <= room_dimensions[0]) and (
                0 <= mic_position[1] <= room_dimensions[1]):
            mic_in_room = True

    # Source and mic locations.
    shoebox.add_source(source_position, signal=source_audio)
    shoebox.add_microphone_array(
        pra.MicrophoneArray(np.array([mic_position]).T, shoebox.fs))
    shoebox.simulate()

    signal = shoebox.mic_array.signals[0, :]
    mixed_signal = add_noise(source_audio, signal)
    mixed_signal = pra.utilities.normalize(mixed_signal, bits=16)
    mixed_signal = np.array(mixed_signal, dtype=np.int16)
    pp.write_audio(os.path.join(out_folder, 'mix_%s' % name), mixed_signal, fs)

def separate(args, bgn_iter, fin_iter, interval):
    workspace = cfg.workspace
    events = cfg.events
    te_fold = cfg.te_fold
    n_events = args.n_events
    n_window = cfg.n_window
    n_overlap = cfg.n_overlap
    fs = cfg.sample_rate
    clip_duration = cfg.clip_duration
    snr = args.snr

    # Load ground truth data.
    feature_dir = os.path.join(workspace, "features", "logmel",
                               "n_events=%d" % n_events)
    yaml_dir = os.path.join(workspace, "mixed_audio", "n_events=%d" % n_events)
    (tr_x, tr_at_y, tr_sed_y, tr_na_list, te_x, te_at_y, te_sed_y,
     te_na_list) = pp_data.load_data(feature_dir=feature_dir,
                                     yaml_dir=yaml_dir,
                                     te_fold=te_fold,
                                     snr=snr,
                                     is_scale=is_scale)

    at_y = te_at_y
    sed_y = te_sed_y
    na_list = te_na_list

    # Load and average segmentation masks over iterations.
    preds_dir = os.path.join(workspace, "preds",
                             pp_data.get_filename(__file__),
                             "n_events=%d" % n_events, "fold=%d" % te_fold,
                             "snr=%d" % snr)
    at_probs_list, seg_masks_list = [], []
    for iter in xrange(bgn_iter, fin_iter, interval):
        seg_masks_path = os.path.join(preds_dir, "md%d_iters" % iter,
                                      "seg_masks.p")
        seg_masks = cPickle.load(open(seg_masks_path, 'rb'))
        seg_masks_list.append(seg_masks)
    seg_masks = np.mean(seg_masks_list, axis=0)  # (n_clips, n_classes, n_time, n_freq)
    print(seg_masks.shape)

    # (This line was commented out in the original, but audio_dir is used
    # below, so it must be defined.)
    audio_dir = os.path.join(workspace, "mixed_audio", "n_events=%d" % n_events)

    sep_dir = os.path.join(workspace, "sep_audio",
                           pp_data.get_filename(__file__),
                           "n_events=%d" % n_events, "fold=%d" % te_fold,
                           "snr=%d" % snr)
    pp_data.create_folder(sep_dir)

    ham_win = np.hamming(n_window)
    recover_scaler = np.sqrt((ham_win**2).sum())
    melW = librosa.filters.mel(sr=fs,
                               n_fft=n_window,
                               n_mels=64,
                               fmin=0.,
                               fmax=fs / 2)
    inverse_melW = get_inverse_W(melW)  # (64, 513)

    seg_stats = {}
    for e in events:
        seg_stats[e] = {
            'fvalue': [],
            'auc': [],
            'iou': [],
            'hit': [],
            'fa': [],
            'tp': [],
            'fn': [],
            'fp': []
        }

    cnt = 0
    for (i1, na) in enumerate(na_list):
        bare_na = os.path.splitext(na)[0]
        audio_path = os.path.join(audio_dir, "%s.wav" % bare_na)
        (stereo_audio, _) = pp_data.read_stereo_audio(audio_path, target_fs=fs)
        event_audio = stereo_audio[:, 0]
        noise_audio = stereo_audio[:, 1]
        mixed_audio = event_audio + noise_audio

        mixed_cmplx_sp = pp_data.calc_sp(mixed_audio, fs, ham_win, n_window,
                                         n_overlap)
        mixed_sp = np.abs(mixed_cmplx_sp)
        event_sp = np.abs(
            pp_data.calc_sp(event_audio, fs, ham_win, n_window, n_overlap))
        noise_sp = np.abs(
            pp_data.calc_sp(noise_audio, fs, ham_win, n_window, n_overlap))

        sm = seg_masks[i1]                       # (n_classes, n_time, n_freq)
        sm_upsampled = np.dot(sm, inverse_melW)  # (n_classes, n_time, 513)
        print(na)

        # Write out separated events.
        for j1 in xrange(len(events)):
            if at_y[i1][j1] == 1:
                (fvalue, auc, iou, tp, fn, fp) = fvalue_iou(sm_upsampled[j1],
                                                            event_sp,
                                                            noise_sp,
                                                            sed_y[i1, :, j1],
                                                            seg_thres,
                                                            inside_only=True)
                (hit, fa) = hit_fa(sm_upsampled[j1],
                                   event_sp,
                                   noise_sp,
                                   sed_y[i1, :, j1],
                                   seg_thres,
                                   inside_only=True)
                seg_stats[events[j1]]['fvalue'].append(fvalue)
                seg_stats[events[j1]]['auc'].append(auc)
                seg_stats[events[j1]]['iou'].append(iou)
                seg_stats[events[j1]]['hit'].append(hit)
                seg_stats[events[j1]]['fa'].append(fa)
                seg_stats[events[j1]]['tp'].append(tp)
                seg_stats[events[j1]]['fn'].append(fn)
                seg_stats[events[j1]]['fp'].append(fp)

                sep_event_sp = sm_upsampled[j1] * mixed_sp
                sep_event_s = spectrogram_to_wave.recover_wav(
                    sep_event_sp,
                    mixed_cmplx_sp,
                    n_overlap=n_overlap,
                    winfunc=np.hamming,
                    wav_len=int(fs * clip_duration))
                sep_event_s *= recover_scaler
                out_event_audio_path = os.path.join(
                    sep_dir, "%s.%s.wav" % (bare_na, events[j1]))
                pp_data.write_audio(out_event_audio_path, sep_event_s, fs)

        # Write out separated noise.
        sm_noise_upsampled = np.clip(1. - np.sum(sm_upsampled, axis=0), 0., 1.)
        sep_noise_sp = sm_noise_upsampled * mixed_sp
        sep_noise_s = spectrogram_to_wave.recover_wav(
            sep_noise_sp,
            mixed_cmplx_sp,
            n_overlap=n_overlap,
            winfunc=np.hamming,
            wav_len=int(fs * clip_duration))
        sep_noise_s *= recover_scaler
        out_noise_audio_path = os.path.join(sep_dir, "%s.noise.wav" % bare_na)
        pp_data.write_audio(out_noise_audio_path, sep_noise_s, fs)

        cnt += 1
        # if cnt == 2: break

    fvalues, aucs, ious, hits, fas, tps, fns, fps = [], [], [], [], [], [], [], []
    for e in events:
        fvalues.append(np.mean(seg_stats[e]['fvalue']))
        ious.append(np.mean(seg_stats[e]['iou']))
        aucs.append(np.mean(seg_stats[e]['auc']))
        hits.append(np.mean(seg_stats[e]['hit']))
        fas.append(np.mean(seg_stats[e]['fa']))
        tps.append(np.mean(seg_stats[e]['tp']))
        fns.append(np.mean(seg_stats[e]['fn']))
        fps.append(np.mean(seg_stats[e]['fp']))

    logging.info("%sfvalue\tauc\tiou\tHit\tFa\tHit-Fa\tTP\tFN\tFP" %
                 ("".ljust(16)))
    logging.info(
        "%s*%.3f\t*%.3f\t*%.3f\t*%.3f\t*%.3f\t*%.3f\t*%.3f\t*%.3f\t*%.3f" %
        ("*Avg. of each".ljust(16), np.mean(fvalues), np.mean(aucs),
         np.mean(ious), np.mean(hits), np.mean(fas),
         np.mean(hits) - np.mean(fas), np.mean(tps), np.mean(fns),
         np.mean(fps)))
    for i1 in xrange(len(events)):
        logging.info(
            "%s%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f" %
            (events[i1].ljust(16), fvalues[i1], aucs[i1], ious[i1], hits[i1],
             fas[i1], hits[i1] - fas[i1], tps[i1], fns[i1], fps[i1]))

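
# `get_inverse_W` maps the (64, 513) mel filterbank to a (64, 513) matrix
# that spreads mel-domain masks back onto linear-frequency bins (used via
# np.dot(mask, inverse_melW) above). A minimal sketch, assuming the inverse
# is the filterbank renormalized so each frequency column sums to one; the
# actual implementation in this repo may differ:
def _sketch_get_inverse_W(W, eps=1e-10):
    return W / np.maximum(W.sum(axis=0, keepdims=True), eps)
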
def inference_wiener(args):
    workspace = args.workspace
    iter = args.iteration
    stack_num = args.stack_num
    filename = args.filename
    mini_num = args.mini_num
    visualize = args.visualize
    cuda = args.use_cuda and torch.cuda.is_available()
    print("cuda:", cuda)

    sample_rate = cfg.sample_rate
    fft_size = cfg.fft_size
    hop_size = cfg.hop_size
    window_type = cfg.window_type

    if window_type == 'hamming':
        window = np.hamming(fft_size)
    # (Only the Hamming window is handled; any other window_type would leave
    # `window` undefined.)

    # Audio.
    audio_dir = "/vol/vssp/msos/qk/workspaces/speech_enhancement/mixed_audios/spectrogram/test/0db"
    # audio_dir = "/user/HS229/qk00006/my_code2015.5-/python/pub_speech_enhancement/mixture2clean_dnn/workspace/mixed_audios/spectrogram/test/0db"
    names = os.listdir(audio_dir)

    # Load a model for each target.
    target_type = ['speech', 'noise']
    model_dict = {}
    for e in target_type:
        n_freq = 257
        model = DNN(stack_num, n_freq)
        model_path = os.path.join(workspace, "models", filename, e,
                                  "md_%d_iters.tar" % iter)
        checkpoint = torch.load(model_path)
        model.load_state_dict(checkpoint['state_dict'])

        # Move model to GPU.
        if cuda:
            model.cuda()
        model.eval()

        model_dict[e] = model

    # Load scalar.
    scalar_path = os.path.join(workspace, "scalars", filename, "scalar.p")
    (mean_, std_) = cPickle.load(open(scalar_path, 'rb'))
    mean_ = move_data_to_gpu(mean_, cuda, volatile=True)
    std_ = move_data_to_gpu(std_, cuda, volatile=True)

    if mini_num > 0:
        n_every = len(names) / mini_num
    else:
        n_every = 1

    out_wav_dir = os.path.join(workspace, "enh_wavs", filename)
    pp_data.create_folder(out_wav_dir)

    for (cnt, name) in enumerate(names):
        if cnt % n_every == 0:
            audio_path = os.path.join(audio_dir, name)
            (audio, _) = pp_data.read_audio(audio_path, sample_rate)
            audio = pp_data.normalize(audio)
            cmplx_sp = pp_data.calc_sp(audio, fft_size, hop_size, window)
            x = np.abs(cmplx_sp)

            # Process data.
            n_pad = (stack_num - 1) / 2
            x = pp_data.pad_with_border(x, n_pad)
            x = pp_data.mat_2d_to_3d(x, stack_num, hop=1)

            # Predict the magnitudes of both targets.
            pred_dict = {}
            for e in target_type:
                pred = forward(model_dict[e], x, mean_, std_, cuda)
                pred = pred.data.cpu().numpy()
                pred_dict[e] = pred
            print(cnt, name)

            # Wiener filter: speech / (speech + noise) acts as a soft mask
            # on the mixture magnitude.
            pred_mag_sp = pred_dict['speech'] / (
                pred_dict['speech'] + pred_dict['noise']) * np.abs(cmplx_sp)

            pred_cmplx_sp = stft.real_to_complex(pred_mag_sp, cmplx_sp)
            frames = stft.istft(pred_cmplx_sp)

            cola_constant = stft.get_cola_constant(hop_size, window)
            seq = stft.overlap_add(frames, hop_size, cola_constant)
            seq = seq[0:len(audio)]

            # Write out wav.
            out_wav_path = os.path.join(out_wav_dir, name)
            pp_data.write_audio(out_wav_path, seq, sample_rate)
            print("Write out wav to: %s" % out_wav_path)

            if visualize:
                vmin = -5.
                vmax = 5.
                fig, axs = plt.subplots(3, 1, sharex=True)
                axs[0].matshow(np.log(np.abs(cmplx_sp)).T, origin='lower',
                               aspect='auto', cmap='jet')
                axs[1].matshow(np.log(np.abs(pred_dict['speech'])).T,
                               origin='lower', aspect='auto', cmap='jet')
                axs[2].matshow(np.log(np.abs(pred_dict['noise'])).T,
                               origin='lower', aspect='auto', cmap='jet')
                plt.show()

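
# `stft.real_to_complex` (above) and `real_to_complex` (in predict_folder
# below) reattach the mixture phase to an estimated magnitude spectrogram.
# A minimal sketch of the standard technique, under the assumption that this
# is what those helpers do:
def _sketch_real_to_complex(mag, reference_cmplx_sp):
    return mag * np.exp(1j * np.angle(reference_cmplx_sp))
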
def inference(args):
    workspace = args.workspace
    iter = args.iteration
    stack_num = args.stack_num
    filename = args.filename
    mini_num = args.mini_num
    visualize = args.visualize
    cuda = args.use_cuda and torch.cuda.is_available()
    print("cuda:", cuda)

    audio_type = 'speech'
    sample_rate = cfg.sample_rate
    fft_size = cfg.fft_size
    hop_size = cfg.hop_size
    window_type = cfg.window_type

    if window_type == 'hamming':
        window = np.hamming(fft_size)

    # Audio.
    audio_dir = "/vol/vssp/msos/qk/workspaces/speech_enhancement/mixed_audios/spectrogram/test/0db"
    # audio_dir = "/user/HS229/qk00006/my_code2015.5-/python/pub_speech_enhancement/mixture2clean_dnn/workspace/mixed_audios/spectrogram/test/0db"
    names = os.listdir(audio_dir)
    speech_dir = "/vol/vssp/msos/qk/workspaces/speech_enhancement/timit_wavs/subtest"

    # Load model.
    model_path = os.path.join(workspace, "models", filename, audio_type,
                              "md_%d_iters.tar" % iter)
    n_freq = 257
    model = DNN(stack_num, n_freq)
    checkpoint = torch.load(model_path)
    model.load_state_dict(checkpoint['state_dict'])

    if cuda:
        model.cuda()

    # Load scalar.
    scalar_path = os.path.join(workspace, "scalars", filename, "scalar.p")
    (mean_, std_) = cPickle.load(open(scalar_path, 'rb'))
    mean_ = move_data_to_gpu(mean_, cuda, volatile=True)
    std_ = move_data_to_gpu(std_, cuda, volatile=True)

    if mini_num > 0:
        n_every = len(names) / mini_num
    else:
        n_every = 1

    out_wav_dir = os.path.join(workspace, "enh_wavs", filename)
    pp_data.create_folder(out_wav_dir)

    dft = pp_data.DFT(fft_size, cuda)

    for (cnt, name) in enumerate(names):
        if cnt % n_every == 0:
            audio_path = os.path.join(audio_dir, name)
            (audio0, _) = pp_data.read_audio(audio_path, sample_rate)
            audio = pp_data.normalize(audio0)

            # Enframe.
            frames = stft.enframe(audio, fft_size, hop_size)

            # Process data.
            n_pad = (stack_num - 1) / 2
            x = pp_data.pad_with_border(frames, n_pad)
            x = pp_data.mat_2d_to_3d(x, stack_num, hop=1)

            pred_frames = forward(model, x, mean_, std_, cuda)
            pred_frames = pred_frames.data.cpu().numpy()

            # cola_constant = 0.5
            # seq = stft.overlap_add(pred_frames, hop_size, cola_constant)
            pred_frames *= window
            cola_constant = stft.get_cola_constant(hop_size, window)
            seq = stft.overlap_add(pred_frames, hop_size, cola_constant)
            seq = seq[0:len(audio)]

            # Write out wav.
            out_wav_path = os.path.join(out_wav_dir, name)
            pp_data.write_audio(out_wav_path, seq, sample_rate)
            print("Write out wav to: %s" % out_wav_path)

            if visualize:
                clean_audio_path = os.path.join(speech_dir,
                                                name.split('.')[0] + ".WAV")
                (clean_audio, _) = pp_data.read_audio(clean_audio_path,
                                                      sample_rate)
                clean_audio = pp_data.normalize(clean_audio)
                clean_frames = stft.enframe(clean_audio, fft_size, hop_size)

                mix_sp = np.abs(np.fft.rfft(frames * window, norm='ortho'))
                enh_sp = np.abs(np.fft.rfft(pred_frames * window, norm='ortho'))
                clean_sp = np.abs(np.fft.rfft(clean_frames * window,
                                              norm='ortho'))

                K = 10
                fig, axs = plt.subplots(K / 2, 2, sharex=True)
                for k in range(K):
                    axs[k / 2, k % 2].plot(frames[k + 100], color='y')
                    axs[k / 2, k % 2].plot(clean_frames[k + 100], color='r')
                    axs[k / 2, k % 2].plot(pred_frames[k + 100], color='b')
                plt.show()

                vmin = -5.
                vmax = 5.
                fig, axs = plt.subplots(3, 1, sharex=True)
                axs[0].matshow(np.log(np.abs(mix_sp)).T, origin='lower',
                               aspect='auto', cmap='jet', vmin=vmin, vmax=vmax)
                axs[1].matshow(np.log(np.abs(clean_sp)).T, origin='lower',
                               aspect='auto', cmap='jet', vmin=vmin, vmax=vmax)
                axs[2].matshow(np.log(np.abs(enh_sp)).T, origin='lower',
                               aspect='auto', cmap='jet', vmin=vmin, vmax=vmax)
                plt.show()

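
# `stft.get_cola_constant` / `stft.overlap_add` are this repo's synthesis
# helpers. A minimal sketch under the usual constant-overlap-add (COLA)
# assumption: when the hop-shifted windows sum to a constant C, that
# constant equals sum(window) / hop_size, and the overlap-added signal is
# divided by it.
def _sketch_get_cola_constant(hop_size, window):
    return np.sum(window) / float(hop_size)


def _sketch_overlap_add(frames, hop_size, cola_constant):
    n_frames, frame_len = frames.shape
    seq = np.zeros((n_frames - 1) * hop_size + frame_len)
    for i in range(n_frames):
        seq[i * hop_size: i * hop_size + frame_len] += frames[i]
    return seq / cola_constant
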
def test(args):
    """Inference all test data, write out recovered wavs to disk.

    Args:
      workspace: str, path of workspace.
      tr_snr: float, training SNR.
      te_snr: float, testing SNR.
      n_concat: int, number of frames to concatenate, should equal to
          n_concat in the training stage.
      iter: int, iteration of model to load.
      visualize: bool, plot enhanced spectrogram for debug.
    """
    print(args)
    workspace = args.workspace
    tr_snr = args.tr_snr
    te_snr = args.te_snr
    n_concat = args.n_concat
    iter = args.iteration

    n_window = cfg.n_window
    n_overlap = cfg.n_overlap
    fs = cfg.sample_rate
    scale = True

    # Load model.
    model_path = os.path.join(workspace, "models", "%ddb" % int(tr_snr),
                              "FullyCNN.h5")
    model = load_model(model_path)

    # Load test data.
    feat_dir = os.path.join(workspace, "features", "spectrogram", "test",
                            "%ddb" % int(te_snr))
    names = os.listdir(feat_dir)
    for (cnt, na) in enumerate(names):
        # Load feature.
        feat_path = os.path.join(feat_dir, na)
        data = pickle.load(open(feat_path, 'rb'))
        [mixed_cmplx_x, speech_cmplx_x, noise_x, alpha, na] = data
        mixed_x, mixed_phase = divide_magphase(mixed_cmplx_x, power=1)  # power=1 gives the magnitude spectrogram
        speech_x, clean_phase = divide_magphase(speech_cmplx_x, power=1)

        # Predict.
        pred = model.predict(mixed_x)
        print(cnt, na)

        # Debug plot.
        if args.visualize:
            fig, axs = plt.subplots(3, 1)
            axs[0].matshow(mixed_x.T, origin='lower', aspect='auto', cmap='jet')
            axs[1].matshow(speech_x.T, origin='lower', aspect='auto', cmap='jet')
            axs[2].matshow(pred.T, origin='lower', aspect='auto', cmap='jet')
            axs[0].set_title("%ddb mixture log spectrogram" % int(te_snr))
            axs[1].set_title("Clean speech log spectrogram")
            axs[2].set_title("Enhanced speech log spectrogram")
            for j1 in range(3):
                axs[j1].xaxis.tick_bottom()
            plt.tight_layout()
            plt.show()

        # Recover enhanced wav.
        pred_sp = pred  # np.exp(pred)
        hop_size = n_window - n_overlap
        ham_win = np.sqrt(np.hanning(n_window))
        stft_reconstructed_clean = merge_magphase(pred_sp, mixed_phase)
        stft_reconstructed_clean = stft_reconstructed_clean.T
        signal_reconstructed_clean = librosa.istft(stft_reconstructed_clean,
                                                   hop_length=hop_size,
                                                   window=ham_win)
        signal_reconstructed_clean = signal_reconstructed_clean * 32768
        s = signal_reconstructed_clean.astype('int16')

        # Write out enhanced wav.
        out_path = os.path.join(workspace, "enh_wavs", "test",
                                "%ddb" % int(te_snr), "%s.enh.wav" % na)
        pp_data.create_folder(os.path.dirname(out_path))
        pp_data.write_audio(out_path, s, fs)

def inference(args): workspace = "workspace" n_concat = 11 iter = 50000 n_window = 320 n_overlap = 160 fs = 16000 # Load model. model_path = os.path.join(workspace, "models", "crn_mixdb", "md_%diters.h5" % iter) model = load_model(model_path, custom_objects={'keras': keras}) # Load test data. feat_dir = os.path.join(workspace, "features", "spectrogram", "test", "crn_mixdb") #feat_dir = os.path.join(workspace, "features", "spectrogram", "train", "office_mixdb") names = os.listdir(feat_dir) for (cnt, na) in enumerate(names): # Load feature. feat_path = os.path.join(feat_dir, na) data = cPickle.load(open(feat_path, 'rb')) [mixed_cmplx_x, speech_x, noise_x, alpha, na] = data mixed_x = np.abs(mixed_cmplx_x) # Process data. n_pad = (n_concat - 1) #mixed_x = pad_with_border(mixed_x, n_pad) # Cut input spectrogram to 3D segments with n_concat. mixed_x_3d = pp_data.mat_2d_to_3d(mixed_x, agg_num=n_concat, hop=11) #[100, 7, 257] #mixed_x = pad_with_border(mixed_x, n_pad) #mixed_x_3d = mat_2d_to_3d(mixed_x, agg_num=n_concat, hop=1) # Predict. w, h, l = mixed_x_3d.shape pred = model.predict(mixed_x_3d) pred_sp = np.reshape(pred, [w * h, l]) mixed_cmplx_x = mixed_cmplx_x[:w * h, :] #pred_sp = pred[:, -1, :] print(cnt, na) if False: fig, axs = plt.subplots(3, 1, sharex=False) axs[0].matshow(mixed_x.T, origin='lower', aspect='auto', cmap='jet') axs[1].matshow(speech_x.T, origin='lower', aspect='auto', cmap='jet') axs[2].matshow(pred_sp.T, origin='lower', aspect='auto', cmap='jet') axs[0].set_title("%ddb mixture log spectrogram" % int(1)) axs[1].set_title("Clean speech log spectrogram") axs[2].set_title("Enhanced speech log spectrogram") for j1 in range(3): axs[j1].xaxis.tick_bottom() plt.tight_layout() plt.show() # Recover enhanced wav. #pred_sp = np.exp(pred) #pred_sp = pred s = recover_wav(pred_sp, mixed_cmplx_x, n_overlap, np.hamming) s *= np.sqrt((np.hamming(n_window)**2 ).sum()) # Scaler for compensate the amplitude # Write out enhanced wav. out_path = os.path.join(workspace, "enh_wavs", "test", "crn_mixdb", "%s.enh.wav" % na) pp_data.create_folder(os.path.dirname(out_path)) pp_data.write_audio(out_path, s, fs)
def inference(args):
    """Inference all test data, write out recovered wavs to disk.

    Args:
      workspace: str, path of workspace.
      tr_snr: float, training SNR.
      te_snr: float, testing SNR.
      n_concat: int, number of frames to concatenate, should equal to
          n_concat in the training stage.
      iter: int, iteration of model to load.
      visualize: bool, plot enhanced spectrogram for debug.
    """
    print(args)
    workspace = args.workspace
    tr_snr = args.tr_snr
    te_snr = args.te_snr
    n_concat = args.n_concat
    iter = args.iteration

    n_window = cfg.n_window
    n_overlap = cfg.n_overlap
    fs = cfg.sample_rate
    scale = True

    # Load model.
    model_path = os.path.join(workspace, "models", "%ddb" % int(tr_snr),
                              "md_%diters.h5" % iter)
    model = load_model(model_path)

    # Load scaler.
    scaler_path = os.path.join(workspace, "packed_features", "spectrogram",
                               "train", "%ddb" % int(tr_snr), "scaler.p")
    scaler = pickle.load(open(scaler_path, 'rb'))

    # Load test data.
    feat_dir = os.path.join(workspace, "features", "spectrogram", "test",
                            "%ddb" % int(te_snr))
    names = os.listdir(feat_dir)
    for (cnt, na) in enumerate(names):
        # Load feature.
        feat_path = os.path.join(feat_dir, na)
        data = cPickle.load(open(feat_path, 'rb'))
        [mixed_cmplx_x, speech_x, noise_x, alpha, na] = data
        mixed_x = np.abs(mixed_cmplx_x)

        # Process data.
        n_pad = (n_concat - 1) / 2
        mixed_x = pp_data.pad_with_border(mixed_x, n_pad)
        mixed_x = pp_data.log_sp(mixed_x)
        speech_x = pp_data.log_sp(speech_x)

        # Scale data.
        if scale:
            mixed_x = pp_data.scale_on_2d(mixed_x, scaler)
            speech_x = pp_data.scale_on_2d(speech_x, scaler)

        # Cut input spectrogram to 3D segments with n_concat.
        mixed_x_3d = pp_data.mat_2d_to_3d(mixed_x, agg_num=n_concat, hop=1)

        # Predict.
        pred = model.predict(mixed_x_3d)
        print(cnt, na)

        # Inverse scale.
        if scale:
            mixed_x = pp_data.inverse_scale_on_2d(mixed_x, scaler)
            speech_x = pp_data.inverse_scale_on_2d(speech_x, scaler)
            pred = pp_data.inverse_scale_on_2d(pred, scaler)

        # Debug plot.
        if args.visualize:
            fig, axs = plt.subplots(3, 1, sharex=False)
            axs[0].matshow(mixed_x.T, origin='lower', aspect='auto', cmap='jet')
            axs[1].matshow(speech_x.T, origin='lower', aspect='auto', cmap='jet')
            axs[2].matshow(pred.T, origin='lower', aspect='auto', cmap='jet')
            axs[0].set_title("%ddb mixture log spectrogram" % int(te_snr))
            axs[1].set_title("Clean speech log spectrogram")
            axs[2].set_title("Enhanced speech log spectrogram")
            for j1 in xrange(3):
                axs[j1].xaxis.tick_bottom()
            plt.tight_layout()
            plt.show()

        # Recover enhanced wav.
        pred_sp = np.exp(pred)
        s = recover_wav(pred_sp, mixed_cmplx_x, n_overlap, np.hamming)
        s *= np.sqrt((np.hamming(n_window)**2).sum())  # Scaler to compensate
                                                       # the amplitude change
                                                       # after spectrogram and IFFT.

        # Write out enhanced wav.
        out_path = os.path.join(workspace, "enh_wavs", "test",
                                "%ddb" % int(te_snr), "%s.enh.wav" % na)
        pp_data.create_folder(os.path.dirname(out_path))
        pp_data.write_audio(out_path, s, fs)

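
# `pp_data.pad_with_border` / `pp_data.mat_2d_to_3d` implement the frame
# stacking used above: the spectrogram is padded by repeating its first and
# last frames, then cut into overlapping segments of n_concat frames so each
# prediction sees temporal context. A minimal sketch of the assumed
# behaviour:
def _sketch_pad_with_border(x, n_pad):
    return np.concatenate([x[0:1]] * n_pad + [x] + [x[-1:]] * n_pad, axis=0)


def _sketch_mat_2d_to_3d(x, agg_num, hop):
    # (n_frames, n_freq) -> (n_segments, agg_num, n_freq)
    segs = [x[i:i + agg_num] for i in range(0, len(x) - agg_num + 1, hop)]
    return np.array(segs)
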
def prepare_database():
    (noise, _) = pp.read_audio(conf2.noise_path)
    with open(os.path.join('dnn2', 'dnn2_files_list.txt')) as f:
        dnn2_data = f.readlines()
    (model1, scaler1) = dnn1.load_dnn()

    # Generate train mean values.
    snr2_list = []
    mixed_avg = []
    clean_avg = []
    enh_avg = []
    for n in range(conf2.training_number):
        current_file = (random.choice(dnn2_data)).rstrip()
        dist = random.uniform(1, 20)
        (clean, _) = pp.read_audio(current_file)
        mixed, noise_new, clean_new, s2nr = set_microphone_at_distance(
            clean, noise, conf2.fs, dist)
        (_, enh, _) = dnn1.predict_file(current_file, model1, scaler1)
        # s2nr = 1 / (1 + (1 / float(snr)))
        snr2_list.append(s2nr)
        mixed_avg.append(np.mean(mixed))
        clean_avg.append(np.mean(clean_new))
        enh_avg.append(np.mean(enh))

        sr = ''.join(
            random.choice(string.ascii_uppercase + string.digits)
            for _ in range(5))
        path_list = current_file.split(os.sep)
        mixed_name = "mix_%s_%s_%s" % (path_list[2], sr,
                                       os.path.basename(current_file))
        clean_name = "clean_%s_%s_%s" % (path_list[2], sr,
                                         os.path.basename(current_file))
        enh_name = "enh_%s_%s_%s" % (path_list[2], sr,
                                     os.path.basename(current_file))
        if n % 10 == 0:
            print(n)
        if conf2.save_single_files and n < conf1.n_files_to_save:
            mixed_path = os.path.join(conf2.train_folder, mixed_name)
            clean_path = os.path.join(conf2.train_folder, clean_name)
            enh_path = os.path.join(conf2.train_folder, enh_name)
            pp.write_audio(mixed_path, mixed, conf2.fs)
            pp.write_audio(clean_path, clean_new, conf2.fs)
            pp.write_audio(enh_path, enh, conf2.fs)

    if len(mixed_avg) != len(enh_avg):
        raise Exception('Number of mixed and enhanced audio must be the same')
    num_tr = len(mixed_avg)

    if os.path.exists(os.path.join(conf2.train_folder, 'train_data.txt')):
        os.remove(os.path.join(conf2.train_folder, 'train_data.txt'))
    with open(os.path.join(conf2.train_folder, 'train_data.txt'), 'w') as f1:
        for line1, line2, line3 in zip(mixed_avg, clean_avg, snr2_list):
            f1.write("%s, %s, %s\n" % (line1, line2, line3))
    print(len(mixed_avg), ',', len(enh_avg))

    # Generate test mean values.
    snr2_list = []
    mixed_avg = []
    clean_avg = []
    enh_avg = []
    for n in range(conf2.test_number):
        current_file = (random.choice(dnn2_data)).rstrip()
        dist = random.uniform(1, 20)
        (clean, _) = pp.read_audio(current_file)
        mixed, noise_new, clean_new, s2nr = set_microphone_at_distance(
            clean, noise, conf2.fs, dist)
        (_, enh, _) = dnn1.predict_file(current_file, model1, scaler1)
        # s2nr = 1 / (1 + (1 / float(snr)))
        snr2_list.append(s2nr)
        mixed_avg.append(np.mean(mixed))
        clean_avg.append(np.mean(clean_new))
        enh_avg.append(np.mean(enh))

        sr = ''.join(
            random.choice(string.ascii_uppercase + string.digits)
            for _ in range(5))
        path_list = current_file.split(os.sep)
        mixed_name = "mix_%s_%s_%s" % (path_list[2], sr,
                                       os.path.basename(current_file))
        clean_name = "clean_%s_%s_%s" % (path_list[2], sr,
                                         os.path.basename(current_file))
        enh_name = "enh_%s_%s_%s" % (path_list[2], sr,
                                     os.path.basename(current_file))
        if n % 10 == 0:
            print(n)
        if conf2.save_single_files and n < conf1.n_files_to_save:
            # Note: the original wrote these test files to conf2.train_folder;
            # conf2.test_folder matches the pattern of the train section.
            mixed_path = os.path.join(conf2.test_folder, mixed_name)
            clean_path = os.path.join(conf2.test_folder, clean_name)
            enh_path = os.path.join(conf2.test_folder, enh_name)
            pp.write_audio(mixed_path, mixed, conf2.fs)
            pp.write_audio(clean_path, clean_new, conf2.fs)
            pp.write_audio(enh_path, enh, conf2.fs)

    print(len(mixed_avg), ',', len(enh_avg))
    if len(mixed_avg) != len(enh_avg):
        raise Exception('Number of mixed and enhanced audio must be the same')
    num_te = len(mixed_avg)

    if os.path.exists(os.path.join(conf2.test_folder, 'test_data.txt')):
        os.remove(os.path.join(conf2.test_folder, 'test_data.txt'))
    with open(os.path.join(conf2.test_folder, 'test_data.txt'), 'w') as f1:
        for line1, line2, line3 in zip(mixed_avg, clean_avg, snr2_list):
            f1.write("%s, %s, %s\n" % (line1, line2, line3))

    return num_tr, num_te

def inference(args):
    """Inference all test data, write out recovered wavs to disk.

    Args:
      workspace: str, path of workspace.
      tr_snr: float, training SNR.
      te_snr: float, testing SNR.
      n_concat: int, number of frames to concatenate, should equal to
          n_concat in the training stage.
      iter: int, iteration of model to load.
      visualize: bool, plot enhanced spectrogram for debug.
    """
    print(args)
    workspace = args.workspace
    tr_snr = args.tr_snr
    te_snr = args.te_snr
    n_concat = args.n_concat
    iter = args.iteration
    calc_log = args.calc_log
    model_file = args.model_file

    n_window = cfg.n_window
    n_overlap = cfg.n_overlap
    fs = cfg.sample_rate
    scale = True

    # Build model.
    n_concat = 7
    n_freq = 257
    n_hid = 2048
    lr = 1e-3

    model = Sequential()
    model.add(Flatten(input_shape=(n_concat, n_freq)))
    model.add(Dropout(0.1))
    model.add(Dense(n_hid, activation='relu'))
    model.add(Dense(n_hid, activation='relu'))
    model.add(Dense(n_hid, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dropout(0.2))
    model.add(Dense(n_hid, activation='relu'))
    model.add(Dense(n_hid, activation='relu'))
    model.add(Dense(n_hid, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(n_hid, activation='relu'))
    model.add(Dense(n_hid, activation='relu'))
    model.add(Dense(n_hid, activation='relu'))
    model.add(Dropout(0.2))
    if calc_log:
        model.add(Dense(n_freq, activation='linear'))
    else:
        model.add(Dense(n_freq, activation='relu'))
    model.summary()
    model.compile(loss='mean_absolute_error', optimizer=Adam(lr=lr))

    # Load model weights.
    if model_file == "null":
        model_path = os.path.join(workspace, "models", "%ddb" % int(tr_snr),
                                  "md_%diters.h5" % iter)
        # model = load_model(model_path)
        model.load_weights(model_path)
    else:
        model.load_weights(model_file)

    # Load scaler.
    if calc_log:
        scaler_path = os.path.join(workspace, "packed_features", "spectrogram",
                                   "train", "%ddb" % int(tr_snr), "scaler.p")
        scaler = pickle.load(open(scaler_path, 'rb'))

    # Load test data.
    feat_dir = os.path.join(workspace, "features", "spectrogram", "test",
                            "%ddb" % int(te_snr))
    names = os.listdir(feat_dir)
    for (cnt, na) in enumerate(names):
        # Load feature.
        feat_path = os.path.join(feat_dir, na)
        data = cPickle.load(open(feat_path, 'rb'))
        [mixed_cmplx_x, speech_x, noise_x, alpha, na] = data
        mixed_x = np.abs(mixed_cmplx_x)

        # Process data.
        n_pad = (n_concat - 1) / 2
        mixed_x = pp_data.pad_with_border(mixed_x, n_pad)
        if calc_log:
            mixed_x = pp_data.log_sp(mixed_x)
            # speech_x = pp_data.log_sp(speech_x)

        # Scale data.
        if calc_log:
            mixed_x = pp_data.scale_on_2d(mixed_x, scaler)
            # speech_x = pp_data.scale_on_2d(speech_x, scaler)
        else:
            mixed_x_max = np.max(mixed_x)
            print("max of tr_x:", mixed_x_max)
            mixed_x = mixed_x / mixed_x_max
            speech_x_max = np.max(speech_x)
            print("max of speech_x:", speech_x_max)
            speech_x = speech_x / speech_x_max

        # Cut input spectrogram to 3D segments with n_concat.
        mixed_x_3d = pp_data.mat_2d_to_3d(mixed_x, agg_num=n_concat, hop=1)

        # Predict.
        if False:
            print(mixed_x_3d)
        pred = model.predict(mixed_x_3d)
        print(cnt, na)
        if False:
            print("pred")
            print(pred)
            print("speech")
            print(speech_x)

        # Inverse scale.
        if calc_log:
            mixed_x = pp_data.inverse_scale_on_2d(mixed_x, scaler)
            # speech_x = pp_data.inverse_scale_on_2d(speech_x, scaler)
            pred = pp_data.inverse_scale_on_2d(pred, scaler)
        else:
            mixed_x = mixed_x * mixed_x_max
            # speech_x = speech_x * 16384
            pred = pred * mixed_x_max

        # Debug plot.
        if args.visualize:
            fig, axs = plt.subplots(3, 1, sharex=False)
            axs[0].matshow(mixed_x.T, origin='lower', aspect='auto', cmap='jet')
            # axs[1].matshow(speech_x.T, origin='lower', aspect='auto', cmap='jet')
            axs[2].matshow(pred.T, origin='lower', aspect='auto', cmap='jet')
            axs[0].set_title("%ddb mixture log spectrogram" % int(te_snr))
            axs[1].set_title("Clean speech log spectrogram")
            axs[2].set_title("Enhanced speech log spectrogram")
            for j1 in xrange(3):
                axs[j1].xaxis.tick_bottom()
            plt.tight_layout()
            plt.show()

        # Recover enhanced wav.
        if calc_log:
            pred_sp = np.exp(pred)
        else:
            # gv = 0.025
            # pred_sp = np.maximum(0, pred - gv)
            pred_sp = pred

        if False:
            pred_sp = mixed_x[3:-3]

        s = recover_wav(pred_sp, mixed_cmplx_x, n_overlap, np.hamming)
        s *= np.sqrt((np.hamming(n_window)**2).sum())  # Scaler to compensate
                                                       # the amplitude change
                                                       # after spectrogram and IFFT.

        # Write out enhanced wav.
        out_path = os.path.join(workspace, "enh_wavs", "test",
                                "%ddb" % int(te_snr), "%s.enh.wav" % na)
        pp_data.create_folder(os.path.dirname(out_path))
        pp_data.write_audio(out_path, s, fs)

        # Write out enhanced pcm (8 kHz, pcm_s16le).
        out_pcm_path = os.path.join(workspace, "enh_wavs", "test",
                                    "%ddb" % int(te_snr), "%s.enh.pcm" % na)
        cmd = ' '.join([
            "./ffmpeg -y -i ", out_path,
            " -f s16le -ar 8000 -ac 1 -acodec pcm_s16le ", out_pcm_path
        ])
        os.system(cmd)

        # Write out webrtc-denoised enhanced pcm (8 kHz, pcm_s16le).
        ns_out_pcm_path = os.path.join(workspace, "ns_enh_wavs", "test",
                                       "%ddb" % int(te_snr),
                                       "%s.ns_enh.pcm" % na)
        ns_out_wav_path = os.path.join(workspace, "ns_enh_wavs", "test",
                                       "%ddb" % int(te_snr),
                                       "%s.ns_enh.wav" % na)
        pp_data.create_folder(os.path.dirname(ns_out_pcm_path))
        cmd = ' '.join(["./ns", out_pcm_path, ns_out_pcm_path])
        os.system(cmd)
        cmd = ' '.join([
            "./ffmpeg -y -f s16le -ar 8000 -ac 1 -acodec pcm_s16le -i ",
            ns_out_pcm_path, " ", ns_out_wav_path
        ])
        os.system(cmd)
        cmd = ' '.join(["rm ", out_pcm_path])
        os.system(cmd)
        cmd = ' '.join(["rm ", ns_out_pcm_path])
        os.system(cmd)

def ibm_separation(args):
    """Ideal binary mask (IBM) source separation."""
    workspace = args.workspace
    out_dir = os.path.join(workspace, "separated_wavs", "ibm_separation")
    pp_data.create_folder(out_dir)

    audio_dir = os.path.join(workspace, "mixed_audio", "testing")
    names = os.listdir(audio_dir)

    n_window = cfg.n_window
    n_overlap = cfg.n_overlap
    fs = cfg.sample_rate
    clip_sec = cfg.clip_sec

    ham_win = np.hamming(n_window)
    recover_scaler = np.sqrt((ham_win**2).sum())

    for na in names:
        if '.mix_0db.wav' in na:
            print(na)
            bare_na = os.path.splitext(os.path.splitext(na)[0])[0]
            audio_path = os.path.join(audio_dir, na)
            (bg_audio, event_audio, fs) = pp_data.read_audio_stereo(audio_path)
            mixed_audio = bg_audio + event_audio

            [f, t, bg_spec] = signal.spectrogram(x=bg_audio,
                                                 window=ham_win,
                                                 nperseg=n_window,
                                                 noverlap=n_overlap,
                                                 detrend=False,
                                                 return_onesided=True,
                                                 scaling='density',
                                                 mode='magnitude')
            [f, t, event_spec] = signal.spectrogram(x=event_audio,
                                                    window=ham_win,
                                                    nperseg=n_window,
                                                    noverlap=n_overlap,
                                                    detrend=False,
                                                    return_onesided=True,
                                                    scaling='density',
                                                    mode='magnitude')
            [f, t, mixed_spec] = signal.spectrogram(x=mixed_audio,
                                                    window=ham_win,
                                                    nperseg=n_window,
                                                    noverlap=n_overlap,
                                                    detrend=False,
                                                    return_onesided=True,
                                                    scaling='density',
                                                    mode='complex')
            bg_spec = bg_spec.T
            event_spec = event_spec.T
            mixed_spec = mixed_spec.T

            ratio = 1.7  # ~5 dB
            event_mask = (np.sign(event_spec / (bg_spec * ratio) - 1) + 1) / 2
            bg_mask = 1. - event_mask

            bg_separated_spec = np.abs(mixed_spec) * bg_mask
            event_separated_spec = np.abs(mixed_spec) * event_mask

            # Write out separated background.
            s = spectrogram_to_wave.recover_wav(bg_separated_spec,
                                                mixed_spec,
                                                n_overlap=n_overlap,
                                                winfunc=np.hamming,
                                                wav_len=int(fs * clip_sec))
            s *= recover_scaler
            pp_data.write_audio(os.path.join(out_dir, bare_na + ".sep_bg.wav"),
                                s, fs)

            # Write out separated event.
            s = spectrogram_to_wave.recover_wav(event_separated_spec,
                                                mixed_spec,
                                                n_overlap=n_overlap,
                                                winfunc=np.hamming,
                                                wav_len=int(fs * clip_sec))
            s *= recover_scaler
            pp_data.write_audio(
                os.path.join(out_dir, bare_na + ".sep_event.wav"), s, fs)

    print("Finished!")

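
# The sign expression above is a branch-free ideal binary mask: with
# r = event_spec / (bg_spec * ratio), the value (np.sign(r - 1) + 1) / 2 is
# 1 where event_spec > ratio * bg_spec, 0 where it is smaller, and 0.5 at
# exact equality. ratio = 1.7 corresponds to roughly a 5 dB threshold on
# magnitudes (20 * log10(1.7) ~ 4.6 dB). An equivalent, more explicit form:
def _equivalent_ibm_mask(event_spec, bg_spec, ratio=1.7):
    return np.where(event_spec > bg_spec * ratio, 1., 0.)
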
def dab_run(snr_list, file_name="dab_out", mode='dab'):
    output_file_folder = os.path.join("data_eval", mode)

    # Remove previous enhancements.
    for file in os.listdir(os.path.join("data_eval", "dnn1_out")):
        file_path = os.path.join("data_eval", "dnn1_out", file)
        os.remove(file_path)

    dnn1_inputs, dnn1_outputs = dnn1.predict_folder(
        os.path.join("data_eval", "dnn1_in"),
        os.path.join("data_eval", "dnn1_out"))

    names = [
        f for f in sorted(os.listdir(os.path.join("data_eval", "dnn1_out")))
        if f.startswith("enh")
    ]
    dnn1_outputs = []
    for (cnt, na) in enumerate(names):
        # Load feature.
        file_path = os.path.join("data_eval", "dnn1_out", na)
        (a, _) = pp.read_audio(file_path)
        enh_complex = pp.calc_sp(a, 'complex')
        dnn1_outputs.append(enh_complex)

    # s2nrs = dnn2.predict("data_eval/dnn1_in", "data_eval/dnn1_out")
    # snr = np.array([5.62, 1.405, 0.703, 0.281])
    # snr = np.array([5.62, 2.81, 1.875, 1.406])
    s2nrs = snr_list * 1  # copy
    for i in range(len(snr_list)):
        s2nrs[i] = 1 / (1 + 1 / snr_list[i])

    ch_rw_outputs = []
    if mode == 'dab':
        # Calculate channel weights, then multiply each enhanced channel by
        # its corresponding weight.
        new_weights = channel_weights(s2nrs)
        print(new_weights)
        for i, p in zip(dnn1_outputs, new_weights):
            ch_rw_outputs.append(p * i)
    if mode == 'db':
        # Cancel reweighting in db mode.
        new_weights = s2nrs
        print(new_weights)
        ch_rw_outputs = dnn1_outputs

    # Execute MVDR beamforming.
    final = mvdr(dnn1_inputs, ch_rw_outputs)

    (init, _) = pp.read_audio(os.path.join('data_eval', 'test_speech',
                                           file_name))
    init_sp = pp.calc_sp(init, mode='complex')
    visualize(dnn1_colors(np.abs(init_sp)), dnn1_colors(np.abs(final)),
              "source amplitude", "final amplitude")

    # Recover and save the enhanced wav.
    pp.create_folder(output_file_folder)
    s = recover_wav_complex(final, conf1.n_overlap, np.hamming)
    s *= np.sqrt((np.hamming(conf1.n_window)**2).sum())  # Scaler to compensate the amplitude
    audio_path = os.path.join(output_file_folder, file_name)
    pp.write_audio(audio_path, s, conf1.sample_rate)
    print('%s done' % mode)

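
# `channel_weights` is defined elsewhere in this project. A minimal sketch
# of one plausible definition, assuming the weights are simply the
# per-channel S2NR values normalized to sum to one; the actual weighting in
# this repo may be more sophisticated:
def _sketch_channel_weights(s2nrs):
    total = float(sum(s2nrs))
    return [v / total for v in s2nrs]
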
def inference(args):
    """Inference all test data, write out recovered wavs to disk.

    Args:
      workspace: str, path of workspace.
      tr_snr: float, training SNR.
      te_snr: float, testing SNR.
      n_concat: int, number of frames to concatenate, should equal to
          n_concat in the training stage.
      iter: int, iteration of model to load.
      visualize: bool, plot enhanced spectrogram for debug.
    """
    print(args)
    workspace = args.workspace
    # tr_snr = args.tr_snr
    # te_snr = args.te_snr
    n_concat = args.n_concat
    # iter = args.iteration
    TF = args.TF
    model_name = args.model_name

    n_window = cfg.n_window
    n_overlap = cfg.n_overlap
    # snr = cfg.SNR
    n_hop = int(n_window - n_overlap)
    fs = cfg.sample_rate
    scale = True

    # Load model: pick the checkpoint with the largest epoch number.
    t1 = time.time()
    # model_path = os.path.join(workspace, "saved_models", "%s" % model_name, "weights-checkpoint-25-0.41.h5")
    mag_model_root = os.path.join(workspace, "saved_models", "%s" % model_name)
    # model_root = '/home/szuer/CI_DNN/workspace_16kHz/cis_strategy/noise10/mixture/saved_models/0/sdnn1'
    mag_model_files = find_models(mag_model_root)
    epoch_num = []
    for i in range(len(mag_model_files)):
        epoch_num.append(int(mag_model_files[i].split("/")[-1].split('-')[2]))
    mag_model_index = epoch_num.index(max(epoch_num))
    mag_model_path = mag_model_files[mag_model_index]
    print("The selected model path is %s :" % mag_model_path)
    mag_model = load_model(mag_model_path)
    '''
    # Loading the phase model.
    phase_model_root = os.path.join(workspace, "phase_saved_models", "%s" % model_name)
    phase_model_files = find_models(phase_model_root)
    epoch_num1 = []
    for i in range(len(phase_model_files)):
        epoch_num1.append(int(phase_model_files[i].split("/")[-1].split('-')[2]))
    phase_model_index = epoch_num1.index(max(epoch_num1))
    phase_model_path = phase_model_files[phase_model_index]
    print("The selected model path is %s :" % phase_model_path)
    phase_model = load_model(phase_model_path)
    '''

    # Load scaler.
    mag_scaler_path = os.path.join(workspace, "packed_features", "train",
                                   "mag_scaler.p")
    mag_scaler = pickle.load(open(mag_scaler_path, 'rb'))
    # phase_scaler_path = os.path.join(workspace, "packed_features", "train", "phase_scaler.p")
    # phase_scaler = pickle.load(open(phase_scaler_path, 'rb'))

    # Load test data.
    feat_dir = os.path.join(workspace, "features", "test")
    names = os.listdir(feat_dir)
    for (cnt, na) in enumerate(names):
        # Load feature.
        feat_path = os.path.join(feat_dir, na)
        data = cPickle.load(open(feat_path, 'rb'))
        [mixed_cmplx_x, speech_cmplx_x] = data
        n_pad = (n_concat - 1) / 2

        if TF == "spectrogram":
            mixed_x = np.abs(mixed_cmplx_x)
            # mixed_phase = np.angle(mixed_cmplx_x)
            # Process data.
            mixed_x = pp_data.pad_with_border(mixed_x, n_pad)
            mixed_x = pp_data.log_sp(mixed_x)
            # mixed_phase = pp_data.pad_with_border(mixed_phase, n_pad)
            # speech_x = pp_data.log_sp(np.abs(speech_cmplx_x))
            # speech_phase = np.angle(speech_cmplx_x)
        else:
            raise Exception("TF must be spectrogram, timedomain or fftmagnitude!")

        # Scale data.
        if scale:
            mixed_x = pp_data.scale_on_2d(mixed_x, mag_scaler)
            # speech_x = pp_data.scale_on_2d(speech_x, mag_scaler)
            # mixed_phase = pp_data.scale_on_2d(mixed_phase, phase_scaler)
            # speech_phase = pp_data.scale_on_2d(speech_phase, phase_scaler)

        # Cut input spectrogram to 3D segments with n_concat.
        mixed_x_3d = pp_data.mat_2d_to_3d(mixed_x, agg_num=n_concat, hop=1)
        # mixed_phase_3d = pp_data.mat_2d_to_3d(mixed_phase, agg_num=n_concat, hop=1)
        # print("loading data time: %s s" % (time.time() - t1,))
        '''
        # Inspect intermediate feature maps for debugging:
        # model.layers[0] is the input and stays fixed; change
        # model.layers[2] to the index of the layer whose output you want.
        layer_1 = K.function([model.layers[0].input], [model.layers[2].output])
        f1 = layer_1([mixed_x_3d])[0]  # only the input changes
        # Show the feature maps after the first convolution; the output is
        # (1, 149, 149, 32) = (n_samples, height, width, n_feature_maps).
        for _ in range(12):
            show_img = f1[1, :, :, _]
            show_img.shape = [1, 257]
            plt.subplot(3, 4, _ + 1)
            plt.imshow(show_img.T, cmap='gray')
            plt.axis('off')
        plt.show()
        '''

        # Predict.
        t2 = time.time()
        mag_pred = mag_model.predict(mixed_x_3d)
        # phase_pred = phase_model.predict(mixed_phase_3d)
        print("model predicts %d utterance : %s successfully" % (cnt, na))

        # Inverse scale.
        if scale:
            # mixed_x = pp_data.inverse_scale_on_2d(mixed_x, mag_scaler)
            # speech_x = pp_data.inverse_scale_on_2d(speech_x, mag_scaler)
            mag_pred = pp_data.inverse_scale_on_2d(mag_pred, mag_scaler)
            # mixed_phase = pp_data.inverse_scale_on_2d(mixed_phase, phase_scaler)
            # speech_phase = pp_data.inverse_scale_on_2d(speech_phase, phase_scaler)
            # phase_pred = pp_data.inverse_scale_on_2d(phase_pred, phase_scaler)

        # Recover enhanced wav.
        # pred_sp = np.exp(pred)
        if TF == "spectrogram":
            pred_sp = (10**(mag_pred / 10)) - 1e-10
            # pred_ph = np.exp(1j * phase_pred)
            '''
            R = np.multiply(pred_sp, pred_ph)
            result = librosa.istft(R.T,
                                   hop_length=n_hop,
                                   win_length=cfg.n_window,
                                   window=scipy.signal.hamming,
                                   center=False)
            result /= abs(result).max()
            y_out = result * 0.8
            '''
            # s = recover_wav(pred_sp, mixed_cmplx_x, n_overlap, np.hamming)
            # s *= np.sqrt((np.hamming(n_window)**2).sum())  # Scaler to compensate the amplitude
            s = spectra_to_wav(pred_sp, mixed_cmplx_x, n_window, n_hop, 'hamming')

        # Write out enhanced wav.
        out_path = os.path.join(workspace, "enh_flipphase", "test",
                                "%s" % model_name,
                                "{}_fft_dnn_map.wav".format(na.split('.')[0]))
        pp_data.create_folder(os.path.dirname(out_path))
        pp_data.write_audio(out_path, s, fs)
        print("predict an utterance time: %s s" % (time.time() - t2,))

    print("total test time: %s s" % (time.time() - t1,))

def inference1111(args):
    """Inference all test data, write out recovered wavs to disk.

    Args:
      workspace: str, path of workspace.
      tr_snr: float, training SNR.
      te_snr: float, testing SNR.
      n_concat: int, number of frames to concatenate, should equal to
          n_concat in the training stage.
      iter: int, iteration of model to load.
      visualize: bool, plot enhanced spectrogram for debug.
    """
    print(args)
    workspace = args.workspace
    # tr_snr = args.tr_snr
    # te_snr = args.te_snr
    n_concat = args.n_concat
    # iter = args.iteration
    TF = args.TF
    model_name = args.model_name

    n_window = cfg.n_window
    n_overlap = cfg.n_overlap
    # snr = cfg.SNR
    n_hop = int(n_window - n_overlap)
    fs = cfg.sample_rate
    scale = True

    # Load model: pick the checkpoint with the largest epoch number.
    t1 = time.time()
    # model_path = os.path.join(workspace, "saved_models", "%s" % model_name, "weights-checkpoint-25-0.41.h5")
    model_root = os.path.join(workspace, "saved_models", "%s" % model_name)
    # model_root = '/home/szuer/CI_DNN/workspace_16kHz/cis_strategy/noise10/mixture/saved_models/0/sdnn1'
    model_files = find_models(model_root)
    epoch_num = []
    for i in range(len(model_files)):
        epoch_num.append(int(model_files[i].split("/")[-1].split('-')[2]))
    model_index = epoch_num.index(max(epoch_num))
    model_path = model_files[model_index]
    print("The selected model path is %s :" % model_path)
    model = load_model(model_path)

    # Load scaler.
    scaler_path = os.path.join(workspace, "packed_features", "train",
                               "scaler.p")
    scaler = pickle.load(open(scaler_path, 'rb'))

    # Load test data.
    feat_dir = os.path.join(workspace, "features", "test")
    names = os.listdir(feat_dir)
    for (cnt, na) in enumerate(names):
        # Load feature.
        feat_path = os.path.join(feat_dir, na)
        data = cPickle.load(open(feat_path, 'rb'))
        [mixed_cmplx_x, speech_x, na] = data
        n_pad = (n_concat - 1) / 2

        if TF == "spectrogram":
            mixed_x = np.abs(mixed_cmplx_x)
            # Process data.
            mixed_x = pp_data.pad_with_border(mixed_x, n_pad)
            mixed_x = pp_data.log_sp(mixed_x)
            speech_x = pp_data.log_sp(speech_x)
        elif TF == "timedomain":
            mixed_x = pp_data.pad_with_border(mixed_cmplx_x, n_pad)
        elif TF == "fftmagnitude":
            mixed_x = np.abs(mixed_cmplx_x)
            mixed_x = pp_data.pad_with_border(mixed_x, n_pad)
        else:
            raise Exception("TF must be spectrogram, timedomain or fftmagnitude!")

        # Scale data.
        if scale:
            mixed_x = pp_data.scale_on_2d(mixed_x, scaler)
            speech_x = pp_data.scale_on_2d(speech_x, scaler)

        # Cut input spectrogram to 3D segments with n_concat.
        mixed_x_3d = pp_data.mat_2d_to_3d(mixed_x, agg_num=n_concat, hop=1)
        # print("loading data time: %s s" % (time.time() - t1,))
        '''
        # Inspect intermediate feature maps for debugging:
        # model.layers[0] is the input and stays fixed; change
        # model.layers[2] to the index of the layer whose output you want.
        layer_1 = K.function([model.layers[0].input], [model.layers[2].output])
        f1 = layer_1([mixed_x_3d])[0]  # only the input changes
        # Show the feature maps after the first convolution; the output is
        # (1, 149, 149, 32) = (n_samples, height, width, n_feature_maps).
        for _ in range(12):
            show_img = f1[1, :, :, _]
            show_img.shape = [1, 257]
            plt.subplot(3, 4, _ + 1)
            plt.imshow(show_img.T, cmap='gray')
            plt.axis('off')
        plt.show()
        '''

        # Predict.
        t2 = time.time()
        pred = model.predict(mixed_x_3d)
        print("model predicts %d utterance : %s successfully" % (cnt, na))

        # Inverse scale.
        if scale:
            mixed_x = pp_data.inverse_scale_on_2d(mixed_x, scaler)
            speech_x = pp_data.inverse_scale_on_2d(speech_x, scaler)
            pred = pp_data.inverse_scale_on_2d(pred, scaler)
        # (frames, frame_length) = pred.shape
        # print("pred dimensions %d and %d : " % (frames, frame_length))

        # Debug plot. (te_snr is not defined in this function, so the plot
        # titles omit the SNR that the original code tried to format in.)
        if args.visualize:
            if TF == "spectrogram":
                fig, axs = plt.subplots(3, 1, sharex=False)
                axs[0].matshow(mixed_x.T, origin='lower', aspect='auto',
                               cmap='jet')
                axs[1].matshow(speech_x.T, origin='lower', aspect='auto',
                               cmap='jet')
                axs[2].matshow(pred.T, origin='lower', aspect='auto',
                               cmap='jet')
                axs[0].set_title("Mixture log spectrogram")
                axs[1].set_title("Clean speech log spectrogram")
                axs[2].set_title("Enhanced speech log spectrogram")
                for j1 in xrange(3):
                    axs[j1].xaxis.tick_bottom()
                plt.tight_layout()
                plt.savefig('debug_model_spectra.png')
                plt.show()
            elif TF == "timedomain":
                fig, axs = plt.subplots(3, 1, sharex=False)
                axs[0].matshow(mixed_x.T, origin='lower', aspect='auto',
                               cmap='jet')
                axs[1].matshow(speech_x.T, origin='lower', aspect='auto',
                               cmap='jet')
                axs[2].matshow(pred.T, origin='lower', aspect='auto',
                               cmap='jet')
                axs[0].set_title("Mixture time domain")
                axs[1].set_title("Clean speech time domain")
                axs[2].set_title("Enhanced speech time domain")
                for j1 in xrange(3):
                    axs[j1].xaxis.tick_bottom()
                plt.tight_layout()
                plt.savefig('debug_model_time.png')
                plt.show()
            else:
                raise Exception("TF must be spectrogram or timedomain!")

        # Recover enhanced wav.
        # pred_sp = np.exp(pred)
        if TF == "spectrogram":
            pred_sp = (10**(pred / 20)) - 1e-10
            # s = recover_wav(pred_sp, mixed_cmplx_x, n_overlap, np.hamming)
            # s *= np.sqrt((np.hamming(n_window)**2).sum())  # Scaler to compensate the
            #                                                # amplitude change after
            #                                                # spectrogram and IFFT.
            s = spectra_to_wav(pred_sp, mixed_cmplx_x, n_window, n_hop,
                               'hamming')
        elif TF == "timedomain":
            s = time_recover_wav(pred, n_window, n_hop, 'hamming')
            # s *= np.sqrt((np.hamming(n_window)**2).sum())
        elif TF == "fftmagnitude":
            s = spectra_to_wav(pred, mixed_cmplx_x, n_window, n_hop, 'hamming')
        else:
            raise Exception("TF must be spectrogram, timedomain or fftmagnitude!")

        # Write out enhanced wav.
        out_path = os.path.join(workspace, "enh_wavs", "test",
                                "%s" % model_name, "%s.wav" % na)
        pp_data.create_folder(os.path.dirname(out_path))
        pp_data.write_audio(out_path, s, fs)
        print("predict an utterance time: %s s" % (time.time() - t2,))

    print("total test time: %s s" % (time.time() - t1,))

def jsc_separation(args):
    """Joint separation-classification (JSC) source separation."""
    workspace = args.workspace

    scaler_path = os.path.join(workspace, "scalers", "logmel",
                               "training.scaler")
    scaler = pickle.load(open(scaler_path, 'rb'))

    md_path = os.path.join(workspace, "models", "main", args.model_name)
    md = serializations.load(md_path)

    out_dir = os.path.join(workspace, "separated_wavs", "jsc_separation")
    pp_data.create_folder(out_dir)

    observe_nodes = [md.find_layer('seg_masks').output_]
    f_forward = md.get_observe_forward_func(observe_nodes)

    audio_dir = os.path.join(workspace, "mixed_audio", "testing")
    names = os.listdir(audio_dir)

    n_window = cfg.n_window
    n_overlap = cfg.n_overlap
    fs = cfg.sample_rate
    ham_win = np.hamming(n_window)
    recover_scaler = np.sqrt((ham_win**2).sum())
    melW = librosa.filters.mel(sr=fs, n_fft=n_window, n_mels=64,
                               fmin=0., fmax=fs / 2)
    inverse_melW = get_inverse_W(melW)

    for na in names:
        if ".mix" in na:
            # Read yaml.
            bare_name = os.path.splitext(os.path.splitext(na)[0])[0]
            yaml_path = os.path.join(audio_dir, "%s.yaml" % bare_name)
            with open(yaml_path, 'r') as f:
                data = yaml.load(f)
            event_type = data['event_type']
            print(na, event_type)

            # Read audio.
            audio_path = os.path.join(audio_dir, na)
            (bg_audio, event_audio, _) = pp_data.read_audio_stereo(audio_path)
            mixed_audio = bg_audio + event_audio

            # Spectrogram.
            [f, t, bg_spec] = signal.spectrogram(x=bg_audio,
                                                 window=ham_win,
                                                 nperseg=n_window,
                                                 noverlap=n_overlap,
                                                 detrend=False,
                                                 return_onesided=True,
                                                 scaling='density',
                                                 mode='complex')
            [f, t, event_spec] = signal.spectrogram(x=event_audio,
                                                    window=ham_win,
                                                    nperseg=n_window,
                                                    noverlap=n_overlap,
                                                    detrend=False,
                                                    return_onesided=True,
                                                    scaling='density',
                                                    mode='complex')
            [f, t, mixed_spec] = signal.spectrogram(x=mixed_audio,
                                                    window=ham_win,
                                                    nperseg=n_window,
                                                    noverlap=n_overlap,
                                                    detrend=False,
                                                    return_onesided=True,
                                                    scaling='density',
                                                    mode='complex')
            bg_spec = bg_spec.T
            event_spec = event_spec.T
            mixed_spec = mixed_spec.T

            # Log Mel spectrogram.
            mixed_x = pp_data.calc_feat(mixed_audio)
            x3d = pp_data.do_scaler_on_x3d(mixed_x[np.newaxis, ...], scaler)

            # Segmentation masks.
            [mel_masks] = md.run_function(f_forward, x3d, batch_size=10,
                                          tr_phase=0.)
            mel_masks = mel_masks[0]                      # (n_time, 64)
            spec_masks = np.dot(mel_masks, inverse_melW)  # (n_time, 513)

            if args.plot_only:
                mixed_mel_spec = np.dot(np.abs(mixed_spec), melW.T)
                bg_mel_spec = np.dot(np.abs(bg_spec), melW.T)
                event_mel_spec = np.dot(np.abs(event_spec), melW.T)

                ratio = 1.7  # ~5 dB
                event_mask = (np.sign(event_mel_spec /
                                      (bg_mel_spec * ratio) - 1) + 1) / 2

                fig, axs = plt.subplots(3, 2, sharex=True)
                axs[0, 0].matshow(np.log(mixed_mel_spec.T), origin='lower',
                                  aspect='auto')
                axs[0, 1].matshow(event_mask.T, origin='lower', aspect='auto')
                axs[1, 0].matshow(spec_masks[0].T, origin='lower',
                                  aspect='auto', vmin=0., vmax=1.)
                axs[1, 1].matshow(spec_masks[1].T, origin='lower',
                                  aspect='auto', vmin=0., vmax=1.)
                axs[2, 0].matshow(spec_masks[2].T, origin='lower',
                                  aspect='auto', vmin=0., vmax=1.)
                axs[2, 1].matshow(spec_masks[3].T, origin='lower',
                                  aspect='auto', vmin=0., vmax=1.)
                axs[0, 0].set_title('log Mel of mixture')
                axs[0, 1].set_title('IBM of event')
                axs[1, 0].set_title('babycry')
                axs[1, 1].set_title('glassbreak')
                axs[2, 0].set_title('gunshot')
                axs[2, 1].set_title('bg')
                plt.show()
            else:
                # Separated spec.
                separated_specs = spec_masks * np.abs(mixed_spec)[None, :, :]

                # Write out all events and bg.
                enlarged_events = cfg.events + ['bg']
                for i1 in xrange(4):
                    s = spectrogram_to_wave.recover_wav(
                        separated_specs[i1],
                        mixed_spec,
                        n_overlap=n_overlap,
                        winfunc=np.hamming,
                        wav_len=len(mixed_audio))
                    s *= recover_scaler
                    pp_data.write_audio(
                        os.path.join(out_dir, "%s.sep_%s.wav" %
                                     (bare_name, enlarged_events[i1])), s, fs)

                # Write out event.
                s = spectrogram_to_wave.recover_wav(
                    separated_specs[cfg.lb_to_ix[event_type]],
                    mixed_spec,
                    n_overlap=n_overlap,
                    winfunc=np.hamming,
                    wav_len=len(mixed_audio))
                s *= recover_scaler
                pp_data.write_audio(
                    os.path.join(out_dir, "%s.sep_event.wav" % bare_name),
                    s, fs)

                # Write out the original mix.
                pp_data.write_audio(
                    os.path.join(out_dir, "%s.sep_mix.wav" % bare_name),
                    mixed_audio, fs)

def DB_generate(source_audio, out_folder, name):
    # source_audio = pra.normalize(source_audio, bits=16)
    mic_distance = random.randint(1, 20)  # mean distance from source to microphones
    source_position = np.array([
        random.uniform(0, room_dimensions[0]),
        random.uniform(0, room_dimensions[1])
    ])

    # Random way: guess array center until it's in the room:
    # very long time for small rooms.
    mic_in_room = False
    while not mic_in_room:
        theta = random.uniform(0, 2 * math.pi)
        mic_center = source_position - mic_distance * np.array(
            [math.cos(theta), math.sin(theta)])
        print(mic_center)
        if (0 <= mic_center[0] <= room_dimensions[0]) and (
                0 <= mic_center[1] <= room_dimensions[1]):
            mic_in_room = True

    # Number of lateral microphones.
    M = 4
    # Counterclockwise rotation of array.
    phi = 0
    # Distance between microphones.
    d = 0.4
    mic_pos = pra.beamforming.linear_2D_array(mic_center, M, phi, d)
    mic_pos = np.concatenate((mic_pos, np.array(mic_center, ndmin=2).T),
                             axis=1)

    distances = []
    for m in range(M):
        d = math.sqrt((source_position[0] - mic_pos[0, m])**2 +
                      (source_position[1] - mic_pos[1, m])**2)
        distances.append(d)

    # Create room.
    shoebox = pra.ShoeBox(
        room_dimensions,
        absorption=wall_absorption,
        fs=fs,
        max_order=15,
    )
    # shoebox.mic_array.to_wav(os.path.join(out_folder + '_DB', 'mix_' + name), norm=True, bitdepth=np.int16)

    Lg_t = 0.100             # filter size in seconds
    Lg = np.ceil(Lg_t * fs)  # in samples
    fft_len = 512
    mics = pra.Beamformer(mic_pos, shoebox.fs, N=fft_len, Lg=Lg)
    shoebox.add_source(source_position, signal=source_audio)
    shoebox.add_microphone_array(mics)
    shoebox.compute_rir()
    shoebox.simulate()

    # Add noise and save each channel (M lateral mics plus the center mic).
    for n in range(M + 1):
        signal = np.asarray(shoebox.mic_array.signals[n, :], dtype=float)
        signal = pra.utilities.normalize(signal, bits=16)
        mixed_signal = add_noise(source_audio, signal)
        mixed_signal = np.array(mixed_signal, dtype=np.int16)
        mixed_file = os.path.join(out_folder, 'mix%d_%s' % (n, name))
        pp.write_audio(mixed_file, mixed_signal, fs)

def inference(args):
    """Inference all test data, write out recovered wavs to disk.

    Args:
      workspace: str, path of workspace.
      tr_snr: float, training SNR.
      te_snr: float, testing SNR.
      n_concat: int, number of frames to concatenate, should equal to
          n_concat in the training stage.
      iter: int, iteration of model to load.
      visualize: bool, plot enhanced spectrogram for debug.
    """
    print(args)
    workspace = args.workspace
    tr_snr = args.tr_snr
    te_snr = args.te_snr
    n_concat = args.n_concat
    iter = args.iteration
    data_type = 'IRM'

    n_window = cfg.n_window
    n_overlap = cfg.n_overlap
    fs = cfg.sample_rate
    scale = True

    # Load model.
    if data_type == "DM":
        model_path = os.path.join(workspace, "models", "mixdb",
                                  "md_%diters.h5" % 120000)
    else:
        model_path = os.path.join(workspace, "models", "mask_mixdb",
                                  "md_%diters.h5" % 265000)
    model = load_model(model_path)

    # Load scaler.
    scaler_path = os.path.join(workspace, "packed_features", "spectrogram",
                               "train", "mixdb", "scaler.p")
    scaler = pickle.load(open(scaler_path, 'rb'))

    # Load test data.
    feat_dir = os.path.join(workspace, "features", "spectrogram", "test",
                            "mixdb")
    names = os.listdir(feat_dir)

    for (cnt, na) in enumerate(names):
        # Load feature.
        feat_path = os.path.join(feat_dir, na)
        data = cPickle.load(open(feat_path, 'rb'))
        [mixed_cmplx_x, speech_x, noise_x, alpha, na] = data
        mixed_x = np.abs(mixed_cmplx_x)
        if data_type == "IRM":
            mixed_x = speech_x + noise_x
            mixed_x1 = speech_x + noise_x

        # Process data.
        n_pad = (n_concat - 1) / 2
        mixed_x = pp_data.pad_with_border(mixed_x, n_pad)
        mixed_x = pp_data.log_sp(mixed_x)

        # Scale data.
        if scale:
            mixed_x = pp_data.scale_on_2d(mixed_x, scaler)

        # Cut input spectrogram to 3D segments with n_concat.
        mixed_x_3d = pp_data.mat_2d_to_3d(mixed_x, agg_num=n_concat, hop=1)

        # Predict.
        pred = model.predict(mixed_x_3d)
        if data_type == "IRM":
            # The predicted mask is applied to the (speech + noise) magnitudes.
            pred_sp = pred * mixed_x1
        print(cnt, na)

        # Inverse scale.
        if data_type == "DM":
            pred = pp_data.inverse_scale_on_2d(pred, scaler)
            pred_sp = np.exp(pred)

        # Recover enhanced wav.
        s = recover_wav(pred_sp, mixed_cmplx_x, n_overlap, np.hamming)
        s *= np.sqrt((np.hamming(n_window)**2).sum())  # Scaler to compensate
                                                       # the amplitude change
                                                       # after spectrogram and IFFT.

        # Write out enhanced wav.
        if data_type == "DM":
            out_path = os.path.join(workspace, "enh_wavs", "test", "mixdb",
                                    "%s.enh.wav" % na)
        else:
            out_path = os.path.join(workspace, "enh_wavs", "test",
                                    "mask_mixdb", "%s.enh.wav" % na)
        pp_data.create_folder(os.path.dirname(out_path))
        pp_data.write_audio(out_path, s, fs)

def predict_folder(input_file_folder: str, output_file_folder: str):
    # Load model.
    data_type = "test"
    model_path = os.path.join(conf1.model_dir,
                              "md_%diters.h5" % conf1.iterations)
    model = load_model(model_path)

    # Load scaler.
    scaler_path = os.path.join(conf1.packed_feature_dir, data_type, "scaler.p")
    scaler = pickle.load(open(scaler_path, 'rb'))

    # Load test data (only files whose names start with "mix").
    names = [f for f in sorted(os.listdir(input_file_folder))
             if f.startswith("mix")]
    mixed_all = []
    pred_all = []
    for (cnt, na) in enumerate(names):
        # Load feature.
        file_path = os.path.join(input_file_folder, na)
        (a, _) = pp.read_audio(file_path)
        mixed_complex = pp.calc_sp(a, 'complex')
        mixed_x = np.abs(mixed_complex)

        # Process data.
        n_pad = (conf1.n_concat - 1) // 2
        mixed_x = pp.pad_with_border(mixed_x, n_pad)
        mixed_x = pp.log_sp(mixed_x)

        # Scale data.
        mixed_x = pp.scale_on_2d(mixed_x, scaler)

        # Cut input spectrogram to 3D segments with n_concat.
        mixed_x_3d = pp.mat_2d_to_3d(mixed_x, agg_num=conf1.n_concat, hop=1)

        # Predict.
        pred = model.predict(mixed_x_3d)
        print(cnt, na)

        # Inverse scale.
        mixed_x = pp.inverse_scale_on_2d(mixed_x, scaler)
        pred = pp.inverse_scale_on_2d(pred, scaler)

        # Debug plot.
        if visualize_plot:
            visualize(mixed_x, pred)

        mixed_all.append(mixed_complex)
        pred_all.append(real_to_complex(pred, mixed_complex))

        # Recover enhanced wav.
        pred_sp = np.exp(pred)
        s = recover_wav(pred_sp, mixed_complex, conf1.n_overlap, np.hamming)
        s *= np.sqrt((np.hamming(conf1.n_window) ** 2).sum())  # compensate the
        # amplitude change after spectrogram and IFFT.

        # Write out enhanced wav.
        pp.create_folder(output_file_folder)
        audio_path = os.path.join(output_file_folder, "enh_%s" % na)
        pp.write_audio(audio_path, s, conf1.sample_rate)

    return mixed_all, pred_all
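# A hypothetical call (folder names are placeholders). predict_folder
# returns the complex mixture spectrograms and the complex-valued
# predictions so they can be reused downstream, e.g. by a beamforming stage:
#
#   mixed_all, pred_all = predict_folder('data_eval/dnn1_in',
#                                        'data_eval/dnn1_out')
#   print(len(mixed_all), 'files enhanced')
#
# Enhanced wavs land in the output folder as enh_<input name>.wav.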
def prepare_database():
    (noise, _) = pp.read_audio(conf1.noise_path)
    with open('dnn1/dnn1_files_list.txt') as f:
        dnn1_data = f.readlines()

    # Generate train spectrograms.
    mixed_all = []
    clean_all = []
    snr1_list = []
    mixed_avg = []
    for n in range(conf1.training_number):
        current_file = (random.choice(dnn1_data)).rstrip()
        dist = random.uniform(1, 20)
        (clean, _) = pp.read_audio(current_file)
        mixed, noise_new, clean_new, snr = set_microphone_at_distance(
            clean, noise, conf1.fs, dist)
        snr1_list.append(snr)
        mixed_avg.append(np.mean(mixed))
        if n % 10 == 0:
            print(n)
        if conf1.save_single_files and n < conf1.n_files_to_save:
            sr = ''.join(
                random.choice(string.ascii_uppercase + string.digits)
                for _ in range(5))
            path_list = current_file.split(os.sep)
            mixed_name = "mix_%s_%s_%s" % (path_list[2], sr,
                                           os.path.basename(current_file))
            clean_name = "clean_%s_%s_%s" % (path_list[2], sr,
                                             os.path.basename(current_file))
            mixed_path = os.path.join(conf1.train_folder, mixed_name)
            clean_path = os.path.join(conf1.train_folder, clean_name)
            pp.write_audio(mixed_path, mixed, conf1.fs)
            pp.write_audio(clean_path, clean_new, conf1.fs)
        clean_spec = pp.calc_sp(clean_new, mode='magnitude')
        mixed_spec = pp.calc_sp(mixed, mode='complex')
        clean_all.append(clean_spec)
        mixed_all.append(mixed_spec)

    print(len(clean_all), ',', len(mixed_all))
    num_tr = pp.pack_features(mixed_all, clean_all, 'train')
    compute_scaler('train')

    # Generate test spectrograms (same pipeline, test-sized).
    mixed_all = []
    clean_all = []
    snr1_list = []
    mixed_avg = []
    for n in range(conf1.test_number):
        current_file = (random.choice(dnn1_data)).rstrip()
        dist = random.uniform(1, 20)
        (clean, _) = pp.read_audio(current_file)
        mixed, noise_new, clean_new, snr = set_microphone_at_distance(
            clean, noise, conf1.fs, dist)
        snr1_list.append(snr)
        mixed_avg.append(np.mean(mixed))
        if n % 10 == 0:
            print(n)
        if conf1.save_single_files and n < conf1.n_files_to_save:
            sr = ''.join(
                random.choice(string.ascii_uppercase + string.digits)
                for _ in range(5))
            path_list = current_file.split(os.sep)
            mixed_name = "mix_%s_%s_%s" % (path_list[2], sr,
                                           os.path.basename(current_file))
            clean_name = "clean_%s_%s_%s" % (path_list[2], sr,
                                             os.path.basename(current_file))
            mixed_path = os.path.join(conf1.test_folder, mixed_name)
            clean_path = os.path.join(conf1.test_folder, clean_name)
            pp.write_audio(mixed_path, mixed, conf1.fs)
            pp.write_audio(clean_path, clean_new, conf1.fs)
        clean_spec = pp.calc_sp(clean_new, mode='magnitude')
        mixed_spec = pp.calc_sp(mixed, mode='complex')
        clean_all.append(clean_spec)
        mixed_all.append(mixed_spec)

    print(len(clean_all), ',', len(mixed_all))
    num_te = pp.pack_features(mixed_all, clean_all, 'test')
    compute_scaler('test')

    return num_tr, num_te
def decode():
    """Decode the inputs using the current model."""
    tf.logging.info("Get TEST sets number.")
    num_batch = get_num_batch(FLAGS.test_list_file, infer=True)
    with tf.Graph().as_default():
        with tf.device('/cpu:0'):
            with tf.name_scope('input'):
                data_list = read_list(FLAGS.test_list_file)
                test_utt_id, test_inputs, _ = get_batch(
                    data_list,
                    batch_size=1,
                    input_size=FLAGS.input_dim,
                    output_size=FLAGS.output_dim,
                    left=FLAGS.left_context,
                    right=FLAGS.right_context,
                    num_enqueuing_threads=FLAGS.num_threads,
                    num_epochs=1,
                    infer=True)
                # test_inputs = tf.squeeze(test_inputs, axis=[0])

        devices = []
        for i in range(FLAGS.num_gpu):
            device_name = "/gpu:%d" % i
            print('Using device: ', device_name)
            devices.append(device_name)

        # Prevent exhausting all the GPU memory.
        config = tf.ConfigProto()
        config.gpu_options.per_process_gpu_memory_fraction = 0.4
        # config.gpu_options.allow_growth = True
        config.allow_soft_placement = True
        set_session(tf.Session(config=config))

        # Execute the session.
        with tf.Session(config=config) as sess:
            # Create the model with the test inputs.
            with tf.name_scope('model'):
                model = DNNTrainer(sess, FLAGS, devices, test_inputs,
                                   labels=None, cross_validation=True)
            show_all_variables()
            init = tf.group(tf.global_variables_initializer(),
                            tf.local_variables_initializer())
            print("Initializing variables ...")
            sess.run(init)

            if model.load(model.save_dir, moving_average=False):
                print("[*] Load Moving Average model SUCCESS")
            else:
                print("[!] Load failed. Checkpoint not found. Exit now.")
                sys.exit(1)

            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(sess=sess, coord=coord)

            cmvn_filename = os.path.join(FLAGS.data_dir, "train_cmvn.npz")
            if os.path.isfile(cmvn_filename):
                cmvn = np.load(cmvn_filename)
            else:
                tf.logging.fatal("%s does not exist, exit now." % cmvn_filename)
                sys.exit(1)

            out_dir_name = os.path.join('/Work18/2017/linan/SE/my_enh',
                                        FLAGS.save_dir, FLAGS.savetestdir)
            # out_dir_name = os.path.join(FLAGS.save_dir, 'test')
            if not os.path.exists(out_dir_name):
                os.makedirs(out_dir_name)
            write_scp_path = os.path.join(out_dir_name, 'feats.scp')
            write_ark_path = os.path.join(out_dir_name, 'feats.ark')
            writer = ArkWriter(write_scp_path)

            # Build the inference graph once, outside the decoding loop.
            outputs = model.generator(test_inputs, None, reuse=True)
            outputs = tf.reshape(outputs, [-1, model.output_dim])
            print('shape is', np.shape(outputs))

            try:
                for batch in range(num_batch):
                    if coord.should_stop():
                        break
                    utt_id, activations = sess.run([test_utt_id, outputs])
                    # sequence = activations * cmvn['stddev_labels'] + \
                    #     cmvn['mean_labels']
                    sequence = activations
                    save_result = np.vstack(sequence)
                    dir_load = FLAGS.savetestdir.split('/')[-1]
                    mode = FLAGS.mode
                    if mode == 'use_org':
                        # Reuse the mixture phase for reconstruction.
                        inputs_path = os.path.join(
                            'workspace/features/spectrogram/test', dir_load,
                            '%s.wav.p' % utt_id[0])
                        data = cPickle.load(open(inputs_path, 'rb'))
                        [mixed_complx_x] = data
                        save_result = np.exp(save_result)
                        n_window = cfg.n_window
                        s = recover_wav(save_result, mixed_complx_x,
                                        cfg.n_overlap, np.hamming)
                        s *= np.sqrt((np.hamming(n_window) ** 2).sum())
                        # Compensate the amplitude change after spectrogram
                        # and IFFT.
                        print("start enhance wav file")
                        # Write out enhanced wav.
                        out_path = os.path.join("workspace", "enh_wavs",
                                                "test", dir_load,
                                                "%s.enh.wav" % utt_id[0])
                        pp_data.create_folder(os.path.dirname(out_path))
                        pp_data.write_audio(out_path, s, 16000)
                    elif mode == 'g_l':
                        # Reconstruct the phase with Griffin-Lim instead.
                        inputs_path = os.path.join(
                            'workspace/features/spectrogram/test', dir_load,
                            '%s.wav.p' % utt_id[0])
                        data = cPickle.load(open(inputs_path, 'rb'))
                        [mixed_complx_x] = data
                        save_result = np.exp(save_result)
                        s = audio_utilities.reconstruct_signal_griffin_lim(
                            save_result, mixed_complx_x, 512, 256, 15)
                        # s = recover_wav(save_result, mixed_complx_x,
                        #                 cfg.n_overlap, np.hamming)
                        s *= np.sqrt((np.hamming(cfg.n_window) ** 2).sum())
                        out_path = os.path.join("workspace", "enh_wavs",
                                                "test2", dir_load,
                                                "%s.enh.wav" % utt_id[0])
                        pp_data.create_folder(os.path.dirname(out_path))
                        pp_data.write_audio(out_path, s, 16000)
                        tf.logging.info("Write inferred %s" % (np.shape(s),))
                    # writer.write_next_utt(write_ark_path, utt_id[0], save_result)
                    tf.logging.info("Write inferred %s to %s"
                                    % (utt_id[0], out_path))
            except Exception as e:
                # Report exceptions to the coordinator.
                coord.request_stop(e)
            finally:
                # Standard queue-runner cleanup (the original body was
                # truncated here): stop and join the enqueuing threads.
                coord.request_stop()
                coord.join(threads)
def inference(args):
    """Inference all test data, write out recovered wavs to disk.

    Args:
      workspace: str, path of workspace.
      tr_snr: float, training SNR.
      te_snr: float, testing SNR.
      n_concat: int, number of frames to concatenate, should equal
          n_concat in the training stage.
      iter: int, iteration of model to load.
      visualize: bool, plot enhanced spectrogram for debug.
    """
    print(args)
    workspace = args.workspace
    tr_snr = args.tr_snr
    te_snr = args.te_snr
    n_concat = args.n_concat
    iter = args.iteration
    n_noise_frame = args.noise_frame
    n_hop = args.n_hop
    n_window = cfg.n_window
    n_overlap = cfg.n_overlap
    fs = cfg.sample_rate
    scale = False

    # Load model.
    model_path = os.path.join(workspace, "models", "%ddb" % int(tr_snr),
                              "md_%diters.h5" % iter)
    model = load_model(model_path)

    # Load scaler (only needed when scale is True).
    # scaler_path = os.path.join(workspace, "packed_features", "spectrogram",
    #                            "train", "%ddb" % int(tr_snr), "scaler.p")
    # scaler = pickle.load(open(scaler_path, 'rb'))

    # Load test data.
    feat_dir = os.path.join(workspace, "features", "spectrogram", "test",
                            "%ddb" % int(te_snr))
    names = os.listdir(feat_dir)
    mel_basis = librosa.filters.mel(cfg.sample_rate, cfg.n_window, n_mels=40)

    for (cnt, na) in enumerate(names):
        # Load feature.
        feat_path = os.path.join(feat_dir, na)
        data = cPickle.load(open(feat_path, 'rb'))
        [mixed_cmplx_x, speech_x, noise_x, alpha, na] = data
        input1_3d, input2, out1, out2 = pp_data.get_input_output_layer(
            mixed_cmplx_x, speech_x, noise_x, alpha, n_concat,
            n_noise_frame, n_hop, mel_basis)

        # Predict.
        pred = model.predict([input1_3d, input2])
        print(cnt, na)
        sys.stdout.flush()

        # Inverse scale (inactive while scale is False).
        if scale:
            mixed_x = pp_data.inverse_scale_on_2d(mixed_x, scaler)
            speech_x = pp_data.inverse_scale_on_2d(speech_x, scaler)
            pred = pp_data.inverse_scale_on_2d(pred, scaler)

        # Post-processing: average the two output heads with the masked
        # mixture log spectrogram.
        pred_speech_lps = 1 / 3.0 * (
            pred[0][:, :257] + pred[1][:, :257] +
            np.log(np.abs(mixed_cmplx_x) + 1e-08) +
            np.log(pred[1][:, 327:584]))

        # Debug plot.
        if args.visualize:
            out_path = os.path.join(workspace, "figures", "test",
                                    "%ddb" % int(te_snr), "%s.all.png" % na)
            pp_data.create_folder(os.path.dirname(out_path))
            fig, axs = plt.subplots(3, 1, sharex=False)
            axs[0].matshow(np.log(np.abs(mixed_cmplx_x.T) + 1e-08),
                           origin='lower', aspect='auto', cmap='jet')
            axs[1].matshow(np.log(speech_x.T + 1e-08), origin='lower',
                           aspect='auto', cmap='jet')
            axs[2].matshow(pred_speech_lps.T, origin='lower', aspect='auto',
                           cmap='jet')
            axs[0].set_title("%ddb mixture log spectrogram" % int(te_snr))
            axs[1].set_title("Clean speech log spectrogram")
            axs[2].set_title("Enhanced speech log spectrogram")
            for j1 in range(3):
                axs[j1].xaxis.tick_bottom()
            plt.tight_layout()
            plt.savefig(out_path)
            plt.close('all')

            out_path = os.path.join(workspace, "figures", "test",
                                    "%ddb" % int(te_snr),
                                    "%s.mixture.png" % na)
            display.specshow(np.log(np.abs(mixed_cmplx_x.T) + 1e-08))
            plt.title("%ddb mixture log spectrogram" % int(te_snr))
            plt.savefig(out_path)

            out_path = os.path.join(workspace, "figures", "test",
                                    "%ddb" % int(te_snr), "%s.clean.png" % na)
            display.specshow(np.log(speech_x.T + 1e-08))
            plt.title("Clean speech log spectrogram")
            plt.savefig(out_path)

            out_path = os.path.join(workspace, "figures", "test",
                                    "%ddb" % int(te_snr), "%s.enh.png" % na)
            display.specshow(pred_speech_lps.T)
            plt.title("Enhanced speech log spectrogram")
            plt.savefig(out_path)
            plt.close('all')

        # Recover enhanced wav.
        pred_sp = np.exp(pred_speech_lps)
        s = recover_wav(pred_sp, mixed_cmplx_x, n_overlap, np.hamming)
        s *= np.sqrt((np.hamming(n_window) ** 2).sum())  # compensate the
        # amplitude change after spectrogram and IFFT.

        # Write out enhanced wav.
        out_path = os.path.join(workspace, "enh_wavs", "test",
                                "%ddb" % int(te_snr), "%s.enh.wav" % na)
        pp_data.create_folder(os.path.dirname(out_path))
        pp_data.write_audio(out_path, s, fs)
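# Every inference path in this listing ends with the same two steps: combine
# the enhanced magnitude with the mixture phase, then ISTFT with an
# amplitude-compensation factor for the analysis window. A minimal sketch of
# that recovery, assuming frames-by-bins arrays as above; this mirrors what
# recover_wav appears to do, not its exact implementation:
import numpy as np
import librosa

def recover_wav_sketch(enh_mag, mixed_complex, n_window=512, n_overlap=256):
    """Rebuild a waveform from an enhanced magnitude and the mixture phase.

    enh_mag, mixed_complex: (n_frames, n_window // 2 + 1) arrays.
    """
    # Unit-magnitude phase of the mixture.
    phase = mixed_complex / np.maximum(np.abs(mixed_complex), 1e-10)
    stft = (enh_mag * phase).T  # librosa expects (bins, frames)
    hop = n_window - n_overlap
    s = librosa.istft(stft, hop_length=hop, window=np.hamming(n_window))
    # Compensate the amplitude change introduced by windowing + IFFT,
    # matching the scaling used throughout this listing.
    s *= np.sqrt((np.hamming(n_window) ** 2).sum())
    return s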
def inference(workspace, tr_snr, te_snr, n_concat, iteration, model_name=None,
              visualize=False, force=False):
    """Inference all test data, write out recovered wavs to disk.

    Args:
      workspace: str, path of workspace.
      tr_snr: list of float, training SNRs.
      te_snr: list of float, testing SNRs.
      n_concat: int, number of frames to concatenate, should equal
          n_concat in the training stage.
      iteration: int, iteration of model to load.
      visualize: bool, plot enhanced spectrogram for debug.
      force: bool, re-infer files even if the output already exists.
    """
    n_window = cfg.n_window
    n_overlap = cfg.n_overlap
    fs = cfg.sample_rate
    scale = True
    if model_name is None:
        model_name = '_'.join([str(snr) for snr in tr_snr]) + 'ddbs'

    # Load model.
    model_path = os.path.join(workspace, "models", model_name,
                              "md_%diters.h5" % iteration)
    print('GPU available: ', tf.test.is_gpu_available())
    model = load_model(model_path)

    # Load scaler.
    scaler = read_combined_scaler(workspace, tr_snr)

    for snr in te_snr:
        # Load test data.
        feat_dir = os.path.join(workspace, "features", "spectrogram", "test",
                                "%ddb" % int(snr))
        feat_paths = all_file_paths(feat_dir)

        for (cnt, feat_path) in tqdm(enumerate(feat_paths),
                                     'Inference (creating enhanced speech)'):
            # Skip if the enhanced audio has already been inferred.
            na = str(PurePath(feat_path).relative_to(feat_dir)
                     .with_suffix(''))
            out_path = os.path.join(workspace, "enh_wavs", "test", model_name,
                                    "%ddb" % int(snr), "%s.enh.wav" % na)
            if os.path.isfile(out_path) and not force:
                print(f'Enhanced audio {out_path} is already made')
                continue

            # Load feature.
            data = pickle.load(open(feat_path, 'rb'))
            [mixed_cmplx_x, speech_x, noise_x, ir_mask, alpha, na] = data
            mixed_x = np.abs(mixed_cmplx_x)

            # Process data.
            n_pad = (n_concat - 1) // 2
            mixed_x = pp_data.pad_with_border(mixed_x, n_pad)
            mixed_x = pp_data.log_sp(mixed_x)
            speech_x = pp_data.log_sp(speech_x)

            # Scale data.
            if scale:
                mixed_x = pp_data.scale_on_2d(mixed_x, scaler)
                speech_x = pp_data.scale_on_2d(speech_x, scaler)

            # Cut input spectrogram to 3D segments with n_concat.
            mixed_x_3d = pp_data.mat_2d_to_3d(mixed_x, agg_num=n_concat,
                                              hop=1)

            # Predict.
            pred = model.predict(mixed_x_3d)

            # Inverse scale.
            if scale:
                mixed_x = pp_data.inverse_scale_on_2d(mixed_x, scaler)
                speech_x = pp_data.inverse_scale_on_2d(speech_x, scaler)
                # pred = pp_data.inverse_scale_on_2d(pred, scaler)

            # Debug plot.
            if visualize:
                fig, axs = plt.subplots(3, 1, sharex=False)
                axs[0].matshow(mixed_x.T, origin='lower', aspect='auto',
                               cmap='jet')
                axs[1].matshow(speech_x.T, origin='lower', aspect='auto',
                               cmap='jet')
                axs[2].matshow(pred.T, origin='lower', aspect='auto',
                               cmap='jet')
                axs[0].set_title("%ddb mixture log spectrogram" % int(snr))
                axs[1].set_title("Clean speech log spectrogram")
                axs[2].set_title("Enhanced speech log spectrogram")
                for j1 in range(3):
                    axs[j1].xaxis.tick_bottom()
                plt.tight_layout()
                plt.show()

            # Recover enhanced wav.
            s = recover_wav(pred, mixed_cmplx_x, n_overlap, np.hamming,
                            irr_mask=True)
            s *= np.sqrt((np.hamming(n_window) ** 2).sum())  # compensate the
            # amplitude change after spectrogram and IFFT.

            # Write out enhanced wav.
            pp_data.create_folder(os.path.dirname(out_path))
            pp_data.write_audio(out_path, s, fs)
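# A hypothetical invocation, assuming test features were already extracted
# for the listed SNRs (all values below are placeholders):
#
#   inference(workspace='workspace',
#             tr_snr=[0, 5],       # training SNRs; selects the "0_5ddbs" model
#             te_snr=[0, 5, 10],   # testing SNRs to enhance
#             n_concat=7,          # must match the training stage
#             iteration=100000)
#
# Already-enhanced files are skipped unless force=True, so the call can be
# safely re-run after an interruption.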