def demo(args):
    """Run inference on a test (or live-recorded) clip and write the
    enhanced wav to disk.

    Args:
        args: argparse namespace with at least:
            workspace: str, path of workspace (model is loaded from here).
            tr_snr: float, training SNR — selects the model sub-folder.
            online: bool, record from microphone instead of reading a
                cached test file.
        Other fields mentioned by the original docstring (te_snr,
        n_concat, iteration) are not used by this function.
    """
    print(args)
    workspace = args.workspace
    tr_snr = args.tr_snr
    n_window = cfg.n_window
    n_overlap = cfg.n_overlap
    fs = cfg.sample_rate

    # Load model.
    model_path = os.path.join(workspace, "models", "%ddb" % int(tr_snr),
                              "FullyCNN.h5")
    model = load_model(model_path)

    # Load test data: either record live or read the cached test file.
    if args.online:
        print('recording....')
        recordfile = 'record.wav'
        my_record(recordfile, 16000, 2)
        print('recording end')
        (data, _) = pp_data.read_audio(recordfile, 16000)
    else:
        testfile = 'data_cache/test_speech/1568253725.587787.wav'
        (data, _) = pp_data.read_audio(testfile, 16000)

    mixed_complx_x = pp_data.calc_sp(data, mode='complex')
    mixed_x, mixed_phase = divide_magphase(mixed_complx_x, power=1)

    # Predict enhanced magnitude spectrogram.
    pred = model.predict(mixed_x)

    # Recover enhanced wav from predicted magnitude + mixture phase.
    hop_size = n_window - n_overlap
    # sqrt-Hann analysis/synthesis window for ISTFT.
    ham_win = np.sqrt(np.hanning(n_window))
    stft_reconstructed_clean = merge_magphase(pred, mixed_phase).T
    signal_reconstructed_clean = librosa.istft(
        stft_reconstructed_clean, hop_length=hop_size, window=ham_win)
    # Scale float signal to int16 range before writing.
    s = (signal_reconstructed_clean * 32768).astype('int16')

    # Write out enhanced wav.
    pp_data.write_audio('1568253725.587787ehs.wav', s, fs)
def predict_file(file_path, model, scaler):
    """Enhance a single mixture wav with a trained model.

    Args:
        file_path: str, path of the mixture wav to enhance.
        model: trained model mapping log-spectrogram segments to enhanced
            log-spectrogram frames.
        scaler: fitted scaler used to normalise / de-normalise features.

    Returns:
        (mixed_complex, pred, s): mixture complex spectrogram, de-scaled
        predicted log-spectrogram, and recovered time-domain signal.
    """
    (a, _) = pp.read_audio(file_path)
    mixed_complex = pp.calc_sp(a, 'complex')
    mixed_x = np.abs(mixed_complex)

    # Process data. Floor division so n_pad stays an int under Python 3.
    n_pad = (conf1.n_concat - 1) // 2
    mixed_x = pp.pad_with_border(mixed_x, n_pad)
    mixed_x = pp.log_sp(mixed_x)

    # Scale data.
    mixed_x = pp.scale_on_2d(mixed_x, scaler)

    # Cut input spectrogram to 3D segments with n_concat frames.
    mixed_x_3d = pp.mat_2d_to_3d(mixed_x, agg_num=conf1.n_concat, hop=1)

    # Predict.
    pred = model.predict(mixed_x_3d)
    if visualize_plot:
        visualize(mixed_x, pred)

    # Inverse scale.
    mixed_x = pp.inverse_scale_on_2d(mixed_x, scaler)
    pred = pp.inverse_scale_on_2d(pred, scaler)

    # Recover enhanced wav; the scaler compensates the amplitude change
    # introduced by the windowed spectrogram / IFFT round trip.
    pred_sp = np.exp(pred)
    s = recover_wav(pred_sp, mixed_complex, conf1.n_overlap, np.hamming)
    s *= np.sqrt((np.hamming(conf1.n_window) ** 2).sum())

    return mixed_complex, pred, s
def plot_fig4(data_type, audio_idx):
    """Plot figure 4 for one clip: log-Mel / linear spectrograms, ground
    truth masks, averaged predicted segmentation masks, and SED
    probabilities.

    Args:
        data_type: str, "train" or "test" — split in which to look up the
            clip.
        audio_idx: str, substring identifying the clip in the split's
            name list (also used to build the audio path).
    """
    workspace = cfg.workspace
    n_window = cfg.n_window
    n_overlap = cfg.n_overlap
    fs = cfg.sample_rate
    events = cfg.events
    te_fold = cfg.te_fold

    # Read audio.
    audio_path = os.path.join(
        workspace, "mixed_audio/n_events=3/%s.mixed_20db.wav" % audio_idx)
    (audio, _) = pp_data.read_audio(audio_path, fs)

    # Calculate log-Mel features and linear spectrogram.
    x = _calc_feat(audio)
    sp = _calc_spectrogram(audio)
    print(x.shape)

    # Plot log Mel spectrogram and spectrogram.  Hide all 16 panels first;
    # only the panels we fill are made visible.
    fig, axs = plt.subplots(4, 4, sharex=False)
    for i2 in xrange(16):
        # Floor division keeps integer indices under Python 3 as well.
        axs[i2 // 4, i2 % 4].set_visible(False)

    axs[0, 0].matshow(x.T, origin='lower', aspect='auto', cmap='jet')
    axs[0, 0].xaxis.set_ticks([0, 60, 120, 180, 239])
    axs[0, 0].xaxis.tick_bottom()
    axs[0, 0].xaxis.set_ticklabels(np.arange(0, 10.1, 2.5))
    axs[0, 0].set_xlabel("time (s)")
    axs[0, 0].yaxis.set_ticks([0, 16, 32, 48, 63])
    axs[0, 0].yaxis.set_ticklabels([0, 16, 32, 48, 63])
    axs[0, 0].set_ylabel('Mel freq. bin')
    axs[0, 0].set_title("Log Mel spectrogram")
    axs[0, 0].set_visible(True)

    axs[0, 2].matshow(np.log(sp.T + 1.), origin='lower', aspect='auto',
                      cmap='jet')
    axs[0, 2].xaxis.set_ticks([0, 60, 120, 180, 239])
    axs[0, 2].xaxis.tick_bottom()
    axs[0, 2].xaxis.set_ticklabels(np.arange(0, 10.1, 2.5))
    axs[0, 2].set_xlabel("time (s)")
    axs[0, 2].yaxis.set_ticks([0, 128, 256, 384, 512])
    axs[0, 2].yaxis.set_ticklabels([0, 128, 256, 384, 512])
    axs[0, 2].set_ylabel('FFT freq. bin')
    axs[0, 2].set_title("Spectrogram")
    axs[0, 2].set_visible(True)

    plt.show()

    # Load data.
    snr = 20
    n_events = 3
    feature_dir = os.path.join(workspace, "features", "logmel",
                               "n_events=%d" % n_events)
    yaml_dir = os.path.join(workspace, "mixed_audio",
                            "n_events=%d" % n_events)
    (tr_x, tr_at_y, tr_sed_y, tr_na_list, te_x, te_at_y, te_sed_y,
     te_na_list) = pp_data.load_data(feature_dir=feature_dir,
                                     yaml_dir=yaml_dir,
                                     te_fold=te_fold,
                                     snr=snr,
                                     is_scale=is_scale)
    if data_type == "train":
        x = tr_x
        at_y = tr_at_y
        sed_y = tr_sed_y
        na_list = tr_na_list
    elif data_type == "test":
        x = te_x
        at_y = te_at_y
        sed_y = te_sed_y
        na_list = te_na_list

    # Find the index of the clip whose name contains audio_idx.
    # NOTE(review): if nothing matches, `idx` stays undefined and the code
    # below raises NameError — presumably audio_idx always matches; verify.
    for (i1, na) in enumerate(na_list):
        if audio_idx in na:
            idx = i1
            print(idx)

    # Ground-truth mask from the stereo (event, noise) channels.
    (stereo_audio, _) = pp_data.read_stereo_audio(audio_path, target_fs=fs)
    event_audio = stereo_audio[:, 0]
    noise_audio = stereo_audio[:, 1]
    mixed_audio = event_audio + noise_audio
    ham_win = np.hamming(n_window)
    mixed_cmplx_sp = pp_data.calc_sp(mixed_audio, fs, ham_win, n_window,
                                     n_overlap)
    mixed_sp = np.abs(mixed_cmplx_sp)
    event_sp = np.abs(
        pp_data.calc_sp(event_audio, fs, ham_win, n_window, n_overlap))
    noise_sp = np.abs(
        pp_data.calc_sp(noise_audio, fs, ham_win, n_window, n_overlap))
    db = -5.
    # Ideal binary mask: 1 where event-to-noise ratio exceeds `db` dB.
    gt_mask = (np.sign(20 * np.log10(event_sp / noise_sp) - db) + 1.) / 2.  # (n_time, n_freq)

    fig, axs = plt.subplots(4, 4, sharex=True)
    for i2 in xrange(16):
        # Restrict the GT mask to frames where the class is active.
        ind_gt_mask = gt_mask * sed_y[idx, :, i2][:, None]
        axs[i2 // 4, i2 % 4].matshow(ind_gt_mask.T, origin='lower',
                                     aspect='auto', cmap='jet')
        axs[i2 // 4, i2 % 4].xaxis.set_ticks([])
        axs[i2 // 4, i2 % 4].yaxis.set_ticks([])
        axs[i2 // 4, i2 % 4].set_xlabel('time')
        axs[i2 // 4, i2 % 4].set_ylabel('FFT freq. bin')
    plt.show()

    for filename in ["tmp01", "tmp02", "tmp03"]:
        # Plot up-sampled segmentation masks, averaged over model
        # iterations.
        preds_dir = os.path.join(workspace, "preds", filename,
                                 "n_events=%d" % n_events,
                                 "fold=%d" % te_fold, "snr=%d" % snr)
        seg_masks_list = []
        bgn_iter, fin_iter, interval = 2000, 3001, 200
        for md_iter in xrange(bgn_iter, fin_iter, interval):
            seg_masks_path = os.path.join(preds_dir, "md%d_iters" % md_iter,
                                          "seg_masks.p")
            # Context manager ensures the pickle file handle is closed.
            with open(seg_masks_path, 'rb') as f:
                seg_masks_list.append(cPickle.load(f))
        seg_masks = np.mean(seg_masks_list, axis=0)  # (n_clips, n_classes, n_time, n_freq)
        print(at_y[idx])

        melW = librosa.filters.mel(sr=fs, n_fft=cfg.n_window, n_mels=64,
                                   fmin=0., fmax=fs / 2)
        inverse_melW = get_inverse_W(melW)
        spec_masks = np.dot(seg_masks[idx], inverse_melW)  # (n_classes, n_time, 513)

        fig, axs = plt.subplots(4, 4, sharex=True)
        for i2 in xrange(16):
            axs[i2 // 4, i2 % 4].matshow(spec_masks[i2].T, origin='lower',
                                         aspect='auto', vmin=0, vmax=1,
                                         cmap='jet')
            axs[i2 // 4, i2 % 4].xaxis.set_ticks([])
            axs[i2 // 4, i2 % 4].yaxis.set_ticks([])
            axs[i2 // 4, i2 % 4].set_xlabel('time')
            axs[i2 // 4, i2 % 4].set_ylabel('FFT freq. bin')
        fig.suptitle(filename)
        plt.show()

        # Plot SED probabilities: predicted (mask mean over frequency)
        # above the ground truth.
        sed_probs = np.mean(seg_masks[idx], axis=-1)  # (n_classes, n_time)
        fig, axs = plt.subplots(4, 4, sharex=False)
        for i2 in xrange(16):
            axs[i2 // 4, i2 % 4].set_visible(False)

        axs[0, 0].matshow(sed_probs, origin='lower', aspect='auto', vmin=0,
                          vmax=1, cmap='jet')
        axs[0, 0].xaxis.set_ticks([])
        axs[0, 0].yaxis.set_ticks(xrange(len(events)))
        axs[0, 0].yaxis.set_ticklabels(events)
        for tick in axs[0, 0].yaxis.get_major_ticks():
            tick.label.set_fontsize(8)
        axs[0, 0].set_visible(True)

        axs[1, 0].matshow(sed_y[idx].T, origin='lower', aspect='auto',
                          vmin=0, vmax=1, cmap='jet')
        axs[1, 0].xaxis.set_ticks([0, 60, 120, 180, 239])
        axs[1, 0].xaxis.tick_bottom()
        axs[1, 0].xaxis.set_ticklabels(np.arange(0, 10.1, 2.5))
        axs[1, 0].set_xlabel('time (s)')
        axs[1, 0].yaxis.set_ticks(xrange(len(events)))
        axs[1, 0].yaxis.set_ticklabels(events)
        for tick in axs[1, 0].yaxis.get_major_ticks():
            tick.label.set_fontsize(8)
        axs[1, 0].set_visible(True)

        fig.suptitle(filename)
        plt.show()
def _generate_split(dnn1_data, noise, n_files, out_folder, split_name):
    """Synthesize `n_files` noisy mixtures at random distances, optionally
    save a few example wavs, pack the spectrogram features and compute the
    scaler for the split.

    Args:
        dnn1_data: list of str, candidate clean-speech file paths
            (one per line, possibly with trailing newline).
        noise: array, noise waveform mixed into every file.
        n_files: int, number of mixtures to synthesize.
        out_folder: str, folder where example wavs are written.
        split_name: str, 'train' or 'test' — passed to pack_features and
            compute_scaler.

    Returns:
        int, number of packed feature frames reported by pack_features.
    """
    mixed_all = []
    clean_all = []
    snr1_list = []
    mixed_avg = []
    for n in range(n_files):
        current_file = (random.choice(dnn1_data)).rstrip()
        dist = random.uniform(1, 20)
        (clean, _) = pp.read_audio(current_file)
        mixed, noise_new, clean_new, snr = set_microphone_at_distance(
            clean, noise, conf1.fs, dist)
        snr1_list.append(snr)
        mixed_avg.append(np.mean(mixed))
        if n % 10 == 0:
            print(n)
        if conf1.save_single_files and n < conf1.n_files_to_save:
            # Random 5-char tag so repeated source files get unique names.
            sr = ''.join(
                random.choice(string.ascii_uppercase + string.digits)
                for _ in range(5))
            path_list = current_file.split(os.sep)
            mixed_name = "mix_%s_%s_%s" % (path_list[2], sr,
                                           os.path.basename(current_file))
            clean_name = "clean_%s_%s_%s" % (path_list[2], sr,
                                             os.path.basename(current_file))
            mixed_path = os.path.join(out_folder, mixed_name)
            clean_path = os.path.join(out_folder, clean_name)
            pp.write_audio(mixed_path, mixed, conf1.fs)
            pp.write_audio(clean_path, clean_new, conf1.fs)
        clean_spec = pp.calc_sp(clean_new, mode='magnitude')
        mixed_spec = pp.calc_sp(mixed, mode='complex')
        clean_all.append(clean_spec)
        mixed_all.append(mixed_spec)
    print(len(clean_all), ',', len(mixed_all))
    num = pp.pack_features(mixed_all, clean_all, split_name)
    compute_scaler(split_name)
    return num


def prepare_database():
    """Build the train and test feature databases from random noisy
    mixtures of the dnn1 file list with the configured noise.

    Returns:
        (num_tr, num_te): packed feature counts for train and test splits.
    """
    (noise, _) = pp.read_audio(conf1.noise_path)
    with open('dnn1/dnn1_files_list.txt') as f:
        dnn1_data = f.readlines()

    # The original train/test loops were near-identical copies; both now
    # go through _generate_split.
    num_tr = _generate_split(dnn1_data, noise, conf1.training_number,
                             conf1.train_folder, 'train')
    num_te = _generate_split(dnn1_data, noise, conf1.test_number,
                             conf1.test_folder, 'test')
    return num_tr, num_te
def separate(args, bgn_iter, fin_iter, interval):
    """Separate test clips into per-event and noise tracks using averaged
    predicted segmentation masks, write the wavs, and log per-event
    segmentation statistics.

    Args:
        args: argparse namespace with n_events and snr.
        bgn_iter: int, first model iteration whose masks are averaged.
        fin_iter: int, end (exclusive) of the iteration range.
        interval: int, step between iterations.
    """
    workspace = cfg.workspace
    events = cfg.events
    te_fold = cfg.te_fold
    n_events = args.n_events
    n_window = cfg.n_window
    n_overlap = cfg.n_overlap
    fs = cfg.sample_rate
    clip_duration = cfg.clip_duration
    snr = args.snr

    # Load ground truth data.
    feature_dir = os.path.join(workspace, "features", "logmel",
                               "n_events=%d" % n_events)
    yaml_dir = os.path.join(workspace, "mixed_audio",
                            "n_events=%d" % n_events)
    (tr_x, tr_at_y, tr_sed_y, tr_na_list, te_x, te_at_y, te_sed_y,
     te_na_list) = pp_data.load_data(feature_dir=feature_dir,
                                     yaml_dir=yaml_dir,
                                     te_fold=te_fold,
                                     snr=snr,
                                     is_scale=is_scale)
    at_y = te_at_y
    sed_y = te_sed_y
    na_list = te_na_list

    # Load and average segmentation masks over the model iterations.
    preds_dir = os.path.join(workspace, "preds",
                             pp_data.get_filename(__file__),
                             "n_events=%d" % n_events, "fold=%d" % te_fold,
                             "snr=%d" % snr)
    seg_masks_list = []
    for md_iter in xrange(bgn_iter, fin_iter, interval):
        seg_masks_path = os.path.join(preds_dir, "md%d_iters" % md_iter,
                                      "seg_masks.p")
        # Context manager ensures the pickle file handle is closed.
        with open(seg_masks_path, 'rb') as f:
            seg_masks_list.append(cPickle.load(f))
    seg_masks = np.mean(seg_masks_list, axis=0)  # (n_clips, n_classes, n_time, n_freq)
    print(seg_masks.shape)

    # Directory of the mixture audio.  This assignment was commented out
    # in the original although `audio_dir` is used below (NameError) —
    # restored here.
    audio_dir = os.path.join(workspace, "mixed_audio",
                             "n_events=%d" % n_events)
    sep_dir = os.path.join(workspace, "sep_audio",
                           pp_data.get_filename(__file__),
                           "n_events=%d" % n_events, "fold=%d" % te_fold,
                           "snr=%d" % snr)
    pp_data.create_folder(sep_dir)

    ham_win = np.hamming(n_window)
    # Compensates the amplitude change of the windowed STFT/ISTFT.
    recover_scaler = np.sqrt((ham_win ** 2).sum())
    melW = librosa.filters.mel(sr=fs, n_fft=n_window, n_mels=64, fmin=0.,
                               fmax=fs / 2)
    inverse_melW = get_inverse_W(melW)  # (64, 513)

    # Per-event accumulators for segmentation statistics.
    seg_stats = {}
    for e in events:
        seg_stats[e] = {'fvalue': [], 'auc': [], 'iou': [], 'hit': [],
                        'fa': [], 'tp': [], 'fn': [], 'fp': []}

    cnt = 0
    for (i1, na) in enumerate(na_list):
        bare_na = os.path.splitext(na)[0]
        audio_path = os.path.join(audio_dir, "%s.wav" % bare_na)
        (stereo_audio, _) = pp_data.read_stereo_audio(audio_path,
                                                      target_fs=fs)
        event_audio = stereo_audio[:, 0]
        noise_audio = stereo_audio[:, 1]
        mixed_audio = event_audio + noise_audio

        mixed_cmplx_sp = pp_data.calc_sp(mixed_audio, fs, ham_win, n_window,
                                         n_overlap)
        mixed_sp = np.abs(mixed_cmplx_sp)
        event_sp = np.abs(
            pp_data.calc_sp(event_audio, fs, ham_win, n_window, n_overlap))
        noise_sp = np.abs(
            pp_data.calc_sp(noise_audio, fs, ham_win, n_window, n_overlap))

        sm = seg_masks[i1]  # (n_classes, n_time, n_freq)
        sm_upsampled = np.dot(sm, inverse_melW)  # (n_classes, n_time, 513)
        print(na)

        # Write out separated events (only classes present in the clip).
        for j1 in xrange(len(events)):
            if at_y[i1][j1] == 1:
                (fvalue, auc, iou, tp, fn, fp) = fvalue_iou(
                    sm_upsampled[j1], event_sp, noise_sp, sed_y[i1, :, j1],
                    seg_thres, inside_only=True)
                (hit, fa) = hit_fa(sm_upsampled[j1], event_sp, noise_sp,
                                   sed_y[i1, :, j1], seg_thres,
                                   inside_only=True)
                seg_stats[events[j1]]['fvalue'].append(fvalue)
                seg_stats[events[j1]]['auc'].append(auc)
                seg_stats[events[j1]]['iou'].append(iou)
                seg_stats[events[j1]]['hit'].append(hit)
                seg_stats[events[j1]]['fa'].append(fa)
                seg_stats[events[j1]]['tp'].append(tp)
                seg_stats[events[j1]]['fn'].append(fn)
                seg_stats[events[j1]]['fp'].append(fp)

                sep_event_sp = sm_upsampled[j1] * mixed_sp
                sep_event_s = spectrogram_to_wave.recover_wav(
                    sep_event_sp, mixed_cmplx_sp, n_overlap=n_overlap,
                    winfunc=np.hamming, wav_len=int(fs * clip_duration))
                sep_event_s *= recover_scaler
                out_event_audio_path = os.path.join(
                    sep_dir, "%s.%s.wav" % (bare_na, events[j1]))
                pp_data.write_audio(out_event_audio_path, sep_event_s, fs)

        # Write out separated noise: whatever is not claimed by any event.
        sm_noise_upsampled = np.clip(1. - np.sum(sm_upsampled, axis=0),
                                     0., 1.)
        sep_noise_sp = sm_noise_upsampled * mixed_sp
        sep_noise_s = spectrogram_to_wave.recover_wav(
            sep_noise_sp, mixed_cmplx_sp, n_overlap=n_overlap,
            winfunc=np.hamming, wav_len=int(fs * clip_duration))
        sep_noise_s *= recover_scaler
        out_noise_audio_path = os.path.join(sep_dir,
                                            "%s.noise.wav" % bare_na)
        pp_data.write_audio(out_noise_audio_path, sep_noise_s, fs)

        cnt += 1
        # if cnt == 2: break

    # Aggregate and log per-event statistics.
    fvalues, aucs, ious, hits, fas, tps, fns, fps = \
        [], [], [], [], [], [], [], []
    for e in events:
        fvalues.append(np.mean(seg_stats[e]['fvalue']))
        ious.append(np.mean(seg_stats[e]['iou']))
        aucs.append(np.mean(seg_stats[e]['auc']))
        hits.append(np.mean(seg_stats[e]['hit']))
        fas.append(np.mean(seg_stats[e]['fa']))
        tps.append(np.mean(seg_stats[e]['tp']))
        fns.append(np.mean(seg_stats[e]['fn']))
        fps.append(np.mean(seg_stats[e]['fp']))

    logging.info("%sfvalue\tauc\tiou\tHit\tFa\tHit-Fa\tTP\tFN\tFP"
                 % ("".ljust(16)))
    logging.info(
        "%s*%.3f\t*%.3f\t*%.3f\t*%.3f\t*%.3f\t*%.3f\t*%.3f\t*%.3f\t*%.3f"
        % ("*Avg. of each".ljust(16), np.mean(fvalues), np.mean(aucs),
           np.mean(ious), np.mean(hits), np.mean(fas),
           np.mean(hits) - np.mean(fas), np.mean(tps), np.mean(fns),
           np.mean(fps)))
    for i1 in xrange(len(events)):
        logging.info(
            "%s%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f"
            % (events[i1].ljust(16), fvalues[i1], aucs[i1], ious[i1],
               hits[i1], fas[i1], hits[i1] - fas[i1], tps[i1], fns[i1],
               fps[i1]))
def inference_wiener(args):
    """Enhance test mixtures with a Wiener filter built from two DNN
    magnitude estimators (speech and noise), writing enhanced wavs to disk.

    Args:
        args: argparse namespace with workspace, iteration, stack_num,
            filename, mini_num, visualize and use_cuda fields.
    """
    workspace = args.workspace
    model_iter = args.iteration
    stack_num = args.stack_num
    filename = args.filename
    mini_num = args.mini_num
    visualize = args.visualize
    cuda = args.use_cuda and torch.cuda.is_available()
    print("cuda:", cuda)

    sample_rate = cfg.sample_rate
    fft_size = cfg.fft_size
    hop_size = cfg.hop_size
    window_type = cfg.window_type

    # NOTE(review): `window` is only defined for the 'hamming' type; any
    # other window_type raises NameError below — confirm cfg only uses it.
    if window_type == 'hamming':
        window = np.hamming(fft_size)

    # Audio.
    audio_dir = "/vol/vssp/msos/qk/workspaces/speech_enhancement/mixed_audios/spectrogram/test/0db"
    names = os.listdir(audio_dir)

    # Load one model per target (speech and noise magnitude estimators).
    target_type = ['speech', 'noise']
    model_dict = {}
    for e in target_type:
        n_freq = 257
        model = DNN(stack_num, n_freq)
        model_path = os.path.join(workspace, "models", filename, e,
                                  "md_%d_iters.tar" % model_iter)
        checkpoint = torch.load(model_path)
        model.load_state_dict(checkpoint['state_dict'])

        # Move model to GPU and switch to evaluation mode.
        if cuda:
            model.cuda()
        model.eval()
        model_dict[e] = model

    # Load scaler statistics; context manager closes the pickle file.
    scalar_path = os.path.join(workspace, "scalars", filename, "scalar.p")
    with open(scalar_path, 'rb') as f:
        (mean_, std_) = cPickle.load(f)
    mean_ = move_data_to_gpu(mean_, cuda, volatile=True)
    std_ = move_data_to_gpu(std_, cuda, volatile=True)

    # Process every n_every-th file.  max(1, ...) guards against
    # mini_num > len(names), which previously made n_every zero and
    # crashed the modulo below.
    if mini_num > 0:
        n_every = max(1, len(names) // mini_num)
    else:
        n_every = 1

    out_wav_dir = os.path.join(workspace, "enh_wavs", filename)
    pp_data.create_folder(out_wav_dir)

    for (cnt, name) in enumerate(names):
        if cnt % n_every == 0:
            audio_path = os.path.join(audio_dir, name)
            (audio, _) = pp_data.read_audio(audio_path, sample_rate)
            audio = pp_data.normalize(audio)
            cmplx_sp = pp_data.calc_sp(audio, fft_size, hop_size, window)
            x = np.abs(cmplx_sp)

            # Process data. Floor division keeps n_pad an int on Python 3.
            n_pad = (stack_num - 1) // 2
            x = pp_data.pad_with_border(x, n_pad)
            x = pp_data.mat_2d_to_3d(x, stack_num, hop=1)

            # Predict speech and noise magnitudes.
            pred_dict = {}
            for e in target_type:
                pred = forward(model_dict[e], x, mean_, std_, cuda)
                pred = pred.data.cpu().numpy()
                pred_dict[e] = pred
            print(cnt, name)

            # Wiener filter: speech / (speech + noise) gain on the mixture
            # magnitude.
            pred_mag_sp = pred_dict['speech'] / (
                pred_dict['speech'] + pred_dict['noise']) * np.abs(cmplx_sp)

            # Recombine with mixture phase and overlap-add back to time
            # domain.
            pred_cmplx_sp = stft.real_to_complex(pred_mag_sp, cmplx_sp)
            frames = stft.istft(pred_cmplx_sp)
            cola_constant = stft.get_cola_constant(hop_size, window)
            seq = stft.overlap_add(frames, hop_size, cola_constant)
            seq = seq[0:len(audio)]

            # Write out wav.
            out_wav_path = os.path.join(out_wav_dir, name)
            pp_data.write_audio(out_wav_path, seq, sample_rate)
            print("Write out wav to: %s" % out_wav_path)

            if visualize:
                fig, axs = plt.subplots(3, 1, sharex=True)
                axs[0].matshow(np.log(np.abs(cmplx_sp)).T, origin='lower',
                               aspect='auto', cmap='jet')
                axs[1].matshow(np.log(np.abs(pred_dict['speech'])).T,
                               origin='lower', aspect='auto', cmap='jet')
                axs[2].matshow(np.log(np.abs(pred_dict['noise'])).T,
                               origin='lower', aspect='auto', cmap='jet')
                plt.show()
def inference(args):
    """Run a single trained DNN over test mixtures and (optionally) plot
    the input and predicted log-spectrograms.

    Args:
        args: argparse namespace with workspace, model_name, stack_num,
            filename, mini_num, visualize and use_cuda fields.
    """
    workspace = args.workspace
    model_name = args.model_name
    stack_num = args.stack_num
    filename = args.filename
    mini_num = args.mini_num
    visualize = args.visualize
    cuda = args.use_cuda and torch.cuda.is_available()
    print("cuda:", cuda)

    sample_rate = cfg.sample_rate
    fft_size = cfg.fft_size
    hop_size = cfg.hop_size
    window_type = cfg.window_type

    # NOTE(review): `window` is only defined for 'hamming' — other window
    # types raise NameError below; confirm cfg only uses hamming.
    if window_type == 'hamming':
        window = np.hamming(fft_size)

    # Audio.
    audio_dir = "/vol/vssp/msos/qk/workspaces/speech_enhancement/mixed_audios/spectrogram/test/0db"
    names = os.listdir(audio_dir)

    # Load model.
    model_path = os.path.join(workspace, "models", filename, model_name)
    n_freq = 257
    model = DNN(stack_num, n_freq)
    checkpoint = torch.load(model_path)
    model.load_state_dict(checkpoint['state_dict'])
    if cuda:
        model.cuda()
    # Switch off dropout / batch-norm training behaviour, consistent with
    # inference_wiener().
    model.eval()

    # Load scaler statistics; context manager closes the pickle file.
    scalar_path = os.path.join(workspace, "scalars", filename, "scalar.p")
    with open(scalar_path, 'rb') as f:
        (mean_, std_) = cPickle.load(f)
    mean_ = move_data_to_gpu(mean_, cuda, volatile=True)
    std_ = move_data_to_gpu(std_, cuda, volatile=True)

    # max(1, ...) guards against mini_num > len(names) making n_every 0.
    if mini_num > 0:
        n_every = max(1, len(names) // mini_num)
    else:
        n_every = 1

    for (cnt, name) in enumerate(names):
        if cnt % n_every == 0:
            audio_path = os.path.join(audio_dir, name)
            (audio, _) = pp_data.read_audio(audio_path, sample_rate)
            audio = pp_data.normalize(audio)
            sp = pp_data.calc_sp(audio, fft_size, hop_size, window)
            x = np.abs(sp)

            # Process data. Floor division keeps n_pad an int on Python 3.
            n_pad = (stack_num - 1) // 2
            x = pp_data.pad_with_border(x, n_pad)
            x = pp_data.mat_2d_to_3d(x, stack_num, hop=1)

            output = forward(model, x, mean_, std_, cuda)
            output = output.data.cpu().numpy()
            print(output.shape)

            if visualize:
                fig, axs = plt.subplots(2, 1, sharex=True)
                axs[0].matshow(np.log(np.abs(sp)).T, origin='lower',
                               aspect='auto', cmap='jet')
                axs[1].matshow(np.log(np.abs(output)).T, origin='lower',
                               aspect='auto', cmap='jet')
                plt.show()
            # Removed leftover debugging trap ("import crash; pause") that
            # raised an error after the first processed file.
def dab_run(snr_list, file_name="dab_out", mode='dab'):
    """Run the multi-channel enhancement pipeline: per-channel DNN
    enhancement, SNR-based channel re-weighting ('dab' mode) or no
    re-weighting ('db' mode), then MVDR beamforming, and write the result.

    Args:
        snr_list: sequence of per-channel SNR estimates.
        file_name: str, name of the reference/output wav file.
        mode: str, 'dab' (re-weighted) or 'db' (plain).
    """
    output_file_folder = os.path.join("data_eval", mode)

    # Remove previous enhancements.
    for fname in os.listdir(os.path.join("data_eval", "dnn1_out")):
        file_path = os.path.join("data_eval", "dnn1_out", fname)
        os.remove(file_path)

    dnn1_inputs, dnn1_outputs = dnn1.predict_folder(
        os.path.join("data_eval", "dnn1_in"),
        os.path.join("data_eval", "dnn1_out"))

    # Reload the enhanced channels as complex spectrograms.
    names = [
        f for f in sorted(os.listdir(os.path.join("data_eval", "dnn1_out")))
        if f.startswith("enh")
    ]
    dnn1_outputs = []
    for (cnt, na) in enumerate(names):
        file_path = os.path.join("data_eval", "dnn1_out", na)
        (a, _) = pp.read_audio(file_path)
        enh_complex = pp.calc_sp(a, 'complex')
        dnn1_outputs.append(enh_complex)

    # Convert SNRs to signal-to-(signal+noise) ratios.  Float literals
    # guard against integer truncation if snr_list holds ints (Python 2
    # integer division would make every entry 0).
    s2nrs = snr_list * 1  # shallow copy; leaves the caller's list intact
    for i in range(len(snr_list)):
        s2nrs[i] = 1.0 / (1.0 + 1.0 / snr_list[i])

    ch_rw_outputs = []
    if mode == 'dab':
        # Re-weight each enhanced channel by its estimated reliability.
        new_weights = channel_weights(s2nrs)
        print(new_weights)
        for i, p in zip(dnn1_outputs, new_weights):
            ch_rw_outputs.append(p * i)
    if mode == 'db':
        # Plain mode: skip re-weighting.
        new_weights = s2nrs
        print(new_weights)
        ch_rw_outputs = dnn1_outputs

    # Execute MVDR beamforming.
    final = mvdr(dnn1_inputs, ch_rw_outputs)

    (init, _) = pp.read_audio(
        os.path.join('data_eval', 'test_speech', file_name))
    init_sp = pp.calc_sp(init, mode='complex')
    visualize(dnn1_colors(np.abs(init_sp)), dnn1_colors(np.abs(final)),
              "source amplitude", "final amplitude")

    # Recover and save enhanced wav; the scaler compensates the amplitude
    # change of the windowed spectrogram / IFFT round trip.
    pp.create_folder(output_file_folder)
    s = recover_wav_complex(final, conf1.n_overlap, np.hamming)
    s *= np.sqrt((np.hamming(conf1.n_window) ** 2).sum())
    audio_path = os.path.join(output_file_folder, file_name)
    pp.write_audio(audio_path, s, conf1.sample_rate)
    print('%s done' % mode)
def predict_folder(input_file_folder, output_file_folder):
    """Enhance every "mix*" wav in a folder and write "enh_*" outputs.

    Args:
        input_file_folder: str, folder containing mixture wavs named
            "mix*".
        output_file_folder: str, destination folder for enhanced wavs.

    Returns:
        (mixed_all, pred_all): lists of mixture complex spectrograms and
        predicted complex spectrograms, one entry per processed file.
    """
    # Load model.
    data_type = "test"
    model_path = os.path.join(conf1.model_dir,
                              "md_%diters.h5" % conf1.iterations)
    model = load_model(model_path)

    # Load scaler; context manager closes the pickle file handle.
    scaler_path = os.path.join(conf1.packed_feature_dir, data_type,
                               "scaler.p")
    with open(scaler_path, 'rb') as f:
        scaler = pickle.load(f)

    # Load test data.
    names = [f for f in sorted(os.listdir(input_file_folder))
             if f.startswith("mix")]
    mixed_all = []
    pred_all = []
    for (cnt, na) in enumerate(names):
        # Load feature.
        file_path = os.path.join(input_file_folder, na)
        (a, _) = pp.read_audio(file_path)
        mixed_complex = pp.calc_sp(a, 'complex')
        mixed_x = np.abs(mixed_complex)

        # Process data. Floor division keeps n_pad an int on Python 3.
        n_pad = (conf1.n_concat - 1) // 2
        mixed_x = pp.pad_with_border(mixed_x, n_pad)
        mixed_x = pp.log_sp(mixed_x)

        # Scale data.
        mixed_x = pp.scale_on_2d(mixed_x, scaler)

        # Cut input spectrogram to 3D segments with n_concat frames.
        mixed_x_3d = pp.mat_2d_to_3d(mixed_x, agg_num=conf1.n_concat, hop=1)

        # Predict.
        pred = model.predict(mixed_x_3d)
        print(cnt, na)

        # Inverse scale.
        mixed_x = pp.inverse_scale_on_2d(mixed_x, scaler)
        pred = pp.inverse_scale_on_2d(pred, scaler)

        # Debug plot.
        if visualize_plot:
            visualize(mixed_x, pred)

        mixed_all.append(mixed_complex)
        pred_all.append(real_to_complex(pred, mixed_complex))

        # Recover enhanced wav; the scaler compensates the amplitude
        # change of the windowed spectrogram / IFFT round trip.
        pred_sp = np.exp(pred)
        s = recover_wav(pred_sp, mixed_complex, conf1.n_overlap, np.hamming)
        s *= np.sqrt((np.hamming(conf1.n_window) ** 2).sum())

        # Write out enhanced wav.
        pp.create_folder(output_file_folder)
        audio_path = os.path.join(output_file_folder, "enh_%s" % na)
        pp.write_audio(audio_path, s, conf1.sample_rate)

    return mixed_all, pred_all