def plot_fig5(data_type, audio_idx):
    """Plot ground-truth versus separated waveforms for one 3-event mixture.

    The left column shows the mixture, the three 2.5 s slices of the event
    channel and the noise channel; the right column (rows 1-4) shows the
    separated event and noise waveforms read from disk.

    Args:
      data_type: not used by this function.
      audio_idx: clip identifier; inserted into the mixture file name and
        used as the glob prefix for the separated audio files.
    """
    workspace = cfg.workspace
    fs = cfg.sample_rate

    # Ground-truth stereo file: channel 0 is used as events, channel 1 as noise.
    wav_path = os.path.join(
        workspace, "mixed_audio/n_events=3/%s.mixed_20db.wav" % audio_idx)
    stereo, _ = pp_data.read_stereo_audio(wav_path, fs)
    event_audio = stereo[:, 0]
    noise_audio = stereo[:, 1]
    mixed_audio = (event_audio + noise_audio) / 2

    # Cut the event channel into three consecutive 2.5 s windows; samples
    # outside a window stay zero so every trace keeps the full clip length.
    bounds = [0, int(fs * 2.5), int(fs * 5.), int(fs * 7.5)]
    gt_events = []
    for (start, stop) in zip(bounds[:-1], bounds[1:]):
        piece = np.zeros_like(event_audio)
        piece[start:stop] = event_audio[start:stop]
        gt_events.append(piece)

    # NOTE(review): absolute, machine-specific location of the separated audio.
    sep_dir = "/vol/vssp/msos/qk/workspaces/weak_source_separation/dcase2013_task2/sep_audio/tmp01/n_events=3/fold=0/snr=20"
    sep_paths = glob.glob(os.path.join(sep_dir, "%s*" % audio_idx))
    print([os.path.basename(e) for e in sep_paths])

    # NOTE(review): the fixed indices below depend on the order in which glob
    # returns the files; check the printed names against the intended rows.
    separated = [pp_data.read_audio(sep_paths[k])[0] for k in (3, 0, 1, 2)]

    fig, axs = plt.subplots(5, 2, sharex=True)

    left_column = [mixed_audio] + gt_events + [noise_audio]
    for (row, wave) in enumerate(left_column):
        axs[row, 0].plot(wave)

    # The right column has no counterpart of the mixture, so it starts at row 1.
    for (row, wave) in enumerate(separated, 1):
        axs[row, 1].plot(wave)

    T = len(noise_audio)
    for ax in axs.flat:
        ax.axis([0, T, -1, 1])
        ax.xaxis.set_ticks([])
        ax.yaxis.set_ticks([])
        ax.set_ylabel("Amplitude")

    plt.show()
def evaluate_separation(args):
    """Score separated test audio against ground truth and pickle the result.

    For every test clip, and for every event whose audio-tagging target is 1
    for that clip, the separated event and separated noise wavs are read from
    the workspace "sep_audio" folder and compared with the two channels of
    the ground-truth mixture file through ``sdr_sir_sar``. The per-event
    lists of SDR / SIR / SAR values are printed and dumped to
    ``<workspace>/sep_stats/<script>/n_events=*/fold=*/snr=*/sep_stat.p``.

    Args:
      args: object providing the integer attributes ``n_events`` and ``snr``.
    """
    workspace = cfg.workspace
    events = cfg.events
    te_fold = cfg.te_fold
    fs = cfg.sample_rate
    n_events = args.n_events
    snr = args.snr
    script_name = pp_data.get_filename(__file__)

    # Load ground truth data. Only the test split is evaluated.
    # NOTE: `is_scale` is not defined in this function; it is expected to be
    # a module-level name.
    feature_dir = os.path.join(workspace, "features", "logmel",
                               "n_events=%d" % n_events)
    yaml_dir = os.path.join(workspace, "mixed_audio", "n_events=%d" % n_events)
    (_, _, _, _, _, at_y, sed_y, na_list) = pp_data.load_data(
        feature_dir=feature_dir,
        yaml_dir=yaml_dir,
        te_fold=te_fold,
        snr=snr,
        is_scale=is_scale)

    audio_dir = os.path.join(workspace, "mixed_audio", "n_events=%d" % n_events)
    sep_dir = os.path.join(workspace, "sep_audio", script_name,
                           "n_events=%d" % n_events, "fold=%d" % te_fold,
                           "snr=%d" % snr)

    sep_stats = {e: {'sdr': [], 'sir': [], 'sar': []} for e in events}

    for (i1, na) in enumerate(na_list):
        bare_na = os.path.splitext(na)[0]
        gt_audio_path = os.path.join(audio_dir, "%s.wav" % bare_na)
        (stereo_audio, _) = pp_data.read_stereo_audio(gt_audio_path,
                                                      target_fs=fs)
        # Reference sources: channel 0 is used as events, channel 1 as noise.
        ref_array = np.array((stereo_audio[:, 0], stereo_audio[:, 1]))
        print(na)

        # The separated noise file is the same for every event of a clip, so
        # it is read at most once, and only if the clip contains an event.
        sep_noise_audio = None

        for (j1, event) in enumerate(events):
            if at_y[i1][j1] != 1:
                continue

            sep_event_audio_path = os.path.join(
                sep_dir, "%s.%s.wav" % (bare_na, event))
            (sep_event_audio, _) = pp_data.read_audio(sep_event_audio_path,
                                                      target_fs=fs)

            if sep_noise_audio is None:
                sep_noise_audio_path = os.path.join(
                    sep_dir, "%s.noise.wav" % bare_na)
                (sep_noise_audio, _) = pp_data.read_audio(
                    sep_noise_audio_path, target_fs=fs)

            est_array = np.array((sep_event_audio, sep_noise_audio))
            (sdr, sir, sar) = sdr_sir_sar(ref_array, est_array,
                                          sed_y[i1, :, j1], inside_only=True)
            print(sdr, sir, sar)

            sep_stats[event]['sdr'].append(sdr)
            sep_stats[event]['sir'].append(sir)
            sep_stats[event]['sar'].append(sar)

    print(sep_stats)

    sep_stat_path = os.path.join(workspace, "sep_stats", script_name,
                                 "n_events=%d" % n_events,
                                 "fold=%d" % te_fold, "snr=%d" % snr,
                                 "sep_stat.p")
    pp_data.create_folder(os.path.dirname(sep_stat_path))
    # Use a context manager so the file is flushed and closed deterministically.
    with open(sep_stat_path, 'wb') as f:
        cPickle.dump(sep_stats, f)
def plot_fig4(data_type, audio_idx):
    """Plot spectrograms, ground-truth masks and predicted masks of one clip.

    Figures are shown one after another with ``plt.show()``:

    1. log Mel spectrogram and spectrogram of the mixture file;
    2. a 4 x 4 grid of ground-truth masks, one panel per class;
    3. for each of the runs "tmp01", "tmp02", "tmp03": a 4 x 4 grid of the
       predicted masks mapped to FFT bins, then the frame-wise probabilities
       (mean of the masks over frequency) above the ground-truth SED target.

    Args:
      data_type: "train" or "test"; selects which split of the loaded data
        the clip is looked up in.
      audio_idx: clip identifier; inserted into the mixture file name and
        matched as a substring against the names of the selected split.

    Raises:
      ValueError: if `data_type` is neither "train" nor "test", or if no
        name in the selected split contains `audio_idx`.
    """
    if data_type not in ("train", "test"):
        raise ValueError(
            "data_type must be 'train' or 'test', got %r" % (data_type,))

    workspace = cfg.workspace
    n_window = cfg.n_window
    n_overlap = cfg.n_overlap
    fs = cfg.sample_rate
    events = cfg.events
    te_fold = cfg.te_fold

    def _set_time_ticks(ax):
        # Five ticks over frames 0..239, labelled 0, 2.5, ..., 10 seconds.
        ax.xaxis.set_ticks([0, 60, 120, 180, 239])
        ax.xaxis.tick_bottom()
        ax.xaxis.set_ticklabels(np.arange(0, 10.1, 2.5))
        ax.set_xlabel("time (s)")

    def _set_event_ticks(ax):
        # One y tick per event class, labelled with the event name.
        ax.yaxis.set_ticks(range(len(events)))
        ax.yaxis.set_ticklabels(events)
        for tick in ax.yaxis.get_major_ticks():
            # `label1` is the primary tick label on all matplotlib versions.
            tick.label1.set_fontsize(8)

    def _style_mask_panel(ax):
        ax.xaxis.set_ticks([])
        ax.yaxis.set_ticks([])
        ax.set_xlabel('time')
        ax.set_ylabel('FFT freq. bin')

    # Read audio.
    audio_path = os.path.join(
        workspace, "mixed_audio/n_events=3/%s.mixed_20db.wav" % audio_idx)
    (audio, _) = pp_data.read_audio(audio_path, fs)

    # Calculate log Mel.
    x = _calc_feat(audio)
    sp = _calc_spectrogram(audio)
    print(x.shape)

    # Plot. Only two panels of the 4 x 4 grid are used; the rest are hidden.
    fig, axs = plt.subplots(4, 4, sharex=False)
    for ax in axs.flat:
        ax.set_visible(False)

    # Plot log Mel spectrogram.
    ax = axs[0, 0]
    ax.matshow(x.T, origin='lower', aspect='auto', cmap='jet')
    _set_time_ticks(ax)
    ax.yaxis.set_ticks([0, 16, 32, 48, 63])
    ax.yaxis.set_ticklabels([0, 16, 32, 48, 63])
    ax.set_ylabel('Mel freq. bin')
    ax.set_title("Log Mel spectrogram")
    ax.set_visible(True)

    # Plot spectrogram.
    ax = axs[0, 2]
    ax.matshow(np.log(sp.T + 1.), origin='lower', aspect='auto', cmap='jet')
    _set_time_ticks(ax)
    ax.yaxis.set_ticks([0, 128, 256, 384, 512])
    ax.yaxis.set_ticklabels([0, 128, 256, 384, 512])
    ax.set_ylabel('FFT freq. bin')
    ax.set_title("Spectrogram")
    ax.set_visible(True)

    plt.show()

    # Load data.
    # NOTE: `is_scale` is not defined in this function; it is expected to be
    # a module-level name.
    snr = 20
    n_events = 3
    feature_dir = os.path.join(workspace, "features", "logmel",
                               "n_events=%d" % n_events)
    yaml_dir = os.path.join(workspace, "mixed_audio", "n_events=%d" % n_events)
    (_, tr_at_y, tr_sed_y, tr_na_list,
     _, te_at_y, te_sed_y, te_na_list) = pp_data.load_data(
         feature_dir=feature_dir,
         yaml_dir=yaml_dir,
         te_fold=te_fold,
         snr=snr,
         is_scale=is_scale)

    if data_type == "train":
        (at_y, sed_y, na_list) = (tr_at_y, tr_sed_y, tr_na_list)
    else:
        (at_y, sed_y, na_list) = (te_at_y, te_sed_y, te_na_list)

    # Locate the clip. As before, the last matching name wins.
    idx = None
    for (i1, na) in enumerate(na_list):
        if audio_idx in na:
            idx = i1
    if idx is None:
        raise ValueError(
            "no %s clip name contains %r" % (data_type, audio_idx))
    print(idx)

    # GT mask
    (stereo_audio, _) = pp_data.read_stereo_audio(audio_path, target_fs=fs)
    event_audio = stereo_audio[:, 0]
    noise_audio = stereo_audio[:, 1]

    ham_win = np.hamming(n_window)
    event_sp = np.abs(
        pp_data.calc_sp(event_audio, fs, ham_win, n_window, n_overlap))
    noise_sp = np.abs(
        pp_data.calc_sp(noise_audio, fs, ham_win, n_window, n_overlap))

    # A bin is 1 where the event-to-noise ratio exceeds `db` decibels, 0 where
    # it is below and 0.5 where it is exactly equal. Bins whose noise
    # magnitude is zero yield inf/nan in the ratio.
    db = -5.
    gt_mask = (np.sign(20 * np.log10(event_sp / noise_sp) - db) + 1.) / 2.  # (n_time, n_freq)

    fig, axs = plt.subplots(4, 4, sharex=True)
    # `axs.flat` walks the grid row by row, so panel i2 is row i2 // 4,
    # column i2 % 4, without relying on integer `/`.
    for (i2, ax) in enumerate(axs.flat):
        # Keep the mask only on frames where class i2 is active.
        ind_gt_mask = gt_mask * sed_y[idx, :, i2][:, None]
        ax.matshow(ind_gt_mask.T, origin='lower', aspect='auto', cmap='jet')
        _style_mask_panel(ax)
    plt.show()

    # The mel filter bank and its inverse do not depend on the run, so they
    # are built once instead of once per run.
    melW = librosa.filters.mel(sr=fs,
                               n_fft=n_window,
                               n_mels=64,
                               fmin=0.,
                               fmax=fs / 2)
    inverse_melW = get_inverse_W(melW)

    for filename in ["tmp01", "tmp02", "tmp03"]:
        # Plot up sampled seg masks.
        preds_dir = os.path.join(workspace, "preds", filename,
                                 "n_events=%d" % n_events,
                                 "fold=%d" % te_fold, "snr=%d" % snr)

        # Average the masks predicted by the checkpoints at these iterations.
        seg_masks_list = []
        bgn_iter, fin_iter, interval = 2000, 3001, 200
        for iteration in range(bgn_iter, fin_iter, interval):
            seg_masks_path = os.path.join(preds_dir,
                                          "md%d_iters" % iteration,
                                          "seg_masks.p")
            with open(seg_masks_path, 'rb') as f:
                seg_masks_list.append(cPickle.load(f))
        seg_masks = np.mean(seg_masks_list, axis=0)  # (n_clips, n_classes, n_time, n_freq)

        print(at_y[idx])

        spec_masks = np.dot(seg_masks[idx], inverse_melW)  # (n_classes, n_time, 513)

        fig, axs = plt.subplots(4, 4, sharex=True)
        for (i2, ax) in enumerate(axs.flat):
            ax.matshow(spec_masks[i2].T,
                       origin='lower',
                       aspect='auto',
                       vmin=0,
                       vmax=1,
                       cmap='jet')
            _style_mask_panel(ax)
        fig.suptitle(filename)
        plt.show()

        # Plot SED probs.
        sed_probs = np.mean(seg_masks[idx], axis=-1)  # (n_classes, n_time)
        fig, axs = plt.subplots(4, 4, sharex=False)
        for ax in axs.flat:
            ax.set_visible(False)

        # Predicted frame-wise probabilities.
        ax = axs[0, 0]
        ax.matshow(sed_probs,
                   origin='lower',
                   aspect='auto',
                   vmin=0,
                   vmax=1,
                   cmap='jet')
        ax.xaxis.set_ticks([])
        _set_event_ticks(ax)
        ax.set_visible(True)

        # Ground-truth frame-wise targets.
        ax = axs[1, 0]
        ax.matshow(sed_y[idx].T,
                   origin='lower',
                   aspect='auto',
                   vmin=0,
                   vmax=1,
                   cmap='jet')
        _set_time_ticks(ax)
        _set_event_ticks(ax)
        ax.set_visible(True)

        fig.suptitle(filename)
        plt.show()
def separate(args, bgn_iter, fin_iter, interval):
    """Separate the test mixtures with averaged masks and log mask statistics.

    The segmentation masks pickled under
    ``<workspace>/preds/<script>/.../md<iter>_iters/seg_masks.p`` are averaged
    over ``range(bgn_iter, fin_iter, interval)``. For every test clip the
    masks are mapped from mel bins to FFT bins with ``get_inverse_W(melW)``
    and multiplied with the mixture magnitude spectrogram. One wav per
    present event and one noise wav are then reconstructed with
    ``spectrogram_to_wave.recover_wav`` and written to
    ``<workspace>/sep_audio/<script>/...``. Per-event mask statistics from
    ``fvalue_iou`` and ``hit_fa`` are averaged and logged.

    Args:
      args: object providing the integer attributes ``n_events`` and ``snr``.
      bgn_iter: first checkpoint iteration to load (inclusive).
      fin_iter: end of the checkpoint iteration range (exclusive).
      interval: step between loaded checkpoint iterations.

    Raises:
      ValueError: if the iteration range selects no checkpoint.
    """
    workspace = cfg.workspace
    events = cfg.events
    te_fold = cfg.te_fold
    n_events = args.n_events
    n_window = cfg.n_window
    n_overlap = cfg.n_overlap
    fs = cfg.sample_rate
    clip_duration = cfg.clip_duration
    snr = args.snr
    script_name = pp_data.get_filename(__file__)

    # Load ground truth data. Only the test split is separated.
    # NOTE: `is_scale` and, further down, `seg_thres` are not defined in this
    # function; they are expected to be module-level names.
    feature_dir = os.path.join(workspace, "features", "logmel",
                               "n_events=%d" % n_events)
    yaml_dir = os.path.join(workspace, "mixed_audio", "n_events=%d" % n_events)
    (_, _, _, _, _, at_y, sed_y, na_list) = pp_data.load_data(
        feature_dir=feature_dir,
        yaml_dir=yaml_dir,
        te_fold=te_fold,
        snr=snr,
        is_scale=is_scale)

    # Load and average the masks of the selected checkpoints.
    preds_dir = os.path.join(workspace, "preds", script_name,
                             "n_events=%d" % n_events, "fold=%d" % te_fold,
                             "snr=%d" % snr)
    seg_masks_list = []
    for iteration in range(bgn_iter, fin_iter, interval):
        seg_masks_path = os.path.join(preds_dir, "md%d_iters" % iteration,
                                      "seg_masks.p")
        with open(seg_masks_path, 'rb') as f:
            seg_masks_list.append(cPickle.load(f))
    if not seg_masks_list:
        # Averaging an empty list would produce NaN and fail obscurely later.
        raise ValueError(
            "empty iteration range: bgn_iter=%r, fin_iter=%r, interval=%r" %
            (bgn_iter, fin_iter, interval))
    seg_masks = np.mean(seg_masks_list, axis=0)  # (n_clips, n_classes, n_time, n_freq)
    print(seg_masks.shape)

    audio_dir = os.path.join(workspace, "mixed_audio", "n_events=%d" % n_events)
    sep_dir = os.path.join(workspace, "sep_audio", script_name,
                           "n_events=%d" % n_events, "fold=%d" % te_fold,
                           "snr=%d" % snr)
    pp_data.create_folder(sep_dir)

    ham_win = np.hamming(n_window)
    # Gain applied to every reconstructed waveform before writing.
    # NOTE(review): presumably undoes a window-energy normalisation done in
    # pp_data.calc_sp - confirm against that helper.
    recover_scaler = np.sqrt((ham_win**2).sum())
    wav_len = int(fs * clip_duration)

    melW = librosa.filters.mel(sr=fs,
                               n_fft=n_window,
                               n_mels=64,
                               fmin=0.,
                               fmax=fs / 2)
    inverse_melW = get_inverse_W(melW)  # (64, 513)

    stat_keys = ('fvalue', 'auc', 'iou', 'hit', 'fa', 'tp', 'fn', 'fp')
    seg_stats = {e: {key: [] for key in stat_keys} for e in events}

    for (i1, na) in enumerate(na_list):
        bare_na = os.path.splitext(na)[0]
        audio_path = os.path.join(audio_dir, "%s.wav" % bare_na)
        (stereo_audio, _) = pp_data.read_stereo_audio(audio_path,
                                                      target_fs=fs)
        event_audio = stereo_audio[:, 0]
        noise_audio = stereo_audio[:, 1]
        mixed_audio = event_audio + noise_audio

        mixed_cmplx_sp = pp_data.calc_sp(mixed_audio, fs, ham_win, n_window,
                                         n_overlap)
        mixed_sp = np.abs(mixed_cmplx_sp)
        event_sp = np.abs(
            pp_data.calc_sp(event_audio, fs, ham_win, n_window, n_overlap))
        noise_sp = np.abs(
            pp_data.calc_sp(noise_audio, fs, ham_win, n_window, n_overlap))

        sm = seg_masks[i1]  # (n_classes, n_time, n_freq)
        sm_upsampled = np.dot(sm, inverse_melW)  # (n_classes, n_time, 513)

        print(na)

        # Write out separated events (only those tagged as present).
        for (j1, event) in enumerate(events):
            if at_y[i1][j1] != 1:
                continue

            (fvalue, auc, iou, tp, fn, fp) = fvalue_iou(sm_upsampled[j1],
                                                        event_sp,
                                                        noise_sp,
                                                        sed_y[i1, :, j1],
                                                        seg_thres,
                                                        inside_only=True)
            (hit, fa) = hit_fa(sm_upsampled[j1],
                               event_sp,
                               noise_sp,
                               sed_y[i1, :, j1],
                               seg_thres,
                               inside_only=True)

            # Values are listed in the same order as `stat_keys`.
            clip_values = (fvalue, auc, iou, hit, fa, tp, fn, fp)
            for (key, value) in zip(stat_keys, clip_values):
                seg_stats[event][key].append(value)

            sep_event_sp = sm_upsampled[j1] * mixed_sp
            sep_event_s = spectrogram_to_wave.recover_wav(
                sep_event_sp,
                mixed_cmplx_sp,
                n_overlap=n_overlap,
                winfunc=np.hamming,
                wav_len=wav_len)
            sep_event_s *= recover_scaler

            out_event_audio_path = os.path.join(
                sep_dir, "%s.%s.wav" % (bare_na, event))
            pp_data.write_audio(out_event_audio_path, sep_event_s, fs)

        # Write out separated noise: whatever the event masks leave over,
        # clipped to [0, 1].
        sm_noise_upsampled = np.clip(1. - np.sum(sm_upsampled, axis=0), 0., 1.)
        sep_noise_sp = sm_noise_upsampled * mixed_sp
        sep_noise_s = spectrogram_to_wave.recover_wav(sep_noise_sp,
                                                      mixed_cmplx_sp,
                                                      n_overlap=n_overlap,
                                                      winfunc=np.hamming,
                                                      wav_len=wav_len)
        sep_noise_s *= recover_scaler
        out_noise_audio_path = os.path.join(sep_dir, "%s.noise.wav" % bare_na)
        pp_data.write_audio(out_noise_audio_path, sep_noise_s, fs)

    # Per-event means, in the order of `events`. An event that never occurred
    # in the test split has empty lists and therefore a NaN mean.
    means = {
        key: [np.mean(seg_stats[e][key]) for e in events]
        for key in stat_keys
    }
    fvalues, aucs, ious = means['fvalue'], means['auc'], means['iou']
    hits, fas = means['hit'], means['fa']
    tps, fns, fps = means['tp'], means['fn'], means['fp']

    logging.info("%sfvalue\tauc\tiou\tHit\tFa\tHit-Fa\tTP\tFN\tFP" %
                 ("".ljust(16)))
    logging.info(
        "%s*%.3f\t*%.3f\t*%.3f\t*%.3f\t*%.3f\t*%.3f\t*%.3f\t*%.3f\t*%.3f" %
        ("*Avg. of each".ljust(16), np.mean(fvalues), np.mean(aucs),
         np.mean(ious), np.mean(hits), np.mean(fas),
         np.mean(hits) - np.mean(fas), np.mean(tps), np.mean(fns),
         np.mean(fps)))
    for (i1, event) in enumerate(events):
        logging.info(
            "%s%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f" %
            (event.ljust(16), fvalues[i1], aucs[i1], ious[i1], hits[i1],
             fas[i1], hits[i1] - fas[i1], tps[i1], fns[i1], fps[i1]))