def plot_fig5(data_type, audio_idx, sep_dir=None):
    """Plot ground-truth and separated waveforms of one 3-event mixture.

    Draws a 5x2 grid and shows it with ``plt.show()``. The left column holds
    the mixture, three copies of the event channel that are zero outside the
    windows 0-2.5 s, 2.5-5 s and 5-7.5 s respectively, and the noise channel.
    The right column holds the separated event and noise waveforms read from
    ``sep_dir``; its top panel is left empty.

    Args:
        data_type: Unused; kept so existing callers keep working.
        audio_idx: Name prefix of the clip. It selects
            ``<workspace>/mixed_audio/n_events=3/<audio_idx>.mixed_20db.wav``
            and the separated files matching ``<sep_dir>/<audio_idx>*``.
        sep_dir: Directory containing the separated audio files. Defaults to
            the experiment directory that used to be hard-coded here.

    Raises:
        IndexError: If fewer than four separated files match ``audio_idx``.
    """
    workspace = cfg.workspace
    fs = cfg.sample_rate

    if sep_dir is None:
        sep_dir = "/vol/vssp/msos/qk/workspaces/weak_source_separation/dcase2013_task2/sep_audio/tmp01/n_events=3/fold=0/snr=20"

    # Read audio: channel 0 is used as the event track, channel 1 as noise.
    audio_path = os.path.join(
        workspace, "mixed_audio/n_events=3/%s.mixed_20db.wav" % audio_idx)
    (audio, _) = pp_data.read_stereo_audio(audio_path, fs)
    event_audio = audio[:, 0]
    noise_audio = audio[:, 1]
    mixed_audio = (event_audio + noise_audio) / 2

    def _isolate(bgn_sec, fin_sec):
        # Copy of the event track that is zero outside [bgn_sec, fin_sec).
        bgn, fin = int(fs * bgn_sec), int(fs * fin_sec)
        segment = np.zeros_like(event_audio)
        segment[bgn:fin] = event_audio[bgn:fin]
        return segment

    event_segments = [_isolate(0., 2.5), _isolate(2.5, 5.), _isolate(5., 7.5)]

    sep_paths = glob.glob(os.path.join(sep_dir, "%s*" % audio_idx))
    print([os.path.basename(e) for e in sep_paths])
    if len(sep_paths) < 4:
        raise IndexError(
            "expected at least 4 separated files matching %r in %s, found %d"
            % (audio_idx, sep_dir, len(sep_paths)))

    # NOTE(review): the indices below were picked by hand from the list
    # printed above. glob.glob() does not guarantee any ordering, so check the
    # printed names whenever the clip or the file system changes.
    (sep_event_audio_1, _) = pp_data.read_audio(sep_paths[3])
    (sep_event_audio_2, _) = pp_data.read_audio(sep_paths[0])
    (sep_event_audio_3, _) = pp_data.read_audio(sep_paths[1])
    (sep_noise_audio, _) = pp_data.read_audio(sep_paths[2])

    fig, axs = plt.subplots(5, 2, sharex=True)

    # Left column: mixture, ground-truth event windows, noise.
    left_column = [mixed_audio] + event_segments + [noise_audio]
    for (row, wave) in enumerate(left_column):
        axs[row, 0].plot(wave)

    # Right column: separated signals, aligned with rows 1-4 of the left one.
    right_column = [sep_event_audio_1, sep_event_audio_2, sep_event_audio_3,
                    sep_noise_audio]
    for (row, wave) in enumerate(right_column, 1):
        axs[row, 1].plot(wave)

    # Same limits on every panel, and no tick marks.
    T = len(noise_audio)
    for ax in axs.flat:
        ax.axis([0, T, -1, 1])
        ax.xaxis.set_ticks([])
        ax.yaxis.set_ticks([])
        ax.set_ylabel("Amplitude")
    plt.show()
def evaluate_separation(args):
    """Score the separated test clips and pickle the per-event statistics.

    For every test clip and every event whose clip-level label in ``at_y`` is
    1, the separated event and noise waveforms are read from
    ``<workspace>/sep_audio/<this file's name>/n_events=../fold=../snr=..``
    and compared with the two channels of the ground-truth stereo mixture
    through ``sdr_sir_sar(..., inside_only=True)``. The resulting
    ``{event: {'sdr': [...], 'sir': [...], 'sar': [...]}}`` dictionary is
    printed and dumped to
    ``<workspace>/sep_stats/<this file's name>/.../sep_stat.p``.

    Args:
        args: Object providing ``n_events`` and ``snr``.

    NOTE(review): ``is_scale`` is not defined inside this function; it is
    expected to be available at module level - confirm.
    """
    workspace = cfg.workspace
    events = cfg.events
    te_fold = cfg.te_fold
    fs = cfg.sample_rate
    n_events = args.n_events
    snr = args.snr

    # Load ground truth data; only the test split is evaluated.
    feature_dir = os.path.join(workspace, "features", "logmel",
                               "n_events=%d" % n_events)
    yaml_dir = os.path.join(workspace, "mixed_audio", "n_events=%d" % n_events)
    (_, _, _, _, _, at_y, sed_y,
     na_list) = pp_data.load_data(feature_dir=feature_dir,
                                  yaml_dir=yaml_dir,
                                  te_fold=te_fold,
                                  snr=snr,
                                  is_scale=is_scale)

    audio_dir = os.path.join(workspace, "mixed_audio",
                             "n_events=%d" % n_events)

    sep_dir = os.path.join(workspace, "sep_audio",
                           pp_data.get_filename(__file__),
                           "n_events=%d" % n_events, "fold=%d" % te_fold,
                           "snr=%d" % snr)

    sep_stats = {e: {'sdr': [], 'sir': [], 'sar': []} for e in events}

    for (i1, na) in enumerate(na_list):
        bare_na = os.path.splitext(na)[0]
        gt_audio_path = os.path.join(audio_dir, "%s.wav" % bare_na)
        (stereo_audio, _) = pp_data.read_stereo_audio(gt_audio_path,
                                                      target_fs=fs)
        gt_event_audio = stereo_audio[:, 0]
        gt_noise_audio = stereo_audio[:, 1]

        # The separated noise track is the same for every event of a clip, so
        # it is read at most once, and only if some event is present.
        sep_noise_audio_path = os.path.join(sep_dir, "%s.noise.wav" % bare_na)
        sep_noise_audio = None

        print(na)
        for (j1, event) in enumerate(events):
            if at_y[i1][j1] != 1:
                continue

            sep_event_audio_path = os.path.join(
                sep_dir, "%s.%s.wav" % (bare_na, event))
            (sep_event_audio, _) = pp_data.read_audio(sep_event_audio_path,
                                                      target_fs=fs)
            if sep_noise_audio is None:
                (sep_noise_audio, _) = pp_data.read_audio(
                    sep_noise_audio_path, target_fs=fs)

            # Row 0 is the event source, row 1 the noise source.
            ref_array = np.array((gt_event_audio, gt_noise_audio))
            est_array = np.array((sep_event_audio, sep_noise_audio))
            (sdr, sir, sar) = sdr_sir_sar(ref_array,
                                          est_array,
                                          sed_y[i1, :, j1],
                                          inside_only=True)
            print(sdr, sir, sar)
            sep_stats[event]['sdr'].append(sdr)
            sep_stats[event]['sir'].append(sir)
            sep_stats[event]['sar'].append(sar)

    print(sep_stats)
    sep_stat_path = os.path.join(workspace, "sep_stats",
                                 pp_data.get_filename(__file__),
                                 "n_events=%d" % n_events, "fold=%d" % te_fold,
                                 "snr=%d" % snr, "sep_stat.p")
    pp_data.create_folder(os.path.dirname(sep_stat_path))
    # Close the file explicitly so the pickle is flushed to disk on return.
    with open(sep_stat_path, 'wb') as f:
        cPickle.dump(sep_stats, f)
def plot_fig4(data_type, audio_idx):
    """Show spectrogram, mask and detection figures for one 3-event clip.

    The figures are displayed one after another with ``plt.show()``:

    1. The log-Mel features (``_calc_feat``) and the log spectrogram
       (``_calc_spectrogram``) of the 20 dB mixture.
    2. A 4x4 grid with one ground-truth mask per class index 0..15: the
       event-versus-noise mask (threshold -5 dB) multiplied in time by the
       frame labels ``sed_y``.
    3. For each prediction run in ``"tmp01"``, ``"tmp02"``, ``"tmp03"``: the
       segmentation masks averaged over the checkpoints of iterations
       2000, 2200, ..., 3000 and mapped to FFT bins with ``get_inverse_W``,
       then the masks averaged over their last axis next to the ``sed_y``
       reference.

    Args:
        data_type: ``"train"`` or ``"test"``; selects the split whose labels
            and name list are used.
        audio_idx: String identifying the clip. It is used in the mixture
            file name and to find the clip in the name list; when several
            names contain it, the last one is used.

    Raises:
        ValueError: If ``data_type`` is not ``"train"`` or ``"test"``, or if
            no clip name in the selected split contains ``audio_idx``.

    NOTE(review): ``is_scale`` is not defined inside this function; it is
    expected to be available at module level - confirm.
    """
    # Fail early: an unknown split used to surface much later as an
    # unbound-variable error, after the first figure had been shown.
    if data_type not in ("train", "test"):
        raise ValueError(
            "data_type must be 'train' or 'test', got %r" % (data_type,))

    workspace = cfg.workspace
    n_window = cfg.n_window
    n_overlap = cfg.n_overlap
    fs = cfg.sample_rate
    events = cfg.events
    te_fold = cfg.te_fold

    # Read audio.
    audio_path = os.path.join(
        workspace, "mixed_audio/n_events=3/%s.mixed_20db.wav" % audio_idx)
    (audio, _) = pp_data.read_audio(audio_path, fs)

    # Calculate log Mel.
    x = _calc_feat(audio)
    sp = _calc_spectrogram(audio)
    print(x.shape)

    # Figure 1: only panels (0, 0) and (0, 2) of the 4x4 grid are visible.
    fig, axs = plt.subplots(4, 4, sharex=False)
    for ax in axs.flat:
        ax.set_visible(False)

    # Plot log Mel spectrogram.
    ax = axs[0, 0]
    ax.matshow(x.T, origin='lower', aspect='auto', cmap='jet')
    _fig4_set_time_ticks(ax)
    ax.yaxis.set_ticks([0, 16, 32, 48, 63])
    ax.yaxis.set_ticklabels([0, 16, 32, 48, 63])
    ax.set_ylabel('Mel freq. bin')
    ax.set_title("Log Mel spectrogram")
    ax.set_visible(True)

    # Plot spectrogram.
    ax = axs[0, 2]
    ax.matshow(np.log(sp.T + 1.), origin='lower', aspect='auto', cmap='jet')
    _fig4_set_time_ticks(ax)
    ax.yaxis.set_ticks([0, 128, 256, 384, 512])
    ax.yaxis.set_ticklabels([0, 128, 256, 384, 512])
    ax.set_ylabel('FFT freq. bin')
    ax.set_title("Spectrogram")
    ax.set_visible(True)

    plt.show()

    # Load data.
    snr = 20
    n_events = 3
    feature_dir = os.path.join(workspace, "features", "logmel",
                               "n_events=%d" % n_events)
    yaml_dir = os.path.join(workspace, "mixed_audio", "n_events=%d" % n_events)
    (tr_x, tr_at_y, tr_sed_y, tr_na_list, te_x, te_at_y, te_sed_y,
     te_na_list) = pp_data.load_data(feature_dir=feature_dir,
                                     yaml_dir=yaml_dir,
                                     te_fold=te_fold,
                                     snr=snr,
                                     is_scale=is_scale)

    if data_type == "train":
        at_y, sed_y, na_list = tr_at_y, tr_sed_y, tr_na_list
    else:
        at_y, sed_y, na_list = te_at_y, te_sed_y, te_na_list

    # Locate the clip; the last matching name wins when several match.
    matches = [i1 for (i1, na) in enumerate(na_list) if audio_idx in na]
    if not matches:
        raise ValueError("no clip name in the %s split contains %r" %
                         (data_type, audio_idx))
    idx = matches[-1]
    print(idx)

    # GT mask
    (stereo_audio, _) = pp_data.read_stereo_audio(audio_path, target_fs=fs)
    event_audio = stereo_audio[:, 0]
    noise_audio = stereo_audio[:, 1]

    ham_win = np.hamming(n_window)
    event_sp = np.abs(
        pp_data.calc_sp(event_audio, fs, ham_win, n_window, n_overlap))
    noise_sp = np.abs(
        pp_data.calc_sp(noise_audio, fs, ham_win, n_window, n_overlap))

    # 1 where the event-to-noise ratio is above `db`, 0 where it is below
    # (0.5 where it is exactly equal).
    db = -5.
    gt_mask = (np.sign(20 * np.log10(event_sp / noise_sp) - db) +
               1.) / 2.  # (n_time, n_freq)

    # Figure 2: the mask is gated in time by each class's frame labels.
    _fig4_plot_mask_grid(
        [gt_mask * sed_y[idx, :, i2][:, None] for i2 in range(16)])
    plt.show()

    # The Mel filterbank and its inverse do not depend on the prediction run,
    # so they are built once instead of once per run.
    melW = librosa.filters.mel(sr=fs,
                               n_fft=n_window,
                               n_mels=64,
                               fmin=0.,
                               fmax=fs / 2)
    inverse_melW = get_inverse_W(melW)

    bgn_iter, fin_iter, interval = 2000, 3001, 200
    for filename in ["tmp01", "tmp02", "tmp03"]:
        # Plot up sampled seg masks.
        preds_dir = os.path.join(workspace, "preds", filename,
                                 "n_events=%d" % n_events, "fold=%d" % te_fold,
                                 "snr=%d" % snr)

        # Average the masks predicted by several checkpoints.
        # NOTE: unpickling runs arbitrary code; only load prediction files
        # produced by this project.
        seg_masks_list = []
        for n_iter in range(bgn_iter, fin_iter, interval):
            seg_masks_path = os.path.join(preds_dir, "md%d_iters" % n_iter,
                                          "seg_masks.p")
            with open(seg_masks_path, 'rb') as f:
                seg_masks_list.append(cPickle.load(f))
        seg_masks = np.mean(seg_masks_list,
                            axis=0)  # (n_clips, n_classes, n_time, n_freq)

        print(at_y[idx])

        spec_masks = np.dot(seg_masks[idx],
                            inverse_melW)  # (n_classes, n_time, 513)

        fig = _fig4_plot_mask_grid([spec_masks[i2] for i2 in range(16)],
                                   vmin=0,
                                   vmax=1)
        fig.suptitle(filename)
        plt.show()

        # Plot SED probs.
        sed_probs = np.mean(seg_masks[idx], axis=-1)  # (n_classes, n_time)
        fig, axs = plt.subplots(4, 4, sharex=False)
        for ax in axs.flat:
            ax.set_visible(False)

        ax = axs[0, 0]
        ax.matshow(sed_probs,
                   origin='lower',
                   aspect='auto',
                   vmin=0,
                   vmax=1,
                   cmap='jet')
        ax.xaxis.set_ticks([])
        _fig4_label_event_ticks(ax, events)
        ax.set_visible(True)

        ax = axs[1, 0]
        ax.matshow(sed_y[idx].T,
                   origin='lower',
                   aspect='auto',
                   vmin=0,
                   vmax=1,
                   cmap='jet')
        _fig4_set_time_ticks(ax)
        _fig4_label_event_ticks(ax, events)
        ax.set_visible(True)
        fig.suptitle(filename)
        plt.show()


def _fig4_set_time_ticks(ax):
    """Put the x ticks at the bottom of ``ax`` and label them in seconds."""
    # NOTE(review): assumes 240 frames spanning 10 s - confirm.
    ax.xaxis.set_ticks([0, 60, 120, 180, 239])
    ax.xaxis.tick_bottom()
    ax.xaxis.set_ticklabels(np.arange(0, 10.1, 2.5))
    ax.set_xlabel("time (s)")


def _fig4_label_event_ticks(ax, events):
    """Label the y axis of ``ax`` with one small-font tick per event name."""
    ax.yaxis.set_ticks(list(range(len(events))))
    ax.yaxis.set_ticklabels(events)
    for tick in ax.yaxis.get_major_ticks():
        # `label1` is the same Text object the legacy `label` alias referred
        # to; the alias no longer exists in recent Matplotlib releases.
        tick.label1.set_fontsize(8)


def _fig4_plot_mask_grid(masks, **matshow_kwargs):
    """Draw up to 16 masks, transposed, on a new 4x4 grid; return the figure.

    Panels are filled in row-major order. Iterating ``axs.flat`` avoids
    indexing with ``i / 4``, which is a float under true division.
    """
    fig, axs = plt.subplots(4, 4, sharex=True)
    for (ax, mask) in zip(axs.flat, masks):
        ax.matshow(mask.T,
                   origin='lower',
                   aspect='auto',
                   cmap='jet',
                   **matshow_kwargs)
        ax.xaxis.set_ticks([])
        ax.yaxis.set_ticks([])
        ax.set_xlabel('time')
        ax.set_ylabel('FFT freq. bin')
    return fig
def separate(args, bgn_iter, fin_iter, interval):
    """Separate every test clip with averaged masks and log mask metrics.

    The segmentation masks stored under
    ``<workspace>/preds/<this file's name>/.../md<iter>_iters/seg_masks.p``
    for ``iter`` in ``range(bgn_iter, fin_iter, interval)`` are averaged and
    mapped to FFT bins with ``get_inverse_W``. For each test clip and each
    event whose clip-level label in ``at_y`` is 1:

    * mask metrics are computed with ``fvalue_iou`` and ``hit_fa``;
    * the mask is applied to the mixture magnitude spectrogram, turned back
      into a waveform and written to ``<sep_dir>/<clip>.<event>.wav``.

    The noise estimate uses ``clip(1 - sum of all class masks, 0, 1)`` and is
    written to ``<sep_dir>/<clip>.noise.wav`` for every clip. Per-event means
    of the metrics, and their average over events, are reported through
    ``logging.info``.

    Args:
        args: Object providing ``n_events`` and ``snr``.
        bgn_iter: First checkpoint iteration to load.
        fin_iter: Exclusive upper bound of the checkpoint iterations.
        interval: Step between checkpoint iterations.

    NOTE(review): ``is_scale`` and ``seg_thres`` are not defined inside this
    function; they are expected to be available at module level - confirm.
    """
    workspace = cfg.workspace
    events = cfg.events
    te_fold = cfg.te_fold
    n_events = args.n_events
    n_window = cfg.n_window
    n_overlap = cfg.n_overlap
    fs = cfg.sample_rate
    clip_duration = cfg.clip_duration
    snr = args.snr

    # Load ground truth data; only the test split is separated.
    feature_dir = os.path.join(workspace, "features", "logmel",
                               "n_events=%d" % n_events)
    yaml_dir = os.path.join(workspace, "mixed_audio", "n_events=%d" % n_events)
    (_, _, _, _, _, at_y, sed_y,
     na_list) = pp_data.load_data(feature_dir=feature_dir,
                                  yaml_dir=yaml_dir,
                                  te_fold=te_fold,
                                  snr=snr,
                                  is_scale=is_scale)

    # Load the masks of every requested checkpoint and average them.
    # NOTE: unpickling runs arbitrary code; only load prediction files
    # produced by this project.
    preds_dir = os.path.join(workspace, "preds",
                             pp_data.get_filename(__file__),
                             "n_events=%d" % n_events, "fold=%d" % te_fold,
                             "snr=%d" % snr)

    seg_masks_list = []
    for n_iter in range(bgn_iter, fin_iter, interval):
        seg_masks_path = os.path.join(preds_dir, "md%d_iters" % n_iter,
                                      "seg_masks.p")
        with open(seg_masks_path, 'rb') as f:
            seg_masks_list.append(cPickle.load(f))
    seg_masks = np.mean(seg_masks_list,
                        axis=0)  # (n_clips, n_classes, n_time, n_freq)

    print(seg_masks.shape)

    audio_dir = os.path.join(workspace, "mixed_audio",
                             "n_events=%d" % n_events)

    sep_dir = os.path.join(workspace, "sep_audio",
                           pp_data.get_filename(__file__),
                           "n_events=%d" % n_events, "fold=%d" % te_fold,
                           "snr=%d" % snr)
    pp_data.create_folder(sep_dir)

    ham_win = np.hamming(n_window)
    # Gain applied to every recovered waveform (L2 norm of the window).
    recover_scaler = np.sqrt((ham_win**2).sum())
    wav_len = int(fs * clip_duration)
    melW = librosa.filters.mel(sr=fs,
                               n_fft=n_window,
                               n_mels=64,
                               fmin=0.,
                               fmax=fs / 2)
    inverse_melW = get_inverse_W(melW)  # (64, 513)

    stat_keys = ('fvalue', 'auc', 'iou', 'hit', 'fa', 'tp', 'fn', 'fp')
    seg_stats = {e: {key: [] for key in stat_keys} for e in events}

    for (i1, na) in enumerate(na_list):
        bare_na = os.path.splitext(na)[0]
        audio_path = os.path.join(audio_dir, "%s.wav" % bare_na)
        (stereo_audio, _) = pp_data.read_stereo_audio(audio_path, target_fs=fs)
        event_audio = stereo_audio[:, 0]
        noise_audio = stereo_audio[:, 1]
        mixed_audio = event_audio + noise_audio

        mixed_cmplx_sp = pp_data.calc_sp(mixed_audio, fs, ham_win, n_window,
                                         n_overlap)
        mixed_sp = np.abs(mixed_cmplx_sp)
        event_sp = np.abs(
            pp_data.calc_sp(event_audio, fs, ham_win, n_window, n_overlap))
        noise_sp = np.abs(
            pp_data.calc_sp(noise_audio, fs, ham_win, n_window, n_overlap))

        sm = seg_masks[i1]  # (n_classes, n_time, n_freq)
        sm_upsampled = np.dot(sm, inverse_melW)  # (n_classes, n_time, 513)

        print(na)

        # Score and write out the events that are present in this clip.
        for (j1, event) in enumerate(events):
            if at_y[i1][j1] != 1:
                continue

            (fvalue, auc, iou, tp, fn, fp) = fvalue_iou(sm_upsampled[j1],
                                                        event_sp,
                                                        noise_sp,
                                                        sed_y[i1, :, j1],
                                                        seg_thres,
                                                        inside_only=True)
            (hit, fa) = hit_fa(sm_upsampled[j1],
                               event_sp,
                               noise_sp,
                               sed_y[i1, :, j1],
                               seg_thres,
                               inside_only=True)
            event_stats = seg_stats[event]
            event_stats['fvalue'].append(fvalue)
            event_stats['auc'].append(auc)
            event_stats['iou'].append(iou)
            event_stats['hit'].append(hit)
            event_stats['fa'].append(fa)
            event_stats['tp'].append(tp)
            event_stats['fn'].append(fn)
            event_stats['fp'].append(fp)

            sep_event_s = _separate_mask_to_wave(sm_upsampled[j1], mixed_sp,
                                                 mixed_cmplx_sp, n_overlap,
                                                 wav_len, recover_scaler)
            out_event_audio_path = os.path.join(
                sep_dir, "%s.%s.wav" % (bare_na, event))
            pp_data.write_audio(out_event_audio_path, sep_event_s, fs)

        # Write out separated noise: whatever the class masks leave over.
        sm_noise_upsampled = np.clip(1. - np.sum(sm_upsampled, axis=0), 0., 1.)
        sep_noise_s = _separate_mask_to_wave(sm_noise_upsampled, mixed_sp,
                                             mixed_cmplx_sp, n_overlap,
                                             wav_len, recover_scaler)
        out_noise_audio_path = os.path.join(sep_dir, "%s.noise.wav" % bare_na)
        pp_data.write_audio(out_noise_audio_path, sep_noise_s, fs)

    # Per-event mean of every metric. An event without any scored clip has an
    # empty list, for which np.mean returns nan.
    means = {
        key: [np.mean(seg_stats[e][key]) for e in events]
        for key in stat_keys
    }
    fvalues, aucs, ious = means['fvalue'], means['auc'], means['iou']
    hits, fas = means['hit'], means['fa']
    tps, fns, fps = means['tp'], means['fn'], means['fp']

    logging.info("%sfvalue\tauc\tiou\tHit\tFa\tHit-Fa\tTP\tFN\tFP" %
                 ("".ljust(16)))
    logging.info(
        "%s*%.3f\t*%.3f\t*%.3f\t*%.3f\t*%.3f\t*%.3f\t*%.3f\t*%.3f\t*%.3f" %
        ("*Avg. of each".ljust(16), np.mean(fvalues), np.mean(aucs),
         np.mean(ious), np.mean(hits), np.mean(fas), np.mean(hits) -
         np.mean(fas), np.mean(tps), np.mean(fns), np.mean(fps)))
    for (i1, event) in enumerate(events):
        logging.info(
            "%s%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f" %
            (event.ljust(16), fvalues[i1], aucs[i1], ious[i1], hits[i1],
             fas[i1], hits[i1] - fas[i1], tps[i1], fns[i1], fps[i1]))


def _separate_mask_to_wave(mask, mixed_sp, mixed_cmplx_sp, n_overlap, wav_len,
                           scaler):
    """Apply ``mask`` to the mixture magnitude and recover a scaled waveform."""
    wave = spectrogram_to_wave.recover_wav(mask * mixed_sp,
                                           mixed_cmplx_sp,
                                           n_overlap=n_overlap,
                                           winfunc=np.hamming,
                                           wav_len=wav_len)
    wave *= scaler
    return wave