Example #1
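Evaluates the two-source separation network on the MIR-1K test set: the checkpoint is restored under TensorFlow 1.x compatibility mode, each file is processed with overlapping 64-frame spectrogram windows, and BSS-Eval scores are computed asynchronously in a worker pool.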
def main():
    import multiprocessing as mp
    import numpy as np
    import tensorflow as tf
    import os, sys
    from mir_util import infer, to_spec, to_wav_file
    import scipy.signal as sp
    import config as cfg
    sys.path.append("../lib")
    from eval_util import bss_eval
    from common import loadWav
    import redirect, simpleopt

    step_idx = int(simpleopt.get("step"))
    n_eval = simpleopt.get("first", None)
    with cfg.ConfigBoundary():
        batch_size = 1
        n_feature = cfg.frame_size // 2

        # Model
        print("* Initialize network")
        p_input = tf.compat.v1.placeholder(tf.float32,
                                           shape=(batch_size, 64, n_feature,
                                                  1),
                                           name="p_input")
        v_pred = infer(p_input, 2, False)
        if isinstance(v_pred, list):
            v_pred = v_pred[-1]
        v_pred = tf.clip_by_value(v_pred, 0.0, 1.0) * p_input

        x_input = np.zeros((batch_size, 64, n_feature, 1), dtype=np.float32)
        with tf.compat.v1.Session(config=cfg.sess_cfg) as sess:
            # Initialize variables, then restore the checkpoint state
            sess.run(tf.compat.v1.global_variables_initializer())

            print("* Load checkpoint")
            ckpt_path = os.path.join(cfg.MIR2Config.ckpt_path,
                                     "checkpoint-%d" % (step_idx, ))
            tf.compat.v1.train.Saver().restore(sess, ckpt_path)
            print(" :Loaded: `%s`" % (ckpt_path, ))

            os.makedirs("./eval_output", exist_ok=True)
            name_list = []
            ret_list = []
            with mp.Pool(processes=1, initializer=worker_main) as pool:
                for (root, _, file_list) in os.walk(cfg.mir_wav_path):
                    file_list = sorted(f for f in file_list if not (
                        f.startswith("abjones") or f.startswith("amy")))
                    if n_eval is not None:
                        file_list = file_list[:int(n_eval)]
                    for i_file, filename in enumerate(file_list):
                        print("[%03d/%03d] SEND: `%s`" % (
                            i_file + 1,
                            len(file_list),
                            filename,
                        ))
                        name_list.append(filename)
                        path = os.path.join(root, filename)

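                        # MIR-1K stereo layout: channel 0 holds the
                        # accompaniment, channel 1 the singing voice.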
                        mixed_wav, sr_orig = loadWav(path)
                        gt_wav_vocal = mixed_wav[:, 1]
                        gt_wav_inst = mixed_wav[:, 0]
                        mixed_wav = np.sum(mixed_wav, axis=1)

                        mixed_wav_orig = mixed_wav
                        gt_wav_vocal_orig = gt_wav_vocal
                        gt_wav_inst_orig = gt_wav_inst

                        gt_wav_vocal = sp.resample_poly(
                            gt_wav_vocal, cfg.sr, sr_orig).astype(np.float32)
                        gt_wav_inst = sp.resample_poly(
                            gt_wav_inst, cfg.sr, sr_orig).astype(np.float32)
                        mixed_wav = sp.resample_poly(
                            mixed_wav, cfg.sr, sr_orig).astype(np.float32)

                        mixed_spec = to_spec(mixed_wav)
                        mixed_spec_mag = np.abs(mixed_spec)
                        mixed_spec_phase = np.angle(mixed_spec)
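                        # Peak-normalize the magnitude spectrogram; the
                        # original scale is restored after inference.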
                        max_tmp = np.max(mixed_spec_mag)
                        mixed_spec_mag = mixed_spec_mag / max_tmp

                        src_len = mixed_spec_mag.shape[0]
                        start_idx = 0
                        y_est_inst = np.zeros((src_len, n_feature),
                                              dtype=np.float32)
                        y_est_vocal = np.zeros((src_len, n_feature),
                                               dtype=np.float32)
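                        # Sliding-window inference: 64-frame windows, hopped
                        # by 32 frames. The first window is used in full;
                        # later windows contribute only their centre 32
                        # frames (indices 16:48).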
                        while start_idx + 64 < src_len:
                            x_input[0, :, :,
                                    0] = mixed_spec_mag[start_idx:start_idx +
                                                        64, :n_feature]
                            y_output = sess.run(v_pred,
                                                feed_dict={p_input: x_input})
                            if start_idx == 0:
                                y_est_inst[start_idx:start_idx +
                                           64, :] = y_output[0, :, :, 0]
                                y_est_vocal[start_idx:start_idx +
                                            64, :] = y_output[0, :, :, 1]
                            else:
                                y_est_inst[start_idx + 16:start_idx +
                                           48, :] = y_output[0, 16:48, :, 0]
                                y_est_vocal[start_idx + 16:start_idx +
                                            48, :] = y_output[0, 16:48, :, 1]
                            start_idx += 32

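                        # Patch in the tail: run one final window over the
                        # last 64 frames and copy the still-uncovered part.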
                        x_input[0, :, :,
                                0] = mixed_spec_mag[src_len -
                                                    64:src_len, :n_feature]
                        y_output = sess.run(v_pred,
                                            feed_dict={p_input: x_input})
                        src_start = src_len - start_idx - 16
                        y_est_inst[start_idx + 16:src_len, :] = \
                            y_output[0, 64 - src_start:64, :, 0]
                        y_est_vocal[start_idx + 16:src_len, :] = \
                            y_output[0, 64 - src_start:64, :, 1]

                        y_est_inst *= max_tmp
                        y_est_vocal *= max_tmp
                        y_wav_inst = to_wav_file(
                            y_est_inst, mixed_spec_phase[:, :n_feature])
                        y_wav_vocal = to_wav_file(
                            y_est_vocal, mixed_spec_phase[:, :n_feature])
                        #saveWav("inst.wav", y_wav_inst, cfg.sr)
                        #saveWav("vocal.wav", y_wav_vocal, cfg.sr)

                        # upsample back to the original sample rate
                        y_wav_inst_orig = sp.resample_poly(
                            y_wav_inst, sr_orig, cfg.sr).astype(np.float32)
                        y_wav_vocal_orig = sp.resample_poly(
                            y_wav_vocal, sr_orig, cfg.sr).astype(np.float32)
                        ret_list.append(
                            pool.apply_async(bss_eval, (
                                mixed_wav_orig,
                                gt_wav_inst_orig,
                                gt_wav_vocal_orig,
                                y_wav_inst_orig,
                                y_wav_vocal_orig,
                            )))
                with redirect.ConsoleAndFile(
                        "./eval_output/mir2_%s_%d_step%d.txt" %
                        (cfg.gene_ver, cfg.gene_value, step_idx)) as r:
                    gnsdr = 0.0
                    gsir = 0.0
                    gsar = 0.0
                    total_len = 0
                    for name, ret in zip(name_list, ret_list):
                        nsdr, sir, sar, lens = ret.get()
                        printstr = "%s %s %s %s" % (name, nsdr, sir, sar)
                        r.print(printstr)
                        total_len += lens
                        gnsdr += nsdr * lens
                        gsir += sir * lens
                        gsar += sar * lens
                    r.print("Final results")
                    r.print("GNSDR [Accompaniments, voice]")
                    r.print(gnsdr / total_len)
                    r.print("GSIR [Accompaniments, voice]")
                    r.print(gsir / total_len)
                    r.print("GSAR [Accompaniments, voice]")
                    r.print(gsar / total_len)
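
The overlapped-window loop above is the core pattern shared by all of these examples. The following is a minimal standalone sketch of the same idea, assuming a generic callable model(window) that maps a (64, n_feature) magnitude block to an output of the same shape; the names separate_spectrogram and model are illustrative and do not appear in the original code:

def separate_spectrogram(model, spec_mag, win=64, hop=32):
    # Minimal sketch of overlapped-window inference. `model` maps a
    # (win, n_feature) block to an output of the same shape.
    # Assumes spec_mag has at least `win` frames.
    import numpy as np
    src_len, n_feature = spec_mag.shape
    out = np.zeros((src_len, n_feature), dtype=np.float32)
    start = 0
    while start + win < src_len:
        y = model(spec_mag[start:start + win])
        if start == 0:
            out[:win] = y  # first window is used in full
        else:
            # later windows keep only their centre half (16:48 for win=64)
            out[start + win // 4:start + 3 * win // 4] = \
                y[win // 4:3 * win // 4]
        start += hop
    # One final window over the last `win` frames covers the tail.
    y = model(spec_mag[src_len - win:src_len])
    covered = start + win // 4  # frames already written
    out[covered:] = y[win - (src_len - covered):]
    return out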
Example #2
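A fragment of a multi-channel splitting script built on the same windowed inference: the tail window is patched in per channel, each channel estimate is converted back to a waveform with to_wav_file, and the stacked channels are scaled, clipped to 16-bit range, and written out with saveWav.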
                    start_idx += 32

                x_input[0, :, :, 0] = mixed_spec_mag[src_len - 64:src_len,
                                                     0:n_feature]
                y_output = sess.run(v_pred, feed_dict={p_input: x_input})
                src_start = src_len - start_idx - 16
                for i_ch in range(n_ch):
                    y_est[i_ch, start_idx + 16:src_len, :] = \
                        y_output[0, 64 - src_start:64, :, i_ch]

                y_est *= max_temp
                ch_wav_list = []
                for i_ch, ch_name in enumerate(ch_list):
                    y_wav = to_wav_file(y_est[i_ch, :, :],
                                        mixed_spec_phase[:, :n_feature],
                                        len_hop=1411)
                    ch_wav_list.append(y_wav[np.newaxis, np.newaxis, :])
                est_ch_list.append(np.concatenate(ch_wav_list, axis=0))
            print("* Save")
            out = np.concatenate(est_ch_list, axis=1)
            for i_ch, ch_name in enumerate(ch_list):
                os.makedirs("split_out_f/{}_{}_step{}/".format(
                    ver, gene, ckpt_step),
                            exist_ok=True)
                wav = np.clip(np.round(out[i_ch, :, :].T * 32767), -32768,
                              32767).astype(np.int16)
                saveWav(
                    "split_out_f/{}_{}_step{}/{}_{}.wav".format(
                        ver, gene, ckpt_step, ".".join(
Example #3
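The same evaluation flow on the DSD100 test partition: vocals ground truth is read from the Sources tree, the accompaniment is the mixture minus the vocals, and framewise SDR/SIR/SAR scores are flattened into a long-format pandas DataFrame saved as JSON.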
def main():
    import multiprocessing as mp
    import numpy as np
    import tensorflow as tf
    import os, sys
    import config as cfg
    #import librosa
    from mir_util import infer, to_spec, to_wav_file
    import scipy.signal as sp
    sys.path.append("../lib")
    from eval_util import bss_eval_sdr_framewise
    from common import loadWav
    import redirect, simpleopt
    import pandas as pd

    step_idx = int(simpleopt.get("step"))
    n_eval = simpleopt.get("first", None)
    with cfg.ConfigBoundary():
        batch_size = 1
        n_feature = cfg.frame_size // 2

        # Model
        print("* Initialize network")
        p_input = tf.compat.v1.placeholder(tf.float32,
                                           shape=(batch_size, 64, n_feature,
                                                  1),
                                           name="p_input")
        v_pred = infer(p_input, 2, False)
        if isinstance(v_pred, list):
            v_pred = v_pred[-1]
        v_pred = tf.clip_by_value(v_pred, 0.0, 1.0) * p_input

        x_input = np.zeros((batch_size, 64, n_feature, 1), dtype=np.float32)
        with tf.compat.v1.Session(config=cfg.sess_cfg) as sess:
            # Initialize variables, then restore the checkpoint state
            sess.run(tf.compat.v1.global_variables_initializer())

            print("* Load checkpoint")
            ckpt_path = os.path.join(cfg.DSD2Config.ckpt_path,
                                     "checkpoint-%d" % (step_idx, ))
            tf.compat.v1.train.Saver().restore(sess, ckpt_path)
            print(" :Loaded: `%s`" % (ckpt_path, ))

            os.makedirs("./eval_output", exist_ok=True)
            name_list = []
            ret_list = []
            with mp.Pool(processes=8, initializer=worker_main) as pool:
                for (root, dir_list, _) in os.walk(
                        os.path.join(cfg.dsd_path, "Mixtures", "Test")):
                    dir_list = sorted(dir_list)
                    if n_eval is not None:
                        dir_list = dir_list[:int(n_eval)]
                    for i_dir, d in enumerate(dir_list):
                        print("[%02d/%02d] STG1: `%s`" % (
                            i_dir + 1,
                            len(dir_list),
                            d,
                        ))
                        name_list.append(d)

                        filename_vocal = os.path.join(cfg.dsd_path, "Sources",
                                                      "Test", d, "vocals.wav")
                        filename_mix = os.path.join(cfg.dsd_path, "Mixtures",
                                                    "Test", d, "mixture.wav")

                        mixed_wav_orig, sr_orig = loadWav(
                            filename_mix
                        )  #librosa.load(filename_mix, sr=None, mono=True)
                        mixed_wav_orig = np.sum(mixed_wav_orig, axis=1)
                        gt_wav_vocal_orig, _ = loadWav(
                            filename_vocal
                        )  #librosa.load(filename_vocal, sr=None, mono=True)[0]
                        gt_wav_vocal_orig = np.sum(gt_wav_vocal_orig, axis=1)
                        gt_wav_inst_orig = mixed_wav_orig - gt_wav_vocal_orig

                        mixed_wav = sp.resample_poly(
                            mixed_wav_orig, cfg.sr, sr_orig
                        ).astype(
                            np.float32
                        )  #librosa.load(filename_mix, sr=cfg.sr, mono=True)[0]
                        gt_wav_vocal = sp.resample_poly(
                            gt_wav_vocal_orig, cfg.sr, sr_orig
                        ).astype(
                            np.float32
                        )  #librosa.load(filename_vocal, sr=cfg.sr, mono=True)[0]
                        gt_wav_inst = mixed_wav - gt_wav_vocal
                        mixed_spec = to_spec(mixed_wav)
                        mixed_spec_mag = np.abs(mixed_spec)
                        mixed_spec_phase = np.angle(mixed_spec)
                        max_tmp = np.max(mixed_spec_mag)
                        mixed_spec_mag = mixed_spec_mag / max_tmp

                        src_len = mixed_spec_mag.shape[0]
                        start_idx = 0
                        y_est_inst = np.zeros((src_len, n_feature),
                                              dtype=np.float32)
                        y_est_vocal = np.zeros((src_len, n_feature),
                                               dtype=np.float32)
                        while start_idx + 64 < src_len:
                            x_input[0, :, :,
                                    0] = mixed_spec_mag[start_idx:start_idx +
                                                        64, :n_feature]
                            y_output = sess.run(v_pred,
                                                feed_dict={p_input: x_input})
                            if start_idx == 0:
                                y_est_inst[start_idx:start_idx +
                                           64, :] = y_output[0, :, :, 0]
                                y_est_vocal[start_idx:start_idx +
                                            64, :] = y_output[0, :, :, 1]
                            else:
                                y_est_inst[start_idx + 16:start_idx +
                                           48, :] = y_output[0, 16:48, :, 0]
                                y_est_vocal[start_idx + 16:start_idx +
                                            48, :] = y_output[0, 16:48, :, 1]
                            start_idx += 32

                        x_input[0, :, :,
                                0] = mixed_spec_mag[src_len -
                                                    64:src_len, :n_feature]
                        y_output = sess.run(v_pred,
                                            feed_dict={p_input: x_input})
                        src_start = src_len - start_idx - 16
                        y_est_inst[start_idx + 16:src_len, :] = \
                            y_output[0, 64 - src_start:64, :, 0]
                        y_est_vocal[start_idx + 16:src_len, :] = \
                            y_output[0, 64 - src_start:64, :, 1]

                        y_est_inst *= max_tmp
                        y_est_vocal *= max_tmp
                        y_wav_inst = to_wav_file(
                            y_est_inst, mixed_spec_phase[:, :n_feature])
                        y_wav_vocal = to_wav_file(
                            y_est_vocal, mixed_spec_phase[:, :n_feature])
                        #saveWav("inst.wav", y_wav_inst, cfg.sr)
                        #saveWav("vocal.wav", y_wav_vocal, cfg.sr)

                        # upsample back to the original sample rate
                        y_wav_inst_orig = sp.resample_poly(
                            y_wav_inst, sr_orig, cfg.sr).astype(
                                np.float32
                            )  #librosa.resample(y_wav_inst, cfg.sr, sr_orig)
                        y_wav_vocal_orig = sp.resample_poly(
                            y_wav_vocal, sr_orig, cfg.sr).astype(
                                np.float32
                            )  #librosa.resample(y_wav_vocal, cfg.sr, sr_orig)

                        ret_list.append(
                            pool.apply_async(bss_eval_sdr_framewise, (
                                np.array([gt_wav_inst_orig, gt_wav_vocal_orig],
                                         dtype=np.float32),
                                np.array([y_wav_inst_orig, y_wav_vocal_orig],
                                         dtype=np.float32),
                            )))

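                # Long-format results: one row per (track, target, metric,
                # frame index).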
                head_list = [
                    "method", "track", "target", "metric", "score", "time"
                ]
                row_list = []
                # Ensure the output directory exists before writing.
                os.makedirs("./old_fw", exist_ok=True)
                out_path = "./old_fw/dsd2_%s_%d_step%d.json" % (
                    cfg.gene_ver, cfg.gene_value, step_idx)
                method_name = "dsd2_%s_%d_step%d" % (cfg.gene_ver,
                                                     cfg.gene_value, step_idx)
                for name, ret in zip(name_list, ret_list):
                    print(name)
                    sdr, sir, sar = ret.get()
                    for i_target, target in enumerate(
                            ("accompaniment", "vocals")):
                        for metric, scores in (("SDR", sdr), ("SIR", sir),
                                               ("SAR", sar)):
                            for i, v in enumerate(scores[i_target]):
                                row_list.append((
                                    method_name,
                                    name,
                                    target,
                                    metric,
                                    v,
                                    i,
                                ))
                out = pd.DataFrame(row_list, columns=head_list).reset_index()
                print(out)
                out.to_json(out_path)
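
The JSON written above is a long-format table with the columns listed in head_list. A minimal sketch of how it might be summarised with pandas, assuming the file produced by this script (the concrete filename below is illustrative): frames are aggregated by median per track, then medians are taken over tracks.

import pandas as pd

df = pd.read_json("./old_fw/dsd2_example_0_step0.json")  # illustrative path
# Median over frames within each track, then median over tracks.
per_track = df.groupby(["target", "metric", "track"])["score"].median()
print(per_track.groupby(level=["target", "metric"]).median())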
Example #4
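An estimation pass over the MUSDB18 test set via the musdb package: each stereo channel is separated independently using a 5644-sample frame, and the per-track estimates are saved with mus.save_estimates.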
def main_estimate(pool):
    import numpy as np
    import tensorflow as tf
    import os, sys, pathlib
    sys.path.append("../lib")
    import config as cfg
    #import librosa
    from mir_util import infer, to_spec, to_wav_file
    import scipy.signal as sp
    import musdb, museval
    import simpleopt
    sys.is_train = False

    step_idx = int(simpleopt.get("step"))
    n_eval = simpleopt.get("first", None)
    if n_eval is not None:
        n_eval = int(n_eval)
        assert n_eval > 0

    sound_sample_root = simpleopt.get("sound-out", None)
    source = simpleopt.get("source")
    if source == "vocals":
        source = None

    with cfg.ConfigBoundary():
        if source is None:
            model_name = "mus2f_%s_%d_step%d" % (
                cfg.gene_ver,
                cfg.gene_value,
                step_idx,
            )
        else:
            model_name = "mus2f_%s_%d_step%d_%s" % (
                cfg.gene_ver,
                cfg.gene_value,
                step_idx,
                source,
            )
        model_name_nosrc = "mus2f_%s_%d_step%d" % (
            cfg.gene_ver,
            cfg.gene_value,
            step_idx,
        )
        if sound_sample_root is None:
            sound_sample_root = "./sound_output_mus2f/{}".format(
                model_name_nosrc)
        pathlib.Path(sound_sample_root).mkdir(parents=True, exist_ok=True)
        ckpt_path = cfg.MUS2FConfig.ckpt_path
        # `source` is None for the default vocals model; only explicit
        # alternative sources use a suffixed checkpoint directory.
        if source is not None:
            ckpt_path = "{}_{}".format(ckpt_path, source)

        batch_size = 1
        n_feature = 5644 // 2

        # Model
        print("* Initialize network")
        p_input = tf.compat.v1.placeholder(tf.float32,
                                           shape=(batch_size, 64, n_feature,
                                                  1),
                                           name="p_input")
        v_pred = infer(p_input, 2, False)
        if isinstance(v_pred, list):
            v_pred = v_pred[-1]

        with tf.compat.v1.Session(config=cfg.sess_cfg) as sess:
            # Initialize variables, then restore the checkpoint state
            sess.run(tf.compat.v1.global_variables_initializer())

            print("* Load checkpoint")
            ckpt_path = os.path.join(ckpt_path, "checkpoint-%d" % (step_idx, ))
            tf.compat.v1.train.Saver().restore(sess, ckpt_path)
            print(" :Loaded: `%s`" % (ckpt_path, ))

            os.makedirs("./eval_output", exist_ok=True)
            name_list = []
            ret_list = []

            mus = musdb.DB(root=cfg.mus_root_path,
                           download=False,
                           subsets="test",
                           is_wav=True)
            mus_trk_list = list(mus.tracks)
            mus_trk_list.sort(key=lambda x: x.name)
            assert len(mus_trk_list) > 0
            if n_eval is not None:
                mus_trk_list = mus_trk_list[:n_eval]

            results = museval.EvalStore()

            for i_song, track in enumerate(mus_trk_list):
                print("[%02d/%02d] Estimate: `%s`" % (
                    i_song + 1,
                    len(mus_trk_list),
                    track.name,
                ))
                voc_ch_list = []
                inst_ch_list = []
                for i_channel in range(2):
                    print(" :Channel #%d" % (i_channel, ))
                    name_list.append(track.name + " Channel %d" %
                                     (i_channel, ))

                    mixed_wav = track.audio[:, i_channel]
                    mixed_spec = to_spec(mixed_wav,
                                         len_frame=5644,
                                         len_hop=5644 // 4)
                    mixed_spec_mag = np.abs(mixed_spec)
                    mixed_spec_phase = np.angle(mixed_spec)
                    max_tmp = np.max(mixed_spec_mag)
                    mixed_spec_mag = mixed_spec_mag / max_tmp

                    src_len = mixed_spec_mag.shape[0]
                    start_idx = 0
                    y_est_inst = np.zeros((src_len, n_feature),
                                          dtype=np.float32)
                    y_est_vocal = np.zeros((src_len, n_feature),
                                           dtype=np.float32)
                    x_input = np.zeros((batch_size, 64, n_feature, 1),
                                       dtype=np.float32)
                    while start_idx + 64 < src_len:
                        x_input[0, :, :,
                                0] = mixed_spec_mag[start_idx:start_idx +
                                                    64, :n_feature]
                        y_output = sess.run(v_pred,
                                            feed_dict={p_input: x_input})
                        if start_idx == 0:
                            y_est_inst[start_idx:start_idx +
                                       64, :] = y_output[0, :, :, 0]
                            y_est_vocal[start_idx:start_idx +
                                        64, :] = y_output[0, :, :, 1]
                        else:
                            y_est_inst[start_idx + 16:start_idx +
                                       48, :] = y_output[0, 16:48, :, 0]
                            y_est_vocal[start_idx + 16:start_idx +
                                        48, :] = y_output[0, 16:48, :, 1]
                        start_idx += 32

                    x_input[0, :, :,
                            0] = mixed_spec_mag[src_len -
                                                64:src_len, :n_feature]
                    y_output = sess.run(v_pred, feed_dict={p_input: x_input})
                    src_start = src_len - start_idx - 16
                    y_est_inst[start_idx + 16:src_len, :] = \
                        y_output[0, 64 - src_start:64, :, 0]
                    y_est_vocal[start_idx + 16:src_len, :] = \
                        y_output[0, 64 - src_start:64, :, 1]

                    y_est_inst *= max_tmp
                    y_est_vocal *= max_tmp
                    y_wav_inst = to_wav_file(y_est_inst,
                                             mixed_spec_phase[:, :n_feature],
                                             len_hop=5644 // 4)
                    y_wav_vocal = to_wav_file(y_est_vocal,
                                              mixed_spec_phase[:, :n_feature],
                                              len_hop=5644 // 4)

                    voc_ch_list.append(y_wav_vocal.reshape(
                        y_wav_vocal.size, 1))
                    inst_ch_list.append(y_wav_inst.reshape(y_wav_inst.size, 1))
                    # Free the per-channel buffers before the next iteration.
                    del (y_wav_inst, y_wav_vocal, y_est_inst, y_est_vocal,
                         src_start, x_input, y_output, mixed_spec_mag,
                         max_tmp, mixed_spec_phase, mixed_spec, mixed_wav)
                # `source` is None for the default vocals target; key the
                # estimates dict by the actual target name.
                estimates = {
                    ("vocals" if source is None else source):
                        np.concatenate(voc_ch_list, axis=1),
                }
                del voc_ch_list, inst_ch_list
                if sound_sample_root:
                    mus.save_estimates(estimates, track, sound_sample_root)
                del estimates, i_song, track
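
This fragment sets up results = museval.EvalStore() but never fills it; the scoring step falls outside the excerpt. A rough sketch of how the estimates could be scored, assuming a recent museval API (these lines are not part of the original code):

                # inside the per-track loop, once `estimates` is built:
                track_scores = museval.eval_mus_track(track, estimates)
                results.add_track(track_scores)
            # after the loop, the aggregated scores can be inspected:
            print(results)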