Esempio n. 1
0
def main():
    """Compare a synthesized wav file against its original recording.

    Loads both files at the requested sample rate, trims the longer one so
    both have the same number of samples, then prints (depending on the
    ``include_*`` flags) the SDR, the average F0 L1 distance, the average
    loudness L1 distance, and the multi-scale spectrogram loss.
    """
    parser = argparse.ArgumentParser(description='Evaluate loudness and basic frequency (F0) L1 difference between a synthesized wav file to its original wav file')
    parser.add_argument('-sf', '--synthesized_file', type=str)
    parser.add_argument('-of', '--original_file', type=str)
    parser.add_argument('-sr', '--sample_rate', type=int, default=16000)
    parser.add_argument('-fr', '--frame_rate', type=int, default=250)
    parser.add_argument('-s', '--include_spectral', type=int, default=1)
    parser.add_argument('-f0', '--include_f0', type=int, default=1)
    parser.add_argument('-ld', '--include_ld', type=int, default=1)
    parser.add_argument('-sdr', '--include_sdr', type=int, default=1)

    args = parser.parse_args()

    # librosa 0.10 made every argument after the path keyword-only, so the
    # target sample rate must be passed as `sr=` rather than positionally.
    synth_audio, _ = librosa.load(args.synthesized_file, sr=args.sample_rate)
    original_audio, _ = librosa.load(args.original_file, sr=args.sample_rate)

    synth_audio_samples = synth_audio.shape[0]
    original_audio_samples = original_audio.shape[0]

    # Trim the longer signal so element-wise metrics are well defined.
    if synth_audio_samples < original_audio_samples:
        print(f"Trimming original audio samples from {original_audio_samples} to {synth_audio_samples}")
        original_audio_samples = synth_audio_samples
        original_audio = original_audio[:original_audio_samples]

    elif original_audio_samples < synth_audio_samples:
        print(f"Trimming synthesized audio samples from {synth_audio_samples} to {original_audio_samples}")
        synth_audio_samples = original_audio_samples
        synth_audio = synth_audio[:synth_audio_samples]

    if args.include_sdr:
        print(f"SDR: {_calc_sdr(synth_audio, original_audio)}")

    if args.include_f0:
        # compute_f0 returns (f0_hz, f0_confidence); only f0_hz is compared.
        print("Calculating F0 for synthesized audio")
        synth_f0 = spectral_ops.compute_f0(synth_audio, args.sample_rate, args.frame_rate)[0]

        print("Calculating F0 for original audio")
        original_f0 = spectral_ops.compute_f0(original_audio, args.sample_rate, args.frame_rate)[0]
        f0_l1 = np.mean(abs(synth_f0 - original_f0))
        print(f"Average F0 L1: {f0_l1}")

    if args.include_ld:
        print("Calculating loudness for synthesized audio")
        synth_loudness = spectral_ops.compute_loudness(synth_audio, args.sample_rate, args.frame_rate)

        print("Calculating loudness for original audio")
        original_loudness = spectral_ops.compute_loudness(original_audio, args.sample_rate, args.frame_rate)

        loudness_l1 = np.mean(abs(synth_loudness - original_loudness))
        print(f"Average Loudness L1: {loudness_l1}")

    if args.include_spectral:
        # Imported lazily: ddsp pulls in TensorFlow, which is slow to load.
        from ddsp import losses
        loss_obj = losses.SpectralLoss(mag_weight=1.0, logmag_weight=1.0)
        spectral_loss = loss_obj(synth_audio, original_audio)
        print(f"Average Multi-scale spectrogram loss: {spectral_loss}")
Esempio n. 2
0
 def test_compute_f0_at_sample_rate(self, sample_rate, audio_len_sec):
     """compute_f0 yields one finite value per frame for a pure sinusoid."""
     sinusoid = self._gen_np_sinusoid(sample_rate, audio_len_sec)
     f0_hz, f0_confidence = spectral_ops.compute_f0(
         sinusoid, sample_rate, self.frame_rate)
     # One frame per 1/frame_rate seconds of audio.
     expected_len = int(self.frame_rate * audio_len_sec)
     for series in (f0_hz, f0_confidence):
         self.assertLen(series, expected_len)
         self.assertTrue(np.all(np.isfinite(series)))
Esempio n. 3
0
def prepare_partial_tfrecord(
        dataset_dir='nsynth_guitar',
        split='train',
        sample_rate=16000,
        frame_rate=250
):
    """Extract features for one NSynth split and write a partial TFRecord.

    Reads ``<dataset_dir>/<split>/examples.json`` for the split's metadata,
    loads each wav from the split's ``audio`` directory, computes F0 and
    loudness with ``spectral_ops``, and serializes one tf.train.Example per
    sample into ``<dataset_dir>/<split>/partial.tfrecord``.
    """
    split_dir = os.path.join(dataset_dir, split)
    audio_dir = os.path.join(split_dir, 'audio')
    nsynth_dataset_file = os.path.join(split_dir, 'examples.json')
    partial_tfrecord_file = os.path.join(split_dir, 'partial.tfrecord')

    with open(nsynth_dataset_file, 'r') as fp:
        examples = json.load(fp)

    total = len(examples)

    with tf.io.TFRecordWriter(partial_tfrecord_file) as writer:
        for idx, (name, meta) in enumerate(examples.items()):
            tic = time.perf_counter()

            wav_path = os.path.join(audio_dir, '{}.wav'.format(name))

            audio = _load_audio(wav_path, sample_rate)
            f0_hz, f0_confidence = spectral_ops.compute_f0(
                audio, sample_rate, frame_rate)
            loudness_db = spectral_ops.compute_loudness(
                audio, sample_rate, frame_rate, 2048)

            # Cast everything to float32 so the tf.train.Feature protos stay
            # compact and consistent.
            feature = {
                'sample_name': _byte_feature([str.encode(name)]),
                'note_number': _int64_feature([meta['pitch']]),
                'velocity': _int64_feature([meta['velocity']]),
                'instrument_source': _int64_feature([meta['instrument_source']]),
                'qualities': _int64_feature(meta['qualities']),
                'audio': _float_feature(audio.astype(np.float32)),
                'f0_hz': _float_feature(f0_hz.astype(np.float32)),
                'f0_confidence': _float_feature(f0_confidence.astype(np.float32)),
                'loudness_db': _float_feature(loudness_db.astype(np.float32)),
            }

            tf_example = tf.train.Example(
                features=tf.train.Features(feature=feature))
            writer.write(tf_example.SerializeToString())

            toc = time.perf_counter()
            print('{}/{} - sample_name: {} - elapsed_time: {:.3f}'.format(
                idx + 1, total, name, toc - tic))
Esempio n. 4
0
def _add_f0_estimate(ex, sample_rate, frame_rate):
  """Add fundamental frequency (f0) estimate using CREPE."""
  beam.metrics.Metrics.counter('prepare-tfrecord', 'estimate-f0').inc()
  f0_hz, f0_confidence = spectral_ops.compute_f0(
      ex['audio'], sample_rate, frame_rate)
  # Return a shallow copy with the two f0 features merged in; the input
  # element is left untouched (Beam elements should not be mutated).
  return dict(
      ex,
      f0_hz=f0_hz.astype(np.float32),
      f0_confidence=f0_confidence.astype(np.float32))
Esempio n. 5
0
def compute_audio_features(audio,
                           n_fft=2048,
                           sample_rate=16000,
                           frame_rate=250):
  """Compute features from audio."""
  # Loudness and f0 are computed on the squeezed signal, but the dict keeps
  # the caller's original (possibly batched) audio untouched.
  squeezed = squeeze(audio)
  loudness_db = spectral_ops.compute_loudness(
      squeezed, sample_rate, frame_rate, n_fft)
  f0_hz, f0_confidence = spectral_ops.compute_f0(
      squeezed, sample_rate, frame_rate)
  return {
      'audio': audio,
      'loudness_db': loudness_db,
      'f0_hz': f0_hz,
      'f0_confidence': f0_confidence,
  }
Esempio n. 6
0
    def _encode(self):
        """Write 4-second training windows to the TFRecord at ``self.tfr``.

        For each file name listed in the ``s1`` wav directory, loads the
        matching mix/s1/s2/s3 wavs (the four sub-directories are assumed to
        share file names — TODO confirm), computes per-source F0 (CREPE) and
        loudness at ``self.frame_rate``, then slices the clip into
        non-overlapping 4-second windows and writes one tf.train.Example per
        window.
        """
        logging.info("Writing {}".format(self.tfr))
        with tf.python_io.TFRecordWriter(self.tfr) as writer:
            mix_wav_dir = os.path.join(self.wav_dir, "mix")
            s1_wav_dir = os.path.join(self.wav_dir, "s1")
            s2_wav_dir = os.path.join(self.wav_dir, "s2")
            s3_wav_dir = os.path.join(self.wav_dir, "s3")
            # The s1 directory drives the iteration; mix/s2/s3 are looked up
            # by the same file name.
            filenames = os.listdir(s1_wav_dir)
            for filename in tqdm(filenames):
                logging.info("Preprocessing %s" %
                             (os.path.join(mix_wav_dir, filename)))
                mix, _ = librosa.load(os.path.join(mix_wav_dir, filename),
                                      self.sample_rate)

                logging.info("Preprocessing %s" %
                             (os.path.join(s1_wav_dir, filename)))
                s1, _ = librosa.load(os.path.join(s1_wav_dir, filename),
                                     self.sample_rate)
                # compute_f0 returns (f0_hz, f0_confidence); only f0_hz kept.
                s1_f0 = spectral_ops.compute_f0(s1, self.sample_rate,
                                                self.frame_rate)[0]
                s1_loudness = spectral_ops.compute_loudness(
                    s1, self.sample_rate, self.frame_rate)

                logging.info("Preprocessing %s" %
                             (os.path.join(s2_wav_dir, filename)))
                s2, _ = librosa.load(os.path.join(s2_wav_dir, filename),
                                     self.sample_rate)
                s2_f0 = spectral_ops.compute_f0(s2, self.sample_rate,
                                                self.frame_rate)[0]
                s2_loudness = spectral_ops.compute_loudness(
                    s2, self.sample_rate, self.frame_rate)

                logging.info("Preprocessing %s" %
                             (os.path.join(s3_wav_dir, filename)))
                s3, _ = librosa.load(os.path.join(s3_wav_dir, filename),
                                     self.sample_rate)
                s3_f0 = spectral_ops.compute_f0(s3, self.sample_rate,
                                                self.frame_rate)[0]
                s3_loudness = spectral_ops.compute_loudness(
                    s3, self.sample_rate, self.frame_rate)

                # Map an audio sample index to the corresponding feature
                # frame index (frame_rate frames per sample_rate samples).
                def sample_to_frame(sample_num):
                    return int(self.frame_rate * sample_num / self.sample_rate)

                # Emit one Example covering audio samples [l, r) together
                # with the matching slice of the frame-rate features.
                def write(l, r):
                    l_frame = sample_to_frame(l)
                    r_frame = sample_to_frame(r)

                    example = tf.train.Example(features=tf.train.Features(
                        feature={
                            "mix_audio":
                            self._float_list_feature(mix[l:r]),
                            "s1_audio":
                            self._float_list_feature(s1[l:r]),
                            "s1_f0":
                            self._float_list_feature(s1_f0[l_frame:r_frame]),
                            "s1_loudness":
                            self._float_list_feature(
                                s1_loudness[l_frame:r_frame]),
                            "s2_audio":
                            self._float_list_feature(s2[l:r]),
                            "s2_f0":
                            self._float_list_feature(s2_f0[l_frame:r_frame]),
                            "s2_loudness":
                            self._float_list_feature(
                                s2_loudness[l_frame:r_frame]),
                            "s3_audio":
                            self._float_list_feature(s3[l:r]),
                            "s3_f0":
                            self._float_list_feature(s3_f0[l_frame:r_frame]),
                            "s3_loudness":
                            self._float_list_feature(
                                s3_loudness[l_frame:r_frame]),
                        }))
                    writer.write(example.SerializeToString())

                # Skip clips shorter than 4 seconds; otherwise slide a
                # non-overlapping 4-second window over the clip.
                # NOTE(review): range(0, now_length - target_length, stride)
                # is empty when now_length == target_length, so clips of
                # exactly 4 seconds produce no windows — confirm intended.
                now_length = s1.shape[-1]
                if now_length < int(4 * self.sample_rate):
                    continue
                target_length = int(4 * self.sample_rate)
                stride = int(4 * self.sample_rate)
                for i in range(0, now_length - target_length, stride):
                    write(i, i + target_length)
Esempio n. 7
0
def main():
    """Evaluate MCTN output loudness & F0 against originals, per instrument.

    Loads the MCTN-predicted loudness/F0 arrays for vocals, bass, and drums
    from ``results_dir``, then for each instrument loads the original wav,
    trims it to the predicted length, recomputes F0/loudness on it, and logs
    the average L1 distances.
    """
    parser = argparse.ArgumentParser(
        description='Evaluate MCTN output loudness & F0 of every instrument')
    parser.add_argument('-ld', '--log_dir', type=str, default='./mctn_log')
    parser.add_argument('-rd', '--results_dir', type=str, default='./results')
    parser.add_argument('-sr', '--sample_rate', type=int, default=16000)
    parser.add_argument('-fr', '--frame_rate', type=int, default=250)
    parser.add_argument('-osd',
                        '--original_sound_dir',
                        type=str,
                        default='./results/DDSP - Same artist - Test')

    args = parser.parse_args()
    args.log_file = os.path.join(args.log_dir, 'log.txt')

    # Log both to a file and to stdout.
    logger = logging.getLogger()
    logger.addHandler(logging.FileHandler(args.log_file))
    logger.addHandler(logging.StreamHandler(sys.stdout))
    logger.setLevel(logging.INFO)

    # Prepare input from files
    logger.info("Loading data from mctn files")
    audio_features_vocals = {
        'loudness_db':
        np.load(os.path.join(args.results_dir, "vocals_loudness_db.npy"),
                allow_pickle=False),
        'f0_hz':
        np.load(os.path.join(args.results_dir, "vocals_f0_hz.npy"),
                allow_pickle=False),
    }
    audio_features_bass = {
        'loudness_db':
        np.load(os.path.join(args.results_dir, "bass_loudness_db.npy"),
                allow_pickle=False),
        'f0_hz':
        np.load(os.path.join(args.results_dir, "bass_f0_hz.npy"),
                allow_pickle=False),
    }
    audio_features_drums = {
        'loudness_db':
        np.load(os.path.join(args.results_dir, "drums_loudness_db.npy"),
                allow_pickle=False),
        'f0_hz':
        np.load(os.path.join(args.results_dir, "drums_f0_hz.npy"),
                allow_pickle=False),
    }

    # Calc average loudness & F0 diff for every instrument
    for instrument, audio_features in [("bass", audio_features_bass),
                                       ("drums", audio_features_drums),
                                       ("vocals", audio_features_vocals)]:
        # librosa 0.10 requires `sr` as a keyword argument.
        original_audio, _ = librosa.load(
            os.path.join(args.original_sound_dir,
                         f"original_{instrument}.wav"), sr=args.sample_rate)

        # Number of audio samples covered by the predicted frames.
        # BUG FIX: previously read audio_features_vocals["f0_hz"], so bass
        # and drums were trimmed using the vocals' frame count.
        synth_audio_samples = audio_features["f0_hz"].shape[
            1] * args.sample_rate // args.frame_rate
        original_audio_samples = original_audio.shape[0]

        if synth_audio_samples < original_audio_samples:
            logger.info(
                f"Trimming original {instrument} audio samples from {original_audio_samples} to {synth_audio_samples}"
            )
            original_audio_samples = synth_audio_samples
            original_audio = original_audio[:original_audio_samples]

        # Assuming only 1 batch
        synth_f0 = audio_features["f0_hz"][0]
        synth_loudness = audio_features["loudness_db"][0]

        logger.info(f"Calculating F0 for {instrument} original audio")
        original_f0 = spectral_ops.compute_f0(original_audio, args.sample_rate,
                                              args.frame_rate)[0]
        logger.info(f"Calculating loudness for {instrument} original audio")
        original_loudness = spectral_ops.compute_loudness(
            original_audio, args.sample_rate, args.frame_rate)

        f0_l1 = np.mean(abs(synth_f0 - original_f0))
        loudness_l1 = np.mean(abs(synth_loudness - original_loudness))
        logger.info(f"Average {instrument} F0 L1: {f0_l1}")
        logger.info(f"Average {instrument} Loudness L1: {loudness_l1}")
Esempio n. 8
0
def main(argv):
    """Align OBD vehicle telemetry with engine audio and pickle the result.

    Pipeline: (1) unpickle per-command OBD time series, (2) estimate f0 from
    the audio with CREPE (or load a cached estimate), (3) derive an f0 track
    from the RPM signal and cross-correlate it with the CREPE f0 to find the
    audio/OBD lag, (4) trim the audio accordingly, (5) resample all OBD
    signals to FLAGS.frame_rate, and (6) save everything to data.pickle.
    Optional diagnostic plots are produced along the way.
    """
    # Check preconditions
    if "RPM" not in FLAGS.commands:
        raise Exception(
            "RPM must be part of the OBD commands since it is used for OBD-audio alignment."
        )
    elif FLAGS.commands[
            0] != "RPM":  # Make sure RPM is first in commands list (necessary for later)
        i_RPM = FLAGS.commands.index("RPM")
        FLAGS.commands[i_RPM] = FLAGS.commands[0]
        FLAGS.commands[0] = "RPM"
    if FLAGS.data_name is None:
        FLAGS.data_name = get_timestamp()

    # Create save folder
    save_dir = os.path.join(FLAGS.save_dir, FLAGS.data_name)
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    if FLAGS.plot_mode == "save":
        plots_dir = os.path.join(save_dir, "plots")
        if not os.path.exists(plots_dir):
            os.makedirs(plots_dir)

    # Load OBD data
    n_commands = len(FLAGS.commands)
    c_dict = dict()
    if "obd" in FLAGS.plots:
        # plt.subplots returns a bare Axes (not an array) when n == 1.
        _, axes = plt.subplots(n_commands, 1, figsize=(15, 3 * n_commands))
        axes = list(iter(axes)) if n_commands > 1 else [axes]
    for i, c in enumerate(FLAGS.commands):
        c_pickle_path = os.path.join(FLAGS.data_dir, c + ".pickle")
        logging.info("Unpickling %s data..." % c)
        with open(c_pickle_path, "rb") as f:
            c_list = pickle.load(f)
        # RPM is guaranteed to be first (see precondition above), so t0 is
        # always bound before any other command uses it.
        if c == "RPM":
            t0 = c_list.times[0]  # Let times be relative to RPM's initial time

        logging.info("Preprocessing data...")
        unit = str(c_list.values[0].units)
        y, t = preprocess_obd_data(c_list, t0=t0)
        c_dict[c] = {"values": y, "times": t}

        if "obd" in FLAGS.plots:
            logging.info("Plotting %s data..." % c)
            ax = axes[i]
            ax.plot(t, y, "-x")
            ax.set_title(c)
            ax.set_xlabel("time [s]")
            ax.set_ylabel("%s" % unit)
    if "obd" in FLAGS.plots:
        plt.tight_layout()
        if FLAGS.plot_mode == "save":
            # NOTE(review): file name "odb.pdf" looks like a typo for
            # "obd.pdf" — confirm before renaming (downstream tools may
            # already expect the current name).
            plot_path = os.path.join(plots_dir, "odb.pdf")
            logging.info("Saving plot to '%s'...", plot_path)
            plt.savefig(plot_path)
            plt.close()

    # Estimate f0 from audio (if not already done)
    f0_frame_size = _CREPE_FRAME_SIZE // 2  #16
    f0_frame_rate = _CREPE_SAMPLE_RATE / f0_frame_size  # frame rate of f0 and loudness features
    f0_path = os.path.join(FLAGS.data_dir, "f0.npy")
    audio_path = os.path.join(FLAGS.data_dir, FLAGS.audio_filename)
    audio, _ = librosa.load(audio_path, FLAGS.sample_rate)
    if FLAGS.max_mins > 0.0:
        logging.info("Cropping out first %.1f minutes of audio..." %
                     FLAGS.max_mins)
        n_samples = int(FLAGS.max_mins * 60 * FLAGS.sample_rate)
        audio = audio[:n_samples]
    if os.path.exists(f0_path):
        logging.info("Using precomputed f0.")
        f0 = np.load(f0_path)
    elif FLAGS.skip_f0_sync:
        logging.info("Skipping f0 estimation since skipping sync.")
        # Placeholder f0 so downstream shapes stay valid when not syncing.
        f0 = np.zeros((f0_frame_size, ))
    else:
        logging.info("Estimating f0 using CREPE...")
        # CREPE expects audio at its own sample rate; downsample first.
        audio_lores = librosa.core.resample(audio, FLAGS.sample_rate,
                                            _CREPE_SAMPLE_RATE)
        f0, _ = compute_f0(audio_lores, _CREPE_SAMPLE_RATE, f0_frame_rate)
        np.save(f0_path, f0)

    # Interpolate OBD quantities to allow for upsampling later
    c_interp = dict()
    for c in FLAGS.commands:
        logging.info("Interpolating %s signal..." % c)
        c_interp[c] = interp1d(c_dict[c]["times"],
                               c_dict[c]["values"],
                               kind=FLAGS.interp_method)

    # Scale interpolated RPM signal to become f0 and sample on CREPE f0 times
    logging.info("Upsampling RPM signal...")
    f0_times = np.arange(0., len(f0)) / f0_frame_rate
    # Only evaluate the interpolant inside the RPM signal's time range.
    f0_rpm_times = f0_times[f0_times < np.max(c_dict["RPM"]["times"])]
    # RPM -> revolutions per second; NOTE(review): the four-stroke doubling
    # assumes engine f0 scales as 2x rev frequency — confirm for the target
    # engine configuration.
    rpm_scale = 1. / 60
    if FLAGS.engine_type == "four-stroke":
        rpm_scale *= 2
    f0_rpm = rpm_scale * c_interp["RPM"](f0_rpm_times)

    # Find lag between f0 and RPM
    if not FLAGS.skip_f0_sync:
        logging.info("Calculating lag between f0 and RPM...")
        # Mean-removed cross-correlation; peak index gives the frame lag of
        # the CREPE f0 relative to the RPM-derived f0.
        xcorr = correlate(f0 - np.mean(f0), f0_rpm - np.mean(f0_rpm))
        lag = xcorr.argmax() - (len(f0_rpm) - 1)
        logging.info("Found lag: %d frames (%.3f seconds)" %
                     (lag, lag / f0_frame_rate))
    else:
        logging.info("Will not sync f0 and RPM. Uses lag 0.")
        lag = 0
    if "f0-rpm" in FLAGS.plots:
        logging.info("Plotting RPM alignment...")
        _, axes = plt.subplots(2, 1, figsize=(15, 6))
        axes[0].plot(f0_times, f0, label="f0")
        axes[0].plot(f0_rpm_times, f0_rpm, label="f0-rpm")
        axes[0].set_title("Before alignment")
        axes[1].plot(f0_times - lag / f0_frame_rate, f0, label="f0")
        axes[1].plot(f0_rpm_times, f0_rpm, label="f0-rpm")
        axes[1].set_title("After alignment")
        plt.tight_layout()
        if FLAGS.plot_mode == "save":
            plot_path = os.path.join(plots_dir, "f0_rpm.pdf")
            logging.info("Saving plot to '%s'...", plot_path)
            plt.savefig(plot_path)
            plt.close()

    # Trim audio according to lag
    # Convert the frame-domain lag/length to sample indices into the audio.
    start = int(lag * FLAGS.sample_rate / f0_frame_rate)
    end = start + int(len(f0_rpm) * FLAGS.sample_rate / f0_frame_rate) + 1
    audio_trimmed = audio[start:end]
    audio_trimmed_length = len(audio_trimmed) / float(FLAGS.sample_rate)
    logging.info("Trimmed audio is %.3f seconds." % audio_trimmed_length)
    if "f0-audio" in FLAGS.plots:
        logging.info("Plotting audio spectrogram with f0...")
        fmax = 2**13
        n_fft = 2**13
        plt.figure(figsize=(15, 6))
        S = librosa.feature.melspectrogram(y=audio_trimmed,
                                           sr=FLAGS.sample_rate,
                                           n_fft=n_fft,
                                           n_mels=1024,
                                           fmax=fmax)
        S_dB = librosa.power_to_db(S, ref=np.max)
        ax = specshow(S_dB,
                      x_axis='time',
                      y_axis='mel',
                      sr=FLAGS.sample_rate,
                      fmax=fmax)
        f0_h, = ax.plot(f0_rpm_times, f0_rpm, "--")
        ax.set_ylim((0, 5 * np.max(f0_rpm)))
        ax.set_xlabel("time [s]")
        ax.set_ylabel("frequency [Hz]")
        ax.legend([f0_h], ["synched f0 from RPM"], loc="upper right")
        plt.tight_layout()
        if FLAGS.plot_mode == "save":
            plot_path = os.path.join(plots_dir, "f0_audio.pdf")
            logging.info("Saving plot to '%s'...", plot_path)
            plt.savefig(plot_path)
            plt.close()

    # Resample OBD signals and store together with audio in dict
    logging.info("Resampling input signals to given frame rate...")
    # Clamp the common time axis to the shortest command's range so every
    # interpolant is evaluated strictly inside its support.
    time_minmax = np.inf
    for c in FLAGS.commands:
        c_max = np.max(c_dict[c]["times"])
        time_minmax = time_minmax if time_minmax < c_max else c_max
    input_times = np.arange(0., time_minmax, 1 / FLAGS.frame_rate)
    f0_signal = rpm_scale * c_interp["RPM"](input_times)
    data = {
        "sample_rate": FLAGS.sample_rate,
        "frame_rate": FLAGS.frame_rate,
        "audio": audio_trimmed,
        "inputs": {
            "f0": f0_signal
        }
    }
    for c in FLAGS.commands:
        data["inputs"][c] = c_interp[c](input_times)
    # NOTE(review): audio_times is computed but never used below — candidate
    # for removal.
    audio_times = np.arange(0., len(audio_trimmed)) / FLAGS.sample_rate
    data_path = os.path.join(save_dir, "data.pickle")
    logging.info("Saving data to %s..." % data_path)
    pickle.dump(data, open(data_path, "wb"))
    if "data" in FLAGS.plots:
        if FLAGS.plot_mode == "save":
            plot_path = os.path.join(plots_dir, "data.pdf")
            logging.info("Plotting and saving to '%s'...", plot_path)
            plot_data_dict(data, save_path=plot_path)
            plt.close()

    if FLAGS.plot_mode == "show":
        plt.show()
    logging.info("Done.")