def main():
    """Compare a synthesized wav file against its original.

    Depending on the include_* flags, reports SDR, average F0 L1,
    average loudness L1 and a multi-scale spectrogram loss. The longer
    of the two signals is trimmed so both cover the same sample span.
    """
    parser = argparse.ArgumentParser(
        description='Evaluate loudness and basic frequency (F0) L1 difference '
                    'between a synthesized wav file to its original wav file')
    parser.add_argument('-sf', '--synthesized_file', type=str)
    parser.add_argument('-of', '--original_file', type=str)
    parser.add_argument('-sr', '--sample_rate', type=int, default=16000)
    parser.add_argument('-fr', '--frame_rate', type=int, default=250)
    parser.add_argument('-s', '--include_spectral', type=int, default=1)
    parser.add_argument('-f0', '--include_f0', type=int, default=1)
    parser.add_argument('-ld', '--include_ld', type=int, default=1)
    parser.add_argument('-sdr', '--include_sdr', type=int, default=1)
    args = parser.parse_args()

    # Pass sr by keyword: librosa >= 0.10 makes it keyword-only, so the
    # positional form raises TypeError there. Keyword form works on all
    # librosa versions.
    synth_audio, _ = librosa.load(args.synthesized_file, sr=args.sample_rate)
    original_audio, _ = librosa.load(args.original_file, sr=args.sample_rate)

    synth_audio_samples = synth_audio.shape[0]
    original_audio_samples = original_audio.shape[0]
    # Trim the longer signal so both have the same number of samples.
    if synth_audio_samples < original_audio_samples:
        print(f"Trimming original audio samples from {original_audio_samples} to {synth_audio_samples}")
        original_audio_samples = synth_audio_samples
        original_audio = original_audio[:original_audio_samples]
    elif original_audio_samples < synth_audio_samples:
        print(f"Trimming synthesized audio samples from {synth_audio_samples} to {original_audio_samples}")
        synth_audio_samples = original_audio_samples
        synth_audio = synth_audio[:synth_audio_samples]

    if args.include_sdr:
        print(f"SDR: {_calc_sdr(synth_audio, original_audio)}")

    if args.include_f0:
        print("Calculating F0 for synthesized audio")
        synth_f0 = spectral_ops.compute_f0(synth_audio, args.sample_rate,
                                           args.frame_rate)[0]
        print("Calculating F0 for original audio")
        original_f0 = spectral_ops.compute_f0(original_audio, args.sample_rate,
                                              args.frame_rate)[0]
        f0_l1 = np.mean(np.abs(synth_f0 - original_f0))
        print(f"Average F0 L1: {f0_l1}")

    if args.include_ld:
        print("Calculating loudness for synthesized audio")
        synth_loudness = spectral_ops.compute_loudness(
            synth_audio, args.sample_rate, args.frame_rate)
        print("Calculating loudness for original audio")
        original_loudness = spectral_ops.compute_loudness(
            original_audio, args.sample_rate, args.frame_rate)
        loudness_l1 = np.mean(np.abs(synth_loudness - original_loudness))
        print(f"Average Loudness L1: {loudness_l1}")

    if args.include_spectral:
        # Local import kept from the original: ddsp is only needed for this
        # optional metric.
        from ddsp import losses
        loss_obj = losses.SpectralLoss(mag_weight=1.0, logmag_weight=1.0)
        spectral_loss = loss_obj(synth_audio, original_audio)
        print(f"Average Multi-scale spectrogram loss: {spectral_loss}")
def test_compute_f0_at_sample_rate(self, sample_rate, audio_len_sec):
    """compute_f0 returns finite f0/confidence of the expected frame count."""
    sinusoid = self._gen_np_sinusoid(sample_rate, audio_len_sec)
    f0_hz, f0_confidence = spectral_ops.compute_f0(sinusoid, sample_rate,
                                                   self.frame_rate)
    expected_len = int(self.frame_rate * audio_len_sec)
    for series in (f0_hz, f0_confidence):
        self.assertLen(series, expected_len)
    for series in (f0_hz, f0_confidence):
        self.assertTrue(np.all(np.isfinite(series)))
def prepare_partial_tfrecord(
        dataset_dir='nsynth_guitar',
        split='train',
        sample_rate=16000,
        frame_rate=250):
    """Precompute f0/loudness features for an NSynth split into a TFRecord.

    Reads <dataset_dir>/<split>/examples.json and the matching wav files
    in <dataset_dir>/<split>/audio, computes f0 (CREPE) and loudness per
    example, and writes one tf.train.Example per sample to
    <dataset_dir>/<split>/partial.tfrecord.

    Args:
      dataset_dir: Root directory of the dataset.
      split: Dataset split subdirectory ('train', 'valid', ...).
      sample_rate: Audio sample rate in Hz.
      frame_rate: Feature frame rate in Hz.
    """
    split_dir = os.path.join(dataset_dir, split)
    audio_dir = os.path.join(split_dir, 'audio')
    nsynth_dataset_file = os.path.join(split_dir, 'examples.json')
    partial_tfrecord_file = os.path.join(split_dir, 'partial.tfrecord')

    # 'f' rather than 'file': avoids shadowing a builtin name.
    with open(nsynth_dataset_file, 'r') as f:
        nsynth_dataset_dict = json.load(f)

    steps = len(nsynth_dataset_dict)
    with tf.io.TFRecordWriter(partial_tfrecord_file) as writer:
        for step, (k, v) in enumerate(nsynth_dataset_dict.items()):
            start_time = time.perf_counter()
            file_name = '{}.wav'.format(k)
            target_path = os.path.join(audio_dir, file_name)
            audio = _load_audio(target_path, sample_rate)

            f0_hz, f0_confidence = spectral_ops.compute_f0(
                audio, sample_rate, frame_rate)
            # Per-frame loudness array (renamed from the misleading
            # 'mean_loudness_db' — it is not a scalar mean).
            loudness_db = spectral_ops.compute_loudness(
                audio, sample_rate, frame_rate, 2048)

            # Cast everything to float32 for compact serialization.
            audio = audio.astype(np.float32)
            f0_hz = f0_hz.astype(np.float32)
            f0_confidence = f0_confidence.astype(np.float32)
            loudness_db = loudness_db.astype(np.float32)

            partial_dataset_dict = {
                'sample_name': _byte_feature([str.encode(k)]),
                'note_number': _int64_feature([v['pitch']]),
                'velocity': _int64_feature([v['velocity']]),
                'instrument_source': _int64_feature([v['instrument_source']]),
                'qualities': _int64_feature(v['qualities']),
                'audio': _float_feature(audio),
                'f0_hz': _float_feature(f0_hz),
                'f0_confidence': _float_feature(f0_confidence),
                'loudness_db': _float_feature(loudness_db),
            }

            tf_example = tf.train.Example(
                features=tf.train.Features(feature=partial_dataset_dict))
            writer.write(tf_example.SerializeToString())

            elapsed_time = time.perf_counter() - start_time
            print('{}/{} - sample_name: {} - elapsed_time: {:.3f}'.format(
                step + 1, steps, k, elapsed_time))
def _add_f0_estimate(ex, sample_rate, frame_rate):
    """Add fundamental frequency (f0) estimate using CREPE."""
    beam.metrics.Metrics.counter('prepare-tfrecord', 'estimate-f0').inc()
    f0_hz, f0_confidence = spectral_ops.compute_f0(
        ex['audio'], sample_rate, frame_rate)
    # Return a copy of the example extended with the float32 f0 features;
    # the input dict is left untouched.
    return dict(
        ex,
        f0_hz=f0_hz.astype(np.float32),
        f0_confidence=f0_confidence.astype(np.float32))
def compute_audio_features(audio, n_fft=2048, sample_rate=16000, frame_rate=250):
    """Compute features from audio."""
    features = {'audio': audio}
    # Feature extraction runs on the squeezed signal; the stored 'audio'
    # keeps its original shape.
    squeezed = squeeze(audio)
    features['loudness_db'] = spectral_ops.compute_loudness(
        squeezed, sample_rate, frame_rate, n_fft)
    f0_hz, f0_confidence = spectral_ops.compute_f0(
        squeezed, sample_rate, frame_rate)
    features['f0_hz'] = f0_hz
    features['f0_confidence'] = f0_confidence
    return features
def _encode(self):
    """Chop aligned mix/s1/s2/s3 wavs into 4-second chunks and write a TFRecord.

    For every file name found under the s1 directory, loads the mix and the
    three source wavs, computes f0 (CREPE) and loudness for each source, and
    writes one tf.train.Example per non-overlapping 4-second window containing
    the audio slices plus the matching feature frames.
    """
    logging.info("Writing {}".format(self.tfr))
    with tf.python_io.TFRecordWriter(self.tfr) as writer:
        mix_wav_dir = os.path.join(self.wav_dir, "mix")
        source_dirs = {
            name: os.path.join(self.wav_dir, name)
            for name in ("s1", "s2", "s3")
        }
        filenames = os.listdir(source_dirs["s1"])
        for filename in tqdm(filenames):
            mix_path = os.path.join(mix_wav_dir, filename)
            logging.info("Preprocessing %s" % (mix_path))
            mix, _ = librosa.load(mix_path, self.sample_rate)

            # name -> (audio, f0, loudness); replaces the triplicated
            # s1/s2/s3 load-and-featurize code of the original.
            sources = {}
            for name, wav_dir in source_dirs.items():
                path = os.path.join(wav_dir, filename)
                logging.info("Preprocessing %s" % (path))
                audio, _ = librosa.load(path, self.sample_rate)
                f0 = spectral_ops.compute_f0(
                    audio, self.sample_rate, self.frame_rate)[0]
                loudness = spectral_ops.compute_loudness(
                    audio, self.sample_rate, self.frame_rate)
                sources[name] = (audio, f0, loudness)

            def sample_to_frame(sample_num):
                # Convert an audio sample index to a feature-frame index.
                return int(self.frame_rate * sample_num / self.sample_rate)

            def write(l, r):
                # Serialize one example covering audio samples [l, r).
                l_frame = sample_to_frame(l)
                r_frame = sample_to_frame(r)
                feature = {"mix_audio": self._float_list_feature(mix[l:r])}
                for name, (audio, f0, loudness) in sources.items():
                    feature["%s_audio" % name] = self._float_list_feature(
                        audio[l:r])
                    feature["%s_f0" % name] = self._float_list_feature(
                        f0[l_frame:r_frame])
                    feature["%s_loudness" % name] = self._float_list_feature(
                        loudness[l_frame:r_frame])
                example = tf.train.Example(
                    features=tf.train.Features(feature=feature))
                writer.write(example.SerializeToString())

            now_length = sources["s1"][0].shape[-1]
            target_length = int(4 * self.sample_rate)
            if now_length < target_length:
                # Clip shorter than one chunk: nothing to write.
                continue
            stride = target_length
            # Bug fix: '+ 1' includes the final window when the clip length
            # is an exact multiple of the stride. The original
            # range(0, now_length - target_length, stride) silently dropped
            # it — e.g. a clip of exactly 4 s produced zero examples.
            for i in range(0, now_length - target_length + 1, stride):
                write(i, i + target_length)
def main():
    """Evaluate MCTN output loudness & F0 against originals, per instrument.

    Loads the per-instrument f0/loudness arrays produced by an MCTN run,
    recomputes the same features from the original wavs, and logs the
    average L1 difference for each instrument.
    """
    parser = argparse.ArgumentParser(
        description='Evaluate MCTN output loudness & F0 of every instrument')
    parser.add_argument('-ld', '--log_dir', type=str, default='./mctn_log')
    parser.add_argument('-rd', '--results_dir', type=str, default='./results')
    parser.add_argument('-sr', '--sample_rate', type=int, default=16000)
    parser.add_argument('-fr', '--frame_rate', type=int, default=250)
    parser.add_argument('-osd', '--original_sound_dir', type=str,
                        default='./results/DDSP - Same artist - Test')
    args = parser.parse_args()

    args.log_file = os.path.join(args.log_dir, 'log.txt')
    logger = logging.getLogger()
    logger.addHandler(logging.FileHandler(args.log_file))
    logger.addHandler(logging.StreamHandler(sys.stdout))
    logger.setLevel(logging.INFO)

    # Prepare input from files
    logger.info("Loading data from mctn files")

    def _load_features(instrument):
        # Loads the saved feature arrays for one instrument. Presumably
        # each array is batched as (batch, frames) — only batch 0 is used
        # below. TODO confirm against the producing script.
        return {
            'loudness_db': np.load(
                os.path.join(args.results_dir,
                             f"{instrument}_loudness_db.npy"),
                allow_pickle=False),
            'f0_hz': np.load(
                os.path.join(args.results_dir, f"{instrument}_f0_hz.npy"),
                allow_pickle=False),
        }

    audio_features_vocals = _load_features("vocals")
    audio_features_bass = _load_features("bass")
    audio_features_drums = _load_features("drums")

    # Calc average loudness & F0 diff for every instrument
    for instrument, audio_features in [("bass", audio_features_bass),
                                       ("drums", audio_features_drums),
                                       ("vocals", audio_features_vocals)]:
        original_audio, _ = librosa.load(
            os.path.join(args.original_sound_dir,
                         f"original_{instrument}.wav"), args.sample_rate)
        # Bug fix: use the CURRENT instrument's features here. The original
        # read audio_features_vocals["f0_hz"] for every instrument, so bass
        # and drums were trimmed to the vocals' length.
        synth_audio_samples = audio_features["f0_hz"].shape[
            1] * args.sample_rate // args.frame_rate
        original_audio_samples = original_audio.shape[0]
        if synth_audio_samples < original_audio_samples:
            logging.info(
                f"Trimming original {instrument} audio samples from "
                f"{original_audio_samples} to {synth_audio_samples}")
            original_audio_samples = synth_audio_samples
            original_audio = original_audio[:original_audio_samples]

        # Assuming only 1 batch
        synth_f0 = audio_features["f0_hz"][0]
        synth_loudness = audio_features["loudness_db"][0]

        logging.info(f"Calculating F0 for {instrument} original audio")
        original_f0 = spectral_ops.compute_f0(original_audio, args.sample_rate,
                                              args.frame_rate)[0]
        logging.info(f"Calculating loudness for {instrument} original audio")
        original_loudness = spectral_ops.compute_loudness(
            original_audio, args.sample_rate, args.frame_rate)

        f0_l1 = np.mean(np.abs(synth_f0 - original_f0))
        loudness_l1 = np.mean(np.abs(synth_loudness - original_loudness))
        logging.info(f"Average {instrument} F0 L1: {f0_l1}")
        logging.info(f"Average {instrument} Loudness L1: {loudness_l1}")
def main(argv):
    """Align OBD log data with engine audio and save a training data pickle.

    Pipeline: load pickled OBD command time series, estimate (or load) f0
    from the audio via CREPE, cross-correlate the f0 track with the
    RPM-derived f0 to find the time lag, trim the audio accordingly, then
    resample all OBD signals to FLAGS.frame_rate and pickle everything to
    <save_dir>/data.pickle. Optional diagnostic plots along the way.
    """
    # Check preconditions
    if "RPM" not in FLAGS.commands:
        raise Exception(
            "RPM must be part of the OBD commands since it is used for OBD-audio alignment."
        )
    elif FLAGS.commands[
            0] != "RPM":  # Make sure RPM is first in commands list (necessary for later)
        # Swap RPM to the front so t0 (taken from RPM below) is set before
        # any other command is preprocessed.
        i_RPM = FLAGS.commands.index("RPM")
        FLAGS.commands[i_RPM] = FLAGS.commands[0]
        FLAGS.commands[0] = "RPM"
    if FLAGS.data_name is None:
        FLAGS.data_name = get_timestamp()

    # Create save folder
    save_dir = os.path.join(FLAGS.save_dir, FLAGS.data_name)
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    if FLAGS.plot_mode == "save":
        # plots_dir is only bound in "save" mode; later branches that use it
        # are all guarded by the same plot_mode check.
        plots_dir = os.path.join(save_dir, "plots")
        if not os.path.exists(plots_dir):
            os.makedirs(plots_dir)

    # Load OBD data
    n_commands = len(FLAGS.commands)
    c_dict = dict()
    if "obd" in FLAGS.plots:
        _, axes = plt.subplots(n_commands, 1, figsize=(15, 3 * n_commands))
        # plt.subplots returns a bare Axes (not an array) when n == 1;
        # normalize to a list either way.
        axes = list(iter(axes)) if n_commands > 1 else [axes]
    for i, c in enumerate(FLAGS.commands):
        c_pickle_path = os.path.join(FLAGS.data_dir, c + ".pickle")
        logging.info("Unpickling %s data..." % c)
        with open(c_pickle_path, "rb") as f:
            c_list = pickle.load(f)
        if c == "RPM":
            t0 = c_list.times[0]  # Let times be relative to RPM's initial time
        logging.info("Preprocessing data...")
        unit = str(c_list.values[0].units)
        y, t = preprocess_obd_data(c_list, t0=t0)
        c_dict[c] = {"values": y, "times": t}
        if "obd" in FLAGS.plots:
            logging.info("Plotting %s data..." % c)
            ax = axes[i]
            ax.plot(t, y, "-x")
            ax.set_title(c)
            ax.set_xlabel("time [s]")
            ax.set_ylabel("%s" % unit)
    if "obd" in FLAGS.plots:
        plt.tight_layout()
        if FLAGS.plot_mode == "save":
            plot_path = os.path.join(plots_dir, "odb.pdf")
            logging.info("Saving plot to '%s'...", plot_path)
            plt.savefig(plot_path)
            plt.close()

    # Estimate f0 from audio (if not already done)
    f0_frame_size = _CREPE_FRAME_SIZE // 2  #16
    f0_frame_rate = _CREPE_SAMPLE_RATE / f0_frame_size  # frame rate of f0 and loudness features
    f0_path = os.path.join(FLAGS.data_dir, "f0.npy")
    audio_path = os.path.join(FLAGS.data_dir, FLAGS.audio_filename)
    audio, _ = librosa.load(audio_path, FLAGS.sample_rate)
    if FLAGS.max_mins > 0.0:
        # NOTE(review): log message says "Cropping out first ... minutes" but
        # the slice KEEPS the first max_mins minutes — confirm intent.
        logging.info("Cropping out first %.1f minutes of audio..." %
                     FLAGS.max_mins)
        n_samples = int(FLAGS.max_mins * 60 * FLAGS.sample_rate)
        audio = audio[:n_samples]
    if os.path.exists(f0_path):
        logging.info("Using precomputed f0.")
        f0 = np.load(f0_path)
    elif FLAGS.skip_f0_sync:
        logging.info("Skipping f0 estimation since skipping sync.")
        # Placeholder f0: only its length matters downstream when sync is
        # skipped (lag is forced to 0 below).
        f0 = np.zeros((f0_frame_size, ))
    else:
        logging.info("Estimating f0 using CREPE...")
        # CREPE expects its own sample rate; downsample before estimating.
        audio_lores = librosa.core.resample(audio, FLAGS.sample_rate,
                                            _CREPE_SAMPLE_RATE)
        f0, _ = compute_f0(audio_lores, _CREPE_SAMPLE_RATE, f0_frame_rate)
        np.save(f0_path, f0)

    # Interpolate OBD quantities to allow for upsampling later
    c_interp = dict()
    for c in FLAGS.commands:
        logging.info("Interpolating %s signal..." % c)
        c_interp[c] = interp1d(c_dict[c]["times"],
                               c_dict[c]["values"],
                               kind=FLAGS.interp_method)

    # Scale interpolated RPM signal to become f0 and sample on CREPE f0 times
    logging.info("Upsampling RPM signal...")
    f0_times = np.arange(0., len(f0)) / f0_frame_rate
    # Restrict to times covered by the RPM log so interp1d never extrapolates.
    f0_rpm_times = f0_times[f0_times < np.max(c_dict["RPM"]["times"])]
    rpm_scale = 1. / 60  # revolutions per minute -> revolutions per second
    if FLAGS.engine_type == "four-stroke":
        # Four-stroke: presumably one firing per cylinder every two
        # revolutions, hence the factor 2 on the frequency — TODO confirm.
        rpm_scale *= 2
    f0_rpm = rpm_scale * c_interp["RPM"](f0_rpm_times)

    # Find lag between f0 and RPM
    if not FLAGS.skip_f0_sync:
        logging.info("Calculating lag between f0 and RPM...")
        # Cross-correlate the mean-removed signals; the argmax offset gives
        # the lag of f0 relative to the RPM-derived f0, in frames.
        xcorr = correlate(f0 - np.mean(f0), f0_rpm - np.mean(f0_rpm))
        lag = xcorr.argmax() - (len(f0_rpm) - 1)
        logging.info("Found lag: %d frames (%.3f seconds)" %
                     (lag, lag / f0_frame_rate))
    else:
        logging.info("Will not sync f0 and RPM. Uses lag 0.")
        lag = 0
    if "f0-rpm" in FLAGS.plots:
        logging.info("Plotting RPM alignment...")
        _, axes = plt.subplots(2, 1, figsize=(15, 6))
        axes[0].plot(f0_times, f0, label="f0")
        axes[0].plot(f0_rpm_times, f0_rpm, label="f0-rpm")
        axes[0].set_title("Before alignment")
        axes[1].plot(f0_times - lag / f0_frame_rate, f0, label="f0")
        axes[1].plot(f0_rpm_times, f0_rpm, label="f0-rpm")
        axes[1].set_title("After alignment")
        plt.tight_layout()
        if FLAGS.plot_mode == "save":
            plot_path = os.path.join(plots_dir, "f0_rpm.pdf")
            logging.info("Saving plot to '%s'...", plot_path)
            plt.savefig(plot_path)
            plt.close()

    # Trim audio according to lag
    # Convert the frame-domain lag to a sample index, then keep the span of
    # audio that overlaps the RPM-derived f0 signal.
    start = int(lag * FLAGS.sample_rate / f0_frame_rate)
    end = start + int(len(f0_rpm) * FLAGS.sample_rate / f0_frame_rate) + 1
    audio_trimmed = audio[start:end]
    audio_trimmed_length = len(audio_trimmed) / float(FLAGS.sample_rate)
    logging.info("Trimmed audio is %.3f seconds." % audio_trimmed_length)
    if "f0-audio" in FLAGS.plots:
        logging.info("Plotting audio spectrogram with f0...")
        fmax = 2**13
        n_fft = 2**13
        plt.figure(figsize=(15, 6))
        S = librosa.feature.melspectrogram(y=audio_trimmed,
                                           sr=FLAGS.sample_rate,
                                           n_fft=n_fft,
                                           n_mels=1024,
                                           fmax=fmax)
        S_dB = librosa.power_to_db(S, ref=np.max)
        ax = specshow(S_dB,
                      x_axis='time',
                      y_axis='mel',
                      sr=FLAGS.sample_rate,
                      fmax=fmax)
        f0_h, = ax.plot(f0_rpm_times, f0_rpm, "--")
        ax.set_ylim((0, 5 * np.max(f0_rpm)))
        ax.set_xlabel("time [s]")
        ax.set_ylabel("frequency [Hz]")
        ax.legend([f0_h], ["synched f0 from RPM"], loc="upper right")
        plt.tight_layout()
        if FLAGS.plot_mode == "save":
            plot_path = os.path.join(plots_dir, "f0_audio.pdf")
            logging.info("Saving plot to '%s'...", plot_path)
            plt.savefig(plot_path)
            plt.close()

    # Resample OBD signals and store together with audio in dict
    logging.info("Resampling input signals to given frame rate...")
    # Common time range = shortest command log, so every interpolator is
    # evaluated strictly inside its support.
    time_minmax = np.inf
    for c in FLAGS.commands:
        c_max = np.max(c_dict[c]["times"])
        time_minmax = time_minmax if time_minmax < c_max else c_max
    input_times = np.arange(0., time_minmax, 1 / FLAGS.frame_rate)
    f0_signal = rpm_scale * c_interp["RPM"](input_times)
    data = {
        "sample_rate": FLAGS.sample_rate,
        "frame_rate": FLAGS.frame_rate,
        "audio": audio_trimmed,
        "inputs": {
            "f0": f0_signal
        }
    }
    for c in FLAGS.commands:
        data["inputs"][c] = c_interp[c](input_times)
    # NOTE(review): audio_times is computed but never used below — candidate
    # for removal or for passing to plot_data_dict; confirm before deleting.
    audio_times = np.arange(0., len(audio_trimmed)) / FLAGS.sample_rate
    data_path = os.path.join(save_dir, "data.pickle")
    logging.info("Saving data to %s..." % data_path)
    # NOTE(review): file handle is never explicitly closed; a with-block
    # would be safer.
    pickle.dump(data, open(data_path, "wb"))
    if "data" in FLAGS.plots:
        if FLAGS.plot_mode == "save":
            plot_path = os.path.join(plots_dir, "data.pdf")
            logging.info("Plotting and saving to '%s'...", plot_path)
            plot_data_dict(data, save_path=plot_path)
            plt.close()
    if FLAGS.plot_mode == "show":
        plt.show()
    logging.info("Done.")