def test_compute_loudness_at_sample_rate_2d(self, sample_rate, audio_len_sec):
    """Check batched loudness shape, finiteness and agreement with a single run.

    Args:
        sample_rate: Sample rate used to generate the sinusoids and passed to
            `compute_loudness`.
        audio_len_sec: Length of the generated audio in seconds.
    """
    n_examples = 8
    batched_audio = gen_np_batched_sinusoids(
        self.frequency, self.amp, sample_rate, audio_len_sec, n_examples)
    n_frames = int(self.frame_rate * audio_len_sec)

    # Exercise both code paths selected by the `use_tf` flag.
    for use_tf in (False, True):
        batched_loudness = spectral_ops.compute_loudness(
            batched_audio, sample_rate, self.frame_rate, use_tf=use_tf)

        self.assertEqual(batched_loudness.shape[0], n_examples)
        self.assertEqual(batched_loudness.shape[1], n_frames)
        self.assertTrue(np.all(np.isfinite(batched_loudness)))

        # The batched result should match one un-batched computation on the
        # same sinusoid, repeated once per batch entry.
        single_audio = gen_np_sinusoid(
            self.frequency, self.amp, sample_rate, audio_len_sec)
        single_loudness = spectral_ops.compute_loudness(
            single_audio, sample_rate, self.frame_rate, use_tf=use_tf)
        expected = np.tile(single_loudness, (n_examples, 1))

        # Tolerance of 1 (dB) absolute; note rtol=1 is applied on top of it.
        self.assertAllClose(batched_loudness, expected, atol=1, rtol=1)
def call(self, target_audio, audio):
    """Accumulate the weighted multi-resolution spectral loss.

    For every size in `self.fft_sizes` a magnitude spectrogram is computed for
    both signals with `spectral_ops.compute_mag`; each term whose weight is
    positive adds `weight * mean_difference(...)` to the total. A loudness
    term (fixed `n_fft=2048`) is added once at the end.

    Args:
        target_audio: Reference audio.
        audio: Audio to compare against the reference.

    Returns:
        The accumulated loss (`0.0` when no weight is positive).
    """
    diff = spectral_ops.diff

    def second_diff(mag, axis):
        # Finite difference applied twice along the same axis.
        return diff(diff(mag, axis=axis), axis=axis)

    def term(weight, target, value):
        return weight * mean_difference(target, value, self.loss_type)

    spectrogram_ops = [
        functools.partial(spectral_ops.compute_mag, size=fft_size)
        for fft_size in self.fft_sizes
    ]

    total = 0.0
    # One pass per FFT size; every term reuses the same pair of spectrograms.
    for compute_mag in spectrogram_ops:
        target_mag = compute_mag(target_audio)
        value_mag = compute_mag(audio)

        # Plain magnitude term.
        if self.mag_weight > 0:
            total += term(self.mag_weight, target_mag, value_mag)

        # First and second differences along axis 1.
        if self.delta_time_weight > 0:
            total += term(self.delta_time_weight,
                          diff(target_mag, axis=1),
                          diff(value_mag, axis=1))

        if self.delta_delta_time_weight > 0:
            total += term(self.delta_delta_time_weight,
                          second_diff(target_mag, 1),
                          second_diff(value_mag, 1))

        # First and second differences along axis 2.
        if self.delta_freq_weight > 0:
            total += term(self.delta_freq_weight,
                          diff(target_mag, axis=2),
                          diff(value_mag, axis=2))

        if self.delta_delta_freq_weight > 0:
            total += term(self.delta_delta_freq_weight,
                          second_diff(target_mag, 2),
                          second_diff(value_mag, 2))

        # Log-magnitude term, reusing the spectrogram.
        if self.logmag_weight > 0:
            total += term(self.logmag_weight,
                          spectral_ops.safe_log(target_mag),
                          spectral_ops.safe_log(value_mag))

    # Loudness does not depend on the per-size spectrograms, so it is added
    # once, after the loop.
    if self.loudness_weight > 0:
        target_loudness = spectral_ops.compute_loudness(target_audio, n_fft=2048)
        value_loudness = spectral_ops.compute_loudness(audio, n_fft=2048)
        total += term(self.loudness_weight, target_loudness, value_loudness)

    return total
def main():
    """Compare a synthesized wav file with its original and print metrics.

    Both files are loaded at `--sample_rate`; the longer one is trimmed to the
    length of the shorter one. Depending on the `--include_*` flags (any
    non-zero integer enables a metric) the script prints the SDR, the mean
    absolute F0 difference, the mean absolute loudness difference and the
    spectral loss (`SpectralLoss` with `mag_weight=1.0, logmag_weight=1.0`).
    """
    parser = argparse.ArgumentParser(
        description='Evaluate loudness and basic frequency (F0) L1 difference '
                    'between a synthesized wav file to its original wav file')
    # Both paths are mandatory: without them `librosa.load` would be handed
    # `None` and fail with an unrelated-looking error.
    parser.add_argument('-sf', '--synthesized_file', type=str, required=True)
    parser.add_argument('-of', '--original_file', type=str, required=True)
    parser.add_argument('-sr', '--sample_rate', type=int, default=16000)
    parser.add_argument('-fr', '--frame_rate', type=int, default=250)
    parser.add_argument('-s', '--include_spectral', type=int, default=1)
    parser.add_argument('-f0', '--include_f0', type=int, default=1)
    parser.add_argument('-ld', '--include_ld', type=int, default=1)
    parser.add_argument('-sdr', '--include_sdr', type=int, default=1)
    args = parser.parse_args()

    # `sr` is passed by keyword: recent librosa releases accept only the path
    # positionally, and the keyword form also works on older releases.
    synth_audio, _ = librosa.load(args.synthesized_file, sr=args.sample_rate)
    original_audio, _ = librosa.load(args.original_file, sr=args.sample_rate)

    # Align lengths so the element-wise metrics below compare equal spans.
    synth_audio_samples = synth_audio.shape[0]
    original_audio_samples = original_audio.shape[0]
    if synth_audio_samples < original_audio_samples:
        print(f"Trimming original audio samples from {original_audio_samples} to {synth_audio_samples}")
        original_audio = original_audio[:synth_audio_samples]
    elif original_audio_samples < synth_audio_samples:
        print(f"Trimming synthesized audio samples from {synth_audio_samples} to {original_audio_samples}")
        synth_audio = synth_audio[:original_audio_samples]

    if args.include_sdr:
        print(f"SDR: {_calc_sdr(synth_audio, original_audio)}")

    if args.include_f0:
        # `compute_f0` returns a sequence; only its first element is compared.
        print("Calculating F0 for synthesized audio")
        synth_f0 = spectral_ops.compute_f0(
            synth_audio, args.sample_rate, args.frame_rate)[0]
        print("Calculating F0 for original audio")
        original_f0 = spectral_ops.compute_f0(
            original_audio, args.sample_rate, args.frame_rate)[0]
        f0_l1 = np.mean(np.abs(synth_f0 - original_f0))
        print(f"Average F0 L1: {f0_l1}")

    if args.include_ld:
        print("Calculating loudness for synthesized audio")
        synth_loudness = spectral_ops.compute_loudness(
            synth_audio, args.sample_rate, args.frame_rate)
        print("Calculating loudness for original audio")
        original_loudness = spectral_ops.compute_loudness(
            original_audio, args.sample_rate, args.frame_rate)
        loudness_l1 = np.mean(np.abs(synth_loudness - original_loudness))
        print(f"Average Loudness L1: {loudness_l1}")

    if args.include_spectral:
        # Imported lazily so the other metrics work without loading it.
        from ddsp import losses
        loss_obj = losses.SpectralLoss(mag_weight=1.0, logmag_weight=1.0)
        spectral_loss = loss_obj(synth_audio, original_audio)
        print(f"Average Multi-scale spectrogram loss: {spectral_loss}")
def test_compute_loudness_at_indivisible_sample_rate(
        self, sample_rate, audio_len_sec):
    """Check that `compute_loudness` raises ValueError for these parameters.

    Args:
        sample_rate: Sample rate of the generated sinusoid.
        audio_len_sec: Length of the generated sinusoid in seconds.
    """
    sinusoid = self._gen_np_sinusoid(sample_rate, audio_len_sec)
    # Both settings of `use_tf` must reject the input.
    for use_tf in (False, True):
        self.assertRaises(
            ValueError,
            spectral_ops.compute_loudness,
            sinusoid,
            sample_rate,
            self.frame_rate,
            use_tf=use_tf,
        )
def call(self, target_audio, audio):
    """Accumulate the weighted spectral loss over `self.spectrogram_ops`.

    Each op produces a magnitude spectrogram for both signals. Every enabled
    term (weight > 0) transforms both spectrograms the same way and adds
    `weight * mean_difference(...)` to the total. A loudness term
    (`n_fft=2048`, `use_tf=True`) is added once at the end.

    Args:
        target_audio: Reference audio.
        audio: Audio to compare against the reference.

    Returns:
        The accumulated loss (`0.0` when no weight is positive).
    """
    diff = spectral_ops.diff
    cumsum = tf.math.cumsum

    # (weight attribute, transform applied to both spectrograms), in the
    # order the terms are added to the loss.
    spectral_terms = (
        ('mag_weight', lambda mag: mag),
        ('delta_time_weight', lambda mag: diff(mag, axis=1)),
        ('delta_freq_weight', lambda mag: diff(mag, axis=2)),
        # TODO(kyriacos) normalize cumulative spectrogram
        ('cumsum_freq_weight', lambda mag: cumsum(mag, axis=2)),
        # Log-magnitude term, reusing the spectrogram.
        ('logmag_weight', spectral_ops.safe_log),
    )

    total = 0.0
    for spectrogram_op in self.spectrogram_ops:
        target_mag = spectrogram_op(target_audio)
        value_mag = spectrogram_op(audio)

        for weight_name, transform in spectral_terms:
            weight = getattr(self, weight_name)
            if weight > 0:
                total += weight * mean_difference(
                    transform(target_mag), transform(value_mag),
                    self.loss_type)

    # Loudness is independent of the spectrogram ops, so it is added once.
    if self.loudness_weight > 0:
        target_loudness = spectral_ops.compute_loudness(
            target_audio, n_fft=2048, use_tf=True)
        value_loudness = spectral_ops.compute_loudness(
            audio, n_fft=2048, use_tf=True)
        total += self.loudness_weight * mean_difference(
            target_loudness, value_loudness, self.loss_type)

    return total
def test_tf_and_np_are_consistent(self):
    """Check that `use_tf=True` and `use_tf=False` give matching loudness.

    The comparison is made on absolute values, with tolerances of 1e-3.
    """
    amplitude = 1e-2
    # Uniform noise in [-amplitude, amplitude), stored as float32.
    noise = np.random.rand(64000).astype(np.float32) * 2.0 - 1.0
    audio = amplitude * noise

    shared_kwargs = dict(n_fft=2048, frame_rate=250)
    loudness_tf = spectral_ops.compute_loudness(
        audio, use_tf=True, **shared_kwargs)
    loudness_np = spectral_ops.compute_loudness(
        audio, use_tf=False, **shared_kwargs)

    self.assertAllClose(
        np.abs(loudness_np), np.abs(loudness_tf), rtol=1e-3, atol=1e-3)
def _add_loudness(ex, sample_rate, frame_rate, n_fft=2048):
    """Return a copy of `ex` with a float32 `'loudness_db'` entry added.

    Args:
        ex: Mapping holding the samples under `'audio'`; it is not modified.
        sample_rate: Forwarded to `compute_loudness`.
        frame_rate: Forwarded to `compute_loudness`.
        n_fft: Forwarded to `compute_loudness`.

    Returns:
        A new dict with every entry of `ex` plus `'loudness_db'`.
    """
    # Count every processed example in the Beam metrics.
    beam.metrics.Metrics.counter('prepare-tfrecord', 'compute-loudness').inc()

    loudness = compute_loudness(ex['audio'], sample_rate, frame_rate, n_fft)

    enriched = dict(ex)
    enriched['loudness_db'] = loudness.astype(np.float32)
    return enriched
def test_tf_compute_loudness_at_sample_rate(self, sample_rate, audio_len_sec):
    """Check loudness length and finiteness for a generated sinusoid.

    Args:
        sample_rate: Sample rate of the generated sinusoid.
        audio_len_sec: Length of the generated sinusoid in seconds.
    """
    sinusoid = self._gen_np_sinusoid(sample_rate, audio_len_sec)
    result = spectral_ops.compute_loudness(
        sinusoid, sample_rate, self.frame_rate)

    # One loudness value is expected per frame.
    n_frames = int(self.frame_rate * audio_len_sec)
    self.assertLen(result, n_frames)

    all_finite = np.all(np.isfinite(result))
    self.assertTrue(all_finite)
def prepare_partial_tfrecord(
        dataset_dir='nsynth_guitar',
        split='train',
        sample_rate=16000,
        frame_rate=250
):
    """Write `<dataset_dir>/<split>/partial.tfrecord` from `examples.json`.

    For every entry of `examples.json` the matching `audio/<key>.wav` is
    loaded, F0 and loudness are computed, and one `tf.train.Example` is
    written. Progress and per-example timing are printed.

    Args:
        dataset_dir: Root directory of the dataset.
        split: Sub-directory of `dataset_dir` to process.
        sample_rate: Sample rate used for loading and feature extraction.
        frame_rate: Frame rate of the extracted F0 and loudness.
    """
    split_dir = os.path.join(dataset_dir, split)
    audio_dir = os.path.join(split_dir, 'audio')
    examples_path = os.path.join(split_dir, 'examples.json')
    tfrecord_path = os.path.join(split_dir, 'partial.tfrecord')

    with open(examples_path, 'r') as examples_file:
        examples = json.load(examples_file)

    total = len(examples)

    with tf.io.TFRecordWriter(tfrecord_path) as writer:
        for index, (name, meta) in enumerate(examples.items()):
            tic = time.perf_counter()

            wav_path = os.path.join(audio_dir, '{}.wav'.format(name))
            audio = _load_audio(wav_path, sample_rate)

            f0_hz, f0_confidence = spectral_ops.compute_f0(
                audio, sample_rate, frame_rate)
            loudness_db = spectral_ops.compute_loudness(
                audio, sample_rate, frame_rate, 2048)

            # Every array is stored as float32.
            audio = audio.astype(np.float32)
            f0_hz = f0_hz.astype(np.float32)
            f0_confidence = f0_confidence.astype(np.float32)
            loudness_db = loudness_db.astype(np.float32)

            features = {
                'sample_name': _byte_feature([str.encode(name)]),
                'note_number': _int64_feature([meta['pitch']]),
                'velocity': _int64_feature([meta['velocity']]),
                'instrument_source':
                    _int64_feature([meta['instrument_source']]),
                'qualities': _int64_feature(meta['qualities']),
                'audio': _float_feature(audio),
                'f0_hz': _float_feature(f0_hz),
                'f0_confidence': _float_feature(f0_confidence),
                'loudness_db': _float_feature(loudness_db),
            }

            example = tf.train.Example(
                features=tf.train.Features(feature=features))
            writer.write(example.SerializeToString())

            elapsed = time.perf_counter() - tic
            print('{}/{} - sample_name: {} - elapsed_time: {:.3f}'.format(
                index + 1, total, name, elapsed))
def _add_loudness(ex, sample_rate, frame_rate, n_fft=2048):
    """Return a copy of `ex` with a fixed-length float32 `'loudness_db'` entry.

    The loudness is passed through `_make_array_expected_length` with the
    frame count derived from the audio length and with `-LD_RANGE` as the
    third argument (per the original comment: trim, or pad to the dB floor).

    Args:
        ex: Mapping holding the samples under `'audio'`; it is not modified.
        sample_rate: Sample rate of the audio.
        frame_rate: Frame rate of the loudness curve.
        n_fft: Forwarded to `compute_loudness`.

    Returns:
        A new dict with every entry of `ex` plus `'loudness_db'`.
    """
    # Count every processed example in the Beam metrics.
    beam.metrics.Metrics.counter('prepare-tfrecord', 'compute-loudness').inc()

    samples = ex['audio']
    n_frames = int(len(samples) / sample_rate * frame_rate)

    loudness = compute_loudness(samples, sample_rate, frame_rate, n_fft)
    # Force the curve to exactly `n_frames` entries.
    loudness = _make_array_expected_length(loudness, n_frames, -LD_RANGE)

    enriched = dict(ex)
    enriched['loudness_db'] = loudness.astype(np.float32)
    return enriched
def test_compute_loudness_at_sample_rate_1d(self, sample_rate, audio_len_sec):
    """Check loudness length and finiteness for one sinusoid.

    Args:
        sample_rate: Sample rate used to generate the sinusoid and passed to
            `compute_loudness`.
        audio_len_sec: Length of the generated audio in seconds.
    """
    sinusoid = gen_np_sinusoid(
        self.frequency, self.amp, sample_rate, audio_len_sec)
    n_frames = int(self.frame_rate * audio_len_sec)

    # Exercise both code paths selected by the `use_tf` flag.
    for use_tf in (False, True):
        result = spectral_ops.compute_loudness(
            sinusoid, sample_rate, self.frame_rate, use_tf=use_tf)
        self.assertLen(result, n_frames)
        all_finite = np.all(np.isfinite(result))
        self.assertTrue(all_finite)
def compute_audio_features(audio, n_fft=2048, sample_rate=16000, frame_rate=250):
    """Build a feature dict with the audio, its loudness and its F0.

    Args:
        audio: Input audio; stored unchanged under `'audio'`.
        n_fft: Forwarded to `spectral_ops.compute_loudness`.
        sample_rate: Sample rate of `audio`.
        frame_rate: Frame rate of the extracted features.

    Returns:
        Dict with keys `'audio'`, `'loudness_db'`, `'f0_hz'` and
        `'f0_confidence'`.
    """
    # Features are computed on the output of `squeeze`, while the stored
    # audio keeps its original form.
    squeezed = squeeze(audio)

    loudness_db = spectral_ops.compute_loudness(
        squeezed, sample_rate, frame_rate, n_fft)
    f0_hz, f0_confidence = spectral_ops.compute_f0(
        squeezed, sample_rate, frame_rate)

    return {
        'audio': audio,
        'loudness_db': loudness_db,
        'f0_hz': f0_hz,
        'f0_confidence': f0_confidence,
    }
def _encode(self):
    """Write 4-second training examples for every wav file to `self.tfr`.

    File names are taken from `<wav_dir>/s1`; the same name is loaded from
    `mix`, `s1`, `s2` and `s3`. F0 and loudness are computed for each source,
    and the clip is cut into non-overlapping 4-second windows, each written
    as one `tf.train.Example`. Clips shorter than 4 seconds are skipped.
    """
    source_names = ("s1", "s2", "s3")
    # Window length and hop are both 4 seconds of samples.
    segment_length = int(4 * self.sample_rate)
    stride = segment_length

    def sample_to_frame(sample_num):
        # Convert a sample index to the matching feature-frame index.
        return int(self.frame_rate * sample_num / self.sample_rate)

    def load_wav(path):
        logging.info("Preprocessing %s", path)
        # `sr` by keyword: recent librosa releases accept only the path
        # positionally; the keyword form also works on older releases.
        audio, _ = librosa.load(path, sr=self.sample_rate)
        return audio

    logging.info("Writing %s", self.tfr)
    with tf.python_io.TFRecordWriter(self.tfr) as writer:
        mix_wav_dir = os.path.join(self.wav_dir, "mix")
        source_dirs = [
            (name, os.path.join(self.wav_dir, name)) for name in source_names
        ]
        filenames = os.listdir(source_dirs[0][1])

        for filename in tqdm(filenames):
            mix = load_wav(os.path.join(mix_wav_dir, filename))

            # name -> (audio, f0, loudness), kept in s1, s2, s3 order.
            sources = []
            for name, wav_dir in source_dirs:
                audio = load_wav(os.path.join(wav_dir, filename))
                f0 = spectral_ops.compute_f0(
                    audio, self.sample_rate, self.frame_rate)[0]
                loudness = spectral_ops.compute_loudness(
                    audio, self.sample_rate, self.frame_rate)
                sources.append((name, audio, f0, loudness))

            now_length = sources[0][1].shape[-1]
            if now_length < segment_length:
                continue

            # `+ 1` keeps the last full window: a clip of exactly
            # `segment_length` samples passes the guard above and must
            # produce one example instead of none.
            for start in range(0, now_length - segment_length + 1, stride):
                stop = start + segment_length
                start_frame = sample_to_frame(start)
                stop_frame = sample_to_frame(stop)

                feature = {
                    "mix_audio": self._float_list_feature(mix[start:stop]),
                }
                for name, audio, f0, loudness in sources:
                    feature[name + "_audio"] = self._float_list_feature(
                        audio[start:stop])
                    feature[name + "_f0"] = self._float_list_feature(
                        f0[start_frame:stop_frame])
                    feature[name + "_loudness"] = self._float_list_feature(
                        loudness[start_frame:stop_frame])

                example = tf.train.Example(
                    features=tf.train.Features(feature=feature))
                writer.write(example.SerializeToString())
def main():
    """Log mean absolute F0 and loudness differences for each instrument.

    For `bass`, `drums` and `vocals`, the stored features
    `<results_dir>/<instrument>_{loudness_db,f0_hz}.npy` are compared with
    features computed from `<original_sound_dir>/original_<instrument>.wav`.
    Messages go to stdout and to `<log_dir>/log.txt`.
    """
    parser = argparse.ArgumentParser(
        description='Evaluate MCTN output loudness & F0 of every instrument')
    parser.add_argument('-ld', '--log_dir', type=str, default='./mctn_log')
    parser.add_argument('-rd', '--results_dir', type=str, default='./results')
    parser.add_argument('-sr', '--sample_rate', type=int, default=16000)
    parser.add_argument('-fr', '--frame_rate', type=int, default=250)
    parser.add_argument('-osd', '--original_sound_dir', type=str,
                        default='./results/DDSP - Same artist - Test')
    args = parser.parse_args()

    # `FileHandler` does not create missing directories.
    if args.log_dir:
        os.makedirs(args.log_dir, exist_ok=True)
    args.log_file = os.path.join(args.log_dir, 'log.txt')

    logger = logging.getLogger()
    logger.addHandler(logging.FileHandler(args.log_file))
    logger.addHandler(logging.StreamHandler(sys.stdout))
    logger.setLevel(logging.INFO)

    def load_features(instrument):
        # Load the stored loudness and F0 arrays of one instrument.
        return {
            key: np.load(
                os.path.join(args.results_dir, f"{instrument}_{key}.npy"),
                allow_pickle=False)
            for key in ('loudness_db', 'f0_hz')
        }

    # Prepare input from files
    logger.info("Loading data from mctn files")
    features_by_instrument = {
        instrument: load_features(instrument)
        for instrument in ("bass", "drums", "vocals")
    }

    # Calc average loudness & F0 diff for every instrument
    for instrument, audio_features in features_by_instrument.items():
        # `sr` by keyword: recent librosa releases accept only the path
        # positionally; the keyword form also works on older releases.
        original_audio, _ = librosa.load(
            os.path.join(args.original_sound_dir,
                         f"original_{instrument}.wav"),
            sr=args.sample_rate)

        # Length of the synthesized signal in samples, derived from THIS
        # instrument's frame count (not always the vocals').
        synth_frames = audio_features["f0_hz"].shape[1]
        synth_audio_samples = synth_frames * args.sample_rate // args.frame_rate
        original_audio_samples = original_audio.shape[0]
        if synth_audio_samples < original_audio_samples:
            logger.info(
                f"Trimming original {instrument} audio samples from {original_audio_samples} to {synth_audio_samples}"
            )
            original_audio = original_audio[:synth_audio_samples]

        # Assuming only 1 batch
        synth_f0 = audio_features["f0_hz"][0]
        synth_loudness = audio_features["loudness_db"][0]

        logger.info(f"Calculating F0 for {instrument} original audio")
        original_f0 = spectral_ops.compute_f0(
            original_audio, args.sample_rate, args.frame_rate)[0]

        logger.info(f"Calculating loudness for {instrument} original audio")
        original_loudness = spectral_ops.compute_loudness(
            original_audio, args.sample_rate, args.frame_rate)

        f0_l1 = np.mean(np.abs(synth_f0 - original_f0))
        loudness_l1 = np.mean(np.abs(synth_loudness - original_loudness))
        logger.info(f"Average {instrument} F0 L1: {f0_l1}")
        logger.info(f"Average {instrument} Loudness L1: {loudness_l1}")
def call(self, target_audio, audio, weights=None):
    """Accumulate the weighted spectral loss over `self.spectrogram_ops`.

    Each op produces a magnitude spectrogram for both signals. Every enabled
    term (weight > 0) adds `weight * mean_difference(...)` to the total, with
    `weights` forwarded to `mean_difference`. A loudness term (`n_fft=2048`,
    `use_tf=True`) is added once at the end.

    Args:
        target_audio: Reference audio.
        audio: Audio to compare against the reference.
        weights: Optional value forwarded to every `mean_difference` call.

    Returns:
        The accumulated loss (`0.0` when no weight is positive).
    """
    diff = spectral_ops.diff
    cumsum = tf.math.cumsum
    safe_log = spectral_ops.safe_log

    def term(weight, target, value):
        # One weighted contribution to the total loss.
        return weight * mean_difference(
            target, value, self.loss_type, weights=weights)

    total = 0.0
    # One pass per spectrogram op; every term reuses the same spectrograms.
    for spectrogram_op in self.spectrogram_ops:
        target_mag = spectrogram_op(target_audio)
        value_mag = spectrogram_op(audio)

        # Plain magnitude term.
        if self.mag_weight > 0:
            total += term(self.mag_weight, target_mag, value_mag)

        # Finite differences along axis 1 and axis 2.
        if self.delta_time_weight > 0:
            total += term(self.delta_time_weight,
                          diff(target_mag, axis=1),
                          diff(value_mag, axis=1))

        if self.delta_freq_weight > 0:
            total += term(self.delta_freq_weight,
                          diff(target_mag, axis=2),
                          diff(value_mag, axis=2))

        # TODO(kyriacos) normalize cumulative spectrogram
        if self.cumsum_freq_weight > 0:
            total += term(self.cumsum_freq_weight,
                          cumsum(target_mag, axis=-1),
                          cumsum(value_mag, axis=-1))

        # Spectrogram summed over its last axis.
        if self.bin_time_weight > 0:
            total += term(self.bin_time_weight,
                          tf.reduce_sum(target_mag, axis=-1),
                          tf.reduce_sum(value_mag, axis=-1))

        # Log of the maximum over axis 2.
        if self.max_power_weight > 0:
            total += term(self.max_power_weight,
                          safe_log(tf.reduce_max(target_mag, axis=2)),
                          safe_log(tf.reduce_max(value_mag, axis=2)))

        # Log-magnitude term, reusing the spectrogram.
        if self.logmag_weight > 0:
            total += term(self.logmag_weight,
                          safe_log(target_mag),
                          safe_log(value_mag))

        # The mel spectrograms are shared by the mel and log-mel terms.
        if self.mel_weight > 0 or self.logmel_weight > 0:
            # The op is expected to carry its FFT size in `keywords['size']`.
            fft_size = spectrogram_op.keywords['size']
            target_mel = spectral_ops.compute_mel_from_mag(
                target_mag, lo_hz=2.0, bins=None, fft_size=fft_size)
            value_mel = spectral_ops.compute_mel_from_mag(
                value_mag, lo_hz=2.0, bins=None, fft_size=fft_size)

            if self.mel_weight > 0:
                total += term(self.mel_weight, target_mel, value_mel)

            if self.logmel_weight > 0:
                total += term(self.logmel_weight,
                              safe_log(target_mel),
                              safe_log(value_mel))

    # Loudness is independent of the spectrogram ops, so it is added once.
    if self.loudness_weight > 0:
        target_loudness = spectral_ops.compute_loudness(
            target_audio, n_fft=2048, use_tf=True)
        value_loudness = spectral_ops.compute_loudness(
            audio, n_fft=2048, use_tf=True)
        total += term(self.loudness_weight, target_loudness, value_loudness)

    return total