def test_output_shape_is_correct(self):
  """A default SpectralLoss over batched audio must reduce to a scalar."""
  loss_fn = losses.SpectralLoss()
  audio_a = tf.random.uniform((3, 16000), dtype=tf.float32)
  audio_b = tf.random.uniform((3, 16000), dtype=tf.float32)
  result = loss_fn(audio_a, audio_b)
  # A scalar tensor has an empty shape list.
  self.assertListEqual([], result.shape.as_list())
def main():
  """Compare a synthesized wav file against its original reference.

  Parses CLI flags, loads both files at the requested sample rate, trims
  the longer signal so the two are sample-aligned, then prints whichever
  metrics were enabled: SDR, F0 L1, loudness L1, and multi-scale
  spectral loss.
  """
  parser = argparse.ArgumentParser(
      description='Evaluate loudness and basic frequency (F0) L1 difference '
                  'between a synthesized wav file to its original wav file')
  parser.add_argument('-sf', '--synthesized_file', type=str)
  parser.add_argument('-of', '--original_file', type=str)
  parser.add_argument('-sr', '--sample_rate', type=int, default=16000)
  parser.add_argument('-fr', '--frame_rate', type=int, default=250)
  # Metric toggles are ints (0/1) rather than store_true flags; kept for
  # backward-compatible CLI behavior.
  parser.add_argument('-s', '--include_spectral', type=int, default=1)
  parser.add_argument('-f0', '--include_f0', type=int, default=1)
  parser.add_argument('-ld', '--include_ld', type=int, default=1)
  parser.add_argument('-sdr', '--include_sdr', type=int, default=1)
  args = parser.parse_args()

  # Fix: librosa >= 0.10 makes the sample rate keyword-only; passing it
  # positionally raises TypeError (and was deprecated before that).
  synth_audio, _ = librosa.load(args.synthesized_file, sr=args.sample_rate)
  original_audio, _ = librosa.load(args.original_file, sr=args.sample_rate)

  # Trim the longer signal so both metrics operate on equal-length,
  # sample-aligned audio.
  synth_audio_samples = synth_audio.shape[0]
  original_audio_samples = original_audio.shape[0]
  if synth_audio_samples < original_audio_samples:
    print(f"Trimming original audio samples from {original_audio_samples} to {synth_audio_samples}")
    original_audio_samples = synth_audio_samples
    original_audio = original_audio[:original_audio_samples]
  elif original_audio_samples < synth_audio_samples:
    print(f"Trimming synthesized audio samples from {synth_audio_samples} to {original_audio_samples}")
    synth_audio_samples = original_audio_samples
    synth_audio = synth_audio[:synth_audio_samples]

  if args.include_sdr:
    print(f"SDR: {_calc_sdr(synth_audio, original_audio)}")

  if args.include_f0:
    print("Calculating F0 for synthesized audio")
    # compute_f0 returns a tuple; element 0 is the F0 track in Hz.
    synth_f0 = spectral_ops.compute_f0(
        synth_audio, args.sample_rate, args.frame_rate)[0]
    print("Calculating F0 for original audio")
    original_f0 = spectral_ops.compute_f0(
        original_audio, args.sample_rate, args.frame_rate)[0]
    f0_l1 = np.mean(np.abs(synth_f0 - original_f0))
    print(f"Average F0 L1: {f0_l1}")

  if args.include_ld:
    print("Calculating loudness for synthesized audio")
    synth_loudness = spectral_ops.compute_loudness(
        synth_audio, args.sample_rate, args.frame_rate)
    print("Calculating loudness for original audio")
    original_loudness = spectral_ops.compute_loudness(
        original_audio, args.sample_rate, args.frame_rate)
    loudness_l1 = np.mean(np.abs(synth_loudness - original_loudness))
    print(f"Average Loudness L1: {loudness_l1}")

  if args.include_spectral:
    # Imported lazily so the metric-free paths don't require ddsp losses.
    from ddsp import losses
    loss_obj = losses.SpectralLoss(mag_weight=1.0, logmag_weight=1.0)
    spectral_loss = loss_obj(synth_audio, original_audio)
    print(f"Average Multi-scale spectrogram loss: {spectral_loss}")
def test_output_shape_is_correct(self):
  """Test correct shape with all losses active."""
  loss_fn = losses.SpectralLoss(
      mag_weight=1.0,
      delta_time_weight=1.0,
      delta_freq_weight=1.0,
      cumsum_freq_weight=1.0,
      logmag_weight=1.0,
      loudness_weight=1.0,
  )
  batch_shape = (3, 8000)
  audio_in = tf.ones(batch_shape, dtype=tf.float32)
  audio_target = tf.ones(batch_shape, dtype=tf.float32)
  result = loss_fn(audio_in, audio_target)
  # Scalar reduction and a finite value even with every term enabled.
  self.assertListEqual([], result.shape.as_list())
  self.assertTrue(np.isfinite(result))
def setUp(self):
  """Create some dummy input data for the chain."""
  super().setUp()
  # Dummy network outputs shared by the loss DAG under test.
  ones_audio = tf.ones((3, 8000), dtype=tf.float32)
  self.nn_outputs = {
      'audio': ones_audio,
      'audio_synth': tf.ones((3, 8000), dtype=tf.float32),
      'magnitudes': tf.ones((3, 200, 2), dtype=tf.float32),
      'f0_hz': 200 + tf.ones((3, 200, 1), dtype=tf.float32),
  }
  # Loss processors wired into a two-node DAG, each comparing the
  # synthesized audio against the reference audio.
  spectral_loss = losses.SpectralLoss()
  crepe_loss = losses.PretrainedCREPEEmbeddingLoss(name='crepe_loss')
  audio_pair = ['audio', 'audio_synth']
  self.dag = [
      (spectral_loss, audio_pair),
      (crepe_loss, audio_pair),
  ]
  self.expected_outputs = ['spectral_loss', 'crepe_loss']