Example 1
    def test_compute_loudness_at_sample_rate_2d(self, sample_rate,
                                                audio_len_sec):
        batch_size = 8
        audio_sin_batch = gen_np_batched_sinusoids(self.frequency, self.amp,
                                                   sample_rate, audio_len_sec,
                                                   batch_size)
        expected_loudness_len = int(self.frame_rate * audio_len_sec)

        for use_tf in [False, True]:
            loudness_batch = spectral_ops.compute_loudness(audio_sin_batch,
                                                           sample_rate,
                                                           self.frame_rate,
                                                           use_tf=use_tf)

            self.assertEqual(loudness_batch.shape[0], batch_size)
            self.assertEqual(loudness_batch.shape[1], expected_loudness_len)
            self.assertTrue(np.all(np.isfinite(loudness_batch)))

            # Check that the batched loudness matches the equivalent single-example computation.
            audio_sin = gen_np_sinusoid(self.frequency, self.amp, sample_rate,
                                        audio_len_sec)
            loudness_target = spectral_ops.compute_loudness(audio_sin,
                                                            sample_rate,
                                                            self.frame_rate,
                                                            use_tf=use_tf)
            loudness_batch_target = np.tile(loudness_target, (batch_size, 1))
            # Allow tolerance within 1dB
            self.assertAllClose(loudness_batch,
                                loudness_batch_target,
                                atol=1,
                                rtol=1)
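
The `gen_np_sinusoid` and `gen_np_batched_sinusoids` helpers used in this test are not part of the snippet; a minimal sketch of what they might look like, assuming a plain NumPy sinusoid tiled along a leading batch axis:

import numpy as np


def gen_np_sinusoid(frequency, amp, sample_rate, audio_len_sec):
    # Presumed helper: a single float32 sine wave of the requested length.
    t = np.linspace(0.0, audio_len_sec, int(sample_rate * audio_len_sec),
                    endpoint=False)
    return (amp * np.sin(2.0 * np.pi * frequency * t)).astype(np.float32)


def gen_np_batched_sinusoids(frequency, amp, sample_rate, audio_len_sec,
                             batch_size):
    # Presumed helper: the same sinusoid repeated along a leading batch axis.
    audio_sin = gen_np_sinusoid(frequency, amp, sample_rate, audio_len_sec)
    return np.tile(audio_sin[np.newaxis, :], (batch_size, 1))
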
Example 2
    def call(self, target_audio, audio):

        loss = 0.0
        loss_ops = []
        diff = spectral_ops.diff

        for size in self.fft_sizes:
            loss_op = functools.partial(spectral_ops.compute_mag, size=size)
            loss_ops.append(loss_op)

        # Compute loss for each fft size.
        for loss_op in loss_ops:
            target_mag = loss_op(target_audio)
            value_mag = loss_op(audio)

            # Add magnitude loss.
            if self.mag_weight > 0:
                loss += self.mag_weight * mean_difference(
                    target_mag, value_mag, self.loss_type)

            if self.delta_time_weight > 0:
                target = diff(target_mag, axis=1)
                value = diff(value_mag, axis=1)
                loss += self.delta_time_weight * mean_difference(
                    target, value, self.loss_type)

            if self.delta_delta_time_weight > 0:
                target = diff(diff(target_mag, axis=1), axis=1)
                value = diff(diff(value_mag, axis=1), axis=1)
                loss += self.delta_delta_time_weight * mean_difference(
                    target, value, self.loss_type)

            if self.delta_freq_weight > 0:
                target = diff(target_mag, axis=2)
                value = diff(value_mag, axis=2)
                loss += self.delta_freq_weight * mean_difference(
                    target, value, self.loss_type)

            if self.delta_delta_freq_weight > 0:
                target = diff(diff(target_mag, axis=2), axis=2)
                value = diff(diff(value_mag, axis=2), axis=2)
                loss += self.delta_delta_freq_weight * mean_difference(
                    target, value, self.loss_type)

            # Add logmagnitude loss, reusing spectrogram.
            if self.logmag_weight > 0:
                target = spectral_ops.safe_log(target_mag)
                value = spectral_ops.safe_log(value_mag)
                loss += self.logmag_weight * mean_difference(
                    target, value, self.loss_type)

        if self.loudness_weight > 0:
            target = spectral_ops.compute_loudness(target_audio, n_fft=2048)
            value = spectral_ops.compute_loudness(audio, n_fft=2048)
            loss += self.loudness_weight * mean_difference(
                target, value, self.loss_type)

        return loss
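
A minimal usage sketch for this loss method. This is a sketch only, assuming the surrounding class is `ddsp.losses.SpectralLoss` (as instantiated in Example 3 below) and audio shaped [batch, samples]; the tensors are illustrative random data:

import tensorflow as tf
from ddsp import losses

# Two batches of 1-second audio at 16 kHz (illustrative values).
target_audio = tf.random.uniform((2, 16000), -1.0, 1.0)
audio = tf.random.uniform((2, 16000), -1.0, 1.0)

loss_obj = losses.SpectralLoss(mag_weight=1.0, logmag_weight=1.0)
loss = loss_obj(target_audio, audio)
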
Example 3
def main():
    parser = argparse.ArgumentParser(description='Evaluate loudness and fundamental frequency (F0) L1 difference between a synthesized wav file and its original wav file')
    parser.add_argument('-sf', '--synthesized_file', type=str)
    parser.add_argument('-of', '--original_file', type=str)
    parser.add_argument('-sr', '--sample_rate', type=int, default=16000)
    parser.add_argument('-fr', '--frame_rate', type=int, default=250)
    parser.add_argument('-s', '--include_spectral', type=int, default=1)
    parser.add_argument('-f0', '--include_f0', type=int, default=1)
    parser.add_argument('-ld', '--include_ld', type=int, default=1)
    parser.add_argument('-sdr', '--include_sdr', type=int, default=1)
    
    args = parser.parse_args()
    
    synth_audio, _ = librosa.load(args.synthesized_file, sr=args.sample_rate)
    original_audio, _ = librosa.load(args.original_file, sr=args.sample_rate)

    synth_audio_samples = synth_audio.shape[0]
    original_audio_samples = original_audio.shape[0]

    if synth_audio_samples < original_audio_samples:
        print(f"Trimming original audio samples from {original_audio_samples} to {synth_audio_samples}")
        original_audio_samples = synth_audio_samples
        original_audio = original_audio[:original_audio_samples]

    elif original_audio_samples < synth_audio_samples:
        print(f"Trimming synthesized audio samples from {synth_audio_samples} to {original_audio_samples}")
        synth_audio_samples = original_audio_samples
        synth_audio = synth_audio[:synth_audio_samples]

    if args.include_sdr:
        print(f"SDR: {_calc_sdr(synth_audio, original_audio)}")

    if args.include_f0:
        print("Calculating F0 for synthesized audio")
        synth_f0 = spectral_ops.compute_f0(synth_audio, args.sample_rate, args.frame_rate)[0]

        print("Calculating F0 for original audio")
        original_f0 = spectral_ops.compute_f0(original_audio, args.sample_rate, args.frame_rate)[0]
        f0_l1 = np.mean(abs(synth_f0 - original_f0))
        print(f"Average F0 L1: {f0_l1}")

    if args.include_ld:
        print("Calculating loudness for synthesized audio")
        synth_loudness = spectral_ops.compute_loudness(synth_audio, args.sample_rate, args.frame_rate)

        print("Calculating loudness for original audio")
        original_loudness = spectral_ops.compute_loudness(original_audio, args.sample_rate, args.frame_rate)

        loudness_l1 = np.mean(abs(synth_loudness - original_loudness))
        print(f"Average Loudness L1: {loudness_l1}")

    if args.include_spectral:
        from ddsp import losses
        loss_obj = losses.SpectralLoss(mag_weight=1.0, logmag_weight=1.0)
        spectral_loss = loss_obj(synth_audio, original_audio)
        print(f"Average Multi-scale spectrogram loss: {spectral_loss}")
Example 4
    def test_compute_loudness_at_indivisible_sample_rate(
            self, sample_rate, audio_len_sec):
        audio_sin = self._gen_np_sinusoid(sample_rate, audio_len_sec)

        for use_tf in [False, True]:
            with self.assertRaises(ValueError):
                spectral_ops.compute_loudness(audio_sin,
                                              sample_rate,
                                              self.frame_rate,
                                              use_tf=use_tf)
Example 5
    def call(self, target_audio, audio):

        loss = 0.0

        diff = spectral_ops.diff
        cumsum = tf.math.cumsum

        # Compute loss for each fft size.
        for loss_op in self.spectrogram_ops:
            target_mag = loss_op(target_audio)
            value_mag = loss_op(audio)

            # Add magnitude loss.
            if self.mag_weight > 0:
                loss += self.mag_weight * mean_difference(
                    target_mag, value_mag, self.loss_type)

            if self.delta_time_weight > 0:
                target = diff(target_mag, axis=1)
                value = diff(value_mag, axis=1)
                loss += self.delta_time_weight * mean_difference(
                    target, value, self.loss_type)

            if self.delta_freq_weight > 0:
                target = diff(target_mag, axis=2)
                value = diff(value_mag, axis=2)
                loss += self.delta_freq_weight * mean_difference(
                    target, value, self.loss_type)

            # TODO(kyriacos) normalize cumulative spectrogram
            if self.cumsum_freq_weight > 0:
                target = cumsum(target_mag, axis=2)
                value = cumsum(value_mag, axis=2)
                loss += self.cumsum_freq_weight * mean_difference(
                    target, value, self.loss_type)

            # Add logmagnitude loss, reusing spectrogram.
            if self.logmag_weight > 0:
                target = spectral_ops.safe_log(target_mag)
                value = spectral_ops.safe_log(value_mag)
                loss += self.logmag_weight * mean_difference(
                    target, value, self.loss_type)

        if self.loudness_weight > 0:
            target = spectral_ops.compute_loudness(target_audio,
                                                   n_fft=2048,
                                                   use_tf=True)
            value = spectral_ops.compute_loudness(audio,
                                                  n_fft=2048,
                                                  use_tf=True)
            loss += self.loudness_weight * mean_difference(
                target, value, self.loss_type)

        return loss
Example 6
  def test_tf_and_np_are_consistent(self):
    amp = 1e-2
    audio = amp * (np.random.rand(64000).astype(np.float32) * 2.0 - 1.0)
    frame_size = 2048
    frame_rate = 250

    ld_tf = spectral_ops.compute_loudness(
        audio, n_fft=frame_size, frame_rate=frame_rate, use_tf=True)

    ld_np = spectral_ops.compute_loudness(
        audio, n_fft=frame_size, frame_rate=frame_rate, use_tf=False)

    self.assertAllClose(np.abs(ld_np), np.abs(ld_tf), rtol=1e-3, atol=1e-3)
Example 7
def _add_loudness(ex, sample_rate, frame_rate, n_fft=2048):
    """Add loudness in dB."""
    beam.metrics.Metrics.counter('prepare-tfrecord', 'compute-loudness').inc()
    audio = ex['audio']
    mean_loudness_db = compute_loudness(audio, sample_rate, frame_rate, n_fft)
    ex = dict(ex)
    ex['loudness_db'] = mean_loudness_db.astype(np.float32)
    return ex
Example 8
    def test_tf_compute_loudness_at_sample_rate(self, sample_rate,
                                                audio_len_sec):
        audio_sin = self._gen_np_sinusoid(sample_rate, audio_len_sec)
        loudness = spectral_ops.compute_loudness(audio_sin, sample_rate,
                                                 self.frame_rate)
        expected_loudness_len = int(self.frame_rate * audio_len_sec)
        self.assertLen(loudness, expected_loudness_len)
        self.assertTrue(np.all(np.isfinite(loudness)))
Example 9
def prepare_partial_tfrecord(
        dataset_dir='nsynth_guitar',
        split='train',
        sample_rate=16000,
        frame_rate=250
):
    split_dir = os.path.join(dataset_dir, split)
    audio_dir = os.path.join(split_dir, 'audio')
    nsynth_dataset_file = os.path.join(split_dir, 'examples.json')
    partial_tfrecord_file = os.path.join(split_dir, 'partial.tfrecord')

    with open(nsynth_dataset_file, 'r') as file:
        nsynth_dataset_dict = json.load(file)

    steps = len(nsynth_dataset_dict)

    with tf.io.TFRecordWriter(partial_tfrecord_file) as writer:
        for step, (k, v) in enumerate(nsynth_dataset_dict.items()):
            start_time = time.perf_counter()

            file_name = '{}.wav'.format(k)
            target_path = os.path.join(audio_dir, file_name)

            audio = _load_audio(target_path, sample_rate)
            f0_hz, f0_confidence = spectral_ops.compute_f0(
                audio, sample_rate, frame_rate)
            mean_loudness_db = spectral_ops.compute_loudness(
                audio, sample_rate, frame_rate, 2048)

            audio = audio.astype(np.float32)
            f0_hz = f0_hz.astype(np.float32)
            f0_confidence = f0_confidence.astype(np.float32)
            mean_loudness_db = mean_loudness_db.astype(np.float32)

            # pitch_hz = core.midi_to_hz(v['pitch'])

            partial_dataset_dict = {
                'sample_name': _byte_feature([str.encode(k)]),
                'note_number': _int64_feature([v['pitch']]),
                'velocity': _int64_feature([v['velocity']]),
                'instrument_source': _int64_feature([v['instrument_source']]),
                'qualities': _int64_feature(v['qualities']),
                'audio': _float_feature(audio),
                'f0_hz': _float_feature(f0_hz),
                'f0_confidence': _float_feature(f0_confidence),
                'loudness_db': _float_feature(mean_loudness_db),
            }

            tf_example = tf.train.Example(
                features=tf.train.Features(feature=partial_dataset_dict))

            writer.write(tf_example.SerializeToString())

            stop_time = time.perf_counter()
            elapsed_time = stop_time - start_time
            print('{}/{} - sample_name: {} - elapsed_time: {:.3f}'.format(
                step+1, steps, k, elapsed_time))
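
The `_byte_feature`, `_int64_feature`, and `_float_feature` helpers are not included in this snippet; a minimal sketch using the standard `tf.train` wrappers (the `_float_list_feature` helper in Example 13 is presumably the same idea):

import tensorflow as tf


def _byte_feature(values):
    # List of byte strings -> tf.train.Feature.
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=values))


def _int64_feature(values):
    # List of ints -> tf.train.Feature.
    return tf.train.Feature(int64_list=tf.train.Int64List(value=values))


def _float_feature(values):
    # 1-D array or list of floats -> tf.train.Feature.
    return tf.train.Feature(float_list=tf.train.FloatList(value=values))
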
Example 10
def _add_loudness(ex, sample_rate, frame_rate, n_fft=2048):
    """Add loudness in dB."""
    beam.metrics.Metrics.counter('prepare-tfrecord', 'compute-loudness').inc()
    audio = ex['audio']
    expected_len = int(len(audio) / sample_rate * frame_rate)
    mean_loudness_db = compute_loudness(audio, sample_rate, frame_rate, n_fft)
    # Trim `mean_loudness_db` or pad to dB floor
    mean_loudness_db = _make_array_expected_length(mean_loudness_db,
                                                   expected_len, -LD_RANGE)
    ex = dict(ex)
    ex['loudness_db'] = mean_loudness_db.astype(np.float32)
    return ex
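
`_make_array_expected_length` and `LD_RANGE` come from the surrounding module and are not shown here; a minimal sketch of the trim-or-pad behaviour the comment describes, with the pad value supplied by the caller (above, the dB floor `-LD_RANGE`):

import numpy as np


def _make_array_expected_length(arr, expected_len, pad_value):
    # Sketch only: trim to `expected_len`, or right-pad with `pad_value`.
    if len(arr) >= expected_len:
        return arr[:expected_len]
    return np.pad(arr, (0, expected_len - len(arr)),
                  mode='constant', constant_values=pad_value)
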
Example 11
    def test_compute_loudness_at_sample_rate_1d(self, sample_rate,
                                                audio_len_sec):
        audio_sin = gen_np_sinusoid(self.frequency, self.amp, sample_rate,
                                    audio_len_sec)
        expected_loudness_len = int(self.frame_rate * audio_len_sec)

        for use_tf in [False, True]:
            loudness = spectral_ops.compute_loudness(audio_sin,
                                                     sample_rate,
                                                     self.frame_rate,
                                                     use_tf=use_tf)
            self.assertLen(loudness, expected_loudness_len)
            self.assertTrue(np.all(np.isfinite(loudness)))
Example 12
def compute_audio_features(audio,
                           n_fft=2048,
                           sample_rate=16000,
                           frame_rate=250):
  """Compute features from audio."""
  audio_feats = {'audio': audio}
  audio = squeeze(audio)

  audio_feats['loudness_db'] = spectral_ops.compute_loudness(
      audio, sample_rate, frame_rate, n_fft)

  audio_feats['f0_hz'], audio_feats['f0_confidence'] = spectral_ops.compute_f0(
      audio, sample_rate, frame_rate)

  return audio_feats
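
A short usage sketch (the file path is illustrative, and it assumes this function's module-level imports, e.g. `spectral_ops` and `squeeze` from DDSP, are available):

import librosa

# Illustrative: any mono recording resampled to 16 kHz.
audio, _ = librosa.load('example.wav', sr=16000)
feats = compute_audio_features(audio)
print(feats['loudness_db'].shape, feats['f0_hz'].shape)
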
Example 13
    def _encode(self):
        logging.info("Writing {}".format(self.tfr))
        with tf.python_io.TFRecordWriter(self.tfr) as writer:
            mix_wav_dir = os.path.join(self.wav_dir, "mix")
            s1_wav_dir = os.path.join(self.wav_dir, "s1")
            s2_wav_dir = os.path.join(self.wav_dir, "s2")
            s3_wav_dir = os.path.join(self.wav_dir, "s3")
            filenames = os.listdir(s1_wav_dir)
            for filename in tqdm(filenames):
                logging.info("Preprocessing %s" %
                             (os.path.join(mix_wav_dir, filename)))
                mix, _ = librosa.load(os.path.join(mix_wav_dir, filename),
                                      sr=self.sample_rate)

                logging.info("Preprocessing %s" %
                             (os.path.join(s1_wav_dir, filename)))
                s1, _ = librosa.load(os.path.join(s1_wav_dir, filename),
                                     sr=self.sample_rate)
                s1_f0 = spectral_ops.compute_f0(s1, self.sample_rate,
                                                self.frame_rate)[0]
                s1_loudness = spectral_ops.compute_loudness(
                    s1, self.sample_rate, self.frame_rate)

                logging.info("Preprocessing %s" %
                             (os.path.join(s2_wav_dir, filename)))
                s2, _ = librosa.load(os.path.join(s2_wav_dir, filename),
                                     sr=self.sample_rate)
                s2_f0 = spectral_ops.compute_f0(s2, self.sample_rate,
                                                self.frame_rate)[0]
                s2_loudness = spectral_ops.compute_loudness(
                    s2, self.sample_rate, self.frame_rate)

                logging.info("Preprocessing %s" %
                             (os.path.join(s3_wav_dir, filename)))
                s3, _ = librosa.load(os.path.join(s3_wav_dir, filename),
                                     sr=self.sample_rate)
                s3_f0 = spectral_ops.compute_f0(s3, self.sample_rate,
                                                self.frame_rate)[0]
                s3_loudness = spectral_ops.compute_loudness(
                    s3, self.sample_rate, self.frame_rate)

                def sample_to_frame(sample_num):
                    return int(self.frame_rate * sample_num / self.sample_rate)

                def write(l, r):
                    l_frame = sample_to_frame(l)
                    r_frame = sample_to_frame(r)

                    example = tf.train.Example(features=tf.train.Features(
                        feature={
                            "mix_audio":
                            self._float_list_feature(mix[l:r]),
                            "s1_audio":
                            self._float_list_feature(s1[l:r]),
                            "s1_f0":
                            self._float_list_feature(s1_f0[l_frame:r_frame]),
                            "s1_loudness":
                            self._float_list_feature(
                                s1_loudness[l_frame:r_frame]),
                            "s2_audio":
                            self._float_list_feature(s2[l:r]),
                            "s2_f0":
                            self._float_list_feature(s2_f0[l_frame:r_frame]),
                            "s2_loudness":
                            self._float_list_feature(
                                s2_loudness[l_frame:r_frame]),
                            "s3_audio":
                            self._float_list_feature(s3[l:r]),
                            "s3_f0":
                            self._float_list_feature(s3_f0[l_frame:r_frame]),
                            "s3_loudness":
                            self._float_list_feature(
                                s3_loudness[l_frame:r_frame]),
                        }))
                    writer.write(example.SerializeToString())

                now_length = s1.shape[-1]
                if now_length < int(4 * self.sample_rate):
                    continue
                target_length = int(4 * self.sample_rate)
                stride = int(4 * self.sample_rate)
                for i in range(0, now_length - target_length, stride):
                    write(i, i + target_length)
Example 14
def main():
    parser = argparse.ArgumentParser(
        description='Evaluate MCTN output loudness & F0 of every instrument')
    parser.add_argument('-ld', '--log_dir', type=str, default='./mctn_log')
    parser.add_argument('-rd', '--results_dir', type=str, default='./results')
    parser.add_argument('-sr', '--sample_rate', type=int, default=16000)
    parser.add_argument('-fr', '--frame_rate', type=int, default=250)
    parser.add_argument('-osd',
                        '--original_sound_dir',
                        type=str,
                        default='./results/DDSP - Same artist - Test')

    args = parser.parse_args()
    args.log_file = os.path.join(args.log_dir, 'log.txt')

    logger = logging.getLogger()
    logger.addHandler(logging.FileHandler(args.log_file))
    logger.addHandler(logging.StreamHandler(sys.stdout))
    logger.setLevel(logging.INFO)

    # Prepare input from files
    logger.info("Loading data from mctn files")
    audio_features_vocals = {
        'loudness_db':
        np.load(os.path.join(args.results_dir, "vocals_loudness_db.npy"),
                allow_pickle=False),
        'f0_hz':
        np.load(os.path.join(args.results_dir, "vocals_f0_hz.npy"),
                allow_pickle=False),
    }
    audio_features_bass = {
        'loudness_db':
        np.load(os.path.join(args.results_dir, "bass_loudness_db.npy"),
                allow_pickle=False),
        'f0_hz':
        np.load(os.path.join(args.results_dir, "bass_f0_hz.npy"),
                allow_pickle=False),
    }
    audio_features_drums = {
        'loudness_db':
        np.load(os.path.join(args.results_dir, "drums_loudness_db.npy"),
                allow_pickle=False),
        'f0_hz':
        np.load(os.path.join(args.results_dir, "drums_f0_hz.npy"),
                allow_pickle=False),
    }

    # Calc average loudness & F0 diff for every instrument
    for instrument, audio_features in [("bass", audio_features_bass),
                                       ("drums", audio_features_drums),
                                       ("vocals", audio_features_vocals)]:
        original_audio, _ = librosa.load(
            os.path.join(args.original_sound_dir,
                         f"original_{instrument}.wav"), sr=args.sample_rate)

        synth_audio_samples = (audio_features["f0_hz"].shape[1] *
                               args.sample_rate // args.frame_rate)
        original_audio_samples = original_audio.shape[0]

        if synth_audio_samples < original_audio_samples:
            logging.info(
                f"Trimming original {instrument} audio samples from {original_audio_samples} to {synth_audio_samples}"
            )
            original_audio_samples = synth_audio_samples
            original_audio = original_audio[:original_audio_samples]

        # Assuming only 1 batch
        synth_f0 = audio_features["f0_hz"][0]
        synth_loudness = audio_features["loudness_db"][0]

        logging.info(f"Calculating F0 for {instrument} original audio")
        original_f0 = spectral_ops.compute_f0(original_audio, args.sample_rate,
                                              args.frame_rate)[0]
        logging.info(f"Calculating loudness for {instrument} original audio")
        original_loudness = spectral_ops.compute_loudness(
            original_audio, args.sample_rate, args.frame_rate)

        f0_l1 = np.mean(abs(synth_f0 - original_f0))
        loudness_l1 = np.mean(abs(synth_loudness - original_loudness))
        logging.info(f"Average {instrument} F0 L1: {f0_l1}")
        logging.info(f"Average {instrument} Loudness L1: {loudness_l1}")
Example 15
    def call(self, target_audio, audio, weights=None):
        loss = 0.0

        diff = spectral_ops.diff
        cumsum = tf.math.cumsum

        # Compute loss for each fft size.
        for loss_op in self.spectrogram_ops:
            target_mag = loss_op(target_audio)
            value_mag = loss_op(audio)

            # Add magnitude loss.
            if self.mag_weight > 0:
                loss += self.mag_weight * mean_difference(
                    target_mag, value_mag, self.loss_type, weights=weights)

            if self.delta_time_weight > 0:
                target = diff(target_mag, axis=1)
                value = diff(value_mag, axis=1)
                loss += self.delta_time_weight * mean_difference(
                    target, value, self.loss_type, weights=weights)

            if self.delta_freq_weight > 0:
                target = diff(target_mag, axis=2)
                value = diff(value_mag, axis=2)
                loss += self.delta_freq_weight * mean_difference(
                    target, value, self.loss_type, weights=weights)

            # TODO(kyriacos) normalize cumulative spectrogram
            if self.cumsum_freq_weight > 0:
                target = cumsum(target_mag, axis=-1)
                value = cumsum(value_mag, axis=-1)
                loss += self.cumsum_freq_weight * mean_difference(
                    target, value, self.loss_type, weights=weights)

            if self.bin_time_weight > 0:
                target = tf.reduce_sum(target_mag, axis=-1)
                value = tf.reduce_sum(value_mag, axis=-1)
                # target = tf.cumsum(target, axis=-1)
                # value = tf.cumsum(value, axis=-1)
                loss += self.bin_time_weight * mean_difference(
                    target, value, self.loss_type, weights=weights)
                # times = tf.cast(tf.linspace(0, 1, tf.shape(target)[-1]), dtype=tf.float32)
                # target = target / tf.reduce_sum(target, axis=-1)
                # value = value / tf.reduce_sum(value, axis=-1)
                # target = tf.cast(tf.expand_dims(target, axis=1), dtype=tf.float32)
                # value = tf.cast(tf.expand_dims(value, axis=1), dtype=tf.float32)
                # loss += self.bin_time_weight * tf.reduce_mean(wasserstein_distance(times, times, target, value))

            if self.max_power_weight > 0:
                target = spectral_ops.safe_log(
                    tf.reduce_max(target_mag, axis=2))
                value = spectral_ops.safe_log(tf.reduce_max(value_mag, axis=2))
                loss += self.max_power_weight * mean_difference(
                    target, value, self.loss_type, weights=weights)

            # Add logmagnitude loss, reusing spectrogram.
            if self.logmag_weight > 0:
                target = spectral_ops.safe_log(target_mag)
                value = spectral_ops.safe_log(value_mag)
                loss += self.logmag_weight * mean_difference(
                    target, value, self.loss_type, weights=weights)

            if self.mel_weight > 0 or self.logmel_weight > 0:
                target_mel = spectral_ops.compute_mel_from_mag(
                    target_mag,
                    lo_hz=2.0,
                    bins=None,
                    fft_size=loss_op.keywords['size'])
                value_mel = spectral_ops.compute_mel_from_mag(
                    value_mag,
                    lo_hz=2.0,
                    bins=None,
                    fft_size=loss_op.keywords['size'])
                if self.mel_weight > 0:
                    loss += self.mel_weight * mean_difference(
                        target_mel, value_mel, self.loss_type, weights=weights)
                if self.logmel_weight > 0:
                    target_logmel = spectral_ops.safe_log(target_mel)
                    value_logmel = spectral_ops.safe_log(value_mel)
                    loss += self.logmel_weight * mean_difference(
                        target_logmel,
                        value_logmel,
                        self.loss_type,
                        weights=weights)

        if self.loudness_weight > 0:
            target = spectral_ops.compute_loudness(target_audio,
                                                   n_fft=2048,
                                                   use_tf=True)
            value = spectral_ops.compute_loudness(audio,
                                                  n_fft=2048,
                                                  use_tf=True)
            loss += self.loudness_weight * mean_difference(
                target, value, self.loss_type, weights=weights)

        return loss