Example #1
0
  def call(self, target_audio, audio):
    """Return the weighted multi-resolution spectral loss between two signals.

    For every size in `self.fft_sizes` a magnitude spectrogram of both inputs
    is computed with `spectral_ops.compute_mag`. Each term whose weight is
    positive adds `weight * mean_difference(target, value, self.loss_type)` to
    the running total, in this order: magnitudes, first and second differences
    along axis 1 (`delta_time_weight`, `delta_delta_time_weight`), first and
    second differences along axis 2 (`delta_freq_weight`,
    `delta_delta_freq_weight`), and log-magnitudes. A loudness term computed
    from the raw audio is added once at the end.

    Args:
      target_audio: Reference audio, passed to `spectral_ops.compute_mag` and
        `spectral_ops.compute_loudness`.
      audio: Audio compared against `target_audio`.

    Returns:
      The accumulated loss. It is the float `0.0` if no term contributed.
    """
    spectrogram_ops = [
        functools.partial(spectral_ops.compute_mag, size=fft_size)
        for fft_size in self.fft_sizes
    ]

    def repeated_diff(spectrogram, axis, order):
      """Apply `spectral_ops.diff` along `axis` `order` times (0 = as is)."""
      for _ in range(order):
        spectrogram = spectral_ops.diff(spectrogram, axis=axis)
      return spectrogram

    # (weight, axis, order) triples, listed in accumulation order. Order 0 is
    # the plain magnitude comparison, for which the axis is irrelevant.
    difference_terms = (
        (self.mag_weight, None, 0),
        (self.delta_time_weight, 1, 1),
        (self.delta_delta_time_weight, 1, 2),
        (self.delta_freq_weight, 2, 1),
        (self.delta_delta_freq_weight, 2, 2),
    )

    total = 0.0

    # One pass per fft size; both spectrograms are reused by every term.
    for spectrogram_op in spectrogram_ops:
      target_mag = spectrogram_op(target_audio)
      value_mag = spectrogram_op(audio)

      for weight, axis, order in difference_terms:
        if weight > 0:
          total += weight * mean_difference(
              repeated_diff(target_mag, axis, order),
              repeated_diff(value_mag, axis, order),
              self.loss_type)

      # Log-magnitude term, computed from the same spectrograms.
      if self.logmag_weight > 0:
        total += self.logmag_weight * mean_difference(
            spectral_ops.safe_log(target_mag),
            spectral_ops.safe_log(value_mag),
            self.loss_type)

    # The loudness term does not depend on the fft sizes above.
    if self.loudness_weight > 0:
      total += self.loudness_weight * mean_difference(
          spectral_ops.compute_loudness(target_audio, n_fft=2048),
          spectral_ops.compute_loudness(audio, n_fft=2048),
          self.loss_type)

    return total
Example #2
0
def compute_audio_features(audio,
                           n_fft=6144,
                           sample_rate=48000,
                           frame_rate=250):
    """Compute loudness and f0 features for three views of a stereo signal.

    Three signals are derived from `audio`: `M` (mean over axis 1), `L`
    (column 0) and `R` (column 1). Each one is squeezed and then given a
    leading axis of length 1.
    NOTE(review): this presumably expects `audio` shaped (samples, 2) --
    confirm against callers.

    Args:
      audio: Array indexable as `audio[:, 0:1]` and `audio[:, 1:2]`.
      n_fft: Forwarded to `spectral_ops.compute_loudness`.
      sample_rate: Forwarded to `spectral_ops.compute_loudness` and
        `spectral_ops.compute_f0`.
      frame_rate: Forwarded to `spectral_ops.compute_loudness` and
        `spectral_ops.compute_f0`.

    Returns:
      Dictionary with, for each suffix X in M, L, R: `audioX` (the derived
      signal in its original dtype), `loudness_dbX` (computed from a float32
      copy), and `f0_hzX` / `f0_confidenceX` (computed from the squeezed,
      uncast signal).
    """
    # Build the mid / left / right views in the order the features are stored.
    channel_views = (
        ('M', np.mean(audio, axis=1)),
        ('L', audio[:, 0:1]),
        ('R', audio[:, 1:2]),
    )
    channels = [(tag, np.expand_dims(np.squeeze(view), axis=0))
                for tag, view in channel_views]

    audio_feats = {}
    for tag, signal in channels:
        audio_feats['audio' + tag] = signal

    # Loudness is computed from float32 copies; the stored audio is untouched.
    float32_channels = [(tag, signal.astype(np.float32))
                        for tag, signal in channels]
    for tag, signal32 in float32_channels:
        audio_feats['loudness_db' + tag] = spectral_ops.compute_loudness(
            signal32, sample_rate, frame_rate, n_fft)

    # f0 estimation receives the signal with the leading axis removed again.
    for tag, signal in channels:
        f0_hz, f0_confidence = spectral_ops.compute_f0(
            np.squeeze(signal), sample_rate, frame_rate)
        audio_feats['f0_hz' + tag] = f0_hz
        audio_feats['f0_confidence' + tag] = f0_confidence

    return audio_feats
Example #3
0
def compute_audio_features(audio,
                           n_fft=2048,
                           sample_rate=16000,
                           frame_rate=250):
    """Compute loudness and f0 features for a single audio signal.

    Args:
      audio: Input audio. It is stored unchanged under the `audio` key; the
        features are computed from `squeeze(audio)`.
      n_fft: Forwarded to `spectral_ops.compute_loudness`.
      sample_rate: Forwarded to `spectral_ops.compute_loudness` and
        `spectral_ops.compute_f0`.
      frame_rate: Forwarded to `spectral_ops.compute_loudness` and
        `spectral_ops.compute_f0`.

    Returns:
      Dictionary with the keys `audio`, `loudness_db`, `f0_hz` and
      `f0_confidence`.
    """
    features = {'audio': audio}

    # Both extractors operate on the squeezed signal, not the stored original.
    squeezed = squeeze(audio)

    loudness_db = spectral_ops.compute_loudness(
        squeezed, sample_rate, frame_rate, n_fft)
    features['loudness_db'] = loudness_db

    f0_hz, f0_confidence = spectral_ops.compute_f0(
        squeezed, sample_rate, frame_rate)
    features['f0_hz'] = f0_hz
    features['f0_confidence'] = f0_confidence

    return features
Example #4
0
    def update_state(self, batch, audio_gen):
        """Update the loudness metric from a batch of generated audio.

        Loudness is computed for the whole generated batch in one call. Then,
        for each example, the mean of `l1_distance` between the original and
        generated loudness is added to `self.metrics['loudness_db']` and
        logged.

        Args:
          batch: Dictionary of input features; `batch['loudness_db']` is read.
          audio_gen: Batch of generated audio. Its first dimension is used as
            the number of examples.
        """
        reference_loudness = batch['loudness_db']

        # A single call covers every example in the batch.
        generated_loudness = spectral_ops.compute_loudness(
            audio_gen,
            sample_rate=self._sample_rate,
            frame_rate=self._frame_rate)

        num_examples = int(audio_gen.shape[0])
        for i in range(num_examples):
            distances = l1_distance(reference_loudness[i],
                                    generated_loudness[i])
            ld_dist = np.mean(distances)
            self.metrics['loudness_db'].update_state(ld_dist)
            logging.info(
                f'{self._name} | sample {i} | ld_dist(db): {ld_dist:.3f}')
Example #5
0
    def call(self, target_audio, audio):
        """Return the weighted multi-resolution spectral loss of two signals.

        Each callable in `self.spectrogram_ops` is applied to both inputs.
        Every term whose weight is positive adds
        `weight * mean_difference(target, value, self.loss_type)` to the
        running total, in this order: magnitudes, first and second differences
        along axis 1 (`delta_time_weight`, `delta_delta_time_weight`), first
        and second differences along axis 2 (`delta_freq_weight`,
        `delta_delta_freq_weight`), cumulative sum along axis 2
        (`cumsum_freq_weight`), and log-magnitudes. A loudness term computed
        from the raw audio is added once at the end.

        Args:
          target_audio: Reference audio, passed to each spectrogram op and to
            `spectral_ops.compute_loudness`.
          audio: Audio compared against `target_audio`.

        Returns:
          The accumulated loss. It is the float `0.0` if no term contributed.
        """

        def repeated_diff(spectrogram, axis, order):
            """Apply `spectral_ops.diff` along `axis` `order` times."""
            for _ in range(order):
                spectrogram = spectral_ops.diff(spectrogram, axis=axis)
            return spectrogram

        # (weight, axis, order) triples, listed in accumulation order. Order 0
        # is the plain magnitude comparison, for which the axis is irrelevant.
        difference_terms = (
            (self.mag_weight, None, 0),
            (self.delta_time_weight, 1, 1),
            (self.delta_delta_time_weight, 1, 2),
            (self.delta_freq_weight, 2, 1),
            (self.delta_delta_freq_weight, 2, 2),
        )

        total = 0.0

        # One pass per spectrogram op; both spectrograms feed every term.
        for spectrogram_op in self.spectrogram_ops:
            target_mag = spectrogram_op(target_audio)
            value_mag = spectrogram_op(audio)

            for weight, axis, order in difference_terms:
                if weight > 0:
                    total += weight * mean_difference(
                        repeated_diff(target_mag, axis, order),
                        repeated_diff(value_mag, axis, order),
                        self.loss_type)

            # TODO(kyriacos) normalize cumulative spectrogram
            if self.cumsum_freq_weight > 0:
                total += self.cumsum_freq_weight * mean_difference(
                    tf.math.cumsum(target_mag, axis=2),
                    tf.math.cumsum(value_mag, axis=2),
                    self.loss_type)

            # Log-magnitude term, computed from the same spectrograms.
            if self.logmag_weight > 0:
                total += self.logmag_weight * mean_difference(
                    spectral_ops.safe_log(target_mag),
                    spectral_ops.safe_log(value_mag),
                    self.loss_type)

        # The loudness term does not depend on the spectrogram ops above.
        if self.loudness_weight > 0:
            total += self.loudness_weight * mean_difference(
                spectral_ops.compute_loudness(target_audio,
                                              n_fft=6144,
                                              use_tf=True),
                spectral_ops.compute_loudness(audio,
                                              n_fft=6144,
                                              use_tf=True),
                self.loss_type)

        return total