Example #1
0
def spectrogram_summary(audio, audio_gen, step, name=''):
    """Writes a summary of spectrograms for a batch of audio.

    For every example in the batch, a two-panel figure is drawn with the
    log-magnitude spectrogram of the original audio on top and that of the
    generated audio below. Each figure is handed to `fig_summary` under the
    tag 'spectrogram/<name>_<index>' (or 'spectrogram/<index>' when `name` is
    empty), where <index> starts at 1.

    Args:
      audio: Batch of original audio. The first dimension is used as the
        batch size.
      audio_gen: Batch of generated audio, indexed in parallel with `audio`.
      step: Step value forwarded unchanged to `fig_summary`.
      name: Optional prefix for the summary tags.
    """
    # Batch spectrogram operations.
    spectrograms = ddsp.spectral_ops.compute_logmag(tf_float32(audio),
                                                    size=768)
    spectrograms_gen = ddsp.spectral_ops.compute_logmag(tf_float32(audio_gen),
                                                        size=768)

    # Build the tag prefix once. Doing this inside the loop would append an
    # extra underscore on every iteration ('foo_1', 'foo__2', ...).
    prefix = name + '_' if name else ''

    batch_size = int(audio.shape[0])
    for i in range(batch_size):
        # Manually specify exact size of fig for tensorboard.
        fig, axs = plt.subplots(2, 1, figsize=(8, 8))

        panels = (
            ('original', spectrograms[i]),
            ('synthesized', spectrograms_gen[i]),
        )
        for ax, (title, spec) in zip(axs, panels):
            ax.matshow(np.rot90(spec),
                       vmin=-5,
                       vmax=1,
                       aspect='auto',
                       cmap=plt.cm.magma)
            ax.set_title(title)
            ax.set_xticks([])
            ax.set_yticks([])

        # Format and save plot to image.
        tag = 'spectrogram/{}{}'.format(prefix, i + 1)
        fig_summary(tag, fig, step)
Example #2
0
def stft(audio, frame_size=2048, overlap=0.75, pad_end=True):
    """Differentiable stft in tensorflow, computed in batch.

    Args:
      audio: Input audio; converted with `tf_float32` before the transform.
      frame_size: Window length in samples. Also used as the FFT length.
      overlap: Fraction of each frame shared with the next one.
        `frame_size * overlap` must be an even whole number.
      pad_end: Forwarded to `tf.signal.stft`.

    Returns:
      The result of `tf.signal.stft` for the given framing.
    """
    audio = tf_float32(audio)
    overlap_size = frame_size * overlap
    assert overlap_size % 2.0 == 0.0
    # Derive the hop by subtraction rather than `frame_size * (1.0 - overlap)`:
    # `1.0 - overlap` can carry float rounding error that truncates one sample
    # short (e.g. 1000 * (1.0 - 0.9) -> 99.99999999999997 -> 99), whereas
    # `overlap_size` has just been checked to be an exact whole number.
    frame_step = int(frame_size - overlap_size)
    s = tf.signal.stft(signals=audio,
                       frame_length=int(frame_size),
                       frame_step=frame_step,
                       fft_length=int(frame_size),
                       pad_end=pad_end)
    return s
Example #3
0
 def setUp(self):
     """Build the dummy input tensors shared by the tests.

     Stores the batch/frame/sample counts on the instance and fills
     `self.inputs` with zero-valued 'loudness_db' and 'f0_hz' arrays plus
     random-normal 'audio', each converted with `tf_float32`.
     """
     super().setUp()

     # Dimensions of the dummy data.
     self.n_batch, self.n_frames, self.n_samples = 4, 1001, 64000

     frame_shape = [self.n_batch, self.n_frames]
     self.inputs = {
         'loudness_db': tf_float32(np.zeros(frame_shape)),
         'f0_hz': tf_float32(np.zeros(frame_shape)),
         'audio': tf_float32(np.random.randn(self.n_batch, self.n_samples)),
     }
Example #4
0
def compute_mag(audio, size=2048, overlap=0.75, pad_end=True):
    """Returns the magnitude of the STFT of `audio` as float32.

    Args:
      audio: Input audio passed to `stft`.
      size: Frame size handed to `stft` as `frame_size`.
      overlap: Frame overlap fraction handed to `stft`.
      pad_end: Forwarded to `stft`.

    Returns:
      Absolute value of the STFT, converted with `tf_float32`.
    """
    spectrum = stft(audio, frame_size=size, overlap=overlap, pad_end=pad_end)
    magnitude = tf.abs(spectrum)
    return tf_float32(magnitude)
Example #5
0
def compute_loudness(audio,
                     sample_rate=16000,
                     frame_rate=250,
                     n_fft=2048,
                     range_db=LD_RANGE,
                     ref_db=20.7,
                     use_tf=False):
    """Perceptual loudness in dB, relative to white noise, amplitude=1.

    The STFT magnitude is converted to decibels, A-weighted per frequency
    bin, shifted by `ref_db`, floored at `-range_db`, and averaged over the
    frequency axis. The tensorflow code path is used when use_tf=True,
    otherwise numpy is used.

    Args:
      audio: Numpy ndarray or tensor. Shape [batch_size, audio_length] or
        [audio_length,] for a single example.
      sample_rate: Audio sample rate in Hz.
      frame_rate: Rate of loudness frames in Hz.
      n_fft: Fft window size.
      range_db: Sets the dynamic range of loudness in decibels. The minimum
        loudness (per a frequency bin) corresponds to -range_db.
      ref_db: Sets the reference maximum perceptual loudness as given by
        (A_weighting + 10 * log10(abs(stft(audio))**2.0). The default value
        corresponds to white noise with amplitude=1.0 and n_fft=2048. There
        is a slight dependence on fft_size due to different granularity of
        perceptual weighting.
      use_tf: Compute with tensorflow ops (`stft`) instead of numpy
        (`stft_np`), which makes the function differentiable.

    Returns:
      Loudness in decibels. Shape [batch_size, n_frames] or [n_frames,].
    """
    # Select the backend and its matching helpers in one place.
    if use_tf:
        lib = tf
        stft_fn = stft
        log10 = lambda x: tf.math.log(x) / tf.math.log(10.0)
        mean = tf.reduce_mean
        # Tensorflow path works on float32 tensors.
        audio = tf_float32(audio)
    else:
        lib = np
        stft_fn = stft_np
        log10 = np.log10
        mean = np.mean

    # Single examples get a temporary leading batch dimension.
    single_example = len(audio.shape) == 1
    if single_example:
        audio = audio[lib.newaxis, :]

    # Take STFT with a hop that yields `frame_rate` frames per second.
    hop_size = sample_rate // frame_rate
    spectrum = stft_fn(audio,
                       frame_size=n_fft,
                       overlap=1 - hop_size / n_fft,
                       pad_end=True)

    # Amplitude to decibels; the floor avoids log(0) instabilities.
    amin = 1e-20
    power_db = log10(lib.maximum(amin, lib.abs(spectrum))) * 20.0

    # Perceptual (A) weighting, broadcast over batch and frame dimensions.
    bin_frequencies = librosa.fft_frequencies(sr=sample_rate, n_fft=n_fft)
    weights = librosa.A_weighting(bin_frequencies)
    a_weighting = weights[lib.newaxis, lib.newaxis, :]

    # Reference the loudness to `ref_db` and clip to the dynamic range.
    loudness = power_db + a_weighting
    loudness = loudness - ref_db
    loudness = lib.maximum(loudness, -range_db)

    # Average over frequency bins.
    loudness = mean(loudness, axis=-1)

    # Drop the temporary batch dimension again for single examples.
    if single_example:
        return loudness[0]
    return loudness
Example #6
0
def get_spectrogram(audio, rotate=False, size=1024):
    """Compute the log-magnitude spectrogram of `audio`.

    Args:
      audio: Input audio; converted with `tf_float32` first.
      rotate: If true, the result is passed through `np.rot90`.
      size: Forwarded to `ddsp.spectral_ops.compute_logmag` as `size`.

    Returns:
      The log-magnitude spectrogram, rotated when `rotate` is set.
    """
    logmag = ddsp.spectral_ops.compute_logmag(tf_float32(audio), size=size)
    return np.rot90(logmag) if rotate else logmag
Example #7
0
 def call(self, target_audio, audio):
   """Weighted embedding loss between target and synthesized audio.

   Both signals are converted with `tf_float32`, embedded by
   `self.pretrained_model`, and compared with `mean_difference` using
   `self.loss_type`; the result is scaled by `self.weight`.
   """
   synth_audio = tf_float32(audio)
   reference_audio = tf_float32(target_audio)

   # Embed the target first, then the synthesized signal.
   reference_emb = self.pretrained_model(reference_audio)
   synth_emb = self.pretrained_model(synth_audio)

   difference = mean_difference(reference_emb, synth_emb, self.loss_type)
   return self.weight * difference