Esempio n. 1
0
def stft(audio, frame_size=2048, overlap=0.75, pad_end=True):
    """Differentiable stft in torch, computed in batch.

    Args:
        audio: Audio to transform; converted with `f32`. The last axis is
            treated as time.
        frame_size: Length of each analysis frame. Also used as `n_fft` and
            as the Hann window length.
        overlap: Fraction of `frame_size` shared by consecutive frames.
            `frame_size * overlap` must be an even whole number.
        pad_end: If True, zero-pad the end of the audio so that
            `ceil(n_samples / hop_size)` frames are produced.

    Returns:
        The output of `th.stft` called with `center=False`.
    """
    audio = f32(audio)
    assert frame_size * overlap % 2.0 == 0.0
    # `frame_size * (1.0 - overlap)` can land just below the intended whole
    # number (e.g. 1000 * (1.0 - 0.9) is 99.999...), which `int()` would
    # truncate to a hop that is one sample too short. Subtract the overlap
    # length instead and round to the nearest integer.
    hop_size = int(round(frame_size - frame_size * overlap))
    is_2d = len(audio.shape) == 2

    if pad_end:
        n_samples_initial = audio.shape[-1]
        n_frames = int(np.ceil(n_samples_initial / hop_size))
        n_samples_final = (n_frames - 1) * hop_size + frame_size
        pad = n_samples_final - n_samples_initial
        padding = (0, pad)
        # Temporarily add a batch dimension for non-2D input while padding.
        audio = audio[None, ...] if not is_2d else audio
        audio = th.nn.functional.pad(audio, padding, "constant")
        audio = audio[0] if not is_2d else audio

    s = th.stft(
        audio,
        # The window must live on the same device as the signal, otherwise
        # `th.stft` fails for audio that is not on the default device.
        window=th.hann_window(int(frame_size), device=audio.device),
        hop_length=hop_size,
        n_fft=int(frame_size),
        center=False,
    )
    return s
Esempio n. 2
0
 def forward(self, audio, target_audio):
     """Return the weighted embedding loss between `audio` and `target_audio`.

     Both inputs are converted with `f32`, passed through
     `self.pretrained_model`, and the two embeddings are compared by
     `mean_difference` using `self.loss_type`. The result is scaled by
     `self.weight`.
     """
     audio, target_audio = f32(audio, target_audio)
     # The target is embedded before the synthesized audio.
     reference_embedding = self.pretrained_model(target_audio)
     generated_embedding = self.pretrained_model(audio)
     return self.weight * mean_difference(
         reference_embedding, generated_embedding, self.loss_type
     )
Esempio n. 3
0
def pad_or_trim_to_expected_length(
    vector, expected_len, pad_value=0, len_tolerance=20, use_th=False
):
    """Make vector equal to the expected length along its last axis.

    Feature extraction functions like `compute_loudness()` or `compute_f0` produce
    feature vectors that vary in length depending on factors such as `sample_rate`
    or `hop_size`. This function corrects vectors to the expected length, raising
    an error if the difference between the vector and expected length was
    unusually high to begin with.

    Args:
        vector: Numpy ndarray or torch tensor. Shape [..., vector_length]; any
            number of leading axes is accepted and left untouched.
        expected_len: Expected length of the last axis of vector.
        pad_value: Value to pad at end of vector.
        len_tolerance: Tolerance of difference between original and desired vector
            length.
        use_th: Pad with torch instead of numpy so the operation is
            differentiable.

    Returns:
        vector: Vector with corrected length. Shape [..., expected_len].

    Raises:
        ValueError: if `vector.shape[-1]` is different from `expected_len` beyond
            `len_tolerance` to begin with.
    """
    expected_len = int(expected_len)
    vector_len = int(vector.shape[-1])

    if abs(vector_len - expected_len) > len_tolerance:
        # Ensure vector was close to expected length to begin with
        raise ValueError(
            "Vector length: {} differs from expected length: {} "
            "beyond tolerance of : {}".format(
                vector_len, expected_len, len_tolerance
            )
        )

    # Temporarily add a batch dimension for single examples.
    is_1d = len(vector.shape) == 1
    vector = vector[None, :] if is_1d else vector

    # Pad missing samples
    if vector_len < expected_len:
        n_padding = expected_len - vector_len
        if use_th:
            vector = th.nn.functional.pad(
                f32(vector), (0, n_padding), mode="constant", value=pad_value
            )
        else:
            # Only the last axis grows; every leading axis gets (0, 0) so that
            # inputs of any rank are handled, like in the torch branch.
            pad_width = [(0, 0)] * (len(vector.shape) - 1) + [(0, n_padding)]
            vector = np.pad(
                vector,
                pad_width,
                mode="constant",
                constant_values=pad_value,
            )
    # Trim samples
    elif vector_len > expected_len:
        vector = vector[..., :expected_len]

    # Remove temporary batch dimension.
    vector = vector[0] if is_1d else vector
    return vector
Esempio n. 4
0
def hertz_to_mel(frequencies_hertz):
    """Map frequencies given in Hertz onto the mel scale.

    The input is converted with `f32`, divided by the module's break
    frequency, and the log of one plus that ratio is scaled by the module's
    mel Q constant.
    """
    break_ratio = f32(frequencies_hertz) / _MEL_BREAK_FREQUENCY_HERTZ
    log_term = (1.0 + break_ratio).log()
    return _MEL_HIGH_FREQUENCY_Q * log_term
Esempio n. 5
0
def mel_to_hertz(mel_values):
    """Map values on the mel scale back to frequencies in Hertz.

    The input is converted with `f32`, divided by the module's mel Q
    constant and exponentiated; one is subtracted and the result is scaled
    by the module's break frequency.
    """
    exponent = f32(mel_values) / _MEL_HIGH_FREQUENCY_Q
    growth = exponent.exp() - 1.0
    return _MEL_BREAK_FREQUENCY_HERTZ * growth
Esempio n. 6
0
def compute_loudness(
    audio,
    sample_rate=16000,
    frame_rate=250,
    n_fft=2048,
    range_db=LD_RANGE,
    ref_db=20.7,
    use_th=False,
):
    """Perceptual loudness in dB, relative to white noise, amplitude=1.

    Function is differentiable if use_th=True.

    Args:
        audio: Numpy ndarray or tensor. Shape [batch_size, audio_length] or
            [audio_length,].
        sample_rate: Audio sample rate in Hz.
        frame_rate: Rate of loudness frames in Hz. Must evenly divide
            `sample_rate`.
        n_fft: Fft window size.
        range_db: Sets the dynamic range of loudness in decibels. The minimum
            loudness (per a frequency bin) corresponds to -range_db.
        ref_db: Sets the reference maximum perceptual loudness as given by
            (A_weighting + 10 * log10(abs(stft(audio))**2.0). The default value
            corresponds to white noise with amplitude=1.0 and n_fft=2048. There is a
            slight dependence on fft_size due to different granularity of perceptual
            weighting.
        use_th: Make function differentiable by using torch instead of numpy.

    Returns:
        Loudness in decibels. Shape [batch_size, n_frames] or [n_frames,].

    Raises:
        ValueError: if `frame_rate` does not evenly divide `sample_rate`.
    """
    if sample_rate % frame_rate != 0:
        raise ValueError(
            "frame_rate: {} must evenly divide sample_rate: {}. "
            "For default frame_rate: 250Hz, suggested sample_rate: 16kHz or 48kHz".format(
                frame_rate, sample_rate
            )
        )

    # Avoid log(0) instabilities.
    amin = 1e-20

    # Pick torch or numpy.
    lib = th if use_th else np

    # Make inputs tensors for torch.
    if use_th:
        audio, range_db, amin = f32(audio, range_db, amin)

    # Temporarily a batch dimension for single examples.
    is_1d = len(audio.shape) == 1
    audio = audio[None, :] if is_1d else audio

    # Take STFT.
    hop_size = sample_rate // frame_rate
    overlap = 1 - hop_size / n_fft
    stft_fn = stft if use_th else stft_np
    s = stft_fn(audio, frame_size=n_fft, overlap=overlap, pad_end=True)

    # Compute power
    amplitude = complex_abs(s) if use_th else np.abs(s)
    maximum = th.max if use_th else np.maximum
    power_db = lib.log10(maximum(amin, amplitude))
    power_db *= 20.0

    # Perceptual weighting.
    frequencies = librosa.fft_frequencies(sr=sample_rate, n_fft=n_fft)
    a_weighting = librosa.A_weighting(frequencies)[None, :, None]
    a_weighting = f32(a_weighting) if use_th else a_weighting
    loudness = power_db + a_weighting

    # Set dynamic range.
    loudness -= ref_db
    loudness = maximum(-range_db, loudness)

    # Average over frequency bins.
    loudness = lib.mean(loudness, -2)

    # Remove temporary batch dimension.
    loudness = loudness[0] if is_1d else loudness

    # Compute expected length of loudness vector. `frame_rate` evenly divides
    # `sample_rate`, so the frame count is the sample count floor-divided by
    # `hop_size`. Integer arithmetic avoids the float round trip
    # `int(n_samples / sample_rate * frame_rate)`, which can land just below
    # a whole number and truncate to one frame too few.
    expected_len = int(audio.shape[-1] // hop_size)

    # Pad with `-range_db` noise floor or trim vector
    loudness = pad_or_trim_to_expected_length(
        loudness, expected_len, -range_db, use_th=use_th
    )
    return loudness