def stft(audio, frame_size=2048, overlap=0.75, pad_end=True):
    """Differentiable stft in torch, computed in batch."""
    audio = f32(audio)
    assert frame_size * overlap % 2.0 == 0.0
    hop_size = int(frame_size * (1.0 - overlap))
    is_2d = len(audio.shape) == 2

    if pad_end:
        # Pad so that the final frame covers the end of the signal.
        n_samples_initial = audio.shape[-1]
        n_frames = int(np.ceil(n_samples_initial / hop_size))
        n_samples_final = (n_frames - 1) * hop_size + frame_size
        pad = n_samples_final - n_samples_initial
        padding = (0, pad)
        audio = audio[None, ...] if not is_2d else audio
        audio = th.nn.functional.pad(audio, padding, "constant")
        audio = audio[0] if not is_2d else audio

    s = th.stft(
        audio,
        # Create the window on the same device as the audio so GPU inputs work.
        window=th.hann_window(int(frame_size), device=audio.device),
        hop_length=hop_size,
        n_fft=int(frame_size),
        center=False,
    )
    return s
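
# Shape sketch for `stft` above (illustrative only; `_demo_stft_shapes` is not
# part of the original module). With the defaults frame_size=2048 and
# overlap=0.75, hop_size = 2048 * (1 - 0.75) = 512, so one second of 16 kHz
# audio with pad_end=True yields ceil(16000 / 512) = 32 frames. Note that
# recent torch versions may also require an explicit `return_complex` argument
# to `th.stft`.
def _demo_stft_shapes():
    audio = th.randn(4, 16000)  # [batch, n_samples]
    s = stft(audio, frame_size=2048, overlap=0.75, pad_end=True)
    # Frequency axis: n_fft // 2 + 1 = 1025 bins; frame axis: 32 frames.
    print(s.shape)
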
def forward(self, audio, target_audio):
    """Weighted distance between pretrained-model embeddings of `audio` and `target_audio`."""
    audio, target_audio = f32(audio, target_audio)
    target_emb = self.pretrained_model(target_audio)
    synth_emb = self.pretrained_model(audio)
    loss = self.weight * mean_difference(target_emb, synth_emb, self.loss_type)
    return loss
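
# Usage sketch for the `forward` above (assumption: in the original code this
# method belongs to a torch.nn.Module loss whose constructor stores
# `pretrained_model`, `weight`, and `loss_type`; the names below are
# illustrative stand-ins, not part of this module):
#
#     loss_fn = SomePretrainedEmbeddingLoss(
#         pretrained_model=embedding_model, weight=1.0, loss_type="L1"
#     )
#     loss = loss_fn(synth_audio, target_audio)  # scalar tensor
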
def pad_or_trim_to_expected_length(
    vector, expected_len, pad_value=0, len_tolerance=20, use_th=False
):
    """Make vector equal to the expected length.

    Feature extraction functions like `compute_loudness()` or `compute_f0` produce
    feature vectors that vary in length depending on factors such as `sample_rate`
    or `hop_size`. This function corrects vectors to the expected length, raising
    an error if the difference between the vector and expected length was
    unusually high to begin with.

    Args:
        vector: Numpy ndarray or torch tensor. Shape [vector_length,] or
            [batch, vector_length].
        expected_len: Expected length of vector.
        pad_value: Value to pad at end of vector.
        len_tolerance: Tolerance of difference between original and desired vector
            length.
        use_th: Make function differentiable by using torch.

    Returns:
        vector: Vector with corrected length.

    Raises:
        ValueError: if `len(vector)` is different from `expected_len` beyond
            `len_tolerance` to begin with.
    """
    expected_len = int(expected_len)
    vector_len = int(vector.shape[-1])

    if abs(vector_len - expected_len) > len_tolerance:
        # Ensure vector was close to expected length to begin with.
        raise ValueError(
            "Vector length: {} differs from expected length: {} "
            "beyond tolerance of: {}".format(vector_len, expected_len, len_tolerance)
        )

    is_1d = len(vector.shape) == 1
    vector = vector[None, :] if is_1d else vector

    # Pad missing samples.
    if vector_len < expected_len:
        n_padding = expected_len - vector_len
        if use_th:
            vector = th.nn.functional.pad(
                f32(vector), (0, n_padding), mode="constant", value=pad_value
            )
        else:
            vector = np.pad(
                vector,
                ((0, 0), (0, n_padding)),
                mode="constant",
                constant_values=pad_value,
            )
    # Trim samples.
    elif vector_len > expected_len:
        vector = vector[..., :expected_len]

    # Remove temporary batch dimension.
    vector = vector[0] if is_1d else vector
    return vector
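
# Behavior sketch for `pad_or_trim_to_expected_length` (illustrative only;
# `_demo_pad_or_trim` is not part of the original module). A vector that is a
# few frames short is padded at the end with `pad_value`, and one that is a few
# frames long is trimmed; both differences fall within the default
# len_tolerance of 20 frames.
def _demo_pad_or_trim():
    too_short = np.zeros(245)
    too_long = np.zeros(255)
    padded = pad_or_trim_to_expected_length(too_short, 250, pad_value=-120.0)
    trimmed = pad_or_trim_to_expected_length(too_long, 250)
    assert padded.shape == (250,)
    assert trimmed.shape == (250,)
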
def hertz_to_mel(frequencies_hertz):
    """Converts frequencies in `frequencies_hertz` in Hertz to the mel scale."""
    return _MEL_HIGH_FREQUENCY_Q * (
        1.0 + (f32(frequencies_hertz) / _MEL_BREAK_FREQUENCY_HERTZ)
    ).log()
def mel_to_hertz(mel_values):
    """Converts frequencies in `mel_values` from the mel scale to linear scale."""
    return _MEL_BREAK_FREQUENCY_HERTZ * (
        (f32(mel_values) / _MEL_HIGH_FREQUENCY_Q).exp() - 1.0
    )
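
# Round-trip sketch for the two conversions above (assumptions: the usual
# HTK-style constants defined elsewhere in this module,
# _MEL_BREAK_FREQUENCY_HERTZ = 700.0 and _MEL_HIGH_FREQUENCY_Q = 1127.0, and
# that `f32` returns float32 torch tensors; `_demo_mel_round_trip` is
# illustrative only). Hertz -> mel -> hertz recovers the input.
def _demo_mel_round_trip():
    hz = f32(np.array([440.0, 1000.0, 8000.0]))
    recovered = mel_to_hertz(hertz_to_mel(hz))
    assert th.allclose(recovered, hz, rtol=1e-4)
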
def compute_loudness(
    audio,
    sample_rate=16000,
    frame_rate=250,
    n_fft=2048,
    range_db=LD_RANGE,
    ref_db=20.7,
    use_th=False,
):
    """Perceptual loudness in dB, relative to white noise, amplitude=1.

    Function is differentiable if use_th=True.

    Args:
        audio: Numpy ndarray or tensor. Shape [batch_size, audio_length] or
            [audio_length,].
        sample_rate: Audio sample rate in Hz.
        frame_rate: Rate of loudness frames in Hz.
        n_fft: Fft window size.
        range_db: Sets the dynamic range of loudness in decibels. The minimum
            loudness (per frequency bin) corresponds to -range_db.
        ref_db: Sets the reference maximum perceptual loudness as given by
            (A_weighting + 10 * log10(abs(stft(audio))**2.0)). The default value
            corresponds to white noise with amplitude=1.0 and n_fft=2048. There is
            a slight dependence on fft_size due to different granularity of
            perceptual weighting.
        use_th: Make function differentiable by using torch.

    Returns:
        Loudness in decibels. Shape [batch_size, n_frames] or [n_frames,].
    """
    if sample_rate % frame_rate != 0:
        raise ValueError(
            "frame_rate: {} must evenly divide sample_rate: {}. "
            "For default frame_rate: 250Hz, suggested sample_rate: 16kHz or 48kHz".format(
                frame_rate, sample_rate
            )
        )

    # Avoid log(0) instabilities.
    amin = 1e-20

    # Pick torch or numpy.
    lib = th if use_th else np

    # Make inputs tensors for torch.
    if use_th:
        audio, range_db, amin = f32(audio, range_db, amin)

    # Temporarily add a batch dimension for single examples.
    is_1d = len(audio.shape) == 1
    audio = audio[None, :] if is_1d else audio

    # Take STFT.
    hop_size = sample_rate // frame_rate
    overlap = 1 - hop_size / n_fft
    stft_fn = stft if use_th else stft_np
    s = stft_fn(audio, frame_size=n_fft, overlap=overlap, pad_end=True)

    # Compute power.
    amplitude = complex_abs(s) if use_th else np.abs(s)
    maximum = th.max if use_th else np.maximum
    power_db = lib.log10(maximum(amin, amplitude))
    power_db *= 20.0

    # Perceptual weighting.
    frequencies = librosa.fft_frequencies(sr=sample_rate, n_fft=n_fft)
    a_weighting = librosa.A_weighting(frequencies)[None, :, None]
    a_weighting = f32(a_weighting) if use_th else a_weighting
    loudness = power_db + a_weighting

    # Set dynamic range.
    loudness -= ref_db
    loudness = maximum(-range_db, loudness)

    # Average over frequency bins.
    loudness = lib.mean(loudness, -2)

    # Remove temporary batch dimension.
    loudness = loudness[0] if is_1d else loudness

    # Compute expected length of loudness vector.
    n_secs = audio.shape[-1] / sample_rate  # `n_secs` can have milliseconds
    expected_len = int(n_secs * frame_rate)

    # Pad with `-range_db` noise floor or trim vector.
    loudness = pad_or_trim_to_expected_length(
        loudness, expected_len, -range_db, use_th=use_th
    )
    return loudness
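
# Usage sketch for `compute_loudness` (illustrative only; `_demo_loudness` is
# not part of the original module and relies on the numpy STFT helper
# `stft_np` referenced above). One second of audio at the default
# sample_rate=16000 and frame_rate=250 yields a 250-frame loudness vector.
def _demo_loudness():
    audio = np.random.uniform(-1.0, 1.0, size=16000).astype(np.float32)
    loudness = compute_loudness(audio, sample_rate=16000, frame_rate=250)
    assert loudness.shape == (250,)
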