def call(self, target_audio, audio):
    """Return the weighted multi-resolution spectral loss of `audio`.

    For each size in `self.fft_sizes`, magnitude spectrograms of both signals
    are computed with `spectral_ops.compute_mag`. Every term whose weight is
    positive then adds
    `weight * mean_difference(f(target_mag), f(value_mag), self.loss_type)`
    to the loss, where `f` is the term's transform (identity, first or second
    difference along axis 1 or axis 2, or `spectral_ops.safe_log`).
    A loudness term is added once, after the per-size terms.

    Args:
        target_audio: Reference signal, forwarded to
            `spectral_ops.compute_mag` and `spectral_ops.compute_loudness`.
        audio: Signal to score against `target_audio`.

    Returns:
        The accumulated loss. This is the Python float `0.0` when no term has
        a positive weight.
    """

    def time_diff(mag):
        # Difference along axis 1 (used by the `delta_time` terms).
        return spectral_ops.diff(mag, axis=1)

    def freq_diff(mag):
        # Difference along axis 2 (used by the `delta_freq` terms).
        return spectral_ops.diff(mag, axis=2)

    # (weight, transform) pairs, listed in the order they are accumulated so
    # the floating-point summation order matches the unrolled version.
    terms = (
        (self.mag_weight, lambda mag: mag),
        (self.delta_time_weight, time_diff),
        (self.delta_delta_time_weight, lambda mag: time_diff(time_diff(mag))),
        (self.delta_freq_weight, freq_diff),
        (self.delta_delta_freq_weight, lambda mag: freq_diff(freq_diff(mag))),
        # Log-magnitude term reuses the same spectrogram.
        (self.logmag_weight, lambda mag: spectral_ops.safe_log(mag)),
    )
    active_terms = [(weight, fn) for weight, fn in terms if weight > 0]

    loss = 0.0

    # Skip the spectrogram computation entirely when no spectral term is
    # enabled (e.g. a loudness-only configuration).
    if active_terms:
        for size in self.fft_sizes:
            target_mag = spectral_ops.compute_mag(target_audio, size=size)
            value_mag = spectral_ops.compute_mag(audio, size=size)
            for weight, transform in active_terms:
                loss += weight * mean_difference(
                    transform(target_mag), transform(value_mag),
                    self.loss_type)

    # Loudness uses a fixed n_fft and only the raw audio, so it is added once
    # rather than once per FFT size.
    if self.loudness_weight > 0:
        target = spectral_ops.compute_loudness(target_audio, n_fft=2048)
        value = spectral_ops.compute_loudness(audio, n_fft=2048)
        loss += self.loudness_weight * mean_difference(
            target, value, self.loss_type)

    return loss
def compute_audio_features(audio, n_fft=6144, sample_rate=48000, frame_rate=250):
    """Compute loudness and f0 features for the mid, left and right channels.

    Stereo adaptation: the mid channel is the mean of `audio` over axis 1,
    the left channel is column 0 and the right channel is column 1. Each
    channel is squeezed and given a leading axis before features are computed.

    Args:
        audio: Two-dimensional array indexed as [sample, channel] with at
            least two channels.
        n_fft: Forwarded to `spectral_ops.compute_loudness`.
        sample_rate: Forwarded to `spectral_ops.compute_loudness` and
            `spectral_ops.compute_f0`.
        frame_rate: Forwarded to `spectral_ops.compute_loudness` and
            `spectral_ops.compute_f0`.

    Returns:
        Dict with, for each suffix in `M`, `L`, `R`: `audio<suffix>` (the
        channel in its original dtype), `loudness_db<suffix>`,
        `f0_hz<suffix>` and `f0_confidence<suffix>`.

    Raises:
        ValueError: If `audio` is not two-dimensional or has fewer than two
            channels.
    """
    # Fail early with a clear message instead of an axis error (1-D input) or
    # a silently empty right channel (single-column input).
    if np.ndim(audio) != 2 or np.shape(audio)[1] < 2:
        raise ValueError(
            'compute_audio_features expects stereo audio shaped '
            f'[samples, channels>=2], got shape {np.shape(audio)}.')

    raw_channels = (
        ('M', np.mean(audio, axis=1)),
        ('L', audio[:, 0:1]),
        ('R', audio[:, 1:2]),
    )
    # Drop singleton axes, then add a leading axis to every channel.
    channels = [(suffix, np.expand_dims(np.squeeze(samples), axis=0))
                for suffix, samples in raw_channels]

    # Separate passes keep the key order: all audio entries first, then all
    # loudness entries, then the f0 entries.
    audio_feats = {f'audio{suffix}': samples for suffix, samples in channels}

    for suffix, samples in channels:
        # Loudness is computed on a float32 copy; the stored audio keeps its
        # original dtype.
        audio_feats[f'loudness_db{suffix}'] = spectral_ops.compute_loudness(
            samples.astype(np.float32), sample_rate, frame_rate, n_fft)

    for suffix, samples in channels:
        f0_hz, f0_confidence = spectral_ops.compute_f0(
            np.squeeze(samples), sample_rate, frame_rate)
        audio_feats[f'f0_hz{suffix}'] = f0_hz
        audio_feats[f'f0_confidence{suffix}'] = f0_confidence

    return audio_feats
def compute_audio_features(audio, n_fft=2048, sample_rate=16000, frame_rate=250):
    """Extract loudness and fundamental-frequency features from audio.

    Args:
        audio: Input audio. It is stored unchanged under the `audio` key and
            passed through `squeeze` before feature extraction.
        n_fft: Forwarded to `spectral_ops.compute_loudness`.
        sample_rate: Forwarded to `spectral_ops.compute_loudness` and
            `spectral_ops.compute_f0`.
        frame_rate: Forwarded to `spectral_ops.compute_loudness` and
            `spectral_ops.compute_f0`.

    Returns:
        Dict with keys `audio`, `loudness_db`, `f0_hz` and `f0_confidence`.
    """
    squeezed = squeeze(audio)
    loudness_db = spectral_ops.compute_loudness(
        squeezed, sample_rate, frame_rate, n_fft)
    f0_hz, f0_confidence = spectral_ops.compute_f0(
        squeezed, sample_rate, frame_rate)
    return {
        # The original (un-squeezed) input is what gets stored.
        'audio': audio,
        'loudness_db': loudness_db,
        'f0_hz': f0_hz,
        'f0_confidence': f0_confidence,
    }
def update_state(self, batch, audio_gen):
    """Update the loudness metric with one batch of generated audio.

    Args:
        batch: Dictionary of input features; `batch['loudness_db']` is read
            as the reference loudness.
        audio_gen: Batch of generated audio. Its leading dimension is taken
            as the batch size.
    """
    ld_reference = batch['loudness_db']
    # Loudness of the generated audio is computed in one call for the whole
    # batch and then compared example by example.
    ld_generated = spectral_ops.compute_loudness(
        audio_gen,
        sample_rate=self._sample_rate,
        frame_rate=self._frame_rate)

    for i in range(int(audio_gen.shape[0])):
        per_frame_distance = l1_distance(ld_reference[i], ld_generated[i])
        ld_dist = np.mean(per_frame_distance)
        self.metrics['loudness_db'].update_state(ld_dist)
        logging.info(
            f'{self._name} | sample {i} | ld_dist(db): {ld_dist:.3f}')
def call(self, target_audio, audio):
    """Return the weighted multi-resolution spectral loss of `audio`.

    Each callable in `self.spectrogram_ops` is applied to both signals to get
    a pair of magnitude spectrograms. Every term whose weight is positive then
    adds
    `weight * mean_difference(f(target_mag), f(value_mag), self.loss_type)`
    to the loss, where `f` is the term's transform (identity, first or second
    difference along axis 1 or axis 2, cumulative sum along axis 2, or
    `spectral_ops.safe_log`). A loudness term is added once, after the
    per-spectrogram terms.

    Args:
        target_audio: Reference signal, forwarded to each spectrogram op and
            to `spectral_ops.compute_loudness`.
        audio: Signal to score against `target_audio`.

    Returns:
        The accumulated loss. This is the Python float `0.0` when no term has
        a positive weight.
    """

    def time_diff(mag):
        # Difference along axis 1 (used by the `delta_time` terms).
        return spectral_ops.diff(mag, axis=1)

    def freq_diff(mag):
        # Difference along axis 2 (used by the `delta_freq` terms).
        return spectral_ops.diff(mag, axis=2)

    # (weight, transform) pairs, listed in the order they are accumulated so
    # the floating-point summation order matches the unrolled version.
    terms = (
        (self.mag_weight, lambda mag: mag),
        (self.delta_time_weight, time_diff),
        (self.delta_delta_time_weight, lambda mag: time_diff(time_diff(mag))),
        (self.delta_freq_weight, freq_diff),
        (self.delta_delta_freq_weight, lambda mag: freq_diff(freq_diff(mag))),
        # TODO(kyriacos) normalize cumulative spectrogram
        (self.cumsum_freq_weight, lambda mag: tf.math.cumsum(mag, axis=2)),
        # Log-magnitude term reuses the same spectrogram.
        (self.logmag_weight, lambda mag: spectral_ops.safe_log(mag)),
    )
    active_terms = [(weight, fn) for weight, fn in terms if weight > 0]

    loss = 0.0

    # Skip the spectrogram computation entirely when no spectral term is
    # enabled (e.g. a loudness-only configuration).
    if active_terms:
        for spectrogram_op in self.spectrogram_ops:
            target_mag = spectrogram_op(target_audio)
            value_mag = spectrogram_op(audio)
            for weight, transform in active_terms:
                loss += weight * mean_difference(
                    transform(target_mag), transform(value_mag),
                    self.loss_type)

    # Loudness uses a fixed n_fft and only the raw audio, so it is added once
    # rather than once per spectrogram op.
    if self.loudness_weight > 0:
        target = spectral_ops.compute_loudness(
            target_audio, n_fft=6144, use_tf=True)
        value = spectral_ops.compute_loudness(audio, n_fft=6144, use_tf=True)
        loss += self.loudness_weight * mean_difference(
            target, value, self.loss_type)

    return loss