def freq_loss(f_hz, f_hz_target, loss_type='L1', weights=None):
    """Compare two frequency signals on the MIDI scale.

    Both inputs are mapped from hertz to MIDI with `hz_to_midi`, and the
    resulting values are handed to `mean_difference`.

    Args:
        f_hz: Predicted frequencies in hertz.
        f_hz_target: Target frequencies in hertz.
        loss_type: Forwarded unchanged to `mean_difference`.
        weights: Forwarded unchanged to `mean_difference`.

    Returns:
        The value returned by `mean_difference` for the MIDI-converted inputs.
    """
    return mean_difference(
        hz_to_midi(f_hz),
        hz_to_midi(f_hz_target),
        loss_type,
        weights,
    )
def call(self, amps_a, freqs_a, amps_b, freqs_b):
    """Compute the sinusoidal consistency loss.

    Args:
        amps_a: Amplitudes of first sinusoids, greater than 0.
            Shape [batch, time, freq].
        freqs_a: Frequencies of first sinusoids in hertz.
            Shape [batch, time, freq].
        amps_b: Amplitudes of second sinusoids, greater than 0.
            Shape [batch, time, freq].
        freqs_b: Frequencies of second sinusoids in hertz.
            Shape [batch, time, freq].

    Returns:
        Scalar, weighted Wasserstein distance. When `self.weight` is not
        greater than zero, the Python float 0.0 is returned and no
        distance is computed.
    """
    # A weight that is not strictly positive disables the loss.
    if not self.weight > 0.0:
        return 0.0

    # Optionally compare the frequencies on the MIDI scale.
    if self.midi:
        freqs_a = hz_to_midi(freqs_a)
        freqs_b = hz_to_midi(freqs_b)

    distance = wasserstein_distance(freqs_a, freqs_b, amps_a, amps_b, p=1.0)
    return tf.reduce_mean(self.weight * distance)
def nll(self, amps, freqs, amps_target, freqs_target, scale_target):
    """Negative log-likelihood of source sinusoids given target sinusoids.

    Args:
        amps: Amplitudes of source sinusoids, greater than 0.
            Shape [batch, time, freq].
        freqs: Frequencies of source sinusoids in hertz.
            Shape [batch, time, freq].
        amps_target: Amplitudes of target sinusoids, greater than 0.
            Shape [batch, time, freq].
        freqs_target: Frequencies of target sinusoids in hertz.
            Shape [batch, time, freq].
        scale_target: Scale of gaussian kernel in MIDI.

    Returns:
        - log(p(source|target)). Shape [batch, time].
    """
    target_density = self.kernel_density_estimate(
        amps_target, freqs_target, scale_target)

    # The density lives on a logarithmic (MIDI) scale, so convert the source
    # frequencies too. tfp expects [sample_sh, batch_sh, event_sh], hence the
    # sinusoid axis is moved to the front: [freq, batch, time].
    source_midi = tf.transpose(hz_to_midi(freqs), [2, 0, 1])
    neg_log_prob = -target_density.log_prob(source_midi)

    # Back to [batch, time, freq].
    neg_log_prob = tf.transpose(neg_log_prob, [1, 2, 0])

    # Weight each sinusoid by its normalized amplitude, then reduce with a
    # mean over the sinusoid axis -> [batch, time].
    amp_weights = safe_divide(
        amps, tf.reduce_sum(amps, axis=-1, keepdims=True))
    return tf.reduce_mean(neg_log_prob * amp_weights, axis=-1)
def test_hz_to_midi_is_accurate(self):
    """Checks core.hz_to_midi against librosa.hz_to_midi on 20 Hz - 20 kHz."""
    frequencies = np.linspace(20.0, 20000.0, 128)
    expected_midi = librosa.hz_to_midi(frequencies)

    # Evaluate the TensorFlow op inside a session to get concrete values.
    with self.cached_session() as session:
        actual_midi = session.run(core.hz_to_midi(frequencies))

    self.assertAllClose(expected_midi, actual_midi)
def test_hz_to_midi_is_accurate(self):
    """Checks core.hz_to_midi against librosa.hz_to_midi on 0 Hz - 20 kHz."""
    frequencies = np.linspace(0.0, 20000.0, 128)
    reference_midi = librosa.hz_to_midi(frequencies)

    # For non-positive frequencies the reference value is replaced by 0.0
    # before comparing against core.hz_to_midi.
    non_positive = tf.less_equal(frequencies, 0.0)
    expected_midi = tf.where(non_positive, 0.0, reference_midi)

    actual_midi = core.hz_to_midi(frequencies)
    self.assertAllClose(expected_midi, actual_midi)
def f0_summary(f0_hz, f0_hz_predict, step, name=''):
    """Creates a plot comparison of ground truth f0_hz and predicted values.

    One figure is produced per batch element, plotting both curves on the
    MIDI scale, and is passed to `fig_summary` under the tag
    'f0_midi/<name>_<index>' (or 'f0_midi/<index>' when `name` is empty),
    with indices starting at 1.

    Args:
        f0_hz: Ground-truth fundamental frequency in hertz; the first axis is
            the batch axis.
        f0_hz_predict: Predicted fundamental frequency in hertz, indexed the
            same way as `f0_hz`.
        step: Forwarded unchanged to `fig_summary`.
        name: Optional prefix for the summary tag.
    """
    batch_size = int(f0_hz.shape[0])

    # Build the tag prefix once. Reassigning `name` inside the loop would
    # append one more '_' on every iteration ('x_1', 'x__2', 'x___3', ...).
    prefix = name + '_' if name else ''

    for i in range(batch_size):
        f0_midi = hz_to_midi(squeeze(f0_hz[i]))
        f0_midi_predict = hz_to_midi(squeeze(f0_hz_predict[i]))

        # Manually specify exact size of fig for tensorboard.
        fig, ax = plt.subplots(1, 1, figsize=(2.5, 2.5))
        ax.plot(f0_midi)
        ax.plot(f0_midi_predict)

        # Format and save plot to image.
        tag = 'f0_midi/{}{}'.format(prefix, i + 1)
        fig_summary(tag, fig, step)
def _default_processing(self, features):
    """Resample the input signals to `self.time_steps` and add scaled copies.

    Args:
        features: Mapping of feature name to signal. "f0" is required; the
            phase/osc entries are optional and skipped when absent or None.

    Returns:
        The same `features` mapping, updated in place with "f0_sub",
        "f0_additive", "f0_scaled", "f0_scaled_mel", "f0_sub_scaled" and a
        "<key>_scaled" entry for every phase/osc signal that is present.
    """
    time_varying_keys = (
        "f0", "phase", "phase_unwrapped", "osc", "osc_sub", "phase_sub",
        "phase_unwrapped_sub", "osc_sub_sync", "phase_unwrapped_sub_sync",
        "phase_sub_sync",
    )
    for key in time_varying_keys:
        if features.get(key, None) is None:
            continue
        features[key] = resample(
            at_least_3d(features[key]), n_timesteps=self.time_steps)

    # Divide by denom (e.g. number of cylinders in engine to produce
    # subharmonics); the subharmonic f0 also serves as the additive input.
    f0 = features["f0"]
    f0_sub = f0 / self.denom
    features["f0_sub"] = f0_sub
    features["f0_additive"] = f0_sub

    # Prepare decoder network inputs.
    features["f0_scaled"] = hz_to_midi(f0) / F0_RANGE
    features["f0_scaled_mel"] = hz_to_mel(f0) / F0_RANGE_MEL
    features["f0_sub_scaled"] = hz_to_mel(f0_sub) / F0_SUB_RANGE

    # Phase signals are divided by pi, then shifted/halved like the
    # oscillator signals below.
    for key in ("phase", "phase_sub", "phase_sub_sync"):
        if features.get(key, None) is not None:
            features[key + "_scaled"] = 0.5 + 0.5 * features[key] / np.pi

    for key in ("osc", "osc_sub", "osc_sub_sync"):
        if features.get(key, None) is not None:
            features[key + "_scaled"] = 0.5 + 0.5 * features[key]

    return features
def get_candidate_harmonics(self, f0_candidates, as_midi=True):
    """Build a harmonic series on top of each candidate fundamental.

    Args:
        f0_candidates: Candidate frequencies, indexed as
            [batch, time, candidate].
        as_midi: If True, the harmonics are passed through `hz_to_midi`
            before being returned.

    Returns:
        Harmonic frequencies, [batch, time, candidate, harmonic], with
        `self.n_harmonic_points` harmonics per candidate.
    """
    # Harmonic numbers 1..n_harmonic_points.
    harmonic_numbers = tf.range(
        1, self.n_harmonic_points + 1, dtype=tf.float32)

    # Broadcast multiply -> [batch, time, candidate, harmonic].
    candidates = f0_candidates[:, :, :, tf.newaxis]
    multipliers = harmonic_numbers[tf.newaxis, tf.newaxis, tf.newaxis, :]
    harmonics = candidates * multipliers

    return hz_to_midi(harmonics) if as_midi else harmonics
def call(self, audio) -> ['z', 'f0_binned', 'f0_scaled', 'f0_hz']:
    """Encode audio into a latent `z` and a binned f0 estimate.

    Args:
        audio: Input passed to `self.audio_feature_extractor`.

    Returns:
        Tuple (z, f0_binned, f0_scaled, f0_hz): the resampled latent, the
        softmax over f0 bins, f0 in MIDI divided by F0_RANGE, and f0 in hertz
        as decoded by `self.f0_to_bins.invert`.
    """
    x = self.audio_feature_extractor(audio)

    # Resample the latent to the number of steps along axis 1 of the features.
    z = ddsp.core.resample(self.z_out(x), x.shape[1])

    f0_binned = tf.nn.softmax(self.f0_out(x))
    # TODO correlate neighbouring bins via 1d convolution along bin axis
    f0_hz = self.f0_to_bins.invert(f0_binned)
    f0_scaled = hz_to_midi(f0_hz) / F0_RANGE

    return z, f0_binned, f0_scaled, f0_hz
def _default_processing(self, features):
    """Resample inputs, synthesize missing osc/phase signals, scale features.

    Args:
        features: Mapping of feature name to signal. "f0" is required; osc and
            phase entries are generated from f0 when absent or None.

    Returns:
        The same `features` mapping, updated in place.
    """
    # Make sure inputs have the right dimensions, i.e.
    # [batch_size, n_frames, {context dependent}].
    time_varying_keys = (
        "f0", "phase", "phase_unwrapped", "osc", "osc_sub", "phase_sub",
        "phase_unwrapped_sub", "osc_sub_sync", "phase_unwrapped_sub_sync",
        "phase_sub_sync",
    )
    for key in time_varying_keys:
        if features.get(key, None) is None:
            continue
        features[key] = resample(
            at_least_3d(features[key]), n_timesteps=self.time_steps)

    # Divide by denom (e.g. number of cylinders in engine to produce
    # subharmonics).
    features["f0_sub"] = features["f0"] / self.denom

    # The additive input is whichever feature `self.f0_additive` names.
    features["f0_additive"] = features[self.f0_additive]

    # Generate osc and phase from f0 (and f0_sub) if missing.
    for suffix in ("", "_sub"):
        f0_key = "f0" + suffix
        osc_key = "osc" + suffix
        phase_key = "phase" + suffix

        if features.get(osc_key, None) is None:
            unit_amps = tf.ones(tf.shape(features[f0_key]))
            osc = oscillator_bank(
                features[f0_key], unit_amps, sample_rate=self.rate)
            features[osc_key] = osc[:, :, tf.newaxis]

        if features.get(phase_key, None) is None:
            # Cumulative sum of the per-step angular increments along axis 1.
            omegas = 2.0 * np.pi * features[f0_key] / float(self.rate)
            unwrapped = tf.cumsum(omegas, axis=1)
            features["phase_unwrapped" + suffix] = unwrapped
            # Wrap to [-pi, pi).
            wrapped = tf.math.mod(unwrapped + np.pi, 2 * np.pi) - np.pi
            features[phase_key] = wrapped

    # Fall back to the unsynchronized signal when no "_sync" variant exists.
    for prefix in ("osc_sub", "phase_sub", "phase_unwrapped_sub"):
        sync_key = prefix + "_sync"
        if features.get(sync_key, None) is None:
            features[sync_key] = features[prefix]

    # Prepare decoder network inputs.
    features["f0_scaled"] = hz_to_midi(features["f0"]) / F0_RANGE
    features["f0_scaled_mel"] = hz_to_mel(features["f0"]) / F0_RANGE_MEL
    features["f0_sub_scaled"] = hz_to_mel(features["f0_sub"]) / F0_SUB_RANGE

    for key in ("phase", "phase_sub", "phase_sub_sync"):
        if features.get(key, None) is not None:
            features[key + "_scaled"] = 0.5 + 0.5 * features[key] / np.pi

    for key in ("osc", "osc_sub", "osc_sub_sync"):
        if features.get(key, None) is not None:
            features[key + "_scaled"] = 0.5 + 0.5 * features[key]

    return features
def _default_processing(self, features):
    """Resample f0 to `self.time_steps` and derive the decoder input.

    Args:
        features: Mapping containing "f0", and optionally "osc" (only read
            when `self.feature_domain` is "osc").

    Returns:
        The same `features` mapping with "f0" resampled and divided by
        `self.denom`, plus "f0_additive" and "f0_scaled".

    Raises:
        ValueError: If `self.feature_domain` is not one of "freq",
            "freq-old", "time" or "osc".
    """
    features["f0"] = at_least_3d(features["f0"])
    features["f0"] = resample(features["f0"], n_timesteps=self.time_steps)

    # Divide by denom (e.g. number of cylinders in engine to produce
    # subharmonics).
    features["f0"] /= self.denom
    f0 = features["f0"]

    # Set additive input.
    features["f0_additive"] = f0

    # Prepare decoder network inputs.
    domain = self.feature_domain
    if domain == "freq":
        f0_scaled = hz_to_midi(f0) / F0_RANGE
    elif domain == "freq-old":
        # DEPRECATED. This option is for backward compatibility with a
        # version containing a typo.
        f0_scaled = hz_to_midi(self.denom * f0) / F0_RANGE / self.denom
    elif domain == "time":
        unit_amps = tf.ones(tf.shape(f0))
        f0_scaled = oscillator_bank(
            f0, unit_amps, sample_rate=self.rate)[:, :, tf.newaxis]
    elif domain == "osc":
        if features.get("osc", None) is None:
            # No oscillator signal supplied: synthesize one at denom * f0.
            unit_amps = tf.ones(tf.shape(f0))
            f0_scaled = oscillator_bank(
                self.denom * f0, unit_amps,
                sample_rate=self.rate)[:, :, tf.newaxis]
        else:
            f0_scaled = features["osc"][:, :, tf.newaxis]
    else:
        raise ValueError("%s is not a valid value for feature_domain." %
                         self.feature_domain)

    features["f0_scaled"] = f0_scaled
    return features
def prepare_tfrecord_no_beam(input_audio_paths,
                             output_tfrecord_path,
                             num_shards=None,
                             sample_rate=16000,
                             frame_rate=250,
                             window_secs=4,
                             hop_secs=1,
                             pipeline_options=''):
    """Build a TFRecord of audio examples without using Apache Beam.

    Loads each audio path, adds f0 and loudness features, splits every
    example into windows, writes the windows to `output_tfrecord_path` and
    prints dataset statistics (mean MIDI pitch, average of per-example
    maximum loudness, mean loudness).

    Args:
        input_audio_paths: Audio paths handed to `_load_audio`.
        output_tfrecord_path: Destination path of the TFRecord file.
        num_shards: Unsupported here; a warning is logged if not None.
        sample_rate: Sample rate passed to loading, loudness and splitting.
        frame_rate: Frame rate passed to f0, loudness and splitting.
        window_secs: Window length passed to `_split_example`.
        hop_secs: Hop length passed to `_split_example`.
        pipeline_options: Unsupported here; a warning is logged if not ''.
    """
    beam_only_args_given = num_shards is not None or pipeline_options != ''
    if beam_only_args_given:
        logging.warning(
            'num_shards and pipeline_options arguments are not supported if not using apache beam!'
        )

    # Load audio, then estimate f0 for every example.
    load_fn = partial(_load_audio, sample_rate=sample_rate)
    examples = do_multiprocess(load_fn, input_audio_paths)
    examples = [_add_f0_estimate(ex, frame_rate) for ex in examples]
    all_pitches_midi = np.concatenate(
        [hz_to_midi(item['f0_hz']) for item in examples])
    pitch_mean = np.mean(all_pitches_midi)

    # Add loudness and gather its statistics.
    loudness_fn = partial(
        _add_loudness, sample_rate=sample_rate, frame_rate=frame_rate)
    examples = do_multiprocess(loudness_fn, examples)
    loudness_avg_max = np.mean(
        [np.max(item['loudness_db']) for item in examples])
    loudness_mean = np.mean(
        np.concatenate([item['loudness_db'] for item in examples]))

    # Flatten the per-example windows into a single list.
    split_examples = [
        window
        for ex in examples
        for window in _split_example(
            ex, sample_rate, frame_rate, window_secs, hop_secs)
    ]

    tfexamples = do_multiprocess(_float_dict_to_tfexample, split_examples)
    with tf.io.TFRecordWriter(output_tfrecord_path) as writer:
        for tfexample in tfexamples:
            writer.write(tfexample.SerializeToString())

    print(f'model_pitch_mean: {pitch_mean}')
    print(f'model_loudness_avg_max: {loudness_avg_max}')
    print(f'model_loudness_mean: {loudness_mean}')
def get_p_harmonics_given_sinusoids(self, freqs, amps):
    """Gets distribution of harmonics from candidate f0s given sinusoids.

    Builds a gaussian kernel density estimate over the sinusoids: one
    gaussian per sinusoid, centered on its frequency (in MIDI) and weighted
    by its normalized amplitude.

    Args:
        freqs: Frequencies of sinusoids in hertz.
        amps: Amplitudes of sinusoids, must be greater than 0.

    Returns:
        MixtureSameFamily, Gaussian distribution.
    """
    # Gaussian centers: sinusoid frequencies on the MIDI scale.
    centers_midi = hz_to_midi(freqs)

    # NLL can be a nan if sinusoid amps are all zero, so replace exact zeros
    # with a small offset before normalizing.
    tiny = 1e-7 * tf.ones_like(amps)
    nonzero_amps = tf.where(amps == 0.0, tiny, amps)
    mixture_weights = safe_divide(
        nonzero_amps, tf.reduce_sum(nonzero_amps, axis=-1, keepdims=True))

    # P(candidate_harmonics | sinusoids)
    return tfd.MixtureSameFamily(
        mixture_distribution=tfd.Categorical(probs=mixture_weights),
        components_distribution=tfd.Normal(
            loc=centers_midi, scale=self.sinusoids_scale))
def get_loss_tensors(self, f0_candidates, freqs, amps):
    """Get traces of loss to estimate fundamental frequency.

    Args:
        f0_candidates: Frequencies of candidates in hertz.
            [batch, time, freq].
        freqs: Frequencies of sinusoids in hertz. [batch, time, freq].
        amps: Amplitudes of sinusoids, greater than 0. [batch, time, freq].

    Returns:
        sinusoids_loss: -log p(sinusoids|harmonics),
            [batch, time, f0_candidate].
        harmonics_loss: -log p(harmonics|sinusoids),
            [batch, time, f0_candidate].
    """
    # ----------------------------------------------------------------------
    # P(sinusoids | candidate_harmonics)
    # ----------------------------------------------------------------------
    p_sin_given_harm = self.get_p_sinusoids_given_harmonics()

    # Treat each partial as a candidate: ratio of each partial to each
    # candidate -> [batch, time, candidate, partial].
    partials = freqs[:, :, tf.newaxis, :]
    candidates = f0_candidates[:, :, :, tf.newaxis]
    freq_ratios = safe_divide(partials, candidates)
    nll_sinusoids = -p_sin_given_harm.log_prob(freq_ratios)

    # Amplitude-weighted average over partials -> [batch, time, candidate].
    amp_weights = tf.convert_to_tensor(amps[:, :, tf.newaxis, :])
    weighted_nll = tf.reduce_sum(nll_sinusoids * amp_weights, axis=-1)
    sinusoids_loss = safe_divide(
        weighted_nll, tf.reduce_sum(amp_weights, axis=-1))

    # ----------------------------------------------------------------------
    # P(candidate_harmonics | sinusoids)
    # ----------------------------------------------------------------------
    p_harm_given_sin = self.get_p_harmonics_given_sinusoids(freqs, amps)
    harmonics = self.get_candidate_harmonics(f0_candidates, as_midi=True)

    # tfp expects [sample_sh, batch_sh, event_sh], so move the candidate and
    # harmonic axes in front: -> [candidate, harmonic, batch, time].
    harmonics_front = tf.transpose(harmonics, [2, 3, 0, 1])
    nll_front = -p_harm_given_sin.log_prob(harmonics_front)
    # The same permutation restores [batch, time, candidate, harmonic].
    nll_harmonics = tf.transpose(nll_front, [2, 3, 0, 1])

    # Prior decreasing importance of upper harmonics.
    n_points = self.n_harmonic_points
    amps_prior = tf.linspace(1.0, 1.0 / n_points, n_points)
    harmonics_loss = (
        nll_harmonics * amps_prior[tf.newaxis, tf.newaxis, tf.newaxis, :])

    # Don't count loss for harmonics above nyquist, and reweight by the
    # fraction of harmonics below nyquist (so it doesn't just pick the
    # highest frequency possible).
    nyquist_midi = hz_to_midi(self.sample_rate / 2.0)
    below_nyquist = tf.where(harmonics < nyquist_midi,
                             tf.ones_like(harmonics_loss),
                             tf.zeros_like(harmonics_loss))
    reweighting = safe_divide(
        below_nyquist,
        tf.reduce_mean(below_nyquist, axis=-1, keepdims=True))
    harmonics_loss = harmonics_loss * reweighting

    # Average over harmonics -> [batch, time, candidate].
    harmonics_loss = tf.reduce_mean(harmonics_loss, axis=-1)

    return sinusoids_loss, harmonics_loss