def get_signal(self, amps, f0_hz, mod_amps, mod_f0_hz):
    """Render audio with the am synthesizer from frame-wise controls.

    Args:
      amps: Amplitude tensor of shape [batch, n_frames, 1]. Expects float32
        that is strictly positive.
      f0_hz: The fundamental frequency in Hertz. Tensor of shape
        [batch, n_frames, 1].
      mod_amps: Amplitude tensor of shape [batch, n_frames, 1]. Expects
        float32 that is strictly positive.
      mod_f0_hz: Tensor of shape [batch, n_frames, 1]. Expects float32 in
        Hertz that is strictly positive.

    Returns:
      signal: A tensor of shape [batch, n_samples].
    """
    n_samples = self.n_samples
    amp_method = self.amp_resample_method

    # Bring every control to `n_samples` steps. The two amplitude controls use
    # the configured amplitude method; the two frequency controls use
    # core.resample's default method.
    return core.modulate_amplitude(
        amps=core.resample(amps, n_samples, method=amp_method),
        f0_hz=core.resample(f0_hz, n_samples),
        mod_amps=core.resample(mod_amps, n_samples, method=amp_method),
        mod_f0_hz=core.resample(mod_f0_hz, n_samples),
        sample_rate=self.sample_rate)
def call(self, conditioning):
    """Run the conv stack on conditioning, harmonic sinusoids and noise.

    Args:
      conditioning: Mapping that contains 'f0_hz' and every key listed in
        `self.input_keys`.

    Returns:
      Dict with a single entry, 'audio_tensor', holding the output of
      `self.dense_out` applied to the scaled sum of the skip outputs.
    """
    n_total = self.n_total
    batch_size = conditioning['f0_hz'].shape[0]
    noise = tf.random.normal([batch_size, n_total, 1])

    # Unit-amplitude sinusoids at the harmonics of f0, kept as separate
    # channels (sum_sinusoids=False).
    f0_hz = core.resample(conditioning['f0_hz'], n_total)
    harmonic_frequencies = core.get_harmonic_frequencies(
        f0_hz, self.n_harmonics)
    sinusoids = core.oscillator_bank(
        frequency_envelopes=harmonic_frequencies,
        amplitude_envelopes=tf.ones_like(harmonic_frequencies),
        sample_rate=self.sample_rate,
        sum_sinusoids=False)

    # Select the conditioning inputs, run each through its own stack, then
    # resample all of them to `n_total` steps.
    selected = [conditioning[key] for key in self.input_keys]
    stacked = [
        stack(value) for stack, value in zip(self.input_stacks, selected)
    ]
    resampled = [core.resample(value, n_total) for value in stacked]
    c = tf.concat(resampled + [sinusoids, noise], axis=-1)

    # Conv layers: each layer returns (next activation, skip output).
    x = self.first_conv(c)
    skip_outputs = []
    for layer in self.conv_layers:
        x, skip = layer(x, c)
        skip_outputs.append(skip)

    # Sum the skips and rescale by 1 / sqrt(number of layers).
    skips = sum(skip_outputs) * tf.sqrt(1.0 / len(self.conv_layers))
    return {'audio_tensor': self.dense_out(skips)}
def get_signal(self, magnitudes, taus):
    """Synthesize an impact force profile from magnitude and tau controls.

    The magnitude envelope is max-pooled over non-overlapping windows of
    `int(sample_rate / max_impact_frequency)` samples. Each window's peak value
    scales one basis impulse produced by `self.hertz_gaussian(peak_times,
    taus_pooled)`, and the scaled impulses are summed.

    Args:
      magnitudes: magnitude tensor of shape [batch, n_frames, 1]. Expects
        float32 that is strictly positive.
      taus: Tensor of shape [batch, n_frames, 1]. Expects float32 that is
        strictly positive.

    Returns:
      signal: A tensor of the force impulse profile of shape
        [batch, n_samples].
    """
    # Create sample-wise envelopes.
    # Offset added to / subtracted from each peak index when
    # `self.timing_adjust` is set (see the weighted average below).
    weight_distance = 100
    # Only referenced by the disabled finite-difference variant below.
    diff_order = 1
    # Disabled alternative: pool over differences of the magnitudes instead of
    # the magnitudes themselves.
    # magnitude_diffs = tf.experimental.numpy.diff(magnitudes, n=diff_order, axis=1)
    # magnitude_diffs = tf.concat((tf.zeros((tf.shape(magnitude_diffs)[0], int(math.floor(diff_order/2)), 1), dtype=tf.float32),
    #                              magnitude_diffs,
    #                              tf.zeros((tf.shape(magnitude_diffs)[0], int(math.ceil(diff_order/2)), 1), dtype=tf.float32)), axis=1)
    # magnitude_envelopes = core.resample(((-1)**diff_order) * magnitude_diffs, self.n_samples,
    magnitude_envelopes = core.resample(magnitudes, self.n_samples, method=self.resample_method)
    taus = core.resample(taus, self.n_samples, method=self.resample_method)
    # Pooling window length in samples.
    window_size = int(self.sample_rate / self.max_impact_frequency)
    # Insert a singleton axis 1 so the envelope is 4-D for the pooling op
    # (presumably [batch, 1, n_samples, 1] -- TODO confirm resample output shape).
    magnitude_envelopes = tf.expand_dims(magnitude_envelopes, axis=1)
    # ksize == strides, so windows do not overlap. `vals` are the per-window
    # maxima and `inds` their flattened argmax positions.
    vals, inds = tf.nn.max_pool_with_argmax(magnitude_envelopes, window_size, window_size, 'SAME')
    # Use a weighted average of magnitude to select peak time so that things can shift around
    if self.timing_adjust:
        # Candidate indices: the peak and the positions `weight_distance`
        # before and after it, clipped to [0, n_samples - 1].
        augmented_inds = tf.concat([inds - weight_distance, inds, inds + weight_distance], axis=-1)
        augmented_inds = tf.clip_by_value(augmented_inds, 0, self.n_samples - 1)
        b,w,h,c = magnitude_envelopes.get_shape().as_list()
        # NOTE(review): the envelope is flattened across the batch axis, but
        # max_pool_with_argmax is called without include_batch_in_index, so
        # `inds` look like per-example offsets. For batch > 1 this gather
        # would then read example 0 for every example -- confirm.
        mags_pooled = tf.gather(tf.reshape(magnitude_envelopes, shape=[b*w*h*c]), augmented_inds)
        # Magnitude-weighted mean of the candidate indices.
        weighted_inds = tf.reduce_sum(tf.cast(augmented_inds, dtype=tf.float32) * mags_pooled, axis=-1) / tf.reduce_sum(mags_pooled, axis=-1)
        # Index divided by the sample rate.
        peak_times = tf.cast(weighted_inds / self.sample_rate, dtype=tf.float32)
    else:
        # Use the raw argmax index (divided by the sample rate) and drop axis 3.
        peak_times = tf.squeeze(tf.cast(inds / self.sample_rate, dtype=tf.float32), axis=3)
    # Per-window peak values, with axis 3 dropped.
    scale_heights = tf.squeeze(vals, axis=3)
    taus = tf.expand_dims(taus, axis=1)
    b,w,h,c = taus.get_shape().as_list()
    # Look up tau at each (unadjusted) peak index.
    # NOTE(review): same flat-gather / batch-index concern as above -- confirm.
    taus_pooled = tf.gather(tf.reshape(taus,shape= [b*w*h*c,]),inds)
    taus_pooled = tf.squeeze(taus_pooled, axis=3)
    basis_impulses = self.hertz_gaussian(peak_times, taus_pooled)
    # Scale each basis impulse by its peak height and sum over axis 2.
    signal = tf.reduce_sum(scale_heights * basis_impulses, axis=2)
    return signal
def _default_processing(self, features):
    '''Resample the known input signals to time_steps and add scaled copies.

    Every listed key that is present (and not None) is made at least 3-D and
    resampled to `self.time_steps`. Sub-harmonic, additive and scaled f0
    entries are then derived from "f0", and scaled copies are added for any
    phase / oscillator signals that are present.

    Args:
        features: Mapping of feature name to signal. Must contain "f0".

    Returns:
        The same `features` mapping, updated in place.
    '''
    resampled_keys = (
        "f0", "phase", "phase_unwrapped", "osc", "osc_sub", "phase_sub",
        "phase_unwrapped_sub", "osc_sub_sync", "phase_unwrapped_sub_sync",
        "phase_sub_sync",
    )
    for key in resampled_keys:
        value = features.get(key, None)
        if value is None:
            continue
        features[key] = resample(at_least_3d(value),
                                 n_timesteps=self.time_steps)

    f0 = features["f0"]

    # Divide by denom (e.g. number of cylinders in engine to produce
    # subharmonics); the additive input uses the same divided f0.
    f0_sub = f0 / self.denom
    features["f0_sub"] = f0_sub
    features["f0_additive"] = f0_sub

    # Decoder network inputs.
    features["f0_scaled"] = hz_to_midi(f0) / F0_RANGE
    features["f0_scaled_mel"] = hz_to_mel(f0) / F0_RANGE_MEL
    features["f0_sub_scaled"] = hz_to_mel(f0_sub) / F0_SUB_RANGE

    # Affine rescaling of phases: 0.5 + 0.5 * phase / pi.
    for key in ("phase", "phase_sub", "phase_sub_sync"):
        phase = features.get(key, None)
        if phase is not None:
            features[key + "_scaled"] = 0.5 + 0.5 * phase / np.pi

    # Affine rescaling of oscillator signals: 0.5 + 0.5 * osc.
    for key in ("osc", "osc_sub", "osc_sub_sync"):
        osc = features.get(key, None)
        if osc is not None:
            features[key + "_scaled"] = 0.5 + 0.5 * osc

    return features
def call(self, audio, *conditioning):
    """Encode audio (plus optional conditioning) into a pooled latent.

    A spectral representation of `audio` is normalized, concatenated with the
    conditioning signals (resampled to the same number of steps), passed
    through an RNN, projected, and finally averaged over axis 1 using learned
    sigmoid confidences as weights.

    Args:
      audio: Input audio passed to the configured spectral op.
      *conditioning: Extra signals; each is resampled to the number of steps
        of the spectral representation and concatenated on the last axis.

    Returns:
      The confidence-weighted average of the projected latents over axis 1
      (that axis is kept with length 1).

    Raises:
      ValueError: If `self.spectral_op` is neither 'compute_mfcc' nor
        'compute_logmag'.
    """
    spectral_op = self.spectral_op
    if spectral_op == 'compute_mfcc':
        z = spectral_ops.compute_mfcc(
            audio,
            lo_hz=20.0,
            hi_hz=8000.0,
            fft_size=self.fft_size,
            mel_bins=128,
            mfcc_bins=30,
            overlap=self.overlap,
            pad_end=True)
    elif spectral_op == 'compute_logmag':
        z = spectral_ops.compute_logmag(core.tf_float32(audio),
                                        size=self.fft_size)
    else:
        # Fail here with a clear message instead of hitting an unbound `z`
        # further down.
        raise ValueError(
            '{!r} is not a valid value for spectral_op.'.format(spectral_op))

    # Normalize. A singleton axis is inserted for the norm layer and removed
    # again afterwards.
    z = self.z_norm(z[:, :, tf.newaxis, :])[:, :, 0, :]

    # Align the conditioning signals with the spectral frames.
    n_timesteps = z.shape[1]
    conditioning = [resample(c, n_timesteps) for c in conditioning]
    z = tf.concat([z] + conditioning, axis=-1)

    # Run an RNN over the latents.
    z = self.rnn(z)

    # Bounce down to compressed z dimensions, then pool over axis 1 with the
    # sigmoid confidences as weights.
    w = tf.math.sigmoid(self.confidence(z))
    z = self.dense_out(z)
    z = (tf.reduce_sum(z * w, axis=1, keepdims=True) /
         tf.reduce_sum(w, axis=1, keepdims=True))
    return z
def get_controls(self, signal_one: tf.Tensor, signal_two: tf.Tensor,
                 nn_out_mix_level: tf.Tensor) -> TensorDict:
    """Standardize inputs to same length, mix_level to range [0, 1].

    Args:
      signal_one: 2-D or 3-D tensor.
      signal_two: 2-D or 3-D tensor.
      nn_out_mix_level: Tensor of shape [batch, n_time, 1] output of the
        network determining relative levels of signal one and two.

    Returns:
      Dict of control parameters: 'signal_one' and 'signal_two' unchanged, and
      'mix_level' after a sigmoid, resampling to the signal length, and a
      reshape to [batch, n_time].

    Raises:
      ValueError: If signal_one and signal_two are not the same length.
    """
    n_time_one = int(signal_one.shape[1])
    n_time_two = int(signal_two.shape[1])
    if n_time_one != n_time_two:
        # The trailing space keeps the adjacent literals from running together
        # ("instead of4 and 5").
        raise ValueError(
            'The two signals must have the same length instead of '
            '{} and {}'.format(n_time_one, n_time_two))

    # Squash the network output to [0, 1], then match the signal length.
    mix_level = tf.nn.sigmoid(nn_out_mix_level)
    mix_level = core.resample(mix_level, n_time_one)
    # Drop the trailing channel axis: [batch, n_time, 1] -> [batch, n_time].
    mix_level = tf.reshape(mix_level, (signal_one.shape[0], n_time_one))
    return {
        'signal_one': signal_one,
        'signal_two': signal_two,
        'mix_level': mix_level
    }
def call(self, *args, **unused_kwargs):
    """Resample all inputs to the maximal resolution and compute the score.

    Args:
      *args: Input signals; each is made at least 3-D before resampling.
      **unused_kwargs: Ignored.

    Returns:
      `self.compute_score` of the resampled inputs, averaged over every axis
      except axis 0.
    """
    tensors = [preprocessing.at_least_3d(arg) for arg in args]

    # The largest size along axis 1 among the inputs is the common target.
    target_steps = max(tensor.shape[1] for tensor in tensors)
    tensors = [core.resample(tensor, target_steps) for tensor in tensors]

    score = self.compute_score(*tensors)
    non_leading_axes = list(range(1, len(score.shape)))
    return tf.reduce_mean(score, axis=non_leading_axes)
def estimate_spec(self, f0, amp):
    """Estimate a (batch, t_bins, f_bins, channels) grid from f0 and amp.

    Args:
      f0: Fundamental frequency control; made 3-D and quantized with
        `nn.differential_onehot`.
      amp: Amplitude control; made 3-D.

    Returns:
      The dense projection reshaped to [-1, t_bins, f_bins, channels], plus
      the output of `self.conv1d` applied to it (a residual connection).
    """
    f0_onehot = nn.differential_onehot(
        nn.ensure_3d(f0), self.f_quantization_bins, 200)
    amp_3d = nn.ensure_3d(amp)

    # Resample both controls to `t_bins` steps and join them on the last axis.
    controls = tf.concat(
        [resample(f0_onehot, self.t_bins), resample(amp_3d, self.t_bins)],
        axis=-1)

    projected = self.dense(controls)
    grid = tf.reshape(
        projected, [-1, self.t_bins, self.f_bins, self.channels])

    # assume close time steps interact with each other
    return grid + self.conv1d(grid)
def get_signal(self, amplitudes, frequencies):
    """Render audio with the sinusoidal synthesizer from frame-wise controls.

    Args:
      amplitudes: Amplitude tensor of shape [batch, n_frames, n_sinusoids].
        Expects float32 that is strictly positive.
      frequencies: Tensor of shape [batch, n_frames, n_sinusoids]. Expects
        float32 in Hertz that is strictly positive.

    Returns:
      signal: A tensor of harmonic waves of shape [batch, n_samples].
    """
    n_samples = self.n_samples

    # Amplitudes use the configured resampling method; frequencies use
    # core.resample's default.
    amp_env = core.resample(
        amplitudes, n_samples, method=self.amp_resample_method)
    freq_env = core.resample(frequencies, n_samples)

    return core.oscillator_bank(
        frequency_envelopes=freq_env,
        amplitude_envelopes=amp_env,
        sample_rate=self.sample_rate)
def _default_processing(self, features):
    '''Resample inputs to time_steps, fill in missing signals, scale them.

    Steps, in order:
      1. Every listed key that is present (and not None) is made at least 3-D
         and resampled to `self.time_steps`.
      2. "f0_sub" is derived from "f0", and "f0_additive" is copied from the
         entry named by `self.f0_additive`.
      3. Missing "osc"/"osc_sub" and "phase"/"phase_sub" signals are generated
         from the matching f0.
      4. Missing "*_sync" signals default to their non-sync counterparts.
      5. Scaled copies are added for the decoder network.

    Args:
        features: Mapping of feature name to signal. Must contain "f0".

    Returns:
        The same `features` mapping, updated in place.
    '''
    # Make sure inputs have the right dimensions, i.e.
    # [batch_size, n_frames, {context dependent}].
    resampled_keys = (
        "f0", "phase", "phase_unwrapped", "osc", "osc_sub", "phase_sub",
        "phase_unwrapped_sub", "osc_sub_sync", "phase_unwrapped_sub_sync",
        "phase_sub_sync",
    )
    for key in resampled_keys:
        value = features.get(key, None)
        if value is None:
            continue
        features[key] = resample(at_least_3d(value),
                                 n_timesteps=self.time_steps)

    # Divide by denom (e.g. number of cylinders in engine to produce
    # subharmonics).
    features["f0_sub"] = features["f0"] / self.denom

    # Set additive input.
    features["f0_additive"] = features[self.f0_additive]

    # Generate osc and phase from f0 if missing.
    for suffix in ("", "_sub"):
        f0_key = "f0" + suffix
        osc_key = "osc" + suffix
        phase_key = "phase" + suffix

        if features.get(osc_key, None) is None:
            unit_amplitudes = tf.ones(tf.shape(features[f0_key]))
            oscillation = oscillator_bank(features[f0_key],
                                          unit_amplitudes,
                                          sample_rate=self.rate)
            features[osc_key] = oscillation[:, :, tf.newaxis]

        if features.get(phase_key, None) is None:
            # Per-step angular increment, accumulated along axis 1.
            omegas = 2.0 * np.pi * features[f0_key] / float(self.rate)
            unwrapped = tf.cumsum(omegas, axis=1)
            features["phase_unwrapped" + suffix] = unwrapped
            # Wrap into [-pi, pi).
            features[phase_key] = (
                tf.math.mod(unwrapped + np.pi, 2 * np.pi) - np.pi)

    # Sync variants fall back to the plain sub signals.
    for name in ("osc_sub", "phase_sub", "phase_unwrapped_sub"):
        sync_name = name + "_sync"
        if features.get(sync_name, None) is None:
            features[sync_name] = features[name]

    # Prepare decoder network inputs.
    f0 = features["f0"]
    features["f0_scaled"] = hz_to_midi(f0) / F0_RANGE
    features["f0_scaled_mel"] = hz_to_mel(f0) / F0_RANGE_MEL
    features["f0_sub_scaled"] = hz_to_mel(features["f0_sub"]) / F0_SUB_RANGE

    for key in ("phase", "phase_sub", "phase_sub_sync"):
        phase = features.get(key, None)
        if phase is not None:
            features[key + "_scaled"] = 0.5 + 0.5 * phase / np.pi

    for key in ("osc", "osc_sub", "osc_sub_sync"):
        osc = features.get(key, None)
        if osc is not None:
            features[key + "_scaled"] = 0.5 + 0.5 * osc

    return features
def create_resampled_signals(self, n_before, n_after, add_endpoint, method):
    """Build a test signal and resample it with core.resample().

    The test signal is `1 - sin(x)` for `n_before` evenly spaced x in [0, pi].

    Args:
      n_before: Number of timesteps before resampling.
      n_after: Number of timesteps after resampling.
      add_endpoint: Add extra timestep at end of resampling.
      method: Method of resampling.

    Returns:
      before: Numpy array before resampling. Shape (n_before,).
      after: Numpy array after resampling. Shape (n_after,).
    """
    angles = np.linspace(0, np.pi, n_before)
    # Add leading and trailing singleton axes: (n_before,) -> (1, n_before, 1).
    batched = (1.0 - np.sin(angles))[np.newaxis, :, np.newaxis]

    resampled = core.resample(
        batched, n_after, method=method, add_endpoint=add_endpoint)
    resampled = resampled.numpy()

    # Strip the singleton axes again before returning.
    return batched[0, :, 0], resampled[0, :, 0]
def get_signal(self, amplitudes, wavetables, f0_hz):
    """Render audio with wavetable synthesis from frame-wise controls.

    Args:
      amplitudes: Amplitude tensor of shape [batch, n_frames, 1]. Expects
        float32 that is strictly positive.
      wavetables: Tensor of shape [batch, n_frames, n_wavetable].
      f0_hz: The fundamental frequency in Hertz. Tensor of shape
        [batch, n_frames, 1].

    Returns:
      signal: A tensor of shape [batch, n_samples].
    """
    n_samples = self.n_samples

    # Only the wavetables are resampled here; amplitudes and f0_hz are passed
    # to core.wavetable_synthesis unchanged.
    sample_wise_wavetables = core.resample(wavetables, n_samples)

    return core.wavetable_synthesis(
        amplitudes=amplitudes,
        wavetables=sample_wise_wavetables,
        frequencies=f0_hz,
        n_samples=n_samples,
        sample_rate=self.sample_rate)
def test_multi_dimensional_inputs(self, dimensions):
    """Check the output shape of core.resample for inputs of varying rank.

    Args:
      dimensions: The number of dimensions of the input test signal.
    """
    # All-ones test signal with `n_smaller` entries along every axis.
    inputs_shape = [self.n_smaller] * dimensions
    outputs = core.resample(np.ones(inputs_shape), self.n_larger)

    # Exactly one axis is expected to grow to `n_larger`: axis 0 for 1-D
    # inputs, axis 1 otherwise.
    resampled_axis = 0 if dimensions == 1 else 1
    expected_shape = list(inputs_shape)
    expected_shape[resampled_axis] = self.n_larger

    self.assertListEqual(list(outputs.shape), expected_shape)
def setUp(self):
    """Create the input dictionary and the preprocessor under test."""
    super().setUp()
    sample_rate = 16000
    frame_rate = 250
    frame_size = 256
    n_samples = 16000
    n_frames = 250

    # Replicate preprocessor computations: power of a half-amplitude sine,
    # made at least 3-D and resampled to `n_frames` steps.
    sample_positions = tf.range(0, n_samples, dtype=tf.float32)
    audio = 0.5 * tf.sin(sample_positions)[None, :]
    power_db = resample(
        preprocessing.at_least_3d(
            compute_power(audio,
                          sample_rate=sample_rate,
                          frame_rate=frame_rate,
                          frame_size=frame_size)),
        n_frames)

    self.input_dict = {
        'f0_hz': tf.ones([1, n_frames]),
        'audio': audio,
        'power_db': power_db,
    }
    self.preprocessor = preprocessing.F0PowerPreprocessor(
        time_steps=n_frames, frame_rate=frame_rate, sample_rate=sample_rate)
def _default_processing(self, features):
    '''Resample f0 to time_steps, divide it by denom and add "f0_scaled".

    How "f0_scaled" is computed depends on `self.feature_domain`:
      "freq":     hz_to_midi of the divided f0, over F0_RANGE.
      "freq-old": deprecated variant kept for backward compatibility.
      "time":     unit-amplitude oscillator at the divided f0.
      "osc":      features["osc"] if present, otherwise a unit-amplitude
                  oscillator at `denom` times the divided f0.

    Args:
        features: Mapping of feature name to signal. Must contain "f0".

    Returns:
        The same `features` mapping, updated in place.

    Raises:
        ValueError: If `self.feature_domain` is none of the values above.
    '''
    f0 = at_least_3d(features["f0"])
    f0 = resample(f0, n_timesteps=self.time_steps)

    # Divide by denom (e.g. number of cylinders in engine to produce
    # subharmonics).
    f0 = f0 / self.denom
    features["f0"] = f0

    # Set additive input.
    features["f0_additive"] = f0

    # Prepare decoder network inputs.
    domain = self.feature_domain
    if domain == "freq":
        f0_scaled = hz_to_midi(f0) / F0_RANGE
    elif domain == "freq-old":
        # Deprecated. This option is for backward compatibility with a
        # version containing a typo.
        f0_scaled = hz_to_midi(self.denom * f0) / F0_RANGE / self.denom
    elif domain == "time":
        unit_amplitudes = tf.ones(tf.shape(f0))
        f0_scaled = oscillator_bank(
            f0, unit_amplitudes, sample_rate=self.rate)[:, :, tf.newaxis]
    elif domain == "osc":
        if features.get("osc", None) is None:
            unit_amplitudes = tf.ones(tf.shape(f0))
            f0_scaled = oscillator_bank(
                self.denom * f0, unit_amplitudes,
                sample_rate=self.rate)[:, :, tf.newaxis]
        else:
            f0_scaled = features["osc"][:, :, tf.newaxis]
    else:
        raise ValueError("%s is not a valid value for feature_domain." %
                         self.feature_domain)

    features["f0_scaled"] = f0_scaled
    return features
def get_signal(self, f0, op1, op2, op3, op4, modulators):
    """Render audio with the fm synthesizer from frame-wise controls.

    Args:
      f0: Fundamental frequencies in hertz. Shape [batch, n_frames, 1].
      op1: Amp, idx and ADSR of operator 1. Shape [batch, n_frames, 3].
      op2: Amp, idx and ADSR of operator 2. Shape [batch, n_frames, 3].
      op3: Amp, idx and ADSR of operator 3. Shape [batch, n_frames, 3].
      op4: Amp, idx and ADSR of operator 4. Shape [batch, n_frames, 3].
      modulators: Modulation between operators. Shape [batch, n_frames, 6].

    Returns:
      signal: A tensor of shape [batch, n_samples].
    """

    def to_sample_wise(control):
        # Every control is resampled with the same target length and
        # endpoint setting.
        return core.resample(
            control, self.n_samples, add_endpoint=self.add_endpoint)

    return core.modulate_frequency(
        f0=to_sample_wise(f0),
        op1=to_sample_wise(op1),
        op2=to_sample_wise(op2),
        op3=to_sample_wise(op3),
        op4=to_sample_wise(op4),
        modulators=to_sample_wise(modulators),
        sample_rate=self.sample_rate)
def additive_synthesis(self,
                       amplitudes,
                       frequency_shifts=None,
                       frequency_distribution=None,
                       n_samples=64000,
                       sample_rate=16000,
                       amp_resample_method="window"):
    '''Generate audio from frame-wise monophonic harmonic oscillator bank.

    Args:
        amplitudes: Frame-wise oscillator peak amplitude. Shape
            [batch_size, n_frames, 1].
        frequency_shifts: Harmonic frequency variations (Hz), zero-centered.
            Total frequency of a harmonic is equal to
            (frequencies * (1 + frequency_shifts)). Shape
            [batch_size, n_frames, n_harmonics].
        frequency_distribution: Harmonic amplitude variations, ranged zero to
            one. Total amplitude of a harmonic is equal to
            (amplitudes * frequency_distribution). Shape
            [batch_size, n_frames, n_harmonics].
        n_samples: Total length of output audio. Interpolates and crops to
            this.
        sample_rate: Sample rate.
        amp_resample_method: Mode with which to resample amplitude envelopes.

    Returns:
        audio: Output audio as returned by core.oscillator_bank; presumably
            shape [batch_size, n_samples] -- confirm against oscillator_bank.
    '''
    amplitudes = core.tf_float32(amplitudes)
    batch_size = amplitudes.shape[0]
    n_frames = amplitudes.shape[1]

    # Cast both optional controls up front so each one is float32 no matter
    # which of them ends up determining the number of frequencies.
    if frequency_distribution is not None:
        frequency_distribution = core.tf_float32(frequency_distribution)
    if frequency_shifts is not None:
        frequency_shifts = core.tf_float32(frequency_shifts)

    # The number of oscillators comes from the distribution if given, else
    # from the shifts, else a single oscillator is used.
    if frequency_distribution is not None:
        n_frequencies = int(frequency_distribution.shape[-1])
    elif frequency_shifts is not None:
        n_frequencies = int(frequency_shifts.shape[-1])
    else:
        n_frequencies = 1

    # Create frequencies [batch_size, n_frames, n_frequencies].
    frequencies = self.get_linear_frequencies(batch_size, n_frames,
                                              n_frequencies)
    if frequency_shifts is not None:
        frequencies *= (1.0 + frequency_shifts)

    # Create harmonic amplitudes [batch_size, n_frames, n_frequencies].
    if frequency_distribution is not None:
        frequency_amplitudes = amplitudes * frequency_distribution
    else:
        frequency_amplitudes = amplitudes

    # Create sample-wise envelopes.
    frequency_envelopes = core.resample(frequencies, n_samples)  # cycles/sec
    amplitude_envelopes = core.resample(frequency_amplitudes,
                                        n_samples,
                                        method=amp_resample_method)

    # Synthesize from harmonics [batch_size, n_samples].
    audio = core.oscillator_bank(frequency_envelopes,
                                 amplitude_envelopes,
                                 sample_rate=sample_rate)
    return audio
def get_signal(self, signal_in: tf.Tensor) -> tf.Tensor:
    """Resample `signal_in` to `self.n_timesteps` steps.

    Args:
      signal_in: Signal to resample.

    Returns:
      The result of core.resample, called with `self.n_timesteps` as the
      target length and `self.method` as its third positional argument.
    """
    target_steps = self.n_timesteps
    resample_method = self.method
    return core.resample(signal_in, target_steps, resample_method)