def __init__(self, rnn_channels=512, rnn_type='gru', z_dims=32,
             z_time_steps=250, other_encoders=None, **kwargs):
    """Builds z-encoder layers plus an ImageNet-pretrained vision backbone.

    Args:
      rnn_channels: Channel width of the temporal CNN applied to z.
      rnn_type: Accepted for signature compatibility with sibling encoders;
        not used in this constructor (the temporal model here is
        `nn.temporal_cnn`, not an RNN).
      z_dims: Size of the latent projection produced by `dense_out`.
      z_time_steps: Number of time frames in the latent sequence.
      other_encoders: Forwarded to the parent constructor.
      **kwargs: Forwarded to the parent constructor.
    """
    super().__init__(other_encoders=other_encoders, **kwargs)
    self.z_time_steps = z_time_steps

    # Layers.
    self.z_norm = nn.Normalize('instance')
    # Kernel size 10; plays the role the RNN has in the sibling encoders.
    self.rnn = nn.temporal_cnn(rnn_channels, 10)
    self.dense_out = nn.dense(z_dims)

    # Vision branch. Frames assumed to be 360x640 RGB -- TODO confirm.
    self.frame_shape = (360, 640, 3)
    self.cv_net = tf.keras.applications.ResNet50V2(
        include_top=False,
        weights='imagenet',
        input_shape=self.frame_shape,
        pooling=None)
    # TODO(sclarke): Change this for fine tuning. (An earlier, now-removed
    # variant froze all layers except those named 'block7'/'top'; currently
    # the entire backbone is trainable.)
    self.cv_net.trainable = True
    print(f'Vision network layer count: {len(self.cv_net.layers)}')
    # Pool the backbone's spatial feature map down to a single vector.
    self.final_layers = tf.keras.Sequential(layers=[
        tf.keras.layers.GlobalAveragePooling2D(),
    ])
def __init__(self, net=None, f0_residual=True, **kwargs):
    """Stores the core network and builds the 2-unit output head."""
    super().__init__(**kwargs)
    # Caller-supplied core network and residual-f0 switch.
    self.f0_residual = f0_residual
    self.net = net
    # Output head: 2-unit dense projection, preceded by layer normalization.
    self.dense_out = tfkl.Dense(2)
    self.norm = nn.Normalize('layer')
def __init__(self, rnn_channels=512, rnn_type='gru', z_dims=512,
             mean_aggregate=False, **kwargs):
    """Sets up the instance-norm -> RNN -> dense-z stack."""
    super().__init__(**kwargs)
    # NOTE(review): presumably controls time-averaging of the latent in
    # call() -- confirm against the forward pass.
    self.mean_aggregate = mean_aggregate
    # Layers (construction order kept stable for deterministic Keras naming).
    self.norm_in = nn.Normalize('instance')
    self.rnn = nn.Rnn(rnn_channels, rnn_type)
    self.dense_z = tfkl.Dense(z_dims)
def __init__(self, net=None, z_dims=128, input_keys=('f0_scaled', 'ld_scaled'),
             mfcc_bins=60, fft_size=1024, mel_bins=128, pool_time=True,
             **kwargs):
    """Wraps `net` with layer norm and a dense z head; optional MFCC branch."""
    # Record the keys before handing them to the parent constructor.
    self.input_keys = input_keys
    super().__init__(input_keys, **kwargs)

    # MFCCs are only computed when raw audio is among the inputs.
    self.compute_mfccs = 'audio' in self.input_keys

    # Spectral-feature configuration.
    self.mfcc_bins = mfcc_bins
    self.fft_size = fft_size
    self.mel_bins = mel_bins
    self.pool_time = pool_time

    # Layers (construction order kept stable for deterministic Keras naming).
    self.net = net
    self.norm = nn.Normalize('layer')
    self.dense_out = tfkl.Dense(z_dims)
    if self.compute_mfccs:
        # Separate instance norm for the MFCC branch.
        self.norm_mfcc = nn.Normalize('instance')
def __init__(self, fft_sizes=(1024,), mel_bins=(128,), mfcc_bins=(30,),
             time_steps=250, **kwargs):
    """Normalizes the spectral configs to iterables and builds the norm layer."""
    super().__init__(**kwargs)
    # Coerce possibly-scalar config values into iterables.
    self.fft_sizes = ddsp.core.make_iterable(fft_sizes)
    self.mel_bins = ddsp.core.make_iterable(mel_bins)
    self.mfcc_bins = ddsp.core.make_iterable(mfcc_bins)
    self.time_steps = time_steps
    # Layers.
    self.norm_out = nn.Normalize('instance')
def __init__(self, rnn_channels=512, rnn_type='gru', z_dims=32,
             mfcc_time_steps=250, z_time_steps=250, sample_rate=16000,
             other_encoders=None, tcnn_kernel=7, **kwargs):
    """MFCC-driven encoder with an RNN plus a non-causal temporal CNN."""
    super().__init__(other_encoders=other_encoders, **kwargs)
    if mfcc_time_steps not in (63, 125, 250, 500, 1000):
        raise ValueError(
            '`mfcc_time_steps` currently limited to 63,125,250,500 and 1000')
    # STFT settings that produce each supported number of MFCC frames.
    self.z_audio_spec = {
        '63': {'fft_size': 2048, 'overlap': 0.5},
        '125': {'fft_size': 1024, 'overlap': 0.5},
        '250': {'fft_size': 1024, 'overlap': 0.75},
        '500': {'fft_size': 512, 'overlap': 0.75},
        '1000': {'fft_size': 256, 'overlap': 0.75},
    }
    spec = self.z_audio_spec[str(mfcc_time_steps)]
    self.fft_size = spec['fft_size']
    self.overlap = spec['overlap']
    self.sample_rate = sample_rate
    if z_time_steps:
        print('Z time steps: %i' % z_time_steps)
        # NOTE(review): self.z_time_steps is only defined when z_time_steps
        # is truthy -- downstream code presumably guards on that; confirm.
        self.z_time_steps = z_time_steps

    # Layers (construction order kept stable for deterministic Keras naming).
    self.z_norm = nn.Normalize('instance')
    self.rnn = nn.Rnn(rnn_channels, rnn_type)
    self.tcnn = nn.temporal_cnn(rnn_channels, tcnn_kernel, causal=False)
    self.dense_out = tfkl.Dense(z_dims)
def __init__(self, rnn_channels=512, rnn_type='gru', z_time_steps=250,
             input_keys=('discriminator_audio', 'f0_hz', 'ld_scaled'),
             spectral_op='compute_mfcc', **kwargs):
    """RNN discriminator over spectral features of a single audio input.

    Args:
      rnn_channels: Width of the RNN layer.
      rnn_type: RNN cell type passed to `nn.Rnn` (e.g. 'gru').
      z_time_steps: Number of feature frames; must be one of
        63, 125, 250, 500 or 1000.
      input_keys: Input keys; at most one may contain 'audio' and it is
        moved to the front. (Default changed from a list literal to an
        equivalent tuple to avoid a mutable default argument; `sorted`
        returns a new list either way.)
      spectral_op: Name of the spectral-feature op (e.g. 'compute_mfcc').
      **kwargs: Forwarded to the parent constructor.

    Raises:
      ValueError: If more than one key contains 'audio', or if
        `z_time_steps` is not a supported value.
    """
    # Move the key that contains audio to the front.
    input_keys = sorted(input_keys, key=lambda k: 'audio' not in k)
    # Was an `assert` (stripped under `python -O`); raise explicitly instead.
    if len(input_keys) > 1 and 'audio' in input_keys[1]:
        raise ValueError('This discriminator only handles a single audio input')
    super().__init__(**kwargs, input_keys=input_keys)
    if z_time_steps not in (63, 125, 250, 500, 1000):
        raise ValueError(
            '`z_time_steps` currently limited to 63,125,250,500 and 1000')
    # STFT settings that produce each supported number of feature frames.
    self.z_audio_spec = {
        '63': {'fft_size': 2048, 'overlap': 0.5},
        '125': {'fft_size': 1024, 'overlap': 0.5},
        '250': {'fft_size': 1024, 'overlap': 0.75},
        '500': {'fft_size': 512, 'overlap': 0.75},
        '1000': {'fft_size': 256, 'overlap': 0.75},
    }
    self.fft_size = self.z_audio_spec[str(z_time_steps)]['fft_size']
    self.spectral_op = spectral_op
    self.overlap = self.z_audio_spec[str(z_time_steps)]['overlap']
    # Layers: score head plus a separate confidence head.
    self.z_norm = nn.Normalize('layer')
    self.rnn = nn.Rnn(rnn_channels, rnn_type)
    self.dense_out = tfkl.Dense(1)
    self.confidence = tfkl.Dense(1)
def __init__(self, net=None, f0_residual=True, norm=True,
             output_splits=(('f0_midi', 1), ('amplitudes', 1),
                            ('harmonic_distribution', 60), ('magnitudes', 65)),
             **kwargs):
    """Constructor: derives output keys/width from `output_splits`."""
    self.output_splits = output_splits
    # Total output width and the keys exposed to the parent class.
    self.n_out = sum(size for _, size in output_splits)
    output_keys = [key for key, _ in output_splits] + ['f0_hz']
    super().__init__(output_keys=output_keys, **kwargs)
    # Layers.
    self.net = net
    self.f0_residual = f0_residual
    self.dense_out = tfkl.Dense(self.n_out)
    self.norm = nn.Normalize('layer') if norm else None
def __init__(self, rnn_channels=512, rnn_type='gru', z_dims=32,
             z_time_steps=250, f0_encoder=None,
             name='mfcc_time_distrbuted_rnn_encoder'):
    """MFCC encoder: instance norm -> RNN -> dense z projection.

    Args:
      rnn_channels: Width of the RNN layer.
      rnn_type: RNN cell type (e.g. 'gru').
      z_dims: Size of the latent projection.
      z_time_steps: Number of feature frames; must be one of
        63, 125, 250, 500 or 1000.
      f0_encoder: Forwarded to the parent constructor.
      name: Keras name of this module. NOTE(review): 'distrbuted' is a typo,
        but renaming would change the layer's variable scope and could break
        existing checkpoints -- left as-is deliberately.

    Raises:
      ValueError: If `z_time_steps` is not a supported value.
    """
    # Zero-argument super() (Python 3), consistent with the sibling encoders.
    super().__init__(f0_encoder=f0_encoder, name=name)
    if z_time_steps not in [63, 125, 250, 500, 1000]:
        raise ValueError(
            '`z_time_steps` currently limited to 63,125,250,500 and 1000')
    # STFT settings that produce each supported number of MFCC frames.
    # (Keyed by int here, unlike the str-keyed tables in sibling encoders.)
    self.z_audio_spec = {
        63: {'fft_size': 2048, 'overlap': 0.5},
        125: {'fft_size': 1024, 'overlap': 0.5},
        250: {'fft_size': 1024, 'overlap': 0.75},
        500: {'fft_size': 512, 'overlap': 0.75},
        1000: {'fft_size': 256, 'overlap': 0.75},
    }
    self.fft_size = self.z_audio_spec[z_time_steps]['fft_size']
    self.overlap = self.z_audio_spec[z_time_steps]['overlap']
    # Layers.
    self.z_norm = nn.Normalize('instance')
    # NOTE(review): lowercase nn.rnn / nn.dense, while siblings use nn.Rnn /
    # tfkl.Dense -- verify these factory helpers exist in the nn module.
    self.rnn = nn.rnn(rnn_channels, rnn_type)
    self.dense_out = nn.dense(z_dims)
def __init__(self, rnn_channels=512, rnn_type='gru', z_dims=32,
             z_time_steps=250, **kwargs):
    """MFCC encoder whose dense head emits 2 * z_dims units."""
    super().__init__(**kwargs)
    if z_time_steps not in (63, 125, 250, 500, 1000):
        raise ValueError(
            '`z_time_steps` currently limited to 63,125,250,500 and 1000')
    # STFT settings that produce each supported number of feature frames.
    self.z_audio_spec = {
        '63': {'fft_size': 2048, 'overlap': 0.5},
        '125': {'fft_size': 1024, 'overlap': 0.5},
        '250': {'fft_size': 1024, 'overlap': 0.75},
        '500': {'fft_size': 512, 'overlap': 0.75},
        '1000': {'fft_size': 256, 'overlap': 0.75},
    }
    spec = self.z_audio_spec[str(z_time_steps)]
    self.fft_size = spec['fft_size']
    self.overlap = spec['overlap']
    # Layers.
    self.z_norm = nn.Normalize('instance')
    self.rnn = nn.Rnn(rnn_channels, rnn_type)
    # 2 * z_dims output -- presumably concatenated mean and log-variance,
    # as the layer name suggests; confirm in call().
    self._enc_mu_log_var = tfkl.Dense(2 * z_dims)
    self.z_time_steps = z_time_steps
    self.z_dims = z_dims