def __init__(self, latent_dims, param_size, hidden_size=64, num_layers=1,
             dropout_p=0.2, use_f0=True, use_ld=True):
    super().__init__(param_size)
    # Map the latent vector
    self.z_MLP = MLP(latent_dims, hidden_size, 3)
    self.use_f0 = use_f0
    self.use_ld = use_ld
    gru_input_size = hidden_size
    if use_f0:
        self.f0_MLP = MLP(1, hidden_size, loop=3)
        gru_input_size += hidden_size
    if use_ld:
        self.ld_MLP = MLP(1, hidden_size, loop=3)
        gru_input_size += hidden_size
    # Recurrent model to handle temporality
    self.gru = nn.GRU(gru_input_size, hidden_size, num_layers,
                      dropout=dropout_p, batch_first=True)
    # Mixing MLP after the GRU
    self.fi_MLP = MLP(hidden_size, hidden_size, loop=3)
    # Outputs to different parameters of the synth
    self.dense_out = nn.Linear(hidden_size, param_size)
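# The MLP helper is defined elsewhere in the repo; the sketch below is a
# hypothetical stand-in that matches the call sites in this file
# (MLP(in_size, out_size), a positional loop count, or loop= as a keyword).
# It assumes a DDSP-style stack of Linear -> LayerNorm -> ReLU blocks repeated
# `loop` times; the real implementation may differ.
import torch.nn as nn

def MLP(in_size, out_size, loop=2):
    layers = []
    for i in range(loop):
        layers += [
            nn.Linear(in_size if i == 0 else out_size, out_size),
            nn.LayerNorm(out_size),
            nn.ReLU(),
        ]
    return nn.Sequential(*layers)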
def __init__(self, n_classes, n_fft=1024, hop_length=512, n_mels=128,
             channels=64, length=4.0, sample_rate=16000):
    super().__init__()
    # Stack of conv/pool blocks applied to the log-mel spectrogram
    self.convs = nn.Sequential(
        nn.Conv2d(1, channels, kernel_size=5, stride=1),
        nn.ReLU(),
        nn.MaxPool2d(kernel_size=2, stride=2),
        nn.Conv2d(channels, channels, kernel_size=5, stride=1),
        nn.ReLU(),
        nn.MaxPool2d(kernel_size=2, stride=2),
        nn.Conv2d(channels, channels, kernel_size=5, stride=1),
        nn.ReLU(),
        nn.MaxPool2d(kernel_size=2, stride=2),
        nn.Conv2d(channels, channels, kernel_size=5, stride=1),
        nn.ReLU(),
        nn.MaxPool2d(kernel_size=2, stride=2))
    self.n_fft = n_fft
    self.hop_length = hop_length
    n_samples = length * sample_rate
    # Number of analysis frames for a clip of `length` seconds;
    # e.g. 4 s at 16 kHz: ceil((64000 - 1024) / 512) + 1 = 124
    self.n_frames = math.ceil((n_samples - n_fft) / hop_length) + 1
    self.logmel = nn.Sequential(
        MelSpec(n_fft=n_fft, hop_length=hop_length, n_mels=n_mels),
        LogTransform())
    # get_final_size: run a dummy input through the conv stack
    # to infer the flattened feature size
    dummy = torch.randn(1, 1, n_mels, self.n_frames)
    dummy = self.convs(dummy)
    self.conv_shape = list(dummy.shape[2:])
    self.mlp = MLP(channels * self.conv_shape[0] * self.conv_shape[1], 64, loop=2)
    self.out = nn.Linear(64, n_classes)
def __init__(self, encoder, decoder, encoder_dims, latent_dims, hidden_size=64):
    super().__init__(encoder, decoder, encoder_dims, latent_dims)
    # RNN that takes past latents and attribute as condition
    self.hidden_size = hidden_size
    self.temporal = nn.GRUCell(latent_dims, hidden_size)
    # Posterior network: mixes the recurrent state with the encoder output
    self.mix_lin = MLP(hidden_size + encoder_dims, latent_dims)
    self.post_loc_out = nn.Linear(latent_dims, latent_dims)
    self.post_logscale_out = nn.Linear(latent_dims, latent_dims)
    # Prior network: conditioned on the recurrent state only
    self.prior_lin = MLP(hidden_size, latent_dims)
    self.prior_loc_out = nn.Linear(latent_dims, latent_dims)
    self.prior_logscale_out = nn.Linear(latent_dims, latent_dims)
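# Hypothetical illustration (not taken from the repo) of how the modules above
# could be wired for a single time step: the GRUCell carries past latents, the
# posterior mixes that state with the current encoder frame, and the prior is
# conditioned on the state alone.
import torch

def temporal_step_sketch(model, z_prev, enc_t, h):
    # Advance the recurrent state with the previous latent
    h = model.temporal(z_prev, h)
    # Posterior parameters from state + encoder output
    post_h = model.mix_lin(torch.cat([h, enc_t], dim=-1))
    post_loc = model.post_loc_out(post_h)
    post_logscale = model.post_logscale_out(post_h)
    # Prior parameters from the state only
    prior_h = model.prior_lin(h)
    prior_loc = model.prior_loc_out(prior_h)
    prior_logscale = model.prior_logscale_out(prior_h)
    return (post_loc, post_logscale), (prior_loc, prior_logscale), h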
def __init__(self, latent_dims, param_size, hidden_size=256, num_layers=3,
             use_f0=True, use_ld=True):
    super().__init__(param_size)
    self.z_MLP = MLP(latent_dims, hidden_size, 3)
    self.use_f0 = use_f0
    self.use_ld = use_ld
    mlp_input_size = hidden_size
    if use_f0:
        self.f0_MLP = MLP(1, hidden_size, loop=3)
        mlp_input_size += hidden_size
    if use_ld:
        self.ld_MLP = MLP(1, hidden_size, loop=3)
        mlp_input_size += hidden_size
    # Mixing MLP
    self.fi_MLP = MLP(mlp_input_size, hidden_size, loop=num_layers)
    # Outputs to different parameters of the synth
    self.dense_out = nn.Linear(hidden_size, param_size)
def __init__(self, frame_setting, encoder_dims, n_mfcc=40, num_layers=3,
             hidden_size=256, sr=16000, f0_encoder=None, encode_ld=False):
    super().__init__(f0_encoder, encode_ld)
    n_fft, hop = get_window_hop(frame_setting)
    # MFCC frontend followed by batch normalization over frames
    self.mfcc = Mfcc(n_fft, hop, 128, n_mfcc, f_min=20)
    self.norm = Normalize2d('batch')
    # Frame-wise MLP projecting MFCCs to the encoder output
    self.mlp = MLP(n_mfcc, hidden_size, num_layers)
    self.out = nn.Linear(hidden_size, encoder_dims)
def __init__(self, frame_setting, encoder_dims, n_mels=128, channels=64,
             kernel_size=7, strides=[2, 2, 2, 2], hidden_size=256, sr=16000,
             f0_encoder=None, encode_ld=False):
    super().__init__(f0_encoder, encode_ld)
    n_fft, hop = get_window_hop(frame_setting)
    # Log-mel spectrogram frontend
    self.logmel = nn.Sequential(
        MelSpec(n_fft=n_fft, hop_length=hop, n_mels=n_mels),
        LogTransform())
    self.frame_size = n_mels
    self.norm = Normalize2d('batch')
    self.channels = channels
    # Strided 1D conv stack over the mel axis of each frame
    self.convs = nn.ModuleList(
        [nn.Sequential(
            nn.Conv1d(1, channels, kernel_size,
                      padding=kernel_size // 2, stride=strides[0]),
            nn.BatchNorm1d(channels),
            nn.ReLU())]
        + [nn.Sequential(
            nn.Conv1d(channels, channels, kernel_size,
                      padding=kernel_size // 2, stride=strides[i]),
            nn.BatchNorm1d(channels),
            nn.ReLU()) for i in range(1, len(strides) - 1)]
        + [nn.Sequential(
            nn.Conv1d(channels, channels, kernel_size,
                      padding=kernel_size // 2, stride=strides[-1]))])
    self.l_out = self.get_downsampled_length()[-1]
    self.mlp = MLP(self.l_out * channels, encoder_dims, loop=2)
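# get_downsampled_length() is defined elsewhere in the repo; the sketch below
# only illustrates the length bookkeeping it presumably performs, using the
# standard Conv1d output formula with padding=kernel_size // 2. The function
# name and signature here are hypothetical, not the repo's API.
def downsampled_lengths_sketch(frame_size, kernel_size, strides):
    lengths = [frame_size]
    length = frame_size
    for stride in strides:
        # Conv1d: L_out = floor((L_in + 2*pad - kernel_size) / stride) + 1
        length = (length + 2 * (kernel_size // 2) - kernel_size) // stride + 1
        lengths.append(length)
    return lengths

# e.g. downsampled_lengths_sketch(128, 7, [2, 2, 2, 2]) -> [128, 64, 32, 16, 8],
# so l_out would be 8 for the default mel encoder above.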
def __init__(self, frame_setting, encoder_dims, n_mels=40, num_layers=3,
             hidden_size=256, sr=16000, f0_encoder=None, encode_ld=False):
    super().__init__(f0_encoder, encode_ld)
    n_fft, hop = get_window_hop(frame_setting)
    # Log-mel spectrogram frontend followed by batch normalization
    self.logmel = nn.Sequential(
        MelSpec(n_fft=n_fft, hop_length=hop, n_mels=n_mels),
        LogTransform())
    self.norm = Normalize2d('batch')
    # Frame-wise MLP projecting mel bands to the encoder output
    self.mlp = MLP(n_mels, hidden_size, num_layers)
    self.out = nn.Linear(hidden_size, encoder_dims)
def __init__(self, frame_setting, encoder_dims, channels, kernel_size, strides,
             f0_encoder=None, encode_ld=False):
    super().__init__(f0_encoder, encode_ld)
    n_fft, hop = get_window_hop(frame_setting)
    self.frame_size = n_fft  # same as window size
    self.hop_size = hop
    self.encoder_dims = encoder_dims
    # Strided 1D conv stack over raw waveform frames
    self.convs = nn.ModuleList(
        [nn.Sequential(
            nn.Conv1d(1, channels, kernel_size,
                      padding=kernel_size // 2, stride=strides[0]),
            nn.ReLU())]
        + [nn.Sequential(
            nn.Conv1d(channels, channels, kernel_size,
                      padding=kernel_size // 2, stride=strides[i]),
            nn.ReLU()) for i in range(1, len(strides) - 1)]
        + [nn.Sequential(
            nn.Conv1d(channels, channels, kernel_size,
                      padding=kernel_size // 2, stride=strides[-1]))])
    self.l_out = self.get_downsampled_length()[-1]
    # Flattened conv output: `channels` feature maps of length `l_out`
    self.mlp = MLP(self.l_out * channels, encoder_dims, loop=2)