def __init__(self,
             latent_dims,
             param_size,
             hidden_size=64,
             num_layers=1,
             dropout_p=0.2,
             use_f0=True,
             use_ld=True):
    """Build a recurrent decoder from a latent vector to synth parameters.

    Args:
        latent_dims: size of the input latent vector.
        param_size: number of synthesizer parameters to output
            (forwarded to the base class).
        hidden_size: width of the per-input MLPs and of the GRU state.
        num_layers: number of stacked GRU layers.
        dropout_p: dropout between stacked GRU layers (only meaningful
            when num_layers > 1).
        use_f0: add an MLP branch for an f0 conditioning input.
        use_ld: add an MLP branch for a loudness conditioning input.
    """
    super().__init__(param_size)
    # Map the latent vector
    self.z_MLP = MLP(latent_dims, hidden_size, 3)
    self.use_f0 = use_f0
    self.use_ld = use_ld
    # Each enabled conditioning branch contributes hidden_size features
    # to the GRU input.
    gru_input_size = hidden_size
    if use_f0:
        self.f0_MLP = MLP(1, hidden_size, loop=3)
        gru_input_size += hidden_size
    if use_ld:
        self.ld_MLP = MLP(1, hidden_size, loop=3)
        gru_input_size += hidden_size

    # Recurrent model to handle temporality.
    # nn.GRU only applies dropout *between* stacked layers, so with
    # num_layers == 1 a non-zero dropout is ignored and PyTorch emits a
    # UserWarning; pass 0.0 in that case (identical behavior, no warning).
    self.gru = nn.GRU(gru_input_size,
                      hidden_size,
                      num_layers,
                      dropout=dropout_p if num_layers > 1 else 0.0,
                      batch_first=True)
    # Mixing MLP after the GRU
    self.fi_MLP = MLP(hidden_size, hidden_size, loop=3)
    # Outputs to different parameters of the synth
    self.dense_out = nn.Linear(hidden_size, param_size)
Example #2
0
 def __init__(self,
              n_classes,
              n_fft=1024,
              hop_length=512,
              n_mels=128,
              channels=64,
              length=4.0,
              sample_rate=16000):
     """Build a CNN classifier operating on log-mel spectrograms.

     Args:
         n_classes: number of output classes.
         n_fft: FFT size of the mel front end.
         hop_length: hop between analysis frames.
         n_mels: number of mel bands.
         channels: channel width shared by all conv stages.
         length: input duration in seconds.
         sample_rate: audio sample rate in Hz.
     """
     super().__init__()
     # Four identical stages: 5x5 conv -> ReLU -> 2x2 max-pool.
     stages = []
     in_channels = 1
     for _ in range(4):
         stages.extend([
             nn.Conv2d(in_channels, channels, kernel_size=5, stride=1),
             nn.ReLU(),
             nn.MaxPool2d(kernel_size=2, stride=2),
         ])
         in_channels = channels
     self.convs = nn.Sequential(*stages)
     self.n_fft = n_fft
     self.hop_length = hop_length
     n_samples = length * sample_rate
     self.n_frames = math.ceil((n_samples - n_fft) / hop_length) + 1
     self.logmel = nn.Sequential(
         MelSpec(n_fft=n_fft, hop_length=hop_length, n_mels=n_mels),
         LogTransform())
     # Probe the conv stack with a dummy batch to find the flattened size.
     probe = torch.randn(1, 1, n_mels, self.n_frames)
     probe = self.convs(probe)
     self.conv_shape = list(probe.shape[2:])
     self.mlp = MLP(channels * self.conv_shape[0] * self.conv_shape[1],
                    64,
                    loop=2)
     self.out = nn.Linear(64, n_classes)
Example #3
0
 def __init__(self,
              encoder,
              decoder,
              encoder_dims,
              latent_dims,
              hidden_size=64):
     """Build a temporal latent model around an encoder/decoder pair.

     A GRU cell summarizes past latents; the posterior mixes that state
     with the encoder features, while the prior is predicted from the
     state alone.
     """
     super().__init__(encoder, decoder, encoder_dims, latent_dims)
     self.hidden_size = hidden_size
     # RNN that takes past latents and attribute as condition.
     self.temporal = nn.GRUCell(latent_dims, hidden_size)
     # Posterior heads: recurrent state concatenated with encoder output.
     self.mix_lin = MLP(hidden_size + encoder_dims, latent_dims)
     self.post_loc_out = nn.Linear(latent_dims, latent_dims)
     self.post_logscale_out = nn.Linear(latent_dims, latent_dims)
     # Prior heads: recurrent state only.
     self.prior_lin = MLP(hidden_size, latent_dims)
     self.prior_loc_out = nn.Linear(latent_dims, latent_dims)
     self.prior_logscale_out = nn.Linear(latent_dims, latent_dims)
 def __init__(self,
              latent_dims,
              param_size,
              hidden_size=256,
              num_layers=3,
              use_f0=True,
              use_ld=True):
     """Build a feed-forward decoder from a latent vector to synth params.

     Args:
         latent_dims: size of the input latent vector.
         param_size: number of synthesizer parameters (forwarded to the
             base class).
         hidden_size: width of the per-input MLPs and the mixing MLP.
         num_layers: depth (loop count) of the mixing MLP.
         use_f0: add an MLP branch for an f0 conditioning input.
         use_ld: add an MLP branch for a loudness conditioning input.
     """
     super().__init__(param_size)
     self.z_MLP = MLP(latent_dims, hidden_size, 3)
     self.use_f0 = use_f0
     self.use_ld = use_ld
     # Every enabled conditioning branch widens the mixing MLP's input
     # by hidden_size features.
     in_width = hidden_size
     if use_f0:
         self.f0_MLP = MLP(1, hidden_size, loop=3)
         in_width += hidden_size
     if use_ld:
         self.ld_MLP = MLP(1, hidden_size, loop=3)
         in_width += hidden_size
     # Mixing MLP, then a linear head over the synth parameters.
     self.fi_MLP = MLP(in_width, hidden_size, loop=num_layers)
     self.dense_out = nn.Linear(hidden_size, param_size)
    def __init__(self,
                 frame_setting,
                 encoder_dims,
                 n_mfcc=40,
                 num_layers=3,
                 hidden_size=256,
                 sr=16000,
                 f0_encoder=None,
                 encode_ld=False):
        """Build an MFCC-based frame encoder: MFCC -> batch norm -> MLP -> linear.

        Args:
            frame_setting: named setting resolved to (n_fft, hop) by
                get_window_hop.
            encoder_dims: size of the output embedding.
            n_mfcc: number of MFCC coefficients kept.
            num_layers: depth (loop count) of the MLP.
            hidden_size: width of the MLP.
            sr: sample rate in Hz (kept for interface parity; unused here).
            f0_encoder, encode_ld: forwarded to the base class.
        """
        super().__init__(f0_encoder, encode_ld)
        # Window/hop sizes are derived from the named frame setting.
        n_fft, hop = get_window_hop(frame_setting)
        self.mfcc = Mfcc(n_fft, hop, 128, n_mfcc, f_min=20)
        self.norm = Normalize2d('batch')
        self.mlp = MLP(n_mfcc, hidden_size, num_layers)
        self.out = nn.Linear(hidden_size, encoder_dims)
 def __init__(self,
              frame_setting,
              encoder_dims,
              n_mels=128,
              channels=64,
              kernel_size=7,
              strides=(2, 2, 2, 2),
              hidden_size=256,
              sr=16000,
              f0_encoder=None,
              encode_ld=False):
     """Build a 1-D conv encoder over log-mel frames.

     The stack is: conv+BN+ReLU for every stage except the last, which is
     a bare conv; the flattened conv output feeds a 2-loop MLP.

     Args:
         frame_setting: named setting resolved to (n_fft, hop) by
             get_window_hop.
         encoder_dims: size of the output embedding.
         n_mels: number of mel bands (also the conv input length).
         channels: channel width of every conv stage.
         kernel_size: conv kernel size; padding is kernel_size // 2.
         strides: per-stage strides. Default changed from a list to an
             equivalent tuple to avoid the mutable-default-argument
             pitfall; callers passing lists are unaffected.
         hidden_size: kept for interface parity; unused here.
         sr: sample rate in Hz; unused here.
         f0_encoder, encode_ld: forwarded to the base class.
     """
     super().__init__(f0_encoder, encode_ld)
     n_fft, hop = get_window_hop(frame_setting)
     self.logmel = nn.Sequential(
         MelSpec(n_fft=n_fft, hop_length=hop, n_mels=n_mels),
         LogTransform())
     self.frame_size = n_mels
     self.norm = Normalize2d('batch')
     self.channels = channels
     pad = kernel_size // 2
     # First stage: 1 -> channels, with batch norm and ReLU.
     blocks = [
         nn.Sequential(
             nn.Conv1d(1, channels, kernel_size, padding=pad,
                       stride=strides[0]),
             nn.BatchNorm1d(channels),
             nn.ReLU())
     ]
     # Middle stages: channels -> channels, batch norm and ReLU.
     for i in range(1, len(strides) - 1):
         blocks.append(
             nn.Sequential(
                 nn.Conv1d(channels, channels, kernel_size, padding=pad,
                           stride=strides[i]),
                 nn.BatchNorm1d(channels),
                 nn.ReLU()))
     # Final stage: bare conv, no norm or activation.
     blocks.append(
         nn.Sequential(
             nn.Conv1d(channels, channels, kernel_size, padding=pad,
                       stride=strides[-1])))
     self.convs = nn.ModuleList(blocks)
     self.l_out = self.get_downsampled_length()[-1]
     # Flattened conv output: `channels` maps of length l_out.
     self.mlp = MLP(self.l_out * channels, encoder_dims, loop=2)
    def __init__(self,
                 frame_setting,
                 encoder_dims,
                 n_mels=40,
                 num_layers=3,
                 hidden_size=256,
                 sr=16000,
                 f0_encoder=None,
                 encode_ld=False):
        """Build a log-mel frame encoder: log-mel -> batch norm -> MLP -> linear.

        Args:
            frame_setting: named setting resolved to (n_fft, hop) by
                get_window_hop.
            encoder_dims: size of the output embedding.
            n_mels: number of mel bands (the MLP input width).
            num_layers: depth (loop count) of the MLP.
            hidden_size: width of the MLP.
            sr: sample rate in Hz (kept for interface parity; unused here).
            f0_encoder, encode_ld: forwarded to the base class.
        """
        super().__init__(f0_encoder, encode_ld)
        n_fft, hop = get_window_hop(frame_setting)
        # Log-compressed mel spectrogram front end.
        self.logmel = nn.Sequential(
            MelSpec(n_fft=n_fft, hop_length=hop, n_mels=n_mels),
            LogTransform())
        self.norm = Normalize2d('batch')
        self.mlp = MLP(n_mels, hidden_size, num_layers)
        self.out = nn.Linear(hidden_size, encoder_dims)
    def __init__(self,
                 frame_setting,
                 encoder_dims,
                 channels,
                 kernel_size,
                 strides,
                 f0_encoder=None,
                 encode_ld=False):
        """Build a 1-D conv encoder over raw waveform frames.

        The stack is: conv+ReLU for every stage except the last, which is
        a bare conv; the flattened conv output feeds a 2-loop MLP.

        Fix: the MLP input size was ``self.l_out * encoder_dims``, but the
        conv stack outputs ``channels`` feature maps of length ``l_out``,
        so the flattened size is ``l_out * channels`` (matching the sibling
        log-mel conv encoder). The original only worked when
        encoder_dims == channels.

        Args:
            frame_setting: named setting resolved to (n_fft, hop) by
                get_window_hop.
            encoder_dims: size of the output embedding.
            channels: channel width of every conv stage.
            kernel_size: conv kernel size; padding is kernel_size // 2.
            strides: per-stage strides.
            f0_encoder, encode_ld: forwarded to the base class.
        """
        super().__init__(f0_encoder, encode_ld)
        n_fft, hop = get_window_hop(frame_setting)
        self.frame_size = n_fft  # same as window size
        self.hop_size = hop

        self.encoder_dims = encoder_dims
        pad = kernel_size // 2
        # First stage: 1 -> channels, with ReLU.
        blocks = [
            nn.Sequential(
                nn.Conv1d(1, channels, kernel_size, padding=pad,
                          stride=strides[0]),
                nn.ReLU())
        ]
        # Middle stages: channels -> channels, with ReLU.
        for i in range(1, len(strides) - 1):
            blocks.append(
                nn.Sequential(
                    nn.Conv1d(channels, channels, kernel_size, padding=pad,
                              stride=strides[i]),
                    nn.ReLU()))
        # Final stage: bare conv, no activation.
        blocks.append(
            nn.Sequential(
                nn.Conv1d(channels, channels, kernel_size, padding=pad,
                          stride=strides[-1])))
        self.convs = nn.ModuleList(blocks)
        self.l_out = self.get_downsampled_length()[-1]
        # Flattened conv output: `channels` maps of length l_out.
        self.mlp = MLP(self.l_out * channels, encoder_dims, loop=2)