Beispiel #1
0
    def __init__(self, input_dim=600+9, output_dim=1*3, dropout_prob=0.):
        """Builds the recurrent F0 network and registers its metric.

        The stack is: linear (`input_dim` -> 256) with sigmoid, three GRUs
        with 64 hidden units each, linear (64 -> 64) with sigmoid, and a
        final linear projection to `output_dim`. An `nn.Dropout` follows
        every sigmoid and every GRU.

        Args:
            input_dim: Input feature size of the first linear layer.
            output_dim: Output feature size of the last linear layer.
            dropout_prob: Probability passed to each `nn.Dropout`.
        """
        super(F0_RNN, self).__init__()
        self.input_dim, self.output_dim = input_dim, output_dim
        self.dropout_prob = dropout_prob

        # Modules are instantiated in the same order in which they are
        # stacked, so parameter initialisation order is front-to-back.
        stack = [
            nn.Linear(self.input_dim, 256),
            nn.Sigmoid(),
            nn.Dropout(p=dropout_prob),
        ]

        # Three GRUs; only the first one sees the 256-wide projection.
        for gru_input_size in (256, 64, 64):
            gru = nn.GRU(gru_input_size, 64, batch_first=True)
            stack.append(utils.RecurrentCuDNNWrapper(gru))
            stack.append(nn.Dropout(p=dropout_prob))

        stack.extend([
            nn.Linear(64, 64),
            nn.Sigmoid(),
            nn.Dropout(p=dropout_prob),
            nn.Linear(64, self.output_dim),
        ])
        self.recurrent_layers = utils.SequentialWithRecurrent(*stack)

        # `self.metrics` is not created in this method; presumably the base
        # class provides it -- confirm.
        f0_metrics = {'LF0_RMSE_Hz': LF0Distortion()}
        self.metrics.add_metrics('all', **f0_metrics)
Beispiel #2
0
    def __init__(self,
                 z_dim=16,
                 kld_weight=1.,
                 conditioning_dim=600 + 9,
                 output_dim=1 * 3,
                 dropout_prob=0.,
                 latent=None):
        """Builds the encoder and decoder of the VAE and registers its metric.

        The decoder is: linear (`conditioning_dim + z_dim` -> 256) with
        sigmoid, three GRUs with 64 hidden units each, linear (64 -> 64) with
        sigmoid, and a final linear projection to `output_dim`. An
        `nn.Dropout` follows every sigmoid and every GRU.

        Args:
            z_dim: Forwarded to the base-class constructor; read back here as
                `self.z_dim` (presumably stored by the base class -- confirm).
            kld_weight: Forwarded to the base-class constructor.
            conditioning_dim: Feature size of the conditioning input; it is
                added to `z_dim` to size the decoder's first linear layer.
            output_dim: Output feature size of the decoder's last linear
                layer; also passed to `_Encoder` as its second argument.
            dropout_prob: Probability passed to each `nn.Dropout` and to
                `_Encoder`.
            latent: Stored on the instance as `self.latent`; not otherwise
                used in this constructor.
        """
        super(VAE, self).__init__(z_dim=z_dim, kld_weight=kld_weight)
        self.conditioning_dim, self.output_dim = conditioning_dim, output_dim
        self.dropout_prob = dropout_prob
        self.latent = latent

        # NOTE(review): `latent` is kept on this object but is not handed to
        # `_Encoder` -- confirm that this is intended.
        encoder = _Encoder(self.conditioning_dim, self.output_dim,
                           dropout_prob, self.z_dim)
        self.encoder_layer = encoder

        # The decoder consumes conditioning features and the latent together.
        decoder_input_size = self.conditioning_dim + self.z_dim
        decoder_modules = [
            nn.Linear(decoder_input_size, 256),
            nn.Sigmoid(),
            nn.Dropout(p=dropout_prob),
        ]
        for gru_input_size in (256, 64, 64):
            gru = nn.GRU(gru_input_size, 64, batch_first=True)
            decoder_modules.append(utils.RecurrentCuDNNWrapper(gru))
            decoder_modules.append(nn.Dropout(p=dropout_prob))
        decoder_modules.extend([
            nn.Linear(64, 64),
            nn.Sigmoid(),
            nn.Dropout(p=dropout_prob),
            nn.Linear(64, self.output_dim),
        ])
        self.decoder_layer = utils.SequentialWithRecurrent(*decoder_modules)

        # `self.metrics` is not created in this method; presumably the base
        # class provides it -- confirm.
        vae_metrics = {'LF0_RMSE_Hz': LF0Distortion()}
        self.metrics.add_metrics('all', **vae_metrics)
Beispiel #3
0
    def __init__(self,
                 conditioning_dim,
                 input_dim,
                 dropout_prob,
                 z_dim,
                 latent=None):
        """Builds the shared recurrent trunk and the two linear heads.

        The trunk is: linear (`conditioning_dim + input_dim` -> 256) with
        sigmoid, three GRUs with 64 hidden units each, and linear (64 -> 64)
        with sigmoid. An `nn.Dropout` follows every sigmoid and every GRU.
        `mu_layer` and `logvar_layer` are separate linear layers mapping
        64 -> `z_dim` (presumably applied to the trunk output -- the forward
        pass is not visible here).

        Args:
            conditioning_dim: Added to `input_dim` to size the first linear
                layer.
            input_dim: Added to `conditioning_dim` to size the first linear
                layer.
            dropout_prob: Probability passed to each `nn.Dropout`. It is not
                stored on the instance.
            z_dim: Output feature size of both heads.
            latent: Stored on the instance as `self.latent`; not otherwise
                used in this constructor.
        """
        super(_Encoder, self).__init__()

        self.conditioning_dim, self.input_dim = conditioning_dim, input_dim
        self.z_dim, self.latent = z_dim, latent

        trunk_input_size = self.conditioning_dim + self.input_dim
        trunk = [
            nn.Linear(trunk_input_size, 256),
            nn.Sigmoid(),
            nn.Dropout(p=dropout_prob),
        ]
        for gru_input_size in (256, 64, 64):
            gru = nn.GRU(gru_input_size, 64, batch_first=True)
            trunk.append(utils.RecurrentCuDNNWrapper(gru))
            trunk.append(nn.Dropout(p=dropout_prob))
        trunk.extend([
            nn.Linear(64, 64),
            nn.Sigmoid(),
            nn.Dropout(p=dropout_prob),
        ])
        self.shared_layer = utils.SequentialWithRecurrent(*trunk)

        # Heads are created after the trunk, mean first and log-variance
        # second, matching the attribute names.
        head_input_size = 64
        self.mu_layer = nn.Linear(head_input_size, self.z_dim)
        self.logvar_layer = nn.Linear(head_input_size, self.z_dim)
    def __init__(self, z_dim=16, conditioning_dim=70+9, output_dim=1*3, dropout_prob=0.,
                 phone_set_file='data/unilex_phoneset.txt'):
        """Builds the encoder and decoder stacks and registers the metrics.

        Both stacks share the same body: linear (-> 256) with sigmoid, three
        GRUs with 64 hidden units each, and linear (64 -> 64) with sigmoid,
        with an `nn.Dropout` after every sigmoid and every GRU. The encoder
        maps `output_dim` features to a single `z_dim`-sized output; the
        decoder maps `conditioning_dim + z_dim` features to `output_dim`.

        Args:
            z_dim: Output feature size of the encoder and part of the
                decoder's input size.
            conditioning_dim: Added to `z_dim` to size the decoder's first
                linear layer.
            output_dim: Input feature size of the encoder and output feature
                size of the decoder.
            dropout_prob: Probability passed to each `nn.Dropout`.
            phone_set_file: Stored on the instance as `self.phone_set_file`;
                not read in this constructor.
        """
        super(F0_AE, self).__init__()
        self.z_dim, self.conditioning_dim = z_dim, conditioning_dim
        self.output_dim, self.dropout_prob = output_dim, dropout_prob
        self.phone_set_file = phone_set_file

        def shared_body(in_features):
            # Everything up to and including the dropout that follows the
            # 64 -> 64 layer; the caller appends the final projection.
            body = [
                nn.Linear(in_features, 256),
                nn.Sigmoid(),
                nn.Dropout(p=dropout_prob),
            ]
            for gru_input_size in (256, 64, 64):
                gru = nn.GRU(gru_input_size, 64, batch_first=True)
                body.append(utils.RecurrentCuDNNWrapper(gru))
                body.append(nn.Dropout(p=dropout_prob))
            body.extend([
                nn.Linear(64, 64),
                nn.Sigmoid(),
                nn.Dropout(p=dropout_prob),
            ])
            return body

        # The encoder is fully built before the decoder.
        encoder_modules = shared_body(self.output_dim)
        encoder_modules.append(nn.Linear(64, self.z_dim))
        self.encoder_layer = utils.SequentialWithRecurrent(*encoder_modules)

        decoder_modules = shared_body(self.conditioning_dim + self.z_dim)
        decoder_modules.append(nn.Linear(64, self.output_dim))
        self.decoder_layer = utils.SequentialWithRecurrent(*decoder_modules)

        # `self.metrics` is not created in this method; presumably the base
        # class provides it -- confirm. Keyword order is kept as written.
        tracked = {
            'LF0_RMSE_Hz': LF0Distortion(),
            'embeddings': TensorHistory(self.z_dim, hidden=True),
            'name': History(hidden=True),
            'n_segments': TensorHistory(1, dtype=torch.long, hidden=True),
            'segment_mean_F0': TensorHistory(1, hidden=True),
        }
        self.metrics.add_metrics('all', **tracked)
Beispiel #5
0
    def __init__(self,
                 input_dim=600 + 9,
                 output_dims=None,
                 dropout_prob=0.,
                 num_layers=8):
        """Builds the LSTM acoustic network and registers its metrics.

        The stack is: linear (`input_dim` -> 512) with sigmoid and dropout,
        `num_layers` separately wrapped `nn.LSTM` modules (512 -> 512),
        linear (512 -> 256) with sigmoid and dropout, and a final linear
        layer whose output size is the sum of the values in `output_dims`.

        Args:
            input_dim: Input feature size of the first linear layer.
            output_dims: Mapping whose values are summed to size the last
                linear layer. When `None`, defaults to
                `{'lf0': 3, 'vuv': 1, 'mcep': 180, 'bap': 15}`.
            dropout_prob: Probability passed to each `nn.Dropout` and as the
                `dropout` argument of each `nn.LSTM`.
            num_layers: Number of wrapped `nn.LSTM` modules in the stack.
        """
        # The default mapping is created per call rather than in the
        # signature, so instances never share one dict object.
        if output_dims is None:
            output_dims = {'lf0': 1 * 3, 'vuv': 1, 'mcep': 60 * 3, 'bap': 5 * 3}

        super(LSTMAcousticModel, self).__init__()
        self.input_dim, self.output_dims = input_dim, output_dims
        self.dropout_prob, self.num_layers = dropout_prob, num_layers

        modules = [
            nn.Linear(self.input_dim, 512),
            nn.Sigmoid(),
            nn.Dropout(p=self.dropout_prob),
        ]

        # NOTE(review): each nn.LSTM below uses its default single layer.
        # Per the PyTorch docs, `dropout` only acts between layers stacked
        # inside one nn.LSTM, so it likely has no effect here (and PyTorch
        # warns when it is non-zero) -- confirm whether nn.Dropout modules
        # between the LSTMs were intended.
        for _ in range(self.num_layers):
            lstm = nn.LSTM(512, 512, dropout=self.dropout_prob, batch_first=True)
            modules.append(utils.RecurrentCuDNNWrapper(lstm))

        modules.extend([
            nn.Linear(512, 256),
            nn.Sigmoid(),
            nn.Dropout(p=self.dropout_prob),
            nn.Linear(256, sum(self.output_dims.values())),
        ])
        self.layers = utils.SequentialWithRecurrent(*modules)

        # `self.metrics` is not created in this method; presumably the base
        # class provides it -- confirm. Keyword order is kept as written.
        tracked = {
            'LF0_RMSE_Hz': metrics.LF0Distortion(),
            'VUV_accuracy': metrics.Mean(),
            'MCEP_distortion': metrics.MelCepDistortion(),
            'BAP_distortion': metrics.Distortion(),
        }
        self.metrics.add_metrics('all', **tracked)
Beispiel #6
0
    def __init__(self,
                 z_dim=16,
                 kld_weight=1.,
                 conditioning_dim=70 + 9,
                 output_dim=1 * 3,
                 dropout_prob=0.,
                 n_components=32,
                 pseudo_inputs_seq_lens=50,
                 pseudo_inputs_mean=0.05,
                 pseudo_inputs_std=0.01,
                 phone_set_file='data/unilex_phoneset.txt'):
        """Builds the encoder and decoder and registers the metrics.

        Encoder and decoder share the same body: linear (-> 256) with
        sigmoid, three GRUs with 64 hidden units each, and linear (64 -> 64)
        with sigmoid, with an `nn.Dropout` after every sigmoid and every GRU.
        The encoder body takes `output_dim` features and is wrapped in
        `vae.VAELayer(body, 64, self.z_dim)`. The decoder takes
        `conditioning_dim + z_dim` features and ends in a linear projection
        to `output_dim`.

        Args:
            z_dim: Forwarded to the base-class constructor; read back here as
                `self.z_dim` (presumably stored by the base class -- confirm).
            kld_weight: Forwarded to the base-class constructor.
            conditioning_dim: Added to `z_dim` to size the decoder's first
                linear layer.
            output_dim: Input feature size of the encoder, output feature
                size of the decoder, and the `input_dim` given to the base
                class.
            dropout_prob: Probability passed to each `nn.Dropout`.
            n_components: Forwarded to the base-class constructor; replaced
                when `pseudo_inputs_seq_lens` is a dict (see below).
            pseudo_inputs_seq_lens: Forwarded to the base-class constructor.
                If a dict is given, `self._make_seq_lens` converts it and
                returns the replacement value together with a new
                `n_components`.
            pseudo_inputs_mean: Forwarded to the base-class constructor.
            pseudo_inputs_std: Forwarded to the base-class constructor.
            phone_set_file: Stored on the instance as `self.phone_set_file`;
                not read in this constructor.
        """
        # This runs before the base constructor because its results are
        # arguments of that constructor.
        if isinstance(pseudo_inputs_seq_lens, dict):
            converted = self._make_seq_lens(pseudo_inputs_seq_lens)
            pseudo_inputs_seq_lens, n_components = converted

        base_kwargs = {
            'z_dim': z_dim,
            'kld_weight': kld_weight,
            'input_dim': output_dim,
            'n_components': n_components,
            'pseudo_inputs_seq_lens': pseudo_inputs_seq_lens,
            'pseudo_inputs_mean': pseudo_inputs_mean,
            'pseudo_inputs_std': pseudo_inputs_std,
        }
        super(F0_VAMP_VAE, self).__init__(**base_kwargs)

        self.conditioning_dim, self.output_dim = conditioning_dim, output_dim
        self.dropout_prob = dropout_prob
        self.phone_set_file = phone_set_file

        def shared_body(in_features):
            # Everything up to and including the dropout that follows the
            # 64 -> 64 layer.
            body = [
                nn.Linear(in_features, 256),
                nn.Sigmoid(),
                nn.Dropout(p=dropout_prob),
            ]
            for gru_input_size in (256, 64, 64):
                gru = nn.GRU(gru_input_size, 64, batch_first=True)
                body.append(utils.RecurrentCuDNNWrapper(gru))
                body.append(nn.Dropout(p=dropout_prob))
            body.extend([
                nn.Linear(64, 64),
                nn.Sigmoid(),
                nn.Dropout(p=dropout_prob),
            ])
            return body

        # The encoder is fully built before the decoder.
        encoder_body = utils.SequentialWithRecurrent(
            *shared_body(self.output_dim))
        self.encoder_layer = vae.VAELayer(encoder_body, 64, self.z_dim)

        decoder_modules = shared_body(self.conditioning_dim + self.z_dim)
        decoder_modules.append(nn.Linear(64, self.output_dim))
        self.decoder_layer = utils.SequentialWithRecurrent(*decoder_modules)

        # `self.metrics` is not created in this method; presumably the base
        # class provides it -- confirm. Keyword order is kept as written.
        tracked = {
            'LF0_RMSE_Hz': LF0Distortion(),
            'embeddings': TensorHistory(self.z_dim, hidden=True),
            'name': History(hidden=True),
            'n_segments': TensorHistory(1, dtype=torch.long, hidden=True),
            'segment_mean_F0': TensorHistory(1, hidden=True),
        }
        self.metrics.add_metrics('all', **tracked)