def __init__(self, input_dim=600+9, output_dim=1*3, dropout_prob=0.):
    """Initialises the layers and metrics of the recurrent F0 model.

    The network consists of a feed-forward input projection, three GRU
    layers, and a two-layer feed-forward output head. Dropout follows every
    stage except the final projection.

    Args:
        input_dim: Feature size accepted by the first `nn.Linear` layer.
        output_dim: Feature size produced by the last `nn.Linear` layer.
        dropout_prob: Probability given to every `nn.Dropout` layer.
    """
    super(F0_RNN, self).__init__()

    self.input_dim = input_dim
    self.output_dim = output_dim
    self.dropout_prob = dropout_prob

    # Modules are created in network order, matching their positions in the
    # sequential container.
    stack = [
        nn.Linear(self.input_dim, 256),
        nn.Sigmoid(),
        nn.Dropout(p=dropout_prob),
    ]

    # Three stacked GRUs; only the first one sees the 256-wide projection.
    for gru_in in (256, 64, 64):
        gru = nn.GRU(gru_in, 64, batch_first=True)
        stack.append(utils.RecurrentCuDNNWrapper(gru))
        stack.append(nn.Dropout(p=dropout_prob))

    stack.extend([
        nn.Linear(64, 64),
        nn.Sigmoid(),
        nn.Dropout(p=dropout_prob),
        nn.Linear(64, self.output_dim),
    ])

    self.recurrent_layers = utils.SequentialWithRecurrent(*stack)

    # Registers the log-F0 distortion metric under the name LF0_RMSE_Hz.
    self.metrics.add_metrics('all', LF0_RMSE_Hz=LF0Distortion())
def __init__(self, z_dim=16, kld_weight=1., conditioning_dim=600 + 9, output_dim=1 * 3,
             dropout_prob=0., latent=None):
    """Initialises the encoder, decoder and metrics of the VAE.

    Args:
        z_dim: Forwarded to the parent initialiser as `z_dim`.
        kld_weight: Forwarded to the parent initialiser as `kld_weight`.
        conditioning_dim: Size of the conditioning features. The decoder's
            first `nn.Linear` layer takes `conditioning_dim + z_dim` inputs.
        output_dim: Feature size produced by the decoder's last `nn.Linear`
            layer; also passed to `_Encoder`.
        dropout_prob: Probability given to every `nn.Dropout` layer and
            passed to `_Encoder`.
        latent: Stored unchanged on `self.latent`; not otherwise used here.
    """
    super(VAE, self).__init__(z_dim=z_dim, kld_weight=kld_weight)

    self.conditioning_dim = conditioning_dim
    self.output_dim = output_dim
    self.dropout_prob = dropout_prob
    self.latent = latent

    # `self.z_dim` is read back from the instance here; presumably the parent
    # initialiser sets it -- confirm against the base class.
    self.encoder_layer = _Encoder(
        self.conditioning_dim, self.output_dim, dropout_prob, self.z_dim)

    # Decoder modules are created in network order, matching their positions
    # in the sequential container.
    decoder_modules = [
        nn.Linear(self.conditioning_dim + self.z_dim, 256),
        nn.Sigmoid(),
        nn.Dropout(p=dropout_prob),
    ]

    # Three stacked GRUs; only the first one sees the 256-wide projection.
    for gru_in in (256, 64, 64):
        gru = nn.GRU(gru_in, 64, batch_first=True)
        decoder_modules.append(utils.RecurrentCuDNNWrapper(gru))
        decoder_modules.append(nn.Dropout(p=dropout_prob))

    decoder_modules.extend([
        nn.Linear(64, 64),
        nn.Sigmoid(),
        nn.Dropout(p=dropout_prob),
        nn.Linear(64, self.output_dim),
    ])

    self.decoder_layer = utils.SequentialWithRecurrent(*decoder_modules)

    # Registers the log-F0 distortion metric under the name LF0_RMSE_Hz.
    self.metrics.add_metrics('all', LF0_RMSE_Hz=LF0Distortion())
def __init__(self, z_dim=16, conditioning_dim=70+9, output_dim=1*3, dropout_prob=0.,
             phone_set_file='data/unilex_phoneset.txt'):
    """Initialises the encoder, decoder and metrics of the F0 auto-encoder.

    The encoder and decoder share the same layout: an input projection,
    three GRU layers and a two-layer output head. The encoder maps
    `output_dim` features to `z_dim`; the decoder maps
    `conditioning_dim + z_dim` features back to `output_dim`.

    Args:
        z_dim: Size of the encoder output (the embedding).
        conditioning_dim: Size of the conditioning features fed to the
            decoder alongside the embedding.
        output_dim: Feature size consumed by the encoder and produced by the
            decoder.
        dropout_prob: Probability given to every `nn.Dropout` layer.
        phone_set_file: Stored unchanged on `self.phone_set_file`; not
            otherwise used here.
    """
    super(F0_AE, self).__init__()

    self.z_dim = z_dim
    self.conditioning_dim = conditioning_dim
    self.output_dim = output_dim
    self.dropout_prob = dropout_prob
    self.phone_set_file = phone_set_file

    def recurrent_trunk(in_features):
        """Builds every layer of a stack except its final projection."""
        trunk = [
            nn.Linear(in_features, 256),
            nn.Sigmoid(),
            nn.Dropout(p=dropout_prob),
        ]
        # Three stacked GRUs; only the first one sees the 256-wide projection.
        for gru_in in (256, 64, 64):
            gru = nn.GRU(gru_in, 64, batch_first=True)
            trunk.append(utils.RecurrentCuDNNWrapper(gru))
            trunk.append(nn.Dropout(p=dropout_prob))
        trunk.extend([
            nn.Linear(64, 64),
            nn.Sigmoid(),
            nn.Dropout(p=dropout_prob),
        ])
        return trunk

    # The encoder is built (and registered) before the decoder so that the
    # module creation order is unchanged.
    encoder_modules = recurrent_trunk(self.output_dim)
    encoder_modules.append(nn.Linear(64, self.z_dim))
    self.encoder_layer = utils.SequentialWithRecurrent(*encoder_modules)

    decoder_modules = recurrent_trunk(self.conditioning_dim + self.z_dim)
    decoder_modules.append(nn.Linear(64, self.output_dim))
    self.decoder_layer = utils.SequentialWithRecurrent(*decoder_modules)

    # Besides the log-F0 distortion, several history trackers are registered;
    # all of them are created with `hidden=True`.
    self.metrics.add_metrics(
        'all',
        LF0_RMSE_Hz=LF0Distortion(),
        embeddings=TensorHistory(self.z_dim, hidden=True),
        name=History(hidden=True),
        n_segments=TensorHistory(1, dtype=torch.long, hidden=True),
        segment_mean_F0=TensorHistory(1, hidden=True),
    )
def __init__(self, z_dim=16, kld_weight=1., conditioning_dim=70 + 9, output_dim=1 * 3,
             dropout_prob=0., n_components=32, pseudo_inputs_seq_lens=50,
             pseudo_inputs_mean=0.05, pseudo_inputs_std=0.01,
             phone_set_file='data/unilex_phoneset.txt'):
    """Initialises the encoder, decoder and metrics of the F0 VAMP-prior VAE.

    Args:
        z_dim: Forwarded to the parent initialiser as `z_dim`.
        kld_weight: Forwarded to the parent initialiser as `kld_weight`.
        conditioning_dim: Size of the conditioning features. The decoder's
            first `nn.Linear` layer takes `conditioning_dim + z_dim` inputs.
        output_dim: Feature size consumed by the encoder and produced by the
            decoder; also forwarded to the parent initialiser as `input_dim`.
        dropout_prob: Probability given to every `nn.Dropout` layer.
        n_components: Forwarded to the parent initialiser. Overridden by the
            result of `self._make_seq_lens` when `pseudo_inputs_seq_lens` is
            a dict.
        pseudo_inputs_seq_lens: Forwarded to the parent initialiser. A dict
            is first converted with `self._make_seq_lens`.
        pseudo_inputs_mean: Forwarded to the parent initialiser.
        pseudo_inputs_std: Forwarded to the parent initialiser.
        phone_set_file: Stored unchanged on `self.phone_set_file`; not
            otherwise used here.
    """
    # A dict specification is expanded into the sequence lengths and the
    # matching component count before anything reaches the parent class.
    if isinstance(pseudo_inputs_seq_lens, dict):
        converted = self._make_seq_lens(pseudo_inputs_seq_lens)
        pseudo_inputs_seq_lens, n_components = converted

    super(F0_VAMP_VAE, self).__init__(
        z_dim=z_dim,
        kld_weight=kld_weight,
        input_dim=output_dim,
        n_components=n_components,
        pseudo_inputs_seq_lens=pseudo_inputs_seq_lens,
        pseudo_inputs_mean=pseudo_inputs_mean,
        pseudo_inputs_std=pseudo_inputs_std,
    )

    self.conditioning_dim = conditioning_dim
    self.output_dim = output_dim
    self.dropout_prob = dropout_prob
    self.phone_set_file = phone_set_file

    def recurrent_trunk(in_features):
        """Builds every layer of a stack except its final projection."""
        trunk = [
            nn.Linear(in_features, 256),
            nn.Sigmoid(),
            nn.Dropout(p=dropout_prob),
        ]
        # Three stacked GRUs; only the first one sees the 256-wide projection.
        for gru_in in (256, 64, 64):
            gru = nn.GRU(gru_in, 64, batch_first=True)
            trunk.append(utils.RecurrentCuDNNWrapper(gru))
            trunk.append(nn.Dropout(p=dropout_prob))
        trunk.extend([
            nn.Linear(64, 64),
            nn.Sigmoid(),
            nn.Dropout(p=dropout_prob),
        ])
        return trunk

    # The encoder trunk ends at width 64; `vae.VAELayer` receives that width
    # together with `self.z_dim`. `self.z_dim` is read back from the
    # instance; presumably the parent initialiser sets it -- confirm against
    # the base class.
    shared_encoder = utils.SequentialWithRecurrent(*recurrent_trunk(self.output_dim))
    self.encoder_layer = vae.VAELayer(shared_encoder, 64, self.z_dim)

    decoder_modules = recurrent_trunk(self.conditioning_dim + self.z_dim)
    decoder_modules.append(nn.Linear(64, self.output_dim))
    self.decoder_layer = utils.SequentialWithRecurrent(*decoder_modules)

    # Besides the log-F0 distortion, several history trackers are registered;
    # all of them are created with `hidden=True`.
    self.metrics.add_metrics(
        'all',
        LF0_RMSE_Hz=LF0Distortion(),
        embeddings=TensorHistory(self.z_dim, hidden=True),
        name=History(hidden=True),
        n_segments=TensorHistory(1, dtype=torch.long, hidden=True),
        segment_mean_F0=TensorHistory(1, hidden=True),
    )