Example #1
 def __init__(self, hparams):
     super(Tacotron2, self).__init__()
     self.mask_padding = hparams.mask_padding
     self.fp16_run = hparams.fp16_run
     self.use_vae = hparams.use_vae
     self.embedding_variation = hparams.embedding_variation
     self.label_type = hparams.label_type
     self.n_mel_channels = hparams.n_mel_channels
     self.n_frames_per_step = hparams.n_frames_per_step
     self.symbols_embedding_dim = hparams.symbols_embedding_dim
     self.speaker_embedding_dim = hparams.speaker_embedding_dim
     self.emotion_embedding_dim = hparams.emotion_embedding_dim
     self.transcript_embedding = nn.Embedding(hparams.n_symbols,
                                              hparams.symbols_embedding_dim)
     if self.use_vae:
         if self.label_type == 'one-hot':
             self.speaker_embedding = LinearNorm(
                 hparams.n_speakers,
                 hparams.speaker_embedding_dim,
                 bias=True,
                 w_init_gain='tanh')
             self.emotion_embedding = LinearNorm(
                 hparams.n_emotions,
                 hparams.emotion_embedding_dim,
                 bias=True,
                 w_init_gain='tanh')
         elif self.label_type == 'id':
             self.speaker_embedding = nn.Embedding(
                 hparams.n_speakers, hparams.speaker_embedding_dim)
             self.emotion_embedding = nn.Embedding(
                 hparams.n_emotions, hparams.emotion_embedding_dim)
     self.vae_input_type = hparams.vae_input_type
     std = sqrt(2.0 / (hparams.n_symbols + hparams.symbols_embedding_dim))
     val = sqrt(3.0) * std  # uniform bounds for std
     self.transcript_embedding.weight.data.uniform_(-val, val)
     self.encoder = Encoder(hparams)
     self.decoder = Decoder(hparams)
     self.postnet = Postnet(hparams)
     self.vae_gst = VAE_GST(hparams)
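
The two lines computing std and val above implement Xavier/Glorot uniform initialization by hand: a uniform(-val, val) draw with val = sqrt(3) * std has standard deviation std = sqrt(2 / (fan_in + fan_out)). A minimal sketch checking this against torch.nn.init (the dimensions are assumed, typical Tacotron 2 values):

import torch
from math import sqrt

n_symbols, symbols_embedding_dim = 148, 512  # assumed typical values
std = sqrt(2.0 / (n_symbols + symbols_embedding_dim))
val = sqrt(3.0) * std  # uniform(-val, val) has stddev == std

emb = torch.nn.Embedding(n_symbols, symbols_embedding_dim)
torch.nn.init.xavier_uniform_(emb.weight)  # same scheme, done by torch
assert abs(val - sqrt(6.0 / (n_symbols + symbols_embedding_dim))) < 1e-12
print(std, emb.weight.std().item())  # the two values should be close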
Example #2
    def __init__(self, hparams):
        super(Decoder, self).__init__()
        self.n_mel_channels = hparams.n_mel_channels
        self.n_frames_per_step = hparams.n_frames_per_step
        self.encoder_embedding_dim = hparams.encoder_embedding_dim
        self.attention_rnn_dim = hparams.attention_rnn_dim
        self.decoder_rnn_dim = hparams.decoder_rnn_dim
        self.prenet_dim = hparams.prenet_dim
        self.max_decoder_steps = hparams.max_decoder_steps
        self.gate_threshold = hparams.gate_threshold
        self.p_attention_dropout = hparams.p_attention_dropout
        self.p_decoder_dropout = hparams.p_decoder_dropout

        self.prenet = Prenet(
            hparams.n_mel_channels * hparams.n_frames_per_step,
            [hparams.prenet_dim, hparams.prenet_dim])

        self.attention_rnn = nn.LSTMCell(
            hparams.prenet_dim + hparams.encoder_embedding_dim,
            hparams.attention_rnn_dim)

        self.attention_layer = Attention(
            hparams.attention_rnn_dim, hparams.encoder_embedding_dim,
            hparams.attention_dim, hparams.attention_location_n_filters,
            hparams.attention_location_kernel_size)

        self.decoder_rnn = nn.LSTMCell(
            hparams.attention_rnn_dim + hparams.encoder_embedding_dim,
            hparams.decoder_rnn_dim, bias=True)

        self.linear_projection = LinearNorm(
            hparams.decoder_rnn_dim + hparams.encoder_embedding_dim,
            hparams.n_mel_channels * hparams.n_frames_per_step)

        self.gate_layer = LinearNorm(hparams.decoder_rnn_dim +
                                     hparams.encoder_embedding_dim,
                                     1,
                                     bias=True,
                                     w_init_gain='sigmoid')
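
LinearNorm and ConvNorm come from a layers.py module that this listing does not show. A minimal sketch of LinearNorm as it usually appears in Tacotron 2 codebases (assumed, not verified against these repos): nn.Linear plus Xavier-uniform initialization scaled by the gain of the nonlinearity that follows.

import torch
from torch import nn

class LinearNorm(nn.Module):
    # sketch: nn.Linear with Xavier-uniform init; w_init_gain names the
    # nonlinearity that follows, so the init variance is scaled accordingly
    def __init__(self, in_dim, out_dim, bias=True, w_init_gain='linear'):
        super().__init__()
        self.linear_layer = nn.Linear(in_dim, out_dim, bias=bias)
        nn.init.xavier_uniform_(
            self.linear_layer.weight,
            gain=nn.init.calculate_gain(w_init_gain))

    def forward(self, x):
        return self.linear_layer(x)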
Example #3
    def __init__(self, config):
        super(Decoder, self).__init__()
        self.n_mel_channels = config["n_mel_channels"]
        self.n_frames_per_step = config["n_frames_per_step"]
        self.encoder_embedding_dim = config["encoder_embedding_dim"]
        self.attention_rnn_dim = config["attention_rnn_dim"]
        self.decoder_rnn_dim = config["decoder_rnn_dim"]
        self.prenet_dim = config["prenet_dim"]
        self.max_decoder_steps = config["max_decoder_steps"]
        self.gate_threshold = config["gate_threshold"]
        self.p_attention_dropout = config["p_attention_dropout"]
        self.p_decoder_dropout = config["p_decoder_dropout"]

        self.prenet = Prenet(
            config["n_mel_channels"] * config["n_frames_per_step"],
            [config["prenet_dim"], config["prenet_dim"]])

        self.attention_rnn = nn.LSTMCell(
            config["prenet_dim"] + config["encoder_embedding_dim"],
            config["attention_rnn_dim"])

        self.attention_layer = Attention(
            config["attention_rnn_dim"], config["encoder_embedding_dim"],
            config["attention_dim"], config["attention_location_n_filters"],
            config["attention_location_kernel_size"])

        self.decoder_rnn = nn.LSTMCell(
            config["attention_rnn_dim"] + config["encoder_embedding_dim"],
            config["decoder_rnn_dim"], bias=True)

        self.linear_projection = LinearNorm(
            config["decoder_rnn_dim"] + config["encoder_embedding_dim"],
            config["n_mel_channels"] * config["n_frames_per_step"])

        self.gate_layer = LinearNorm(config["decoder_rnn_dim"] +
                                     config["encoder_embedding_dim"],
                                     1,
                                     bias=True,
                                     w_init_gain='sigmoid')
Example #4
 def __init__(self,
              attention_rnn_dim,
              embedding_dim,
              attention_dim,
              attention_location_n_filters,
              attention_location_kernel_size,
              prune_rate=0):
     super(Attention, self).__init__()
     self.query_layer = LinearNorm(attention_rnn_dim,
                                   attention_dim,
                                   bias=False,
                                   w_init_gain='tanh')
     self.memory_layer = LinearNorm(embedding_dim,
                                    attention_dim,
                                    bias=False,
                                    w_init_gain='tanh')
     self.v = LinearNorm(attention_dim, 1, bias=False)
     self.location_layer = LocationLayer(attention_location_n_filters,
                                         attention_location_kernel_size,
                                         attention_dim)
     self.score_mask_value = -float("inf")
     self.prune_rate = prune_rate
Example #5
 def __init__(self, attention_rnn_dim, embedding_dim, attention_dim,
              attention_location_n_filters, attention_location_kernel_size):
     super(Attention, self).__init__()
     self.query_layer = LinearNorm(attention_rnn_dim,
                                   attention_dim,
                                   bias=False,
                                   w_init_gain='tanh')
     # if hparams.style == 'speaker_encoder':
     #     embedding_dim += 256
     # elif hparams.style == 'style_embedding':
     #     embedding_dim += 128
     # elif hparams.style == 'both':
     #     embedding_dim += 256 + 128
     self.memory_layer = LinearNorm(embedding_dim,
                                    attention_dim,
                                    bias=False,
                                    w_init_gain='tanh')
     self.v = LinearNorm(attention_dim, 1, bias=False)
     self.location_layer = LocationLayer(attention_location_n_filters,
                                         attention_location_kernel_size,
                                         attention_dim)
     self.score_mask_value = -float("inf")
Example #6
    def __init__(self, in_dim, sizes, hparams):
        super(Prenet, self).__init__()
        in_sizes = [in_dim] + sizes[:-1]
        self.layers = nn.ModuleList([
            LinearNorm(in_size, out_size, bias=False)
            for (in_size, out_size) in zip(in_sizes, sizes)
        ])

        self.convolutions = nn.Sequential(
            ConvNorm(hparams.prenet_dim,
                     hparams.prenet_dim,
                     kernel_size=hparams.audio_kernel_size,
                     stride=hparams.audio_stride,
                     w_init_gain='relu'),
            nn.BatchNorm1d(hparams.prenet_dim))
Example #7
    def __init__(self, query_dim, keys_dim, attention_dim,
                 attention_location_n_filters, attention_location_kernel_size):
        super(Attention, self).__init__()
        # Classic additive attention projects query and keys, then scores with v^T . tanh(W * query + V * keys)
        # query_layer and memory_layer compute W * query and V * keys, respectively
        # w_init_gain='tanh' because both projections are wrapped in tanh(W * query + V * keys)
        self.query_layer = LinearNorm(query_dim,
                                      attention_dim,
                                      bias=False,
                                      w_init_gain='tanh')
        self.memory_layer = LinearNorm(keys_dim,
                                       attention_dim,
                                       bias=False,
                                       w_init_gain='tanh')
        # Besides the classic terms, this attention also convolves the attention weights (location-sensitive)
        self.location_layer = LocationLayer(attention_location_n_filters,
                                            attention_location_kernel_size,
                                            attention_dim)
        self.EOS_embedding_layer = nn.Embedding(1, attention_dim)

        self.v = LinearNorm(attention_dim, 1, bias=False)

        self.score_mask_value = -float("inf")
Example #8
    def __init__(self, in_dim, sizes, p_prenet_dropout, prenet_batchnorm):
        super(Prenet, self).__init__()
        in_sizes = [in_dim] + sizes[:-1]
        self.layers = nn.ModuleList([
            LinearNorm(in_size, out_size, bias=False)
            for (in_size, out_size) in zip(in_sizes, sizes)
        ])
        self.p_prenet_dropout = p_prenet_dropout
        self.prenet_batchnorm = prenet_batchnorm
        self.p_prenet_input_dropout = 0

        if self.prenet_batchnorm:
            self.batchnorms = nn.ModuleList(
                [nn.BatchNorm1d(size) for size in sizes])
Example #9
 def __init__(self, attention_n_filters, attention_kernel_size,
              attention_dim):
     super(LocationLayer, self).__init__()
     padding = int((attention_kernel_size - 1) / 2)
     self.location_conv = ConvNorm(2,
                                   attention_n_filters,
                                   kernel_size=attention_kernel_size,
                                   padding=padding,
                                   bias=False,
                                   stride=1,
                                   dilation=1)
     self.location_dense = LinearNorm(attention_n_filters,
                                      attention_dim,
                                      bias=False,
                                      w_init_gain='tanh')
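
For context, the forward pass that usually goes with this layer (a sketch following the NVIDIA reference implementation; it is not shown in this example):

 def forward(self, attention_weights_cat):
     processed = self.location_conv(attention_weights_cat)  # (B, n_filters, T)
     processed = processed.transpose(1, 2)                  # (B, T, n_filters)
     return self.location_dense(processed)                  # (B, T, attention_dim)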
Example #10
 def __init__(self, hparams):
     super(BERT_Tacotron2, self).__init__()
     self.mask_padding = hparams.mask_padding
     self.n_mel_channels = hparams.n_mel_channels
     self.n_frames_per_step = hparams.n_frames_per_step
     self.embedding = nn.Embedding(hparams.n_symbols,
                                   hparams.symbols_embedding_dim)
     std = sqrt(2.0 / (hparams.n_symbols + hparams.symbols_embedding_dim))
     val = sqrt(3.0) * std  # uniform bounds for std
     self.embedding.weight.data.uniform_(-val, val)
     self.encoder = Encoder(hparams)
     self.linear_converter = LinearNorm(
         hparams.encoder_embedding_dim + hparams.BERT_embedding_dim,
         hparams.encoder_embedding_dim)
     self.decoder = Decoder(hparams)
     self.postnet = Postnet(hparams)
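
How linear_converter is applied is not shown here. Presumably the BERT features are concatenated with the encoder outputs along the feature axis and projected back down to encoder_embedding_dim; a hypothetical sketch (bert_embeddings is an assumed name):

# hypothetical fusion step inside BERT_Tacotron2.forward(), for illustration only
encoder_outputs = self.encoder(embedded_inputs, input_lengths)  # (B, T, enc_dim)
fused = torch.cat([encoder_outputs, bert_embeddings], dim=-1)   # (B, T, enc_dim + BERT_dim)
encoder_outputs = self.linear_converter(fused)                  # (B, T, enc_dim)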
Example #12
 def __init__(self, attention_n_filters, attention_kernel_size,
              attention_dim):
     super(LocationLayer, self).__init__()  # does every class's __init__ need this super() call?
     padding = int((attention_kernel_size - 1) / 2)
     self.location_conv = ConvNorm(
         2,
         attention_n_filters,  # ConvNorm is a helper class defined in layers.py
         kernel_size=attention_kernel_size,
         padding=padding,
         bias=False,
         stride=1,
         dilation=1)
     self.location_dense = LinearNorm(
         attention_n_filters,
         attention_dim,  # LinearNorm is a helper class defined in layers.py
         bias=False,
         w_init_gain='tanh')
Example #13
    def __init__(self, num_mixtures, attention_rnn_dim, embedding_dim, attention_dim,
                 attention_location_n_filters, attention_location_kernel_size):
        super(GMMAttention, self).__init__()
        self.num_mixtures = num_mixtures
        lin = nn.Linear(attention_dim, 3*num_mixtures, bias=True)
        lin.weight.data.mul_(0.001)
        lin.bias.data.mul_(0.001)
        lin.bias.data.sub_(2.)

        self.F = nn.Sequential(
                LinearNorm(attention_rnn_dim, attention_dim, bias=True, w_init_gain='tanh'),
                nn.Tanh(),
                lin)
                # LinearNorm(attention_dim, 3*num_mixtures, bias=False, w_init_gain='linear'))

        self.score_mask_value = 0 # -float("inf")
        self.register_buffer('pos', torch.arange(
            0, 2000, dtype=torch.float).view(1, -1, 1).data)
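
self.F emits 3 * num_mixtures values per decoder step. A hedged sketch of the Graves-style mixture alignment such values usually parameterize (mixture weights, monotonic position shifts, widths scored against the buffered pos grid); it is a fragment of a hypothetical forward step, attention_hidden, prev_kappa, and memory are assumed names, and the exact parameterization in this repo may differ:

# hedged sketch, not necessarily this repo's exact parameterization
params = self.F(attention_hidden)                        # (B, 3 * M)
w_hat, delta_hat, sigma_hat = params.chunk(3, dim=-1)    # each (B, M)
w = torch.softmax(w_hat, dim=-1).unsqueeze(1)            # mixture weights
kappa = prev_kappa + torch.nn.functional.softplus(delta_hat).unsqueeze(1)
sigma = torch.nn.functional.softplus(sigma_hat).unsqueeze(1)
pos = self.pos[:, :memory.size(1)]                       # (1, T_in, 1)
alignment = (w * torch.exp(-(pos - kappa) ** 2 /
                           (2 * sigma ** 2))).sum(-1)    # (B, T_in)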
Example #14
    def __init__(self, hparams):
        super(GST, self).__init__()
        self.token_embedding_size = hparams.token_embedding_size
        self.token_num = hparams.token_num
        self.torchMoji_linear = hparams.torchMoji_linear

        if hparams.token_activation_func == 'softmax':
            self.activation_fn = 0
        elif hparams.token_activation_func == 'sigmoid':
            self.activation_fn = 1
        elif hparams.token_activation_func == 'tanh':
            self.activation_fn = 2
        elif hparams.token_activation_func == 'absolute':
            self.activation_fn = 3
        else:
            raise ValueError(
                f'token_activation_func of {hparams.token_activation_func} is invalid.\n'
                'Please use "softmax", "sigmoid", "tanh" or "absolute".')

        self.token_embedding = nn.Parameter(
            torch.zeros([self.token_num,
                         self.token_embedding_size]))  # (token_num, Embedding)
        init.normal_(self.token_embedding, mean=0., std=0.5)
        # init.orthogonal_(self.token_embedding)
        self.ref_encoder = ReferenceEncoder(hparams, activation_fn=torch.tanh)
        self.att = MultiHeadAttention(hparams)

        # torchMoji
        if self.torchMoji_linear:
            self.map_lin = LinearNorm(hparams.torchMoji_attDim, self.token_num)

        self.p_drop_tokens = hparams.p_drop_tokens
        self.drop_tokens_mode = hparams.drop_tokens_mode
        if self.drop_tokens_mode == 'embedding':
            self.embedding = nn.Embedding(1, self.token_num)
        elif self.drop_tokens_mode == 'speaker_embedding':
            self.speaker_embedding = nn.Embedding(hparams.n_speakers,
                                                  self.token_num)
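
A hedged sketch of the forward pass such a GST module usually has: the reference encoder summarizes the mel spectrogram into a query, which attends over the (activated) token bank. Names like ref_mels and the call signature of self.att are assumptions; this repo's forward is not shown.

    # hedged sketch of a typical GST forward
    def forward(self, ref_mels):
        query = self.ref_encoder(ref_mels).unsqueeze(1)      # (B, 1, ref_dim)
        tokens = torch.tanh(self.token_embedding)            # activation_fn == 2;
                                                             # softmax/sigmoid/abs otherwise
        keys = tokens.unsqueeze(0).expand(ref_mels.size(0), -1, -1)
        return self.att(query, keys)                         # (B, 1, token_embedding_size)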
Example #15
    def __init__(self, hparams):
        super().__init__()

        self.hidden1 = LinearNorm(64, 256)
        self.output = LinearNorm(256, hparams.speaker_num)
Example #16
 def __init__(self, in_dim, sizes):
     super(Prenet, self).__init__()
     in_sizes = [in_dim] + sizes[:-1]
     self.layers = nn.ModuleList(
         [LinearNorm(in_size, out_size, bias=False)
          for (in_size, out_size) in zip(in_sizes, sizes)])
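
The forward pass matching this Prenet (in the NVIDIA reference implementation) keeps dropout active even at inference time, which is a deliberate part of Tacotron 2's design; the sketch assumes torch.nn.functional is imported as F:

 def forward(self, x):
     for linear in self.layers:
         # training=True on purpose: prenet dropout stays on at inference
         x = F.dropout(F.relu(linear(x)), p=0.5, training=True)
     return x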
Example #17
class GMVAE_revised(nn.Module):
    def __init__(self, hparams, supervised=False):
        super(GMVAE_revised, self).__init__()
        self.latent_embedding_dim = hparams.latent_embedding_dim
        self.supervised = supervised
        convolutions = []
        conv_layer_1 = nn.Sequential(
            ConvNorm(hparams.n_mel_channels,
                     hparams.latent_embedding_dim,
                     kernel_size=hparams.latent_kernel_size,
                     stride=1,
                     padding=int((hparams.latent_kernel_size - 1) / 2),
                     dilation=1,
                     w_init_gain='relu'),
            nn.BatchNorm1d(hparams.latent_embedding_dim))
        convolutions.append(conv_layer_1)

        conv_layer_2 = nn.Sequential(
            ConvNorm(hparams.latent_embedding_dim,
                     hparams.latent_embedding_dim,
                     kernel_size=hparams.latent_kernel_size,
                     stride=1,
                     padding=int((hparams.latent_kernel_size - 1) / 2),
                     dilation=1,
                     w_init_gain='relu'),
            nn.BatchNorm1d(hparams.latent_embedding_dim))
        convolutions.append(conv_layer_2)

        self.convolutions = nn.ModuleList(convolutions)

        self.lstm = nn.LSTM(hparams.latent_embedding_dim,
                            int(hparams.latent_embedding_dim / 2),
                            1,
                            batch_first=True,
                            bidirectional=True)

        # self.mean_pool = nn.AvgPool1d(hparams.latent_kernel_size, stride=1)
        #
        # self.mean_pool_out_size = hparams.latent_embedding_dim - hparams.latent_kernel_size + 1

        self.linear_projection = LinearNorm(
            hparams.latent_embedding_dim,
            int(hparams.latent_embedding_dim / 2))

        self.linear_projection_mean = LinearNorm(
            int(hparams.latent_embedding_dim / 2), hparams.latent_out_dim)

        self.linear_projection_variance = LinearNorm(
            int(hparams.latent_embedding_dim / 2), hparams.latent_out_dim)

        self.fc3 = nn.Linear(hparams.latent_out_dim,
                             int(hparams.latent_embedding_dim / 2))

        self.fc4 = nn.Linear(int(hparams.latent_embedding_dim / 2),
                             hparams.latent_embedding_dim)

    def parse_batch(self, batch):
        if self.supervised:
            (text_padded, input_lengths, mel_padded, gate_padded,
             output_lengths, mel_padded_512, gate_padded_512,
             output_lengths_512, labels) = batch
        else:
            (text_padded, input_lengths, mel_padded, gate_padded,
             output_lengths, mel_padded_512, gate_padded_512,
             output_lengths_512) = batch
        text_padded = to_gpu(text_padded).long()
        input_lengths = to_gpu(input_lengths).long()
        max_len = torch.max(input_lengths.data).item()
        mel_padded = to_gpu(mel_padded).float()
        gate_padded = to_gpu(gate_padded).float()
        output_lengths = to_gpu(output_lengths).long()
        mel_padded_512 = to_gpu(mel_padded_512).float()
        gate_padded_512 = to_gpu(gate_padded_512).float()
        output_lengths_512 = to_gpu(output_lengths_512).long()

        return ((text_padded, input_lengths, mel_padded, max_len,
                 output_lengths, mel_padded), (mel_padded, gate_padded))

    def vae_encode(self, inputs, label=None):
        _, _, x, _, _, _ = inputs  # third element is the padded mel batch
        for conv in self.convolutions:
            x = F.dropout(F.relu(conv(x)), 0.5, self.training)
        x = x.transpose(1, 2)
        out, _ = self.lstm(x)
        out = torch.mean(out, dim=1)  # mean-pool the BiLSTM outputs over time
        x_after_mean = out
        # label is currently unused; a torch.cat([out, label], 1) variant is disabled
        out = self.linear_projection(out)
        mean = self.linear_projection_mean(out)
        variance = self.linear_projection_variance(out)
        return mean, variance, x_after_mean

    def reparameterize(self, mu, logvar):
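        # reparameterization trick: z = mu + std * eps with std = exp(0.5 * logvar),
        # so sampling stays differentiable w.r.t. mu and logvar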
        std = torch.exp(0.5 * logvar)
        eps = torch.randn_like(std)
        return mu + eps * std

    def decode(self, z, label=None):
        # label is currently unused; a torch.cat([z, label], 1) variant is disabled
        h3 = F.relu(self.fc3(z))
        return torch.sigmoid(self.fc4(h3))

    def forward(self, x, label=None):
        mu, logvar, x_after_mean = self.vae_encode(x, label)
        z = self.reparameterize(mu, logvar)
        return self.decode(z, label), mu, logvar, x_after_mean

    def generate_sample(self, x):
        mu, logvar, _ = self.vae_encode(x)
        # scale is a standard deviation: exp(0.5 * logvar), matching reparameterize()
        return Normal(mu, (0.5 * logvar).exp()).sample(
            (1, x[2].shape[2])).squeeze(dim=0)
Example #18
    def __init__(self, hparams):
        super(speaker_classifier, self).__init__()

        self.hidden1 = LinearNorm(hparams.z_speaker_dim, 256)
        self.output = LinearNorm(256, hparams.speaker_num)
Example #19
 def __init__(self, input_dims, hidden_dims):
     super(vectorBased_selfAttention, self).__init__()
     self.layers1 = LinearNorm(input_dims, hidden_dims)
     self.layers2 = LinearNorm(hidden_dims, input_dims)
Example #20
    def __init__(self, hparams):
        super(Decoder, self).__init__()
        self.mellotron = hparams.mellotron
        self.disable_f0 = hparams.disable_f0
        self.n_mel_channels = hparams.n_mel_channels
        self.n_frames_per_step = hparams.n_frames_per_step
        self.encoder_dim = (hparams.encoder_LSTM_dim +
                            hparams.token_embedding_size +
                            hparams.speaker_embedding_dim)
        self.attention_rnn_dim = hparams.attention_rnn_dim
        self.decoder_rnn_dim = hparams.decoder_rnn_dim
        self.prenet_dim = hparams.prenet_dim
        self.prenet_layers = hparams.prenet_layers
        self.prenet_batchnorm = hparams.prenet_batchnorm
        self.p_prenet_dropout = hparams.p_prenet_dropout
        self.max_decoder_steps = hparams.max_decoder_steps
        self.gate_threshold = hparams.gate_threshold
        self.AttRNN_extra_decoder_input = hparams.AttRNN_extra_decoder_input
        self.AttRNN_hidden_dropout_type = hparams.AttRNN_hidden_dropout_type
        self.p_AttRNN_hidden_dropout = hparams.p_AttRNN_hidden_dropout
        self.p_AttRNN_cell_dropout = hparams.p_AttRNN_cell_dropout
        self.DecRNN_hidden_dropout_type = hparams.DecRNN_hidden_dropout_type
        self.p_DecRNN_hidden_dropout = hparams.p_DecRNN_hidden_dropout
        self.p_DecRNN_cell_dropout = hparams.p_DecRNN_cell_dropout
        self.p_teacher_forcing = hparams.p_teacher_forcing
        self.teacher_force_till = hparams.teacher_force_till
        self.num_att_mixtures = hparams.num_att_mixtures
        self.extra_projection = hparams.extra_projection
        self.normalize_attention_input = hparams.normalize_attention_input
        self.normalize_AttRNN_output = hparams.normalize_AttRNN_output
        self.attention_type = hparams.attention_type
        self.attention_layers = hparams.attention_layers
        self.low_vram_inference = hparams.low_vram_inference
        self.context_frames = hparams.context_frames
        self.hide_startstop_tokens = hparams.hide_startstop_tokens

        attention_rnn_in_dim = hparams.prenet_dim + self.encoder_dim
        if not self.disable_f0:
            self.prenet_f0 = ConvNorm(
                1,
                hparams.prenet_f0_dim,
                kernel_size=hparams.prenet_f0_kernel_size,
                padding=max(0, int(hparams.prenet_f0_kernel_size / 2)),
                bias=False,
                stride=1,
                dilation=1)
            attention_rnn_in_dim += hparams.prenet_f0_dim

        self.prenet = Prenet(
            hparams.n_mel_channels * hparams.n_frames_per_step *
            self.context_frames, [hparams.prenet_dim] * hparams.prenet_layers,
            self.p_prenet_dropout, self.prenet_batchnorm)

        if self.AttRNN_extra_decoder_input:
            attention_rnn_in_dim += hparams.decoder_rnn_dim

        if self.AttRNN_hidden_dropout_type == 'dropout':
            self.attention_rnn = nn.LSTMCell(
                attention_rnn_in_dim,  # input_size
                hparams.attention_rnn_dim)  # hidden_size)
        elif self.AttRNN_hidden_dropout_type == 'zoneout':
            self.attention_rnn = LSTMCellWithZoneout(
                attention_rnn_in_dim,  # input_size
                hparams.attention_rnn_dim,  # hidden_size
                zoneout_prob=self.p_AttRNN_hidden_dropout)
            self.p_AttRNN_hidden_dropout = 0.0  # zoneout is applied inside LSTMCellWithZoneout, so no extra dropout is needed

        if self.attention_type == 0:
            self.attention_layer = Attention(
                hparams.attention_rnn_dim, self.encoder_dim,
                hparams.attention_dim, hparams.attention_location_n_filters,
                hparams.attention_location_kernel_size)
        elif self.attention_type == 1:
            self.attention_layer = GMMAttention(
                hparams.num_att_mixtures, hparams.attention_layers,
                hparams.attention_rnn_dim, self.encoder_dim,
                hparams.attention_dim, hparams.attention_location_n_filters,
                hparams.attention_location_kernel_size, hparams)
        else:
            raise NotImplementedError(
                "attention_type invalid; valid values are 0 and 1")

        if self.DecRNN_hidden_dropout_type == 'dropout':
            self.decoder_rnn = nn.LSTMCell(
                hparams.attention_rnn_dim + self.encoder_dim,  # input_size
                hparams.decoder_rnn_dim,  # hidden_size
                bias=True)
        elif self.DecRNN_hidden_dropout_type == 'zoneout':
            self.decoder_rnn = LSTMCellWithZoneout(
                hparams.attention_rnn_dim + self.encoder_dim,  # input_size
                hparams.decoder_rnn_dim,  # hidden_size
                1,  # bias
                zoneout_prob=self.p_DecRNN_hidden_dropout)
            self.p_DecRNN_hidden_dropout = 0.0  # zoneout is applied inside LSTMCellWithZoneout, so no extra dropout is needed

        if self.extra_projection:
            self.linear_projection_pre = LinearNorm(
                hparams.decoder_rnn_dim + self.encoder_dim,
                hparams.decoder_rnn_dim + self.encoder_dim)

        self.linear_projection = LinearNorm(
            hparams.decoder_rnn_dim + self.encoder_dim,
            hparams.n_mel_channels * hparams.n_frames_per_step)

        self.gate_layer = LinearNorm(hparams.decoder_rnn_dim +
                                     self.encoder_dim,
                                     1,
                                     bias=True,
                                     w_init_gain='sigmoid')
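
LSTMCellWithZoneout is referenced above but not defined in this listing. A minimal sketch of zoneout (Krueger et al., 2016) over nn.LSTMCell, assuming the helper behaves roughly like this:

import torch
from torch import nn

class LSTMCellWithZoneout(nn.LSTMCell):
    # sketch: with probability zoneout_prob, carry over the previous hidden/cell
    # value instead of the new one during training; blend in expectation at eval
    def __init__(self, input_size, hidden_size, bias=True, zoneout_prob=0.1):
        super().__init__(input_size, hidden_size, bias)
        self.zoneout_prob = zoneout_prob

    def forward(self, x, state=None):
        if state is None:
            zeros = x.new_zeros(x.size(0), self.hidden_size)
            state = (zeros, zeros)
        old_h, old_c = state
        new_h, new_c = super().forward(x, state)
        p = self.zoneout_prob
        if self.training:
            keep_h = torch.empty_like(new_h).bernoulli_(p)
            keep_c = torch.empty_like(new_c).bernoulli_(p)
            new_h = keep_h * old_h + (1 - keep_h) * new_h
            new_c = keep_c * old_c + (1 - keep_c) * new_c
        else:
            new_h = p * old_h + (1 - p) * new_h
            new_c = p * old_c + (1 - p) * new_c
        return new_h, new_c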
Example #21
    def __init__(self):
        super().__init__()

        self.hidden2 = LinearNorm(64, 256)
        self.output = LinearNorm(256, 2)
        self.lambd = 1
Example #22
    def __init__(self, hparams):
        super(augmentation_classifier, self).__init__()

        self.hidden2 = LinearNorm(hparams.z_speaker_dim, 256)
        self.output = LinearNorm(256, 2)
        self.lambd = 1
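
The lambd attribute on these small classifiers suggests they sit behind a gradient reversal layer for adversarial speaker/augmentation training, as in DANN; the forward passes are not shown, so this is an assumption. A common sketch of how lambd is applied:

import torch

class GradReverse(torch.autograd.Function):
    # identity on the forward pass; gradient scaled by -lambd on the way back
    @staticmethod
    def forward(ctx, x, lambd):
        ctx.lambd = lambd
        return x.view_as(x)

    @staticmethod
    def backward(ctx, grad_output):
        return -ctx.lambd * grad_output, None

# hypothetical usage inside the classifier's forward:
#     x = GradReverse.apply(x, self.lambd)
#     return self.output(torch.relu(self.hidden2(x)))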