Example #1
    def __init__(self, attention_rnn_dim, embedding_dim, attention_dim,
                 attention_location_n_filters, attention_location_kernel_size):
        super(Attention, self).__init__()
        # Project the decoder state (query) and encoder outputs (memory) into a
        # shared attention space; v reduces the combined features to a scalar score.
        self.query_layer = LinearNorm(attention_rnn_dim, attention_dim,
                                      bias=False, w_init_gain='tanh')
        self.memory_layer = LinearNorm(embedding_dim, attention_dim, bias=False,
                                       w_init_gain='tanh')
        self.v = LinearNorm(attention_dim, 1, bias=False)
        # Location-sensitive term computed from the previous attention weights.
        self.location_layer = LocationLayer(attention_location_n_filters,
                                            attention_location_kernel_size,
                                            attention_dim)
        # Masked positions receive -inf before the softmax.
        self.score_mask_value = -float("inf")
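These sub-layers implement an additive, location-sensitive attention score. Below is a minimal sketch of how they are typically combined at each decoder step; the helper name and tensor shapes are assumptions for illustration, not part of the example above.

import torch

# Sketch only: `attn` is an Attention module built as above.
# query: (B, attention_rnn_dim), processed_memory: (B, T, attention_dim),
# attention_weights_cat: (B, 2, T) -- previous and cumulative weights stacked.
def alignment_energies(attn, query, processed_memory, attention_weights_cat):
    processed_query = attn.query_layer(query.unsqueeze(1))           # (B, 1, attention_dim)
    processed_location = attn.location_layer(attention_weights_cat)  # (B, T, attention_dim)
    energies = attn.v(torch.tanh(
        processed_query + processed_location + processed_memory))    # (B, T, 1)
    return energies.squeeze(-1)                                      # (B, T)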
Example #2
    def __init__(self, hparams):
        super(Decoder, self).__init__()
        self.n_mel_channels = hparams.n_mel_channels
        self.n_frames_per_step = hparams.n_frames_per_step
        self.encoder_embedding_dim = hparams.encoder_embedding_dim + hparams.token_embedding_size + hparams.speaker_embedding_dim
        self.attention_rnn_dim = hparams.attention_rnn_dim
        self.decoder_rnn_dim = hparams.decoder_rnn_dim
        self.prenet_dim = hparams.prenet_dim
        self.max_decoder_steps = hparams.max_decoder_steps
        self.gate_threshold = hparams.gate_threshold
        self.p_attention_dropout = hparams.p_attention_dropout
        self.p_decoder_dropout = hparams.p_decoder_dropout
        self.p_teacher_forcing = hparams.p_teacher_forcing

        if hparams.prenet_f0_dim > 0:
            self.prenet_f0 = ConvNorm(
                hparams.prenet_f0_dim,  # dimensionality of the input f0 features
                hparams.prenet_f0_dim,
                kernel_size=hparams.prenet_f0_kernel_size,
                padding=max(0, int(hparams.prenet_f0_kernel_size / 2)),
                bias=False,
                stride=1,
                dilation=1)

        self.prenet = Prenet(
            hparams.n_mel_channels * hparams.n_frames_per_step,
            [hparams.prenet_dim, hparams.prenet_dim])

        self.attention_rnn = nn.LSTMCell(
            hparams.prenet_dim + hparams.prenet_f0_dim +
            self.encoder_embedding_dim, hparams.attention_rnn_dim)

        self.attention_layer = Attention(
            hparams.attention_rnn_dim, self.encoder_embedding_dim,
            hparams.attention_dim, hparams.attention_location_n_filters,
            hparams.attention_location_kernel_size)

        self.decoder_rnn = nn.LSTMCell(
            hparams.attention_rnn_dim + self.encoder_embedding_dim,
            hparams.decoder_rnn_dim, bias=True)

        self.linear_projection = LinearNorm(
            hparams.decoder_rnn_dim + self.encoder_embedding_dim,
            hparams.n_mel_channels * hparams.n_frames_per_step)

        self.gate_layer = LinearNorm(hparams.decoder_rnn_dim +
                                     self.encoder_embedding_dim,
                                     1,
                                     bias=True,
                                     w_init_gain='sigmoid')
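This constructor only wires the sub-modules together; everything it needs comes from a single hparams object. A hypothetical, roughly Mellotron-sized hparams is sketched below so the wiring above can be exercised; every value is an assumption chosen for illustration.

from types import SimpleNamespace

# Hypothetical hyperparameters (all values are assumptions, not from the example).
hparams = SimpleNamespace(
    n_mel_channels=80, n_frames_per_step=1,
    encoder_embedding_dim=512, token_embedding_size=256, speaker_embedding_dim=128,
    attention_rnn_dim=1024, decoder_rnn_dim=1024,
    prenet_dim=256, prenet_f0_dim=1, prenet_f0_kernel_size=1,
    max_decoder_steps=1000, gate_threshold=0.5,
    p_attention_dropout=0.1, p_decoder_dropout=0.1, p_teacher_forcing=1.0,
    attention_dim=128, attention_location_n_filters=32,
    attention_location_kernel_size=31)

# decoder = Decoder(hparams)  # builds the prenet, attention, and both LSTM cells as above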
Example #3
    def __init__(self, in_dim, sizes):
        super(Prenet, self).__init__()
        # Chain of bias-free linear layers: in_dim -> sizes[0] -> sizes[1] -> ...
        in_sizes = [in_dim] + sizes[:-1]
        self.layers = nn.ModuleList([
            LinearNorm(in_size, out_size, bias=False)
            for (in_size, out_size) in zip(in_sizes, sizes)
        ])
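In Tacotron 2-style models the prenet's dropout is usually kept active even at inference time. A minimal sketch of the forward pass that pairs with this constructor follows; the function name and the example dimensions are assumptions.

import torch.nn.functional as F

# Sketch only: `prenet` is a Prenet built as above, e.g. Prenet(80, [256, 256]).
def prenet_forward(prenet, x):
    for linear in prenet.layers:
        # Dropout stays on (training=True) even at inference time.
        x = F.dropout(F.relu(linear(x)), p=0.5, training=True)
    return x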
Example #4
    def __init__(self, attention_n_filters, attention_kernel_size,
                 attention_dim):
        super(LocationLayer, self).__init__()
        # "Same" padding so the convolution preserves the encoder time axis.
        padding = int((attention_kernel_size - 1) / 2)
        # Two input channels: previous and cumulative attention weights.
        self.location_conv = ConvNorm(2, attention_n_filters,
                                      kernel_size=attention_kernel_size,
                                      padding=padding, bias=False, stride=1,
                                      dilation=1)
        self.location_dense = LinearNorm(attention_n_filters, attention_dim,
                                         bias=False, w_init_gain='tanh')
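For reference, a sketch of how a layer built this way is typically applied to the stacked attention weights; the function name and tensor shapes are assumptions for illustration.

# Sketch only: `layer` is a LocationLayer built as above.
# attention_weights_cat: (B, 2, T) -- previous and cumulative attention weights.
def process_attention_weights(layer, attention_weights_cat):
    processed = layer.location_conv(attention_weights_cat)   # (B, n_filters, T)
    processed = processed.transpose(1, 2)                    # (B, T, n_filters)
    return layer.location_dense(processed)                   # (B, T, attention_dim)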