Example #1
    def __init__(self, hparams):
        super(Postnet, self).__init__()
        self.convolutions = nn.ModuleList()

        # first conv: project n_mel_channels up to postnet_embedding_dim
        self.convolutions.append(
            nn.Sequential(
                ConvNorm(hparams.n_mel_channels, hparams.postnet_embedding_dim,
                         kernel_size=hparams.postnet_kernel_size, stride=1,
                         padding=int((hparams.postnet_kernel_size - 1) / 2),
                         dilation=1, w_init_gain='tanh'),
                nn.BatchNorm1d(hparams.postnet_embedding_dim))
        )
        # middle convs: postnet_embedding_dim -> postnet_embedding_dim
        for i in range(1, hparams.postnet_n_convolutions - 1):
            self.convolutions.append(
                nn.Sequential(
                    ConvNorm(hparams.postnet_embedding_dim,
                             hparams.postnet_embedding_dim,
                             kernel_size=hparams.postnet_kernel_size, stride=1,
                             padding=int((hparams.postnet_kernel_size - 1) / 2),
                             dilation=1, w_init_gain='tanh'),
                    nn.BatchNorm1d(hparams.postnet_embedding_dim))
            )
        # final conv: project back down to n_mel_channels (linear init gain)
        self.convolutions.append(
            nn.Sequential(
                ConvNorm(hparams.postnet_embedding_dim, hparams.n_mel_channels,
                         kernel_size=hparams.postnet_kernel_size, stride=1,
                         padding=int((hparams.postnet_kernel_size - 1) / 2),
                         dilation=1, w_init_gain='linear'),
                nn.BatchNorm1d(hparams.n_mel_channels))
        )
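
Every example on this page assumes a ConvNorm helper that is not shown in the snippets. A minimal sketch along the lines of the NVIDIA Tacotron 2 reference implementation (the exact helper used by these projects may differ):

import torch.nn as nn

class ConvNorm(nn.Module):
    """1-D convolution with Xavier-uniform init scaled by w_init_gain."""
    def __init__(self, in_channels, out_channels, kernel_size=1, stride=1,
                 padding=None, dilation=1, bias=True, w_init_gain='linear'):
        super(ConvNorm, self).__init__()
        if padding is None:
            assert kernel_size % 2 == 1
            padding = int(dilation * (kernel_size - 1) / 2)
        self.conv = nn.Conv1d(in_channels, out_channels,
                              kernel_size=kernel_size, stride=stride,
                              padding=padding, dilation=dilation, bias=bias)
        nn.init.xavier_uniform_(
            self.conv.weight, gain=nn.init.calculate_gain(w_init_gain))

    def forward(self, signal):
        return self.conv(signal)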
Example #2
    def __init__(self, hparams, input_dim, output_dim):
        super(MelEncoder, self).__init__()

        self.melenc_conv_hidden_dim = hparams.melenc_conv_dim
        self.output_dim = output_dim
        self.drop_chance = hparams.melenc_drop_frame_rate

        convolutions = []
        for i in range(hparams.melenc_n_layers):
            # first layer takes the raw input dim, later layers the conv hidden dim
            input_dim = input_dim if i == 0 else self.melenc_conv_hidden_dim
            # last layer projects to the requested output dim
            output_dim = (self.output_dim if i == hparams.melenc_n_layers - 1
                          else self.melenc_conv_hidden_dim)
            conv_layer = nn.Sequential(
                ConvNorm(input_dim,
                         output_dim,
                         kernel_size=hparams.melenc_kernel_size,
                         stride=hparams.melenc_stride,
                         padding=int((hparams.melenc_kernel_size - 1) / 2),
                         dilation=1,
                         w_init_gain='relu'), nn.BatchNorm1d(output_dim))
            convolutions.append(conv_layer)
        self.convolutions = nn.ModuleList(convolutions)

        self.lstm = nn.LSTM(hparams.melenc_n_tokens,
                            hparams.melenc_n_tokens // 2,
                            1,
                            batch_first=True,
                            bidirectional=True)

        self.LReLU = nn.LeakyReLU(negative_slope=0.01)
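
Note that the ConvNorm/BatchNorm1d stack operates on (batch, channels, time) tensors, while the batch_first LSTM expects (batch, time, features), so the forward pass (not shown in this snippet) has to transpose between the two. A quick shape check with hypothetical sizes:

import torch
import torch.nn as nn

batch, channels, time = 4, 80, 123                    # hypothetical sizes
conv = nn.Sequential(nn.Conv1d(channels, 256, kernel_size=5, padding=2),
                     nn.BatchNorm1d(256), nn.LeakyReLU(0.01))
lstm = nn.LSTM(256, 128, 1, batch_first=True, bidirectional=True)

x = torch.randn(batch, channels, time)                # (B, C, T) for Conv1d
h = conv(x)                                           # (B, 256, T)
out, _ = lstm(h.transpose(1, 2))                      # LSTM wants (B, T, C)
print(out.shape)                                      # torch.Size([4, 123, 256])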
Example #3
    def __init__(self, attention_n_filters, attention_kernel_size,
                 attention_dim, out_bias=False):
        super(LocationLayer, self).__init__()
        padding = int((attention_kernel_size - 1) / 2)
        self.location_conv = ConvNorm(2, attention_n_filters,
                                      kernel_size=attention_kernel_size,
                                      padding=padding, bias=False, stride=1,
                                      dilation=1)
        self.location_dense = LinearNorm(attention_n_filters, attention_dim,
                                         bias=out_bias, w_init_gain='tanh')
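
LocationLayer (and Example #5 below) also relies on a LinearNorm helper that is not shown. A minimal sketch in the style of the Tacotron 2 reference implementation, given here only for context:

import torch.nn as nn

class LinearNorm(nn.Module):
    """Linear layer with Xavier-uniform init scaled by w_init_gain."""
    def __init__(self, in_dim, out_dim, bias=True, w_init_gain='linear'):
        super(LinearNorm, self).__init__()
        self.linear_layer = nn.Linear(in_dim, out_dim, bias=bias)
        nn.init.xavier_uniform_(
            self.linear_layer.weight,
            gain=nn.init.calculate_gain(w_init_gain))

    def forward(self, x):
        return self.linear_layer(x)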
Example #4
    def __init__(self,
                 input_dim,
                 output_dim,
                 n_layers=2,
                 n_blocks=1,
                 kernel_size=3,
                 act_func=nn.LeakyReLU(negative_slope=0.01, inplace=True),
                 hidden_dim=None,
                 dropout=0.2,
                 use_batchnorm=True,
                 residual_act_func=False):
        super(Conv1dResBlock, self).__init__()
        self.input_dim = input_dim
        self.output_dim = output_dim
        hidden_dim = hidden_dim or output_dim  # fall back to output_dim when not given
        self.hidden_dim = hidden_dim
        self.n_blocks = n_blocks
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.dropout = dropout
        self.act_func = act_func
        self.residual_act_func = residual_act_func
        self.blocks = nn.ModuleList()

        # 1x1 convs project in and out of the hidden width used inside the blocks
        self.start_conv = ConvNorm(input_dim, hidden_dim, 1)

        for i in range(self.n_blocks):
            convs = nn.ModuleList()
            for j in range(self.n_layers):
                conv = ConvNorm(
                    hidden_dim,
                    hidden_dim,
                    kernel_size,
                    padding=(kernel_size - 1) // 2,
                )
                if use_batchnorm:
                    conv = nn.Sequential(conv, nn.BatchNorm1d(hidden_dim))
                convs.append(conv)
            self.blocks.append(convs)

        self.end_conv = ConvNorm(hidden_dim, output_dim, 1)
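
The snippet only shows __init__; the residual wiring happens in forward. A plausible sketch of how these pieces are typically combined (hypothetical, for illustration only, since the real forward is not shown):

    def forward(self, x):                          # x: (B, input_dim, T)
        x = self.start_conv(x)                     # project into hidden_dim
        for convs in self.blocks:
            residual = x
            for conv in convs:
                x = self.act_func(conv(x))
                x = nn.functional.dropout(x, self.dropout, self.training)
            x = x + residual                       # skip connection per block
            if self.residual_act_func:
                x = self.act_func(x)
        return self.end_conv(x)                    # project to output_dim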
Example #5
    def __init__(self, hp):
        super(Encoder, self).__init__()
        self.encoder_speaker_embed_dim = hp.encoder_speaker_embed_dim
        if self.encoder_speaker_embed_dim:
            self.encoder_speaker_embedding = nn.Embedding(
                hp.n_speakers, self.encoder_speaker_embed_dim)

        self.encoder_concat_speaker_embed = hp.encoder_concat_speaker_embed
        self.encoder_conv_hidden_dim = hp.encoder_conv_hidden_dim

        convolutions = []
        for i in range(hp.encoder_n_convolutions):
            if i == 0:  # first conv
                if self.encoder_concat_speaker_embed == 'before_conv':
                    input_dim = hp.symbols_embedding_dim + self.encoder_speaker_embed_dim
                elif self.encoder_concat_speaker_embed == 'before_lstm':
                    input_dim = hp.symbols_embedding_dim
                else:
                    raise NotImplementedError(
                        f'encoder_concat_speaker_embed has invalid value {hp.encoder_concat_speaker_embed}, valid values are "before_conv", "before_lstm".'
                    )
            else:
                input_dim = self.encoder_conv_hidden_dim

            if i == hp.encoder_n_convolutions - 1:  # last conv
                if self.encoder_concat_speaker_embed == 'before_conv':
                    output_dim = hp.encoder_LSTM_dim
                elif self.encoder_concat_speaker_embed == 'before_lstm':
                    output_dim = hp.encoder_LSTM_dim - self.encoder_speaker_embed_dim
            else:
                output_dim = self.encoder_conv_hidden_dim

            conv_layer = nn.Sequential(
                ConvNorm(input_dim,
                         output_dim,
                         kernel_size=hp.encoder_kernel_size,
                         stride=1,
                         padding=int((hp.encoder_kernel_size - 1) / 2),
                         dilation=1,
                         w_init_gain='relu'), nn.BatchNorm1d(output_dim))
            convolutions.append(conv_layer)
        self.convolutions = nn.ModuleList(convolutions)

        self.lstm = nn.LSTM(hp.encoder_LSTM_dim,
                            int(hp.encoder_LSTM_dim / 2),
                            1,
                            batch_first=True,
                            bidirectional=True)
        self.LReLU = nn.LeakyReLU(negative_slope=0.01)  # LeakyReLU

        self.sylps_layer = LinearNorm(hp.encoder_LSTM_dim, 1)
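
The two concat modes keep the LSTM input width constant: with 'before_conv' the speaker embedding is concatenated before the conv stack (first conv input is symbols_embedding_dim + encoder_speaker_embed_dim, last conv output is encoder_LSTM_dim); with 'before_lstm' the last conv leaves room for the embedding, which is concatenated just before the LSTM. A quick check with hypothetical sizes:

symbols_embedding_dim, speaker_embed_dim, encoder_LSTM_dim = 512, 64, 768  # hypothetical

# 'before_conv': concat first, conv stack maps straight to encoder_LSTM_dim
first_conv_in = symbols_embedding_dim + speaker_embed_dim    # 576
last_conv_out_a = encoder_LSTM_dim                           # 768

# 'before_lstm': conv stack leaves room, concat right before the LSTM
last_conv_out_b = encoder_LSTM_dim - speaker_embed_dim       # 704
lstm_in = last_conv_out_b + speaker_embed_dim                # 768 in both modes
assert lstm_in == encoder_LSTM_dim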
Example #6
    def __init__(self, hparams, global_cond_dim):
        super(Encoder, self).__init__()
        self.encoder_speaker_embed_dim = hparams.encoder_speaker_embed_dim
        if self.encoder_speaker_embed_dim:
            self.encoder_speaker_embedding = nn.Embedding(
                hparams.n_speakers, self.encoder_speaker_embed_dim)
            std = sqrt(2.0 /
                       (hparams.n_speakers + self.encoder_speaker_embed_dim))
            val = sqrt(3.0) * std  # uniform bounds for std
            self.encoder_speaker_embedding.weight.data.uniform_(-val, val)

        self.encoder_conv_hidden_dim = hparams.encoder_conv_hidden_dim

        output_dim = hparams.symbols_embedding_dim + self.encoder_speaker_embed_dim  # first layer input_dim
        convolutions = []
        for i in range(hparams.encoder_n_convolutions):
            is_last_layer = bool(i + 1 == hparams.encoder_n_convolutions)
            is_first_layer = bool(i == 0)

            input_dim = output_dim
            output_dim = hparams.encoder_LSTM_dim if is_last_layer else self.encoder_conv_hidden_dim

            conv_layer = nn.Sequential(
                ConvNorm(input_dim,
                         output_dim,
                         kernel_size=hparams.encoder_kernel_size,
                         stride=1,
                         padding=int((hparams.encoder_kernel_size - 1) / 2),
                         dilation=1,
                         w_init_gain='relu'), nn.BatchNorm1d(output_dim))
            convolutions.append(conv_layer)
        self.convolutions = nn.ModuleList(convolutions)

        self.lstm = nn.LSTM(hparams.encoder_LSTM_dim,
                            int(hparams.encoder_LSTM_dim / 2),
                            1,
                            batch_first=True,
                            bidirectional=True)
        self.LReLU = nn.LeakyReLU(negative_slope=0.01)  # LeakyReLU

        self.cond_conv = nn.Linear(
            hparams.encoder_LSTM_dim, global_cond_dim
        )  # predicts Perceived Loudness Mu/Logvar from the LSTM hidden state
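
The speaker-embedding init in this example reproduces Xavier-uniform bounds by hand: std = sqrt(2 / (fan_in + fan_out)), and a uniform distribution on [-sqrt(3)*std, sqrt(3)*std] has exactly that standard deviation. A quick numeric check with hypothetical sizes:

from math import sqrt

n_speakers, speaker_embed_dim = 40, 64            # hypothetical sizes
std = sqrt(2.0 / (n_speakers + speaker_embed_dim))
val = sqrt(3.0) * std                             # uniform on [-val, val] has stddev == std
print(round(std, 4), round(val, 4))               # 0.1387 0.2402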