Exemple #1
0
    def test_in_out(self):
        """Run a random batch through ``Encoder(128)`` and check its output shape.

        A ``(4, 8, 128)`` input is expected to produce an output whose first
        three dimensions are ``4``, ``8`` and ``256``.
        """
        encoder = Encoder(128)
        batch = T.rand(4, 8, 128)

        print(encoder)
        result = encoder(batch)
        print(result.shape)

        # Last axis is 128 * 2 (BiRNN, per the original author's note).
        for axis, expected in enumerate((4, 8, 256)):
            assert result.shape[axis] == expected
Exemple #2
0
    def __init__(
        self,
        num_chars,
        num_speakers,
        r=5,
        postnet_output_dim=1025,
        decoder_output_dim=80,
        attn_type='original',
        attn_win=False,
        attn_norm="sigmoid",
        prenet_type="original",
        prenet_dropout=True,
        forward_attn=False,
        trans_agent=False,
        forward_attn_mask=False,
        location_attn=True,
        attn_K=5,
        separate_stopnet=True,
        bidirectional_decoder=False,
        double_decoder_consistency=False,
        ddc_r=None,
        encoder_in_features=256,
        decoder_in_features=256,
        speaker_embedding_dim=None,
        gst=False,
        gst_embedding_dim=256,
        gst_num_heads=4,
        gst_style_tokens=10,
        memory_size=5,
    ):
        """Create the Tacotron layers after delegating setup to the parent class.

        Every argument except ``memory_size`` is forwarded positionally to the
        parent constructor. The attributes read from ``self`` afterwards
        (``num_speakers``, ``embeddings_per_sample``, ``encoder_in_features``,
        ``decoder_in_features``, ``gst``, ``bidirectional_decoder``,
        ``double_decoder_consistency``) are presumably set there -- verify
        against the base class.

        Layers built here, in order: optional speaker embedding table,
        character embedding, encoder, decoder, CBHG post-net, final linear
        projection, optional GST layer, optional backward decoder and optional
        coarse decoder for double decoder consistency.

        Args:
            num_chars: Number of rows of the character embedding table.
            num_speakers: Forwarded to the parent; the speaker branch is taken
                when ``self.num_speakers > 1``.
            r: Third positional argument of the main decoder.
            postnet_output_dim: Output size of the final linear layer.
            decoder_output_dim: Passed to both decoders and to ``PostCBHG``.
            attn_type, attn_win, attn_norm, prenet_type, prenet_dropout,
            forward_attn, trans_agent, forward_attn_mask, location_attn,
            attn_K, separate_stopnet: Forwarded unchanged to the parent and to
                every ``Decoder`` created here.
            bidirectional_decoder, double_decoder_consistency, gst,
            encoder_in_features, decoder_in_features: Forwarded to the parent.
            ddc_r: Used in place of ``r`` for the coarse decoder.
            speaker_embedding_dim: Width added to ``decoder_in_features`` in
                the multi-speaker case; replaced by 256 when the model creates
                its own speaker embedding table.
            gst_embedding_dim, gst_num_heads, gst_style_tokens: GST layer
                configuration.
            memory_size: Fourth positional argument of every ``Decoder``.
        """
        # The parent constructor and both decoders take these arguments in
        # exactly this order, so they are gathered once and unpacked.
        attn_prenet_args = (
            attn_type, attn_win, attn_norm, prenet_type, prenet_dropout,
            forward_attn, trans_agent, forward_attn_mask, location_attn,
            attn_K, separate_stopnet,
        )
        super().__init__(
            num_chars, num_speakers, r, postnet_output_dim,
            decoder_output_dim, *attn_prenet_args,
            bidirectional_decoder, double_decoder_consistency, ddc_r,
            encoder_in_features, decoder_in_features, speaker_embedding_dim,
            gst, gst_embedding_dim, gst_num_heads, gst_style_tokens,
        )

        if self.num_speakers > 1:
            if not self.embeddings_per_sample:
                # No per-sample embeddings: the model owns a lookup table of
                # fixed width 256.
                speaker_embedding_dim = 256
                self.speaker_embedding = nn.Embedding(self.num_speakers,
                                                      speaker_embedding_dim)
                self.speaker_embedding.weight.data.normal_(0, 0.3)
            # The decoder input is widened by the speaker embedding size.
            self.decoder_in_features += speaker_embedding_dim

        # Character embedding table.
        self.embedding = nn.Embedding(num_chars, 256, padding_idx=0)
        self.embedding.weight.data.normal_(0, 0.3)

        # Core encoder / decoder / post-net stack.
        decoder_tail = (memory_size,) + attn_prenet_args
        self.encoder = Encoder(self.encoder_in_features)
        self.decoder = Decoder(self.decoder_in_features, decoder_output_dim,
                               r, *decoder_tail)
        self.postnet = PostCBHG(decoder_output_dim)
        self.last_linear = nn.Linear(self.postnet.cbhg.gru_features * 2,
                                     postnet_output_dim)

        if self.gst:
            # NOTE(review): num_mel is fixed at 80 instead of following
            # decoder_output_dim -- confirm this is intended.
            self.gst_layer = GST(num_mel=80,
                                 num_heads=gst_num_heads,
                                 num_style_tokens=gst_style_tokens,
                                 embedding_dim=gst_embedding_dim)

        if self.bidirectional_decoder:
            self._init_backward_decoder()

        if self.double_decoder_consistency:
            # Second decoder for DDC: same configuration, but ddc_r replaces r.
            self.coarse_decoder = Decoder(self.decoder_in_features,
                                          decoder_output_dim, ddc_r,
                                          *decoder_tail)
Exemple #3
0
 def __init__(
     self,
     num_chars,
     num_speakers,
     r=5,
     postnet_output_dim=1025,
     decoder_output_dim=80,
     attn_type='original',
     attn_win=False,
     attn_norm="sigmoid",
     prenet_type="original",
     prenet_dropout=True,
     forward_attn=False,
     trans_agent=False,
     forward_attn_mask=False,
     location_attn=True,
     attn_K=5,
     separate_stopnet=True,
     bidirectional_decoder=False,
     double_decoder_consistency=False,
     ddc_r=None,
     gst=False,
     memory_size=5,
 ):
     """Create the Tacotron layers after delegating setup to the parent class.

     Every argument except ``memory_size`` is forwarded positionally to the
     parent constructor. ``self.gst``, ``self.bidirectional_decoder`` and
     ``self.double_decoder_consistency`` are read afterwards and are
     presumably set there -- verify against the base class.

     Layers built here, in order: character embedding, encoder, decoder,
     CBHG post-net, final linear projection, optional speaker embedding with
     its projection, optional GST layer, optional backward decoder and
     optional coarse decoder for double decoder consistency.

     Args:
         num_chars: Number of rows of the character embedding table.
         num_speakers: When greater than 1, the encoder/decoder input width
             is 512 instead of 256, the speaker projection width is 80
             instead of 0, and the speaker layers are created.
         r: Third positional argument of the main decoder.
         postnet_output_dim: Output size of the final linear layer.
         decoder_output_dim: Passed to both decoders and to ``PostCBHG``.
         attn_type, attn_win, attn_norm, prenet_type, prenet_dropout,
         forward_attn, trans_agent, forward_attn_mask, location_attn,
         attn_K, separate_stopnet: Forwarded unchanged to the parent and to
             every ``Decoder`` created here.
         bidirectional_decoder, double_decoder_consistency, gst: Forwarded
             to the parent.
         ddc_r: Used in place of ``r`` for the coarse decoder.
         memory_size: Fourth positional argument of every ``Decoder``.
     """
     # The parent constructor and both decoders take these arguments in
     # exactly this order, so they are gathered once and unpacked.
     attn_prenet_args = (
         attn_type, attn_win, attn_norm, prenet_type, prenet_dropout,
         forward_attn, trans_agent, forward_attn_mask, location_attn,
         attn_K, separate_stopnet,
     )
     super().__init__(
         num_chars, num_speakers, r, postnet_output_dim, decoder_output_dim,
         *attn_prenet_args,
         bidirectional_decoder, double_decoder_consistency, ddc_r, gst,
     )

     multi_speaker = num_speakers > 1
     # Encoder and decoder use the same input width.
     in_features = 512 if multi_speaker else 256
     speaker_embedding_dim = 256
     proj_speaker_dim = 80 if multi_speaker else 0
     decoder_tail = (memory_size,) + attn_prenet_args + (proj_speaker_dim,)

     # Core embedding / encoder / decoder / post-net stack.
     self.embedding = nn.Embedding(num_chars, 256, padding_idx=0)
     self.embedding.weight.data.normal_(0, 0.3)
     self.encoder = Encoder(in_features)
     self.decoder = Decoder(in_features, decoder_output_dim, r,
                            *decoder_tail)
     self.postnet = PostCBHG(decoder_output_dim)
     self.last_linear = nn.Linear(self.postnet.cbhg.gru_features * 2,
                                  postnet_output_dim)

     if multi_speaker:
         # Speaker lookup table plus a tanh-squashed linear projection.
         self.speaker_embedding = nn.Embedding(num_speakers,
                                               speaker_embedding_dim)
         self.speaker_embedding.weight.data.normal_(0, 0.3)
         self.speaker_project_mel = nn.Sequential(
             nn.Linear(speaker_embedding_dim, proj_speaker_dim),
             nn.Tanh(),
         )
         # Placeholders, initialised empty here.
         self.speaker_embeddings = None
         self.speaker_embeddings_projected = None

     if self.gst:
         # GST configuration is fixed in this variant.
         self.gst_layer = GST(num_mel=80,
                              num_heads=4,
                              num_style_tokens=10,
                              embedding_dim=256)

     if self.bidirectional_decoder:
         self._init_backward_decoder()

     if self.double_decoder_consistency:
         # Second decoder for DDC: same configuration, but ddc_r replaces r.
         self.coarse_decoder = Decoder(in_features, decoder_output_dim,
                                       ddc_r, *decoder_tail)