def test_in_out(self):
    layer = Encoder(128)
    dummy_input = T.rand(4, 8, 128)

    print(layer)
    output = layer(dummy_input)
    print(output.shape)
    assert output.shape[0] == 4
    assert output.shape[1] == 8
    assert output.shape[2] == 256  # 128 * 2: the BiRNN doubles the feature dim
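# Standalone scaffolding for the test above -- a sketch only: the Encoder
# import path is an assumption based on a typical TTS repo layout, and "T"
# is the torch alias used by the test. Adjust the import to your tree.
import unittest

import torch as T
from TTS.tts.layers.tacotron import Encoder  # assumed path


class EncoderTests(unittest.TestCase):
    def test_in_out(self):
        layer = Encoder(128)
        output = layer(T.rand(4, 8, 128))
        # the bidirectional GRU in the CBHG doubles the feature dim: 128 -> 256
        self.assertEqual(tuple(output.shape), (4, 8, 256))


if __name__ == "__main__":
    unittest.main()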
def __init__(self,
             num_chars,
             num_speakers,
             r=5,
             postnet_output_dim=1025,
             decoder_output_dim=80,
             attn_type='original',
             attn_win=False,
             attn_norm="sigmoid",
             prenet_type="original",
             prenet_dropout=True,
             forward_attn=False,
             trans_agent=False,
             forward_attn_mask=False,
             location_attn=True,
             attn_K=5,
             separate_stopnet=True,
             bidirectional_decoder=False,
             double_decoder_consistency=False,
             ddc_r=None,
             encoder_in_features=256,
             decoder_in_features=256,
             speaker_embedding_dim=None,
             gst=False,
             gst_embedding_dim=256,
             gst_num_heads=4,
             gst_style_tokens=10,
             memory_size=5):
    super(Tacotron, self).__init__(
        num_chars, num_speakers, r, postnet_output_dim, decoder_output_dim,
        attn_type, attn_win, attn_norm, prenet_type, prenet_dropout,
        forward_attn, trans_agent, forward_attn_mask, location_attn, attn_K,
        separate_stopnet, bidirectional_decoder, double_decoder_consistency,
        ddc_r, encoder_in_features, decoder_in_features,
        speaker_embedding_dim, gst, gst_embedding_dim, gst_num_heads,
        gst_style_tokens)

    # speaker embedding layers
    if self.num_speakers > 1:
        if not self.embeddings_per_sample:
            speaker_embedding_dim = 256
            self.speaker_embedding = nn.Embedding(self.num_speakers,
                                                  speaker_embedding_dim)
            self.speaker_embedding.weight.data.normal_(0, 0.3)

    # the speaker and GST embeddings are concatenated with the decoder input
    if self.num_speakers > 1:
        self.decoder_in_features += speaker_embedding_dim  # add speaker embedding dim

    # embedding layer
    self.embedding = nn.Embedding(num_chars, 256, padding_idx=0)
    self.embedding.weight.data.normal_(0, 0.3)

    # base model layers
    self.encoder = Encoder(self.encoder_in_features)
    self.decoder = Decoder(self.decoder_in_features, decoder_output_dim, r,
                           memory_size, attn_type, attn_win, attn_norm,
                           prenet_type, prenet_dropout, forward_attn,
                           trans_agent, forward_attn_mask, location_attn,
                           attn_K, separate_stopnet)
    self.postnet = PostCBHG(decoder_output_dim)
    self.last_linear = nn.Linear(self.postnet.cbhg.gru_features * 2,
                                 postnet_output_dim)

    # global style token layers
    if self.gst:
        self.gst_layer = GST(num_mel=80,
                             num_heads=gst_num_heads,
                             num_style_tokens=gst_style_tokens,
                             embedding_dim=gst_embedding_dim)
    # backward pass decoder
    if self.bidirectional_decoder:
        self._init_backward_decoder()
    # setup DDC
    if self.double_decoder_consistency:
        self.coarse_decoder = Decoder(
            self.decoder_in_features, decoder_output_dim, ddc_r, memory_size,
            attn_type, attn_win, attn_norm, prenet_type, prenet_dropout,
            forward_attn, trans_agent, forward_attn_mask, location_attn,
            attn_K, separate_stopnet)
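# Instantiation sketch for the constructor above. The concrete values are
# illustrative assumptions (vocabulary size, GST settings), not defaults
# pulled from any repo config:
model = Tacotron(
    num_chars=128,            # input symbol vocabulary size
    num_speakers=1,           # single speaker: no speaker embedding is added
    r=5,                      # decoder reduction factor (frames per step)
    postnet_output_dim=1025,  # linear spectrogram bins (n_fft=2048 -> 1025)
    decoder_output_dim=80,    # mel spectrogram bins
    gst=True,                 # enable the global style token layer
    gst_embedding_dim=256,
    gst_num_heads=4,
    gst_style_tokens=10,
)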
def __init__(self,
             num_chars,
             num_speakers,
             r=5,
             postnet_output_dim=1025,
             decoder_output_dim=80,
             attn_type='original',
             attn_win=False,
             attn_norm="sigmoid",
             prenet_type="original",
             prenet_dropout=True,
             forward_attn=False,
             trans_agent=False,
             forward_attn_mask=False,
             location_attn=True,
             attn_K=5,
             separate_stopnet=True,
             bidirectional_decoder=False,
             double_decoder_consistency=False,
             ddc_r=None,
             gst=False,
             memory_size=5):
    super(Tacotron, self).__init__(
        num_chars, num_speakers, r, postnet_output_dim, decoder_output_dim,
        attn_type, attn_win, attn_norm, prenet_type, prenet_dropout,
        forward_attn, trans_agent, forward_attn_mask, location_attn, attn_K,
        separate_stopnet, bidirectional_decoder, double_decoder_consistency,
        ddc_r, gst)

    decoder_in_features = 512 if num_speakers > 1 else 256
    encoder_in_features = 512 if num_speakers > 1 else 256
    speaker_embedding_dim = 256
    proj_speaker_dim = 80 if num_speakers > 1 else 0

    # base model layers
    self.embedding = nn.Embedding(num_chars, 256, padding_idx=0)
    self.embedding.weight.data.normal_(0, 0.3)
    self.encoder = Encoder(encoder_in_features)
    self.decoder = Decoder(decoder_in_features, decoder_output_dim, r,
                           memory_size, attn_type, attn_win, attn_norm,
                           prenet_type, prenet_dropout, forward_attn,
                           trans_agent, forward_attn_mask, location_attn,
                           attn_K, separate_stopnet, proj_speaker_dim)
    self.postnet = PostCBHG(decoder_output_dim)
    self.last_linear = nn.Linear(self.postnet.cbhg.gru_features * 2,
                                 postnet_output_dim)

    # speaker embedding layers
    if num_speakers > 1:
        self.speaker_embedding = nn.Embedding(num_speakers,
                                              speaker_embedding_dim)
        self.speaker_embedding.weight.data.normal_(0, 0.3)
        self.speaker_project_mel = nn.Sequential(
            nn.Linear(speaker_embedding_dim, proj_speaker_dim), nn.Tanh())
        self.speaker_embeddings = None
        self.speaker_embeddings_projected = None

    # global style token layers
    if self.gst:
        gst_embedding_dim = 256
        self.gst_layer = GST(num_mel=80,
                             num_heads=4,
                             num_style_tokens=10,
                             embedding_dim=gst_embedding_dim)
    # backward pass decoder
    if self.bidirectional_decoder:
        self._init_backward_decoder()
    # setup DDC
    if self.double_decoder_consistency:
        self.coarse_decoder = Decoder(
            decoder_in_features, decoder_output_dim, ddc_r, memory_size,
            attn_type, attn_win, attn_norm, prenet_type, prenet_dropout,
            forward_attn, trans_agent, forward_attn_mask, location_attn,
            attn_K, separate_stopnet, proj_speaker_dim)
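# Multi-speaker sketch for the older constructor above: with num_speakers > 1
# the encoder/decoder input widths are set to 512 rather than 256, leaving
# room for the 256-dim speaker embedding to be concatenated with the 256-dim
# text features. All concrete values below are illustrative assumptions.
import torch

model = Tacotron(num_chars=128, num_speakers=4, r=5)
speaker_ids = torch.LongTensor([0, 1, 2, 3])
speaker_vecs = model.speaker_embedding(speaker_ids)    # -> (4, 256)
projected = model.speaker_project_mel(speaker_vecs)    # -> (4, 80), via Tanh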