Example #1
    def __init__(self, mel_channels, text_channels, emb_channels):
        super().__init__()
        self.text_channels = text_channels
        # Frozen sinusoidal position tables: up to 2 * 512 mel-frame positions
        # and 2 * 256 text positions, each of width emb_channels.
        self.pos_embedding_mel = nn.Embedding.from_pretrained(
            positional_encoding(2 * 512, emb_channels), freeze=True)
        self.pos_embedding_text = nn.Embedding.from_pretrained(
            positional_encoding(2 * 256, emb_channels), freeze=True)

        # Encoder over 80-channel (mel) input; decoder produces text_channels outputs.
        self.encoder = Encoder(in_channels=80, emb_channels=256)
        self.decoder = Decoder(text_channels,
                               enc_channels=256,
                               emb_channels=256)
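
These snippets never show positional_encoding itself; below is a minimal sketch of a standard sinusoidal table that would be compatible with the call sites above (a (max_len, channels) float tensor handed to nn.Embedding.from_pretrained). The function name matches the calls, but the exact behavior of the project's own implementation is an assumption.

import math

import torch

def positional_encoding(max_len, channels):
    """Hypothetical sketch: sinusoidal table of shape (max_len, channels).
    Assumes channels is even; the project's real function may differ."""
    position = torch.arange(max_len, dtype=torch.float).unsqueeze(1)        # (max_len, 1)
    div_term = torch.exp(torch.arange(0, channels, 2, dtype=torch.float)
                         * (-math.log(10000.0) / channels))                 # (channels / 2,)
    table = torch.zeros(max_len, channels)
    table[:, 0::2] = torch.sin(position * div_term)                         # even channels
    table[:, 1::2] = torch.cos(position * div_term)                         # odd channels
    return table
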
Example #2
 def __transform(self, input_data, masks):
     """Apply transformer encoder
     """
     config = self.config
     transformed_output = tf.identity(input_data)
     if config.tf_used:
         tf_keep_prob = tf.cond(self.is_train, lambda: config.tf_keep_prob,
                                lambda: 1.0)
         tf_mh_keep_prob = tf.cond(self.is_train,
                                   lambda: config.tf_mh_keep_prob,
                                   lambda: 1.0)
         tf_ffn_keep_prob = tf.cond(self.is_train,
                                    lambda: config.tf_ffn_keep_prob,
                                    lambda: 1.0)
         # last dimension must be equal to model_dim because we use a residual connection.
         model_dim = transformed_output.get_shape().as_list()[-1]
         # sinusoidal positional signal
         signal = positional_encoding(self.sentence_lengths,
                                      self.sentence_length,
                                      model_dim,
                                      zero_pad=False,
                                      scale=False,
                                      scope='positional-encoding',
                                      reuse=None)
         transformed_output += signal
         # block
         for i in range(config.tf_num_layers):
             x = transformed_output
             # layer norm
             x_norm = normalize(x, scope='layer-norm-sa-%s' % i, reuse=None)
             # multi-head attention
             y = self.__self_attention(x_norm,
                                       masks,
                                       model_dim=model_dim,
                                       keep_prob=tf_mh_keep_prob,
                                       scope='self-attention-%s' % i)
             # residual and dropout
             x = tf.nn.dropout(x_norm + y, keep_prob=tf_keep_prob)
             # layer norm
             x_norm = normalize(x,
                                scope='layer-norm-ffn-%s' % i,
                                reuse=None)
             # position-wise feed forward net
             y = self.__feedforward(x_norm,
                                    masks,
                                    model_dim=model_dim,
                                    kernel_size=config.tf_ffn_kernel_size,
                                    keep_prob=tf_ffn_keep_prob,
                                    scope='feed-forward-%s' % i)
             # residual and dropout
             x = tf.nn.dropout(x_norm + y, keep_prob=tf_keep_prob)
             transformed_output = x
         # final layer norm
         transformed_output = normalize(transformed_output,
                                        scope='layer-norm',
                                        reuse=None)
     return transformed_output
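
In this example positional_encoding has a different, TensorFlow 1.x-style signature, and again its implementation is not shown. The sketch below is one plausible reading, assuming sentence_lengths is a (batch,) tensor of true lengths and sentence_length is the padded maximum length (an int); it builds a sinusoidal signal and zeroes positions past each sequence's length. zero_pad is ignored here, and the real function in this codebase may differ.

import numpy as np
import tensorflow as tf

def positional_encoding(sequence_lengths, max_len, model_dim, zero_pad=False,
                        scale=False, scope='positional-encoding', reuse=None):
    """Hypothetical sketch: (batch, max_len, model_dim) sinusoidal signal,
    masked beyond each sequence's true length. Assumes model_dim is even."""
    with tf.variable_scope(scope, reuse=reuse):
        position = np.arange(max_len)[:, np.newaxis]                          # (max_len, 1)
        div_term = np.exp(np.arange(0, model_dim, 2) * -(np.log(10000.0) / model_dim))
        table = np.zeros((max_len, model_dim), dtype=np.float32)
        table[:, 0::2] = np.sin(position * div_term)
        table[:, 1::2] = np.cos(position * div_term)
        signal = tf.convert_to_tensor(table)                                  # (max_len, model_dim)
        if scale:
            signal = signal * (model_dim ** 0.5)
        # zero out positions past each sequence's length
        mask = tf.sequence_mask(sequence_lengths, maxlen=max_len, dtype=tf.float32)
        return tf.expand_dims(signal, 0) * tf.expand_dims(mask, -1)           # (batch, max_len, model_dim)
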
Example #3
import torch
import torch.nn as nn

from util import to_device, plot_att_heads
from model import Encoder, Decoder
from dataset import Dataset, _symbol_to_id

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
epoch_total = 64
batch_size = 16
enc_lr = 0.0001
dec_lr = 0.0005
emb_lr = 0.0001

# -----------------------------------

text_embedding = nn.Embedding(num_embeddings=len(_symbol_to_id),
                              embedding_dim=512).to(device)
pos_embedding = nn.Embedding.from_pretrained(positional_encoding(512, 512),
                                             freeze=True).to(device)
pos_embedding_ = nn.Embedding.from_pretrained(positional_encoding(256, 512),
                                              freeze=True).to(device)

encoder = Encoder(emb_channels=512).to(device)
decoder = Decoder(mel_channels=80, enc_channels=512,
                  emb_channels=512).to(device)

optimizer = torch.optim.Adam([{
    'params': text_embedding.parameters(),
    'lr': emb_lr
}, {
    'params': encoder.parameters(),
    'lr': enc_lr
}, {
    'params': decoder.parameters(),
    'lr': dec_lr
}])
Example #4
import glob

import torch
import torch.nn as nn

from util import to_device, plot_att_heads
from model import Encoder, Decoder
from dataset import Dataset, _symbol_to_id, parse_text
from audio_process import MelWav, sample_rate

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
epoch_total = 64
batch_size = 32
enc_lr = 0.0001
dec_lr = 0.0005
emb_lr = 0.0001

# -----------------------------------

text_embedding = nn.Embedding(num_embeddings=len(_symbol_to_id), embedding_dim=256).to(device)
pos_embedding = nn.Embedding.from_pretrained(positional_encoding(512, 256), freeze=True).to(device)
pos_embedding_ = nn.Embedding.from_pretrained(positional_encoding(256, 256), freeze=True).to(device)

encoder = Encoder(emb_channels=256).to(device)
decoder = Decoder(mel_channels=80, enc_channels=256, emb_channels=256).to(device)
mel_to_wav = MelWav().to(device)

optimizer = torch.optim.Adam([{'params': text_embedding.parameters(), 'lr': emb_lr},
                              {'params': encoder.parameters(), 'lr': enc_lr},
                              {'params': decoder.parameters(), 'lr': dec_lr}],
                             lr=0.001)

# -----------------------------------

logs_idx = f'emb_lr{emb_lr}-enc_lr{enc_lr}-dec_lr{dec_lr}-batch_size{batch_size}'
saves = glob.glob(f'logs/{logs_idx}/*.pt')
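
None of these snippets show the frozen tables being consumed; the usual pattern is to index them by position and add the result to a learned embedding. A minimal sketch under that assumption follows, with seq_len, text_ids, and the pairing of pos_embedding_ with text (rather than mel) all invented for illustration.

# Hypothetical usage: add the frozen sinusoidal table to the learned symbol embedding.
seq_len = 100                                                          # assumed padded text length
text_ids = torch.randint(0, len(_symbol_to_id), (batch_size, seq_len), device=device)
positions = torch.arange(seq_len, device=device).unsqueeze(0)          # (1, seq_len)
text_features = text_embedding(text_ids) + pos_embedding_(positions)   # (batch, seq_len, 256)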