def __init__(self, mel_channels, text_channels, emb_channels):
    super().__init__()
    self.text_channels = text_channels
    # Fixed (frozen) sinusoidal position tables for mel frames and text tokens.
    self.pos_embedding_mel = nn.Embedding.from_pretrained(
        positional_encoding(2 * 512, emb_channels), freeze=True)
    self.pos_embedding_text = nn.Embedding.from_pretrained(
        positional_encoding(2 * 256, emb_channels), freeze=True)
    self.encoder = Encoder(in_channels=80, emb_channels=256)
    self.decoder = Decoder(text_channels, enc_channels=256, emb_channels=256)
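# The positional_encoding helper is not defined in this excerpt. Judging by how the
# PyTorch modules consume it (a (max_len, channels) float tensor fed to
# nn.Embedding.from_pretrained with freeze=True), a minimal sinusoidal sketch could
# look like the following. This is an assumption for illustration, not the repo's
# actual implementation.
import math

import torch


def positional_encoding(max_len, channels):
    # Hypothetical sketch: standard sinusoidal table of shape (max_len, channels).
    position = torch.arange(max_len, dtype=torch.float32).unsqueeze(1)            # (max_len, 1)
    div_term = torch.exp(torch.arange(0, channels, 2, dtype=torch.float32)
                         * (-math.log(10000.0) / channels))                       # (channels // 2,)
    table = torch.zeros(max_len, channels)
    table[:, 0::2] = torch.sin(position * div_term)   # even channels: sine
    table[:, 1::2] = torch.cos(position * div_term)   # odd channels: cosine
    return table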
def __transform(self, input_data, masks):
    """Apply transformer encoder."""
    config = self.config
    transformed_output = tf.identity(input_data)
    if config.tf_used:
        tf_keep_prob = tf.cond(self.is_train, lambda: config.tf_keep_prob, lambda: 1.0)
        tf_mh_keep_prob = tf.cond(self.is_train, lambda: config.tf_mh_keep_prob, lambda: 1.0)
        tf_ffn_keep_prob = tf.cond(self.is_train, lambda: config.tf_ffn_keep_prob, lambda: 1.0)
        # last dimension must be equal to model_dim because we use a residual connection.
        model_dim = transformed_output.get_shape().as_list()[-1]
        # sinusoidal positional signal
        signal = positional_encoding(self.sentence_lengths,
                                     self.sentence_length,
                                     model_dim,
                                     zero_pad=False,
                                     scale=False,
                                     scope='positional-encoding',
                                     reuse=None)
        transformed_output += signal
        # block
        for i in range(config.tf_num_layers):
            x = transformed_output
            # layer norm
            x_norm = normalize(x, scope='layer-norm-sa-%s' % i, reuse=None)
            # multi-head attention
            y = self.__self_attention(x_norm,
                                      masks,
                                      model_dim=model_dim,
                                      keep_prob=tf_mh_keep_prob,
                                      scope='self-attention-%s' % i)
            # residual and dropout
            x = tf.nn.dropout(x_norm + y, keep_prob=tf_keep_prob)
            # layer norm
            x_norm = normalize(x, scope='layer-norm-ffn-%s' % i, reuse=None)
            # position-wise feed forward net
            y = self.__feedforward(x_norm,
                                   masks,
                                   model_dim=model_dim,
                                   kernel_size=config.tf_ffn_kernel_size,
                                   keep_prob=tf_ffn_keep_prob,
                                   scope='feed-forward-%s' % i)
            # residual and dropout
            x = tf.nn.dropout(x_norm + y, keep_prob=tf_keep_prob)
            transformed_output = x
        # final layer norm
        transformed_output = normalize(transformed_output, scope='layer-norm', reuse=None)
    return transformed_output
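# normalize() above is used as a layer-normalization helper but is not defined in this
# excerpt. A plausible TF1-style sketch, assuming it performs standard layer norm over
# the last dimension with learnable scale/shift (an assumption, not the repo's code):
import tensorflow as tf


def normalize(inputs, epsilon=1e-8, scope='layer-norm', reuse=None):
    # Assumed implementation: layer normalization over the last feature dimension.
    with tf.variable_scope(scope, reuse=reuse):
        params_shape = inputs.get_shape()[-1:]
        mean, variance = tf.nn.moments(inputs, [-1], keep_dims=True)
        beta = tf.get_variable('beta', params_shape, initializer=tf.zeros_initializer())
        gamma = tf.get_variable('gamma', params_shape, initializer=tf.ones_initializer())
        normalized = (inputs - mean) / tf.sqrt(variance + epsilon)
        return gamma * normalized + beta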
import torch
import torch.nn as nn

from util import to_device, plot_att_heads
from model import Encoder, Decoder
from dataset import Dataset, _symbol_to_id

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

epoch_total = 64
batch_size = 16
enc_lr = 0.0001
dec_lr = 0.0005
emb_lr = 0.0001
# -----------------------------------
# positional_encoding is assumed to be defined or imported elsewhere in the repo.
text_embedding = nn.Embedding(num_embeddings=len(_symbol_to_id), embedding_dim=512).to(device)
pos_embedding = nn.Embedding.from_pretrained(positional_encoding(512, 512), freeze=True).to(device)
pos_embedding_ = nn.Embedding.from_pretrained(positional_encoding(256, 512), freeze=True).to(device)

encoder = Encoder(emb_channels=512).to(device)
decoder = Decoder(mel_channels=80, enc_channels=512, emb_channels=512).to(device)

optimizer = torch.optim.Adam([{
    'params': text_embedding.parameters(),
    'lr': emb_lr
}, {
    'params': encoder.parameters(),
    'lr': enc_lr
}, {
    'params': decoder.parameters(),
    'lr': dec_lr
}], lr=0.001)
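# Illustrative only -- the real forward pass is not shown in this excerpt. Frozen
# position tables such as pos_embedding are typically indexed with positions 0..T-1
# and added to the token embeddings before the encoder. The dummy batch below is
# purely hypothetical.
example_text_ids = torch.randint(len(_symbol_to_id), (batch_size, 40), device=device)    # (batch, T)
example_positions = torch.arange(40, device=device).unsqueeze(0)                         # (1, T)
example_enc_input = text_embedding(example_text_ids) + pos_embedding(example_positions)  # (batch, T, 512)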
import glob

import torch
import torch.nn as nn

from util import to_device, plot_att_heads
from model import Encoder, Decoder
from dataset import Dataset, _symbol_to_id, parse_text
from audio_process import MelWav, sample_rate

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

epoch_total = 64
batch_size = 32
enc_lr = 0.0001
dec_lr = 0.0005
emb_lr = 0.0001
# -----------------------------------
# positional_encoding is assumed to be defined or imported elsewhere in the repo.
text_embedding = nn.Embedding(num_embeddings=len(_symbol_to_id), embedding_dim=256).to(device)
pos_embedding = nn.Embedding.from_pretrained(positional_encoding(512, 256), freeze=True).to(device)
pos_embedding_ = nn.Embedding.from_pretrained(positional_encoding(256, 256), freeze=True).to(device)

encoder = Encoder(emb_channels=256).to(device)
decoder = Decoder(mel_channels=80, enc_channels=256, emb_channels=256).to(device)
mel_to_wav = MelWav().to(device)

optimizer = torch.optim.Adam([{'params': text_embedding.parameters(), 'lr': emb_lr},
                              {'params': encoder.parameters(), 'lr': enc_lr},
                              {'params': decoder.parameters(), 'lr': dec_lr}], lr=0.001)
# -----------------------------------
logs_idx = f'emb_lr{emb_lr}-enc_lr{enc_lr}-dec_lr{dec_lr}-batch_size{batch_size}'
saves = glob.glob(f'logs/{logs_idx}/*.pt')
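# The glob above collects previously saved checkpoints, but how they are restored is
# not shown here. A minimal resume sketch, assuming each .pt file holds a dict of the
# three state_dicts keyed as below (the key names and file layout are assumptions):
import os

if saves:
    # Load the most recently written checkpoint and restore module weights.
    latest = max(saves, key=os.path.getmtime)
    checkpoint = torch.load(latest, map_location=device)
    text_embedding.load_state_dict(checkpoint['text_embedding'])
    encoder.load_state_dict(checkpoint['encoder'])
    decoder.load_state_dict(checkpoint['decoder'])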