import torch
import torch.nn as nn
from torch.autograd import Variable

# LocationAttention, PreNet, PostNet and Decoder are assumed to come from this
# project's own modules; the exact import path is not shown in these snippets.


def test_attention_location():
    encoder_out = Variable(torch.zeros(152, 2, 256))  # seq, batch, dim
    query_vector = Variable(torch.zeros(1, 2, 256))   # seq, batch, dim
    mask = Variable(torch.zeros(2, 152, 1))           # batch, input_seq_len, 1
    attention = LocationAttention(dim=256)
    context, mask = attention(query_vector, encoder_out, mask)
    assert context.size() == (1, 2, 256)
    assert mask.size() == (2, 152, 1)  # batch, input_seq_len, 1
def test_attention_location_softmax():
    encoder_out = Variable(torch.randn(152, 2, 256))  # seq, batch, dim
    query_vector = Variable(torch.randn(1, 2, 1024))  # seq, batch, dim
    mask = Variable(torch.randn(2, 152, 1))           # batch, seq1, seq2
    attention = LocationAttention(encoded_dim=256, query_dim=1024, attention_dim=128)
    context, mask = attention(query_vector, encoder_out, mask)
    # attention weights over the input sequence should sum to 1
    # (checked for the first batch element, with a float tolerance)
    assert abs(float(mask[:, 0, :].sum().data) - 1.0) < 1e-6
def test_attention_location_sizes():
    encoder_out = Variable(torch.randn(152, 2, 256))  # seq, batch, dim
    query_vector = Variable(torch.randn(1, 2, 1024))  # seq, batch, dim
    mask = Variable(torch.randn(2, 152, 1))           # batch, seq1, seq2
    attention = LocationAttention(encoded_dim=256, query_dim=1024, attention_dim=128)
    context, mask = attention(query_vector, encoder_out, mask)
    assert context.size() == (1, 2, 256)
    assert mask.size() == (1, 2, 152)  # seq2, batch, seq1
def __init__(self, hidden_size=1024, num_layers=2):
    super(Decoder, self).__init__()
    self.prenet = PreNet(in_features=80, out_features=256)
    self.attention = LocationAttention(dim=256)
    self.rnn = nn.GRU(input_size=512, hidden_size=hidden_size,
                      num_layers=num_layers, dropout=0.1)
    # note: the output layers hard-code 1024 + 256, so they only line up with
    # the GRU when hidden_size is left at its default of 1024
    self.spec_out = nn.Linear(in_features=1024 + 256, out_features=80)
    self.stop_out = nn.Linear(in_features=1024 + 256, out_features=1)
    self.postnet = PostNet()
def test_attention():
    """
    Attention should output a fixed-length context vector (seq len = 1)
    and a weight for each item in the input sequence.
    """
    encoder_out = Variable(torch.zeros(152, 2, 256))  # seq, batch, dim
    query_vector = Variable(torch.zeros(1, 2, 256))   # seq, batch, dim
    attention = LocationAttention(dim=256)
    context, mask = attention(query_vector, encoder_out)
    assert context.size() == (1, 2, 256)  # seq, batch, dim
    assert mask.size() == (2, 152, 1)     # batch, input_seq_len, 1
def test_attention_sizes():
    """
    Attention should output a fixed-length context vector (seq len = 1)
    and a weight for each item in the input sequence.
    """
    encoder_out = Variable(torch.randn(152, 2, 256))  # seq, batch, dim
    query_vector = Variable(torch.randn(1, 2, 1024))  # seq, batch, dim
    attention = LocationAttention(encoded_dim=256, query_dim=1024, attention_dim=128)
    context, mask = attention(query_vector, encoder_out)
    assert context.size() == (1, 2, 256)  # seq, batch, dim
    assert mask.size() == (1, 2, 152)     # seq2, batch, seq1
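The implementation of LocationAttention itself is not shown in these snippets. As a rough reference, the following is a minimal sketch of a location-sensitive attention module (in the spirit of Tacotron 2) that matches the constructor arguments and the tensor shapes asserted by the tests above. The class name LocationAttentionSketch, the convolutional filtering of the previous attention weights, and the location_channels / location_kernel sizes are assumptions for illustration, not the project's actual implementation.

import torch
import torch.nn as nn
import torch.nn.functional as F


class LocationAttentionSketch(nn.Module):
    """Location-sensitive attention sketch matching the shapes in the tests:
    query (1, batch, query_dim), encoder_out (seq, batch, encoded_dim),
    optional previous weights (batch, seq, 1); returns a context of shape
    (1, batch, encoded_dim) and weights of shape (1, batch, seq)."""

    def __init__(self, encoded_dim=256, query_dim=1024, attention_dim=128,
                 location_channels=32, location_kernel=31):
        super(LocationAttentionSketch, self).__init__()
        self.query_proj = nn.Linear(query_dim, attention_dim, bias=False)
        self.memory_proj = nn.Linear(encoded_dim, attention_dim, bias=False)
        self.location_conv = nn.Conv1d(1, location_channels, kernel_size=location_kernel,
                                       padding=(location_kernel - 1) // 2, bias=False)
        self.location_proj = nn.Linear(location_channels, attention_dim, bias=False)
        self.score = nn.Linear(attention_dim, 1, bias=False)

    def forward(self, query, encoder_out, prev_weights=None):
        memory = encoder_out.transpose(0, 1)                        # (batch, seq, encoded_dim)
        if prev_weights is None:
            # no previous alignment on the first step
            prev_weights = torch.zeros_like(memory[:, :, :1])       # (batch, seq, 1)
        q = self.query_proj(query.transpose(0, 1))                  # (batch, 1, attention_dim)
        m = self.memory_proj(memory)                                # (batch, seq, attention_dim)
        loc = self.location_conv(prev_weights.transpose(1, 2))      # (batch, channels, seq)
        loc = self.location_proj(loc.transpose(1, 2))               # (batch, seq, attention_dim)
        energies = self.score(torch.tanh(q + m + loc)).squeeze(-1)  # (batch, seq)
        weights = F.softmax(energies, dim=-1)                       # (batch, seq), sums to 1
        context = torch.bmm(weights.unsqueeze(1), memory)           # (batch, 1, encoded_dim)
        return context.transpose(0, 1), weights.unsqueeze(0)        # (1, batch, dim), (1, batch, seq)

A module along these lines would satisfy test_attention_sizes and test_attention_location_softmax above, since the softmax guarantees the weights for each batch element sum to 1.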
def __init__(self, hidden_size=1024, num_layers=2, num_mels=80, num_prenet_features=256):
    super(Decoder, self).__init__()
    self.num_layers = num_layers
    self.hidden_size = hidden_size
    self.num_mels = num_mels
    self.prenet = PreNet(in_features=num_mels, out_features=num_prenet_features)
    self.attention = LocationAttention(encoded_dim=256, query_dim=hidden_size,
                                       attention_dim=128)
    self.rnn = nn.LSTM(input_size=num_prenet_features + 256, hidden_size=hidden_size,
                       num_layers=num_layers, dropout=0.1)
    self.spec_out = nn.Linear(in_features=hidden_size + 256, out_features=num_mels)
    self.stop_out = nn.Linear(in_features=hidden_size + 256, out_features=1)
    self.postnet = PostNet()
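The constructor above only declares the submodules. Purely as an illustration of how they could fit together per decoding step, and consistent with the tensor shapes in the tests, here is a hypothetical helper; the name decode_step, its argument layout, and the choice to concatenate the previous attention context with the prenet output are assumptions, not code from the project. The postnet would typically be applied to the full predicted mel sequence after the decoding loop, so it is omitted here.

import torch


def decode_step(decoder, prev_frame, prev_context, hidden, encoder_out, prev_weights=None):
    """One hypothetical decoding step for the Decoder defined above.
    prev_frame:   (1, batch, num_mels)   last predicted (or teacher-forced) mel frame
    prev_context: (1, batch, 256)        attention context from the previous step
    hidden:       LSTM state tuple (h, c), or None on the first step
    encoder_out:  (input_seq, batch, 256)
    prev_weights: (batch, input_seq, 1)  previous attention weights, or None
    """
    prenet_out = decoder.prenet(prev_frame)                 # (1, batch, num_prenet_features)
    rnn_in = torch.cat([prenet_out, prev_context], dim=2)   # (1, batch, num_prenet_features + 256)
    rnn_out, hidden = decoder.rnn(rnn_in, hidden)           # (1, batch, hidden_size)
    context, weights = decoder.attention(rnn_out, encoder_out, prev_weights)
    proj_in = torch.cat([rnn_out, context], dim=2)          # (1, batch, hidden_size + 256)
    mel_frame = decoder.spec_out(proj_in)                   # (1, batch, num_mels)
    stop_logit = decoder.stop_out(proj_in)                  # (1, batch, 1)
    # the attention returns weights as (1, batch, input_seq); permute them to the
    # (batch, input_seq, 1) layout the tests feed in, so they can be passed straight
    # back in on the next step (an assumption about how the pieces connect)
    next_weights = weights.permute(1, 2, 0)
    return mel_frame, stop_logit, context, hidden, next_weights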