from torch.nn import LayerNorm


def forward(self, x):
    """
    Forward pass of one encoder stack

    Args:
        x (torch.Tensor): The input sequence (B, T, dmodel)
    """
    # Self-attention sub-layer: queries, keys, and values all come from x.
    attention = self.attention(x, x, x)
    # Residual connection around the sub-layer, then layer normalisation
    # (the post-norm arrangement of the original Transformer).
    # NOTE: constructing LayerNorm inside forward() re-initialises its affine
    # parameters on every call, so they are never trained; the norm layers
    # should be registered in __init__ instead (see the sketch below).
    output_1 = x + attention
    output_1 = LayerNorm(output_1.size()[1:])(output_1)
    # Position-wise feed-forward sub-layer with its own residual and norm.
    feed_forward = self.feed_forward(output_1)
    transformed_skip = output_1 + feed_forward
    return LayerNorm(transformed_skip.size()[1:])(transformed_skip)
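# A minimal sketch of the same encoder stack with the sub-layers and norms
# registered once in __init__, so the LayerNorm parameters are trainable.
# EncoderBlock, dff, and the use of nn.MultiheadAttention in place of
# self.attention are illustrative assumptions, not the code above.
import torch.nn as nn


class EncoderBlock(nn.Module):
    def __init__(self, dmodel, nheads, dff):
        super().__init__()
        # batch_first=True keeps the (B, T, dmodel) layout used above.
        self.attention = nn.MultiheadAttention(dmodel, nheads, batch_first=True)
        self.feed_forward = nn.Sequential(
            nn.Linear(dmodel, dff), nn.ReLU(), nn.Linear(dff, dmodel)
        )
        # nn.LayerNorm(dmodel) normalises over the feature dimension only.
        self.norm_1 = nn.LayerNorm(dmodel)
        self.norm_2 = nn.LayerNorm(dmodel)

    def forward(self, x):
        # nn.MultiheadAttention returns (output, attention_weights).
        attention, _ = self.attention(x, x, x)
        output_1 = self.norm_1(x + attention)
        return self.norm_2(output_1 + self.feed_forward(output_1))


# Usage: EncoderBlock(dmodel=512, nheads=8, dff=2048)(torch.randn(2, 16, 512))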
def forward(self, x, y):
    """
    Forward pass of one decoder stack

    Args:
        x (torch.Tensor): The encoded sequence (B, T, dmodel)
        y (torch.Tensor): The shifted decoded sequence (B, T, dmodel)
    """
    # Masked self-attention over the shifted decoded sequence, so each
    # position can only attend to positions before it.
    attention_1 = self.attention_1(y, y, y, maskout=True)
    output_1 = LayerNorm(attention_1.size()[1:])(attention_1 + y)
    # Cross-attention: queries come from the decoder, keys and values
    # from the encoder output x.
    attention_2 = self.attention_2(output_1, x, x)
    output_2 = LayerNorm(attention_2.size()[1:])(attention_2 + output_1)
    # Position-wise feed-forward sub-layer with residual and norm.
    ff = self.feed_forward(output_2)
    transformed_skip = LayerNorm(output_2.size()[1:])(ff + output_2)
    # x is returned unchanged so that decoder stacks can be chained.
    return x, transformed_skip
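# maskout=True is this code's own flag on attention_1. A sketch of how the
# causal mask it implies is typically built (an assumption about the
# mechanism, not the author's implementation):
import torch


def causal_mask(T):
    # Boolean (T, T) mask; True marks the future positions each query
    # must not see. Filling those logits with -inf before the softmax
    # zeroes their attention weights.
    return torch.triu(torch.ones(T, T, dtype=torch.bool), diagonal=1)


# Applied to attention logits of shape (B, nheads, T, T):
#     scores = scores.masked_fill(causal_mask(T), float("-inf"))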