def forward(self, x1, x2):
    """ Computes the forward pass for the attention layer of the
        ABCNN-2 Block.

        Args:
            x1, x2: torch.Tensors of shape
                (batch_size, 1, max_length + width - 1, output_size)
                The outputs from the convolutional layer.

        Returns:
            w1, w2: torch.Tensors of shape
                (batch_size, 1, max_length, output_size)
                The outputs from the attention layer. This layer takes the
                place of the Average Pooling layer seen in the BCNN and
                ABCNN-1 models.
    """
    # Compute attention matrix for outputs of convolutional layer
    A = compute_attention_matrix(x1, x2, self.match_score)

    # Initialize outputs for attention layer
    batch_size = x1.shape[0]
    output_size = x1.shape[3]
    w1 = torch.zeros((batch_size, 1, self.max_length, output_size))
    w2 = torch.zeros((batch_size, 1, self.max_length, output_size))
    w1 = w1.cuda() if x1.is_cuda else w1
    w2 = w2.cuda() if x2.is_cuda else w2

    # Compute the outputs: each output position j is a weighted sum over
    # the window of convolutional outputs [j, j + width), weighted by the
    # row/column sums of the attention matrix
    for j in range(self.max_length):
        for k in range(j, j + self.width):
            row_sum = torch.sum(A[:, :, :, k], dim=2, keepdim=True)
            col_sum = torch.sum(A[:, :, k, :], dim=2, keepdim=True)
            row_sum = row_sum.cuda() if x1.is_cuda else row_sum
            col_sum = col_sum.cuda() if x2.is_cuda else col_sum
            w1[:, :, j, :] += row_sum * x1[:, :, k, :]
            w2[:, :, j, :] += col_sum * x2[:, :, k, :]

    return w1, w2
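# --- Hedged sketch (not part of the original code) ---------------------------
# The attention layers in this section both call
# compute_attention_matrix(x1, x2, match_score). As a rough illustration of
# what that helper is assumed to do, the sketch below fills A[b, 0, i, j] with
# match_score(row i of x1, row j of x2). The repo's actual implementation is
# likely vectorized and may differ; all names here are illustrative.
import torch

def _attention_matrix_sketch(x1, x2, match_score):
    # x1: (batch, 1, len1, d), x2: (batch, 1, len2, d)
    batch_size, _, len1, _ = x1.shape
    len2 = x2.shape[2]
    A = torch.zeros(batch_size, 1, len1, len2, device=x1.device)
    for i in range(len1):
        for j in range(len2):
            # match_score is assumed to map two (batch, d) slices to (batch,)
            A[:, 0, i, j] = match_score(x1[:, 0, i, :], x2[:, 0, j, :])
    return A

def _manhattan_sketch(u, v):
    # A plausible Manhattan-distance-based similarity (assumption):
    # closer rows score higher, values lie in (0, 1]
    return 1.0 / (1.0 + torch.sum(torch.abs(u - v), dim=-1))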
def forward(self, x1, x2):
    """ Computes the forward pass for the attention layer of the
        ABCNN-1 Block.

        Args:
            x1, x2: torch.Tensors of shape
                (batch_size, 1, max_length, input_size)
                The inputs to the ABCNN-1 Block.

        Returns:
            attn1, attn2: torch.Tensors of shape
                (batch_size, 2, max_length, input_size)
                The output of the attention layer for the ABCNN-1 Block.
    """
    # Get attention matrix and its transpose
    A = compute_attention_matrix(x1, x2, self.match_score)
    A = A.cuda() if self.W1.is_cuda else A
    A_t = A.permute(0, 1, 3, 2)

    # Compute attention feature maps
    a1 = torch.matmul(A, self.W1)
    a2 = torch.matmul(A_t, self.W2)

    # Stack attention feature maps with inputs along the channel dimension
    attn1 = torch.cat([x1, a1], dim=1)
    attn2 = torch.cat([x2, a2], dim=1)
    return attn1, attn2
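# --- Hedged sketch (not part of the original code) ---------------------------
# A minimal shape check for the ABCNN-1 attention layer above, assuming W1 and
# W2 are learned parameters of shape (max_length, input_size) and that the
# attention matrix A has shape (batch_size, 1, max_length, max_length). All
# names and sizes below are illustrative, not taken from the repo.
import torch

batch_size, max_length, input_size = 4, 20, 300
x1 = torch.randn(batch_size, 1, max_length, input_size)
x2 = torch.randn(batch_size, 1, max_length, input_size)
A = torch.rand(batch_size, 1, max_length, max_length)
W1 = torch.randn(max_length, input_size)

a1 = torch.matmul(A, W1)            # (batch_size, 1, max_length, input_size)
attn1 = torch.cat([x1, a1], dim=1)  # (batch_size, 2, max_length, input_size)
assert attn1.shape == (batch_size, 2, max_length, input_size)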
# Create directory to store plots
prefix = "example{}".format(i)
plot_dir = os.path.join(args.output_dir, prefix)
if not os.path.exists(plot_dir):
    os.mkdir(plot_dir)

# Get features for each question
x0 = features[0].view(1, 1, max_length, embeddings_size)
x1 = features[1].view(1, 1, max_length, embeddings_size)

# Store all-ap outputs for input layer
outputs0.append(all_ap(x0))
outputs1.append(all_ap(x1))

# Generate initial attention distribution
A = compute_attention_matrix(x0, x1, manhattan)
A = A.squeeze().cpu().numpy()
filename = "{}_input_attn.png".format(prefix)
filepath = os.path.join(plot_dir, filename)
plot_attention_matrix(A, example[0], example[1], filepath)

# Generate attention distribution for blocks
for j, block in enumerate(blocks):

    # Get outputs for next block
    x0, x1 = x0.detach(), x1.detach()
    x0, x1, a0, a1 = block(x0, x1)

    # Sanity check: no NaNs in the block outputs
    assert not torch.isnan(x0).any()
    assert not torch.isnan(x1).any()
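    # --- Hedged sketch (not part of the original script) ----------------------
    # The original loop body is truncated here. A plausible continuation,
    # mirroring the input-layer plot above, would save the attention
    # distribution for this block's outputs. The filename pattern
    # "{}_block{}_attn.png" is an assumption, not taken from the repo.
    A = compute_attention_matrix(x0, x1, manhattan)
    A = A.squeeze().cpu().numpy()
    filename = "{}_block{}_attn.png".format(prefix, j)
    filepath = os.path.join(plot_dir, filename)
    plot_attention_matrix(A, example[0], example[1], filepath)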