def step(self, Ybar_t: torch.Tensor,
         dec_state: Tuple[torch.Tensor, torch.Tensor],
         enc_hiddens: torch.Tensor,
         enc_hiddens_proj: torch.Tensor,
         enc_masks: torch.Tensor) -> Tuple[Tuple, torch.Tensor, torch.Tensor]:
    """Compute one forward step of the LSTM decoder, including the attention computation.

    @param Ybar_t (Tensor): Concatenated Tensor of [Y_t, o_prev], with shape (b, e + h). The input for
        the decoder, where b = batch size, e = embedding size, h = hidden size.
    @param dec_state (tuple(Tensor, Tensor)): Tuple of tensors both with shape (b, h), where b = batch size,
        h = hidden size. First tensor is the decoder's prev hidden state, second is the decoder's prev cell.
    @param enc_hiddens (Tensor): Encoder hidden states Tensor, with shape (b, src_len, h * 2), where
        b = batch size, src_len = maximum source length, h = hidden size.
    @param enc_hiddens_proj (Tensor): Encoder hidden states Tensor, projected from (h * 2) to h, with shape
        (b, src_len, h), where b = batch size, src_len = maximum source length, h = hidden size.
    @param enc_masks (Tensor): Tensor of sentence masks of shape (b, src_len), where b = batch size,
        src_len = maximum source length.

    @returns dec_state (tuple(Tensor, Tensor)): Tuple of tensors both of shape (b, h), where b = batch size,
        h = hidden size. First tensor is the decoder's new hidden state, second is the decoder's new cell.
    @returns combined_output (Tensor): Combined output Tensor at timestep t, of shape (b, h), where
        b = batch size, h = hidden size.
    @returns e_t (Tensor): Tensor of shape (b, src_len). It is the attention scores distribution.
        Note: You will not use this outside of this function. We are simply returning this value
        so that we can sanity check your implementation.
    """
    combined_output = None

    # Apply the decoder to `Ybar_t` and `dec_state` to obtain the new dec_state:
    # h_t^{dec}, c_t^{dec} = decoder(\bar{y}_t, h_{t-1}^{dec}, c_{t-1}^{dec})
    dec_state = self.decoder(Ybar_t, dec_state)
    # Split dec_state into its two parts: (b, 2 * h) -> ((b, h), (b, h))
    (dec_hidden, dec_cell) = dec_state

    # Batched matrix multiplication: (b, src_len, h) @ (b, h, 1) -> (b, src_len, 1) -> (b, src_len).
    # unsqueeze returns a new tensor with a dimension of size one inserted at the specified position.
    # e_{t,i} = (h_t^{dec})^T W_{attProj} h_i^{enc}
    e_t = enc_hiddens_proj.bmm(dec_hidden.unsqueeze(2)).squeeze(2)

    # Set e_t to -inf where enc_masks has 1
    if enc_masks is not None:
        e_t.data.masked_fill_(enc_masks.byte(), -float('inf'))

    # alpha_t = softmax(e_t): (b, src_len) -> (b, 1, src_len)
    alpha_t = torch.unsqueeze(F.softmax(e_t, dim=1), dim=1)
    # (b, 1, src_len) @ (b, src_len, 2 * h) -> (b, 1, 2 * h) -> (b, 2 * h)
    # a_t = \sum_i alpha_{t,i} h_i^{enc}
    a_t = torch.squeeze(torch.bmm(alpha_t, enc_hiddens), dim=1)
    # u_t = [a_t; h_t^{dec}]
    U_t = torch.cat((a_t, dec_hidden), dim=1)
    # v_t = W_u u_t
    V_t = self.combined_output_projection(U_t)
    # o_t = dropout(tanh(v_t))
    O_t = self.dropout(torch.tanh(V_t))

    combined_output = O_t
    return dec_state, combined_output, e_t

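# A minimal, self-contained shape-check sketch of the multiplicative attention used in `step`
# above, with randomly initialized tensors standing in for the decoder state and encoder outputs.
# The batch size, source length, and hidden size below are illustrative assumptions, not values
# from the original model.
import torch
import torch.nn.functional as F

b, src_len, h = 4, 7, 16
dec_hidden = torch.randn(b, h)                 # h_t^{dec}
enc_hiddens = torch.randn(b, src_len, 2 * h)   # bidirectional encoder states
enc_hiddens_proj = torch.randn(b, src_len, h)  # W_{attProj} h^{enc}

e_t = enc_hiddens_proj.bmm(dec_hidden.unsqueeze(2)).squeeze(2)   # (b, src_len)
alpha_t = F.softmax(e_t, dim=1)                                  # (b, src_len)
a_t = alpha_t.unsqueeze(1).bmm(enc_hiddens).squeeze(1)           # (b, 2h)
assert e_t.shape == (b, src_len) and a_t.shape == (b, 2 * h)
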
def forward(self, x: torch.Tensor) -> torch.Tensor:
    """
    x: shape [bs, nums, ebd], where ebd must equal self._input_dim
    out: shape [bs, nums], attention weights
    """
    if x.shape[-1] != self._input_dim:
        raise ConfigurationError(
            "The last dim of the input must equal the construction parameter "
            "'input_dim'. Please check.")

    if self._need_mlp:
        bs, nums, ebd = x.shape
        x_combine = x.reshape(bs * nums, ebd)
        x_combine = self._activation(self._mlp(x_combine))
        x = x_combine.reshape(bs, nums, -1)

    bs = x.shape[0]
    w = self._weight.unsqueeze(0).unsqueeze(2)  # shape [1, ebd, 1]
    w = w.expand(bs, -1, -1)                    # shape [bs, ebd, 1]
    logits = x.bmm(w).squeeze(-1)               # shape [bs, nums]
    if self._normalize:
        return F.softmax(logits, dim=1)
    else:
        return logits

def inverse_depth_to_camera_coords(inverse_depth: torch.Tensor,
                                   intrinsics_inv: torch.Tensor) -> torch.Tensor:
    """Transform coordinates in the pixel frame to the camera frame.

    Args:
        inverse_depth: inverse depth map -- [B, 1, H, W]
        intrinsics_inv: inverse intrinsics matrix for each element of the batch -- [B, 3, 3]
    Returns:
        array of (X, Y, Z) camera coordinates -- [B, 3, H, W]
    """
    b, _, h, w = inverse_depth.shape

    # compose homogeneous pixel coordinates
    i_range = torch.arange(h).view(1, h, 1).expand(1, h, w).type_as(inverse_depth)  # [1, H, W]
    j_range = torch.arange(w).view(1, 1, w).expand(1, h, w).type_as(inverse_depth)  # [1, H, W]
    ones = torch.ones(1, h, w).type_as(inverse_depth)
    pixel_coords = torch.stack((j_range, i_range, ones), dim=1)  # [1, 3, H, W]

    # expand to batch
    pixel_coords = pixel_coords.expand(b, 3, h, w)  # [B, 3, H, W]
    pixel_coords_flat = pixel_coords.view(*pixel_coords.shape[:2], -1)  # [B, 3, H*W]
    camera_coords = intrinsics_inv.bmm(pixel_coords_flat).view_as(pixel_coords)

    # scale by depth (dividing by the inverse depth multiplies by the depth)
    # assert inverse_depth.min().item() >= 10e-5  # avoid division by zero
    return camera_coords / inverse_depth

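# A minimal usage sketch for `inverse_depth_to_camera_coords`, assuming the function above is in
# scope. The intrinsics values (focal length and principal point) are made up for illustration.
import torch

B, H, W = 2, 4, 6
K = torch.tensor([[100.0, 0.0, W / 2],
                  [0.0, 100.0, H / 2],
                  [0.0, 0.0, 1.0]])
intrinsics_inv = K.inverse().unsqueeze(0).expand(B, 3, 3)
inverse_depth = torch.rand(B, 1, H, W) + 0.1   # keep values away from zero

cam_coords = inverse_depth_to_camera_coords(inverse_depth, intrinsics_inv)
assert cam_coords.shape == (B, 3, H, W)
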
def step(self, Ybar_t: torch.Tensor,
         dec_state: Tuple[torch.Tensor, torch.Tensor],
         enc_hiddens: torch.Tensor,
         enc_hiddens_proj: torch.Tensor,
         enc_masks: torch.Tensor) -> Tuple[Tuple, torch.Tensor, torch.Tensor]:
    """Compute one forward step of the LSTM decoder, including the attention computation.

    @param Ybar_t (Tensor): Concatenated Tensor of [Y_t, o_prev], with shape (b, e + h). The input for
        the decoder, where b = batch size, e = embedding size, h = hidden size.
    @param dec_state (tuple(Tensor, Tensor)): Tuple of tensors both with shape (b, h), where b = batch size,
        h = hidden size. First tensor is the decoder's prev hidden state, second is the decoder's prev cell.
    @param enc_hiddens (Tensor): Encoder hidden states Tensor, with shape (b, src_len, h * 2), where
        b = batch size, src_len = maximum source length, h = hidden size.
    @param enc_hiddens_proj (Tensor): Encoder hidden states Tensor, projected from (h * 2) to h, with shape
        (b, src_len, h), where b = batch size, src_len = maximum source length, h = hidden size.
    @param enc_masks (Tensor): Tensor of sentence masks of shape (b, src_len), where b = batch size,
        src_len = maximum source length.

    @returns dec_state (tuple(Tensor, Tensor)): Tuple of tensors both of shape (b, h), where b = batch size,
        h = hidden size. First tensor is the decoder's new hidden state, second is the decoder's new cell.
    @returns combined_output (Tensor): Combined output Tensor at timestep t, of shape (b, h), where
        b = batch size, h = hidden size.
    @returns e_t (Tensor): Tensor of shape (b, src_len). It is the attention scores distribution.
        Note: You will not use this outside of this function. We are simply returning this value
        so that we can sanity check your implementation.
    """
    combined_output = None

    ### COPY OVER YOUR CODE FROM ASSIGNMENT 4
    # 1. Apply the decoder to `Ybar_t` and `dec_state` to obtain the new dec_state.
    dec_state = self.decoder(Ybar_t, dec_state)
    # 2. Split dec_state into its two parts.
    (dec_hidden, dec_cell) = dec_state
    # 3. (b, src_len, h) @ (b, h, 1) -> (b, src_len, 1) -> (b, src_len)
    e_t = enc_hiddens_proj.bmm(dec_hidden.unsqueeze(2)).squeeze(2)
    ### END YOUR CODE FROM ASSIGNMENT 4

    # Set e_t to -inf where enc_masks has 1
    if enc_masks is not None:
        # e_t.data.masked_fill_(enc_masks.byte(), -float('inf'))
        e_t.data.masked_fill_(enc_masks.bool(), -float('inf'))

    ### COPY OVER YOUR CODE FROM ASSIGNMENT 4
    # 1. Apply softmax to e_t.
    alpha_t = F.softmax(e_t, dim=1)  # (b, src_len)
    # 2. (b, 1, src_len) @ (b, src_len, 2h) = (b, 1, 2h) -> (b, 2h)
    # a_t = e_t.unsqueeze(1).bmm(enc_hiddens).squeeze(1)
    att_view = (alpha_t.size(0), 1, alpha_t.size(1))
    a_t = torch.bmm(alpha_t.view(*att_view), enc_hiddens).squeeze(1)
    # 3. Concatenate a_t (b, 2h) and dec_hidden (b, h) to get U_t (b, 3h).
    U_t = torch.cat((a_t, dec_hidden), dim=1)
    # 4. Apply the combined output projection to U_t -> V_t, shape (b, h).
    V_t = self.combined_output_projection(U_t)
    O_t = self.dropout(torch.tanh(V_t))
    ### END YOUR CODE FROM ASSIGNMENT 4

    combined_output = O_t
    return dec_state, combined_output, e_t

def weighted_sum(matrix: torch.Tensor, attention: torch.Tensor) -> torch.Tensor:
    """
    Takes a matrix of vectors and a set of weights over the rows in the matrix (which we call an
    "attention" vector), and returns a weighted sum of the rows in the matrix. This is the typical
    computation performed after an attention mechanism.

    Note that while we call this a "matrix" of vectors and an attention "vector", we also handle
    higher-order tensors. We always sum over the second-to-last dimension of the "matrix", and we
    assume that all dimensions in the "matrix" prior to the last dimension are matched in the
    "vector". Non-matched dimensions in the "vector" must be `directly after the batch dimension`.

    For example, say I have a "matrix" with dimensions `(batch_size, num_queries, num_words,
    embedding_dim)`. The attention "vector" then must have at least those dimensions, and could
    have more. Both:

        - `(batch_size, num_queries, num_words)` (distribution over words for each query)
        - `(batch_size, num_documents, num_queries, num_words)` (distribution over words in a
          query for each document)

    are valid input "vectors", producing tensors of shape:
    `(batch_size, num_queries, embedding_dim)` and
    `(batch_size, num_documents, num_queries, embedding_dim)` respectively.
    """
    # We'll special-case a few settings here, where there are efficient (but poorly-named)
    # operations in pytorch that already do the computation we need.
    if attention.dim() == 2 and matrix.dim() == 3:
        return attention.unsqueeze(1).bmm(matrix).squeeze(1)
    if attention.dim() == 3 and matrix.dim() == 3:
        return attention.bmm(matrix)
    if matrix.dim() - 1 < attention.dim():
        expanded_size = list(matrix.size())
        for i in range(attention.dim() - matrix.dim() + 1):
            matrix = matrix.unsqueeze(1)
            expanded_size.insert(i + 1, attention.size(i + 1))
        matrix = matrix.expand(*expanded_size)
    intermediate = attention.unsqueeze(-1).expand_as(matrix) * matrix
    return intermediate.sum(dim=-2)

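# A small usage sketch for `weighted_sum` above, illustrating the two documented cases: a single
# attention vector per batch element and one attention vector per query. Shapes are arbitrary
# example values.
import torch

batch_size, num_words, embedding_dim, num_queries = 2, 5, 8, 3
matrix = torch.randn(batch_size, num_words, embedding_dim)

# Case 1: attention is (batch_size, num_words) -> result is (batch_size, embedding_dim)
attention_2d = torch.softmax(torch.randn(batch_size, num_words), dim=-1)
assert weighted_sum(matrix, attention_2d).shape == (batch_size, embedding_dim)

# Case 2: attention is (batch_size, num_queries, num_words)
#         -> result is (batch_size, num_queries, embedding_dim)
attention_3d = torch.softmax(torch.randn(batch_size, num_queries, num_words), dim=-1)
assert weighted_sum(matrix, attention_3d).shape == (batch_size, num_queries, embedding_dim)
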
def forward(self, inputA: Tensor, inputB: Tensor) -> Tensor:
    # flatten spatial dimensions: (B, C, ...) -> (B, C, N)
    inputA = inputA.view(inputA.shape[0], inputA.shape[1], -1)
    inputB = inputB.view(inputB.shape[0], inputB.shape[1], -1)
    # bilinear pooling: (B, C_a, N) @ (B, N, C_b) -> (B, C_a, C_b), then flatten
    bi_vec = inputA.bmm(inputB.permute(0, 2, 1))
    bi_vec = bi_vec.view(bi_vec.shape[0], -1)
    # signed square-root followed by L2 normalization
    bi_vec = torch.sign(bi_vec) * torch.sqrt(torch.abs(bi_vec))
    return F.normalize(bi_vec, dim=-1, p=2)

def scaled_dot_product_attention(query: torch.Tensor,
                                 key: torch.Tensor,
                                 value: torch.Tensor,
                                 mask=None) -> torch.Tensor:
    # (B, T, d) @ (B, d, S) -> (B, T, S)
    temp = query.bmm(key.transpose(1, 2))
    scaled = temp / (query.size(-1) ** 0.5)
    if mask is not None:
        scaled = scaled.masked_fill(mask == 0, -1e9)
    softmax = torch.nn.functional.softmax(scaled, dim=-1)
    return softmax.bmm(value)

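# A brief usage sketch for the masked `scaled_dot_product_attention` above. The mask convention
# follows the function: positions where mask == 0 are suppressed. Shapes are illustrative.
import torch

B, T, S, d = 2, 4, 6, 8
query = torch.randn(B, T, d)
key = torch.randn(B, S, d)
value = torch.randn(B, S, d)

# Mask out the last two source positions for every query position.
mask = torch.ones(B, T, S, dtype=torch.long)
mask[:, :, -2:] = 0

out = scaled_dot_product_attention(query, key, value, mask=mask)
assert out.shape == (B, T, d)
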
def transform_points_Rt(points: torch.Tensor,
                        viewpoint: torch.Tensor,
                        inverse: bool = False):
    N, H, W = viewpoint.shape
    assert H == 3 and W == 4, "Rt is B x 3 x 4"
    t = viewpoint[:, :, 3]
    r = viewpoint[:, :, 0:3]

    # transpose r to handle the fact that P is num_points x 3
    # yT = (RX)T = XT @ RT
    r = r.transpose(1, 2).contiguous()

    # invert if needed
    if inverse:
        points = points - t[:, None, :]
        points = points.bmm(r.inverse())
    else:
        points = points.bmm(r)
        points = points + t[:, None, :]

    return points

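# A quick round-trip sketch for `transform_points_Rt` above: applying a rigid transform and then
# its inverse should recover the original points up to numerical error. The rotation and
# translation below are arbitrary example values.
import torch

B, P = 2, 10
# Build a batch of valid rotations from random matrices via QR decomposition.
Q, _ = torch.linalg.qr(torch.randn(B, 3, 3))
t = torch.randn(B, 3, 1)
viewpoint = torch.cat([Q, t], dim=2)  # B x 3 x 4

points = torch.randn(B, P, 3)
transformed = transform_points_Rt(points, viewpoint, inverse=False)
recovered = transform_points_Rt(transformed, viewpoint, inverse=True)
assert torch.allclose(recovered, points, atol=1e-5)
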
def weighted_sum(matrix: torch.Tensor, attention: torch.Tensor) -> torch.Tensor:
    if attention.dim() == 2 and matrix.dim() == 3:
        return attention.unsqueeze(1).bmm(matrix).squeeze(1)
    if attention.dim() == 3 and matrix.dim() == 3:
        return attention.bmm(matrix)
    if matrix.dim() - 1 < attention.dim():
        expanded_size = list(matrix.size())
        for i in range(attention.dim() - matrix.dim() + 1):
            matrix = matrix.unsqueeze(1)
            expanded_size.insert(i + 1, attention.size(i + 1))
        matrix = matrix.expand(*expanded_size)
    intermediate = attention.unsqueeze(-1).expand_as(matrix) * matrix
    return intermediate.sum(dim=-2)

def forward(self, memory: torch.Tensor, answer: torch.Tensor) -> torch.Tensor:
    """
    Inputs:
        memory: tensor of shape bsz * hdim
        answer: tensor of shape bsz * hdim
    Outputs:
        prob: tensor of shape bsz
    """
    Ua = self.combination(answer)
    memory = memory.unsqueeze(1)          # bsz * 1 * hdim
    Ua = Ua.unsqueeze(1)                  # bsz * 1 * hdim
    mUa = memory.bmm(Ua.transpose(1, 2))  # bsz * 1 * 1
    return mUa.view(-1)

def _forward_internal(self,
                      matrix1: torch.Tensor,
                      matrix2: torch.Tensor) -> torch.Tensor:
    """
    Args:
        matrix1 : Tensor of shape (batch_size, seq_len1, hdim1)
        matrix2 : Tensor of shape (batch_size, seq_len2, hdim2)
    Output:
        alpha : Tensor of shape (batch_size, seq_len1, seq_len2)
    """
    # Shape: (batch_size, seq_len_2, hdim1)
    Wy = self._weights(matrix2)
    # Shape: (batch_size, seq_len_1, seq_len_2)
    alpha = matrix1.bmm(Wy.transpose(-2, -1))
    return alpha

def dot_product_score(queries: Tensor, keys: Tensor, scaled: bool = False):
    '''
    Input:
        - queries: [B, T, A]
        - keys:    [B, S, A]
    Output:
        - score:   [B, T, S]
    '''
    # [B, T, A] x [B, A, S] = [B, T, S]
    if scaled:
        attn_dim = queries.size(-1)
        queries = queries / (attn_dim ** 0.5)
    score = queries.bmm(keys.transpose(1, 2))  # [B, T, S]
    return score

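# A short sketch showing `dot_product_score` above combined with a softmax to produce attention
# weights over the source positions. Tensor sizes are illustrative.
import torch
import torch.nn.functional as F

B, T, S, A = 2, 3, 5, 8
queries = torch.randn(B, T, A)
keys = torch.randn(B, S, A)

score = dot_product_score(queries, keys, scaled=True)   # [B, T, S]
weights = F.softmax(score, dim=-1)                      # rows sum to 1 over S
assert torch.allclose(weights.sum(dim=-1), torch.ones(B, T))
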
def forward(self, p_seq: torch.Tensor, q: torch.Tensor, p_mask: torch.Tensor):
    """
    Input:
        p_seq:  batch_size * p_seq_len * p_hidden_dim
        q:      batch_size * q_hidden_dim
        p_mask: batch_size * p_seq_len (1 for padding, 0 for true)
    Output:
        attn_scores: batch_size * p_seq_len
    """
    Wq = self.linear(q) if self.linear is not None else q
    # (b, p_seq_len, h) @ (b, h, 1) -> (b, p_seq_len, 1) -> (b, p_seq_len)
    pWq = p_seq.bmm(Wq.unsqueeze(2)).squeeze(2)
    pWq.data.masked_fill_(p_mask.data, -float("inf"))
    attn_scores = F.softmax(pWq, dim=-1) if self.normalize else pWq.exp()
    return attn_scores

def scaled_dot_product_attention(query: Tensor, key: Tensor, value: Tensor) -> Tensor:
    '''
    Input:
        query = <X, W_q> => (L, d_k)
        key   = <X, W_k> => (L, d_k)
        value = <X, W_v> => (L, d_v)
    Returns:
        Self Attention Matrix (L, d_v)
    '''
    temp = query.bmm(key.transpose(1, 2))
    scale = query.size(-1) ** 0.5
    softmax = f.softmax(temp / scale, dim=-1)
    return softmax.bmm(value)

def scaled_dot_product_attention(query: Tensor,
                                 key: Tensor,
                                 value: Tensor,
                                 mask: Union[None, Tensor] = None) -> Tensor:
    similarity = query.bmm(key.transpose(1, 2))
    scale = query.size(-1) ** 0.5
    if mask is not None:
        similarity = similarity.masked_fill(mask, float('-inf'))
    softmax = F.softmax(similarity / scale, dim=-1)
    return softmax.bmm(value)

def attention(query: torch.Tensor,
              key: torch.Tensor,
              value: torch.Tensor,
              enc_masks: torch.Tensor = None) -> torch.Tensor:
    query_unsqueezed = query.unsqueeze(dim=2)                # (b, h, 1)
    score = key.bmm(query_unsqueezed)                        # (b, src_len, 1)
    score = score.squeeze(dim=2)                             # (b, src_len)

    if enc_masks is not None:
        score.data.masked_fill_(enc_masks.bool(), -float('inf'))

    attention_weights = softmax(score, dim=1)                # (b, src_len)
    attention_weights = attention_weights.unsqueeze(dim=1)   # (b, 1, src_len)
    context_vector = attention_weights.bmm(value)            # (b, 1, value_dim)
    context_vector = context_vector.squeeze(dim=1)           # (b, value_dim)
    return attention_weights, context_vector

def step(self, Ybar_t: torch.Tensor,
         dec_state: Tuple[torch.Tensor, torch.Tensor],
         enc_hiddens: torch.Tensor,
         enc_hiddens_proj: torch.Tensor,
         enc_masks: torch.Tensor) -> Tuple[Tuple, torch.Tensor, torch.Tensor]:
    """Compute one forward step of the LSTM decoder, including the attention computation.

    @param Ybar_t (Tensor): Concatenated Tensor of [Y_t, o_prev], with shape (b, e + h). The input for
        the decoder, where b = batch size, e = embedding size, h = hidden size.
    @param dec_state (tuple(Tensor, Tensor)): Tuple of tensors both with shape (b, h), where b = batch size,
        h = hidden size. First tensor is the decoder's prev hidden state, second is the decoder's prev cell.
    @param enc_hiddens (Tensor): Encoder hidden states Tensor, with shape (b, src_len, h * 2), where
        b = batch size, src_len = maximum source length, h = hidden size.
    @param enc_hiddens_proj (Tensor): Encoder hidden states Tensor, projected from (h * 2) to h, with shape
        (b, src_len, h), where b = batch size, src_len = maximum source length, h = hidden size.
    @param enc_masks (Tensor): Tensor of sentence masks of shape (b, src_len), where b = batch size,
        src_len = maximum source length.

    @returns dec_state (tuple(Tensor, Tensor)): Tuple of tensors both of shape (b, h), where b = batch size,
        h = hidden size. First tensor is the decoder's new hidden state, second is the decoder's new cell.
    @returns combined_output (Tensor): Combined output Tensor at timestep t, of shape (b, h), where
        b = batch size, h = hidden size.
    @returns e_t (Tensor): Tensor of shape (b, src_len). It is the attention scores distribution.
        Note: You will not use this outside of this function. We are simply returning this value
        so that we can sanity check your implementation.
    """
    combined_output = None

    ### YOUR CODE HERE (~3 Lines)
    ### TODO:
    ### 1. Apply the decoder to `Ybar_t` and `dec_state` to obtain the new dec_state.
    dec_state = self.decoder(Ybar_t, dec_state)
    ### 2. Split dec_state into its two parts (dec_hidden, dec_cell)
    dec_hidden = dec_state[0]  # shape (b, h)
    dec_cell = dec_state[1]    # shape (b, h)
    ### 3. Compute the attention scores e_t, a Tensor of shape (b, src_len).
    ###    Note: b = batch_size, src_len = maximum source length, h = hidden size.
    # first attempt: e_t = torch.bmm(dec_hidden, torch.bmm(enc_hiddens_proj, enc_hidden))
    # (b, src_len, h) @ (b, h, 1) -> (b, src_len, 1) -> (b, src_len)
    e_t = enc_hiddens_proj.bmm(dec_hidden.unsqueeze(2)).squeeze(2)

def weighted_sum(matrix: torch.Tensor, attention: torch.Tensor) -> torch.Tensor:
    """
    Takes a matrix of vectors and a set of weights over the rows in the matrix (which we call an
    "attention" vector), and returns a weighted sum of the rows in the matrix. This is the typical
    computation performed after an attention mechanism.

    Note that while we call this a "matrix" of vectors and an attention "vector", we also handle
    higher-order tensors. We always sum over the second-to-last dimension of the "matrix", and we
    assume that all dimensions in the "matrix" prior to the last dimension are matched in the
    "vector". Non-matched dimensions in the "vector" must be `directly after the batch dimension`.

    For example, say I have a "matrix" with dimensions ``(batch_size, num_queries, num_words,
    embedding_dim)``. The attention "vector" then must have at least those dimensions, and could
    have more. Both:

        - ``(batch_size, num_queries, num_words)`` (distribution over words for each query)
        - ``(batch_size, num_documents, num_queries, num_words)`` (distribution over words in a
          query for each document)

    are valid input "vectors", producing tensors of shape:
    ``(batch_size, num_queries, embedding_dim)`` and
    ``(batch_size, num_documents, num_queries, embedding_dim)`` respectively.
    """
    # We'll special-case a few settings here, where there are efficient (but poorly-named)
    # operations in pytorch that already do the computation we need.
    if attention.dim() == 2 and matrix.dim() == 3:
        return attention.unsqueeze(1).bmm(matrix).squeeze(1)
    if attention.dim() == 3 and matrix.dim() == 3:
        return attention.bmm(matrix)
    if matrix.dim() - 1 < attention.dim():
        expanded_size = list(matrix.size())
        for i in range(attention.dim() - matrix.dim() + 1):
            matrix = matrix.unsqueeze(1)
            expanded_size.insert(i + 1, attention.size(i + 1))
        matrix = matrix.expand(*expanded_size)
    intermediate = attention.unsqueeze(-1).expand_as(matrix) * matrix
    return intermediate.sum(dim=-2)

def matrix_cosine_similarity(x: torch.Tensor, y: torch.Tensor, eps: float = 1e-8):
    """
    :param x: (batch_size, length_1, dim)
    :param y: (batch_size, length_2, dim)
    :return: (batch_size, length_1, length_2)
    """
    length_1, length_2 = x.size(1), y.size(1)

    # shape: (batch_size, length_1, length_2)
    dot_product = x.bmm(y.permute(0, 2, 1))

    # shape: (batch_size, length_1), (batch_size, length_2)
    x_norm, y_norm = x.norm(dim=-1, p=None), y.norm(dim=-1, p=None)
    # added eps for numerical stability
    x_norm = torch.max(x_norm, eps * x_norm.new_ones(x_norm.size()))
    y_norm = torch.max(y_norm, eps * y_norm.new_ones(y_norm.size()))
    expanded_x_norm = x_norm.unsqueeze(-1).repeat(1, 1, length_2)
    expanded_y_norm = y_norm.unsqueeze(1).repeat(1, length_1, 1)

    # shape: (batch_size, length_1, length_2)
    norm = expanded_x_norm * expanded_y_norm
    similarity = dot_product / norm
    return similarity

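# A small sanity-check sketch for `matrix_cosine_similarity` above: its entries should match
# torch.nn.functional.cosine_similarity computed pairwise on the same vectors (which broadcasts
# over the unsqueezed dimensions). Shapes are illustrative.
import torch
import torch.nn.functional as F

batch_size, length_1, length_2, dim = 2, 3, 4, 6
x = torch.randn(batch_size, length_1, dim)
y = torch.randn(batch_size, length_2, dim)

sim = matrix_cosine_similarity(x, y)                                # (batch_size, length_1, length_2)
reference = F.cosine_similarity(x.unsqueeze(2), y.unsqueeze(1), dim=-1)
assert torch.allclose(sim, reference, atol=1e-5)
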
def forward(self, matrix_1: torch.Tensor, matrix_2: torch.Tensor) -> torch.Tensor:
    intermediate = matrix_1.bmm(self._weight_matrix.unsqueeze(0))
    return self._activation(
        intermediate.bmm(matrix_2.transpose(1, 2)) + self._bias)

def step(self, Ybar_t: torch.Tensor,
         dec_state: Tuple[torch.Tensor, torch.Tensor],
         enc_hiddens: torch.Tensor,
         enc_hiddens_proj: torch.Tensor,
         enc_masks: torch.Tensor) -> Tuple[Tuple, torch.Tensor, torch.Tensor]:
    """Compute one forward step of the LSTM decoder, including the attention computation.

    @param Ybar_t (Tensor): Concatenated Tensor of [Y_t, o_prev], with shape (b, e + h). The input for
        the decoder, where b = batch size, e = embedding size, h = hidden size.
    @param dec_state (tuple(Tensor, Tensor)): Tuple of tensors both with shape (b, h), where b = batch size,
        h = hidden size. First tensor is the decoder's prev hidden state, second is the decoder's prev cell.
    @param enc_hiddens (Tensor): Encoder hidden states Tensor, with shape (b, src_len, h * 2), where
        b = batch size, src_len = maximum source length, h = hidden size.
    @param enc_hiddens_proj (Tensor): Encoder hidden states Tensor, projected from (h * 2) to h, with shape
        (b, src_len, h), where b = batch size, src_len = maximum source length, h = hidden size.
    @param enc_masks (Tensor): Tensor of sentence masks of shape (b, src_len), where b = batch size,
        src_len = maximum source length.

    @returns dec_state (tuple(Tensor, Tensor)): Tuple of tensors both of shape (b, h), where b = batch size,
        h = hidden size. First tensor is the decoder's new hidden state, second is the decoder's new cell.
    @returns combined_output (Tensor): Combined output Tensor at timestep t, of shape (b, h), where
        b = batch size, h = hidden size.
    @returns e_t (Tensor): Tensor of shape (b, src_len). It is the attention scores distribution.
        Note: You will not use this outside of this function. We are simply returning this value
        so that we can sanity check your implementation.
    """
    combined_output = None

    ### Apply the decoder to `Ybar_t` and `dec_state` to obtain the new dec_state.
    dec_state = self.decoder(Ybar_t, dec_state)  # Ybar_t: (b, e + h)
    ### Split dec_state into its two parts (dec_hidden, dec_cell).
    (dec_hidden, dec_cell) = dec_state  # dec_hidden: (b, h), dec_cell: (b, h)
    ### Compute the attention scores e_t, a Tensor of shape (b, src_len).
    # enc_hiddens_proj: (b, src_len, h), dec_hidden.unsqueeze(2): (b, h, 1)
    # -> (b, src_len, 1) -> e_t: (b, src_len)
    e_t = enc_hiddens_proj.bmm(dec_hidden.unsqueeze(2)).squeeze(2)

    # Set e_t to -inf where enc_masks has 1
    if enc_masks is not None:
        e_t.data.masked_fill_(enc_masks.byte(), -float('inf'))

    ### Apply softmax to e_t to yield alpha_t.
    alpha_t = F.softmax(e_t, dim=1)  # e_t: (b, src_len) -> alpha_t: (b, src_len)
    att_view = (alpha_t.size(0), 1, alpha_t.size(1))
    ### Use batched matrix multiplication between alpha_t and enc_hiddens to obtain the
    ### attention output vector, a_t.
    a_t = torch.bmm(alpha_t.view(*att_view), enc_hiddens).squeeze(1)  # a_t: (b, 2h)
    ### Concatenate dec_hidden with a_t to compute tensor U_t.
    U_t = torch.cat((a_t, dec_hidden), dim=1)  # U_t: (b, 3h)
    ### Apply the combined output projection layer to U_t to compute tensor V_t.
    V_t = self.combined_output_projection(U_t)  # V_t: (b, h)
    ### Compute tensor O_t by first applying the Tanh function and then the dropout layer.
    O_t = self.dropout(torch.tanh(V_t))  # O_t: (b, h)

    combined_output = O_t
    return dec_state, combined_output, e_t

def forward(self, input: Tensor) -> Tensor:
    # flatten spatial dimensions: (B, C, ...) -> (B, C, N)
    input = input.view(input.shape[0], input.shape[1], -1)
    # channel interaction (Gram) matrix: (B, C, N) @ (B, N, C) -> (B, C, C)
    interaction = input.bmm(input.permute(0, 2, 1))
    # apply mat_log_sym to each per-sample interaction matrix and re-stack along the batch dim
    log_interactions = [mat_log_sym(matrix)
                        for matrix in torch.unbind(interaction, dim=0)]
    return torch.stack(log_interactions, dim=0)

def step(self, Ybar_t: torch.Tensor,
         dec_state: Tuple[torch.Tensor, torch.Tensor],
         enc_hiddens: torch.Tensor,
         enc_hiddens_proj: torch.Tensor,
         enc_masks: torch.Tensor) -> Tuple[Tuple, torch.Tensor, torch.Tensor]:
    """Compute one forward step of the LSTM decoder, including the attention computation.

    @param Ybar_t (Tensor): Concatenated Tensor of [Y_t, o_prev], with shape (b, e + h). The input for
        the decoder, where b = batch size, e = embedding size, h = hidden size.
    @param dec_state (tuple(Tensor, Tensor)): Tuple of tensors both with shape (b, h), where b = batch size,
        h = hidden size. First tensor is the decoder's prev hidden state, second is the decoder's prev cell.
    @param enc_hiddens (Tensor): Encoder hidden states Tensor, with shape (b, src_len, h * 2), where
        b = batch size, src_len = maximum source length, h = hidden size.
    @param enc_hiddens_proj (Tensor): Encoder hidden states Tensor, projected from (h * 2) to h, with shape
        (b, src_len, h), where b = batch size, src_len = maximum source length, h = hidden size.
    @param enc_masks (Tensor): Tensor of sentence masks of shape (b, src_len), where b = batch size,
        src_len = maximum source length.

    @returns dec_state (tuple(Tensor, Tensor)): Tuple of tensors both of shape (b, h), where b = batch size,
        h = hidden size. First tensor is the decoder's new hidden state, second is the decoder's new cell.
    @returns combined_output (Tensor): Combined output Tensor at timestep t, of shape (b, h), where
        b = batch size, h = hidden size.
    @returns e_t (Tensor): Tensor of shape (b, src_len). It is the attention scores distribution.
        Note: You will not use this outside of this function. We are simply returning this value
        so that we can sanity check your implementation.
    """
    combined_output = None

    # COPY OVER YOUR CODE FROM ASSIGNMENT 4
    # 1. Apply decoder to Ybar_t and the previous hidden and cell decoder states.
    dec_prev_h, dec_prev_c = dec_state
    dec_state = self.decoder(Ybar_t, (dec_prev_h, dec_prev_c))

    # 2. Split dec_state into its two parts.
    dec_hidden, dec_cell = dec_state  # dec_hidden (b, h), dec_cell (b, h)

    # 3. Compute the multiplicative attention scores.
    dec_hidden_unsqueezed = dec_hidden.unsqueeze(dim=2)  # (b, h, 1)
    # enc_hiddens_proj (b, src_len, h)
    e_t = enc_hiddens_proj.bmm(dec_hidden_unsqueezed)    # (b, src_len, 1)
    e_t = e_t.squeeze(dim=2)                             # (b, src_len)
    # END YOUR CODE FROM ASSIGNMENT 4

    # Set e_t to -inf where enc_masks has 1
    if enc_masks is not None:
        e_t.data.masked_fill_(enc_masks.byte(), -float('inf'))

    # COPY OVER YOUR CODE FROM ASSIGNMENT 4
    # 1. Apply softmax to get the attention distribution.
    alpha_t = torch.nn.functional.softmax(e_t, dim=1)  # (b, src_len)
    alpha_t = alpha_t.unsqueeze(dim=1)                  # (b, 1, src_len)

    # 2. Use bmm to obtain the attention output vector.
    # enc_hiddens (b, src_len, 2h)
    a_t = alpha_t.bmm(enc_hiddens)  # (b, 1, 2h)
    a_t = a_t.squeeze(dim=1)        # (b, 2h)

    # 3. Concatenate dec_hidden with a_t to compute U_t.
    U_t = torch.cat([a_t, dec_hidden], dim=1)  # (b, 3h)

    # 4. Apply the combined output projection layer to U_t to compute V_t.
    V_t = self.combined_output_projection(U_t)  # (b, h)

    # 5. Compute O_t by applying tanh and then dropout.
    O_t = self.dropout(V_t.tanh())  # (b, h)
    # END YOUR CODE FROM ASSIGNMENT 4

    combined_output = O_t
    return dec_state, combined_output, e_t

def forward(self, matrix_1: torch.Tensor, matrix_2: torch.Tensor) -> torch.Tensor:
    return matrix_1.bmm(matrix_2.transpose(2, 1))

def _forward_internal(self, vector: torch.Tensor, matrix: torch.Tensor) -> torch.Tensor:
    sim_score = matrix.bmm(vector.unsqueeze(-1)).squeeze(-1)
    sim_score_scaled = sim_score / math.sqrt(sim_score.size(-1))
    return sim_score_scaled

def step(self, Ybar_t: torch.Tensor,
         dec_state: Tuple[torch.Tensor, torch.Tensor],
         enc_hiddens: torch.Tensor,
         enc_hiddens_proj: torch.Tensor,
         enc_masks: torch.Tensor) -> Tuple[Tuple, torch.Tensor, torch.Tensor]:
    """Compute one forward step of the LSTM decoder, including the attention computation.

    @param Ybar_t (Tensor): Concatenated Tensor of [Y_t, o_prev], with shape (b, e + h). The input for
        the decoder, where b = batch size, e = embedding size, h = hidden size.
    @param dec_state (tuple(Tensor, Tensor)): Tuple of tensors both with shape (b, h), where b = batch size,
        h = hidden size. First tensor is the decoder's prev hidden state, second is the decoder's prev cell.
    @param enc_hiddens (Tensor): Encoder hidden states Tensor, with shape (b, src_len, h * 2), where
        b = batch size, src_len = maximum source length, h = hidden size.
    @param enc_hiddens_proj (Tensor): Encoder hidden states Tensor, projected from (h * 2) to h, with shape
        (b, src_len, h), where b = batch size, src_len = maximum source length, h = hidden size.
    @param enc_masks (Tensor): Tensor of sentence masks of shape (b, src_len), where b = batch size,
        src_len = maximum source length.

    @returns dec_state (tuple(Tensor, Tensor)): Tuple of tensors both of shape (b, h), where b = batch size,
        h = hidden size. First tensor is the decoder's new hidden state, second is the decoder's new cell.
    @returns combined_output (Tensor): Combined output Tensor at timestep t, of shape (b, h), where
        b = batch size, h = hidden size.
    @returns e_t (Tensor): Tensor of shape (b, src_len). It is the attention scores distribution.
        Note: You will not use this outside of this function. We are simply returning this value
        so that we can sanity check your implementation.
    """
    combined_output = None

    ### YOUR CODE HERE (~3 Lines)
    ### TODO:
    ###     1. Apply the decoder to `Ybar_t` and `dec_state` to obtain the new dec_state.
    ###     2. Split dec_state into its two parts (dec_hidden, dec_cell)
    ###     3. Compute the attention scores e_t, a Tensor of shape (b, src_len).
    ###        Note: b = batch_size, src_len = maximum source length, h = hidden size.
    ###
    ###     Hints:
    ###       - dec_hidden is shape (b, h) and corresponds to h^dec_t in the PDF (batched)
    ###       - enc_hiddens_proj is shape (b, src_len, h) and corresponds to W_{attProj} h^enc (batched).
    ###       - Use batched matrix multiplication (torch.bmm) to compute e_t.
    ###       - To get the tensors into the right shapes for bmm, you will need to do some squeezing and unsqueezing.
    ###       - When using the squeeze() function make sure to specify the dimension you want to squeeze
    ###         over. Otherwise, you will remove the batch dimension accidentally, if batch_size = 1.
    ###
    ###     Use the following docs to implement this functionality:
    ###         Batch Multiplication:
    ###             https://pytorch.org/docs/stable/torch.html#torch.bmm
    ###         Tensor Unsqueeze:
    ###             https://pytorch.org/docs/stable/torch.html#torch.unsqueeze
    ###         Tensor Squeeze:
    ###             https://pytorch.org/docs/stable/torch.html#torch.squeeze
    dec_state = self.decoder(Ybar_t, dec_state)
    (dec_hidden, dec_cell) = dec_state
    e_t = enc_hiddens_proj.bmm(dec_hidden.unsqueeze(2)).squeeze(2)
    ### END YOUR CODE

    # Set e_t to -inf where enc_masks has 1
    if enc_masks is not None:
        e_t.data.masked_fill_(enc_masks.byte(), -float('inf'))

    ### YOUR CODE HERE (~6 Lines)
    ### TODO:
    ###     1. Apply softmax to e_t to yield alpha_t
    ###     2. Use batched matrix multiplication between alpha_t and enc_hiddens to obtain the
    ###        attention output vector, a_t.
    ###
    ###     Hints:
    ###       - alpha_t is shape (b, src_len)
    ###       - enc_hiddens is shape (b, src_len, 2h)
    ###       - a_t should be shape (b, 2h)
    ###       - You will need to do some squeezing and unsqueezing.
    ###       Note: b = batch size, src_len = maximum source length, h = hidden size.
    ###
    ###     3. Concatenate dec_hidden with a_t to compute tensor U_t
    ###     4. Apply the combined output projection layer to U_t to compute tensor V_t
    ###     5. Compute tensor O_t by first applying the Tanh function and then the dropout layer.
    ###
    ###     Use the following docs to implement this functionality:
    ###         Softmax:
    ###             https://pytorch.org/docs/stable/nn.html#torch.nn.functional.softmax
    ###         Batch Multiplication:
    ###             https://pytorch.org/docs/stable/torch.html#torch.bmm
    ###         Tensor View:
    ###             https://pytorch.org/docs/stable/tensors.html#torch.Tensor.view
    ###         Tensor Concatenation:
    ###             https://pytorch.org/docs/stable/torch.html#torch.cat
    ###         Tanh:
    ###             https://pytorch.org/docs/stable/torch.html#torch.tanh
    alpha_t = F.softmax(e_t, dim=1)
    alpha_t = alpha_t.unsqueeze(2)
    a_t = enc_hiddens.permute(0, 2, 1).bmm(alpha_t).squeeze(2)
    U_t = torch.cat((a_t, dec_hidden), dim=1)
    V_t = self.combined_output_projection(U_t)
    O_t = self.dropout(torch.tanh(V_t))
    ### END YOUR CODE

    combined_output = O_t
    return dec_state, combined_output, e_t

def scaled_dot_product_attention(query: Tensor, key: Tensor, value: Tensor):
    temp = query.bmm(key.transpose(1, 2))
    scale = query.size(-1) ** 0.5
    softmax = F.softmax(temp / scale, dim=-1)
    return softmax.bmm(value)

def _forward_internal(self, vector: torch.Tensor, matrix: torch.Tensor) -> torch.Tensor:
    return matrix.bmm(vector.unsqueeze(-1)).squeeze(-1)

def step(self, Ybar_t: torch.Tensor,
         dec_state: Tuple[torch.Tensor, torch.Tensor],
         enc_hiddens: torch.Tensor,
         enc_hiddens_proj: torch.Tensor,
         enc_masks: torch.Tensor) -> Tuple[Tuple, torch.Tensor, torch.Tensor]:
    """Compute one forward step of the LSTM decoder, including the attention computation.

    @param Ybar_t (Tensor): Concatenated Tensor of [Y_t, o_prev], with shape (b, e + h). The input for
        the decoder, where b = batch size, e = embedding size, h = hidden size.
    @param dec_state (tuple(Tensor, Tensor)): Tuple of tensors both with shape (b, h), where b = batch size,
        h = hidden size. First tensor is the decoder's prev hidden state, second is the decoder's prev cell.
    @param enc_hiddens (Tensor): Encoder hidden states Tensor, with shape (b, src_len, h * 2), where
        b = batch size, src_len = maximum source length, h = hidden size.
    @param enc_hiddens_proj (Tensor): Encoder hidden states Tensor, projected from (h * 2) to h, with shape
        (b, src_len, h), where b = batch size, src_len = maximum source length, h = hidden size.
    @param enc_masks (Tensor): Tensor of sentence masks of shape (b, src_len), where b = batch size,
        src_len = maximum source length.

    @returns dec_state (tuple(Tensor, Tensor)): Tuple of tensors both of shape (b, h), where b = batch size,
        h = hidden size. First tensor is the decoder's new hidden state, second is the decoder's new cell.
    @returns combined_output (Tensor): Combined output Tensor at timestep t, of shape (b, h), where
        b = batch size, h = hidden size.
    @returns e_t (Tensor): Tensor of shape (b, src_len). It is the attention scores distribution.
        Note: You will not use this outside of this function. We are simply returning this value
        so that we can sanity check your implementation.
    """
    combined_output = None

    ### COPY OVER YOUR CODE FROM ASSIGNMENT 4
    ### 1. Apply the decoder to `Ybar_t` and `dec_state` to obtain the new dec_state.
    dec_state = self.decoder(Ybar_t, dec_state)
    ### 2. Split dec_state into its two parts (dec_hidden, dec_cell)
    dec_hidden, dec_cell = dec_state
    ### 3. Compute the attention scores e_t, a Tensor of shape (b, src_len).
    ###    Note: b = batch_size, src_len = maximum source length, h = hidden size.
    ###
    ###    Hints:
    ###      - dec_hidden is shape (b, h) and corresponds to h^dec_t in the PDF (batched)
    ###      - enc_hiddens_proj is shape (b, src_len, h) and corresponds to W_{attProj} h^enc (batched).
    ###      - Use batched matrix multiplication (torch.bmm) to compute e_t (be careful about the input/output shapes!)
    ###      - To get the tensors into the right shapes for bmm, you will need to do some squeezing and unsqueezing.
    ###      - When using the squeeze() function make sure to specify the dimension you want to squeeze
    ###        over. Otherwise, you will remove the batch dimension accidentally, if batch_size = 1.
    # (b, src_len, h) @ (b, h, 1) -> (b, src_len, 1) -> (b, src_len)
    e_t = enc_hiddens_proj.bmm(dec_hidden.unsqueeze(2)).squeeze(2)
    ### END YOUR CODE FROM ASSIGNMENT 4

    # Set e_t to -inf where enc_masks has 1, so that the softmax over these padding positions
    # yields an attention weight of 0 (e^-inf = 0).
    # Example: the sentence [il, a, m', entarte, <PAD>] (max source length = 5) has
    # enc_masks [0, 0, 0, 0, 1]. With pre-softmax attention scores e_t such as [3, -1, 0, -2, 5],
    # 5 would be a high attention score for a meaningless padding token, so we neutralize it by
    # applying the mask, giving e_t = [3, -1, 0, -2, -inf].
    if enc_masks is not None:
        e_t.data.masked_fill_(enc_masks.bool(), -float('inf'))

    ### COPY OVER YOUR CODE FROM ASSIGNMENT 4
    ### 1. Apply softmax to e_t to yield alpha_t
    alpha_t = F.softmax(e_t, dim=1)  # (b, src_len)
    ### 2. Use batched matrix multiplication between alpha_t and enc_hiddens to obtain the
    ###    attention output vector, a_t.
    ###      - alpha_t is shape (b, src_len)
    ###      - enc_hiddens is shape (b, src_len, 2h)
    ###      - a_t should be shape (b, 2h)
    ###      - You will need to do some squeezing and unsqueezing.
    ###    Note: b = batch size, src_len = maximum source length, h = hidden size.
    # att_view = (alpha_t.size(0), 1, alpha_t.size(1))
    # a_t = torch.bmm(alpha_t.view(*att_view), enc_hiddens).squeeze(1)
    # (b, 2h, src_len) @ (b, src_len, 1) -> (b, 2h, 1) -> (b, 2h)
    a_t = enc_hiddens.permute(0, 2, 1).bmm(alpha_t.unsqueeze(2)).squeeze(2)
    ### 3. Concatenate dec_hidden with a_t to compute tensor U_t
    U_t = torch.cat([dec_hidden, a_t], dim=1)
    ### 4. Apply the combined output projection layer to U_t to compute tensor V_t
    V_t = self.combined_output_projection(U_t)
    ### 5. Compute tensor O_t by first applying the Tanh function and then the dropout layer.
    O_t = self.dropout(torch.tanh(V_t))
    ### END YOUR CODE FROM ASSIGNMENT 4

    combined_output = O_t
    return dec_state, combined_output, e_t

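# A tiny numerical illustration of the masking logic described above: after filling the padding
# position with -inf, softmax assigns it exactly zero attention weight. Values are the example
# scores from the comment above.
import torch
import torch.nn.functional as F

e_t = torch.tensor([[3.0, -1.0, 0.0, -2.0, 5.0]])
enc_masks = torch.tensor([[0, 0, 0, 0, 1]], dtype=torch.bool)
e_t = e_t.masked_fill(enc_masks, -float('inf'))
alpha_t = F.softmax(e_t, dim=1)
assert alpha_t[0, -1].item() == 0.0
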
def _forward_internal(self, vector: torch.Tensor, matrix: torch.Tensor) -> torch.Tensor:
    transformed_vectors = self.ll(vector)
    return matrix.bmm(transformed_vectors.unsqueeze(-1)).squeeze(-1)

def scaled_dot_product_attention(query: torch.Tensor,
                                 key: torch.Tensor,
                                 value: torch.Tensor) -> torch.Tensor:
    temp = query.bmm(key.transpose(1, 2))
    scaled = temp / (query.size(-1) ** 0.5)
    softmax = torch.nn.functional.softmax(scaled, dim=-1)
    return softmax.bmm(value)