def step(
            self, Ybar_t: torch.Tensor, dec_state: Tuple[torch.Tensor,
                                                         torch.Tensor],
            enc_hiddens: torch.Tensor, enc_hiddens_proj: torch.Tensor,
            enc_masks: torch.Tensor
    ) -> Tuple[Tuple, torch.Tensor, torch.Tensor]:
        """ Compute one forward step of the LSTM decoder, including the attention computation.

        @param Ybar_t (Tensor): Concatenated Tensor of [Y_t o_prev], with shape (b, e + h). The input for the decoder,
                                where b = batch size, e = embedding size, h = hidden size.
        @param dec_state (tuple(Tensor, Tensor)): Tuple of tensors both with shape (b, h), where b = batch size, h = hidden size.
                First tensor is decoder's prev hidden state, second tensor is decoder's prev cell.
        @param enc_hiddens (Tensor): Encoder hidden states Tensor, with shape (b, src_len, h * 2), where b = batch size,
                                    src_len = maximum source length, h = hidden size.
        @param enc_hiddens_proj (Tensor): Encoder hidden states Tensor, projected from (h * 2) to h. Tensor is with shape (b, src_len, h),
                                    where b = batch size, src_len = maximum source length, h = hidden size.
        @param enc_masks (Tensor): Tensor of sentence masks shape (b, src_len),
                                    where b = batch size, src_len is maximum source length. 

        @returns dec_state (tuple (Tensor, Tensor)): Tuple of tensors both shape (b, h), where b = batch size, h = hidden size.
                First tensor is decoder's new hidden state, second tensor is decoder's new cell.
        @returns combined_output (Tensor): Combined output Tensor at timestep t, shape (b, h), where b = batch size, h = hidden size.
        @returns e_t (Tensor): Tensor of shape (b, src_len). It is attention scores distribution.
                                Note: You will not use this outside of this function.
                                      We are simply returning this value so that we can sanity check
                                      your implementation.
        """

        combined_output = None

        # Apply the decoder to `Ybar_t` and `dec_state` to obtain the new dec_state.
        # h_t^{dec}, c_t^{dec} = Decoder(\overline{y_t}, h_{t-1}^{dec}, c_{t-1}^{dec})
        dec_state = self.decoder(Ybar_t, dec_state)
        # Split dec_state into its two parts
        (dec_hidden, dec_cell) = dec_state  # each of shape (b, h)
        # batched matrix multiplication
        # (b, src_len, h) .dot(b, h, 1) -> (b, src_len, 1) -> (b, src_len)
        # unsqueeze - Returns a new tensor with a dimension of size one inserted at the specified position.
        # e_{t, i} = (h_t^{dec})^{\top}W_{attProj}h_i^{enc}
        e_t = enc_hiddens_proj.bmm(dec_hidden.unsqueeze(2)).squeeze(2)

        # Set e_t to -inf where enc_masks has 1
        if enc_masks is not None:
            e_t.data.masked_fill_(enc_masks.bool(), -float('inf'))

        # \alpha_t = Softmax(e_t)
        alpha_t = torch.unsqueeze(F.softmax(e_t, dim=1),
                                  dim=1)  # (b, src_len) -> (b, 1, src_len)
        # (b, 1, src_len) * (b, src_len, 2*h) -> (b, 1, 2*h) -> (b, 2*h)
        # a_t = \sum_i^m\alpha_{t, i}h_i^{enc}
        a_t = torch.squeeze(torch.bmm(alpha_t, enc_hiddens), dim=1)
        # u_t = [a_t;h_t^{dec}]
        U_t = torch.cat((a_t, dec_hidden), dim=1)
        # v_t = W_u u_t
        V_t = self.combined_output_projection(U_t)
        # o_t = Dropout(Tanh(v_t))
        O_t = self.dropout(torch.tanh(V_t))

        combined_output = O_t
        return dec_state, combined_output, e_t
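A minimal standalone sketch (with arbitrary sizes) to check the shape flow of the attention computation in step above; the tensors here are dummy placeholders, not the model's actual states:

import torch
import torch.nn.functional as F

b, src_len, h = 4, 7, 16
dec_hidden = torch.randn(b, h)                  # stands in for h_t^{dec}
enc_hiddens = torch.randn(b, src_len, 2 * h)    # stands in for h_i^{enc}
enc_hiddens_proj = torch.randn(b, src_len, h)   # stands in for W_{attProj} h_i^{enc}

e_t = enc_hiddens_proj.bmm(dec_hidden.unsqueeze(2)).squeeze(2)  # (b, src_len)
alpha_t = F.softmax(e_t, dim=1)                                 # (b, src_len)
a_t = alpha_t.unsqueeze(1).bmm(enc_hiddens).squeeze(1)          # (b, 2h)
assert e_t.shape == (b, src_len) and a_t.shape == (b, 2 * h)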
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        x: shape [bs, nums, ebd], where ebd must equal self._input_dim
        out: shape [bs, nums], attention weights
        """
        if x.shape[-1] != self._input_dim:
            raise ConfigurationError("""The last dim of the input must equal
                the construction parameter 'input_dim'. Please check.""")

        if self._need_mlp:
            bs, nums, ebd = x.shape
            x_combine = x.reshape(bs * nums, ebd)
            x_combine = self._activation(self._mlp(x_combine))
            x = x_combine.reshape(bs, nums, -1)

        bs = x.shape[0]
        w = self._weight.unsqueeze(0).unsqueeze(2)  # shape [1, ebd, 1]
        w = w.expand(bs, -1, -1)  # shape [bs, ebd, 1]

        logits = x.bmm(w).squeeze(-1)  # shape [bs, nums]

        if self._normalize:
            return F.softmax(logits, dim=1)
        else:
            return logits
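A standalone sketch of the scoring above, with a dummy weight vector standing in for self._weight: each of the nums embeddings is scored by a dot product with the weight, and softmax over dim=1 turns the scores into attention weights:

import torch
import torch.nn.functional as F

bs, nums, ebd = 2, 5, 8
x = torch.randn(bs, nums, ebd)
weight = torch.randn(ebd)                                  # stand-in for self._weight

w = weight.unsqueeze(0).unsqueeze(2).expand(bs, -1, -1)    # (bs, ebd, 1)
logits = x.bmm(w).squeeze(-1)                              # (bs, nums)
attn = F.softmax(logits, dim=1)                            # rows sum to 1
assert torch.allclose(attn.sum(dim=1), torch.ones(bs))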
def inverse_depth_to_camera_coords(
        inverse_depth: torch.Tensor,
        intrinsics_inv: torch.Tensor) -> torch.Tensor:
    """Transform coordinates in the pixel frame to the camera frame.
    Args:
        inverse_depth: inverse depth map -- [B,1,H,W]
        intrinsics_inv: inverse camera intrinsics matrix for each element of batch -- [B,3,3]
    Returns:
        array of (X,Y,Z) cam coordinates -- [B,3,H,W]
    """
    b, _, h, w = inverse_depth.shape

    # compose homogeneous tensors
    i_range = torch.arange(h).view(1, h, 1).expand(1, h, w).type_as(
        inverse_depth)  # [1, H, W]
    j_range = torch.arange(w).view(1, 1, w).expand(1, h, w).type_as(
        inverse_depth)  # [1, H, W]
    ones = torch.ones(1, h, w).type_as(inverse_depth)
    pixel_coords = torch.stack((j_range, i_range, ones), dim=1)  # [1, 3, H, W]

    # expand to batch
    pixel_coords = pixel_coords.expand(b, 3, h, w)  # [B, 3, H, W]
    pixel_coords_flat = pixel_coords.view(*pixel_coords.shape[:2], -1)
    camera_coords = intrinsics_inv.bmm(pixel_coords_flat).view_as(pixel_coords)

    # scale by depth
    # assert inverse_depth.min().item() >= 10e-5  # avoid division by zero
    return camera_coords / inverse_depth
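A hypothetical usage check: with identity intrinsics and a constant inverse depth of 0.5, the Z channel of the returned camera coordinates should be 1 / 0.5 = 2 everywhere:

import torch

b, h, w = 2, 4, 5
inverse_depth = torch.full((b, 1, h, w), 0.5)
intrinsics_inv = torch.eye(3).repeat(b, 1, 1)   # [B, 3, 3]

cam = inverse_depth_to_camera_coords(inverse_depth, intrinsics_inv)
assert cam.shape == (b, 3, h, w)
assert torch.allclose(cam[:, 2], torch.full((b, h, w), 2.0))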
    def step(
            self, Ybar_t: torch.Tensor, dec_state: Tuple[torch.Tensor,
                                                         torch.Tensor],
            enc_hiddens: torch.Tensor, enc_hiddens_proj: torch.Tensor,
            enc_masks: torch.Tensor
    ) -> Tuple[Tuple, torch.Tensor, torch.Tensor]:
        """ Compute one forward step of the LSTM decoder, including the attention computation.
        @param Ybar_t (Tensor): Concatenated Tensor of [Y_t o_prev], with shape (b, e + h). The input for the decoder,
                                where b = batch size, e = embedding size, h = hidden size.
        @param dec_state (tuple(Tensor, Tensor)): Tuple of tensors both with shape (b, h), where b = batch size, h = hidden size.
                First tensor is decoder's prev hidden state, second tensor is decoder's prev cell.
        @param enc_hiddens (Tensor): Encoder hidden states Tensor, with shape (b, src_len, h * 2), where b = batch size,
                                    src_len = maximum source length, h = hidden size.
        @param enc_hiddens_proj (Tensor): Encoder hidden states Tensor, projected from (h * 2) to h. Tensor is with shape (b, src_len, h),
                                    where b = batch size, src_len = maximum source length, h = hidden size.
        @param enc_masks (Tensor): Tensor of sentence masks shape (b, src_len),
                                    where b = batch size, src_len is maximum source length.
        @returns dec_state (tuple (Tensor, Tensor)): Tuple of tensors both shape (b, h), where b = batch size, h = hidden size.
                First tensor is decoder's new hidden state, second tensor is decoder's new cell.
        @returns combined_output (Tensor): Combined output Tensor at timestep t, shape (b, h), where b = batch size, h = hidden size.
        @returns e_t (Tensor): Tensor of shape (b, src_len). It is attention scores distribution.
                                Note: You will not use this outside of this function.
                                      We are simply returning this value so that we can sanity check
                                      your implementation.
        """

        combined_output = None

        ### COPY OVER YOUR CODE FROM ASSIGNMENT 4
        # 1. Apply the decoder to Ybar_t and dec_state to obtain the new dec_state.
        dec_state = self.decoder(Ybar_t, dec_state)
        # 2. Split dec_state into its two parts (dec_hidden, dec_cell)
        (dec_hidden, dec_cell) = dec_state
        # 3, (b, src_len, h) .dot(b, h, 1) -> (b, src_len, 1) -> (b, src_len)
        e_t = enc_hiddens_proj.bmm(dec_hidden.unsqueeze(2)).squeeze(2)

        ### END YOUR CODE FROM ASSIGNMENT 4

        # Set e_t to -inf where enc_masks has 1
        if enc_masks is not None:
            # e_t.data.masked_fill_(enc_masks.byte(), -float('inf'))
            e_t.data.masked_fill_(enc_masks.bool(), -float('inf'))

        ### COPY OVER YOUR CODE FROM ASSIGNMENT 4
        # 1, apply softmax to e_t
        alpha_t = F.softmax(e_t, dim=1)  # (b, src_len)
        # 2, (b, 1, src_len) x (b, src_len, 2h) = (b, 1, 2h) -> (b, 2h)
        # a_t = e_t.unsqueeze(1).bmm(enc_hiddens).squeeze(1)
        att_view = (alpha_t.size(0), 1, alpha_t.size(1))
        a_t = torch.bmm(alpha_t.view(*att_view), enc_hiddens).squeeze(1)

        # 3, concatenate a_t (b, 2h) and dec_hidden (b, h) into U_t (b, 3h)
        U_t = torch.cat((a_t, dec_hidden), dim=1)
        # 4, apply the combined output projection to U_t -> V_t, shape (b, h)
        V_t = self.combined_output_projection(U_t)
        O_t = self.dropout(torch.tanh(V_t))

        ### END YOUR CODE FROM ASSIGNMENT 4

        combined_output = O_t
        return dec_state, combined_output, e_t
Example #5
def weighted_sum(matrix: torch.Tensor,
                 attention: torch.Tensor) -> torch.Tensor:
    """
    Takes a matrix of vectors and a set of weights over the rows in the matrix (which we call an
    "attention" vector), and returns a weighted sum of the rows in the matrix.  This is the typical
    computation performed after an attention mechanism.
    Note that while we call this a "matrix" of vectors and an attention "vector", we also handle
    higher-order tensors.  We always sum over the second-to-last dimension of the "matrix", and we
    assume that all dimensions in the "matrix" prior to the last dimension are matched in the
    "vector".  Non-matched dimensions in the "vector" must be `directly after the batch dimension`.
    For example, say I have a "matrix" with dimensions `(batch_size, num_queries, num_words,
    embedding_dim)`.  The attention "vector" then must have at least those dimensions, and could
    have more. Both:
        - `(batch_size, num_queries, num_words)` (distribution over words for each query)
        - `(batch_size, num_documents, num_queries, num_words)` (distribution over words in a
          query for each document)
    are valid input "vectors", producing tensors of shape:
    `(batch_size, num_queries, embedding_dim)` and
    `(batch_size, num_documents, num_queries, embedding_dim)` respectively.
    """
    # We'll special-case a few settings here, where there are efficient (but poorly-named)
    # operations in pytorch that already do the computation we need.
    if attention.dim() == 2 and matrix.dim() == 3:
        return attention.unsqueeze(1).bmm(matrix).squeeze(1)
    if attention.dim() == 3 and matrix.dim() == 3:
        return attention.bmm(matrix)
    if matrix.dim() - 1 < attention.dim():
        expanded_size = list(matrix.size())
        for i in range(attention.dim() - matrix.dim() + 1):
            matrix = matrix.unsqueeze(1)
            expanded_size.insert(i + 1, attention.size(i + 1))
        matrix = matrix.expand(*expanded_size)
    intermediate = attention.unsqueeze(-1).expand_as(matrix) * matrix
    return intermediate.sum(dim=-2)
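A quick check of the simple (2-D attention, 3-D matrix) case above, with dummy tensors: the bmm shortcut should agree with an explicit weighted sum over the rows:

import torch

batch, rows, dim = 3, 4, 6
matrix = torch.randn(batch, rows, dim)
attention = torch.softmax(torch.randn(batch, rows), dim=-1)

result = weighted_sum(matrix, attention)                  # (batch, dim)
manual = (attention.unsqueeze(-1) * matrix).sum(dim=1)
assert torch.allclose(result, manual, atol=1e-6)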
Example #6
    def forward(self, inputA: Tensor, inputB: Tensor) -> Tensor:
        inputA = inputA.view(inputA.shape[0], inputA.shape[1], -1)
        inputB = inputB.view(inputB.shape[0], inputB.shape[1], -1)

        bi_vec = inputA.bmm(inputB.permute(0, 2, 1))
        bi_vec = bi_vec.view(bi_vec.shape[0], -1)
        bi_vec = torch.sign(bi_vec) * torch.sqrt(torch.abs(bi_vec))

        return F.normalize(bi_vec, dim=-1, p=2)
def scaled_dot_product_attention(query: torch.Tensor,
                                 key: torch.Tensor,
                                 value: torch.Tensor,
                                 mask=None) -> torch.Tensor:
    temp = query.bmm(key.transpose(1, 2))
    scaled = temp / (query.size(-1)**0.5)
    if mask is not None:
        scaled = scaled.masked_fill(mask == 0, -1e9)

    softmax = torch.nn.functional.softmax(scaled, dim=-1)
    return softmax.bmm(value)
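A hypothetical usage of the function above with a causal mask (positions where mask == 0 are blocked), so each query position can only attend to itself and earlier key positions:

import torch

b, t, d = 2, 5, 8
q = torch.randn(b, t, d)
k = torch.randn(b, t, d)
v = torch.randn(b, t, d)
causal = torch.tril(torch.ones(t, t)).expand(b, t, t)   # 1 = keep, 0 = block

out = scaled_dot_product_attention(q, k, v, mask=causal)
assert out.shape == (b, t, d)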
Example #8
def transform_points_Rt(points: torch.Tensor,
                        viewpoint: torch.Tensor,
                        inverse: bool = False):
    N, H, W = viewpoint.shape
    assert H == 3 and W == 4, "Rt is B x 3 x 4 "
    t = viewpoint[:, :, 3]
    r = viewpoint[:, :, 0:3]

    # transpose r to handle the fact that points is num_points x 3
    # yT = (RX)T = XT @ RT
    r = r.transpose(1, 2).contiguous()

    # invert if needed
    if inverse:
        points = points - t[:, None, :]
        points = points.bmm(r.inverse())
    else:
        points = points.bmm(r)
        points = points + t[:, None, :]

    return points
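A sanity check with made-up values: applying transform_points_Rt and then its inverse should return the original points, since the inverse branch undoes the rotation and translation applied by the forward branch:

import math
import torch

B, P = 2, 10
c, s = math.cos(0.3), math.sin(0.3)
R = torch.tensor([[c, -s, 0.0],
                  [s,  c, 0.0],
                  [0.0, 0.0, 1.0]])
t = torch.tensor([1.0, -2.0, 0.5])
Rt = torch.cat([R, t.unsqueeze(1)], dim=1).repeat(B, 1, 1)   # [B, 3, 4]

points = torch.randn(B, P, 3)
transformed = transform_points_Rt(points, Rt)
roundtrip = transform_points_Rt(transformed, Rt, inverse=True)
assert torch.allclose(roundtrip, points, atol=1e-5)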
Example #9
def weighted_sum(matrix: torch.Tensor, attention: torch.Tensor) -> torch.Tensor:
    if attention.dim() == 2 and matrix.dim() == 3:
        return attention.unsqueeze(1).bmm(matrix).squeeze(1)
    if attention.dim() == 3 and matrix.dim() == 3:
        return attention.bmm(matrix)
    if matrix.dim() - 1 < attention.dim():
        expanded_size = list(matrix.size())
        for i in range(attention.dim() - matrix.dim() + 1):
            matrix = matrix.unsqueeze(1)
            expanded_size.insert(i + 1, attention.size(i + 1))
        matrix = matrix.expand(*expanded_size)
    intermediate = attention.unsqueeze(-1).expand_as(matrix) * matrix
    return intermediate.sum(dim=-2)
Example #10
 def forward(self, memory: torch.Tensor,
             answer: torch.Tensor) -> torch.Tensor:
     """
     Inputs:
         memory: tensor of shape bsz * hdim
         answer: tensor of shape bsz * hdim
     Outputs:
         prob: tensor of shape bsz
     """
     Ua = self.combination(answer)
     memory = memory.unsqueeze(1)
     Ua = Ua.unsqueeze(1)
     mUa = memory.bmm(Ua.transpose(1, 2))
     return mUa.view(-1)
Example #11
 def _forward_internal(self, matrix1: torch.Tensor, matrix2: torch.Tensor
                       ) -> torch.Tensor:
     """
     Args:
         matrix1 : Tensor of shape (batch_size, seq_len1, hdim1)
         matrix2 : Tensor of shape (batch_size, seq_len2, hdim2)
     Output:
         alpha : Tensor of shape (batch_size, seq_len1, seq_len2)
     """
     # Shape : (batch_size, seq_len_2, hdim1)
     Wy = self._weights(matrix2)
     # Shape : (batch_size, seq_len_1, seq_len_2)
     alpha = matrix1.bmm(Wy.transpose(-2, -1))
     return alpha
Example #12
def dot_product_score(queries: Tensor, keys: Tensor, scaled: bool = False):
    '''
    Input:
    - queries: [B, T, A]
    - keys: [B, S, A]
    Output:
    - score: [B, T, S]
    '''
    # [B,T,A] x [B,A,S] = [B,T,S]
    if scaled:
        attn_dim = queries.size(-1)
        queries = queries / (attn_dim**0.5)
    score = queries.bmm(keys.transpose(1, 2))  # [B,T,S]
    return score
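A hypothetical usage: scores between 3 queries and 4 keys per batch element, with the scaled variant dividing the queries by sqrt(A) before the batched matrix product:

import torch

queries = torch.randn(2, 3, 16)   # [B, T, A]
keys = torch.randn(2, 4, 16)      # [B, S, A]
score = dot_product_score(queries, keys, scaled=True)
assert score.shape == (2, 3, 4)   # [B, T, S]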
Example #13
 def forward(self, p_seq: torch.Tensor, q: torch.Tensor, p_mask: torch.Tensor):
     """
     Input:
         p_seq: batch_size * p_seq_len * p_hidden_dim
         q: batch_size * q_hidden_dim
         p_mask: batch_size * p_seq_len (1 for padding, 0 for true)
     Output:
         attn_scores: batch_size * p_seq_len
     """
     Wq = self.linear(q) if self.linear is not None else q
     pWq = p_seq.bmm(Wq.unsqueeze(2)).squeeze(2)
     pWq.data.masked_fill_(p_mask.data, -float("inf"))
     attn_scores = F.softmax(pWq, dim=-1) if self.normalize else pWq.exp()
     return attn_scores
Example #14
def scaled_dot_product_attention(query: Tensor, key: Tensor,
                                 value: Tensor) -> Tensor:
    '''
    Input:
        query = <X, W_q> => (L, d_k)
        key = <X, W_k> => (L, d_k)
        value = <X, W_v> => (L, d_v)
    Returns:
        Self Attention Matrix (L, d_v)
    '''
    temp = query.bmm(key.transpose(1, 2))
    scale = query.size(-1)**0.5
    softmax = f.softmax(temp / scale, dim=-1)
    return softmax.bmm(value)
def scaled_dot_product_attention(query: Tensor,
                                 key: Tensor,
                                 value: Tensor,
                                 mask: Union[None, Tensor] = None) -> Tensor:

    similarity = query.bmm(key.transpose(1, 2))

    scale = query.size(-1) ** 0.5

    if mask is not None:
        similarity = similarity.masked_fill(mask, float('-inf'))

    softmax = F.softmax(similarity / scale, dim=-1)

    return softmax.bmm(value)
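A hypothetical usage of this variant, where the mask follows the "True = block" convention (the opposite of the mask == 0 convention in the earlier example), e.g. to hide a padded key position for every query:

import torch

b, t, s, d = 2, 3, 5, 8
query = torch.randn(b, t, d)
key = torch.randn(b, s, d)
value = torch.randn(b, s, d)
pad_mask = torch.zeros(b, 1, s, dtype=torch.bool)
pad_mask[:, :, -1] = True                 # last key position is padding

out = scaled_dot_product_attention(query, key, value, mask=pad_mask)
assert out.shape == (b, t, d)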
Example #16
def attention(query: torch.Tensor,
              key: torch.Tensor,
              value: torch.Tensor,
              enc_masks: torch.Tensor = None) -> Tuple[torch.Tensor, torch.Tensor]:

    query_unsqueezed = query.unsqueeze(dim=2)
    score = key.bmm(query_unsqueezed)
    score = score.squeeze(dim=2)

    if enc_masks is not None:
        score.data.masked_fill_(enc_masks.bool(), -float('inf'))

    attention_weights = softmax(score, dim=1)
    attention_weights = attention_weights.unsqueeze(dim=1)

    context_vector = attention_weights.bmm(value)
    context_vector = context_vector.squeeze(dim=1)

    return attention_weights, context_vector
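A hypothetical usage mirroring the NMT setting above: the keys are the projected encoder states, the values are the raw (bidirectional) encoder states, and an all-zero mask means no padding positions:

import torch

b, src_len, h = 4, 6, 8
query = torch.randn(b, h)
key = torch.randn(b, src_len, h)
value = torch.randn(b, src_len, 2 * h)
masks = torch.zeros(b, src_len)

weights, context = attention(query, key, value, enc_masks=masks)
assert weights.shape == (b, 1, src_len) and context.shape == (b, 2 * h)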
Example #17
    def step(self, Ybar_t: torch.Tensor,
            dec_state: Tuple[torch.Tensor, torch.Tensor],
            enc_hiddens: torch.Tensor,
            enc_hiddens_proj: torch.Tensor,
            enc_masks: torch.Tensor) -> Tuple[Tuple, torch.Tensor, torch.Tensor]:
        """ Compute one forward step of the LSTM decoder, including the attention computation.

        @param Ybar_t (Tensor): Concatenated Tensor of [Y_t o_prev], with shape (b, e + h). The input for the decoder,
                                where b = batch size, e = embedding size, h = hidden size.
        @param dec_state (tuple(Tensor, Tensor)): Tuple of tensors both with shape (b, h), where b = batch size, h = hidden size.
                First tensor is decoder's prev hidden state, second tensor is decoder's prev cell.
        @param enc_hiddens (Tensor): Encoder hidden states Tensor, with shape (b, src_len, h * 2), where b = batch size,
                                    src_len = maximum source length, h = hidden size.
        @param enc_hiddens_proj (Tensor): Encoder hidden states Tensor, projected from (h * 2) to h. Tensor is with shape (b, src_len, h),
                                    where b = batch size, src_len = maximum source length, h = hidden size.
        @param enc_masks (Tensor): Tensor of sentence masks shape (b, src_len),
                                    where b = batch size, src_len is maximum source length. 

        @returns dec_state (tuple (Tensor, Tensor)): Tuple of tensors both shape (b, h), where b = batch size, h = hidden size.
                First tensor is decoder's new hidden state, second tensor is decoder's new cell.
        @returns combined_output (Tensor): Combined output Tensor at timestep t, shape (b, h), where b = batch size, h = hidden size.
        @returns e_t (Tensor): Tensor of shape (b, src_len). It is attention scores distribution.
                                Note: You will not use this outside of this function.
                                      We are simply returning this value so that we can sanity check
                                      your implementation.
        """

        combined_output = None

        ### YOUR CODE HERE (~3 Lines)
        ### TODO:
        ###     1. Apply the decoder to `Ybar_t` and `dec_state`to obtain the new dec_state.
        dec_state = self.decoder(Ybar_t, dec_state)
        ###     2. Split dec_state into its two parts (dec_hidden, dec_cell)
        dec_hidden = dec_state[0]  # shape (b, h)
        dec_cell = dec_state[1]    # shape (b, h)

        ###     3. Compute the attention scores e_t, a Tensor shape (b, src_len).
        ###        Note: b = batch_size, src_len = maximum source length, h = hidden size.
        # (b, src_len, h) bmm (b, h, 1) -> (b, src_len, 1) -> (b, src_len)
        e_t = enc_hiddens_proj.bmm(dec_hidden.unsqueeze(2)).squeeze(2)
Example #18
def weighted_sum(matrix: torch.Tensor, attention: torch.Tensor) -> torch.Tensor:
    """
    Takes a matrix of vectors and a set of weights over the rows in the matrix (which we call an
    "attention" vector), and returns a weighted sum of the rows in the matrix.  This is the typical
    computation performed after an attention mechanism.

    Note that while we call this a "matrix" of vectors and an attention "vector", we also handle
    higher-order tensors.  We always sum over the second-to-last dimension of the "matrix", and we
    assume that all dimensions in the "matrix" prior to the last dimension are matched in the
    "vector".  Non-matched dimensions in the "vector" must be `directly after the batch dimension`.

    For example, say I have a "matrix" with dimensions ``(batch_size, num_queries, num_words,
    embedding_dim)``.  The attention "vector" then must have at least those dimensions, and could
    have more. Both:

        - ``(batch_size, num_queries, num_words)`` (distribution over words for each query)
        - ``(batch_size, num_documents, num_queries, num_words)`` (distribution over words in a
          query for each document)

    are valid input "vectors", producing tensors of shape:
    ``(batch_size, num_queries, embedding_dim)`` and
    ``(batch_size, num_documents, num_queries, embedding_dim)`` respectively.
    """
    # We'll special-case a few settings here, where there are efficient (but poorly-named)
    # operations in pytorch that already do the computation we need.
    if attention.dim() == 2 and matrix.dim() == 3:
        return attention.unsqueeze(1).bmm(matrix).squeeze(1)
    if attention.dim() == 3 and matrix.dim() == 3:
        return attention.bmm(matrix)
    if matrix.dim() - 1 < attention.dim():
        expanded_size = list(matrix.size())
        for i in range(attention.dim() - matrix.dim() + 1):
            matrix = matrix.unsqueeze(1)
            expanded_size.insert(i + 1, attention.size(i + 1))
        matrix = matrix.expand(*expanded_size)
    intermediate = attention.unsqueeze(-1).expand_as(matrix) * matrix
    return intermediate.sum(dim=-2)
Example #19
def matrix_cosine_similarity(x: torch.Tensor,
                             y: torch.Tensor,
                             eps: float = 1e-8):
    """
    :param x (batch_size, length_1, dim)
    :param y (batch_size, length_2, dim)
    :return 
        (batch_size, length_1, length_2)
    """
    length_1, length_2 = x.size(1), y.size(1)
    # shape: (batch_size, length_1, length_2)
    dot_product = x.bmm(y.permute(0, 2, 1))
    # shape: (batch_size, length_1), (batch_size, length_2)
    x_norm, y_norm = x.norm(dim=-1, p=None), y.norm(dim=-1, p=None)
    # added eps for numerical stability
    x_norm = torch.max(x_norm, eps * x_norm.new_ones(x_norm.size()))
    y_norm = torch.max(y_norm, eps * y_norm.new_ones(y_norm.size()))

    expanded_x_norm = x_norm.unsqueeze(-1).repeat(1, 1, length_2)
    expanded_y_norm = y_norm.unsqueeze(1).repeat(1, length_1, 1)
    # shape: (batch_size, length_1, length_2)
    norm = expanded_x_norm * expanded_y_norm
    similarity = dot_product / norm
    return similarity
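A cross-check with dummy sizes: entry [b, i, j] of the result should match the cosine similarity between x[b, i] and y[b, j] computed by F.cosine_similarity:

import torch
import torch.nn.functional as F

x = torch.randn(2, 3, 8)
y = torch.randn(2, 5, 8)
sim = matrix_cosine_similarity(x, y)                        # (2, 3, 5)
ref = F.cosine_similarity(x.unsqueeze(2).expand(-1, -1, 5, -1),
                          y.unsqueeze(1).expand(-1, 3, -1, -1),
                          dim=-1)
assert torch.allclose(sim, ref, atol=1e-5)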
 def forward(self, matrix_1: torch.Tensor,
             matrix_2: torch.Tensor) -> torch.Tensor:
     intermediate = matrix_1.bmm(self._weight_matrix.unsqueeze(0))
     return self._activation(
         intermediate.bmm(matrix_2.transpose(1, 2)) + self._bias)
Example #21
    def step(
            self, Ybar_t: torch.Tensor, dec_state: Tuple[torch.Tensor,
                                                         torch.Tensor],
            enc_hiddens: torch.Tensor, enc_hiddens_proj: torch.Tensor,
            enc_masks: torch.Tensor
    ) -> Tuple[Tuple, torch.Tensor, torch.Tensor]:
        """ Compute one forward step of the LSTM decoder, including the attention computation.

        @param Ybar_t (Tensor): Concatenated Tensor of [Y_t o_prev], with shape (b, e + h). The input for the decoder,
                                where b = batch size, e = embedding size, h = hidden size.
        @param dec_state (tuple(Tensor, Tensor)): Tuple of tensors both with shape (b, h), where b = batch size, h = hidden size.
                First tensor is decoder's prev hidden state, second tensor is decoder's prev cell.
        @param enc_hiddens (Tensor): Encoder hidden states Tensor, with shape (b, src_len, h * 2), where b = batch size,
                                    src_len = maximum source length, h = hidden size.
        @param enc_hiddens_proj (Tensor): Encoder hidden states Tensor, projected from (h * 2) to h. Tensor is with shape (b, src_len, h),
                                    where b = batch size, src_len = maximum source length, h = hidden size.
        @param enc_masks (Tensor): Tensor of sentence masks shape (b, src_len),
                                    where b = batch size, src_len is maximum source length. 

        @returns dec_state (tuple (Tensor, Tensor)): Tuple of tensors both shape (b, h), where b = batch size, h = hidden size.
                First tensor is decoder's new hidden state, second tensor is decoder's new cell.
        @returns combined_output (Tensor): Combined output Tensor at timestep t, shape (b, h), where b = batch size, h = hidden size.
        @returns e_t (Tensor): Tensor of shape (b, src_len). It is attention scores distribution.
                                Note: You will not use this outside of this function.
                                      We are simply returning this value so that we can sanity check
                                      your implementation.
        """

        combined_output = None

        ###Apply the decoder to `Ybar_t` and `dec_state`to obtain the new dec_state.
        dec_state = self.decoder(Ybar_t, dec_state)  # Ybar_t:(b, e + h)

        ###Split dec_state into its two parts (dec_hidden, dec_cell)
        (dec_hidden, dec_cell) = dec_state  #dec_hidden:(b,h) dec_cell:(b,h)

        ###Compute the attention scores e_t, a Tensor shape (b, src_len).
        #enc_hiddens_proj:(b,src_len,h) dec_hidden.unsqueeze(2):(b,h,1) ->(b,src_len,1)->e_t:(b,src_len)
        e_t = enc_hiddens_proj.bmm(dec_hidden.unsqueeze(2)).squeeze(2)

        # Set e_t to -inf where enc_masks has 1
        if enc_masks is not None:
            e_t.data.masked_fill_(enc_masks.bool(), -float('inf'))

        ###Apply softmax to e_t to yield alpha_t
        alpha_t = F.softmax(e_t, dim=1)  # e_t:(b,src_len) -> alpha_t:(b,src_len)
        att_view = (alpha_t.size(0), 1, alpha_t.size(1))

        ###Use batched matrix multiplication between alpha_t and enc_hiddens to obtain the attention output vector, a_t.
        a_t = torch.bmm(alpha_t.view(*att_view),
                        enc_hiddens).squeeze(1)  #a_t:(b, 2h)

        ###Concatenate dec_hidden with a_t to compute tensor U_t
        U_t = torch.cat((a_t, dec_hidden), dim=1)  #U_t:(b,3h)

        ###Apply the combined output projection layer to U_t to compute tensor V_t
        V_t = self.combined_output_projection(U_t)  #V_t:(b,h)

        ###Compute tensor O_t by first applying the Tanh function and then the dropout layer.
        O_t = self.dropout(torch.tanh(V_t))  # #O_t:(b,h)

        combined_output = O_t
        return dec_state, combined_output, e_t
Example #22
    def forward(self, input: Tensor) -> Tensor:
        input = input.view(input.shape[0], input.shape[1], -1)
        interaction = input.bmm(input.permute(0, 2, 1))
        log_interactions = [mat_log_sym(matrix) for matrix in torch.unbind(interaction, dim=0)]

        return torch.stack(log_interactions, dim = 0)
Example #23
    def step(
            self, Ybar_t: torch.Tensor, dec_state: Tuple[torch.Tensor,
                                                         torch.Tensor],
            enc_hiddens: torch.Tensor, enc_hiddens_proj: torch.Tensor,
            enc_masks: torch.Tensor
    ) -> Tuple[Tuple, torch.Tensor, torch.Tensor]:
        """ Compute one forward step of the LSTM decoder, including the
        attention computation.

        @param Ybar_t (Tensor): Concatenated Tensor of [Y_t o_prev], with shape
                                (b, e + h). The input for the decoder,
                                where b = batch size, e = embedding size,
                                h = hidden size.
        @param dec_state (tuple(Tensor, Tensor)): Tuple of tensors both with
                            shape (b, h), where b = batch size, h = hidden size.
                            First tensor is decoder's prev hidden state, second
                            tensor is decoder's prev cell.
        @param enc_hiddens (Tensor): Encoder hidden states Tensor, with shape
                                     (b, src_len, h * 2), where b = batch size,
                                     src_len = maximum source length,
                                     h = hidden size.
        @param enc_hiddens_proj (Tensor): Encoder hidden states Tensor,
                                          projected from (h * 2) to h. Tensor is
                                          with shape (b, src_len, h), where
                                          b = batch size,
                                          src_len = maximum source length,
                                          h = hidden size.
        @param enc_masks (Tensor): Tensor of sentence masks shape (b, src_len),
                                   where b = batch size, src_len is maximum
                                   source length.

        @returns dec_state (tuple (Tensor, Tensor)): Tuple of tensors both
                                   shape (b, h), where b = batch size,
                                   h = hidden size. First tensor is decoder's
                                   new hidden state, second tensor is decoder's
                                   new cell.
        @returns combined_output (Tensor): Combined output Tensor at timestep t,
                                           shape (b, h), where b = batch size,
                                           h = hidden size.
        @returns e_t (Tensor): Tensor of shape (b, src_len). It is attention
                               scores distribution.
                               Note: You will not use this outside of this
                                     function. We are simply returning this value
                                     so that we can sanity check your
                                     implementation.
        """

        combined_output = None

        # COPY OVER YOUR CODE FROM ASSIGNMENT 4

        # 1. Apply decoder to Ybar_t and previous hidden and cell decoder states.
        dec_prev_h, dec_prev_c = dec_state
        dec_state = self.decoder(Ybar_t, (dec_prev_h, dec_prev_c))

        # 2. Split dec_state into two parts.
        dec_hidden, dec_cell = dec_state  # dec_hidden (b, h), dec_cell (b, h)

        # 3. Compute the multiplicative attention scores.
        dec_hidden_unsqueezed = dec_hidden.unsqueeze(dim=2)  # (b, h, 1)
        # enc_hiddens_proj (b, src_len, h)
        e_t = enc_hiddens_proj.bmm(dec_hidden_unsqueezed)  # (b, src_len, 1)
        e_t = e_t.squeeze(dim=2)  # (b, src_len)

        # END YOUR CODE FROM ASSIGNMENT 4

        # Set e_t to -inf where enc_masks has 1
        if enc_masks is not None:
            e_t.data.masked_fill_(enc_masks.bool(), -float('inf'))

        # COPY OVER YOUR CODE FROM ASSIGNMENT 4

        # 1. Apply softmax to get attention distribution.
        alpha_t = torch.nn.functional.softmax(e_t, dim=1)  # (b, src_len)
        alpha_t = alpha_t.unsqueeze(dim=1)  # (b, 1, src_len)

        # 2. Use bmm to obtain the attention output vector.
        # enc_hiddens (b, src_len, 2h)
        a_t = alpha_t.bmm(enc_hiddens)  # (b, 1, 2h)
        a_t = a_t.squeeze(dim=1)  # (b, 2h)

        # 3. Concatenate dec_hidden with a_t to compute U_t
        U_t = torch.cat([a_t, dec_hidden], dim=1)  # (b, 3h)

        # 4. Apply the combined output projection layer to U_t to compute V_t
        V_t = self.combined_output_projection(U_t)  # (b, h)

        # 5. Compute O_t by applying tanh and then dropout
        O_t = self.dropout(V_t.tanh())  # (b, h)

        # END YOUR CODE FROM ASSIGNMENT 4

        combined_output = O_t
        return dec_state, combined_output, e_t
 def forward(self, matrix_1: torch.Tensor, matrix_2: torch.Tensor) -> torch.Tensor:
     return matrix_1.bmm(matrix_2.transpose(2, 1))
 def _forward_internal(self, vector: torch.Tensor,
                       matrix: torch.Tensor) -> torch.Tensor:
     sim_score = matrix.bmm(vector.unsqueeze(-1)).squeeze(-1)
     sim_score_scaled = sim_score / math.sqrt(vector.size(-1))
     return sim_score_scaled
Example #26
    def step(self, Ybar_t: torch.Tensor,
            dec_state: Tuple[torch.Tensor, torch.Tensor],
            enc_hiddens: torch.Tensor,
            enc_hiddens_proj: torch.Tensor,
            enc_masks: torch.Tensor) -> Tuple[Tuple, torch.Tensor, torch.Tensor]:
        """ Compute one forward step of the LSTM decoder, including the attention computation.

        @param Ybar_t (Tensor): Concatenated Tensor of [Y_t o_prev], with shape (b, e + h). The input for the decoder,
                                where b = batch size, e = embedding size, h = hidden size.
        @param dec_state (tuple(Tensor, Tensor)): Tuple of tensors both with shape (b, h), where b = batch size, h = hidden size.
                First tensor is decoder's prev hidden state, second tensor is decoder's prev cell.
        @param enc_hiddens (Tensor): Encoder hidden states Tensor, with shape (b, src_len, h * 2), where b = batch size,
                                    src_len = maximum source length, h = hidden size.
        @param enc_hiddens_proj (Tensor): Encoder hidden states Tensor, projected from (h * 2) to h. Tensor is with shape (b, src_len, h),
                                    where b = batch size, src_len = maximum source length, h = hidden size.
        @param enc_masks (Tensor): Tensor of sentence masks shape (b, src_len),
                                    where b = batch size, src_len is maximum source length. 

        @returns dec_state (tuple (Tensor, Tensor)): Tuple of tensors both shape (b, h), where b = batch size, h = hidden size.
                First tensor is decoder's new hidden state, second tensor is decoder's new cell.
        @returns combined_output (Tensor): Combined output Tensor at timestep t, shape (b, h), where b = batch size, h = hidden size.
        @returns e_t (Tensor): Tensor of shape (b, src_len). It is attention scores distribution.
                                Note: You will not use this outside of this function.
                                      We are simply returning this value so that we can sanity check
                                      your implementation.
        """

        combined_output = None

        ### YOUR CODE HERE (~3 Lines)
        ### TODO:
        ###     1. Apply the decoder to `Ybar_t` and `dec_state`to obtain the new dec_state.
        ###     2. Split dec_state into its two parts (dec_hidden, dec_cell)
        ###     3. Compute the attention scores e_t, a Tensor shape (b, src_len). 
        ###        Note: b = batch_size, src_len = maximum source length, h = hidden size.
        ###
        ###       Hints:
        ###         - dec_hidden is shape (b, h) and corresponds to h^dec_t in the PDF (batched)
        ###         - enc_hiddens_proj is shape (b, src_len, h) and corresponds to W_{attProj} h^enc (batched).
        ###         - Use batched matrix multiplication (torch.bmm) to compute e_t.
        ###         - To get the tensors into the right shapes for bmm, you will need to do some squeezing and unsqueezing.
        ###         - When using the squeeze() function make sure to specify the dimension you want to squeeze
        ###             over. Otherwise, you will remove the batch dimension accidentally, if batch_size = 1.
        ###
        ### Use the following docs to implement this functionality:
        ###     Batch Multiplication:
        ###        https://pytorch.org/docs/stable/torch.html#torch.bmm
        ###     Tensor Unsqueeze:
        ###         https://pytorch.org/docs/stable/torch.html#torch.unsqueeze
        ###     Tensor Squeeze:
        ###         https://pytorch.org/docs/stable/torch.html#torch.squeeze

        dec_state = self.decoder(Ybar_t, dec_state)
        (dec_hidden, dec_cell) = dec_state
        e_t = enc_hiddens_proj.bmm(dec_hidden.unsqueeze(2)).squeeze(2)
        ### END YOUR CODE

        # Set e_t to -inf where enc_masks has 1
        if enc_masks is not None:
            e_t.data.masked_fill_(enc_masks.bool(), -float('inf'))

        ### YOUR CODE HERE (~6 Lines)
        ### TODO:
        ###     1. Apply softmax to e_t to yield alpha_t
        ###     2. Use batched matrix multiplication between alpha_t and enc_hiddens to obtain the
        ###         attention output vector, a_t.
        ###     Hints:
        ###           - alpha_t is shape (b, src_len)
        ###           - enc_hiddens is shape (b, src_len, 2h)
        ###           - a_t should be shape (b, 2h)
        ###           - You will need to do some squeezing and unsqueezing.
        ###     Note: b = batch size, src_len = maximum source length, h = hidden size.
        ###
        ###     3. Concatenate dec_hidden with a_t to compute tensor U_t
        ###     4. Apply the combined output projection layer to U_t to compute tensor V_t
        ###     5. Compute tensor O_t by first applying the Tanh function and then the dropout layer.
        ###
        ### Use the following docs to implement this functionality:
        ###     Softmax:
        ###         https://pytorch.org/docs/stable/nn.html#torch.nn.functional.softmax
        ###     Batch Multiplication:
        ###        https://pytorch.org/docs/stable/torch.html#torch.bmm
        ###     Tensor View:
        ###         https://pytorch.org/docs/stable/tensors.html#torch.Tensor.view
        ###     Tensor Concatenation:
        ###         https://pytorch.org/docs/stable/torch.html#torch.cat
        ###     Tanh:
        ###         https://pytorch.org/docs/stable/torch.html#torch.tanh

        alpha_t = F.softmax(e_t, dim=1)
        alpha_t = alpha_t.unsqueeze(2)
        a_t = enc_hiddens.permute(0, 2, 1).bmm(alpha_t).squeeze(2)
        U_t = torch.cat((a_t, dec_hidden), dim=1)
        V_t = self.combined_output_projection(U_t)
        O_t = self.dropout(torch.tanh(V_t))
        ### END YOUR CODE

        combined_output = O_t
        return dec_state, combined_output, e_t
def scaled_dot_product_attention(query: Tensor, key: Tensor, value: Tensor):
    temp = query.bmm(key.transpose(1, 2))
    scale = query.size(-1)**0.5
    softmax = F.softmax(temp / scale, dim=-1)
    return softmax.bmm(value)
 def _forward_internal(self, vector: torch.Tensor, matrix: torch.Tensor) -> torch.Tensor:
     return matrix.bmm(vector.unsqueeze(-1)).squeeze(-1)
Example #29
    def step(
            self, Ybar_t: torch.Tensor, dec_state: Tuple[torch.Tensor,
                                                         torch.Tensor],
            enc_hiddens: torch.Tensor, enc_hiddens_proj: torch.Tensor,
            enc_masks: torch.Tensor
    ) -> Tuple[Tuple, torch.Tensor, torch.Tensor]:
        """ Compute one forward step of the LSTM decoder, including the attention computation.
        @param Ybar_t (Tensor): Concatenated Tensor of [Y_t o_prev], with shape (b, e + h). The input for the decoder,
                                where b = batch size, e = embedding size, h = hidden size.
        @param dec_state (tuple(Tensor, Tensor)): Tuple of tensors both with shape (b, h), where b = batch size, h = hidden size.
                First tensor is decoder's prev hidden state, second tensor is decoder's prev cell.
        @param enc_hiddens (Tensor): Encoder hidden states Tensor, with shape (b, src_len, h * 2), where b = batch size,
                                    src_len = maximum source length, h = hidden size.
        @param enc_hiddens_proj (Tensor): Encoder hidden states Tensor, projected from (h * 2) to h. Tensor is with shape (b, src_len, h),
                                    where b = batch size, src_len = maximum source length, h = hidden size.
        @param enc_masks (Tensor): Tensor of sentence masks shape (b, src_len),
                                    where b = batch size, src_len is maximum source length.
        @returns dec_state (tuple (Tensor, Tensor)): Tuple of tensors both shape (b, h), where b = batch size, h = hidden size.
                First tensor is decoder's new hidden state, second tensor is decoder's new cell.
        @returns combined_output (Tensor): Combined output Tensor at timestep t, shape (b, h), where b = batch size, h = hidden size.
        @returns e_t (Tensor): Tensor of shape (b, src_len). It is attention scores distribution.
                                Note: You will not use this outside of this function.
                                      We are simply returning this value so that we can sanity check
                                      your implementation.
        """

        combined_output = None

        ### COPY OVER YOUR CODE FROM ASSIGNMENT 4
        ###     1. Apply the decoder to `Ybar_t` and `dec_state`to obtain the new dec_state.
        dec_state = self.decoder(Ybar_t, dec_state)

        ###     2. Split dec_state into its two parts (dec_hidden, dec_cell)
        dec_hidden, dec_cell = dec_state
        ###     3. Compute the attention scores e_t, a Tensor shape (b, src_len).
        ###        Note: b = batch_size, src_len = maximum source length, h = hidden size.

        ###
        ###       Hints:
        ###         - dec_hidden is shape (b, h) and corresponds to h^dec_t in the PDF (batched)
        ###         - enc_hiddens_proj is shape (b, src_len, h) and corresponds to W_{attProj} h^enc (batched).
        ###         - Use batched matrix multiplication (torch.bmm) to compute e_t (be careful about the input/ output shapes!)
        ###         - To get the tensors into the right shapes for bmm, you will need to do some squeezing and unsqueezing.
        ###         - When using the squeeze() function make sure to specify the dimension you want to squeeze
        ###             over. Otherwise, you will remove the batch dimension accidentally, if batch_size = 1.
        ###
        # (b, src_len, h) @ (b, h, 1) -> (b, src_len, 1) -> (b, src_len)
        e_t = enc_hiddens_proj.bmm(dec_hidden.unsqueeze(2)).squeeze(2)
        ### END YOUR CODE FROM ASSIGNMENT 4

        # Set e_t to -inf where enc_masks has 1,
        # so that after the softmax the attention weight on padding positions is 0 (e^-inf = 0).
        # Example: the sentence [il, a, m', entarte, <PAD>] (max source length = 5) has enc_masks [0, 0, 0, 0, 1].
        # If the pre-softmax attention scores e_t were [3, -1, 0, -2, 5],
        # 5 would be an unreasonably high score for a meaningless padding token,
        # so we neutralize it by applying the mask, giving e_t = [3, -1, 0, -2, -inf].
        if enc_masks is not None:
            e_t.data.masked_fill_(enc_masks.bool(), -float('inf'))

        ### COPY OVER YOUR CODE FROM ASSIGNMENT 4
        ###     1. Apply softmax to e_t to yield alpha_t
        alpha_t = F.softmax(e_t, dim=1)  # (bs, src_len)
        ###     2. Use batched matrix multiplication between alpha_t and enc_hiddens to obtain the
        ###         attention output vector, a_t.
        ###           - alpha_t is shape (b, src_len)
        ###           - enc_hiddens is shape (b, src_len, 2h)
        ###           - a_t should be shape (b, 2h)
        ###           - You will need to do some squeezing and unsqueezing.
        ###     Note: b = batch size, src_len = maximum source length, h = hidden size.
        ###

        #         att_view = (alpha_t.size(0), 1, alpha_t.size(1))
        #         a_t = torch.bmm(alpha_t.view(*att_view), enc_hiddens).squeeze(1)

        # (b,2h,src_len) @ (b,src_len,1)
        a_t = enc_hiddens.permute(0, 2, 1).bmm(alpha_t.unsqueeze(2)).squeeze(2)

        ###     3. Concatenate dec_hidden with a_t to compute tensor U_t
        U_t = torch.cat([dec_hidden, a_t], dim=1)
        ###     4. Apply the combined output projection layer to U_t to compute tensor V_t
        V_t = self.combined_output_projection(U_t)
        ###     5. Compute tensor O_t by first applying the Tanh function and then the dropout layer.
        O_t = self.dropout(torch.tanh(V_t))
        ### END YOUR CODE FROM ASSIGNMENT 4

        combined_output = O_t
        return dec_state, combined_output, e_t
Example #30
    def _forward_internal(self, vector: torch.Tensor,
                          matrix: torch.Tensor) -> torch.Tensor:
        transformed_vectors = self.ll(vector)

        return matrix.bmm(transformed_vectors.unsqueeze(-1)).squeeze(-1)
Example #31
def scaled_dot_product_attention(query: torch.Tensor, key: torch.Tensor, value: torch.Tensor) -> torch.Tensor:
  temp = query.bmm(key.transpose(1, 2))
  scaled = temp / (query.size(-1) ** 0.5)

  softmax = torch.nn.functional.softmax(scaled, dim=-1)
  return softmax.bmm(value)
 def _forward_internal(self, vector: torch.Tensor,
                       matrix: torch.Tensor) -> torch.Tensor:
     return matrix.bmm(vector.unsqueeze(-1)).squeeze(-1)
Example #33
 def forward(self, matrix_1: torch.Tensor,
             matrix_2: torch.Tensor) -> torch.Tensor:
     return matrix_1.bmm(matrix_2.transpose(2, 1))