Example #1
    def get_future_mask(self, batch_size, sequence_length):
        """Mask future targets and padding

            :param batch_size: a Tensor dimension
            :param sequence_length: a Tensor dimension

            :return mask Tensor with shape [batch_size, sequence_length, sequence_length]
        """

        # column indices (xind) and row indices (yind) over the sequence
        xind = th.arange(sequence_length)[None, :].repeat(sequence_length, 1)
        yind = th.arange(sequence_length)[:, None].repeat(1, sequence_length)
        # lower-triangular mask: position i may attend to position j only if j <= i
        mask = yind >= xind
        mask = mask[None, ...].repeat(batch_size, 1, 1)

        return mask.to(get_device())
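
These snippets rely on `import torch as th`, `import torch.nn.functional as F`, and a project-specific `get_device()` helper (not shown). As a minimal standalone sketch (not the class method above, just the same construction without the device transfer), the mask is lower triangular, so position i may only attend to positions j <= i:

import torch as th

def future_mask(batch_size, sequence_length):
    # same index comparison as get_future_mask above
    xind = th.arange(sequence_length)[None, :].repeat(sequence_length, 1)
    yind = th.arange(sequence_length)[:, None].repeat(1, sequence_length)
    return (yind >= xind)[None, ...].repeat(batch_size, 1, 1)

print(future_mask(1, 4)[0].int())
# tensor([[1, 0, 0, 0],
#         [1, 1, 0, 0],
#         [1, 1, 1, 0],
#         [1, 1, 1, 1]], dtype=torch.int32)
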
Example #2
    def forward(self, inputs, start=1):
        """
            Args:
                inputs: a float32 Tensor with shape [batch_size, sequence_length, hidden_size]
                start: the position index assigned to the first timestep (default 1)

            Returns:
                embedding: a float32 Tensor with shape [batch_size, sequence_length, hidden_size]
        """
        ####################################  YOUR CODE HERE  ####################################
        # PART 3: Implement the Position Embedding.
        # As stated in section 3.5 of the paper, attention does not naturally embed position information
        # To incorporate that, the authors use a variable frequency sin embedding.
        # Note that we use zero-indexing here while the authors use one-indexing
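        # For reference, section 3.5 of https://arxiv.org/pdf/1706.03762.pdf defines
        #     PE(pos, 2i)   = sin(pos / 10000^(2i / hidden_size))
        #     PE(pos, 2i+1) = cos(pos / 10000^(2i / hidden_size))
        # so `self.divisor` (instantiated in __init__, not shown here) presumably holds the
        # per-pair denominators 10000^(2i / hidden_size) for i in [0, hidden_size / 2).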

        assert inputs.shape[-1] == self.hidden_size, \
            'Input final dim must match model hidden size'

        batch_size = inputs.shape[0]
        sequence_length = inputs.shape[1]

        # obtain the position indices: a sequence starting at `start` with `sequence_length` entries
        seq_pos = th.arange(start, sequence_length + start, dtype=th.float32)
        seq_pos_expanded = seq_pos[None, :, None]
        index = seq_pos_expanded.repeat(1, 1, self.hidden_size // 2)

        # create the position embedding as described in the paper
        # use the `divisor` attribute instantiated in __init__
        sin_embedding = th.sin(index / self.divisor)
        cos_embedding = th.cos(index / self.divisor)

        # interleave the sin and cos. For more info see:
        # https://discuss.pytorch.org/t/how-to-interleave-two-tensors-along-certain-dimension/11332/3
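        # e.g. stacking sin = [s0, s1] and cos = [c0, c1] along a new trailing dim gives
        # [[s0, c0], [s1, c1]], and flattening that yields the interleaved [s0, c0, s1, c1]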
        position_shape = (1, sequence_length, self.hidden_size)
        position_embedding = th.stack(
            (sin_embedding, cos_embedding), dim=3).view(position_shape)

        position_embedding = position_embedding.to(get_device())
        return inputs + position_embedding  # add the embedding to the input

    def forward(self, queries, keys, values, mask=None):
        """Fast scaled dot product attention.

            :param queries: Tensor with shape [batch_size, heads (optional), n_queries, depth_k]
            :param keys:    Tensor with shape [batch_size, heads (optional), n_keyval, depth_k]
            :param values:  Tensor with shape [batch_size, heads (optional), n_keyval, depth_v]
            :param mask:    Tensor with shape [batch_size, n_queries, n_queries]

            :return: output: Tensor with shape [batch_size, heads (optional), n_queries, depth_v]
        """
        ####################################  YOUR CODE HERE  ####################################
        # n_queries corresponds to the sequence length on the query side
        # n_keyval corresponds to the sequence length on the key side (and value, as they are one and the same)
        # depth_k is the size of the projection that the key / query comparison is performed on.
        # depth_v is the size of the value projection. In a setting with one head, it is usually the dimension (dim) of the Transformer.
        # heads corresponds to the number of heads that attention is performed over.
        # If you are unfamiliar with attention heads, read section 3.2.2 of the Attention is all you need paper

        # PART 1: Implement Attention QKV
        # Use queries, keys and values to compute the output of the QKV attention

        # As defined in the Attention Is All You Need paper: https://arxiv.org/pdf/1706.03762.pdf
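        # i.e. Attention(Q, K, V) = softmax(Q K^T / sqrt(d_k)) V, with the softmax taken
        # over the key dimension (the last dim of Q K^T)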
        device = get_device()
        key_dim = th.tensor(keys.shape[-1], dtype=th.float32)
        batch_size = queries.shape[0]
        similarity = None
        if len(queries.size()) == 3:
            similarity = th.zeros(batch_size, queries.shape[1], keys.shape[1])
            similarity = similarity.to(device)
            for batch in range(batch_size):
                similarity[batch, :, :] = th.mm(queries[batch, :, :],
                                                keys[batch, :, :].t())
        else:
            heads = queries.shape[1]
            similarity = th.zeros(batch_size, heads, queries.shape[2],
                                  keys.shape[2])
            similarity = similarity.to(device)
            for batch in range(batch_size):
                for head in range(heads):
                    similarity[batch, head, :, :] = th.mm(
                        queries[batch, head, :, :], keys[batch, head, :, :].t())
        # scale by 1/sqrt(d_k), as in the scaled dot-product (QKV) formula
        similarity = (1 / th.sqrt(key_dim)) * similarity

        # We give you the mask to apply so that it is correct; you do not need to modify this.
        masked_similarity = self.apply_mask(similarity, mask=mask)
        last_dim = len(masked_similarity.size()) - 1
        # Normalize the similarities into attention weights; the softmax runs over the
        # last dim, which indexes the key positions.
        weights = F.softmax(masked_similarity, dim=last_dim)
        output = None
        if len(queries.size()) == 3:
            output = th.zeros(batch_size, queries.shape[1], values.shape[2])
            output = output.to(device)
            for batch in range(batch_size):
                output[batch, :, :] = th.mm(weights[batch, :, :],
                                            values[batch, :, :])
        else:
            heads = queries.shape[1]
            output = th.zeros(batch_size, heads, queries.shape[2],
                              values.shape[3])
            output = output.to(device)
            for batch in range(batch_size):
                for head in range(heads):
                    output[batch, head, :, :] = th.mm(
                        weights[batch, head, :, :], values[batch, head, :, :])
        ####################################  END OF YOUR CODE  ##################################

        return output, weights
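
The explicit per-batch and per-head loops above can be collapsed into a single batched matrix multiply, because th.matmul broadcasts over all leading dimensions. A minimal vectorized sketch of the same computation (an alternative, not the code above; the masking convention shown, masked_fill with -inf, is one common choice and may differ from the apply_mask helper used here):

import torch as th
import torch.nn.functional as F

def scaled_dot_product_attention(queries, keys, values, mask=None):
    # queries: [..., n_queries, depth_k]; keys: [..., n_keyval, depth_k]; values: [..., n_keyval, depth_v]
    depth_k = queries.shape[-1]
    # Q K^T / sqrt(d_k); matmul treats every leading dim as a batch dim
    similarity = th.matmul(queries, keys.transpose(-2, -1)) / (depth_k ** 0.5)
    if mask is not None:
        # mask is assumed to be a bool Tensor broadcastable to `similarity`
        # (e.g. unsqueeze a heads dim first when the inputs are 4-D)
        similarity = similarity.masked_fill(~mask, float('-inf'))
    weights = F.softmax(similarity, dim=-1)
    return th.matmul(weights, values), weights

Because matmul broadcasts, the same function handles both the 3-D [batch_size, seq, depth] and 4-D [batch_size, heads, seq, depth] inputs that the loop-based version above treats as separate cases.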