Example no. 1
    @classmethod
    def from_params(cls, vocab: Vocabulary, params: Params) -> 'DeIsTe':
        embedder_params = params.pop("text_field_embedder")
        text_field_embedder = TextFieldEmbedder.from_params(
            vocab, embedder_params)

        inter_attention = MatrixAttention.from_params(
            params.pop("inter_attention"))
        param_dyn_encoder = Seq2VecEncoder.from_params(
            params.pop("param_dyn_encoder"))

        pos_embedder = TokenEmbedder.from_params(
            vocab=None, params=params.pop("pos_embedder"))
        pos_attn_encoder = Seq2VecEncoder.from_params(
            params.pop("pos_attn_encoder"))

        output_feedforward_params = params.pop('output_feedforward', None)
        output_feedforward = FeedForward.from_params(
            output_feedforward_params) if output_feedforward_params else None

        initializer = InitializerApplicator.from_params(
            params.pop('initializer', []))
        regularizer = RegularizerApplicator.from_params(
            params.pop('regularizer', []))

        params.assert_empty(cls.__name__)
        return cls(vocab=vocab,
                   text_field_embedder=text_field_embedder,
                   inter_attention=inter_attention,
                   param_dyn_encoder=param_dyn_encoder,
                   pos_embedder=pos_embedder,
                   pos_attn_encoder=pos_attn_encoder,
                   output_feedforward=output_feedforward,
                   initializer=initializer,
                   regularizer=regularizer)
    def test_can_init_linear(self):
        legacy_attention = MatrixAttention.from_params(
            Params({
                "type": "linear",
                "tensor_1_dim": 3,
                "tensor_2_dim": 3
            }))
        assert isinstance(legacy_attention, LinearMatrixAttention)
Example no. 3
    def __init__(
        self,
        hidden_size1: int,
        hidden_size2: int,
        combined_hidden_size: int,
        num_attention_heads: int,
        dropout1: float = 0.0,
        dropout2: float = 0.0,
        scoring_func1: str = "scaled_dot_product",
        scoring_func2: str = "scaled_dot_product",
    ):
        super().__init__()
        if combined_hidden_size % num_attention_heads != 0:
            raise ValueError(
                "The combined hidden size (%d) is not a multiple of the number of "
                "attention heads (%d)" % (combined_hidden_size, num_attention_heads))

        self.num_attention_heads = num_attention_heads
        self.attention_head_size = int(combined_hidden_size /
                                       num_attention_heads)

        # This is basically the `combined_hidden_size`, since we already ensure
        # that `combined_hidden_size` is divisible by `num_attention_heads`.
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        # First modality:

        self.query1 = torch.nn.Linear(hidden_size1, self.all_head_size)
        self.key1 = torch.nn.Linear(hidden_size1, self.all_head_size)
        self.value1 = torch.nn.Linear(hidden_size1, self.all_head_size)

        self.scoring_func1 = scoring_func1
        self.attn1 = MatrixAttention.by_name(self.scoring_func1)()
        self.dropout1 = torch.nn.Dropout(dropout1)

        # Second modality:

        self.query2 = torch.nn.Linear(hidden_size2, self.all_head_size)
        self.key2 = torch.nn.Linear(hidden_size2, self.all_head_size)
        self.value2 = torch.nn.Linear(hidden_size2, self.all_head_size)

        self.scoring_func2 = scoring_func2
        self.attn2 = MatrixAttention.by_name(self.scoring_func2)()
        self.dropout2 = torch.nn.Dropout(dropout2)
    def test_can_build_from_params(self):
        params = Params({
            "type": "legacy",
            "similarity_function": {
                "type": "cosine"
            }
        })
        attention = MatrixAttention.from_params(params)

        assert attention._similarity_function.__class__.__name__ == "CosineSimilarity"
    def test_can_init_cosine(self):
        legacy_attention = MatrixAttention.from_params(Params({"type": "cosine"}))
        assert isinstance(legacy_attention, CosineMatrixAttention)

    def test_can_init_dot(self):
        legacy_attention = MatrixAttention.from_params(Params({"type": "dot_product"}))
        assert isinstance(legacy_attention, DotProductMatrixAttention)
import torch
from overrides import overrides

from allennlp.modules.matrix_attention.matrix_attention import MatrixAttention


@MatrixAttention.register("cosine")
class CosineMatrixAttention(MatrixAttention):
    """
    Computes attention between every entry in matrix_1 and every entry in matrix_2 using cosine
    similarity.
    """

    @overrides
    def forward(self, matrix_1: torch.Tensor, matrix_2: torch.Tensor) -> torch.Tensor:
        # Normalize each row vector (with a small epsilon for numerical stability),
        # then take pairwise dot products, which yields cosine similarities.
        a_norm = matrix_1 / (matrix_1.norm(p=2, dim=-1, keepdim=True) + 1e-13)
        b_norm = matrix_2 / (matrix_2.norm(p=2, dim=-1, keepdim=True) + 1e-13)
        return torch.bmm(a_norm, b_norm.transpose(-1, -2))
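A minimal usage sketch of the cosine module above, assuming the usual (batch_size, num_rows, embedding_dim) input convention; the tensor sizes are illustrative only.

import torch

matrix_1 = torch.randn(2, 4, 8)   # batch of 2, 4 rows, embedding size 8
matrix_2 = torch.randn(2, 6, 8)   # batch of 2, 6 rows, embedding size 8

attention = CosineMatrixAttention()
similarities = attention(matrix_1, matrix_2)
# One cosine similarity per (row of matrix_1, row of matrix_2) pair.
assert similarities.shape == (2, 4, 6)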
Example no. 11
    def __init__(
        self,
        hidden_size: int = 512,
        attention_head_size: int = 64,
        num_attention_heads: int = 8,
        scoring_func: str = "scaled_dot_product",
        output_linear: bool = False,
        dropout: float = 0.0,
        bias: bool = True,
        normalize_weights: bool = False,
        is_decoder: bool = False,
        is_cross_attention: bool = False,
        relative_attention_num_buckets: Optional[int] = None,
    ):

        super().__init__()

        if hidden_size % num_attention_heads != 0:
            raise ConfigurationError(
                "The hidden size (%d) is not a multiple of the number of attention "
                "heads (%d)" % (hidden_size, num_attention_heads))

        if is_cross_attention and not is_decoder:
            raise ConfigurationError(
                "The attention layer can be a cross-attention layer only "
                "if it is within a decoder.")

        self.hidden_size = hidden_size
        self.num_attention_heads = num_attention_heads
        self.attention_head_size = attention_head_size
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        self.query = torch.nn.Linear(hidden_size,
                                     self.all_head_size,
                                     bias=bias)
        self.key = torch.nn.Linear(hidden_size, self.all_head_size, bias=bias)
        self.value = torch.nn.Linear(hidden_size,
                                     self.all_head_size,
                                     bias=bias)

        # Output projection layer, used by models such as DistilBERT and T5.
        if output_linear:
            self.output = torch.nn.Linear(self.all_head_size,
                                          hidden_size,
                                          bias=bias)

        self.scoring_func = scoring_func
        self.attn = MatrixAttention.by_name(self.scoring_func)()

        self.relative_attention_num_buckets = relative_attention_num_buckets

        if self.relative_attention_num_buckets is not None:
            self.relative_attention_bias = torch.nn.Embedding(
                self.relative_attention_num_buckets, self.num_attention_heads)

        self.dropout = dropout

        self.is_decoder = is_decoder
        self.is_cross_attention = is_cross_attention

        if normalize_weights:
            self._normalize()
    def reset_parameters(self):
        std = math.sqrt(6 / (self._weight_vector.size(0) + 1))
        self._weight_vector.data.uniform_(-std, std)
        self._bias.data.fill_(0)

    @overrides
    def forward(self, matrix_1: torch.Tensor, matrix_2: torch.Tensor) -> torch.Tensor:
        # TODO(mattg): Remove the need for this tiling.
        # https://github.com/allenai/allennlp/pull/1235#issuecomment-391540133
        tiled_matrix_1 = matrix_1.unsqueeze(2).expand(matrix_1.size()[0],
                                                      matrix_1.size()[1],
                                                      matrix_2.size()[1],
                                                      matrix_1.size()[2])
        tiled_matrix_2 = matrix_2.unsqueeze(1).expand(matrix_2.size()[0],
                                                      matrix_1.size()[1],
                                                      matrix_2.size()[1],
                                                      matrix_2.size()[2])

        combined_tensors = util.combine_tensors(
            self._combination, [tiled_matrix_1, tiled_matrix_2])
        dot_product = torch.matmul(combined_tensors, self._weight_vector)
        return self._activation(dot_product + self._bias)


LinearMatrixAttention = MatrixAttention.register("linear")(LinearMatrixAttention)

class LegacyMatrixAttention(MatrixAttention):
    """
    The legacy implementation of ``MatrixAttention``.

    It should be considered deprecated as it uses much more memory than the newer specialized
    ``MatrixAttention`` modules.

    Parameters
    ----------
    similarity_function: ``SimilarityFunction``, optional (default=``DotProductSimilarity``)
        The similarity function to use when computing the attention.
    """
    def __init__(self, similarity_function: SimilarityFunction = None) -> None:
        super().__init__()
        self._similarity_function = similarity_function or DotProductSimilarity()

    @overrides
    def forward(self, matrix_1: torch.Tensor, matrix_2: torch.Tensor) -> torch.Tensor:
        tiled_matrix_1 = matrix_1.unsqueeze(2).expand(matrix_1.size()[0],
                                                      matrix_1.size()[1],
                                                      matrix_2.size()[1],
                                                      matrix_1.size()[2])
        tiled_matrix_2 = matrix_2.unsqueeze(1).expand(matrix_2.size()[0],
                                                      matrix_1.size()[1],
                                                      matrix_2.size()[1],
                                                      matrix_2.size()[2])
        return self._similarity_function(tiled_matrix_1, tiled_matrix_2)


LegacyMatrixAttention = MatrixAttention.register("legacy")(LegacyMatrixAttention)
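A hedged sketch of constructing the legacy module above directly, assuming CosineSimilarity can be imported from allennlp.modules.similarity_functions as in the tests earlier; the shapes are illustrative. Both inputs are tiled to (batch, rows_1, rows_2, dim) before the similarity function is applied, which is why this module uses more memory than the specialized ones.

import torch
from allennlp.modules.similarity_functions import CosineSimilarity

matrix_1 = torch.randn(2, 4, 8)
matrix_2 = torch.randn(2, 6, 8)

attention = LegacyMatrixAttention(CosineSimilarity())
scores = attention(matrix_1, matrix_2)   # inputs are tiled to (2, 4, 6, 8) internally before scoring
assert scores.shape == (2, 4, 6)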
import torch
from overrides import overrides

from allennlp.modules.matrix_attention.matrix_attention import MatrixAttention


@MatrixAttention.register("dot_product")
class DotProductMatrixAttention(MatrixAttention):
    """
    Computes attention between every entry in matrix_1 and every entry in matrix_2 using a dot
    product.
    """

    @overrides
    def forward(self, matrix_1: torch.Tensor, matrix_2: torch.Tensor) -> torch.Tensor:
        # Batched matrix multiply: (batch, rows_1, dim) x (batch, dim, rows_2)
        # -> (batch, rows_1, rows_2) raw dot-product scores.
        return matrix_1.bmm(matrix_2.transpose(2, 1))
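For contrast with the cosine version, a quick shape and equivalence check with illustrative sizes; the scores here are raw, unnormalized dot products.

import torch

matrix_1 = torch.randn(2, 4, 8)
matrix_2 = torch.randn(2, 6, 8)

attention = DotProductMatrixAttention()
scores = attention(matrix_1, matrix_2)
assert scores.shape == (2, 4, 6)
# The batched matmul is equivalent to an explicit einsum over the shared embedding dimension.
assert torch.allclose(scores, torch.einsum("bid,bjd->bij", matrix_1, matrix_2))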
Example no. 15
        self._weight_matrix = Parameter(
            torch.Tensor(matrix_1_dim, matrix_2_dim))

        self._bias = Parameter(torch.Tensor(1))
        self._activation = activation or Activation.by_name('linear')()
        self._use_input_biases = use_input_biases
        self.reset_parameters()

    def reset_parameters(self):
        torch.nn.init.xavier_uniform_(self._weight_matrix)
        self._bias.data.fill_(0)

    @overrides
    def forward(self, matrix_1: torch.Tensor, matrix_2: torch.Tensor) -> torch.Tensor:
        if self._use_input_biases:
            bias1 = matrix_1.new_ones(matrix_1.size()[:-1] + (1, ))
            bias2 = matrix_2.new_ones(matrix_2.size()[:-1] + (1, ))

            matrix_1 = torch.cat([matrix_1, bias1], -1)
            matrix_2 = torch.cat([matrix_2, bias2], -1)
        intermediate = torch.matmul(matrix_1.unsqueeze(1),
                                    self._weight_matrix.unsqueeze(0))
        final = torch.matmul(intermediate,
                             matrix_2.unsqueeze(1).transpose(2, 3))
        return self._activation(final.squeeze(1) + self._bias)


BilinearMatrixAttention = MatrixAttention.register("bilinear")(BilinearMatrixAttention)
    def test_can_init_dot(self):
        legacy_attention = MatrixAttention.from_params(Params({"type": "scaled_dot_product"}))
        assert isinstance(legacy_attention, ScaledDotProductMatrixAttention)