    def test_multi_head_self_attention_runs_forward(self):
        attention = MultiHeadSelfAttention(num_heads=3,
                                           input_dim=5,
                                           attention_dim=7,
                                           values_dim=9)
        inputs = Variable(torch.randn(2, 12, 5))
        assert list(attention(inputs).size()) == [2, 12, 5]
    def test_multi_head_self_attention_can_build_from_params(self):
        params = Params({"num_heads": 3, "input_dim": 2, "attention_dim": 3, "values_dim": 6})

        encoder = MultiHeadSelfAttention.from_params(params)
        assert isinstance(encoder, MultiHeadSelfAttention)
        assert encoder.get_input_dim() == 2
        assert encoder.get_output_dim() == 2
    def test_multi_head_self_attention_respects_masking(self):
        attention = MultiHeadSelfAttention(num_heads=3,
                                           input_dim=5,
                                           attention_dim=7,
                                           values_dim=9,
                                           attention_dropout_prob=0.0)
        tensor = Variable(torch.randn(2, 12, 5))
        mask = Variable(torch.ones([2, 12]))
        mask[0, 6:] = 0
        result = attention(tensor, mask)
        # Compute the same function without a mask, but with
        # only the unmasked elements - should be the same.
        result_without_mask = attention(tensor[:, :6, :])
        numpy.testing.assert_almost_equal(
            result[0, :6, :].data.cpu().numpy(),
            result_without_mask[0, :, :].data.cpu().numpy())
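
The from_params test above builds the encoder directly from a Params dictionary. As a complementary illustration, here is a minimal sketch of the same construction routed through the Seq2SeqEncoder registry; it assumes an AllenNLP 0.x install and that the class is registered under the name "multi_head_self_attention".

from allennlp.common import Params
from allennlp.modules.seq2seq_encoders import Seq2SeqEncoder

encoder = Seq2SeqEncoder.from_params(Params({
    "type": "multi_head_self_attention",   # assumed registered name
    "num_heads": 3,
    "input_dim": 2,
    "attention_dim": 3,
    "values_dim": 6,
}))
assert encoder.get_input_dim() == 2
assert encoder.get_output_dim() == 2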
Example #5
word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})

lstm = PytorchSeq2SeqWrapper(
    torch.nn.LSTM(EMBEDDING_DIM,
                  HIDDEN_DIM,
                  batch_first=True,
                  dropout=args.drop))
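# Note: torch.nn.LSTM only applies dropout between stacked layers, so with the
# default num_layers=1 this dropout setting has no effect (PyTorch warns about it).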

lstm_model = LanguageModel(contextualizer=lstm,
                           text_field_embedder=word_embeddings,
                           vocab=vocab)

transformer = MultiHeadSelfAttention(attention_dim=16,
                                     input_dim=EMBEDDING_DIM,
                                     num_heads=2,
                                     values_dim=16,
                                     attention_dropout_prob=args.drop)

transformer_model = LanguageModel(contextualizer=transformer,
                                  text_field_embedder=word_embeddings,
                                  vocab=vocab)

stacked_transformer = StackedSelfAttentionEncoder(
    input_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    num_layers=2,
    projection_dim=16,
    feedforward_hidden_dim=16,
    num_attention_heads=2,
    attention_dropout_prob=args.drop)
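
All three contextualizers above implement the same Seq2SeqEncoder contract, which is what makes them interchangeable as the contextualizer of a LanguageModel. A minimal, self-contained shape-check sketch follows (AllenNLP 0.x assumed; the dimensions are arbitrary placeholders rather than the script's EMBEDDING_DIM, HIDDEN_DIM, or args values).

import torch
from allennlp.modules.seq2seq_encoders import (MultiHeadSelfAttention,
                                               PytorchSeq2SeqWrapper,
                                               StackedSelfAttentionEncoder)

embedding_dim, hidden_dim = 16, 32
encoders = [
    PytorchSeq2SeqWrapper(torch.nn.LSTM(embedding_dim, hidden_dim, batch_first=True)),
    MultiHeadSelfAttention(num_heads=2, input_dim=embedding_dim,
                           attention_dim=16, values_dim=16),
    StackedSelfAttentionEncoder(input_dim=embedding_dim, hidden_dim=hidden_dim,
                                projection_dim=16, feedforward_hidden_dim=16,
                                num_layers=2, num_attention_heads=2),
]
tokens = torch.randn(4, 10, embedding_dim)         # (batch, timesteps, input dim)
for encoder in encoders:
    output = encoder(tokens, mask=None)            # (batch, timesteps, output dim)
    assert output.shape[-1] == encoder.get_output_dim()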
Example #6
    def __init__(self,
                 input_dim: int,
                 hidden_dim: int,
                 projection_dim: int,
                 feedforward_hidden_dim: int,
                 num_layers: int,
                 num_attention_heads: int,
                 dropout_prob: float = 0.1,
                 residual_dropout_prob: float = 0.2,
                 attention_dropout_prob: float = 0.1) -> None:
        super().__init__()

        self._attention_layers: List[MultiHeadSelfAttention] = []
        self._feedfoward_layers: List[FeedForward] = []
        self._layer_norm_layers: List[LayerNorm] = []
        self._feed_forward_layer_norm_layers: List[LayerNorm] = []
        self._reset_gate_layers: List[FeedForward] = []

        feedfoward_input_dim = input_dim
        for i in range(num_layers):
            feedfoward = FeedForward(
                feedfoward_input_dim,
                activations=[
                    Activation.by_name('relu')(),
                    Activation.by_name('linear')()
                ],
                hidden_dims=[feedforward_hidden_dim, hidden_dim],
                num_layers=2,
                dropout=dropout_prob)

            # Note: Please use `ModuleList` in new code. It provides better
            # support for running on multiple GPUs. We've kept `add_module` here
            # solely for backwards compatibility with existing serialized models.
            self.add_module(f"feedforward_{i}", feedfoward)
            self._feedfoward_layers.append(feedfoward)

            feedforward_layer_norm = LayerNorm(feedfoward.get_output_dim())
            self.add_module(f"feedforward_layer_norm_{i}",
                            feedforward_layer_norm)
            self._feed_forward_layer_norm_layers.append(feedforward_layer_norm)

            self_attention = MultiHeadSelfAttention(
                num_heads=num_attention_heads,
                input_dim=hidden_dim,
                attention_dim=projection_dim,
                values_dim=projection_dim,
                attention_dropout_prob=attention_dropout_prob)
            self.add_module(f"self_attention_{i}", self_attention)
            self._attention_layers.append(self_attention)

            reset_gate = FeedForward(
                feedforward_hidden_dim,
                activations=Activation.by_name('sigmoid')(),
                hidden_dims=hidden_dim,
                num_layers=1,
                dropout=dropout_prob)
            self.add_module(f"reset_gate_{i}", reset_gate)
            self._reset_gate_layers.append(reset_gate)

            layer_norm = LayerNorm(self_attention.get_output_dim())
            self.add_module(f"layer_norm_{i}", layer_norm)
            self._layer_norm_layers.append(layer_norm)

            feedfoward_input_dim = hidden_dim

        self.dropout = Dropout(residual_dropout_prob)
        self._input_dim = input_dim
        self._output_dim = self._attention_layers[-1].get_output_dim()
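
As the comment inside the loop above notes, nn.ModuleList is the preferred way to register repeated sub-layers in new code. Here is a minimal, self-contained sketch of that pattern, using a hypothetical TinyStack class rather than the encoder above: appending to an nn.ModuleList registers each sub-layer, so the parallel Python lists and explicit add_module calls become unnecessary.

import torch
from torch import nn

class TinyStack(nn.Module):
    # Hypothetical example: a stack of Linear layers held in an nn.ModuleList.
    def __init__(self, dim: int, num_layers: int) -> None:
        super().__init__()
        # Constructing (or appending to) an nn.ModuleList registers each layer as
        # a sub-module, so its parameters appear in .parameters(), .state_dict(),
        # and follow .to()/.cuda() on the parent module.
        self._layers = nn.ModuleList(nn.Linear(dim, dim) for _ in range(num_layers))

    def forward(self, inputs: torch.Tensor) -> torch.Tensor:
        for layer in self._layers:
            inputs = torch.relu(layer(inputs))
        return inputs

stack = TinyStack(dim=8, num_layers=3)
assert len(list(stack.parameters())) == 6   # weight and bias for each of the 3 layers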
Example #7
    def __init__(self, args, input_dim, hidden_dim, word_embedder):
        super(RelationAttendedDefinitionSentenceEncoder, self).__init__()
        self.config = args
        self.args = args
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim

        self.projection_dim = input_dim
        self.feedforward_hidden_dim = input_dim
        self.num_layers = self.args.num_layers_for_stackatt
        self.num_attention_heads = self.args.num_atthead_for_stackatt

        self.word_embedder = word_embedder
        self.word_embedding_dropout = nn.Dropout(
            self.args.word_embedding_dropout)

        # from allennlp.modules.seq2seq_encoders import PassThroughEncoder, StackedSelfAttentionEncoder, \
        #     QaNetEncoder, IntraSentenceAttentionEncoder, GatedCnnEncoder, \
        #     BidirectionalLanguageModelTransformer, FeedForwardEncoder

        if self.args.definition_seq2seq == 'passthrough':
            self.seq2seq = PassThroughEncoder(input_dim=input_dim)
        elif self.args.definition_seq2seq == 'multiheadstackatt':
            self.seq2seq = StackedSelfAttentionEncoder(
                input_dim=input_dim,
                hidden_dim=input_dim,
                projection_dim=input_dim,
                feedforward_hidden_dim=input_dim,
                num_layers=2,
                num_attention_heads=2)
        elif self.args.definition_seq2seq == 'qanet':
            self.seq2seq = QaNetEncoder(input_dim=input_dim,
                                        hidden_dim=input_dim,
                                        attention_projection_dim=input_dim,
                                        feedforward_hidden_dim=input_dim,
                                        num_blocks=2,
                                        num_convs_per_block=2,
                                        conv_kernel_size=3,
                                        num_attention_heads=2)
        elif self.args.definition_seq2seq == 'intrasentenceatt':
            self.seq2seq = IntraSentenceAttentionEncoder(
                input_dim=input_dim,
                projection_dim=input_dim,
                output_dim=input_dim)
        elif self.args.definition_seq2seq == 'gatedcnn':
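            # Note: unlike the other branches, this one hard-codes 512-channel
            # layers instead of reusing input_dim.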
            self.seq2seq = GatedCnnEncoder(input_dim=512,
                                           layers=[[[4, 512]],
                                                   [[4, 512], [4, 512]],
                                                   [[4, 512], [4, 512]],
                                                   [[4, 512], [4, 512]]],
                                           dropout=0.05)
        elif self.args.definition_seq2seq == 'bilmtransformer':
            self.seq2seq = BidirectionalLanguageModelTransformer(
                input_dim=input_dim, hidden_dim=input_dim, num_layers=2)
        # elif self.args.definition_seq2seq == 'feedfoward':
        #     feedforward = FeedForward(input_dim=input_dim, num_layers=1, hidden_dims=input_dim, activations=self.args.activation_for_sentence_ff)
        #     self.seq2seq = FeedForwardEncoder(feedforward)

        # Valid names for FeedForward's ``activations`` argument include:
        # "linear", "relu", "relu6", "elu", "prelu", "leaky_relu", "threshold",
        # "hardtanh", "sigmoid", "tanh", "log_sigmoid", "softplus",
        # "softshrink", "softsign", "tanhshrink" -- see
        # https://pytorch.org/docs/master/nn.html for the corresponding
        # torch.nn activation modules.

        elif self.args.definition_seq2seq == 'multiheadselfatt':
            self.seq2seq = MultiHeadSelfAttention(
                num_heads=2,
                input_dim=input_dim,
                output_projection_dim=input_dim,
                attention_dim=input_dim,
                values_dim=input_dim)
        else:
            print('Encoder not defined:', self.args.definition_seq2seq)
            exit()
    def __init__(self,
                 vocab: Vocabulary,
                 text_field_embedder: TextFieldEmbedder,
                 num_highway_layers: int,
                 phrase_layer: Seq2SeqEncoder,
                 similarity_function: SimilarityFunction,
                 multi_headed_attention_layer: MultiHeadSelfAttention,
                 modeling_layer: Seq2SeqEncoder,
                 span_end_encoder: Seq2SeqEncoder,
                 dropout: float = 0.2,
                 mask_lstms: bool = True,
                 initializer: InitializerApplicator = InitializerApplicator(),
                 regularizer: Optional[RegularizerApplicator] = None) -> None:
        super(BidirectionalAttentionFlow, self).__init__(vocab, regularizer)

        self._text_field_embedder = text_field_embedder
        self._highway_layer = TimeDistributed(
            Highway(text_field_embedder.get_output_dim(), num_highway_layers))
        self._phrase_layer = phrase_layer
        self._matrix_attention = LegacyMatrixAttention(similarity_function)
        self._modeling_layer = modeling_layer
        self._span_end_encoder = span_end_encoder

        # New self-attention layer
        self._self_attention_layer = multi_headed_attention_layer
        self._sa_matrix_attention = LegacyMatrixAttention(similarity_function)
        selfattent_dim = multi_headed_attention_layer.get_output_dim()
        #print("Self Attention Output Dim:",selfattent_dim,"\n")

        encoding_dim = phrase_layer.get_output_dim()
        modeling_dim = modeling_layer.get_output_dim()
        #span_start_input_dim = encoding_dim * 4 + modeling_dim

        span_start_input_dim = encoding_dim * 4 + modeling_dim + 2 * selfattent_dim

        self._span_start_predictor = TimeDistributed(
            torch.nn.Linear(span_start_input_dim, 1))

        span_end_encoding_dim = span_end_encoder.get_output_dim()
        #span_end_input_dim = encoding_dim * 4 + span_end_encoding_dim
        span_end_input_dim = encoding_dim * 4 + span_end_encoding_dim + 2 * selfattent_dim

        self._span_end_predictor = TimeDistributed(
            torch.nn.Linear(span_end_input_dim, 1))

        # Bidaf has lots of layer dimensions which need to match up - these aren't necessarily
        # obvious from the configuration files, so we check here.
        check_dimensions_match(modeling_layer.get_input_dim(),
                               4 * encoding_dim + 2 * selfattent_dim,
                               "modeling layer input dim",
                               "4 * encoding dim + 2 * self attention dim")
        check_dimensions_match(text_field_embedder.get_output_dim(),
                               phrase_layer.get_input_dim(),
                               "text field embedder output dim",
                               "phrase layer input dim")
        check_dimensions_match(
            span_end_encoder.get_input_dim(),
            4 * encoding_dim + 3 * modeling_dim + 2 * selfattent_dim,
            "span end encoder input dim",
            "4 * encoding dim + 3 * modeling dim + 2 * self attention dim")

        self._na_accuracy = CategoricalAccuracy()
        self._span_start_accuracy = CategoricalAccuracy()
        self._span_end_accuracy = CategoricalAccuracy()
        self._span_accuracy = BooleanAccuracy()
        self._squad_metrics = SquadEmAndF1()

        self._na_dense = lambda in_dim: torch.nn.Linear(in_dim, 2).cuda()

        if dropout > 0:
            self._dropout = torch.nn.Dropout(p=dropout)
        else:
            self._dropout = lambda x: x
        self._mask_lstms = mask_lstms

        initializer(self)
    def __init__(self,
                 token_embedders: Dict[str, TokenEmbedder],
                 output_dim: int,
                 allow_unmatched_keys: bool = False) -> None:
        super(AttentiveTextFieldEmbedder, self).__init__()
        self._token_embedders = token_embedders
        self.output_dim = output_dim

        for key, embedder in token_embedders.items():
            name = 'token_embedder_%s' % key
            self.add_module(name, embedder)
        self._allow_unmatched_keys = allow_unmatched_keys

        self.use_glove = False
        if 'tokens' in self._token_embedders:
            self.use_glove = True
            self.glove_embedder = self._token_embedders['tokens']

        self.use_elmo = False
        if 'elmo' in self._token_embedders:
            self.use_elmo = True
            self.elmo_embedder = self._token_embedders['elmo']

        self.use_char = False
        if 'token_characters' in self._token_embedders:
            self.use_char = True
            self.char_embedder = self._token_embedders['token_characters']

        self.num_tasks = len(self._token_embedders) - int(
            self.use_glove) - int(self.use_elmo) - int(self.use_char)

        self.separate_embedder_keys = set(
            ['tokens', 'elmo', 'token_characters'])
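        # Note: a plain dict does not register these Linear layers as sub-modules,
        # so their parameters are not picked up by .parameters(), .state_dict(),
        # or .to(); that is why each one is moved to the GPU explicitly below.
        # nn.ModuleDict would handle this registration automatically.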
        self.linear_layers = {}
        for key, embedder in self._token_embedders.items():
            if key in self.separate_embedder_keys:
                continue
            in_dim = embedder.get_output_dim()
            out_dim = self.output_dim
            self.linear_layers[key] = nn.Linear(in_dim, out_dim, bias=False)
            if torch.cuda.is_available():
                self.linear_layers[key].cuda()

        seaparate_embedding_total_dim = 0
        for key in self.separate_embedder_keys:
            if key in self._token_embedders:
                embedder = self._token_embedders[key]
                seaparate_embedding_total_dim += embedder.get_output_dim()

        self.lstm = lstm = LSTM(bidirectional=True,
                                num_layers=1,
                                input_size=output_dim,
                                hidden_size=output_dim,
                                batch_first=True)
        self.rnn_encoder = PytorchSeq2SeqWrapper(lstm)
        self.attention = MultiHeadSelfAttention(
            num_heads=4,
            input_dim=output_dim,
            attention_dim=output_dim,
            values_dim=output_dim,
            output_projection_dim=output_dim)