def test_multi_head_self_attention_runs_forward(self):
    attention = MultiHeadSelfAttention(num_heads=3,
                                       input_dim=5,
                                       attention_dim=7,
                                       values_dim=9)
    inputs = Variable(torch.randn(2, 12, 5))
    assert list(attention(inputs).size()) == [2, 12, 5]
def test_multi_head_self_attention_can_build_from_params(self):
    params = Params({"num_heads": 3,
                     "input_dim": 2,
                     "attention_dim": 3,
                     "values_dim": 6})
    encoder = MultiHeadSelfAttention.from_params(params)
    assert isinstance(encoder, MultiHeadSelfAttention)
    assert encoder.get_input_dim() == 2
    assert encoder.get_output_dim() == 2
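# A minimal sketch, not from the test suite above: when output_projection_dim
# is supplied, MultiHeadSelfAttention is expected to project the attended
# values to that size, so get_output_dim() no longer equals get_input_dim().
projected = MultiHeadSelfAttention(num_heads=3,
                                   input_dim=2,
                                   attention_dim=3,
                                   values_dim=6,
                                   output_projection_dim=4)
assert projected.get_input_dim() == 2
assert projected.get_output_dim() == 4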
def test_multi_head_self_attention_respects_masking(self):
    attention = MultiHeadSelfAttention(num_heads=3,
                                       input_dim=5,
                                       attention_dim=7,
                                       values_dim=9,
                                       attention_dropout_prob=0.0)
    tensor = Variable(torch.randn(2, 12, 5))
    mask = Variable(torch.ones([2, 12]))
    mask[0, 6:] = 0
    result = attention(tensor, mask)
    # Compute the same function without a mask, but with only the unmasked
    # elements - should be the same.
    result_without_mask = attention(tensor[:, :6, :])
    numpy.testing.assert_almost_equal(result[0, :6, :].data.cpu().numpy(),
                                      result_without_mask[0, :, :].data.cpu().numpy())
word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})

lstm = PytorchSeq2SeqWrapper(
    torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, batch_first=True, dropout=args.drop))
lstm_model = LanguageModel(contextualizer=lstm,
                           text_field_embedder=word_embeddings,
                           vocab=vocab)

transformer = MultiHeadSelfAttention(attention_dim=16,
                                     input_dim=EMBEDDING_DIM,
                                     num_heads=2,
                                     values_dim=16,
                                     attention_dropout_prob=args.drop)
transformer_model = LanguageModel(contextualizer=transformer,
                                  text_field_embedder=word_embeddings,
                                  vocab=vocab)

stacked_transformer = StackedSelfAttentionEncoder(input_dim=EMBEDDING_DIM,
                                                  hidden_dim=HIDDEN_DIM,
                                                  num_layers=2,
                                                  projection_dim=16,
                                                  feedforward_hidden_dim=16,
                                                  num_attention_heads=2,
                                                  attention_dropout_prob=args.drop)
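# A hedged sanity check, not part of the original script: both attention
# contextualizers expose the Seq2SeqEncoder interface but report different
# output widths to the LanguageModel. The single MultiHeadSelfAttention (no
# output_projection_dim) keeps EMBEDDING_DIM, while the stacked encoder's
# output width is its hidden_dim, i.e. HIDDEN_DIM.
assert transformer.get_output_dim() == EMBEDDING_DIM
assert stacked_transformer.get_output_dim() == HIDDEN_DIM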
def __init__(self,
             input_dim: int,
             hidden_dim: int,
             projection_dim: int,
             feedforward_hidden_dim: int,
             num_layers: int,
             num_attention_heads: int,
             dropout_prob: float = 0.1,
             residual_dropout_prob: float = 0.2,
             attention_dropout_prob: float = 0.1) -> None:
    super().__init__()

    self._attention_layers: List[MultiHeadSelfAttention] = []
    self._feedfoward_layers: List[FeedForward] = []
    self._layer_norm_layers: List[LayerNorm] = []
    self._feed_forward_layer_norm_layers: List[LayerNorm] = []
    self._reset_gate_layers: List[FeedForward] = []

    feedfoward_input_dim = input_dim
    for i in range(num_layers):
        feedfoward = FeedForward(feedfoward_input_dim,
                                 activations=[Activation.by_name('relu')(),
                                              Activation.by_name('linear')()],
                                 hidden_dims=[feedforward_hidden_dim, hidden_dim],
                                 num_layers=2,
                                 dropout=dropout_prob)

        # Note: Please use `ModuleList` in new code. It provides better
        # support for running on multiple GPUs. We've kept `add_module` here
        # solely for backwards compatibility with existing serialized models.
        self.add_module(f"feedforward_{i}", feedfoward)
        self._feedfoward_layers.append(feedfoward)

        feedforward_layer_norm = LayerNorm(feedfoward.get_output_dim())
        self.add_module(f"feedforward_layer_norm_{i}", feedforward_layer_norm)
        self._feed_forward_layer_norm_layers.append(feedforward_layer_norm)

        self_attention = MultiHeadSelfAttention(num_heads=num_attention_heads,
                                                input_dim=hidden_dim,
                                                attention_dim=projection_dim,
                                                values_dim=projection_dim,
                                                attention_dropout_prob=attention_dropout_prob)
        self.add_module(f"self_attention_{i}", self_attention)
        self._attention_layers.append(self_attention)

        reset_gate = FeedForward(feedforward_hidden_dim,
                                 activations=Activation.by_name('sigmoid')(),
                                 hidden_dims=hidden_dim,
                                 num_layers=1,
                                 dropout=dropout_prob)
        self.add_module(f"reset_gate_{i}", reset_gate)
        self._reset_gate_layers.append(reset_gate)

        layer_norm = LayerNorm(self_attention.get_output_dim())
        self.add_module(f"layer_norm_{i}", layer_norm)
        self._layer_norm_layers.append(layer_norm)

        feedfoward_input_dim = hidden_dim

    self.dropout = Dropout(residual_dropout_prob)
    self._input_dim = input_dim
    self._output_dim = self._attention_layers[-1].get_output_dim()
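# A hedged usage sketch for the constructor above. The forward signature is an
# assumption: like other AllenNLP Seq2SeqEncoders, the encoder is expected to
# take (inputs, mask) and return one vector per timestep. Dimensions are
# illustrative only.
encoder = StackedSelfAttentionEncoder(input_dim=16,
                                      hidden_dim=32,
                                      projection_dim=16,
                                      feedforward_hidden_dim=64,
                                      num_layers=2,
                                      num_attention_heads=4)
inputs = torch.randn(2, 10, 16)   # (batch, timesteps, input_dim)
mask = torch.ones(2, 10)          # 1 for real tokens, 0 for padding
outputs = encoder(inputs, mask)   # (batch, timesteps, hidden_dim) == (2, 10, 32)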
def __init__(self, args, input_dim, hidden_dim, word_embedder):
    super(RelationAttendedDefinitionSentenceEncoder, self).__init__()
    self.config = args
    self.args = args
    self.input_dim = input_dim
    self.hidden_dim = hidden_dim
    self.projection_dim = input_dim
    self.feedforward_hidden_dim = input_dim
    self.num_layers = self.args.num_layers_for_stackatt
    self.num_attention_heads = self.args.num_atthead_for_stackatt

    self.word_embedder = word_embedder
    self.word_embedding_dropout = nn.Dropout(self.args.word_embedding_dropout)

    if self.args.definition_seq2seq == 'passthrough':
        self.seq2seq = PassThroughEncoder(input_dim=input_dim)
    elif self.args.definition_seq2seq == 'multiheadstackatt':
        self.seq2seq = StackedSelfAttentionEncoder(input_dim=input_dim,
                                                   hidden_dim=input_dim,
                                                   projection_dim=input_dim,
                                                   feedforward_hidden_dim=input_dim,
                                                   num_layers=2,
                                                   num_attention_heads=2)
    elif self.args.definition_seq2seq == 'qanet':
        self.seq2seq = QaNetEncoder(input_dim=input_dim,
                                    hidden_dim=input_dim,
                                    attention_projection_dim=input_dim,
                                    feedforward_hidden_dim=input_dim,
                                    num_blocks=2,
                                    num_convs_per_block=2,
                                    conv_kernel_size=3,
                                    num_attention_heads=2)
    elif self.args.definition_seq2seq == 'intrasentenceatt':
        self.seq2seq = IntraSentenceAttentionEncoder(input_dim=input_dim,
                                                     projection_dim=input_dim,
                                                     output_dim=input_dim)
    elif self.args.definition_seq2seq == 'gatedcnn':
        self.seq2seq = GatedCnnEncoder(input_dim=512,
                                       layers=[[[4, 512]],
                                               [[4, 512], [4, 512]],
                                               [[4, 512], [4, 512]],
                                               [[4, 512], [4, 512]]],
                                       dropout=0.05)
    elif self.args.definition_seq2seq == 'bilmtransformer':
        self.seq2seq = BidirectionalLanguageModelTransformer(input_dim=input_dim,
                                                             hidden_dim=input_dim,
                                                             num_layers=2)
    # elif self.args.definition_seq2seq == 'feedfoward':
    #     feedforward = FeedForward(input_dim=input_dim, num_layers=1, hidden_dims=input_dim,
    #                               activations=self.args.activation_for_sentence_ff)
    #     self.seq2seq = FeedForwardEncoder(feedforward)
    elif self.args.definition_seq2seq == 'multiheadselfatt':
        self.seq2seq = MultiHeadSelfAttention(num_heads=2,
                                              input_dim=input_dim,
                                              output_projection_dim=input_dim,
                                              attention_dim=input_dim,
                                              values_dim=input_dim)
    else:
        print('Encoder not defined:', self.args.definition_seq2seq)
        exit()
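# A hedged construction sketch: `args` only needs the attributes read in the
# constructor above. The Namespace and `my_word_embedder` below are
# hypothetical stand-ins, not the project's real argument parser or embedder.
from argparse import Namespace

args = Namespace(definition_seq2seq='multiheadselfatt',
                 num_layers_for_stackatt=2,
                 num_atthead_for_stackatt=2,
                 word_embedding_dropout=0.1)
encoder = RelationAttendedDefinitionSentenceEncoder(args,
                                                    input_dim=256,
                                                    hidden_dim=256,
                                                    word_embedder=my_word_embedder)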
def __init__(self, vocab: Vocabulary,
             text_field_embedder: TextFieldEmbedder,
             num_highway_layers: int,
             phrase_layer: Seq2SeqEncoder,
             similarity_function: SimilarityFunction,
             multi_headed_attention_layer: MultiHeadSelfAttention,
             modeling_layer: Seq2SeqEncoder,
             span_end_encoder: Seq2SeqEncoder,
             dropout: float = 0.2,
             mask_lstms: bool = True,
             initializer: InitializerApplicator = InitializerApplicator(),
             regularizer: Optional[RegularizerApplicator] = None) -> None:
    super(BidirectionalAttentionFlow, self).__init__(vocab, regularizer)

    self._text_field_embedder = text_field_embedder
    self._highway_layer = TimeDistributed(Highway(text_field_embedder.get_output_dim(),
                                                  num_highway_layers))
    self._phrase_layer = phrase_layer
    self._matrix_attention = LegacyMatrixAttention(similarity_function)
    self._modeling_layer = modeling_layer
    self._span_end_encoder = span_end_encoder

    # New self-attention layer
    self._self_attention_layer = multi_headed_attention_layer
    self._sa_matrix_attention = LegacyMatrixAttention(similarity_function)
    selfattent_dim = multi_headed_attention_layer.get_output_dim()

    encoding_dim = phrase_layer.get_output_dim()
    modeling_dim = modeling_layer.get_output_dim()
    span_start_input_dim = encoding_dim * 4 + modeling_dim + 2 * selfattent_dim
    self._span_start_predictor = TimeDistributed(torch.nn.Linear(span_start_input_dim, 1))

    span_end_encoding_dim = span_end_encoder.get_output_dim()
    span_end_input_dim = encoding_dim * 4 + span_end_encoding_dim + 2 * selfattent_dim
    self._span_end_predictor = TimeDistributed(torch.nn.Linear(span_end_input_dim, 1))

    # Bidaf has lots of layer dimensions which need to match up - these aren't necessarily
    # obvious from the configuration files, so we check here.
    check_dimensions_match(modeling_layer.get_input_dim(),
                           4 * encoding_dim + 2 * selfattent_dim,
                           "modeling layer input dim",
                           "4 * encoding dim + 2 * self attention dim")
    check_dimensions_match(text_field_embedder.get_output_dim(),
                           phrase_layer.get_input_dim(),
                           "text field embedder output dim",
                           "phrase layer input dim")
    check_dimensions_match(span_end_encoder.get_input_dim(),
                           4 * encoding_dim + 3 * modeling_dim + 2 * selfattent_dim,
                           "span end encoder input dim",
                           "4 * encoding dim + 3 * modeling dim + 2 * self attention dim")

    self._na_accuracy = CategoricalAccuracy()
    self._span_start_accuracy = CategoricalAccuracy()
    self._span_end_accuracy = CategoricalAccuracy()
    self._span_accuracy = BooleanAccuracy()
    self._squad_metrics = SquadEmAndF1()
    self._na_dense = lambda in_dim: torch.nn.Linear(in_dim, 2).cuda()

    if dropout > 0:
        self._dropout = torch.nn.Dropout(p=dropout)
    else:
        self._dropout = lambda x: x
    self._mask_lstms = mask_lstms

    initializer(self)
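# A hedged configuration sketch for the new multi_headed_attention_layer
# argument, reusing the from_params pattern shown earlier; the dimensions are
# illustrative and would still need to satisfy the check_dimensions_match
# constraints in the constructor above.
sa_params = Params({"num_heads": 4,
                    "input_dim": 200,
                    "attention_dim": 200,
                    "values_dim": 200})
self_attention_layer = MultiHeadSelfAttention.from_params(sa_params)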
def __init__(self,
             token_embedders: Dict[str, TokenEmbedder],
             output_dim: int,
             allow_unmatched_keys: bool = False) -> None:
    super(AttentiveTextFieldEmbedder, self).__init__()
    self._token_embedders = token_embedders
    self.output_dim = output_dim
    for key, embedder in token_embedders.items():
        name = 'token_embedder_%s' % key
        self.add_module(name, embedder)
    self._allow_unmatched_keys = allow_unmatched_keys

    self.use_glove = False
    if 'tokens' in self._token_embedders:
        self.use_glove = True
        self.glove_embedder = self._token_embedders['tokens']

    self.use_elmo = False
    if 'elmo' in self._token_embedders:
        self.use_elmo = True
        self.elmo_embedder = self._token_embedders['elmo']

    self.use_char = False
    if 'token_characters' in self._token_embedders:
        self.use_char = True
        self.char_embedder = self._token_embedders['token_characters']

    self.num_tasks = (len(self._token_embedders) - int(self.use_glove)
                      - int(self.use_elmo) - int(self.use_char))
    self.separate_embedder_keys = set(['tokens', 'elmo', 'token_characters'])

    self.linear_layers = {}
    for key, embedder in self._token_embedders.items():
        if key in self.separate_embedder_keys:
            continue
        in_dim = embedder.get_output_dim()
        out_dim = self.output_dim
        self.linear_layers[key] = nn.Linear(in_dim, out_dim, bias=False)
        if torch.cuda.is_available():
            self.linear_layers[key].cuda()

    separate_embedding_total_dim = 0
    for key in self.separate_embedder_keys:
        if key in self._token_embedders:
            embedder = self._token_embedders[key]
            separate_embedding_total_dim += embedder.get_output_dim()

    lstm = LSTM(bidirectional=True,
                num_layers=1,
                input_size=output_dim,
                hidden_size=output_dim,
                batch_first=True)
    self.lstm = lstm
    self.rnn_encoder = PytorchSeq2SeqWrapper(lstm)
    self.attention = MultiHeadSelfAttention(num_heads=4,
                                            input_dim=output_dim,
                                            attention_dim=output_dim,
                                            values_dim=output_dim,
                                            output_projection_dim=output_dim)