Example 1
 def __init__(self,
              support_dim: int,
              query_dim: int,
              candidates_dim: int,
              num_step: int = 1,
              reason_type: int = 0,
              reason_dropout_p: float = 0.2,
              dropout_p: float = 0.4
              ) -> None:
     """
     Parameters
     ----------
     
     reason_type: 0: random
                  1: only last
                  2: avg
     """
     super().__init__()
     
     assert num_step > 0
     assert 0 <= reason_type < 3
     
     self.num_step = num_step
     self.reason_type = reason_type
     self.dropout_p = dropout_p
     self.reason_dropout_p = reason_dropout_p
     
     self.supports_predictor = BilinearAttention(query_dim, support_dim, normalize=False)
     self.candidates_predictor = BilinearAttention(support_dim, candidates_dim, normalize=False)
     
     self.rnn = nn.GRUCell(support_dim, query_dim)
     self.alpha = Parameter(torch.zeros(1,1))
Example 2
def get_attention(st_ds_conf, attn_type, *dims):
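    # Builds an attention module by name; the optional *dims override the shared emb_sz for the bilinear variant.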
    emb_sz = st_ds_conf['emb_sz']  # dim for both the decoder output and the encoder output
    attn_type = attn_type.lower()
    if attn_type == "bilinear":
        if len(dims) < 2:
            dims = [emb_sz, emb_sz]
        attn = BilinearAttention(vector_dim=dims[0], matrix_dim=dims[1])
        attn = AllenNLPAttentionWrapper(attn, st_ds_conf['attention_dropout'])
    elif attn_type == "dot_product":
        if len(dims) >= 2:
            assert dims[0] == dims[1], \
                "encoder hidden states must be able to multiply with decoder output"
        attn = DotProductAttention()
        attn = AllenNLPAttentionWrapper(attn, st_ds_conf['attention_dropout'])
    elif attn_type == "multihead":
        attn = GeneralMultiHeadAttention(
            num_heads=st_ds_conf['num_heads'],
            input_dim=emb_sz,
            total_attention_dim=emb_sz,
            total_value_dim=emb_sz,
            attend_to_dim=emb_sz,
            output_dim=emb_sz,
            attention_dropout=st_ds_conf['attention_dropout'],
            use_future_blinding=False,
        )
        attn = SingleTokenMHAttentionWrapper(attn)
    elif attn_type == "none":
        attn = None
    else:
        raise ValueError(f"Unknown attention type: {attn_type}")

    return attn
Example 3
    def __init__(self,
                 word_embeddings: TextFieldEmbedder,
                 vocab: Vocabulary) -> None:
        super().__init__(vocab)
        self.word_embeddings = word_embeddings

        self.out = torch.nn.Linear(
            in_features=self.word_embeddings.get_output_dim() * 4,
            out_features=vocab.get_vocab_size('labels')
        )
        self.accuracy = CategoricalAccuracy()
        self.f_score_0 = F1Measure(positive_label=0)
        self.f_score_1 = F1Measure(positive_label=1)
        self.f_score_2 = F1Measure(positive_label=2)
        self.loss = CrossEntropyLoss()
        self.attention = BilinearAttention(word_embeddings.get_output_dim() * 3, word_embeddings.get_output_dim())
Example 4
def get_attention(st_ds_conf, attn_type):
    emb_sz = st_ds_conf['emb_sz']  # dim for both the decoder output and the encoder output
    attn_type = attn_type.lower()
    if attn_type == "bilinear":
        attn = BilinearAttention(vector_dim=emb_sz, matrix_dim=emb_sz)
        attn = AllenNLPAttentionWrapper(attn, st_ds_conf['attention_dropout'])
    elif attn_type == "dot_product":
        attn = DotProductAttention()
        attn = AllenNLPAttentionWrapper(attn, st_ds_conf['attention_dropout'])
    elif attn_type == "multihead":
        attn = GeneralMultiHeadAttention(
            num_heads=st_ds_conf['num_heads'],
            input_dim=emb_sz,
            total_attention_dim=emb_sz,
            total_value_dim=emb_sz,
            attend_to_dim=emb_sz,
            output_dim=emb_sz,
            attention_dropout=st_ds_conf['attention_dropout'],
            use_future_blinding=False,
        )
        attn = SingleTokenMHAttentionWrapper(attn)
    elif attn_type == "none":
        attn = None
    else:
        raise ValueError(f"Unknown attention type: {attn_type}")

    return attn
Example 5
    def __init__(self, input_size: int, hidden_size: int,
                 tag_embedding_size: int, num_layers: int, bidirectional: bool,
                 tag_vocab_size: int, output_dropout: float) -> None:
        super().__init__()

        self.hidden_size = hidden_size

        self.encoder = nn.LSTM(input_size=input_size,
                               hidden_size=hidden_size,
                               num_layers=num_layers,
                               bidirectional=bidirectional,
                               batch_first=True)

        bidir_mul = 2 if bidirectional else 1

        self.attention = BilinearAttention(vector_dim=hidden_size * bidir_mul,
                                           matrix_dim=hidden_size * bidir_mul)

        self.tag_embed = nn.Embedding(num_embeddings=tag_vocab_size,
                                      embedding_dim=tag_embedding_size)
        self.decoder = nn.LSTM(input_size=2 * bidir_mul * hidden_size +
                               tag_embedding_size,
                               hidden_size=bidir_mul * hidden_size,
                               num_layers=num_layers,
                               bidirectional=False,
                               batch_first=True)

        self.output2tag = torch.nn.Linear(2 * bidir_mul * hidden_size,
                                          tag_vocab_size)

        self.output_dropout = torch.nn.Dropout(p=output_dropout)
Example 6
 def test_forward_does_a_bilinear_product(self):
     params = Params({"vector_dim": 2, "matrix_dim": 2, "normalize": False})
     bilinear = BilinearAttention.from_params(params)
     bilinear._weight_matrix = Parameter(
         torch.FloatTensor([[-0.3, 0.5], [2.0, -1.0]]))
     bilinear._bias = Parameter(torch.FloatTensor([0.1]))
     a_vectors = torch.FloatTensor([[1, 1]])
     b_vectors = torch.FloatTensor([[[1, 0], [0, 1]]])
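     # Expected: [1, 1] @ W = [1.7, -0.5]; dot with rows [1, 0] and [0, 1] and add the 0.1 bias -> [1.8, -0.4].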
     result = bilinear(a_vectors, b_vectors).detach().numpy()
     assert result.shape == (1, 2)
     assert_almost_equal(result, [[1.8, -0.4]])
Example 7
    def __init__(self, word_embeddings: TextFieldEmbedder,
                 vocab: Vocabulary) -> None:
        super().__init__(vocab)
        self.word_embeddings = word_embeddings

        self.text_seq_encoder = PytorchSeq2VecWrapper(
            LSTM(word_embeddings.get_output_dim(),
                 int(word_embeddings.get_output_dim() / 2),
                 batch_first=True,
                 bidirectional=True))

        self.out = torch.nn.Linear(
            in_features=self.word_embeddings.get_output_dim() * 4,
            out_features=vocab.get_vocab_size('labels'))
        self.accuracy = CategoricalAccuracy()
        self.f_score_0 = F1Measure(positive_label=0)
        self.f_score_1 = F1Measure(positive_label=1)
        self.f_score_2 = F1Measure(positive_label=2)
        self.loss = CrossEntropyLoss()
        self.attention = BilinearAttention(
            word_embeddings.get_output_dim() * 3,
            word_embeddings.get_output_dim())
Example 8
    def __init__(self,
                 vocab: Vocabulary,
                 text_field_embedder: TextFieldEmbedder,
                 dropout: float = 0.0,
                 input_dropout: float = 0.0,
                 label_smoothing: float = 0.1,
                 initializer: InitializerApplicator = InitializerApplicator(),
                 regularizer: Optional[RegularizerApplicator] = None) -> None:
        super(SentimentClassifier, self).__init__(vocab, regularizer)

        self._text_field_embedder = text_field_embedder

        share_rnn = nn.LSTM(input_size=self._text_field_embedder.get_output_dim(),
                            hidden_size=150,
                            batch_first=True,
                            # dropout=dropout,
                            bidirectional=True)
        share_encoder = PytorchSeq2SeqWrapper(share_rnn)

        self._encoder = RNNEncoder(vocab, share_encoder, input_dropout, regularizer)
        self._seq_vec = CnnEncoder(self._encoder.get_output_dim(), 25)
        self._de_dim = len(TASKS_NAME)
        weight = torch.empty(self._de_dim, self._text_field_embedder.get_output_dim())
        torch.nn.init.orthogonal_(weight)
        self._domain_embeddings = Embedding(self._de_dim, self._text_field_embedder.get_output_dim(), weight=weight)
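        # The pooled sequence vector acts as the query; the bilinear layer attends over the domain embeddings.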
        self._de_attention = BilinearAttention(self._seq_vec.get_output_dim(),
                                               self._domain_embeddings.get_output_dim())
        self._de_feedforward = FeedForward(self._domain_embeddings.get_output_dim(), 1,
                                           self._seq_vec.get_output_dim(), Activation.by_name("elu")())

        self._num_classes = self.vocab.get_vocab_size("label")
        self._sentiment_discriminator = Discriminator(self._seq_vec.get_output_dim(), self._num_classes)
        self._s_domain_discriminator = Discriminator(self._seq_vec.get_output_dim(), len(TASKS_NAME))
        self._valid_discriminator = Discriminator(self._domain_embeddings.get_output_dim(), 2)
        self._dropout = InputVariationalDropout(dropout)
        self._input_dropout = Dropout(input_dropout)
        self._label_smoothing = label_smoothing

        self.metrics = {
            "s_domain_acc": CategoricalAccuracy(),
            "valid_acc": CategoricalAccuracy()
        }
        for task_name in TASKS_NAME:
            self.metrics["{}_stm_acc".format(task_name)] = CategoricalAccuracy()

        self._loss = torch.nn.CrossEntropyLoss()
        self._domain_loss = torch.nn.CrossEntropyLoss()
        # TODO torch.nn.BCELoss
        self._valid_loss = torch.nn.BCEWithLogitsLoss()

        initializer(self)
Example 9
 def test_forward_does_a_bilinear_product(self):
     params = Params({
             'vector_dim': 2,
             'matrix_dim': 2,
             'normalize': False,
             })
     bilinear = BilinearAttention.from_params(params)
     bilinear._weight_matrix = Parameter(torch.FloatTensor([[-0.3, 0.5], [2.0, -1.0]]))
     bilinear._bias = Parameter(torch.FloatTensor([0.1]))
     a_vectors = torch.FloatTensor([[1, 1]])
     b_vectors = torch.FloatTensor([[[1, 0], [0, 1]]])
     result = bilinear(a_vectors, b_vectors).detach().numpy()
     assert result.shape == (1, 2)
     assert_almost_equal(result, [[1.8, -0.4]])
Example 10
def build_parsing_recombination_seq2seq_copy_model(
        flags,
        data_reader,
        vocab: Vocabulary,
        source_namespace: str = 'source_tokens',
        target_namespace: str = 'target_tokens') -> Model:
    source_embedding = Embedding(
        vocab.get_vocab_size(namespace=source_namespace),
        embedding_dim=flags.source_embedding_dim)
    lstm = PytorchSeq2SeqWrapper(
        torch.nn.LSTM(flags.source_embedding_dim,
                      flags.encoder_hidden_dim,
                      batch_first=True,
                      bidirectional=flags.encoder_bidirectional))
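    # Unnormalized bilinear attention (normalize=False yields raw scores) for the copy-augmented seq2seq model below.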
    attention = BilinearAttention(flags.attention_hidden_dim,
                                  flags.attention_hidden_dim,
                                  normalize=False)
    source_embedder = BasicTextFieldEmbedder({'tokens': source_embedding})
    initializer = InitializerApplicator.from_params([
        (".*bias", Params({
            "type": "constant",
            "val": 0
        })), ('.*', Params({
            "type": "uniform",
            "a": -0.1,
            "b": 0.1
        }))
    ])
    metric = SequenceAccuracy()
    model = RecombinationSeq2SeqWithCopy(
        vocab,
        source_embedder,
        lstm,
        flags.max_decode_length,
        seq_metrics=metric,
        source_namespace=source_namespace,
        target_namespace=target_namespace,
        target_embedding_dim=flags.target_embedding_dim,
        attention=attention,
        beam_size=flags.beam_size,
        use_bleu=False,
        encoder_input_dropout=flags.encoder_input_dropout,
        encoder_output_dropout=flags.encoder_output_dropout,
        dropout=flags.dropout,
        feed_output_attention_to_decoder=True,
        keep_decoder_output_dim_same_as_encoder=True,
        initializer=initializer)
    return model
Example 11
    def __init__(self, embedding_dim):
        self.embedding_dim = embedding_dim
        encoder = MultilayerCnnEncoder(
            embedding_dim=self.embedding_dim,
            num_filters=self.embedding_dim * 2,
            layers=2,
            conv_layer_activation=Activation.by_name('tanh')(),
            ngram_filter_sizes=(3, ),
            output_dim=self.embedding_dim,
            pooling='avg')

        attention = BilinearAttention(vector_dim=self.embedding_dim * 2,
                                      matrix_dim=self.embedding_dim,
                                      normalize=True)

        super(BiLinearSelectionGenerator, self).__init__(
            encoder=encoder,
            attention=attention,
        )
Example 12
    def __init__(self, input_size: int, hidden_size: int, num_layers: int,
                 bidirectional: bool, vocab_size: int,
                 output_dropout: float) -> None:
        super().__init__()

        self.encoder = nn.LSTM(input_size=input_size,
                               hidden_size=hidden_size,
                               num_layers=num_layers,
                               bidirectional=bidirectional,
                               batch_first=True)

        bidir_mul = 2 if bidirectional else 1

        self.attention = BilinearAttention(vector_dim=hidden_size * bidir_mul,
                                           matrix_dim=hidden_size * bidir_mul)

        self.output2label = torch.nn.Linear(2 * bidir_mul * hidden_size,
                                            vocab_size)

        self.output_dropout = torch.nn.Dropout(p=output_dropout)
Example 13
    def __init__(self, vocab: Vocabulary,
                 text_field_embedder: TextFieldEmbedder,
                 seq2seq_encoder: Seq2SeqEncoder,
                 initializer: InitializerApplicator) -> None:
        super(ProLocalModel, self).__init__(vocab)

        self.text_field_embedder = text_field_embedder
        self.seq2seq_encoder = seq2seq_encoder

        self.attention_layer = BilinearAttention(
            2 * seq2seq_encoder.get_output_dim(),
            seq2seq_encoder.get_output_dim(),
            normalize=True)

        self.num_types = self.vocab.get_vocab_size("state_change_type_labels")
        self.aggregate_feedforward = Linear(seq2seq_encoder.get_output_dim(),
                                            self.num_types)

        #         self.span_metric = SpanBasedF1Measure(vocab,
        #                                               tag_namespace="state_change_tags")  # by default "O" is ignored in metric computation
        #         self.num_tags = self.vocab.get_vocab_size("state_change_tags")

        #         self.tag_projection_layer = TimeDistributed(Linear(self.seq2seq_encoder.get_output_dim() + 2
        #                                                            , self.num_tags))
        self._type_accuracy = CategoricalAccuracy()

        self.type_f1_metrics = {}
        self.type_labels_vocab = self.vocab.get_index_to_token_vocabulary(
            "state_change_type_labels")
        for type_label in self.type_labels_vocab.values():
            self.type_f1_metrics["type_" + type_label] = F1Measure(
                self.vocab.get_token_index(type_label,
                                           "state_change_type_labels"))

        self._loss = torch.nn.CrossEntropyLoss()

        initializer(self)
Example 14
encoder = PytorchSeq2SeqWrapper(torch.nn.LSTM(embedding_dim, hidden_dim, batch_first=True, bidirectional=True))
train_iterator = BucketIterator(batch_size=batch_size,
                                sorting_keys=[("source_tokens", "num_tokens")],
                                instances_per_epoch=INSTANCES_PER_EPOCH)

validation_iterator = BucketIterator(batch_size=batch_size,
                                     sorting_keys=[("source_tokens", "num_tokens")])

train_iterator.index_with(vocab)
validation_iterator.index_with(vocab)

model = SimpleSeq2Seq(vocab, source_embedder, encoder,
                      max_decoding_steps=max_decoding_steps,
                      target_embedding_dim=embedding_dim,
                      target_namespace='target_tokens',
                      attention=BilinearAttention(hidden_dim * 2, hidden_dim * 2),
                      beam_size=beam_size)


def train():
    model.cuda(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      iterator=train_iterator,
                      validation_iterator=validation_iterator,
                      train_dataset=train_dataset,
                      validation_dataset=validation_dataset,
                      num_epochs=num_epochs,
                      serialization_dir=serialization_dir,
Example 15
    def __init__(self,
                 vocab: Vocabulary,
                 text_field_embedder: TextFieldEmbedder,
                 lexical_feedforward: FeedForward,
                 contextual_encoder: Seq2SeqEncoder,
                 attention_feedforward: FeedForward,
                 matrix_attention: MatrixAttention,
                 memory_encoder: Seq2SeqEncoder,
                 output_feedforward: FeedForward,
                 output_logit: FeedForward,
                 answer_steps: int = 5,
                 dropout: float = 0.5,
                 initializer: InitializerApplicator = InitializerApplicator(),
                 regularizer: Optional[RegularizerApplicator] = None) -> None:
        super().__init__(vocab, regularizer)

        self._text_field_embedder = text_field_embedder
        self._lexical_feedforward = TimeDistributed(lexical_feedforward)
        self._contextual_encoder = contextual_encoder
        self._attention_feedforward = TimeDistributed(attention_feedforward)
        self._matrix_attention = matrix_attention
        self._memory_encoder = memory_encoder
        self._output_feedforward = output_feedforward
        self._output_logit = output_logit
        self._answer_steps = answer_steps
        self._answer_gru_cell = torch.nn.GRUCell(
            self._memory_encoder.get_output_dim(),
            self._memory_encoder.get_output_dim(),
        )
        self._answer_attention = TimeDistributed(
            torch.nn.Linear(self._memory_encoder.get_output_dim(), 1))
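        # Bilinear scoring over the memory-encoder outputs; both the query and the attended matrix use the memory encoder's output dim.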
        self._answer_bilinear = BilinearAttention(
            self._memory_encoder.get_output_dim(),
            self._memory_encoder.get_output_dim(),
        )

        check_dimensions_match(text_field_embedder.get_output_dim(),
                               lexical_feedforward.get_input_dim(),
                               "text field embedding dim",
                               "lexical feedforward input dim")
        check_dimensions_match(lexical_feedforward.get_output_dim(),
                               contextual_encoder.get_input_dim(),
                               "lexical feedforwrd input dim",
                               "contextual layer input dim")
        check_dimensions_match(contextual_encoder.get_output_dim(),
                               attention_feedforward.get_input_dim(),
                               "contextual layer output dim",
                               "attention feedforward input dim")
        check_dimensions_match(contextual_encoder.get_output_dim() * 2,
                               memory_encoder.get_input_dim(),
                               "contextual layer output dim",
                               "memory encoder input dim")
        check_dimensions_match(memory_encoder.get_output_dim() * 4,
                               output_feedforward.get_input_dim(),
                               "memory encoder output dim",
                               "output feedforward input")
        check_dimensions_match(output_feedforward.get_output_dim(),
                               output_logit.get_input_dim(),
                               "output feedforward output dim",
                               "output logit input")

        self._dropout = torch.nn.Dropout(dropout) if dropout else None

        self._accuracy = CategoricalAccuracy()
        self._loss = torch.nn.NLLLoss()

        initializer(self)
Example 16
    def __init__(self,
                 vocab: Vocabulary,
                 context_field_embedder: TextFieldEmbedder,
                 context_encoder: Seq2SeqEncoder,
                 target_encoder: Seq2SeqEncoder,
                 feedforward: Optional[FeedForward] = None,
                 context_attention_activation_function: str = 'tanh',
                 target_attention_activation_function: str = 'tanh',
                 target_field_embedder: Optional[TextFieldEmbedder] = None,
                 inter_target_encoding: Optional[InterTarget] = None,
                 target_position_weight: Optional[TargetPositionWeight] = None,
                 target_position_embedding: Optional[TextFieldEmbedder] = None,
                 initializer: InitializerApplicator = InitializerApplicator(),
                 regularizer: Optional[RegularizerApplicator] = None,
                 dropout: float = 0.0,
                 label_name: str = 'target-sentiment-labels',
                 loss_weights: Optional[List[float]] = None,
                 use_target_sequences: bool = False) -> None:
        super().__init__(vocab, regularizer)
        '''
        :param vocab: A Vocabulary, required in order to compute sizes 
                      for input/output projections.
        :param context_field_embedder: Used to embed the context/sentence and 
                                       target text if target_field_embedder is 
                                       None but the target_encoder is NOT None.
        :param context_encoder: Encoder that will create the representation 
                                for the sentence/context that the target 
                                appears in.
        :param target_encoder: Encoder that will create the representation of 
                               target text tokens.
        :param feedforward: An optional feed forward layer to apply after the 
                            encoder.
        :param context_attention_activation_function: The activation function
                                                      used in the attention 
                                                      applied over the context.
        :param target_attention_activation_function: The activation function
                                                     used in the attention 
                                                     applied over the target text.
        :param target_field_embedder: Used to embed the target text to give as 
                                      input to the target_encoder. Thus this 
                                      allows a separate embedding for context 
                                      and target text.
        :param inter_target_encoding: Whether to model the relationship between 
                                      targets/aspect.
        :param target_position_weight: Whether to weight the output of the 
                                       context encoding based on the position 
                                       of the tokens relative to the target 
                                       tokens. This weighting is applied before 
                                       any attention is applied.
        :param target_position_embedding: Whether or not to concatenate a position
                                          embedding onto the input embeddings 
                                          before they are passed to the 
                                          `context_encoder`.
        :param initializer: Used to initialize the model parameters.
        :param regularizer: If provided, will be used to calculate the 
                            regularization penalty during training.
        :param dropout: Dropout applied after each layer apart from the last 
                        layer. All dropout that is applied to time-based data 
                        will be `variational dropout 
                        <https://arxiv.org/abs/1512.05287>`_; all else will be 
                        standard dropout. Variational dropout is applied to the 
                        target vectors after they have been processed by the 
                        `inter_target_encoding` if this is set.
        :param label_name: Name of the label name space.
        :param loss_weights: The amount of weight to give the negative, neutral,
                             positive classes respectively. e.g. [0.2, 0.5, 0.3]
                             would weight the negative class by a factor of 
                             0.2, neutral by 0.5 and positive by 0.3. NOTE It 
                             assumes the sentiment labels are the following:
                             [negative, neutral, positive].
        :param use_target_sequences: Whether or not to use the target tokens 
                                     within the context as the target's 
                                     contextualized word representation (CWR). 
                                     This only makes sense if the word 
                                     representation, i.e. the field embedder, 
                                     is a contextualized embedder such as ELMo. 
                                     It also requires that the dataset reader 
                                     has its `target_sequences` argument set to 
                                     True. Another reason to use this even 
                                     without CWR is to obtain contextualised 
                                     POS/dependency tags.
        
        This is based on the `Interactive Attention Networks for Aspect-Level 
        Sentiment Classification 
        <https://www.ijcai.org/proceedings/2017/0568.pdf>`_. The model is also 
        known as `IAN`.

         .. _variational dropout:
           https://papers.nips.cc/paper/6241-a-theoretically-grounded-application-of-dropout-in-recurrent-neural-networks.pdf
        '''

        self.label_name = label_name
        self.context_field_embedder = context_field_embedder
        self.target_field_embedder = target_field_embedder
        self.num_classes = self.vocab.get_vocab_size(self.label_name)
        self.target_encoder = target_encoder
        self.context_encoder = context_encoder
        self.feedforward = feedforward
        self._use_target_sequences = use_target_sequences
        if self._use_target_sequences and self.target_field_embedder:
            raise ConfigurationError(
                '`use_target_sequences` cannot be True at'
                ' the same time as a value for '
                '`target_field_embedder` as the embeddings'
                ' come from the context and not a separate embedder')

        context_attention_activation_function = Activation.by_name(
            f'{context_attention_activation_function}')()
        target_attention_activation_function = Activation.by_name(
            f'{target_attention_activation_function}')()

        target_encoder_out = self.target_encoder.get_output_dim()
        context_encoder_out = self.context_encoder.get_output_dim()
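        # IAN-style interactive attention: the pooled target representation attends over the
        # context tokens, and the pooled context representation attends over the target tokens.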
        self.context_attention_layer = BilinearAttention(
            target_encoder_out,
            context_encoder_out,
            context_attention_activation_function,
            normalize=True)
        self.target_attention_layer = BilinearAttention(
            context_encoder_out,
            target_encoder_out,
            target_attention_activation_function,
            normalize=True)
        # To be used as the pooled input into the target attention layer as
        # the query vector.
        self._context_averager = BagOfEmbeddingsEncoder(context_encoder_out,
                                                        averaged=True)
        # To be used as the pooled input into the context attention layer as
        # the query vector.
        self._target_averager = BagOfEmbeddingsEncoder(target_encoder_out,
                                                       averaged=True)

        # Set the loss weights (have to sort them by order of label index in
        # the vocab)
        self.loss_weights = target_sentiment.util.loss_weight_order(
            self, loss_weights, self.label_name)

        # Inter target modelling
        self.inter_target_encoding = inter_target_encoding

        if feedforward is not None:
            output_dim = self.feedforward.get_output_dim()
        elif self.inter_target_encoding is not None:
            output_dim = self.inter_target_encoding.get_output_dim()
        else:
            output_dim = target_encoder_out + context_encoder_out
        self.label_projection = Linear(output_dim, self.num_classes)

        self.metrics = {"accuracy": CategoricalAccuracy()}
        self.f1_metrics = {}
        # F1 Scores
        label_index_name = self.vocab.get_index_to_token_vocabulary(
            self.label_name)
        for label_index, _label_name in label_index_name.items():
            _label_name = f'F1_{_label_name.capitalize()}'
            self.f1_metrics[_label_name] = F1Measure(label_index)
        # Dropout
        self._variational_dropout = InputVariationalDropout(dropout)
        self._naive_dropout = Dropout(dropout)

        # position embeddings
        self.target_position_embedding = target_position_embedding
        # Ensure that the dimensions of the text field embedder and text encoder
        # match
        if self.target_position_embedding:
            context_and_position_dim = (
                context_field_embedder.get_output_dim() +
                self.target_position_embedding.get_output_dim())
            check_dimensions_match(
                context_and_position_dim, context_encoder.get_input_dim(),
                "context field embedding dim and the position embeddings",
                "text encoder input dim")
        else:
            check_dimensions_match(context_field_embedder.get_output_dim(),
                                   context_encoder.get_input_dim(),
                                   "context field embedding dim",
                                   "text encoder input dim")
        # Ensure that the dimensions of the target or text field embedder and
        # the target encoder match
        target_field_embedder_dim = context_field_embedder.get_output_dim()
        target_field_error = "context field embedding dim"
        if self.target_field_embedder:
            target_field_embedder_dim = target_field_embedder.get_output_dim()
            target_field_error = "target field embedding dim"

        check_dimensions_match(target_field_embedder_dim,
                               target_encoder.get_input_dim(),
                               target_field_error, "target encoder input dim")

        if self.inter_target_encoding:
            check_dimensions_match(target_encoder_out + context_encoder_out,
                                   self.inter_target_encoding.get_input_dim(),
                                   'Output from target and context encoders',
                                   'Inter Target encoder input dim')

        self.target_position_weight = target_position_weight
        # TimeDistributed anything that is related to the targets.
        if self.feedforward is not None:
            self.feedforward = TimeDistributed(self.feedforward)
        self.label_projection = TimeDistributed(self.label_projection)
        self._time_naive_dropout = TimeDistributed(self._naive_dropout)

        initializer(self)
Example 17
    def common_init(
        self,
        encoder_output_dim: int,
        decoder: DecoderNet,
        decoder_type: str,
        decoder_num_layers: int,
        share_decoder_params: bool,
        start_token: str = "[CLS]",
        end_token: str = "[SEP]",
        index_name: str = "bert",
        beam_size: int = 4,
        min_dec_len: int = 4,
        max_dec_len: int = 30,
        coverage_factor: float = 0.0,
        device: Union[int, str, List[int]] = -1,
        metrics: Optional[List[Metric]] = None,
        valid_metric_keys: List[str] = None,
        seed: int = 42,
        initializer: InitializerApplicator = InitializerApplicator()):
        """几个不同模型通用的初始化过程"""
        seed_everything(seed)  # 初始化随机种子
        # ----------- metrics相关初始化 -------------
        # 定义metrics
        self._metrics = [TokenBasedBLEU(), TokenBasedROUGE()]
        if metrics is not None:
            self._metrics = metrics
        self._rewrite_em = RewriteEM()
        self._restore_score = RestorationScore(compute_restore_tokens=True)
        self._cov_loss_value = Average()
        self.valid_metric_keys = valid_metric_keys

        # ----------- parameter initialization -------------
        # define special tokens and other parameters
        self._start_token = start_token
        self._end_token = end_token
        self._index_name = index_name
        # Even with a BERT model, the vocabulary still has to be read in advance,
        # so the corresponding vocabulary namespace needs to be adjusted.
        # This is very important: a wrong namespace easily leads to assert_trigger_error.
        if "bert" in self._index_name:
            self._vocab_namespace = "tokens"
        else:
            self._vocab_namespace = self._index_name
        self.coverage_factor = coverage_factor
        self.decoder_num_layers = decoder_num_layers
        decoder_type = decoder_type.lower()
        # save a few important parameters
        self.params = Params(
            params={
                "beam_size": beam_size,
                "min_dec_len": min_dec_len,
                "max_dec_len": max_dec_len,
                "decoder_type": decoder_type
            })

        # ----------- device initialization -------------
        device = parse_cuda_device(device)
        check_for_gpu(device)  # check that the GPU setting does not exceed the available devices
        if isinstance(device, list):
            device = device[0]
        if device < 0:
            self._device = torch.device("cpu")
        else:
            self._device = torch.device(f"cuda:{device}")

        # ----------- decoder initialization -------------
        # define the decoder
        self.decoder = decoder
        self._share_decoder_params = share_decoder_params
        # If the decoder is an LSTM, decide whether the coverage mechanism is used.
        # Coverage is awkward to use with a Transformer, so its internally computed attention distribution is used directly.
        if self.params['decoder_type'] == 'lstm':
            # for the LSTM decoder
            if self.coverage_factor > 0.0:
                # Define the attention layer that, at each decoder step, attends over the encoder outputs,
                # and the layer that weights the attention distributions over the current and previous turns.
                self.attention = BilinearAttention(
                    vector_dim=encoder_output_dim,
                    matrix_dim=encoder_output_dim + 1,
                    activation=Activation.by_name('linear')())
                self.lamb_linear = torch.nn.Linear(encoder_output_dim * 3 + 2,
                                                   2)
            else:
                self.attention = BilinearAttention(
                    vector_dim=encoder_output_dim,
                    matrix_dim=encoder_output_dim,
                    activation=Activation.by_name('linear')())
                self.lamb_linear = torch.nn.Linear(encoder_output_dim * 3, 2)
        else:
            # for the Transformer decoder
            self.lamb_linear = torch.nn.Linear(encoder_output_dim * 3, 2)

        # ----------- vocabulary initialization -------------
        self._vocab_size = self.vocab.get_vocab_size(
            namespace=self._vocab_namespace)
        self._unk_id = self.vocab.get_token_index(
            self.vocab._oov_token, namespace=self._vocab_namespace)
        # ----------- initialize model parameters -------------
        self._initializer = initializer
        self._initializer(self.lamb_linear)
        self._initializer(self.decoder)
Example 18
embedding_dim2 = 16
sequence_length = 10

# Attention

# dot product attention only allows vector/matrix of the same size
vector = torch.rand((batch_size, embedding_dim1,))
matrix = torch.rand((batch_size, sequence_length, embedding_dim1))
attention = DotProductAttention()
output = attention(vector, matrix)
print('Output from DotProductAttention:', output.size(), output)

# bilinear & linear attention allows inputs of different sizes
vector = torch.rand((batch_size, embedding_dim1,))
matrix = torch.rand((batch_size, sequence_length, embedding_dim2))
attention = BilinearAttention(vector_dim=embedding_dim1, matrix_dim=embedding_dim2)
output = attention(vector, matrix)
print('Output from BilinearAttention:', output.size(), output)

tanh = Activation.by_name('tanh')()
attention = LinearAttention(
    tensor_1_dim=embedding_dim1, tensor_2_dim=embedding_dim2,
    combination='x,y', activation=tanh)
output = attention(vector, matrix)
print('Output from LinearAttention:', output)

# MatrixAttention
sequence_length1 = 10
sequence_length2 = 15

# dot product attention only allows matrices of the same size
Example 19
    def __init__(
            self,
            # Vocabulary.
            vocab: Vocabulary,

            # Embeddings.
            source_field_embedder: TextFieldEmbedder,
            target_embedding_size: int,

            # Encoders and Decoders.
            encoder: Seq2SeqEncoder,
            decoder_type: str,
            output_projection_layer: FeedForward,
            source_namespace: str = "source",
            target_namespace: str = "target",

            # Hyperparameters and flags.
            decoder_attention_function: BilinearAttention = None,
            decoder_is_bidirectional: bool = False,
            decoder_num_layers: int = 1,
            apply_attention: Optional[bool] = False,
            max_decoding_steps: int = 100,
            scheduled_sampling_ratio: float = 0.4,

            # Logistical.
            initializer: InitializerApplicator = InitializerApplicator(),
            regularizer: Optional[RegularizerApplicator] = None) -> None:
        super().__init__(vocab, regularizer)
        if encoder.get_input_dim() != source_field_embedder.get_output_dim():
            raise ConfigurationError(
                "The input dimension of the encoder must match the embedding "
                "size of the source_field_embedder. Found {} and {}, respectively."
                .format(encoder.get_input_dim(),
                        source_field_embedder.get_output_dim()))
        if output_projection_layer.get_output_dim() != vocab.get_vocab_size(
                target_namespace):
            raise ConfigurationError(
                "The output dimension of the output_projection_layer must match the "
                "size of the target vocabulary. Found {} and {}, "
                "respectively.".format(
                    output_projection_layer.get_output_dim(),
                    vocab.get_vocab_size(target_namespace)))
        if decoder_type not in SequenceToSequence.DECODERS:
            raise ConfigurationError(
                "Unrecognized decoder option '{}'".format(decoder_type))

        # For dealing with input.
        self.source_vocab_size = vocab.get_vocab_size(source_namespace)
        self.target_vocab_size = vocab.get_vocab_size(target_namespace)
        self.source_field_embedder = source_field_embedder or TextFieldEmbedder(
        )
        self.encoder = encoder

        # For dealing with / producing output.
        self.target_vocab_size = vocab.get_vocab_size(target_namespace)
        self.target_embedder = Embedding(self.target_vocab_size,
                                         target_embedding_size)

        # Input size will either be the target embedding size or the target embedding size plus the
        # encoder hidden size to attend on the input.
        #
        # When making a custom attention function that uses neither of those input sizes, you will
        # have to define the decoder yourself.
        decoder_input_size = target_embedding_size
        if apply_attention:
            decoder_input_size += encoder.get_output_dim()

        # Hidden size of the encoder and decoder should match.
        decoder_hidden_size = encoder.get_output_dim()
        self.decoder = SequenceToSequence.DECODERS[decoder_type](
            decoder_input_size,
            decoder_hidden_size,
            num_layers=decoder_num_layers,
            batch_first=True,
            bias=True,
            bidirectional=decoder_is_bidirectional)
        self.output_projection_layer = output_projection_layer
        self.apply_attention = apply_attention
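        # Fall back to bilinear attention over the encoder outputs when no attention function is supplied.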
        self.decoder_attention_function = decoder_attention_function or BilinearAttention(
            matrix_dim=encoder.get_output_dim(),
            vector_dim=encoder.get_output_dim())

        # Hyperparameters.
        self._max_decoding_steps = max_decoding_steps
        self._scheduled_sampling_ratio = scheduled_sampling_ratio

        # Used for prepping the translation primer (initialization of the target word-level
        # encoder's hidden state).
        #
        # If the decoder is an LSTM, both hidden states and cell states must be initialized.
        # Also, hidden states that prime translation via this encoder must be duplicated
        # across the number of layers the decoder has.
        self._decoder_is_lstm = isinstance(self.decoder, torch.nn.LSTM)
        self._decoder_num_layers = decoder_num_layers

        self._start_index = vocab.get_token_index(START_SYMBOL,
                                                  target_namespace)
        self._end_index = vocab.get_token_index(END_SYMBOL, target_namespace)
        self._source_namespace = source_namespace
        self._target_namespace = target_namespace
        self._batch_size = None

        initializer(self)
Example 20
    def __init__(self, options: Options,
                 statistics: GraphEmbeddingStatisticsBase):
        super().__init__()

        self.use_char_embedding = options.use_char_embedding
        self.use_property_embeddings = options.use_property_embeddings
        self.use_highway = options.use_highway
        self.compress_node_embedding = options.compress_node_embedding
        self.num_rnn_layers = options.num_rnn_layers

        self.dropout = nn.Dropout(p=options.dropout_rate)

        self.word_embedding = Embedding(len(statistics.words),
                                        statistics.get_embedding_dim_of("words"),
                                        padding_idx=0
                                        )
        self.conn_label_embedding = Embedding(len(statistics.conn_labels),
                                              statistics.get_embedding_dim_of("conn_labels"),
                                              padding_idx=0
                                              )
        self.embeddings = {}

        self.properties = statistics.get_properties()
        for vocab_name in self.properties:
            vocab = getattr(statistics, vocab_name)
            embedding = Embedding(len(vocab),
                                  statistics.get_embedding_dim_of(vocab_name),
                                  padding_idx=0
                                  )
            embedding_name = '{}_embedding'.format(vocab_name)
            self.embeddings[vocab_name] = embedding
            self.add_module(embedding_name, embedding)

        if not options.word_vector_trainable:
            # Do not train embeddings
            self.word_embedding.weight.requires_grad_(False)

        node_embedding_dim = self.word_embedding.embedding_dim

        if self.use_char_embedding:
            char_dim = self.char_embedding.embedding_dim
            node_embedding_dim += options.char_lstm_hidden_size
            # TODO: change data format to avoid batch_first=True
            self.char_lstm = nn.LSTM(char_dim,
                                     options.char_lstm_hidden_size,
                                     options.num_char_lstm_layers,
                                     batch_first=True)

        if self.use_property_embeddings:
            for prop_name in self.properties:
                node_embedding_dim += self.embeddings[prop_name].embedding_dim

        if self.compress_node_embedding:
            self.compress_linear = nn.Linear(node_embedding_dim,
                                             options.compressed_embedding_dim)
            node_embedding_dim = options.compressed_embedding_dim

        if self.use_highway:
            self.multi_highway = Highway(node_embedding_dim,
                                         options.num_highway_layers,
                                         f=torch.tanh)

        conn_label_dim = self.conn_label_embedding.embedding_dim
        hidden_size = options.model_hidden_size
        self.hidden_size = hidden_size
        self.node_embedding_dim = node_embedding_dim
        self.neighbor_linear = nn.Linear(node_embedding_dim + conn_label_dim,
                                         hidden_size)

        self.use_out = use_out = statistics.use_out
        self.input_gate = GraphRNNGate(hidden_size, torch.sigmoid, use_out)
        self.output_gate = GraphRNNGate(hidden_size, torch.sigmoid, use_out)
        self.forget_gate = GraphRNNGate(hidden_size, torch.sigmoid, use_out)
        self.cell = GraphRNNGate(hidden_size, torch.tanh, use_out)

        if options.use_attention:
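            # The node embedding is the query; the attended matrix rows combine neighbour embeddings with connection-label embeddings (cf. neighbor_linear above).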
            self.embedding_attention = BilinearAttention(
                self.node_embedding_dim,
                self.node_embedding_dim + self.conn_label_embedding.embedding_dim,
                activation=torch.nn.functional.tanh
            )

            self.hidden_attention = BilinearAttention(
                self.hidden_size,
                self.hidden_size,
                activation=torch.nn.functional.tanh
            )
        else:
            # use sum instead of attention
            self.embedding_attention = self.hidden_attention = None
Example 21
    def __init__(self,
                 vocab: Vocabulary,
                 model_name: str = None,
                 start_attention: Attention = None,
                 end_attention: Attention = None,
                 text_field_embedder: TextFieldEmbedder = None,
                 task_pretrained_file: str = None,
                 neg_sample_ratio: float = 0.0,
                 max_turn_len: int = 3,
                 start_token: str = "[CLS]",
                 end_token: str = "[SEP]",
                 index_name: str = "bert",
                 eps: float = 1e-8,
                 seed: int = 42,
                 loss_factor: float = 1.0,
                 initializer: InitializerApplicator = InitializerApplicator(),
                 regularizer: RegularizerApplicator = None):
        super().__init__(vocab, regularizer)
        if model_name is None and text_field_embedder is None:
            raise ValueError(
                "`model_name` and `text_field_embedder` cannot both be None."
            )
        # For the pure resolution task, only the last layer's embedding representations need to be returned.
        self._text_field_embedder = text_field_embedder or PretrainedChineseBertMismatchedEmbedder(
            model_name,
            return_all=False,
            output_hidden_states=False,
            max_turn_length=max_turn_len)

        seed_everything(seed)
        self._neg_sample_ratio = neg_sample_ratio
        self._start_token = start_token
        self._end_token = end_token
        self._index_name = index_name
        self._initializer = initializer

        linear_input_size = self._text_field_embedder.get_output_dim()
        # attention-based approach
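        # Bilinear attention scores span start/end positions over the encoder outputs.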
        self.start_attention = start_attention or BilinearAttention(
            vector_dim=linear_input_size, matrix_dim=linear_input_size)
        self.end_attention = end_attention or BilinearAttention(
            vector_dim=linear_input_size, matrix_dim=linear_input_size)
        # Mask metrics: mainly the F-score; we care most about the recall of label `1`.
        self._span_start_accuracy = CategoricalAccuracy()
        self._span_end_accuracy = CategoricalAccuracy()
        self._span_accuracy = BooleanAccuracy()
        self._rewrite_em = RewriteEM(valid_keys="semr,nr_semr,re_semr")
        self._restore_score = RestorationScore(compute_restore_tokens=True)
        self._metrics = [
            TokenBasedBLEU(mode="1,2"),
            TokenBasedROUGE(mode="1r,2r")
        ]
        self._eps = eps
        self._loss_factor = loss_factor

        self._initializer(self.start_attention)
        self._initializer(self.end_attention)

        # load weights pretrained on other tasks
        if task_pretrained_file is not None and os.path.isfile(
                task_pretrained_file):
            logger.info("loading related task pretrained weights...")
            self.load_state_dict(torch.load(task_pretrained_file),
                                 strict=False)
Example 22
    def __init__(
            self,
            # Vocabulary.
            vocab: Vocabulary,
            cuda_device,

            # Embeddings.
            source_text_field_embedder: TextFieldEmbedder,
            target_embedding_size: int,
            hidden_size: int,
            decoder_type: str = "gru",
            source_namespace: str = "tokens",
            target_namespace: str = "target",

            # Hyperparameters and flags.
            drop_out_rate: float = 0.0,
            decoder_attention_function: BilinearAttention = None,
            decoder_is_bidirectional: bool = False,
            decoder_num_layers: int = 1,
            apply_attention: bool = False,
            max_decoding_steps: int = 100,
            # scheduled_sampling_ratio: float = 0.0,
            attention_file: str = "attention_data.jsonl",
            regularizer: Optional[RegularizerApplicator] = None) -> None:
        super().__init__(vocab, regularizer)
        assert decoder_type in SequenceToSequence.DECODERS

        self.source_vocab_size = vocab.get_vocab_size(source_namespace)
        self.target_vocab_size = vocab.get_vocab_size(target_namespace)
        self.source_field_embedder = source_text_field_embedder
        self.encoder = torch.nn.LSTM(
            self.source_field_embedder.get_output_dim(),
            hidden_size,
            num_layers=1,
            bidirectional=False,
            batch_first=True)
        self.metrics = {"BELU": BELU()}

        self.target_vocab_size = vocab.get_vocab_size(target_namespace)
        self.target_embedder = Embedding(self.target_vocab_size,
                                         target_embedding_size)

        # Note: both branches added hidden_size, so the decoder input always
        # concatenates the target embedding with the encoder context.
        decoder_input_size = target_embedding_size + hidden_size

        # self.analyze_this_target = START_SYMBOL + " S T A I R C A S E . . . " + END_SYMBOL
        self.attention_file = attention_file

        self.dropout = torch.nn.Dropout(p=drop_out_rate)
        # Hidden size of the encoder and decoder should match.
        decoder_hidden_size = hidden_size
        self.decoder = SequenceToSequence.DECODERS[decoder_type](
            decoder_input_size,
            decoder_hidden_size,
            num_layers=decoder_num_layers,
            batch_first=True,
            bias=True,
            bidirectional=decoder_is_bidirectional)
        self.output_projection_layer = torch.nn.Linear(
            hidden_size, len(vocab._token_to_index['target']))
        self.apply_attention = apply_attention
        self.decoder_attention_function = decoder_attention_function or BilinearAttention(
            matrix_dim=hidden_size, vector_dim=hidden_size)

        # Hyperparameters.
        self._max_decoding_steps = max_decoding_steps
        # self._scheduled_sampling_ratio = scheduled_sampling_ratio

        self._decoder_is_lstm = isinstance(self.decoder, torch.nn.LSTM)
        self._decoder_is_gru = isinstance(self.decoder, torch.nn.GRU)
        self._decoder_num_layers = decoder_num_layers

        self._start_index = vocab.get_token_index(START_SYMBOL,
                                                  target_namespace)
        self._end_index = vocab.get_token_index(END_SYMBOL, target_namespace)
        self._source_namespace = source_namespace
        self._target_namespace = target_namespace
        self.count = 0
        self.first_dump = True
        if isinstance(cuda_device, int):
            cuda_device = [cuda_device]
        if cuda_device[0] == -1:
            self.device = torch.device("cpu")
        else:
            cuda = "cuda:" + str(cuda_device[0])
            self.device = torch.device(
                cuda if torch.cuda.is_available() else "cpu")
Example 23
    def __init__(self,
                 vocab: Vocabulary,
                 text_field_embedder: TextFieldEmbedder,
                 text_encoder: Seq2SeqEncoder,
                 target_encoder: Seq2VecEncoder,
                 feedforward: Optional[FeedForward] = None,
                 target_field_embedder: Optional[TextFieldEmbedder] = None,
                 attention_activation_function: Optional[str] = 'tanh',
                 initializer: InitializerApplicator = InitializerApplicator(),
                 regularizer: Optional[RegularizerApplicator] = None,
                 word_dropout: float = 0.0,
                 dropout: float = 0.0) -> None:
        '''
        :param vocab: A Vocabulary, required in order to compute sizes 
                      for input/output projections.
        :param text_field_embedder: Used to embed the text and target text if
                                    target_field_embedder is None but the 
                                    target_encoder is not None.
        :param text_encoder: Sequence Encoder that will create the 
                             representation of each token in the context 
                             sentence.
        :param target_encoder: Encoder that will create the representation of 
                               target text tokens.
        :param feedforward: An optional feed forward layer applied after the 
                            text encoder if the target encoder is None, or 
                            after the target and text encoded representations 
                            have been concatenated otherwise.
        :param target_field_embedder: Used to embed the target text to give as 
                                      input to the target_encoder. Thus this 
                                      allows a separate embedding for text and 
                                      target text.
        :param attention_activation_function: The name of the activation 
                                              function applied after the 
                                              ``h^T W t + b`` calculation.
                                              Activation names can be found 
                                              `here <https://allenai.github.io/
                                              allennlp-docs/api/allennlp.nn.
                                              activations.html>`_. Default is 
                                              tanh.
        :param initializer: Used to initialize the model parameters.
        :param regularizer: If provided, will be used to calculate the 
                            regularization penalty during training.
        :param word_dropout: Dropout that is applied after the embedding of the 
                             tokens/words. It will drop entire words with this 
                             probability.
        :param dropout: Dropout applied after each layer apart from the last 
                        layer. All dropout that is applied to time-based data 
                        will be `variational dropout`_; all else will be 
                        standard dropout.
        
        This attention target classifier is based on the model in `Exploiting 
        Document Knowledge for Aspect-level Sentiment Classification 
        <https://aclanthology.info/papers/P18-2092/p18-2092>`_ by Ruidan He et 
        al., where the attention over the encoded context words is based on 
        the encoded target vector.

        .. _variational dropout:
           https://papers.nips.cc/paper/6241-a-theoretically-grounded-application-of-dropout-in-recurrent-neural-networks.pdf
        '''
        super().__init__(vocab, regularizer)

        self.text_field_embedder = text_field_embedder
        self.target_field_embedder = target_field_embedder
        self.num_classes = self.vocab.get_vocab_size("labels")
        self.text_encoder = text_encoder
        self.target_encoder = target_encoder
        self.feedforward = feedforward
        attention_activation_function = Activation.by_name(
            attention_activation_function)()
        self.attention_layer = BilinearAttention(
            self.target_encoder.get_output_dim(),
            self.text_encoder.get_output_dim(),
            attention_activation_function,
            normalize=True)

        if feedforward is not None:
            output_dim = self.feedforward.get_output_dim()
        else:
            output_dim = self.text_encoder.get_output_dim()
        self.label_projection = Linear(output_dim, self.num_classes)
        self.metrics = {"accuracy": CategoricalAccuracy()}
        self.f1_metrics = {}
        # F1 Scores
        label_index_name = self.vocab.get_index_to_token_vocabulary('labels')
        for label_index, label_name in label_index_name.items():
            label_name = f'F1_{label_name.capitalize()}'
            self.f1_metrics[label_name] = F1Measure(label_index)

        self._word_dropout = WordDropout(word_dropout)
        self._variational_dropout = InputVariationalDropout(dropout)
        self._naive_dropout = Dropout(dropout)

        self.loss = torch.nn.CrossEntropyLoss()

        # Ensure that the dimensions of the text field embedder and text encoder
        # match
        check_dimensions_match(text_field_embedder.get_output_dim(),
                               text_encoder.get_input_dim(),
                               "text field embedding dim",
                               "text encoder input dim")
        # Ensure that the dimensions of the target or text field embedder and
        # the target encoder match
        target_field_embedder_dim = text_field_embedder.get_output_dim()
        target_field_error = "text field embedding dim"
        if self.target_field_embedder:
            target_field_embedder_dim = target_field_embedder.get_output_dim()
            target_field_error = "target field embedding dim"

        check_dimensions_match(target_field_embedder_dim,
                               target_encoder.get_input_dim(),
                               target_field_error, "target encoder input dim")
        initializer(self)
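
A minimal, self-contained sketch (not part of the original example) of how the bilinear attention configured above could be used at prediction time: the encoded target vector scores each encoded context token via ``h^T W t + b``, the scores are normalised into attention weights, and the weighted context vector is what would feed the label projection. All tensor shapes and variable names below are illustrative assumptions rather than the classifier's actual forward pass.

import torch
from allennlp.modules.attention import BilinearAttention
from allennlp.nn import Activation
from allennlp.nn.util import weighted_sum

batch_size, text_length, text_dim, target_dim = 2, 7, 10, 6
encoded_text = torch.randn(batch_size, text_length, text_dim)          # output of text_encoder
encoded_target = torch.randn(batch_size, target_dim)                   # output of target_encoder
text_mask = torch.ones(batch_size, text_length, dtype=torch.bool)      # True for real tokens

attention = BilinearAttention(target_dim, text_dim,
                              Activation.by_name('tanh')(), normalize=True)
attention_weights = attention(encoded_target, encoded_text, text_mask)  # (batch, text_length)
weighted_text = weighted_sum(encoded_text, attention_weights)           # (batch, text_dim)
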
Ejemplo n.º 24
0
    def __init__(
        self,
        vocab: Vocabulary,
        input_dim: int,
        decoder_hidden_size: int,
        max_decoding_steps: int,
        output_proj_input_dim: int,
        target_namespace: str = "targets",
        target_embedding_dim: int = None,
        attention: str = "none",
        dropout: float = 0.0,
        scheduled_sampling_ratio: float = 0.0,
    ) -> None:
        super(Seq2SeqDecoder, self).__init__(vocab)

        self._max_decoding_steps = max_decoding_steps
        self._target_namespace = target_namespace

        # We need the start symbol to provide as the input at the first timestep of decoding, and
        # end symbol as a way to indicate the end of the decoded sequence.
        self._start_index = self.vocab.get_token_index(START_SYMBOL,
                                                       self._target_namespace)
        self._end_index = self.vocab.get_token_index(END_SYMBOL,
                                                     self._target_namespace)
        self._unk_index = self.vocab.get_token_index("@@UNKNOWN@@",
                                                     self._target_namespace)
        num_classes = self.vocab.get_vocab_size(self._target_namespace)

        # Decoder output dim needs to be the same as the encoder output dim since we initialize the
        # hidden state of the decoder with that of the final hidden states of the encoder. Also, if
        # we're using attention with ``DotProductSimilarity``, this is needed.
        self._encoder_output_dim = input_dim
        self._decoder_hidden_dim = decoder_hidden_size
        if self._encoder_output_dim != self._decoder_hidden_dim:
            self._projection_encoder_out = Linear(self._encoder_output_dim,
                                                  self._decoder_hidden_dim)
        else:
            self._projection_encoder_out = lambda x: x
        self._decoder_output_dim = self._decoder_hidden_dim
        self._output_proj_input_dim = output_proj_input_dim
        self._target_embedding_dim = target_embedding_dim
        self._target_embedder = Embedding(num_classes,
                                          self._target_embedding_dim)

        # Used to get an initial hidden state from the encoder states
        self._sent_pooler = Pooler(project=True,
                                   d_inp=input_dim,
                                   d_proj=decoder_hidden_size)

        if attention == "Bahdanau":
            self._decoder_attention = BahdanauAttention(
                decoder_hidden_size + target_embedding_dim, input_dim)
            # The output of attention, a weighted average over encoder outputs, will be
            # concatenated to the input vector of the decoder at each time
            # step.
            self._decoder_input_dim = input_dim + target_embedding_dim
        elif attention == "bilinear":
            self._decoder_attention = BilinearAttention(
                decoder_hidden_size + target_embedding_dim, input_dim)
            # The output of attention, a weighted average over encoder outputs, will be
            # concatenated to the input vector of the decoder at each time
            # step.
            self._decoder_input_dim = input_dim + target_embedding_dim
        elif attention == "none":
            self._decoder_attention = None
            self._decoder_input_dim = target_embedding_dim
        else:
            raise Exception(f"attention type not implemented: {attention}")

        self._decoder_cell = LSTMCell(self._decoder_input_dim,
                                      self._decoder_hidden_dim)
        # Allow for a bottleneck layer between encoder outputs and distribution over vocab
        # The bottleneck layer consists of a linear transform and helps to reduce
        # number of parameters
        if self._output_proj_input_dim != self._decoder_output_dim:
            self._projection_bottleneck = Linear(self._decoder_output_dim,
                                                 self._output_proj_input_dim)
        else:
            self._projection_bottleneck = lambda x: x
        self._output_projection_layer = Linear(self._output_proj_input_dim,
                                               num_classes)
        self._dropout = torch.nn.Dropout(p=dropout)
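
As a hedged illustration only (not code from the original class), one greedy decoding step with the "bilinear" attention option above might look roughly like the function below, written as if it were a method of Seq2SeqDecoder; variable names are assumptions, and the real decoder additionally handles scheduled sampling and sequence masking.

import torch

def _decode_step(self, last_predictions, decoder_hidden, decoder_context,
                 encoder_outputs, encoder_mask):
    # Sketch of a single decoding step; assumed to run as a Seq2SeqDecoder method.
    embedded_input = self._target_embedder(last_predictions)        # (batch, target_embedding_dim)
    if self._decoder_attention is not None:
        # Query the attention with [hidden state; embedded input], matching the
        # vector_dim the attention module was built with in __init__.
        query = torch.cat([decoder_hidden, embedded_input], dim=-1)
        weights = self._decoder_attention(query, encoder_outputs, encoder_mask)
        # Weighted average over encoder outputs, concatenated to the decoder input.
        context = (encoder_outputs * weights.unsqueeze(-1)).sum(dim=1)
        decoder_input = torch.cat([context, embedded_input], dim=-1)
    else:
        decoder_input = embedded_input
    decoder_hidden, decoder_context = self._decoder_cell(
        decoder_input, (decoder_hidden, decoder_context))
    logits = self._output_projection_layer(
        self._projection_bottleneck(self._dropout(decoder_hidden)))
    return logits, decoder_hidden, decoder_context
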
class Baseline(Model):
    def __init__(self, word_embeddings: TextFieldEmbedder,
                 vocab: Vocabulary) -> None:
        super().__init__(vocab)
        self.word_embeddings = word_embeddings

        self.text_seq_encoder = PytorchSeq2VecWrapper(
            LSTM(word_embeddings.get_output_dim(),
                 int(word_embeddings.get_output_dim() / 2),
                 batch_first=True,
                 bidirectional=True))

        self.out = torch.nn.Linear(
            in_features=self.word_embeddings.get_output_dim() * 4,
            out_features=vocab.get_vocab_size('labels'))
        self.accuracy = CategoricalAccuracy()
        self.f_score_0 = F1Measure(positive_label=0)
        self.f_score_1 = F1Measure(positive_label=1)
        self.f_score_2 = F1Measure(positive_label=2)
        self.loss = CrossEntropyLoss()
        self.attention = BilinearAttention(
            word_embeddings.get_output_dim() * 3,
            word_embeddings.get_output_dim())

    def forward(self,
                article: Dict[str, torch.Tensor],
                outcome: Dict[str, torch.Tensor],
                intervention: Dict[str, torch.Tensor],
                comparator: Dict[str, torch.Tensor],
                labels: torch.Tensor = None,
                evidence: torch.Tensor = None) -> Dict[str, torch.Tensor]:

        p_mask = get_text_field_mask(article, 1)
        p_size = p_mask.size()
        a_mask = (torch.sum(p_mask, dim=2) > 0)
        unf_p_mask = p_mask.reshape(p_size[0] * p_size[1], p_size[2])

        a_embeddings = self.word_embeddings(article)
        unf_a_embeddings = a_embeddings.reshape(p_size[0] * p_size[1],
                                                p_size[2], -1)
        unf_a_vec = self.text_seq_encoder(unf_a_embeddings, unf_p_mask)

        a_vec = unf_a_vec.reshape(p_size[0], p_size[1], -1)

        o_mask = get_text_field_mask(outcome)
        o_embeddings = self.word_embeddings(outcome)
        o_vec = self.text_seq_encoder(o_embeddings, o_mask)

        i_mask = get_text_field_mask(intervention)
        i_embeddings = self.word_embeddings(intervention)
        i_vec = self.text_seq_encoder(i_embeddings, i_mask)

        c_mask = get_text_field_mask(comparator)
        c_embeddings = self.word_embeddings(comparator)
        c_vec = self.text_seq_encoder(c_embeddings, c_mask)

        prompt_vec = torch.cat((o_vec, i_vec, c_vec), dim=1)
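        # prompt_vec concatenates the outcome, intervention and comparator encodings,
        # so its size is 3 * embedding_dim, matching the vector_dim given to the
        # BilinearAttention in __init__; the attention then scores each sentence
        # vector in a_vec against it.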
        a_attentions = self.attention(prompt_vec, a_vec, a_mask)

        attended_a_vec = torch.sum(a_vec * a_attentions.unsqueeze(2), dim=1)
        logits = self.out(
            torch.cat((attended_a_vec, o_vec, i_vec, c_vec), dim=1))
        output = {'logits': logits, 'attentions': a_attentions}

        if (labels is not None) and (evidence is not None):

            evidence_one_hot = get_one_hot(evidence, p_mask.size(1))
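            # Supervise the sentence-level attention with an element-wise binary
            # cross-entropy against the evidence mask; instances without any marked
            # evidence sentence are skipped via skip_no_evidence_mask.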
            skip_no_evidence_mask = (torch.sum(evidence_one_hot, dim=1) >
                                     0).unsqueeze(1).float()
            att_loss = -1 * torch.mean(
                ((evidence_one_hot *
                  torch.log(torch.clamp(a_attentions, min=1e-9, max=1))) +
                 ((1 - evidence_one_hot) *
                  torch.log(torch.clamp(1 - a_attentions, min=1e-9, max=1)))) *
                a_mask.float() * skip_no_evidence_mask)

            classification_loss = self.loss(logits, labels)

            self.accuracy(logits, labels)
            self.f_score_0(logits, labels)
            self.f_score_1(logits, labels)
            self.f_score_2(logits, labels)

            output['loss'] = classification_loss + (5 * att_loss)

        return output

    def get_metrics(self, reset: bool = False) -> Dict[str, float]:
        _, _, f_score0 = self.f_score_0.get_metric(reset)
        _, _, f_score1 = self.f_score_1.get_metric(reset)
        _, _, f_score2 = self.f_score_2.get_metric(reset)
        return {
            'accuracy': self.accuracy.get_metric(reset),
            'f-score': np.mean([f_score0, f_score1, f_score2])
        }
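
The ``get_one_hot`` helper used in ``forward`` is defined elsewhere in the original code. One plausible implementation, consistent with how it is used above (mapping per-instance evidence sentence indices to a multi-hot mask of shape (batch, num_sentences)), is sketched below; the padding convention for ``evidence`` is an assumption and the original helper may differ.

import torch

def get_one_hot(evidence: torch.Tensor, num_sentences: int) -> torch.Tensor:
    # Place a 1 at every evidence sentence index; negative indices are assumed
    # to be padding and are skipped.
    if evidence.dim() == 1:
        evidence = evidence.unsqueeze(1)
    one_hot = evidence.new_zeros(evidence.size(0), num_sentences, dtype=torch.float)
    for row, indices in enumerate(evidence):
        valid = indices[indices >= 0]
        one_hot[row, valid] = 1.0
    return one_hot
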