Example #1
    def _get_loss(logits: torch.LongTensor,
                  targets: torch.LongTensor,
                  target_mask: torch.LongTensor) -> torch.LongTensor:
        """
        Takes logits (unnormalized outputs from the decoder) of size (batch_size,
        num_decoding_steps, num_classes), target indices of size (batch_size, num_decoding_steps+1)
        and corresponding masks of size (batch_size, num_decoding_steps+1), and computes cross
        entropy loss while taking the mask into account.

        The length of ``targets`` is expected to be greater than that of ``logits`` because the
        decoder does not need to compute the output corresponding to the last timestep of
        ``targets``. This method aligns the inputs appropriately to compute the loss.

        During training, we want the logit corresponding to timestep i to be similar to the target
        token from timestep i + 1. That is, the targets should be shifted by one timestep for
        appropriate comparison.  Consider a single example where the target has 3 words, and
        padding is to 7 tokens.
           The complete sequence would correspond to <S> w1  w2  w3  <E> <P> <P>
           and the mask would be                     1   1   1   1   1   0   0
           and let the logits be                     l1  l2  l3  l4  l5  l6
        We actually need to compare:
           the sequence           w1  w2  w3  <E> <P> <P>
           with masks             1   1   1   1   0   0
           against                l1  l2  l3  l4  l5  l6
           (where the input was)  <S> w1  w2  w3  <E> <P>
        """
        relevant_targets = targets[:, 1:].contiguous()  # (batch_size, num_decoding_steps)
        relevant_mask = target_mask[:, 1:].contiguous()  # (batch_size, num_decoding_steps)
        loss = sequence_cross_entropy_with_logits(logits, relevant_targets, relevant_mask)
        return loss
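As a quick sanity check of the alignment described in the docstring, the following minimal sketch (tensor names and sizes are illustrative, and it assumes AllenNLP's `allennlp.nn.util.sequence_cross_entropy_with_logits` is importable) reproduces the slicing so that logit i is scored against target i + 1:

    import torch
    from allennlp.nn.util import sequence_cross_entropy_with_logits

    batch_size, num_decoding_steps, num_classes = 2, 6, 5
    # Targets and mask carry one extra timestep for the leading <S> token.
    targets = torch.randint(0, num_classes, (batch_size, num_decoding_steps + 1))
    target_mask = torch.ones(batch_size, num_decoding_steps + 1, dtype=torch.long)
    logits = torch.randn(batch_size, num_decoding_steps, num_classes)

    # Drop the first target timestep so logits[:, i] lines up with targets[:, i + 1].
    relevant_targets = targets[:, 1:].contiguous()   # (batch_size, num_decoding_steps)
    relevant_mask = target_mask[:, 1:].contiguous()  # (batch_size, num_decoding_steps)
    loss = sequence_cross_entropy_with_logits(logits, relevant_targets, relevant_mask)
    print(loss)  # a scalar tensor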
Example #2
    def test_sequence_cross_entropy_with_logits_averages_batch_correctly(self):
        # Test that the batch average equals the sum of the per-sequence losses
        # divided by the number of sequences containing any non-padded tokens.
        tensor = torch.rand([5, 7, 4])
        tensor[0, 3:, :] = 0
        tensor[1, 4:, :] = 0
        tensor[2, 2:, :] = 0
        tensor[3, :, :] = 0
        weights = (tensor != 0.0)[:, :, 0].long().squeeze(-1)
        targets = torch.LongTensor(numpy.random.randint(0, 3, [5, 7]))
        targets *= weights

        loss = util.sequence_cross_entropy_with_logits(tensor, targets, weights)

        vector_loss = util.sequence_cross_entropy_with_logits(tensor, targets, weights, average=None)
        # Batch has one completely padded row, so divide by 4.
        assert loss.data.numpy() == vector_loss.data.sum() / 4
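For reference, `average=None` returns the per-sequence losses as a vector of shape `(batch_size,)`, and a fully padded sequence contributes (almost) zero to it while being excluded from the batch-average denominator. A minimal sketch of that behaviour (illustrative names, assuming a version of AllenNLP where `average=None` is supported):

    import torch
    from allennlp.nn import util

    logits = torch.rand(2, 3, 4)
    weights = torch.tensor([[1, 1, 1],
                            [0, 0, 0]])  # second sequence is entirely padding
    targets = torch.randint(0, 4, (2, 3)) * weights

    per_sequence = util.sequence_cross_entropy_with_logits(logits, targets, weights, average=None)
    batch_loss = util.sequence_cross_entropy_with_logits(logits, targets, weights)
    print(per_sequence)  # second entry is ~0: all of its tokens are masked
    print(batch_loss)    # per_sequence.sum() / 1, since only one sequence is non-empty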
Example #3
    def forward(self,  # type: ignore
                tokens: Dict[str, torch.LongTensor],
                tags: torch.LongTensor = None,
                metadata: List[Dict[str, Any]] = None) -> Dict[str, torch.Tensor]:
        # pylint: disable=arguments-differ
        """
        Parameters
        ----------
        tokens : Dict[str, torch.LongTensor], required
            The output of ``TextField.as_array()``, which should typically be passed directly to a
            ``TextFieldEmbedder``. This output is a dictionary mapping keys to ``TokenIndexer``
            tensors.  At its most basic, using a ``SingleIdTokenIndexer`` this is: ``{"tokens":
            Tensor(batch_size, num_tokens)}``. This dictionary will have the same keys as were used
            for the ``TokenIndexers`` when you created the ``TextField`` representing your
            sequence.  The dictionary is designed to be passed directly to a ``TextFieldEmbedder``,
            which knows how to combine different word representations into a single vector per
            token in your input.
        tags : torch.LongTensor, optional (default = None)
            A torch tensor representing the sequence of integer gold class labels of shape
            ``(batch_size, num_tokens)``.
        metadata : ``List[Dict[str, Any]]``, optional, (default = None)
            metadata containing the original words in the sentence to be tagged under a 'words' key.

        Returns
        -------
        An output dictionary consisting of:
        logits : torch.FloatTensor
            A tensor of shape ``(batch_size, num_tokens, tag_vocab_size)`` representing
            unnormalised log probabilities of the tag classes.
        class_probabilities : torch.FloatTensor
            A tensor of shape ``(batch_size, num_tokens, tag_vocab_size)`` representing
            a distribution of the tag classes per word.
        loss : torch.FloatTensor, optional
            A scalar loss to be optimised.

        """
        embedded_text_input = self.text_field_embedder(tokens)
        batch_size, sequence_length, _ = embedded_text_input.size()
        mask = get_text_field_mask(tokens)
        encoded_text = self.encoder(embedded_text_input, mask)

        logits = self.tag_projection_layer(encoded_text)
        reshaped_log_probs = logits.view(-1, self.num_classes)
        class_probabilities = F.softmax(reshaped_log_probs, dim=-1).view([batch_size,
                                                                          sequence_length,
                                                                          self.num_classes])

        output_dict = {"logits": logits, "class_probabilities": class_probabilities}

        if tags is not None:
            loss = sequence_cross_entropy_with_logits(logits, tags, mask)
            for metric in self.metrics.values():
                metric(logits, tags, mask.float())
            output_dict["loss"] = loss

        if metadata is not None:
            output_dict["words"] = [x["words"] for x in metadata]
        return output_dict
Example #4
    def test_sequence_cross_entropy_with_logits_averages_token_correctly(self):
        # Test that the token average equals the per-sequence losses weighted by
        # their token counts and divided by the total number of unmasked tokens.
        tensor = torch.rand([5, 7, 4])
        tensor[0, 3:, :] = 0
        tensor[1, 4:, :] = 0
        tensor[2, 2:, :] = 0
        tensor[3, :, :] = 0
        weights = (tensor != 0.0)[:, :, 0].long().squeeze(-1)
        targets = torch.LongTensor(numpy.random.randint(0, 3, [5, 7]))
        targets *= weights

        loss = util.sequence_cross_entropy_with_logits(tensor, targets, weights, average="token")

        vector_loss = util.sequence_cross_entropy_with_logits(tensor, targets, weights, batch_average=False)
        total_token_loss = (vector_loss * weights.float().sum(dim=-1)).sum()
        average_token_loss = (total_token_loss / weights.float().sum()).detach()
        assert_almost_equal(loss.detach()[0], average_token_loss[0])
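With `average="token"` the masked per-token losses are summed over the whole batch and divided by the total number of unmasked tokens, which is exactly what the test reconstructs from the per-sequence means. A minimal sketch of that identity (illustrative names, assuming a version of AllenNLP that accepts `average=None`):

    import torch
    from allennlp.nn import util

    logits = torch.rand(3, 6, 4)
    weights = torch.tensor([[1, 1, 1, 0, 0, 0],
                            [1, 1, 1, 1, 1, 1],
                            [1, 0, 0, 0, 0, 0]])
    targets = torch.randint(0, 4, (3, 6)) * weights

    token_loss = util.sequence_cross_entropy_with_logits(logits, targets, weights, average="token")
    per_sequence = util.sequence_cross_entropy_with_logits(logits, targets, weights, average=None)
    # Re-weight each per-sequence mean by its token count, then divide by the total token count.
    reconstructed = (per_sequence * weights.sum(dim=-1).float()).sum() / weights.sum().float()
    assert torch.allclose(token_loss, reconstructed, atol=1e-5)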
Example #5
    def forward(self,
                sentence: Dict[str, torch.Tensor],
                labels: torch.Tensor = None) -> torch.Tensor:
        mask = get_text_field_mask(sentence)
        embeddings = self.word_embeddings(sentence)
        encoder_out = self.encoder(embeddings, mask)
        tag_logits = self.hidden2tag(encoder_out)
        output = {"tag_logits": tag_logits}
        if labels is not None:
            self.accuracy(tag_logits, labels, mask)
            output["loss"] = sequence_cross_entropy_with_logits(tag_logits, labels, mask)

        return output
Example #6
    def test_sequence_cross_entropy_with_logits_masks_loss_correctly(self):

        # test weight masking by checking that a tensor with non-zero values in
        # masked positions returns the same loss as a tensor with zeros in those
        # positions.
        tensor = torch.rand([5, 7, 4])
        tensor[0, 3:, :] = 0
        tensor[1, 4:, :] = 0
        tensor[2, 2:, :] = 0
        tensor[3, :, :] = 0
        weights = (tensor != 0.0)[:, :, 0].long().squeeze(-1)
        tensor2 = tensor.clone()
        tensor2[0, 3:, :] = 2
        tensor2[1, 4:, :] = 13
        tensor2[2, 2:, :] = 234
        tensor2[3, :, :] = 65
        targets = torch.LongTensor(numpy.random.randint(0, 3, [5, 7]))
        targets *= weights

        loss = util.sequence_cross_entropy_with_logits(tensor, targets, weights)
        loss2 = util.sequence_cross_entropy_with_logits(tensor2, targets, weights)
        assert loss.data.numpy() == loss2.data.numpy()
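Equivalently, the masked positions are simply excluded from the average: the loss can be recomputed by hand from the unmasked timesteps only. A minimal sketch (illustrative names, default `average="batch"`):

    import torch
    from allennlp.nn import util

    logits = torch.rand(1, 4, 3)
    targets = torch.randint(0, 3, (1, 4))
    weights = torch.tensor([[1, 1, 0, 0]])  # last two timesteps are padding

    loss = util.sequence_cross_entropy_with_logits(logits, targets, weights)

    # Manual check: only the two unmasked timesteps contribute to the average.
    log_probs = torch.nn.functional.log_softmax(logits, dim=-1)
    manual = -(log_probs[0, 0, targets[0, 0]] + log_probs[0, 1, targets[0, 1]]) / 2
    assert torch.allclose(loss, manual, atol=1e-5)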
Example #7
    def test_loss_is_computed_correctly(self):
        batch_size = 5
        num_decoding_steps = 5
        num_classes = 10
        sample_logits = Variable(torch.randn(batch_size, num_decoding_steps-1, num_classes))
        sample_targets = Variable(torch.from_numpy(numpy.random.randint(0, num_classes,
                                                                        (batch_size, num_decoding_steps))))
        # Mask should be either 0 or 1
        sample_mask = Variable(torch.from_numpy(numpy.random.randint(0, 2,
                                                                     (batch_size, num_decoding_steps))))
        expected_loss = sequence_cross_entropy_with_logits(sample_logits, sample_targets[:, 1:].contiguous(),
                                                           sample_mask[:, 1:].contiguous())
        # pylint: disable=protected-access
        actual_loss = self.model._get_loss(sample_logits, sample_targets, sample_mask)
        assert numpy.equal(expected_loss.data.numpy(), actual_loss.data.numpy())
Example #8
    def test_sequence_cross_entropy_with_logits_smooths_labels_correctly(self):
        tensor = torch.rand([1, 3, 4])
        targets = torch.LongTensor(numpy.random.randint(0, 3, [1, 3]))

        weights = torch.ones([2, 3])
        loss = util.sequence_cross_entropy_with_logits(tensor, targets, weights, label_smoothing=0.1)

        correct_loss = 0.0
        for prediction, label in zip(tensor.squeeze(0), targets.squeeze(0)):
            prediction = torch.nn.functional.log_softmax(prediction, dim=-1)
            correct_loss += prediction[label] * 0.9
            # incorrect elements
            correct_loss += prediction.sum() * 0.1/4
        # Average over sequence.
        correct_loss = - correct_loss / 3
        numpy.testing.assert_array_almost_equal(loss.data.numpy(), correct_loss.data.numpy())
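The arithmetic in the loop corresponds to cross entropy against a smoothed target distribution: with 4 classes and `label_smoothing=0.1`, the gold class receives 0.9 + 0.1/4 = 0.925 and every other class 0.1/4 = 0.025. A small standalone sketch of how such a distribution is built (an illustrative reconstruction, not code taken from the library):

    import torch

    num_classes, label_smoothing, label = 4, 0.1, 2
    one_hot = torch.zeros(num_classes)
    one_hot[label] = 1.0
    # Gold class keeps (1 - 0.1) plus its share of the smoothing mass; others get 0.1/4 each.
    smoothed = one_hot * (1.0 - label_smoothing) + label_smoothing / num_classes
    print(smoothed)  # tensor([0.0250, 0.0250, 0.9250, 0.0250])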
Example #9
    def forward(self,
                sentence: Dict[str, torch.Tensor],
                labels: torch.Tensor = None) -> torch.Tensor:
        #### AllenNLP is designed to operate on batched inputs, but different input sequences have different lengths. Behind the scenes AllenNLP is padding the shorter inputs so that the batch has uniform shape, which means our computations need to use a mask to exclude the padding. Here we just use the utility function <code>get_text_field_mask</code>, which returns a tensor of 0s and 1s corresponding to the padded and unpadded locations.
        mask = get_text_field_mask(sentence)
        #### We start by passing the <code>sentence</code> tensor (each sentence a sequence of token ids) to the <code>word_embeddings</code> module, which converts each sentence into a sequence of embedded tensors.
        embeddings = self.word_embeddings(sentence)
        #### We next pass the embedded tensors (and the mask) to the LSTM, which produces a sequence of encoded outputs.
        encoder_out = self.encoder(embeddings, mask)
        #### Finally, we pass each encoded output tensor to the feedforward layer to produce logits corresponding to the various tags.
        tag_logits = self.hidden2tag(encoder_out)
        output = {"tag_logits": tag_logits}

        #### As before, the labels were optional, as we might want to run this model to make predictions on unlabeled data. If we do have labels, then we use them to update our accuracy metric and compute the "loss" that goes in our output.
        if labels is not None:
            self.accuracy(tag_logits, labels, mask)
            output["loss"] = sequence_cross_entropy_with_logits(tag_logits, labels, mask)

        return output
Example #10
    def _get_loss(logits: torch.LongTensor, targets: torch.LongTensor,
                  target_mask: torch.LongTensor) -> torch.Tensor:
        """
        Compute loss.

        Takes logits (unnormalized outputs from the decoder) of size (batch_size,
        num_decoding_steps, num_classes), target indices of size (batch_size, num_decoding_steps+1)
        and corresponding masks of size (batch_size, num_decoding_steps+1), and computes cross
        entropy loss while taking the mask into account.

        The length of ``targets`` is expected to be greater than that of ``logits`` because the
        decoder does not need to compute the output corresponding to the last timestep of
        ``targets``. This method aligns the inputs appropriately to compute the loss.

        During training, we want the logit corresponding to timestep i to be similar to the target
        token from timestep i + 1. That is, the targets should be shifted by one timestep for
        appropriate comparison.  Consider a single example where the target has 3 words, and
        padding is to 7 tokens.
           The complete sequence would correspond to <S> w1  w2  w3  <E> <P> <P>
           and the mask would be                     1   1   1   1   1   0   0
           and let the logits be                     l1  l2  l3  l4  l5  l6
        We actually need to compare:
           the sequence           w1  w2  w3  <E> <P> <P>
           with masks             1   1   1   1   0   0
           against                l1  l2  l3  l4  l5  l6
           (where the input was)  <S> w1  w2  w3  <E> <P>
        """
        # shape: (batch_size, num_decoding_steps)
        relevant_targets = targets[:, 1:].contiguous()

        # shape: (batch_size, num_decoding_steps)
        relevant_mask = target_mask[:, 1:].contiguous()

        return util.sequence_cross_entropy_with_logits(logits,
                                                       relevant_targets,
                                                       relevant_mask)
Example #11
    def forward(
            self,  # type: ignore
            tokens: Dict[str, torch.LongTensor],
            verb_indicator: torch.LongTensor,
            tags: torch.LongTensor = None) -> Dict[str, torch.Tensor]:
        # pylint: disable=arguments-differ
        """
        Parameters
        ----------
        tokens : Dict[str, torch.LongTensor], required
            The output of ``TextField.as_array()``, which should typically be passed directly to a
            ``TextFieldEmbedder``. This output is a dictionary mapping keys to ``TokenIndexer``
            tensors.  At its most basic, using a ``SingleIdTokenIndexer`` this is: ``{"tokens":
            Tensor(batch_size, num_tokens)}``. This dictionary will have the same keys as were used
            for the ``TokenIndexers`` when you created the ``TextField`` representing your
            sequence.  The dictionary is designed to be passed directly to a ``TextFieldEmbedder``,
            which knows how to combine different word representations into a single vector per
            token in your input.
        verb_indicator: torch.LongTensor, required.
            An integer ``SequenceFeatureField`` representation of the position of the verb
            in the sentence. This should have shape (batch_size, num_tokens) and importantly, can be
            all zeros, in the case that the sentence has no verbal predicate.
        tags : torch.LongTensor, optional (default = None)
            A torch tensor representing the sequence of integer gold class labels
            of shape ``(batch_size, num_tokens)``

        Returns
        -------
        An output dictionary consisting of:
        logits : torch.FloatTensor
            A tensor of shape ``(batch_size, num_tokens, tag_vocab_size)`` representing
            unnormalised log probabilities of the tag classes.
        class_probabilities : torch.FloatTensor
            A tensor of shape ``(batch_size, num_tokens, tag_vocab_size)`` representing
            a distribution of the tag classes per word.
        loss : torch.FloatTensor, optional
            A scalar loss to be optimised.

        """
        embedded_text_input = self.embedding_dropout(
            self.text_field_embedder(tokens))
        mask = get_text_field_mask(tokens)
        embedded_verb_indicator = self.binary_feature_embedding(
            verb_indicator.long())
        # Concatenate the verb feature onto the embedded text. This now
        # has shape (batch_size, sequence_length, embedding_dim + binary_feature_dim).
        embedded_text_with_verb_indicator = torch.cat(
            [embedded_text_input, embedded_verb_indicator], -1)
        batch_size, sequence_length, embedding_dim_with_binary_feature = embedded_text_with_verb_indicator.size()

        if self.stacked_encoder.get_input_dim() != embedding_dim_with_binary_feature:
            raise ConfigurationError(
                "The SRL model uses an indicator feature, which makes "
                "the embedding dimension one larger than the value "
                "specified. Therefore, the 'input_dim' of the stacked_encoder "
                "must be equal to total_embedding_dim + 1.")

        encoded_text = self.stacked_encoder(embedded_text_with_verb_indicator,
                                            mask)

        logits = self.tag_projection_layer(encoded_text)
        reshaped_log_probs = logits.view(-1, self.num_classes)
        class_probabilities = F.softmax(reshaped_log_probs, dim=-1).view(
            [batch_size, sequence_length, self.num_classes])
        output_dict = {
            "logits": logits,
            "class_probabilities": class_probabilities,
            "encoded_text": encoded_text
        }
        if tags is not None:
            loss = sequence_cross_entropy_with_logits(logits, tags, mask)
            self.span_metric(class_probabilities, tags, mask)
            output_dict["loss"] = loss

        # We need to retain the mask in the output dictionary
        # so that we can crop the sequences to remove padding
        # when we do viterbi inference in self.decode.
        output_dict["mask"] = mask
        return output_dict
Example #12
def calculate_perplexity(batch_size=1, gpu_id=0, decoder_path='decoder.pth'):
    # make sure your model is on GPU
    device = torch.device(f"cuda:{gpu_id}")

    #------------------------LOAD MODEL-----------------
    print('load the model....')
    model = BertGPT()
    model.load_state_dict(torch.load(decoder_path))
    print(f'load from {decoder_path}')
    model = model.to(device)
    model.eval()
    print('load success')
    #------------------------END LOAD MODEL--------------

    test_data = torch.load("validate_data.pth")
    test_dataset = MyDataset(*test_data)

    test_dataloader = DataLoader(dataset=test_dataset,
                                 shuffle=False,
                                 batch_size=batch_size,
                                 num_workers=2,
                                 collate_fn=collate_fn)
    #------------------------END LOAD VAL DATA--------------

    # #------------------------START VAL-------------------
    # perplexity = 0
    # batch_count = 0
    # print('start calculate the train perplexity....')

    # with torch.no_grad():
    #     for batch in tqdm(train_dataloader):
    #         batch = [item.to(device) for item in batch]

    #         encoder_input, decoder_input, mask_encoder_input, mask_decoder_input = batch

    #         _, past = encoder(encoder_input, mask_encoder_input)

    #         mask = torch.cat([mask_encoder_input, mask_decoder_input], dim=1)
    #         logits, _ = decoder(decoder_input, mask, past=past, past_length=0)

    #         out = logits[:, :-1].contiguous()
    #         target = decoder_input[:, 1:].contiguous()
    #         target_mask = mask_decoder_input[:, 1:].contiguous()

    #         loss = util.sequence_cross_entropy_with_logits(out, target, target_mask, average="token")
    #         perplexity += np.exp(loss.item())
    #         batch_count += 1

    # print(f'train perplexity: {perplexity / batch_count}')
    perplexity = 0
    batch_count = 0
    print('start calculate the test perplexity....')

    with torch.no_grad():
        for batch in tqdm(test_dataloader):
            batch = [item.to(device) for item in batch]

            encoder_input, decoder_input, mask_encoder_input, mask_decoder_input = batch

            logits = model(encoder_input, mask_encoder_input, decoder_input,
                           mask_decoder_input)

            out = logits[:, :-1].contiguous()
            target = decoder_input[:, 1:].contiguous()
            target_mask = mask_decoder_input[:, 1:].contiguous()

            loss = util.sequence_cross_entropy_with_logits(out,
                                                           target,
                                                           target_mask,
                                                           average="token")
            perplexity += np.exp(loss.item())
            batch_count += 1

    print(f'test perplexity: {perplexity / batch_count}')
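Since `average="token"` returns the negative log-likelihood averaged over all unmasked target tokens, exponentiating it gives a per-token perplexity for the batch, which is what the loop above accumulates. A minimal sketch of that step in isolation (tensor names are illustrative):

    import numpy as np
    import torch
    from allennlp.nn import util

    out = torch.randn(2, 5, 8)                        # (batch, steps, vocab) shifted logits
    target = torch.randint(0, 8, (2, 5))              # next-token targets
    target_mask = torch.ones(2, 5, dtype=torch.long)

    loss = util.sequence_cross_entropy_with_logits(out, target, target_mask, average="token")
    perplexity = np.exp(loss.item())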
Example #13
    def forward(self,  # type: ignore
                tokens: Dict[str, torch.LongTensor],
                verb_indicator: torch.LongTensor,
                tags: torch.LongTensor = None,
                metadata: List[Dict[str, Any]] = None) -> Dict[str, torch.Tensor]:
        # pylint: disable=arguments-differ
        """
        Parameters
        ----------
        tokens : Dict[str, torch.LongTensor], required
            The output of ``TextField.as_array()``, which should typically be passed directly to a
            ``TextFieldEmbedder``. This output is a dictionary mapping keys to ``TokenIndexer``
            tensors.  At its most basic, using a ``SingleIdTokenIndexer`` this is: ``{"tokens":
            Tensor(batch_size, num_tokens)}``. This dictionary will have the same keys as were used
            for the ``TokenIndexers`` when you created the ``TextField`` representing your
            sequence.  The dictionary is designed to be passed directly to a ``TextFieldEmbedder``,
            which knows how to combine different word representations into a single vector per
            token in your input.
        verb_indicator: torch.LongTensor, required.
            An integer ``SequenceFeatureField`` representation of the position of the verb
            in the sentence. This should have shape (batch_size, num_tokens) and importantly, can be
            all zeros, in the case that the sentence has no verbal predicate.
        tags : torch.LongTensor, optional (default = None)
            A torch tensor representing the sequence of integer gold class labels
            of shape ``(batch_size, num_tokens)``
        metadata : ``List[Dict[str, Any]]``, optional, (default = None)
            metadata containing the original words in the sentence and the verb to compute the
            frame for, under 'words' and 'verb' keys, respectively.

        Returns
        -------
        An output dictionary consisting of:
        logits : torch.FloatTensor
            A tensor of shape ``(batch_size, num_tokens, tag_vocab_size)`` representing
            unnormalised log probabilities of the tag classes.
        class_probabilities : torch.FloatTensor
            A tensor of shape ``(batch_size, num_tokens, tag_vocab_size)`` representing
            a distribution of the tag classes per word.
        loss : torch.FloatTensor, optional
            A scalar loss to be optimised.

        """
        embedded_text_input = self.embedding_dropout(self.text_field_embedder(tokens))
        mask = get_text_field_mask(tokens)
        embedded_verb_indicator = self.binary_feature_embedding(verb_indicator.long())
        # Concatenate the verb feature onto the embedded text. This now
        # has shape (batch_size, sequence_length, embedding_dim + binary_feature_dim).
        embedded_text_with_verb_indicator = torch.cat([embedded_text_input, embedded_verb_indicator], -1)
        batch_size, sequence_length, _ = embedded_text_with_verb_indicator.size()

        encoded_text = self.encoder(embedded_text_with_verb_indicator, mask)

        logits = self.tag_projection_layer(encoded_text)
        reshaped_log_probs = logits.view(-1, self.num_classes)
        class_probabilities = F.softmax(reshaped_log_probs, dim=-1).view([batch_size,
                                                                          sequence_length,
                                                                          self.num_classes])
        output_dict = {"logits": logits, "class_probabilities": class_probabilities}
        if tags is not None:
            loss = sequence_cross_entropy_with_logits(logits,
                                                      tags,
                                                      mask,
                                                      label_smoothing=self._label_smoothing)
            if not self.ignore_span_metric:
                self.span_metric(class_probabilities, tags, mask)
            output_dict["loss"] = loss

        # We need to retain the mask in the output dictionary
        # so that we can crop the sequences to remove padding
        # when we do viterbi inference in self.decode.
        output_dict["mask"] = mask

        if metadata is not None:
            words, verbs = zip(*[(x["words"], x["verb"]) for x in metadata])
            output_dict["words"] = list(words)
            output_dict["verb"] = list(verbs)
        return output_dict
Example #14
    def forward(
            self,  # type: ignore
            tokens: Dict[str, torch.LongTensor],
            verb_span: torch.LongTensor,
            entity_span: torch.LongTensor,
            state_change_type_labels: torch.LongTensor = None,
            state_change_tags: torch.LongTensor = None
    ) -> Dict[str, torch.Tensor]:
        # pylint: disable=arguments-differ
        """
        Parameters
        ----------
        tokens : Dict[str, torch.LongTensor], required
            The output of ``TextField.as_array()``, which should typically be passed directly to a
            ``TextFieldEmbedder``. This output is a dictionary mapping keys to ``TokenIndexer``
            tensors.  At its most basic, using a ``SingleIdTokenIndexer`` this is: ``{"tokens":
            Tensor(batch_size, num_tokens)}``. This dictionary will have the same keys as were used
            for the ``TokenIndexers`` when you created the ``TextField`` representing your
            sequence.  The dictionary is designed to be passed directly to a ``TextFieldEmbedder``,
            which knows how to combine different word representations into a single vector per
            token in your input.
        verb_span: torch.LongTensor, required.
            An integer ``SequenceLabelField`` representation of the position of the focus verb
            in the sentence. This should have shape (batch_size, num_tokens) and importantly, can be
            all zeros, in the case that pre-processing stage could not extract a verbal predicate.
        entity_span: torch.LongTensor, required.
            An integer ``SequenceLabelField`` representation of the position of the focus entity
            in the sentence. This should have shape (batch_size, num_tokens).
        state_change_type_labels: torch.LongTensor, optional (default = None)
            A torch tensor representing the state change type class labels of shape ``(batch_size, 1)``.
        state_change_tags : torch.LongTensor, optional (default = None)
            A torch tensor representing the sequence of integer gold class labels
            of shape ``(batch_size, num_tokens)``
            In the first implementation we focus only on state_change_types.

        Returns
        -------
        An output dictionary consisting of:
        type_probs : torch.FloatTensor
            A tensor of shape ``(batch_size, num_state_change_types)`` representing
            a distribution of state change types per datapoint.
        tags_class_probabilities : torch.FloatTensor
            A tensor of shape ``(batch_size, num_state_change_types, num_tokens)`` representing
            a distribution of location tags per token in a sentence.
        loss : torch.FloatTensor, optional
            A scalar loss to be optimised.
        """

        # Layer 1 = Word + Character embedding layer
        embedded_sentence = self.text_field_embedder(tokens)
        mask = get_text_field_mask(tokens).float()

        # Layer 2 = Add positional bit to encode position of focus verb and entity
        embedded_sentence_verb_entity = \
            torch.cat([embedded_sentence, verb_span.float().unsqueeze(-1), entity_span.float().unsqueeze(-1)], dim=-1)

        # Layer 3 = Contextual embedding layer using Bi-LSTM over the sentence
        contextual_embedding = self.seq2seq_encoder(
            embedded_sentence_verb_entity, mask)

        # Layer 4: Attention (Contextual embedding, BOW(verb span))
        verb_weight_matrix = verb_span.float() / (
            verb_span.float().sum(-1).unsqueeze(-1) + 1e-13)
        verb_vector = weighted_sum(
            contextual_embedding * verb_span.float().unsqueeze(-1),
            verb_weight_matrix)
        entity_weight_matrix = entity_span.float() / (
            entity_span.float().sum(-1).unsqueeze(-1) + 1e-13)
        entity_vector = weighted_sum(
            contextual_embedding * entity_span.float().unsqueeze(-1),
            entity_weight_matrix)
        verb_entity_vector = torch.cat([verb_vector, entity_vector], 1)
        batch_size, sequence_length, binary_feature_dim = verb_span.float().unsqueeze(-1).size()

        # attention weights for type prediction
        attention_weights_types = self.attention_layer(verb_entity_vector,
                                                       contextual_embedding)
        attention_output_vector = weighted_sum(contextual_embedding,
                                               attention_weights_types)

        # contextual embedding + positional vectors for tag prediction
        context_positional_tags = torch.cat([contextual_embedding,
                                             verb_span.float().unsqueeze(-1),
                                             entity_span.float().unsqueeze(-1)],
                                            dim=-1)

        # Layer 5 = Dense softmax layer to pick one state change type per datapoint,
        # and one tag per word in the sentence
        type_logits = self.aggregate_feedforward(attention_output_vector)
        type_probs = torch.nn.functional.softmax(type_logits, dim=-1)

        tags_logits = self.tag_projection_layer(context_positional_tags)
        reshaped_log_probs = tags_logits.view(-1, self.num_tags)
        tags_class_probabilities = F.softmax(reshaped_log_probs, dim=-1).view(
            [batch_size, sequence_length, self.num_tags])

        # Create output dictionary for the trainer
        # Compute loss and epoch metrics
        output_dict = {'type_probs': type_probs}
        if state_change_type_labels is not None:
            state_change_type_labels_loss = self._loss(
                type_logits,
                state_change_type_labels.long().view(-1))
            for type_label in self.type_labels_vocab.values():
                metric = self.type_f1_metrics["type_" + type_label]
                metric(type_probs, state_change_type_labels.squeeze(-1))

            self._type_accuracy(type_probs,
                                state_change_type_labels.squeeze(-1))

        if state_change_tags is not None:
            state_change_tags_loss = sequence_cross_entropy_with_logits(
                tags_logits, state_change_tags, mask)
            self.span_metric(tags_class_probabilities, state_change_tags, mask)
            output_dict["tags_class_probabilities"] = tags_class_probabilities

        if state_change_type_labels is not None and state_change_tags is not None:
            output_dict['loss'] = (state_change_type_labels_loss +
                                   state_change_tags_loss)

        return output_dict
Example #15
    def forward(  # type: ignore
        self,
        tokens: TextFieldTensors,
        verb_indicator: torch.Tensor,
        frame_indicator: torch.Tensor,
        metadata: List[Any],
        tags: torch.LongTensor = None,
        frame_tags: torch.LongTensor = None,
    ):
        """
        # Parameters

        tokens : `TextFieldTensors`, required
            The output of `TextField.as_array()`, which should typically be passed directly to a
            `TextFieldEmbedder`. For this model, this must be a `SingleIdTokenIndexer` which
            indexes wordpieces from the BERT vocabulary.
        verb_indicator: `torch.LongTensor`, required.
            An integer `SequenceFeatureField` representation of the position of the verb
            in the sentence. This should have shape (batch_size, num_tokens) and importantly, can be
            all zeros, in the case that the sentence has no verbal predicate.
        frame_indicator: torch.LongTensor, required.
            An integer ``SequenceFeatureField`` representation of the position of the frame
            in the sentence. This should have shape (batch_size, num_tokens). Similar to verb_indicator,
            but handles the BERT wordpiece tokenizer by considering only the first sub-token of the frame.
        tags : `torch.LongTensor`, optional (default = `None`)
            A torch tensor representing the sequence of integer gold class labels
            of shape `(batch_size, num_tokens)`
        frame_tags : torch.LongTensor, optional (default = None)
            A torch tensor representing the gold frames
            of shape ``(batch_size, num_tokens)``
        metadata : `List[Dict[str, Any]]`, optional, (default = `None`)
            metadata containing the original words in the sentence, the verb to compute the
            frame for, and start offsets for converting wordpieces back to a sequence of words,
            under 'words', 'verb' and 'offsets' keys, respectively.

        # Returns

        An output dictionary consisting of:
        logits : `torch.FloatTensor`
            A tensor of shape `(batch_size, num_tokens, tag_vocab_size)` representing
            unnormalised log probabilities of the tag classes.
        class_probabilities : `torch.FloatTensor`
            A tensor of shape `(batch_size, num_tokens, tag_vocab_size)` representing
            a distribution of the tag classes per word.
        loss : `torch.FloatTensor`, optional
            A scalar loss to be optimised.
        """
        mask = get_text_field_mask(tokens)
        input_ids = util.get_token_ids_from_text_field_tensors(tokens)
        bert_embeddings, _ = self.transformer(
            input_ids=input_ids,
            token_type_ids=verb_indicator,
            attention_mask=mask,
            return_dict=False,
        )
        # extract embeddings
        embedded_text_input = self.embedding_dropout(bert_embeddings)
        frame_embeddings = embedded_text_input[frame_indicator == 1]
        # get sizes
        batch_size, sequence_length, _ = embedded_text_input.size()
        # outputs
        logits = self.tag_projection_layer(embedded_text_input)
        frame_logits = self.frame_projection_layer(frame_embeddings)

        reshaped_log_probs = logits.view(-1, self.num_classes)
        class_probabilities = F.softmax(reshaped_log_probs, dim=-1).view(
            [batch_size, sequence_length, self.num_classes])

        frame_probabilities = F.softmax(frame_logits, dim=-1)
        # We need to retain the mask in the output dictionary
        # so that we can crop the sequences to remove padding
        # when we do viterbi inference in self.make_output_human_readable.
        output_dict = {
            "logits": logits,
            "frame_logits": frame_logits,
            "class_probabilities": class_probabilities,
            "frame_probabilities": frame_probabilities,
            "mask": mask,
        }
        # We add in the offsets here so we can compute the un-wordpieced tags.
        words, verbs, offsets = zip(*[(x["words"], x["verb"], x["offsets"])
                                      for x in metadata])
        lemmas = [l for x in metadata for l in x["lemmas"]]
        output_dict["words"] = list(words)
        output_dict["lemma"] = list(lemmas)
        output_dict["verb"] = list(verbs)
        output_dict["wordpiece_offsets"] = list(offsets)

        if tags is not None:
            # compute role loss
            role_loss = sequence_cross_entropy_with_logits(
                logits, tags, mask, label_smoothing=self._label_smoothing)
            # compute frame loss
            frame_tags_filtered = frame_tags[frame_indicator == 1]
            frame_loss = self.frame_criterion(frame_logits,
                                              frame_tags_filtered)
            if not self.ignore_span_metric and self.span_metric is not None and not self.training:
                batch_verb_indices = [
                    example_metadata["verb_index"]
                    for example_metadata in metadata
                ]
                batch_sentences = [
                    example_metadata["words"] for example_metadata in metadata
                ]
                # Get the BIO tags from make_output_human_readable()
                batch_bio_predicted_tags = self.make_output_human_readable(
                    output_dict).pop("tags")
                from allennlp_models.structured_prediction.models.srl import (
                    convert_bio_tags_to_conll_format, )

                batch_conll_predicted_tags = [
                    convert_bio_tags_to_conll_format(tags)
                    for tags in batch_bio_predicted_tags
                ]
                batch_bio_gold_tags = [
                    example_metadata["gold_tags"]
                    for example_metadata in metadata
                ]
                batch_conll_gold_tags = [
                    convert_bio_tags_to_conll_format(tags)
                    for tags in batch_bio_gold_tags
                ]
                self.span_metric(
                    batch_verb_indices,
                    batch_sentences,
                    batch_conll_predicted_tags,
                    batch_conll_gold_tags,
                )
            self.f1_frame_metric(frame_logits, frame_tags_filtered)
            output_dict["frame_loss"] = frame_loss
            output_dict["role_loss"] = role_loss
            output_dict["loss"] = (role_loss + frame_loss) / 2
        return output_dict
Example #16
def calculate_perplexity(
        batch_size=1,
        gpu_id=0,
        model_path='/content/GPT CheckPoints/model-9.pth'
):
    # make sure your model is on GPU
    device = torch.device(f"cuda:{gpu_id}")

    # ------------------------LOAD MODEL-----------------
    print('load the model....')

    model = EncoderDecoderModel.from_encoder_decoder_pretrained("gpt2", "gpt2", use_cache=False)
    model.load_state_dict(torch.load(model_path, map_location='cuda'))

    model = model.to(device)
    model.eval()

    print('load success')
    # ------------------------END LOAD MODEL--------------

    # ------------------------LOAD VAL DATA------------------
    val_data = torch.load("/content/validate_data.pth")
    val_dataset = TensorDataset(*val_data)

    train_data = torch.load("/content/train_data.pth")
    train_dataset = TensorDataset(*train_data)

    test_data = torch.load("/content/test_data.pth")
    test_dataset = TensorDataset(*test_data)

    val_dataloader = DataLoader(dataset=val_dataset, shuffle=False, batch_size=batch_size)
    train_dataloader = DataLoader(dataset=train_dataset, shuffle=False, batch_size=batch_size)
    test_dataloader = DataLoader(dataset=test_dataset, shuffle=False, batch_size=batch_size)
    # ------------------------END LOAD VAL DATA--------------

    # ------------------------START VAL-------------------
    perplexity = 0
    batch_count = 0
    print('start calculate the train perplexity....')

    with torch.no_grad():
        for batch in train_dataloader:
            batch = [item.to(device) for item in batch]

            encoder_input, decoder_input, mask_encoder_input, mask_decoder_input = batch

            _, past = model.encoder(input_ids=encoder_input, attention_mask=mask_encoder_input)

            mask = torch.cat([mask_encoder_input, mask_decoder_input], dim=1)
            logits, _ = model.decoder(decoder_input, attention_mask=mask, past=list(past))

            out = logits[:, :-1].contiguous()
            target = decoder_input[:, 1:].contiguous()
            target_mask = mask_decoder_input[:, 1:].contiguous()

            loss = util.sequence_cross_entropy_with_logits(out, target, target_mask, average="token")
            perplexity += np.exp(loss.item())
            batch_count += 1

    print(f'train perplexity: {perplexity / batch_count}')

    perplexity = 0
    batch_count = 0
    print('start calculate the validate perplexity....')

    with torch.no_grad():
        for batch in val_dataloader:
            batch = [item.to(device) for item in batch]

            encoder_input, decoder_input, mask_encoder_input, mask_decoder_input = batch

            _, past = model.encoder(input_ids=encoder_input, attention_mask=mask_encoder_input)

            mask = torch.cat([mask_encoder_input, mask_decoder_input], dim=1)
            logits, _ = model.decoder(decoder_input, attention_mask=mask, past=list(past))

            out = logits[:, :-1].contiguous()
            target = decoder_input[:, 1:].contiguous()
            target_mask = mask_decoder_input[:, 1:].contiguous()

            loss = util.sequence_cross_entropy_with_logits(out, target, target_mask, average="token")
            perplexity += np.exp(loss.item())
            batch_count += 1

    print(f'validate perplexity: {perplexity / batch_count}')

    perplexity = 0
    batch_count = 0
    print('start calculate the test perplexity....')

    with torch.no_grad():
        for batch in test_dataloader:
            batch = [item.to(device) for item in batch]

            encoder_input, decoder_input, mask_encoder_input, mask_decoder_input = batch

            _, past = model.encoder(input_ids=encoder_input, attention_mask=mask_encoder_input)

            mask = torch.cat([mask_encoder_input, mask_decoder_input], dim=1)
            logits, _ = model.decoder(decoder_input, attention_mask=mask, past=list(past))

            out = logits[:, :-1].contiguous()
            target = decoder_input[:, 1:].contiguous()
            target_mask = mask_decoder_input[:, 1:].contiguous()

            loss = util.sequence_cross_entropy_with_logits(out, target, target_mask, average="token")
            perplexity += np.exp(loss.item())
            batch_count += 1

    print(f'test perplexity: {perplexity / batch_count}')
Example #17
    def _forward_loop(self,
                      state: Dict[str, torch.Tensor],
                      targets: Dict[str, torch.Tensor],
                      labels: torch.Tensor) -> Dict[str, torch.Tensor]:
        """Compute loss using greedy decoding."""
        batch_size = state['input_mask'].shape[0]
        target_tokens = targets['tokens']
        num_decoding_steps = target_tokens.shape[1] - 1

        # Greedy decoding phase
        output_logit_list = []
        attention_logit_list = []
        select_idx_list = []
        for timestep in range(num_decoding_steps):
            # Feed target sequence as input
            decoder_input = target_tokens[:, timestep]
            output_logits, attention_logits, state = self._prepare_output_projections(decoder_input, state)
            # Store output and attention logits
            output_logit_list.append(output_logits.unsqueeze(1))
            attention_logit_list.append(attention_logits.unsqueeze(1))

        # Compute reconstruction loss
        output_logit_tensor = torch.cat(output_logit_list, dim=1)
        relevant_target_tokens = target_tokens[:, 1:].contiguous()
        target_mask = util.get_text_field_mask(targets)[:, 1:].contiguous()
        reconstruction_loss = util.sequence_cross_entropy_with_logits(output_logit_tensor,
                                                                      relevant_target_tokens,
                                                                      target_mask)

        # Compute claim scoring loss. A loss is computed between **each** attention vector and the
        # true label. In order for that to work we need to:
        #   a. Tile the source labels (so that they are copied for each word)
        #   b. Mask out padding tokens - this requires taking the outer-product of the target mask
        #       and the claim mask
        attention_logit_tensor = torch.cat(attention_logit_list, dim=1)
        claim_level_mask = (state['claim_mask'].sum(-1) > 0).long()
        attention_mask = target_mask.unsqueeze(-1) * claim_level_mask.unsqueeze(1)
        labels = labels.unsqueeze(1).repeat(1, num_decoding_steps, 1).float()
        claim_scoring_loss = F.binary_cross_entropy_with_logits(attention_logit_tensor, labels, reduction='none')
        claim_scoring_loss *= attention_mask.float()  # Apply mask

        # We want to apply 'batch' reduction (as is done in `sequence_cross_entropy_with_logits`),
        # which entails averaging over each dimension.
        denom = attention_mask
        for i in range(3):
            denom = denom.sum(-1)
            claim_scoring_loss =  claim_scoring_loss.sum(-1) / (denom.float() + 1e-13)
            denom = (denom > 0)

        total_loss = reconstruction_loss + self.beta * claim_scoring_loss

        # Update metrics
        self.avg_reconstruction_loss(reconstruction_loss)
        self.avg_claim_scoring_loss(claim_scoring_loss)

        output_dict =  {
            "loss": total_loss,
            "reconstruction_loss": reconstruction_loss,
            "claim_scoring_loss": claim_scoring_loss,
            "attention_logits": attention_logit_tensor
        }

        return output_dict
Example #18
    def forward(
            self,  # type: ignore
            label_indices: torch.LongTensor,
            token_representations: torch.FloatTensor = None,
            raw_tokens: List[List[str]] = None,
            labels: torch.LongTensor = None,
            **kwargs) -> Dict[str, torch.Tensor]:
        """
        If ``token_representations`` is provided, ``tokens`` is not required. If
        ``token_representations`` is ``None``, then ``tokens`` is required.

        Parameters
        ----------
        label_indices : torch.LongTensor
            A LongTensor of shape (batch_size, max_num_adpositions) with the indices of the tokens
            to predict labels for in each element (sentence) of the batch.
        token_representations : torch.FloatTensor, optional (default = None)
            A tensor of shape (batch_size, sequence_length, representation_dim) with
            the representation of the first token. If None, we use a contextualizer
            within this model to produce the token representation.
        raw_tokens : List[List[str]], optional (default = None)
            A batch of lists with the raw token strings. Used to compute
            token_representations, if it is None.
        labels : torch.LongTensor, optional (default = None)
            A torch tensor representing the sequence of integer gold class labels
            of shape ``(batch_size, num_label_indices)``.

        Returns
        -------
        An output dictionary consisting of:
        logits : torch.FloatTensor
            A tensor of shape ``(batch_size, num_label_indices,
            num_classes)`` representing unnormalized log probabilities
            of the classes.
        class_probabilities : torch.FloatTensor
            A tensor of shape ``(batch_size, num_label_indices,
            num_classes)`` representing a distribution of the tag classes.
        loss : torch.FloatTensor, optional
            A scalar loss to be optimized.
        """
        # Convert to LongTensor
        # TODO: add PR to ArrayField to preserve array types.
        label_indices = label_indices.long()
        if token_representations is None:
            if self._contextualizer is None:
                raise ConfigurationError(
                    "token_representation not provided as input to the model, and no "
                    "contextualizer was specified. Either add a contextualizer to your "
                    "dataset reader (preferred if your contextualizer is frozen) or to "
                    "this model (if you wish to train your contextualizer).")
            if raw_tokens is None:
                raise ValueError(
                    "Input raw_tokens is ``None`` --- make sure to set "
                    "include_raw_tokens in the DatasetReader to True.")
            if label_indices is None:
                raise ValueError("Did not recieve any token indices, needed "
                                 "if the contextualizer is within the model.")
            # Convert contextualizer output into a tensor
            # Shape: (batch_size, max_seq_len, representation_dim)
            token_representations, _ = pad_contextualizer_output(
                self._contextualizer(raw_tokens))

        # Move token representation to the same device as the
        # module (CPU or CUDA). TODO(nfliu): This only works if the module
        # is on one device.
        device = next(self._decoder._linear_layers[0].parameters()).device
        token_representations = token_representations.to(device)
        text_mask = get_text_mask_from_representations(token_representations)
        text_mask = text_mask.to(device)
        label_mask = self._get_label_mask_from_label_indices(label_indices)
        label_mask = label_mask.to(device)

        # Mask out the -1 padding in the label_indices, since that doesn't
        # work with indexing. Note that we can't 0 pad because 0 is actually
        # a valid label index, so we pad with -1 just for the purposes of
        # proper mask calculation and then convert to 0-padding by applying
        # the mask.
        label_indices = label_indices * label_mask

        # Encode the token representation.
        encoded_token_representations = self._encoder(token_representations,
                                                      text_mask)

        batch_size = label_indices.size(0)
        # Index into the encoded_token_representations to get tensors corresponding
        # to the representations of the tokens to predict labels for.
        # Shape: (batch_size, num_label_indices, representation_dim)
        range_vector = get_range_vector(
            batch_size, get_device_of(label_indices)).unsqueeze(1)
        selected_token_representations = encoded_token_representations[
            range_vector, label_indices]
        selected_token_representations = selected_token_representations.contiguous()

        # Decode out a label from the token representation
        # Shape: (batch_size, num_label_indices, num_classes)
        logits = self._decoder(selected_token_representations)
        class_probabilities = F.softmax(logits, dim=-1)
        output_dict = {
            "logits": logits,
            "class_probabilities": class_probabilities
        }
        if labels is not None:
            loss = sequence_cross_entropy_with_logits(
                logits, labels, label_mask, average=self.loss_average)
            for name, metric in self.metrics.items():
                # When not running in error analysis mode, skip
                # metrics that start with "_"
                if not self.error_analysis and name.startswith("_"):
                    continue
                metric(logits, labels, label_mask.float())
            output_dict["loss"] = loss
        return output_dict
Example #19
File: srl_bert.py  Project: wjn922/allennlp
    def forward(
            self,  # type: ignore
            tokens: Dict[str, torch.Tensor],
            verb_indicator: torch.Tensor,
            metadata: List[Any],
            tags: torch.LongTensor = None):
        # pylint: disable=arguments-differ
        """
        Parameters
        ----------
        tokens : Dict[str, torch.LongTensor], required
            The output of ``TextField.as_array()``, which should typically be passed directly to a
            ``TextFieldEmbedder``. For this model, this must be a `SingleIdTokenIndexer` which
            indexes wordpieces from the BERT vocabulary.
        verb_indicator: torch.LongTensor, required.
            An integer ``SequenceFeatureField`` representation of the position of the verb
            in the sentence. This should have shape (batch_size, num_tokens) and importantly, can be
            all zeros, in the case that the sentence has no verbal predicate.
        tags : torch.LongTensor, optional (default = None)
            A torch tensor representing the sequence of integer gold class labels
            of shape ``(batch_size, num_tokens)``
        metadata : ``List[Dict[str, Any]]``, optional, (default = None)
            metadata containing the original words in the sentence, the verb to compute the
            frame for, and start offsets for converting wordpieces back to a sequence of words,
            under 'words', 'verb' and 'offsets' keys, respectively.

        Returns
        -------
        An output dictionary consisting of:
        logits : torch.FloatTensor
            A tensor of shape ``(batch_size, num_tokens, tag_vocab_size)`` representing
            unnormalised log probabilities of the tag classes.
        class_probabilities : torch.FloatTensor
            A tensor of shape ``(batch_size, num_tokens, tag_vocab_size)`` representing
            a distribution of the tag classes per word.
        loss : torch.FloatTensor, optional
            A scalar loss to be optimised.
        """
        mask = get_text_field_mask(tokens)
        bert_embeddings, _ = self.bert_model(input_ids=tokens["tokens"],
                                             token_type_ids=verb_indicator,
                                             attention_mask=mask,
                                             output_all_encoded_layers=False)

        embedded_text_input = self.embedding_dropout(bert_embeddings)
        batch_size, sequence_length, _ = embedded_text_input.size()
        logits = self.tag_projection_layer(embedded_text_input)

        reshaped_log_probs = logits.view(-1, self.num_classes)
        class_probabilities = F.softmax(reshaped_log_probs, dim=-1).view(
            [batch_size, sequence_length, self.num_classes])
        output_dict = {
            "logits": logits,
            "class_probabilities": class_probabilities
        }
        # We need to retain the mask in the output dictionary
        # so that we can crop the sequences to remove padding
        # when we do viterbi inference in self.decode.
        output_dict["mask"] = mask
        # We add in the offsets here so we can compute the un-wordpieced tags.
        words, verbs, offsets = zip(*[(x["words"], x["verb"], x["offsets"])
                                      for x in metadata])
        output_dict["words"] = list(words)
        output_dict["verb"] = list(verbs)
        output_dict["wordpiece_offsets"] = list(offsets)

        if tags is not None:
            loss = sequence_cross_entropy_with_logits(
                logits, tags, mask, label_smoothing=self._label_smoothing)
            if not self.ignore_span_metric and self.span_metric is not None and not self.training:
                batch_verb_indices = [
                    example_metadata["verb_index"]
                    for example_metadata in metadata
                ]
                batch_sentences = [
                    example_metadata["words"] for example_metadata in metadata
                ]
                # Get the BIO tags from decode()
                # TODO (nfliu): This is kind of a hack, consider splitting out part
                # of decode() to a separate function.
                batch_bio_predicted_tags = self.decode(output_dict).pop("tags")
                batch_conll_predicted_tags = [
                    convert_bio_tags_to_conll_format(tags)
                    for tags in batch_bio_predicted_tags
                ]
                batch_bio_gold_tags = [
                    example_metadata["gold_tags"]
                    for example_metadata in metadata
                ]
                batch_conll_gold_tags = [
                    convert_bio_tags_to_conll_format(tags)
                    for tags in batch_bio_gold_tags
                ]
                self.span_metric(batch_verb_indices, batch_sentences,
                                 batch_conll_predicted_tags,
                                 batch_conll_gold_tags)
            output_dict["loss"] = loss
        return output_dict
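
The example above delegates the masked, label-smoothed loss to sequence_cross_entropy_with_logits. Below is a minimal plain-PyTorch sketch of that computation, assuming dummy tensors and the usual label-smoothing scheme; the final batch averaging here is a simple mean rather than AllenNLP's exact rule, so treat it as an illustration, not the library implementation.

# Minimal sketch of a masked, optionally label-smoothed sequence cross entropy (plain PyTorch).
import torch
import torch.nn.functional as F

def masked_sequence_cross_entropy(logits: torch.Tensor,
                                  targets: torch.LongTensor,
                                  mask: torch.Tensor,
                                  label_smoothing: float = 0.0) -> torch.Tensor:
    # logits: (batch, seq_len, num_classes); targets and mask: (batch, seq_len)
    num_classes = logits.size(-1)
    log_probs = F.log_softmax(logits, dim=-1)
    if label_smoothing > 0.0:
        # Spread label_smoothing mass uniformly, keep the rest on the gold class.
        smoothed = torch.full_like(log_probs, label_smoothing / num_classes)
        smoothed.scatter_(-1, targets.unsqueeze(-1),
                          1.0 - label_smoothing + label_smoothing / num_classes)
        token_loss = -(smoothed * log_probs).sum(-1)
    else:
        token_loss = -log_probs.gather(-1, targets.unsqueeze(-1)).squeeze(-1)
    token_loss = token_loss * mask.float()
    # Average per sequence, then over the batch (AllenNLP's "batch" average also skips
    # fully padded rows; this simple mean does not).
    per_sequence = token_loss.sum(-1) / mask.float().sum(-1).clamp(min=1e-13)
    return per_sequence.mean()

# Example usage with random tensors:
logits = torch.randn(2, 5, 4)
targets = torch.randint(0, 4, (2, 5))
mask = torch.ones(2, 5)
print(masked_sequence_cross_entropy(logits, targets, mask, label_smoothing=0.1))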
예제 #20
0
    def forward(
        self,  # type: ignore
        tokens: TextFieldTensors,
        tags: torch.LongTensor = None,
        metadata: List[Dict[str, Any]] = None,
        ignore_loss_on_o_tags: bool = False,
    ) -> Dict[str, torch.Tensor]:
        """
        # Parameters

        tokens : `TextFieldTensors`, required
            The output of `TextField.as_array()`, which should typically be passed directly to a
            `TextFieldEmbedder`. This output is a dictionary mapping keys to `TokenIndexer`
            tensors.  At its most basic, using a `SingleIdTokenIndexer` this is : `{"tokens":
            Tensor(batch_size, num_tokens)}`. This dictionary will have the same keys as were used
            for the `TokenIndexers` when you created the `TextField` representing your
            sequence.  The dictionary is designed to be passed directly to a `TextFieldEmbedder`,
            which knows how to combine different word representations into a single vector per
            token in your input.
        tags : `torch.LongTensor`, optional (default = `None`)
            A torch tensor representing the sequence of integer gold class labels of shape
            `(batch_size, num_tokens)`.
        metadata : `List[Dict[str, Any]]`, optional, (default = `None`)
            metadata containing the original words in the sentence to be tagged under a 'words' key.
        ignore_loss_on_o_tags : `bool`, optional (default = `False`)
            If True, we compute the loss only for actual spans in `tags`, and not on `O` tokens.
            This is useful for computing gradients of the loss on a _single span_, for
            interpretation / attacking.

        # Returns

        An output dictionary consisting of:
            - `logits` (`torch.FloatTensor`) :
                A tensor of shape `(batch_size, num_tokens, tag_vocab_size)` representing
                unnormalised log probabilities of the tag classes.
            - `class_probabilities` (`torch.FloatTensor`) :
                A tensor of shape `(batch_size, num_tokens, tag_vocab_size)` representing
                a distribution of the tag classes per word.
            - `loss` (`torch.FloatTensor`, optional) :
                A scalar loss to be optimised.

        """
        embedded_text_input = self.text_field_embedder(tokens)
        batch_size, sequence_length, _ = embedded_text_input.size()
        mask = get_text_field_mask(tokens)
        encoded_text = self.encoder(embedded_text_input, mask)

        logits = self.tag_projection_layer(encoded_text)
        reshaped_log_probs = logits.view(-1, self.num_classes)
        class_probabilities = F.softmax(reshaped_log_probs, dim=-1).view(
            [batch_size, sequence_length, self.num_classes])

        output_dict = {
            "logits": logits,
            "class_probabilities": class_probabilities
        }

        if tags is not None:
            if ignore_loss_on_o_tags:
                o_tag_index = self.vocab.get_token_index(
                    "O", namespace=self.label_namespace)
                tag_mask = mask & (tags != o_tag_index)
            else:
                tag_mask = mask
            loss = sequence_cross_entropy_with_logits(logits, tags, tag_mask)
            for metric in self.metrics.values():
                metric(logits, tags, mask)
            if self.calculate_span_f1:
                self._f1_metric(logits, tags, mask)
            output_dict["loss"] = loss

        if metadata is not None:
            output_dict["words"] = [x["words"] for x in metadata]
        return output_dict
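
A tiny illustration of the ignore_loss_on_o_tags masking above: the padding mask is combined with a mask that zeroes out positions whose gold tag is "O", so the loss and its gradients only flow through real spans. The tensors and the "O" index below are made up.

import torch

mask = torch.tensor([[1, 1, 1, 1, 0]], dtype=torch.bool)   # padding mask
tags = torch.tensor([[2, 0, 0, 3, 0]])                     # suppose index 0 is the "O" tag
o_tag_index = 0

tag_mask = mask & (tags != o_tag_index)
print(tag_mask)  # tensor([[ True, False, False,  True, False]])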
예제 #21
0
    def forward(self,  # type: ignore
                tokens: Dict[str, torch.LongTensor],
                spans: torch.LongTensor,
                metadata: List[Dict[str, Any]],
                pos_tags: Dict[str, torch.LongTensor] = None,
                span_labels: torch.LongTensor = None) -> Dict[str, torch.Tensor]:
        # pylint: disable=arguments-differ
        """
        Parameters
        ----------
        tokens : Dict[str, torch.LongTensor], required
            The output of ``TextField.as_array()``, which should typically be passed directly to a
            ``TextFieldEmbedder``. This output is a dictionary mapping keys to ``TokenIndexer``
            tensors.  At its most basic, using a ``SingleIdTokenIndexer`` this is: ``{"tokens":
            Tensor(batch_size, num_tokens)}``. This dictionary will have the same keys as were used
            for the ``TokenIndexers`` when you created the ``TextField`` representing your
            sequence.  The dictionary is designed to be passed directly to a ``TextFieldEmbedder``,
            which knows how to combine different word representations into a single vector per
            token in your input.
        spans : ``torch.LongTensor``, required.
            A tensor of shape ``(batch_size, num_spans, 2)`` representing the
            inclusive start and end indices of all possible spans in the sentence.
        metadata : List[Dict[str, Any]], required.
            A dictionary of metadata for each batch element which has keys:
                tokens : ``List[str]``, required.
                    The original string tokens in the sentence.
                gold_tree : ``nltk.Tree``, optional (default = None)
                    Gold NLTK trees for use in evaluation.
                pos_tags : ``List[str]``, optional.
                    The POS tags for the sentence. These can be used in the
                    model as embedded features, but they are passed here
                    in addition for use in constructing the tree.
        pos_tags : ``torch.LongTensor``, optional (default = None)
            The output of a ``SequenceLabelField`` containing POS tags.
        span_labels : ``torch.LongTensor``, optional (default = None)
            A torch tensor representing the integer gold class labels for all possible
            spans, of shape ``(batch_size, num_spans)``.

        Returns
        -------
        An output dictionary consisting of:
        class_probabilities : ``torch.FloatTensor``
            A tensor of shape ``(batch_size, num_spans, span_label_vocab_size)``
            representing a distribution over the label classes per span.
        spans : ``torch.LongTensor``
            The original spans tensor.
        tokens : ``List[List[str]]``, required.
            A list of tokens in the sentence for each element in the batch.
        pos_tags : ``List[List[str]]``, required.
            A list of POS tags in the sentence for each element in the batch.
        num_spans : ``torch.LongTensor``, required.
            A tensor of shape (batch_size), representing the lengths of non-padded spans
            in ``enumerated_spans``.
        loss : ``torch.FloatTensor``, optional
            A scalar loss to be optimised.
        """
        embedded_text_input = self.text_field_embedder(tokens)
        if pos_tags is not None and self.pos_tag_embedding is not None:
            embedded_pos_tags = self.pos_tag_embedding(pos_tags)
            embedded_text_input = torch.cat([embedded_text_input, embedded_pos_tags], -1)
        elif self.pos_tag_embedding is not None:
            raise ConfigurationError("Model uses a POS embedding, but no POS tags were passed.")

        mask = get_text_field_mask(tokens)
        # Looking at the span start index is enough to know if
        # this is padding or not. Shape: (batch_size, num_spans)
        span_mask = (spans[:, :, 0] >= 0).squeeze(-1).long()
        if span_mask.dim() == 1:
            # This happens if you use batch_size 1 and encounter
            # a length 1 sentence in PTB, which do exist. -.-
            span_mask = span_mask.unsqueeze(-1)
        if span_labels is not None and span_labels.dim() == 1:
            span_labels = span_labels.unsqueeze(-1)

        num_spans = get_lengths_from_binary_sequence_mask(span_mask)

        encoded_text = self.encoder(embedded_text_input, mask)
        span_representations = self.span_extractor(encoded_text, spans, mask, span_mask)
        if self.feedforward_layer is not None:
            span_representations = self.feedforward_layer(span_representations)
        logits = self.tag_projection_layer(span_representations)
        class_probabilities = last_dim_softmax(logits, span_mask.unsqueeze(-1))

        output_dict = {
                "class_probabilities": class_probabilities,
                "spans": spans,
                "tokens": [meta["tokens"] for meta in metadata],
                "pos_tags": [meta.get("pos_tags") for meta in metadata],
                "num_spans": num_spans
        }
        if span_labels is not None:
            loss = sequence_cross_entropy_with_logits(logits, span_labels, span_mask)
            self.tag_accuracy(class_probabilities, span_labels, span_mask)
            output_dict["loss"] = loss

        # The evalb score is expensive to compute, so we only compute
        # it for the validation and test sets.
        batch_gold_trees = [meta.get("gold_tree") for meta in metadata]
        if all(batch_gold_trees) and self._evalb_score is not None and not self.training:
            gold_pos_tags: List[List[str]] = [list(zip(*tree.pos()))[1]
                                              for tree in batch_gold_trees]
            predicted_trees = self.construct_trees(class_probabilities.cpu().data,
                                                   spans.cpu().data,
                                                   num_spans.data,
                                                   output_dict["tokens"],
                                                   gold_pos_tags)
            self._evalb_score(predicted_trees, batch_gold_trees)

        return output_dict
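
A short sketch of how the span mask and num_spans in the example above are derived, assuming padded spans are encoded as (-1, -1) so that checking the start index is enough; the tensors are invented for illustration.

import torch

spans = torch.tensor([[[0, 1], [1, 2], [-1, -1]],
                      [[0, 0], [-1, -1], [-1, -1]]])   # (batch_size, num_spans, 2)
span_mask = (spans[:, :, 0] >= 0).long()               # (batch_size, num_spans)
num_spans = span_mask.sum(-1)                          # lengths of the non-padded span lists
print(span_mask)   # tensor([[1, 1, 0], [1, 0, 0]])
print(num_spans)   # tensor([2, 1])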
예제 #22
0
    def forward(
            self,  # type: ignore
            tokens: Dict[str, torch.LongTensor],
            labels: torch.LongTensor = None,
            d_tags: torch.LongTensor = None,
            metadata: List[Dict[str, Any]] = None) -> Dict[str, torch.Tensor]:
        # pylint: disable=arguments-differ
        """
        Parameters
        ----------
        tokens : Dict[str, torch.LongTensor], required
            The output of ``TextField.as_array()``, which should typically be passed directly to a
            ``TextFieldEmbedder``. This output is a dictionary mapping keys to ``TokenIndexer``
            tensors.  At its most basic, using a ``SingleIdTokenIndexer`` this is: ``{"tokens":
            Tensor(batch_size, num_tokens)}``. This dictionary will have the same keys as were used
            for the ``TokenIndexers`` when you created the ``TextField`` representing your
            sequence.  The dictionary is designed to be passed directly to a ``TextFieldEmbedder``,
            which knows how to combine different word representations into a single vector per
            token in your input.
        labels : torch.LongTensor, optional (default = None)
            A torch tensor representing the sequence of integer gold class labels of shape
            ``(batch_size, num_tokens)``.
        d_tags : torch.LongTensor, optional (default = None)
            A torch tensor representing the sequence of integer gold class labels of shape
            ``(batch_size, num_tokens)``.
        metadata : ``List[Dict[str, Any]]``, optional, (default = None)
            metadata containing the original words in the sentence to be tagged under a 'words' key.

        Returns
        -------
        An output dictionary consisting of:
        logits : torch.FloatTensor
            A tensor of shape ``(batch_size, num_tokens, tag_vocab_size)`` representing
            unnormalised log probabilities of the tag classes.
        class_probabilities : torch.FloatTensor
            A tensor of shape ``(batch_size, num_tokens, tag_vocab_size)`` representing
            a distribution of the tag classes per word.
        loss : torch.FloatTensor, optional
            A scalar loss to be optimised.

        """  # 下面就是整个翻译模型的算法架构.
        '''
        手动shuffle, 外卖的shuffle不好使,不知道为什么!!!!!!!!!!!!!!!!
        '''
        if metadata:
            import random

            chang = len(metadata)
            tmp = list(range((chang)))
            random.shuffle(tmp)
            print('\n' * 10)  # blank lines so the debug output below is easy to spot
            print(tmp, 'order after shuffling')
            # tokens['bert'].numpy()[1,2,3,4,0]    tokens['bert'][[2,3,4,0,1],:]
            tokens['bert'] = tokens['bert'][tmp, :]
            tokens['bert-offsets'] = tokens['bert-offsets'][tmp, :]
            tokens['mask'] = tokens['mask'][tmp, :]
            labels = labels[tmp, :]
            d_tags = d_tags[tmp, :]
            import numpy as np
            metadata = np.array(metadata)
            metadata = metadata[tmp]
# Shuffling done.

        encoded_text = self.text_field_embedder(
            tokens)  # Step 1: apply the pre-trained embeddings.
        batch_size, sequence_length, _ = encoded_text.size(
        )  # Input to the model, e.g. (9, 50, 768); each word is treated as one token.
        mask = get_text_field_mask(
            tokens)  # Positions padded up to the max length get 0, everything else gets 1.  torch.Size([9, 50])
        logits_labels = self.tag_labels_projection_layer(
            self.predictor_dropout(
                encoded_text))  # torch.Size([9, 50, 28]): a 28-way classification problem.
        logits_d = self.tag_detect_projection_layer(
            encoded_text)  # A 4-way classification problem.  torch.Size([9, 50, 4])

        class_probabilities_labels = F.softmax(
            logits_labels, dim=-1
        ).view(
            [batch_size, sequence_length, self.num_labels_classes]
        )  # class_probabilities_labels is the key output; the final prediction can be derived from it alone.
        import numpy as np

        # Why does this line raise an error? np.array(class_probabilities_labels)
        # Confidence fine-tuning (disabled): suppress label probabilities below a configured threshold.
        # from predict import confidence
        # tmp = confidence
        # with open('conf', ) as f:
        #     tmp = float(f.readlines()[0])
        # if tmp != 0:
        #     tmp2 = (class_probabilities_labels.numpy()[:, :, 1:] > tmp).astype(int)
        #     class_probabilities_labels[:, :, 1:] = torch.tensor(tmp2)
        class_probabilities_d = F.softmax(logits_d, dim=-1).view(
            [batch_size, sequence_length, self.num_detect_classes])
        error_probs = class_probabilities_d[:, :, self.incorr_index] * mask  # Losses at padding positions are meaningless, so mask them out.
        incorr_prob = torch.max(error_probs, dim=-1)[0]  # The sentence-level error probability is the largest token-level error probability.

        if self.confidence > 0:
            probability_change = [self.confidence
                                  ] + [0] * (self.num_labels_classes - 1)
            class_probabilities_labels += torch.FloatTensor(
                probability_change).repeat((batch_size, sequence_length, 1))

        output_dict = {
            "logits_labels": logits_labels,
            "logits_d_tags": logits_d,
            "class_probabilities_labels": class_probabilities_labels,
            "class_probabilities_d_tags": class_probabilities_d,
            "max_error_probability": incorr_prob
        }
        # The block below runs only during training, because the ground-truth labels exist only then; it is skipped at prediction time. (2020-07-08, 18:49)
        if labels is not None and d_tags is not None:  # sequence_cross_entropy_with_logits takes unnormalised logits directly (its docstring says not to softmax first); the normalised version is class_probabilities_labels.
            loss_labels = sequence_cross_entropy_with_logits(
                logits_labels,
                labels,
                mask,
                label_smoothing=self.label_smoothing
            )  # logits_labels are the 28-class scores, labels are the gold tags, and mask provides the weights for the cross entropy.
            from train_finetune_latest2 import vocabdir
            with open(vocabdir) as f:
                tmp3 = f.readlines()
            tmp3 = [i.strip('\n') for i in tmp3]
            tmp3 = np.array(tmp3)

            loss_d = sequence_cross_entropy_with_logits(
                logits_d, d_tags, mask)  # Same as above, for the detection head.
            for metric in self.metrics.values():
                metric(logits_labels, labels, mask.float())
                metric(logits_d, d_tags, mask.float())
            output_dict["loss"] = loss_labels + loss_d
            print('\n ------------------------------------------------\n' * 4)
            print('Print a few examples (only the last two, to keep the overhead low) and check whether the predictions match the gold tags:')
            print('Printing the argmax labels and their confidences.')
            allfenlei = torch.max(class_probabilities_labels,
                                  dim=-1)[1][-2:]  # Predicted class indices, used to generate the output tags.
            gailv = torch.max(class_probabilities_labels,
                              dim=-1)[0][-2:]  # The corresponding maximum probabilities.
            newlist = []
            for ii2 in range(len(allfenlei)):
                saveindex = [
                    i for i in range(len(allfenlei[ii2]))
                    if allfenlei[ii2][i] != 0
                ]
                newlist.append(gailv[ii2][saveindex])
            shuju = metadata[-2:]
            for jj in range(len(allfenlei)):
                print('Original sentence:', shuju[jj])
                tmp = [i for i in allfenlei[jj] if i != 0]
                print('Predicted transformations:', '\t'.join(tmp3[tmp]))
                print('Corresponding probabilities:')
                print(newlist[jj])
            print('------------ Evaluation for this epoch is done.')
            # The raw inputs live in metadata, so they are printed as well for comparison.

        if metadata is not None:
            output_dict["words"] = [x["words"] for x in metadata]
        return output_dict
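
The confidence bias near the middle of the example above can be illustrated in isolation: a constant is added to the probability of the first (keep/no-op) class, biasing inference towards leaving tokens unchanged. The shapes and the confidence value below are made up.

import torch

batch_size, sequence_length, num_labels_classes = 2, 4, 5
class_probabilities_labels = torch.softmax(
    torch.randn(batch_size, sequence_length, num_labels_classes), dim=-1)

confidence = 0.2
probability_change = [confidence] + [0] * (num_labels_classes - 1)
bias = torch.FloatTensor(probability_change).repeat((batch_size, sequence_length, 1))
biased = class_probabilities_labels + bias
print(biased.argmax(-1))  # predictions now favour class 0 unless another class is clearly better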
예제 #23
0
    def forward(
            self,  # type: ignore
            tokens: Dict[str, torch.LongTensor],
            tags: torch.LongTensor = None,
            domain: torch.LongTensor = None,
            intent: torch.LongTensor = None,
            metadata: List[Dict[str, Any]] = None,
            # pylint: disable=unused-argument
            **kwargs) -> Dict[str, torch.Tensor]:
        # pylint: disable=arguments-differ
        """
        Parameters
        ----------
        tokens : ``Dict[str, torch.LongTensor]``, required
            The output of ``TextField.as_array()``, which should typically be passed directly to a
            ``TextFieldEmbedder``. This output is a dictionary mapping keys to ``TokenIndexer``
            tensors.  At its most basic, using a ``SingleIdTokenIndexer`` this is: ``{"tokens":
            Tensor(batch_size, num_tokens)}``. This dictionary will have the same keys as were used
            for the ``TokenIndexers`` when you created the ``TextField`` representing your
            sequence.  The dictionary is designed to be passed directly to a ``TextFieldEmbedder``,
            which knows how to combine different word representations into a single vector per
            token in your input.
        tags : ``torch.LongTensor``, optional (default = ``None``)
            A torch tensor representing the sequence of integer gold class labels of shape
            ``(batch_size, num_tokens)``.
        metadata : ``List[Dict[str, Any]]``, optional, (default = None)
            metadata containing the original words in the sentence to be tagged under a 'words' key.

        Returns
        -------
        An output dictionary consisting of:

        logits : ``torch.FloatTensor``
            The logits that are the output of the ``tag_projection_layer``
        mask : ``torch.LongTensor``
            The text field mask for the input tokens
        tags : ``List[List[int]]``
            The predicted tags using the Viterbi algorithm.
        loss : ``torch.FloatTensor``, optional
            A scalar loss to be optimised. Only computed if gold label ``tags`` are provided.
        """
        embedded_text_input = self.text_field_embedder(tokens)
        mask = util.get_text_field_mask(tokens)

        if self.dropout:
            embedded_text_input = self.dropout(embedded_text_input)

        encoded_text = self.encoder(embedded_text_input, mask)

        if self.dropout:
            encoded_text = self.dropout(encoded_text)

        if self._feedforward is not None:
            encoded_summary = self._feedforward(
                util.get_final_encoder_states(encoded_text, mask,
                                              self.encoder.is_bidirectional()))
        else:
            encoded_summary = util.get_final_encoder_states(
                encoded_text, mask, self.encoder.is_bidirectional())

        tag_logits = self.tag_projection_layer(encoded_text)
        if self.crf:
            best_paths = self.crf.viterbi_tags(tag_logits, mask)
            # Just get the tags and ignore the score.
            predicted_tags = [x for x, y in best_paths]
        else:
            predicted_tags = self.get_predicted_tags(tag_logits)

        domain_logits = self.domain_projection_layer(encoded_summary)
        domain_probs = F.softmax(domain_logits, dim=-1)

        intent_logits = self.intent_projection_layer(encoded_summary)
        intent_probs = F.softmax(intent_logits, dim=-1)

        output = {
            "tag_logits": tag_logits,
            "mask": mask,
            "tags": predicted_tags,
            "domain_probs": domain_probs,
            "intent_probs": intent_probs
        }

        if tags is not None:
            if self.crf:
                # Add negative log-likelihood as loss
                log_likelihood = self.crf(tag_logits, tags, mask)
                output["loss"] = -log_likelihood

                # Represent viterbi tags as "class probabilities" that we can
                # feed into the metrics
                class_probabilities = tag_logits * 0.
                for i, instance_tags in enumerate(predicted_tags):
                    for j, tag_id in enumerate(instance_tags):
                        class_probabilities[i, j, tag_id] = 1
            else:
                loss = sequence_cross_entropy_with_logits(
                    tag_logits, tags, mask)
                class_probabilities = tag_logits
                output["loss"] = loss

            # self.metrics['tag_acc'](class_probabilities, tags, mask.float())
            # if self.calculate_span_f1:
            #     self._f1_metric(class_probabilities, tags, mask.float())
        if domain is not None:
            output["loss"] += self.ce_loss(domain_logits, domain)
        if intent is not None:
            output["loss"] += self.ce_loss(intent_logits, intent)

        if metadata:
            output["words"] = [x["words"] for x in metadata]

        if tags is not None and metadata:
            self.decode(output)
            self._dai_f1_metric(output["dialog_act"],
                                [x["dialog_act"] for x in metadata])

        return output
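
A self-contained sketch of the trick above that turns Viterbi tag sequences into one-hot "class probabilities" so that accuracy/F1 metrics written for softmax outputs can be reused. The tag sequences here are invented.

import torch

tag_logits = torch.randn(2, 4, 3)                    # (batch, seq_len, num_tags)
predicted_tags = [[0, 2, 1, 1], [2, 0, 0, 1]]        # e.g. the output of crf.viterbi_tags

class_probabilities = tag_logits * 0.0
for i, instance_tags in enumerate(predicted_tags):
    for j, tag_id in enumerate(instance_tags):
        class_probabilities[i, j, tag_id] = 1.0
print(class_probabilities.argmax(-1))                # recovers the Viterbi tags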
예제 #24
0
    def _forward_loop(
        self,
        state: Dict[str, torch.Tensor],
        gold_mentions: torch.LongTensor,
        target_tokens: Dict[str, torch.LongTensor] = None
    ) -> Dict[str, torch.Tensor]:
        # shape: (batch_size, max_input_sequence_length)
        source_mask = state["source_mask"]

        # shape: (batch_size, max_input_sequence_length, embedding_dim)
        encoder_outputs = state['encoder_outputs']

        batch_size = source_mask.size()[0]

        max_input_sequence_length = source_mask.size()[1]

        # The next two steps zero-pad gold_mentions to (batch_size, max_input_sequence_length).
        gold_mentions_expanded = torch.zeros(
            batch_size, max_input_sequence_length).cuda(self.cuda_device)
        gold_mentions_expanded[:, :gold_mentions.size()[1]] = gold_mentions

        # get_text_field_mask marks valid positions with 1 and padding with 0.
        # shape: (batch_size, max_input_sequence_length)
        mention_mask = util.get_text_field_mask(
            {'gold_mentions': gold_mentions_expanded})

        for b in range(batch_size):
            encoder_output = encoder_outputs[b]
            gold_mention = gold_mentions_expanded[b]
            # Select the encoder outputs at the mention positions; the remainder is filled with the output at position 0.
            # E.g. if gold_mention = [3, 5, 0, 0], pick the outputs at positions 3 and 5 and pad the rest with the position-0 output.
            encoder_selected = torch.index_select(encoder_output, 0,
                                                  gold_mention.long())

            if b == 0:
                encoder_resorted = encoder_selected.unsqueeze(0)
            else:
                encoder_resorted = torch.cat(
                    (encoder_resorted, encoder_selected.unsqueeze(0)), 0)

        # Run the decoder over the reordered encoder outputs.
        # shape: (batch_size, max_sentence_length, num_classes)
        decoder_outputs = self._decode(encoder_resorted, mention_mask)

        # Compute the outputs one token at a time.
        token_logits = []
        token_predictions = []
        token_class_probs = []
        for i in range(max_input_sequence_length):
            encoder_slice = encoder_resorted[:, i, :]

            decoder_hidden = decoder_outputs[:, i, :]

            # source_mask_slice = source_mask[:, i].float()

            # TODO: the decoder hidden state should be concatenated with h_encoder_t.
            encoder_weights = self._attention(decoder_hidden, encoder_outputs,
                                              source_mask.float())

            # Weighted sum of the encoder outputs.
            # shape: (batch_size, hidden_dim)
            attended_output = util.weighted_sum(encoder_outputs,
                                                encoder_weights)

            # shape: (batch_size, hidden_dim * 3)
            hidden_attention_cat = torch.cat(
                (decoder_hidden, attended_output, encoder_slice), -1)

            # shape: (batch_size, num_classes)
            score = self._output_projection_layer(hidden_attention_cat)

            token_logits.append(score.unsqueeze(1))

            class_probabilities = F.softmax(score, dim=-1)

            token_class_probs.append(class_probabilities.unsqueeze(1))

            # shape (predicted_classes): (batch_size,)
            _, predicted_classes = torch.max(class_probabilities, 1)

            last_predictions = predicted_classes

            token_predictions.append(last_predictions.unsqueeze(1))

        predictions = torch.cat(token_predictions, 1)
        class_probs = torch.cat(token_class_probs, 1)
        # Trim anything that exceeds the target length.
        output_dict = {
            'predictions': predictions,
            'class_probs': class_probs.detach()
        }

        if target_tokens:

            targets = target_tokens['tokens']
            target_length = targets.size()[1]

            # The steps below mainly do trimming: the outputs have shape (batch_size, max_sentence_length, num_classes)
            # while the targets have shape (batch_size, max_target_length), and the two lengths differ.
            predictions_slice = predictions[:, :target_length]
            class_probs_slice = class_probs[:, :target_length, :]
            output_dict['predictions'] = predictions_slice
            output_dict['class_probs'] = class_probs_slice

            target_length = targets.size()[1]
            logits = torch.cat(token_logits, 1)
            # Trim anything that exceeds the target length.
            logits_slice = logits[:, :target_length, :].contiguous()
            targets = targets.contiguous()
            mention_mask = mention_mask[:, :target_length].contiguous()
            loss = util.sequence_cross_entropy_with_logits(
                logits_slice.float(), targets, mention_mask.float())
            output_dict['loss'] = loss
            output_dict['logits'] = logits_slice
            output_dict['mention_mask'] = mention_mask

        return output_dict
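
A condensed sketch of the trimming performed above before the loss: the per-step decoder logits cover the whole input length and are cut down to the target length, then a masked cross entropy is computed. All tensors are dummies, and plain torch.nn.functional.cross_entropy stands in for the AllenNLP utility.

import torch
import torch.nn.functional as F

logits = torch.randn(2, 7, 5)                 # decoder scores for every input position
targets = torch.randint(0, 5, (2, 4))         # gold labels, shorter than the input
mention_mask = torch.ones(2, 7)

target_length = targets.size(1)
logits_slice = logits[:, :target_length, :].contiguous()
mask_slice = mention_mask[:, :target_length].contiguous()

# Per-token cross entropy, masked and averaged over the valid positions.
token_loss = F.cross_entropy(logits_slice.transpose(1, 2), targets, reduction="none")
loss = (token_loss * mask_slice).sum() / mask_slice.sum()
print(loss)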
예제 #25
0
    def forward(
            self,
            tokens: Dict[str, torch.LongTensor],
            tags: torch.LongTensor = None,
            relation_root_idxs: torch.LongTensor = None,
            relations: torch.LongTensor = None,
            binary_coref: torch.FloatTensor = None,
            spacy_patterns: torch.FloatTensor = None,
            coarse_tags: torch.LongTensor = None,
            modifier_tags: torch.LongTensor = None,
            metadata: List[Dict[str, Any]] = None) -> Dict[str, torch.Tensor]:
        # pylint: disable=arguments-differ,no-member
        """
        Parameters
        ----------
        tokens : Dict[str, torch.LongTensor], required
            The output of ``TextField.as_array()``, which should typically be passed directly to a
            ``TextFieldEmbedder``. This output is a dictionary mapping keys to ``TokenIndexer``
            tensors.  At its most basic, using a ``SingleIdTokenIndexer`` this is: ``{"tokens":
            Tensor(batch_size, num_tokens)}``. This dictionary will have the same keys as were used
            for the ``TokenIndexers`` when you created the ``TextField`` representing your
            sequence.  The dictionary is designed to be passed directly to a ``TextFieldEmbedder``,
            which knows how to combine different word representations into a single vector per
            token in your input.
        tags : torch.LongTensor
            An integer tensor containing the gold ner tag label indexes.
        relation_root_idxs : torch.LongTensor, optional (default = None)
            An integer tensor containing the gold relation head indexes for training.
        relations : torch.LongTensor, optional (default = None)
            An integer tensor containing the gold relation label indexes for training.
        metadata : ``List[Dict[str, Any]]``, optional, (default = None)
            Additional information such as the original words and the entity ids.

        Returns
        -------
        An output dictionary consisting of:
        logits : torch.FloatTensor
            A tensor of shape ``(batch_size, num_tokens, tag_vocab_size)`` representing
            unnormalised log probabilities of the tag classes.
        class_probabilities : torch.FloatTensor
            A tensor of shape ``(batch_size, num_tokens, tag_vocab_size)`` representing
            a distribution of the tag classes per word.
        loss : torch.FloatTensor, optional
            A scalar loss to be optimised.
        """
        embedded_text_input = self.text_field_embedder(tokens)
        batch_size, sequence_length, _ = embedded_text_input.size()
        mask = get_text_field_mask(tokens)

        encoder_input_tensors = [embedded_text_input]
        if binary_coref is not None:
            encoder_input_tensors.append(binary_coref.unsqueeze(2))
        if spacy_patterns is not None:
            encoder_input_tensors.append(spacy_patterns.permute(0, 2, 1))
        if len(encoder_input_tensors) > 1:
            encoder_input = torch.cat(encoder_input_tensors, dim=2)
        else:
            encoder_input = encoder_input_tensors[0]

        # Shape: batch x seq_len x emb_dim
        encoded_text = self.encoder(encoder_input, mask)

        ner_logits = self.tag_projection_layer(encoded_text)
        best_ner_paths = self.crf.viterbi_tags(ner_logits, mask)

        # Just get the tags and ignore the score.
        predicted_ner_tags = []
        predicted_ner_tags_tensor = torch.zeros_like(mask)
        for ner_path, _ in best_ner_paths:
            batch_idx = len(predicted_ner_tags)
            predicted_ner_tags.append(ner_path)
            for token_idx, ner_tag_idx in enumerate(ner_path):
                predicted_ner_tags_tensor[batch_idx, token_idx] = ner_tag_idx
        # predicted_ner_tags = [x for x, y in best_ner_paths]

        output_dict = {
            "ner_logits": ner_logits,
            "mask": mask,
            "tags": predicted_ner_tags
        }

        if self._use_aux_ner_labels:
            coarse_logits = self._coarse_projection_layer(encoded_text)
            modifier_logits = self._modifier_projection_layer(encoded_text)

        if self.ner_tag_embedder is not None:
            embedded_tags = self.ner_tag_embedder(predicted_ner_tags_tensor)
            encoded_sequence = torch.cat([encoded_text, embedded_tags], dim=2)
        else:
            encoded_sequence = torch.cat([
                encoded_text, ner_logits,
                predicted_ner_tags_tensor.unsqueeze(2).float()
            ],
                                         dim=2)

        re_output = self.relation_scorer(encoded_sequence, mask,
                                         relation_root_idxs, relations)

        # Add a prefix for relation extraction logits
        output_dict['re_logits'] = re_output['logits']
        output_dict['relation_scores'] = re_output['relation_scores']

        if tags is not None:
            # Add negative log-likelihood as loss
            log_likelihood = self.crf(ner_logits, tags, mask)

            # It's not clear why, but pylint seems to think `log_likelihood` is tuple
            # (in fact, it's a torch.Tensor), so we need a disable.
            output_dict["ner_loss"] = -log_likelihood  # pylint: disable=invalid-unary-operand-type

            # Represent viterbi tags as "class probabilities" that we can
            # feed into the metrics
            class_probabilities = torch.zeros_like(ner_logits)
            for i, instance_tags in enumerate(predicted_ner_tags):
                for j, tag_id in enumerate(instance_tags):
                    class_probabilities[i, j, tag_id] = 1

            self.ner_accuracy(class_probabilities, tags, mask.float())
            self.ner_f1(class_probabilities, tags, mask.float())

            output_dict['loss'] = output_dict[
                'ner_loss'] + self._re_loss_weight * re_output['loss']

            if self._use_aux_ner_labels:
                assert coarse_tags is not None and modifier_tags is not None, 'Auxiliary losses require auxiliary input'
                self._coarse_acc(coarse_logits, coarse_tags, mask.float())
                self._modifier_acc(modifier_logits, modifier_tags,
                                   mask.float())
                coarse_loss = sequence_cross_entropy_with_logits(
                    coarse_logits, coarse_tags, mask)
                modifier_loss = sequence_cross_entropy_with_logits(
                    modifier_logits, modifier_tags, mask)
                output_dict['loss'] += self._aux_loss_weight * (coarse_loss +
                                                                modifier_loss)

        # Attach metadata
        if metadata is not None:
            for key in metadata[0]:
                output_dict[key] = [x[key] for x in metadata]

        return output_dict
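
A toy illustration of how the example above mixes the CRF negative log-likelihood with the weighted relation-extraction loss and the optional auxiliary tagging losses; the loss values and weights below are made up.

import torch

ner_loss = torch.tensor(2.3)        # -log_likelihood from the CRF
re_loss = torch.tensor(1.1)         # loss from the relation scorer
coarse_loss = torch.tensor(0.7)     # auxiliary coarse-tag loss
modifier_loss = torch.tensor(0.4)   # auxiliary modifier-tag loss

re_loss_weight = 1.0
aux_loss_weight = 0.5

loss = ner_loss + re_loss_weight * re_loss
loss = loss + aux_loss_weight * (coarse_loss + modifier_loss)
print(loss)   # tensor(3.9500)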
예제 #26
0
    def forward(
            self,  # type: ignore
            tokens: Dict[str, torch.LongTensor],
            verb_indicator: torch.LongTensor,
            target_index: torch.LongTensor,
            span_starts: torch.LongTensor,
            span_ends: torch.LongTensor,
            span_mask: torch.LongTensor,
            constituents: torch.LongTensor = None,
            tags: torch.LongTensor = None) -> Dict[str, torch.Tensor]:
        # pylint: disable=arguments-differ
        """
        Parameters
        ----------
        tokens : Dict[str, torch.LongTensor], required
            The output of ``TextField.as_array()``, which should typically be passed directly to a
            ``TextFieldEmbedder``. This output is a dictionary mapping keys to ``TokenIndexer``
            tensors.  At its most basic, using a ``SingleIdTokenIndexer`` this is: ``{"tokens":
            Tensor(batch_size, num_tokens)}``. This dictionary will have the same keys as were used
            for the ``TokenIndexers`` when you created the ``TextField`` representing your
            sequence.  The dictionary is designed to be passed directly to a ``TextFieldEmbedder``,
            which knows how to combine different word representations into a single vector per
            token in your input.
        verb_indicator: torch.LongTensor, required.
            An integer ``SequenceFeatureField`` representation of the position of the verb
            in the sentence. This should have shape (batch_size, num_tokens) and importantly, can be
            all zeros, in the case that the sentence has no verbal predicate.
        tags : torch.LongTensor, optional (default = None)
            A torch tensor representing the integer gold SRL labels for all candidate spans,
            of shape ``(batch_size, num_spans)``.
        constituents : torch.LongTensor, optional (default = None)
            A torch tensor representing the integer gold constituent labels for all candidate
            spans, of shape ``(batch_size, num_spans)``.
        span_starts : ``torch.LongTensor``, required.
            Inclusive span start indices of shape ``(batch_size, num_spans)``.
        span_ends : ``torch.LongTensor``, required.
            Inclusive span end indices of shape ``(batch_size, num_spans)``.

        Returns
        -------
        An output dictionary consisting of:
        logits : torch.FloatTensor
            A tensor of shape ``(batch_size, num_tokens, tag_vocab_size)`` representing
            unnormalised log probabilities of the tag classes.
        class_probabilities : torch.FloatTensor
            A tensor of shape ``(batch_size, num_tokens, tag_vocab_size)`` representing
            a distribution of the tag classes per word.
        loss : torch.FloatTensor, optional
            A scalar loss to be optimised.

        """
        self.batch += 1
        embedded_text_input = self.embedding_dropout(
            self.text_field_embedder(tokens))
        batch_size = embedded_text_input.size(0)
        text_mask = util.get_text_field_mask(tokens)
        embedded_verb_indicator = self.binary_feature_embedding(
            verb_indicator.long())
        # Concatenate the verb feature onto the embedded text. This now
        # has shape (batch_size, sequence_length, embedding_dim + binary_feature_dim).
        embedded_text_with_verb_indicator = torch.cat(
            [embedded_text_input, embedded_verb_indicator], -1)
        embedding_dim_with_binary_feature = embedded_text_with_verb_indicator.size(
        )[2]

        if self.stacked_encoder.get_input_dim(
        ) != embedding_dim_with_binary_feature:
            raise ConfigurationError(
                "The SRL model uses an indicator feature, which makes "
                "the embedding dimension one larger than the value "
                "specified. Therefore, the 'input_dim' of the stacked_encoder "
                "must be equal to total_embedding_dim + 1.")
        encoded_text = self.stacked_encoder(embedded_text_with_verb_indicator,
                                            text_mask)

        span_starts = F.relu(span_starts.float()).long().view(batch_size, -1)
        span_ends = F.relu(span_ends.float()).long().view(batch_size, -1)
        target_index = F.relu(target_index.float()).long().view(batch_size)
        # shape (batch_size, sequence_length * max_span_width, embedding_dim)
        span_embeddings = span_srl_util.compute_span_representations(
            self.max_span_width, encoded_text, target_index, span_starts,
            span_ends, self.span_width_embedding,
            self.span_direction_embedding, self.span_distance_embedding,
            self.span_distance_bin, self.head_scorer)
        span_scores = self.span_feedforward(span_embeddings)

        srl_logits = self.srl_arg_projection_layer(span_scores)
        constit_logits = self.constit_arg_projection_layer(span_scores)
        output_dict = {
            "srl_logits": srl_logits,
            "constit_logits": constit_logits,
            "mask": text_mask
        }

        tags = tags.view(batch_size, -1, self.max_span_width)
        constituents = constituents.view(batch_size, -1, self.max_span_width)

        # Viterbi decoding
        if not self.training or (self.training and not self.fast_mode):
            srl_prediction, srl_probabilities = self.semi_crf.viterbi_tags(
                srl_logits, text_mask)
            output_dict["srl_tags"] = srl_prediction
            output_dict["srl_tag_probabilities"] = srl_probabilities
            self.metrics["srl"](predictions=srl_prediction.view(
                batch_size, -1, self.max_span_width),
                                gold_labels=tags,
                                mask=text_mask)

            reshaped_constit_logits = constit_logits.view(
                -1, self.num_constit_tags)
            constit_probabilities = F.softmax(reshaped_constit_logits, dim=-1)
            constit_predictions = constit_probabilities.max(-1)[1]
            output_dict["constit_tags"] = constit_predictions
            output_dict["constit_probabilities"] = constit_probabilities

            constit_predictions = constit_predictions.view(
                batch_size, -1, self.max_span_width)
            self.metrics["constituents"](predictions=constit_predictions,
                                         gold_labels=constituents,
                                         mask=text_mask)

        # Loss computation
        if self.training or (not self.training and not self.fast_mode):
            if tags is not None:
                srl_log_likelihood, _ = self.semi_crf(srl_logits,
                                                      tags,
                                                      mask=text_mask)
                output_dict["srl_loss"] = -srl_log_likelihood
            if constituents is not None:
                # Flattening it out.
                constituents = constituents.view(batch_size, -1)
                constit_loss = util.sequence_cross_entropy_with_logits(
                    constit_logits, constituents, span_mask)
                output_dict["constit_loss"] = constit_loss
            if tags is not None and constituents is not None:
                if self.batch > self.cutoff_batch:
                    output_dict["loss"] = - srl_log_likelihood + self.mixing_ratio * \
                        constit_loss
                else:
                    output_dict["loss"] = -srl_log_likelihood
        if self.fast_mode and not self.training:
            output_dict["loss"] = Variable(torch.FloatTensor([0.00]))

        return output_dict
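
A small sketch of the F.relu trick above: padded span indices stored as -1 are clamped to 0 so they can safely be used as indices, with the span mask later removing their contribution to the loss. The values below are illustrative.

import torch
import torch.nn.functional as F

span_starts = torch.tensor([[2, 5, -1, -1]])               # -1 marks padded spans
clamped = F.relu(span_starts.float()).long().view(1, -1)   # negative indices become 0
print(clamped)   # tensor([[2, 5, 0, 0]])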
예제 #27
0
    def forward(self,  # type: ignore
                question: Dict[str, torch.LongTensor],
                passage: Dict[str, torch.LongTensor],
                answer_impossible: torch.LongTensor = None,
                span_start: torch.IntTensor = None,
                span_end: torch.IntTensor = None,
                metadata: List[Dict[str, Any]] = None) -> Dict[str, torch.Tensor]:
        # pylint: disable=arguments-differ
        """
        Parameters
        ----------
        question : Dict[str, torch.LongTensor]
            From a ``TextField``.
        passage : Dict[str, torch.LongTensor]
            From a ``TextField``.  The model assumes that this passage contains the answer to the
            question, and predicts the beginning and ending positions of the answer within the
            passage.
        span_start : ``torch.IntTensor``, optional
            From an ``IndexField``.  This is one of the things we are trying to predict - the
            beginning position of the answer with the passage.  This is an `inclusive` token index.
            If this is given, we will compute a loss that gets included in the output dictionary.
        span_end : ``torch.IntTensor``, optional
            From an ``IndexField``.  This is one of the things we are trying to predict - the
            ending position of the answer with the passage.  This is an `inclusive` token index.
            If this is given, we will compute a loss that gets included in the output dictionary.
        metadata : ``List[Dict[str, Any]]``, optional
            If present, this should contain the question ID, original passage text, and token
            offsets into the passage for each instance in the batch.  We use this for computing
            official metrics using the official SQuAD evaluation script.  The length of this list
            should be the batch size, and each dictionary should have the keys ``id``,
            ``original_passage``, and ``token_offsets``.  If you only want the best span string and
            don't care about official metrics, you can omit the ``id`` key.

        Returns
        -------
        An output dictionary consisting of:
        span_start_logits : torch.FloatTensor
            A tensor of shape ``(batch_size, passage_length)`` representing unnormalized log
            probabilities of the span start position.
        span_start_probs : torch.FloatTensor
            The result of ``softmax(span_start_logits)``.
        span_end_logits : torch.FloatTensor
            A tensor of shape ``(batch_size, passage_length)`` representing unnormalized log
            probabilities of the span end position (inclusive).
        span_end_probs : torch.FloatTensor
            The result of ``softmax(span_end_logits)``.
        best_span : torch.IntTensor
            The result of a constrained inference over ``span_start_logits`` and
            ``span_end_logits`` to find the most probable span.  Shape is ``(batch_size, 2)``
            and each offset is a token index.
        loss : torch.FloatTensor, optional
            A scalar loss to be optimised.
        best_span_str : List[str]
            If sufficient metadata was provided for the instances in the batch, we also return the
            string from the original passage that the model thinks is the best answer to the
            question.
        """
        embedded_question = self._highway_layer(self._text_field_embedder(question))
        embedded_passage = self._highway_layer(self._text_field_embedder(passage))
        batch_size = embedded_question.size(0)
        passage_length = embedded_passage.size(1)
        question_mask = util.get_text_field_mask(question).float()
        passage_mask = util.get_text_field_mask(passage).float()
        question_lstm_mask = question_mask if self._mask_lstms else None
        passage_lstm_mask = passage_mask if self._mask_lstms else None

        encoded_question = self._dropout(self._phrase_layer(embedded_question, question_lstm_mask))
        encoded_passage = self._dropout(self._phrase_layer(embedded_passage, passage_lstm_mask))
        encoding_dim = encoded_question.size(-1)

        # Shape: (batch_size, passage_length, question_length)
        passage_question_similarity = self._matrix_attention(encoded_passage, encoded_question)
        # Shape: (batch_size, passage_length, question_length)
        passage_question_attention = util.last_dim_softmax(passage_question_similarity, question_mask)
        # Shape: (batch_size, passage_length, encoding_dim)
        passage_question_vectors = util.weighted_sum(encoded_question, passage_question_attention)

        # We replace masked values with something really negative here, so they don't affect the
        # max below.
        masked_similarity = util.replace_masked_values(passage_question_similarity,
                                                       question_mask.unsqueeze(1),
                                                       -1e7)
        # Shape: (batch_size, passage_length)
        question_passage_similarity = masked_similarity.max(dim=-1)[0].squeeze(-1)
        # Shape: (batch_size, passage_length)
        question_passage_attention = util.masked_softmax(question_passage_similarity, passage_mask)
        # Shape: (batch_size, encoding_dim)
        question_passage_vector = util.weighted_sum(encoded_passage, question_passage_attention)
        # Shape: (batch_size, passage_length, encoding_dim)
        tiled_question_passage_vector = question_passage_vector.unsqueeze(1).expand(batch_size,
                                                                                    passage_length,
                                                                                    encoding_dim)

        # Shape: (batch_size, passage_length, encoding_dim * 4)
        final_merged_passage = torch.cat([encoded_passage,
                                          passage_question_vectors,
                                          encoded_passage * passage_question_vectors,
                                          encoded_passage * tiled_question_passage_vector],
                                         dim=-1)

        modeled_passage = self._dropout(self._modeling_layer(final_merged_passage, passage_lstm_mask))
        modeling_dim = modeled_passage.size(-1)

        # Shape: (batch_size, passage_length, encoding_dim * 4 + modeling_dim))
        span_start_input = self._dropout(torch.cat([final_merged_passage, modeled_passage], dim=-1))
        # Shape: (batch_size, passage_length)
        span_start_logits = self._span_start_predictor(span_start_input).squeeze(-1)
        # Shape: (batch_size, passage_length)
        span_start_probs = sigmoid(span_start_logits)

        # Shape: (batch_size, modeling_dim)
        span_start_representation = util.weighted_sum(modeled_passage, span_start_probs)
        # Shape: (batch_size, passage_length, modeling_dim)
        tiled_start_representation = span_start_representation.unsqueeze(1).expand(batch_size,
                                                                                   passage_length,
                                                                                   modeling_dim)

        # Shape: (batch_size, passage_length, encoding_dim * 4 + modeling_dim * 3)
        span_end_representation = torch.cat([final_merged_passage,
                                             modeled_passage,
                                             tiled_start_representation,
                                             modeled_passage * tiled_start_representation],
                                            dim=-1)
        # Shape: (batch_size, passage_length, encoding_dim)
        encoded_span_end = self._dropout(self._span_end_encoder(span_end_representation,
                                                                passage_lstm_mask))
        # Shape: (batch_size, passage_length, encoding_dim * 4 + span_end_encoding_dim)
        span_end_input = self._dropout(torch.cat([final_merged_passage, encoded_span_end], dim=-1))
        span_end_logits = self._span_end_predictor(span_end_input).squeeze(-1)

        span_start_logits = util.replace_masked_values(span_start_logits, passage_mask, -1e7)
        span_end_logits = util.replace_masked_values(span_end_logits, passage_mask, -1e7)
        span_start_probs = sigmoid(span_start_logits)
        span_end_probs = sigmoid(span_end_logits)
        best_span = self.get_best_span(span_start_probs, span_end_probs)

        output_dict = {
                "passage_question_attention": passage_question_attention,
                "span_start_logits": span_start_logits,
                "span_start_probs": span_start_probs,
                "span_end_logits": span_end_logits,
                "span_end_probs": span_end_probs,
                "best_span": best_span,
                }

        # Compute the loss for training.
        if answer_impossible is not None:
            target_start = torch.arange(0, span_start_logits.size(1),
                                        device=span_start_logits.device, dtype=torch.long)
            target_start = target_start.squeeze(0).expand(span_start_logits.size(0), -1) == span_start
            target_start = target_start.long() * (-1 * (answer_impossible - 1).unsqueeze(1).expand(-1, target_start.size(-1)))

            target_end = torch.arange(0, span_end_logits.size(1),
                                      device=span_end_logits.device, dtype=torch.long)
            target_end = target_end.squeeze(0).expand(span_end_logits.size(0), -1) == span_end
            target_end = target_end.long() * (-1 * (answer_impossible - 1).unsqueeze(1).expand(-1, target_start.size(-1)))

            span_start_logits_for_loss = torch.stack([-1 * span_start_logits, span_start_logits], dim=-1)
            loss = util.sequence_cross_entropy_with_logits(span_start_logits_for_loss, target_start, passage_mask)

            span_end_logits_for_loss = torch.stack([-1 * span_end_logits, span_end_logits], dim=-1)
            loss += util.sequence_cross_entropy_with_logits(span_end_logits_for_loss, target_end, passage_mask)

            self._span_start_accuracy((span_start_logits > 0).long(), target_start)
            self._span_end_accuracy((span_end_logits > 0).long(), target_end)
            self._answer_impossible_accuracy(
                ((best_span.narrow(1, 0, 1) == -1) * (best_span.narrow(1, 1, 1) == -1)).long(),
                answer_impossible)
            # self._span_accuracy(best_span, torch.stack([span_start, span_end], -1))
            output_dict["loss"] = loss

        # Compute the EM and F1 on SQuAD and add the tokenized input to the output.
        if metadata is not None:
            output_dict['best_span_str'] = []
            question_tokens = []
            passage_tokens = []
            for i in range(batch_size):
                question_tokens.append(metadata[i]['question_tokens'])
                passage_tokens.append(metadata[i]['passage_tokens'])
                passage_str = metadata[i]['original_passage']
                offsets = metadata[i]['token_offsets']
                predicted_span = tuple(best_span[i].detach().cpu().numpy())
                try:
                    if predicted_span[0] != -1:
                        start_offset = offsets[predicted_span[0]][0]
                    else:
                        start_offset = -1
                    if predicted_span[1] != -1:
                        end_offset = offsets[predicted_span[1]][1]
                    else:
                        end_offset = -1
                    if end_offset != -1 and start_offset != -1:
                        best_span_string = passage_str[start_offset:end_offset]
                    else:
                        best_span_string = ""
                    output_dict['best_span_str'].append(best_span_string)
                    answer_texts = metadata[i].get('answer_texts', [])
                    if answer_texts:
                        self._squad_metrics(best_span_string, answer_texts)
                except Exception as e:
                    print(str(e))
            output_dict['question_tokens'] = question_tokens
            output_dict['passage_tokens'] = passage_tokens
        return output_dict
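
A condensed sketch of the target construction in the loss block above: the gold start index becomes a per-token 0/1 sequence, zeroed out entirely when the question is unanswerable, and the single start logit is stacked into two columns so it fits a sequence cross entropy over two classes. All tensors below are dummies.

import torch

span_start_logits = torch.randn(2, 6)                       # (batch, passage_length)
span_start = torch.tensor([[2], [0]])                       # gold start indices
answer_impossible = torch.tensor([0, 1])                    # 1 = unanswerable

positions = torch.arange(span_start_logits.size(1)).expand(span_start_logits.size(0), -1)
target_start = (positions == span_start).long()             # one-hot over token positions
answerable = (1 - answer_impossible).unsqueeze(1).expand(-1, target_start.size(-1))
target_start = target_start * answerable                    # all zeros for impossible answers

# Two-column logits: column 1 is the "start here" score, column 0 its negation.
span_start_logits_for_loss = torch.stack([-span_start_logits, span_start_logits], dim=-1)
print(target_start)
print(span_start_logits_for_loss.shape)                     # torch.Size([2, 6, 2])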
예제 #28
0
    def forward(
        self,  # type: ignore
        tokens: TextFieldTensors,
        spans: torch.LongTensor,
        metadata: List[Dict[str, Any]],
        pos_tags: TextFieldTensors = None,
        span_labels: torch.LongTensor = None,
    ) -> Dict[str, torch.Tensor]:

        """
        # Parameters

        tokens : `TextFieldTensors`, required
            The output of `TextField.as_array()`, which should typically be passed directly to a
            `TextFieldEmbedder`. This output is a dictionary mapping keys to `TokenIndexer`
            tensors.  At its most basic, using a `SingleIdTokenIndexer` this is : `{"tokens":
            Tensor(batch_size, num_tokens)}`. This dictionary will have the same keys as were used
            for the `TokenIndexers` when you created the `TextField` representing your
            sequence.  The dictionary is designed to be passed directly to a `TextFieldEmbedder`,
            which knows how to combine different word representations into a single vector per
            token in your input.
        spans : `torch.LongTensor`, required.
            A tensor of shape `(batch_size, num_spans, 2)` representing the
            inclusive start and end indices of all possible spans in the sentence.
        metadata : `List[Dict[str, Any]]`, required.
            A dictionary of metadata for each batch element which has keys:
                tokens : `List[str]`, required.
                    The original string tokens in the sentence.
                gold_tree : `nltk.Tree`, optional (default = `None`)
                    Gold NLTK trees for use in evaluation.
                pos_tags : `List[str]`, optional.
                    The POS tags for the sentence. These can be used in the
                    model as embedded features, but they are passed here
                    in addition for use in constructing the tree.
        pos_tags : `torch.LongTensor`, optional (default = `None`)
            The output of a `SequenceLabelField` containing POS tags.
        span_labels : `torch.LongTensor`, optional (default = `None`)
            A torch tensor representing the integer gold class labels for all possible
            spans, of shape `(batch_size, num_spans)`.

        # Returns

        An output dictionary consisting of:

        class_probabilities : `torch.FloatTensor`
            A tensor of shape `(batch_size, num_spans, span_label_vocab_size)`
            representing a distribution over the label classes per span.
        spans : `torch.LongTensor`
            The original spans tensor.
        tokens : `List[List[str]]`, required.
            A list of tokens in the sentence for each element in the batch.
        pos_tags : `List[List[str]]`, required.
            A list of POS tags in the sentence for each element in the batch.
        num_spans : `torch.LongTensor`, required.
            A tensor of shape (batch_size), representing the lengths of non-padded spans
            in `enumerated_spans`.
        loss : `torch.FloatTensor`, optional
            A scalar loss to be optimised.
        """
        embedded_text_input = self.text_field_embedder(tokens)
        if pos_tags is not None and self.pos_tag_embedding is not None:
            embedded_pos_tags = self.pos_tag_embedding(pos_tags)
            embedded_text_input = torch.cat([embedded_text_input, embedded_pos_tags], -1)
        elif self.pos_tag_embedding is not None:
            raise ConfigurationError("Model uses a POS embedding, but no POS tags were passed.")

        mask = get_text_field_mask(tokens)
        # Looking at the span start index is enough to know if
        # this is padding or not. Shape: (batch_size, num_spans)
        span_mask = (spans[:, :, 0] >= 0).squeeze(-1)
        if span_mask.dim() == 1:
            # This happens if you use batch_size 1 and encounter
            # a length 1 sentence in PTB, which do exist. -.-
            span_mask = span_mask.unsqueeze(-1)
        if span_labels is not None and span_labels.dim() == 1:
            span_labels = span_labels.unsqueeze(-1)

        num_spans = get_lengths_from_binary_sequence_mask(span_mask)

        encoded_text = self.encoder(embedded_text_input, mask)

        span_representations = self.span_extractor(encoded_text, spans, mask, span_mask)

        if self.feedforward_layer is not None:
            span_representations = self.feedforward_layer(span_representations)

        logits = self.tag_projection_layer(span_representations)
        class_probabilities = masked_softmax(logits, span_mask.unsqueeze(-1))

        output_dict = {
            "class_probabilities": class_probabilities,
            "spans": spans,
            "tokens": [meta["tokens"] for meta in metadata],
            "pos_tags": [meta.get("pos_tags") for meta in metadata],
            "num_spans": num_spans,
        }
        if span_labels is not None:
            loss = sequence_cross_entropy_with_logits(logits, span_labels, span_mask)
            self.tag_accuracy(class_probabilities, span_labels, span_mask)
            output_dict["loss"] = loss

        # The evalb score is expensive to compute, so we only compute
        # it for the validation and test sets.
        batch_gold_trees = [meta.get("gold_tree") for meta in metadata]
        if all(batch_gold_trees) and self._evalb_score is not None and not self.training:
            gold_pos_tags: List[List[str]] = [
                list(zip(*tree.pos()))[1] for tree in batch_gold_trees
            ]
            predicted_trees = self.construct_trees(
                class_probabilities.cpu().data,
                spans.cpu().data,
                num_spans.data,
                output_dict["tokens"],
                gold_pos_tags,
            )
            self._evalb_score(predicted_trees, batch_gold_trees)

        return output_dict
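For reference, a minimal shape-only sketch of the span-classification loss call used above, with random tensors standing in for the model outputs (assumes `allennlp` is installed):

import torch
from allennlp.nn.util import sequence_cross_entropy_with_logits

batch_size, num_spans, num_labels = 2, 6, 4
logits = torch.randn(batch_size, num_spans, num_labels)       # span label scores
span_labels = torch.randint(0, num_labels, (batch_size, num_spans))
span_mask = torch.ones(batch_size, num_spans, dtype=torch.bool)
span_mask[1, 4:] = False                                       # pretend the last two spans are padding

loss = sequence_cross_entropy_with_logits(logits, span_labels, span_mask)
print(loss)                                                    # scalar, batch-averaged by default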
Example #29
0
File: model.py  Project: zzshou/mt-en2zh
    def forward(self,
                source_tokens,
                target_tokens=None) -> Dict[str, torch.Tensor]:
        inputs = source_tokens
        targets = target_tokens
        input_ids, input_mask = inputs["tokens"]["token_ids"], inputs["tokens"]["mask"]

        outputs = {}

        # If no targets are provided, fall back to the input with its first token dropped as the
        # target. BART shifts decoder inputs internally, but this model still needs explicit
        # targets for loss calculation.
        if targets is not None:
            target_ids, target_mask = targets["tokens"]["token_ids"], targets["tokens"]["mask"]
        else:
            target_ids = input_ids[:, 1:]
            target_mask = input_mask[:, 1:]

        if self.training: # training
            outputs = self.plm(input_ids=input_ids, attention_mask=input_mask,
                               decoder_input_ids=target_ids[:, :-1].contiguous(),
                               decoder_attention_mask=target_mask[:, :-1].contiguous(),
                               use_cache=False, return_dict=True)
            outputs["decoder_logits"] = outputs.logits
            outputs["loss"] = sequence_cross_entropy_with_logits(
                outputs.logits,
                cast(torch.LongTensor, target_ids[:, 1:].contiguous()),
                cast(torch.BoolTensor, target_mask[:, 1:].contiguous()),
                label_smoothing=0.1,
                average="token",
            )
        elif targets is not None: # validation
            outputs = self.plm(input_ids=input_ids, attention_mask=input_mask,
                               decoder_input_ids=target_ids[:, :-1].contiguous(),
                               decoder_attention_mask=target_mask[:, :-1].contiguous(),
                               use_cache=False, return_dict=True)
            outputs["decoder_logits"] = outputs.logits
            outputs["loss"] = sequence_cross_entropy_with_logits(
                outputs.logits,
                cast(torch.LongTensor, target_ids[:, 1:].contiguous()),
                cast(torch.BoolTensor, target_mask[:, 1:].contiguous()),
                label_smoothing=0.1,
            )
            self._rouge(torch.argmax(outputs.logits, -1), target_ids)
            self._bleu(torch.argmax(outputs.logits, -1), target_ids)
        else: #prediction
            # Use decoder start id and start of sentence to start decoder
            initial_decoder_ids = torch.tensor(
                [[self._decoder_start_id]],
                dtype=input_ids.dtype,
                device=input_ids.device,
            ).repeat(input_ids.shape[0], 1)

            initial_state = {
                "input_ids": input_ids,
                "input_mask": input_mask,
            }
            beam_result = self._beam_search.search(
                initial_decoder_ids, initial_state, self.take_step
            )

            predictions = beam_result[0]
            logger.info(beam_result)

            max_pred_indices = (
                beam_result[1].argmax(dim=-1).view(-1, 1, 1).expand(-1, -1, predictions.shape[-1])
            )
            predictions = predictions.gather(dim=1, index=max_pred_indices).squeeze(dim=1)

            self._rouge(predictions, target_ids)
            self._bleu(predictions, target_ids)

            outputs["predictions"] = predictions
            outputs["log_probabilities"] = (
                beam_result[1].gather(dim=-1, index=max_pred_indices[..., 0]).squeeze(dim=-1)
            )

            self.make_output_human_readable(outputs)

        return outputs
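The slicing above is standard teacher forcing: the decoder consumes tokens 0..T-2 and is scored against tokens 1..T-1. A minimal sketch of that alignment with dummy tensors in place of BART's outputs (assumes `allennlp` is installed):

import torch
from allennlp.nn.util import sequence_cross_entropy_with_logits

vocab_size = 10
target_ids = torch.tensor([[2, 5, 7, 3, 0]])                   # <s> w1 w2 </s> <pad>
target_mask = torch.tensor([[True, True, True, True, False]])

# The decoder would consume target_ids[:, :-1] and emit one logit row per input position.
decoder_logits = torch.randn(1, target_ids.size(1) - 1, vocab_size)

loss = sequence_cross_entropy_with_logits(
    decoder_logits,
    target_ids[:, 1:].contiguous(),        # score each position against the *next* token
    target_mask[:, 1:].contiguous(),
    label_smoothing=0.1,
    average="token",
)
print(loss)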
Example #30
0
    def forward(
            self,
            token_sequence: Dict[str, torch.Tensor],
            label_sequence: torch.Tensor = None) -> Dict[str, torch.Tensor]:

        mask = get_text_field_mask(token_sequence)

        if 'mixture' in self.config.embedding_strategy:
            word2vec_embedder = self.embedders[1]

            # Note: custom embedders that don't produce 256-dim vectors need special handling here.
            if any(strat in self.config.embedding_strategy
                   for strat in ['elmo_original', 'elmo_pubmed']):
                word2vec_embeddings = word2vec_embedder(
                    {'tokens': token_sequence['tokens']})
                # Tile the word2vec vectors four times (presumably to match the wider ELMo output).
                word2vec_embeddings = torch.cat([word2vec_embeddings] * 4, dim=2)
            else:
                word2vec_embeddings = word2vec_embedder(
                    {'tokens': token_sequence['tokens']})

            # Pad with zeros at BOS and EOS
            batch_size, _, embedding_dim = word2vec_embeddings.shape
            zeros = torch.zeros([batch_size, 1, embedding_dim])
            if self.config.device == 'gpu':
                zeros = zeros.cuda()

            padded_word2vec_embeddings = torch.cat(
                [zeros, word2vec_embeddings, zeros], dim=1)

            elmo_embedder = self.embedders[0]
            embeddings = elmo_embedder(
                {'characters': token_sequence['characters']},
                word2vec_embeddings=padded_word2vec_embeddings)

        else:
            embeddings = []
            for embedder in self.embedders:
                if hasattr(embedder, 'token_embedder_characters'):
                    embeddings.append(
                        embedder({'characters': token_sequence['characters']}))
                elif hasattr(embedder, 'token_embedder_tokens'):
                    embeddings.append(
                        embedder({'tokens': token_sequence['tokens']}))

            embeddings = torch.cat(embeddings, dim=2)

        encoder_output = self.encoder(embeddings, mask)

        label_logits = self.linear_layer(encoder_output)

        output = {"label_logits": label_logits}

        # label_sequence is optional; only compute metrics and loss when labels are provided.
        if label_sequence is not None:
            self.accuracy(label_logits, label_sequence, mask)
            self.F1(label_logits, label_sequence, mask)
            output["loss"] = sequence_cross_entropy_with_logits(
                label_logits, label_sequence, mask)

        return output
Example #31
0
    def forward(self,  # type: ignore
                tokens: Dict[str, torch.LongTensor],
                tags: torch.LongTensor = None,
                metadata: List[Dict[str, Any]] = None) -> Dict[str, torch.Tensor]:
        # pylint: disable=arguments-differ
        """
        Parameters
        ----------
        tokens : Dict[str, torch.LongTensor], required
            The output of ``TextField.as_array()``, which should typically be passed directly to a
            ``TextFieldEmbedder``. This output is a dictionary mapping keys to ``TokenIndexer``
            tensors.  At its most basic, using a ``SingleIdTokenIndexer`` this is: ``{"tokens":
            Tensor(batch_size, num_tokens)}``. This dictionary will have the same keys as were used
            for the ``TokenIndexers`` when you created the ``TextField`` representing your
            sequence.  The dictionary is designed to be passed directly to a ``TextFieldEmbedder``,
            which knows how to combine different word representations into a single vector per
            token in your input.
        tags : torch.LongTensor, optional (default = None)
            A torch tensor representing the sequence of integer gold class labels of shape
            ``(batch_size, num_tokens)``.
        metadata : ``List[Dict[str, Any]]``, optional, (default = None)
            metadata containing the original words in the sentence to be tagged under a 'words' key.

        Returns
        -------
        An output dictionary consisting of:
        logits : torch.FloatTensor
            A tensor of shape ``(batch_size, num_tokens, tag_vocab_size)`` representing
            unnormalised log probabilities of the tag classes.
        class_probabilities : torch.FloatTensor
            A tensor of shape ``(batch_size, num_tokens, tag_vocab_size)`` representing
            a distribution of the tag classes per word.
        loss : torch.FloatTensor, optional
            A scalar loss to be optimised.

        """

        # Debug flag: when True, print the label vocabularies below.
        print_vocab = False


        embedded_text_input = self.text_field_embedder(tokens)
        batch_size, sequence_length, _ = embedded_text_input.size()
        mask = get_text_field_mask(tokens)
        encoded_text = self.encoder(embedded_text_input, mask)

        logits = self.tag_projection_layer(encoded_text)
        reshaped_log_probs = logits.view(-1, self.num_classes)
        class_probabilities = F.softmax(reshaped_log_probs, dim=-1).view([batch_size,
                                                                          sequence_length,
                                                                          self.num_classes])

        output_dict = {"logits": logits, "class_probabilities": class_probabilities}

        if tags is not None:
            if self.do_crossentropy_weighting:
                # Implementing custom loss function, weight tags = 1 vs tags = 0
                # Note this only works for binary tags at present


                Nt0 = self.Ntags0          # Should correspond to non-blanks
                Nt1 = self.Ntags1             # Should correspond to blanks

                if not (Nt0 and Nt1):
                    # If either Nt0 or Nt1 is unspecified, count tags in the current batch instead.
                    Nt0 = (tags == 0).sum().double()
                    Nt1 = (tags == 1).sum().double()


                # # Convert N blanks to weights - weight is inversely proportional to number of tags
                # t0wp = Nt1 / (Nt0 + Nt1)      # t0 weighting percent
                # t1wp = Nt0 / (Nt0 + Nt1)       # t1 weighting percent
                # mask2 = mask.clone()
                # mask2 = mask2.double()
                # mask2[tags == 1] = mask2[tags == 1] * t1wp
                # mask2[tags == 0] = mask2[tags == 0] * t0wp
                # loss = sequence_cross_entropy_with_logits(logits, tags, mask2)

                # Convert tag counts to weights - each class is weighted inversely to its frequency.
                t0wp = Nt1 / (Nt0 + Nt1) * 100  # tag-0 weighting percent
                t1wp = Nt0 / (Nt0 + Nt1) * 100  # tag-1 weighting percent
                # Use a float copy of the mask so the fractional weights are not truncated.
                mask2 = mask.clone().float()
                mask2[tags == 1] = mask2[tags == 1] * t1wp
                mask2[tags == 0] = mask2[tags == 0] * t0wp
                loss = sequence_cross_entropy_with_logits(logits, tags, mask2)

                # # Old code, hardcoded
                # Nnonblanks = 5877.15 - 256.16     # Average number of non-blanks per article
                # Nblanks = 256.16                  # Average # blanks per article
                # blank_weight = Nnonblanks / (Nnonblanks + Nblanks)*100
                # nblank_weight = Nblanks / (Nnonblanks + Nblanks)*100
                # mask2 = mask
                # mask2[tags == 1] = mask2[tags == 1] * blank_weight
                # mask2[tags == 0] = mask2[tags == 0] * nblank_weight
                # loss = sequence_cross_entropy_with_logits(logits, tags, mask2)
            else:
                # Default AllenNLP loss
                loss = sequence_cross_entropy_with_logits(logits, tags, mask)

            if print_vocab:
                # Print the token set of each label namespace.
                for namespace in ('pos', 'ner', 'dependencies'):
                    vo = self.vocab.get_index_to_token_vocabulary(namespace)
                    print(set(vo.values()))
                # Results of vocab from 1st 20 articles. Can use these to set embedding dimensions
                # {'VBP', 'PRP$', 'SYM', 'XX', ':', 'ADD', 'NNS', 'CC', 'VBG', 'RBR', 'NNP', 'IN', 'JJ', 'TO', 'NFP', 'NNPS', 'PRP', 'LS', 'NN', 'CD', 'FW', 'MD', 'AFX', 'PDT', "''", 'RP', 'JJR', 'RB', 'VB', '``', '.', 'VBD', 'VBN', '-RRB-', 'JJS', 'RBS', '$', '@@PADDING@@', 'EX', 'HYPH', 'POS', '-LRB-', 'WP$', 'VBZ', ',', 'UH', 'WP', 'DT', 'WDT', 'WRB', '@@UNKNOWN@@'}
                # {'PERCENT', 'ORG', 'NONE', 'ORDINAL', 'MONEY', 'CARDINAL', 'NORP', 'LANGUAGE', 'DATE', 'WORK_OF_ART', 'LAW', 'LOC', 'PERSON', 'QUANTITY', 'EVENT', 'GPE', 'TIME', 'FAC', '@@PADDING@@', 'PRODUCT', '@@UNKNOWN@@'}
                # {'poss', 'attr', 'xcomp', 'npadvmod', 'agent', 'parataxis', 'mark', 'nmod', 'predet', 'compound', 'ROOT', 'intj', 'csubjpass', 'nsubjpass', 'preconj', 'amod', 'csubj', 'ccomp', 'punct', 'advcl', 'conj', 'acomp', 'oprd', 'case', 'nsubj', 'dobj', 'nummod', 'prt', 'cc', 'advmod', 'appos', 'neg', 'pcomp', 'quantmod', 'dep', 'meta', '@@PADDING@@', 'relcl', 'expl', 'acl', 'dative', 'auxpass', 'det', 'aux', 'prep', 'pobj', '@@UNKNOWN@@'}
                # Vocab size
                # Vocabulary with namespaces:  dependencies, Size: 47 || ner, Size: 21 || pos, Size: 51 || tokens, Size: 21902 || labels, Size: 2 || Non Padded Namespaces: {'*tags', '*labels'}


            for metric in self.metrics.values():
                metric(logits, tags, mask.float())
            output_dict["loss"] = loss

        if metadata is not None:
            output_dict["words"] = [x["words"] for x in metadata]
        return output_dict
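The example above folds per-class weights into the `weights` mask rather than passing them separately: each token's weight is scaled inversely to the frequency of its gold tag. A minimal sketch of that trick with made-up tensors and a two-tag scheme (assumes `allennlp` is installed):

import torch
from allennlp.nn.util import sequence_cross_entropy_with_logits

logits = torch.randn(2, 5, 2)                  # (batch, seq_len, num_tags)
tags = torch.tensor([[0, 0, 1, 0, 0],
                     [0, 1, 0, 0, 0]])
mask = torch.ones(2, 5)

# Scale each token's weight inversely to its class frequency so the rare tag matters more.
n0 = (tags == 0).sum().float()
n1 = (tags == 1).sum().float()
weights = mask.clone()
weights[tags == 0] *= n1 / (n0 + n1)
weights[tags == 1] *= n0 / (n0 + n1)

loss = sequence_cross_entropy_with_logits(logits, tags, weights)
print(loss)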
Example #32
0
    def forward(
            self,  # type: ignore
            tokens: Dict[str, torch.LongTensor],
            verb_indicator: torch.LongTensor,
            tags: torch.LongTensor = None,
            metadata: List[Dict[str, Any]] = None) -> Dict[str, torch.Tensor]:
        # pylint: disable=arguments-differ
        """
        Parameters
        ----------
        tokens : Dict[str, torch.LongTensor], required
            The output of ``TextField.as_array()``, which should typically be passed directly to a
            ``TextFieldEmbedder``. This output is a dictionary mapping keys to ``TokenIndexer``
            tensors.  At its most basic, using a ``SingleIdTokenIndexer`` this is: ``{"tokens":
            Tensor(batch_size, num_tokens)}``. This dictionary will have the same keys as were used
            for the ``TokenIndexers`` when you created the ``TextField`` representing your
            sequence.  The dictionary is designed to be passed directly to a ``TextFieldEmbedder``,
            which knows how to combine different word representations into a single vector per
            token in your input.
        verb_indicator: torch.LongTensor, required.
            An integer ``SequenceFeatureField`` representation of the position of the verb
            in the sentence. This should have shape (batch_size, num_tokens) and importantly, can be
            all zeros, in the case that the sentence has no verbal predicate.
        tags : torch.LongTensor, optional (default = None)
            A torch tensor representing the sequence of integer gold class labels
            of shape ``(batch_size, num_tokens)``
        metadata : ``List[Dict[str, Any]]``, optional, (default = None)
            metadata containing the original words in the sentence and the verb to compute the
            frame for, under 'words' and 'verb' keys, respectively.

        Returns
        -------
        An output dictionary consisting of:
        logits : torch.FloatTensor
            A tensor of shape ``(batch_size, num_tokens, tag_vocab_size)`` representing
            unnormalised log probabilities of the tag classes.
        class_probabilities : torch.FloatTensor
            A tensor of shape ``(batch_size, num_tokens, tag_vocab_size)`` representing
            a distribution of the tag classes per word.
        loss : torch.FloatTensor, optional
            A scalar loss to be optimised.

        """
        embedded_text_input = self.embedding_dropout(
            self.text_field_embedder(tokens))
        mask = get_text_field_mask(tokens)
        embedded_verb_indicator = self.binary_feature_embedding(
            verb_indicator.long())
        # Concatenate the verb feature onto the embedded text. This now
        # has shape (batch_size, sequence_length, embedding_dim + binary_feature_dim).
        embedded_text_with_verb_indicator = torch.cat(
            [embedded_text_input, embedded_verb_indicator], -1)
        batch_size, sequence_length, _ = embedded_text_with_verb_indicator.size()

        encoded_text = self.encoder(embedded_text_with_verb_indicator, mask)

        logits = self.tag_projection_layer(encoded_text)
        reshaped_log_probs = logits.view(-1, self.num_classes)
        class_probabilities = F.softmax(reshaped_log_probs, dim=-1).view(
            [batch_size, sequence_length, self.num_classes])
        output_dict = {
            "logits": logits,
            "class_probabilities": class_probabilities
        }
        if tags is not None:
            loss = sequence_cross_entropy_with_logits(
                logits, tags, mask, label_smoothing=self._label_smoothing)
            if not self.ignore_span_metric:
                self.span_metric(class_probabilities, tags, mask)
            output_dict["loss"] = loss

        # We need to retain the mask in the output dictionary
        # so that we can crop the sequences to remove padding
        # when we do viterbi inference in self.decode.
        output_dict["mask"] = mask

        if metadata is not None:
            words, verbs = zip(*[(x["words"], x["verb"]) for x in metadata])
            output_dict["words"] = list(words)
            output_dict["verb"] = list(verbs)
        return output_dict
def train_model(epochs=50,
                num_gradients_accumulation=4,
                batch_size=4,
                gpu_id=0,
                lr=1e-5,
                load_dir='decoder_model'):
    # make sure your model is on GPU
    device = torch.device(f"cuda:{gpu_id}")

    #------------------------LOAD MODEL-----------------
    print('load the model....')

    model = transformers_model()
    model.to(device)

    print('load success')
    #------------------------END LOAD MODEL--------------

    #------------------------LOAD TRAIN DATA------------------
    train_data = torch.load("../train_data.pth")
    train_dataset = TensorDataset(*train_data)
    train_dataloader = DataLoader(dataset=train_dataset,
                                  shuffle=True,
                                  batch_size=batch_size)
    val_data = torch.load("../validate_data.pth")
    val_dataset = TensorDataset(*val_data)
    val_dataloader = DataLoader(dataset=val_dataset,
                                shuffle=True,
                                batch_size=batch_size)
    #------------------------END LOAD TRAIN DATA--------------

    #------------------------SET OPTIMIZER-------------------
    num_train_optimization_steps = len(
        train_dataset) * epochs // batch_size // num_gradients_accumulation

    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]

    optimizer = AdamW(
        optimizer_grouped_parameters,
        lr=lr,
        weight_decay=0.01,
    )
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=num_train_optimization_steps // 10,
        num_training_steps=num_train_optimization_steps,
    )
    #------------------------END SET OPTIMIZER--------------

    #------------------------START TRAINING-------------------
    update_count = 0

    lowest_perplexity = 10000000

    start = time.time()
    print('start training....')
    for epoch in range(epochs):
        #------------------------training------------------------
        model.train()
        losses = 0
        times = 0
        for batch in train_dataloader:
            batch = [item.to(device) for item in batch]

            encoder_input, decoder_input, mask_encoder_input, mask_decoder_input = batch
            logits = model(encoder_input, mask_encoder_input, decoder_input,
                           mask_decoder_input)

            out = logits[:, :-1].contiguous()
            target = decoder_input[:, 1:].contiguous()
            target_mask = mask_decoder_input[:, 1:].contiguous()

            loss = util.sequence_cross_entropy_with_logits(out,
                                                           target,
                                                           target_mask,
                                                           average="token")
            loss.backward()

            losses += loss.item()

            times += 1
            update_count += 1

            if update_count % num_gradients_accumulation == num_gradients_accumulation - 1:
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               max_grad_norm)
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
        end = time.time()
        print('-' * 20 + f'epoch {epoch}' + '-' * 20)
        print(f'time: {(end - start)}')
        print(f'loss: {losses / times}')
        start = end

        #------------------------validate------------------------
        model.eval()

        perplexity = 0
        batch_count = 0
        print('start calculate the perplexity....')

        with torch.no_grad():
            for batch in val_dataloader:
                batch = [item.to(device) for item in batch]

                encoder_input, decoder_input, mask_encoder_input, mask_decoder_input = batch
                logits = model(encoder_input, mask_encoder_input,
                               decoder_input, mask_decoder_input)

                out = logits[:, :-1].contiguous()
                target = decoder_input[:, 1:].contiguous()
                target_mask = mask_decoder_input[:, 1:].contiguous()

                loss = util.sequence_cross_entropy_with_logits(out,
                                                               target,
                                                               target_mask,
                                                               average="token")

                perplexity += np.exp(loss.item())

                batch_count += 1

        print(f'validate perplexity: {perplexity / batch_count}')

        validate_perplexity = perplexity / batch_count
        direct_path = os.path.join(os.path.abspath('.'), load_dir)
        if not os.path.exists(direct_path):
            os.mkdir(direct_path)
        #torch.save(model.state_dict(), os.path.join(direct_path, str(epoch) + "model.pth"))

        if validate_perplexity < lowest_perplexity:
            lowest_perplexity = validate_perplexity
            torch.save(model.state_dict(),
                       os.path.join(direct_path, "best_model.pth"))
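The validation loop treats the token-averaged cross-entropy as a log-perplexity. A minimal sketch of that conversion with dummy tensors (assumes `allennlp` and `numpy` are installed):

import numpy as np
import torch
from allennlp.nn.util import sequence_cross_entropy_with_logits

logits = torch.randn(4, 7, 100)                # (batch, decoding_steps, vocab)
targets = torch.randint(0, 100, (4, 7))
mask = torch.ones(4, 7)

loss = sequence_cross_entropy_with_logits(logits, targets, mask, average="token")
perplexity = np.exp(loss.item())               # per-token perplexity for this batch
print(perplexity)

Strictly speaking, averaging per-batch perplexities as the loop does only approximates corpus perplexity; exponentiating the average per-token loss over all batches is the stricter definition.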
Example #34
0
def train_model(epochs=10,
                num_gradients_accumulation=4,
                batch_size=8,
                gpu_id=0,
                lr=1e-4,
                load_dir='decoder_model',
                decoder_model='original_pretrained_model_for_bertGPT.pth'):
    # make sure your model is on GPU
    device = torch.device(f"cuda:{gpu_id}")

    #------------------------LOAD MODEL-----------------
    print('load the model....')
    model = BertGPT()
    model.load_state_dict(torch.load(decoder_model))
    # model = nn.DataParallel(model, device_ids = [0])
    model = model.to(device)
    print('load success')
    #------------------------END LOAD MODEL--------------

    #------------------------LOAD TRAIN DATA------------------
    train_data = torch.load("train_data.pth")
    train_dataset = MyDataset(*train_data)
    train_dataloader = DataLoader(dataset=train_dataset,
                                  shuffle=True,
                                  batch_size=batch_size,
                                  num_workers=2,
                                  collate_fn=collate_fn)
    val_data = torch.load("validate_data.pth")
    val_dataset = MyDataset(*val_data)
    val_dataloader = DataLoader(dataset=val_dataset,
                                shuffle=True,
                                batch_size=batch_size,
                                num_workers=2,
                                collate_fn=collate_fn)
    #------------------------END LOAD TRAIN DATA--------------

    #------------------------SET OPTIMIZER-------------------
    num_train_optimization_steps = len(
        train_dataset) * epochs // batch_size // num_gradients_accumulation

    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in param_optimizer
            if not any(nd in n for nd in no_decay) and p.requires_grad
        ],
        'weight_decay':
        0.01
    }, {
        'params': [
            p for n, p in param_optimizer
            if any(nd in n for nd in no_decay) and p.requires_grad
        ],
        'weight_decay':
        0.0
    }]
    print('train')
    print(len(optimizer_grouped_parameters[0]['params']))

    optimizer = OpenAIAdam(optimizer_grouped_parameters,
                           lr=lr,
                           warmup=0.01,
                           max_grad_norm=1.0,
                           weight_decay=0.01,
                           t_total=num_train_optimization_steps)
    #------------------------END SET OPTIMIZER--------------

    #------------------------START TRAINING-------------------
    update_count = 0

    start = time.time()
    print('start training....')
    for epoch in range(epochs):
        #------------------------training------------------------
        model.train()
        losses = 0
        times = 0
        for batch in tqdm(train_dataloader, desc='dirs'):
            batch = [item.to(device) for item in batch]

            encoder_input, decoder_input, mask_encoder_input, mask_decoder_input = batch

            logits = model(encoder_input, mask_encoder_input, decoder_input,
                           mask_decoder_input)

            out = logits[:, :-1].contiguous()
            target = decoder_input[:, 1:].contiguous()
            target_mask = mask_decoder_input[:, 1:].contiguous()
            loss = util.sequence_cross_entropy_with_logits(out,
                                                           target,
                                                           target_mask,
                                                           average="token")
            loss.backward()

            losses += loss.item()
            times += 1

            update_count += 1

            if update_count % num_gradients_accumulation == num_gradients_accumulation - 1:
                optimizer.step()
                optimizer.zero_grad()
        end = time.time()
        print('-' * 20 + f'epoch {epoch}' + '-' * 20)
        print(f'time: {(end - start)}')
        print(f'loss: {losses / times}')
        start = end

        #------------------------validate------------------------
        model.eval()

        perplexity = 0
        batch_count = 0
        print('start calculate the perplexity....')

        with torch.no_grad():
            for batch in tqdm(val_dataloader):
                batch = [item.to(device) for item in batch]
                encoder_input, decoder_input, mask_encoder_input, mask_decoder_input = batch

                logits = model(encoder_input, mask_encoder_input,
                               decoder_input, mask_decoder_input)

                out = logits[:, :-1].contiguous()
                target = decoder_input[:, 1:].contiguous()
                target_mask = mask_decoder_input[:, 1:].contiguous()

                loss = util.sequence_cross_entropy_with_logits(out,
                                                               target,
                                                               target_mask,
                                                               average="token")
                perplexity += np.exp(loss.item())
                batch_count += 1

        print(f'validate perplexity: {perplexity / batch_count}')

        direct_path = os.path.join(os.path.abspath('.'), load_dir)
        if not os.path.exists(direct_path):
            os.mkdir(direct_path)

        torch.save(model.state_dict(),
                   os.path.join(direct_path,
                                str(epoch) + "model.pth"))
Example #35
0
    def forward(
            self,  # type: ignore
            tokens: Dict[str, torch.LongTensor],
            spans: torch.LongTensor,
            span_labels: torch.LongTensor = None) -> Dict[str, torch.Tensor]:
        # pylint: disable=arguments-differ
        """
        Parameters
        ----------
        tokens : Dict[str, torch.LongTensor], required
            The output of ``TextField.as_array()``, which should typically be passed directly to a
            ``TextFieldEmbedder``. This output is a dictionary mapping keys to ``TokenIndexer``
            tensors.  At its most basic, using a ``SingleIdTokenIndexer`` this is: ``{"tokens":
            Tensor(batch_size, num_tokens)}``. This dictionary will have the same keys as were used
            for the ``TokenIndexers`` when you created the ``TextField`` representing your
            sequence.  The dictionary is designed to be passed directly to a ``TextFieldEmbedder``,
            which knows how to combine different word representations into a single vector per
            token in your input.
        spans : ``torch.LongTensor``, required.
            A tensor of shape ``(batch_size, num_spans, 2)`` representing the
            inclusive start and end indices of all possible spans in the sentence.
        span_labels : torch.LongTensor, optional (default = None)
            A torch tensor representing the integer gold class labels for all possible
            spans, of shape ``(batch_size, num_spans)``.

        Returns
        -------
        An output dictionary consisting of:
        logits : ``torch.FloatTensor``
            A tensor of shape ``(batch_size, num_spans, span_label_vocab_size)``
            representing unnormalised log probabilities of the label classes for each span.
        class_probabilities : ``torch.FloatTensor``
            A tensor of shape ``(batch_size, num_spans, span_label_vocab_size)``
            representing a distribution over the label classes per span.
        loss : ``torch.FloatTensor``, optional
            A scalar loss to be optimised.
        """
        embedded_text_input = self.text_field_embedder(tokens)
        mask = get_text_field_mask(tokens)
        # Looking at the span start index is enough to know if
        # this is padding or not. Shape: (batch_size, num_spans)
        span_mask = (spans[:, :, 0] >= 0).squeeze(-1).long()

        encoded_text = self.encoder(embedded_text_input, mask)
        span_representations = self.span_extractor(encoded_text, spans, mask,
                                                   span_mask)
        if self.feedforward_layer is not None:
            span_representations = self.feedforward_layer(span_representations)
        logits = self.tag_projection_layer(span_representations)
        class_probabilities = last_dim_softmax(logits, span_mask.unsqueeze(-1))

        output_dict = {
            "class_probabilities": class_probabilities,
            "spans": spans,
            # TODO(Mark): This relies on having tokens represented with a SingleIdTokenIndexer...
            "tokens": tokens["tokens"],
            "token_mask": mask
        }
        if span_labels is not None:
            loss = sequence_cross_entropy_with_logits(logits, span_labels,
                                                      span_mask)
            for metric in self.metrics.values():
                metric(logits, span_labels, span_mask)
            output_dict["loss"] = loss

        return output_dict
Example #36
0
    def forward(self, prev_tokens: Dict[str, torch.LongTensor],
                prev_tags: Dict[str, torch.LongTensor],
                fol_tokens: Dict[str, torch.LongTensor],
                fol_tags: Dict[str, torch.LongTensor],
                prev_labels: torch.Tensor = None,
                fol_labels: torch.Tensor = None,
                conflicts: List[Any] = None,
                metadata: List[Dict[str, Any]] = None) -> Dict[str, torch.Tensor]:

        prev_mask = get_text_field_mask(prev_tokens)
        # embedding sequence
        prev_embedding_seq = self.token_field_embedding(prev_tokens)
        # embedding tag
        prev_tag_embedding = self.char_field_embedding(prev_tags)

        fol_mask = get_text_field_mask(fol_tokens)
        # embedding sequence
        fol_embedding_seq = self.token_field_embedding(fol_tokens)
        # embedding tag
        fol_tag_embedding = self.char_field_embedding(fol_tags)

        batch_size, _ = prev_mask.size()

        # initialization in specific gpu devices
        gpu_device = prev_embedding_seq.device

        prev_phrase_tensor = torch.tensor([0.0], device=gpu_device)
        fol_phrase_tensor = torch.tensor([1.0], device=gpu_device)

        prev_phrase_embedding_seq = prev_phrase_tensor.repeat(
            prev_embedding_seq.size(0),
            prev_embedding_seq.size(1),
            1
        )

        fol_phrase_embedding_seq = fol_phrase_tensor.repeat(
            fol_embedding_seq.size(0),
            fol_embedding_seq.size(1),
            1
        )

        # concat embedding and phrase
        prev_embedding_seq = torch.cat([prev_embedding_seq, prev_phrase_embedding_seq, prev_tag_embedding],
                                       dim=2)
        fol_embedding_seq = torch.cat([fol_embedding_seq, fol_phrase_embedding_seq, fol_tag_embedding], dim=2)

        prev_embedding_seq = self.projection_layer(prev_embedding_seq)
        fol_embedding_seq = self.projection_layer(fol_embedding_seq)

        # embedding phrase label 0 means prev, 1 means follow-up
        if self.training:
            embedding = torch.cat([prev_embedding_seq, fol_embedding_seq], dim=1)
            embedding_var = self._variational_dropout(embedding)
            prev_mask_len = prev_mask.size(1)
            prev_embedding_seq_var = embedding_var[:, :prev_mask_len]
            fol_embedding_seq_var = embedding_var[:, prev_mask_len:]
        else:
            prev_embedding_seq_var = prev_embedding_seq
            fol_embedding_seq_var = fol_embedding_seq

        # encode sequence
        prev_encoder_out = self.tokens_encoder(prev_embedding_seq_var, prev_mask)
        fol_encoder_out = self.tokens_encoder(fol_embedding_seq_var, fol_mask)

        prev_forward_output = prev_encoder_out[:, :, :self.hidden_size]
        prev_backward_output = prev_encoder_out[:, :, self.hidden_size:]

        fol_forward_output = fol_encoder_out[:, :, :self.hidden_size]
        fol_backward_output = fol_encoder_out[:, :, self.hidden_size:]

        prev_attn_mask = prev_mask.view(batch_size, -1, 1) * fol_mask.view(batch_size, 1, -1)
        prev_forward_attn_matrix = self._self_attention(prev_forward_output, fol_forward_output) / self._scaled_value
        prev_backward_attn_matrix = self._self_attention(prev_backward_output, fol_backward_output) / self._scaled_value
        prev_mean_pooling_attn = util.masked_softmax(prev_forward_attn_matrix + prev_backward_attn_matrix,
                                                     prev_attn_mask)

        # attention-weighted sum over the follow-up encoder outputs
        prev_attn_vec = torch.matmul(prev_mean_pooling_attn, fol_encoder_out)

        fol_attn_mask = fol_mask.view(batch_size, -1, 1) * prev_mask.view(batch_size, 1, -1)
        fol_forward_attn_matrix = self._self_attention(fol_forward_output, prev_forward_output) / self._scaled_value
        fol_backward_attn_matrix = self._self_attention(fol_backward_output, prev_backward_output) / self._scaled_value
        fol_mean_pooling_attn = util.masked_softmax(fol_forward_attn_matrix + fol_backward_attn_matrix, fol_attn_mask)

        # attention-weighted sum over the previous-utterance encoder outputs
        fol_attn_vec = torch.matmul(fol_mean_pooling_attn, prev_encoder_out)

        # non_linear_output = self._non_linear(torch.cat([encoder_out, self_attention_vec], dim=2))
        # prev_linear = torch.cat([prev_encoder_out, prev_attn_vec], dim=2)
        # fol_linear = torch.cat([fol_encoder_out, fol_attn_vec], dim=2)
        prev_attn_multiply = prev_encoder_out * prev_attn_vec
        zero_tensor = torch.zeros((batch_size, 1, prev_attn_multiply.size(2)), device=gpu_device, dtype=torch.float)
        prev_attn_shift = torch.cat((zero_tensor,
                                     prev_attn_multiply[:, :-1, :]), dim=1)
        # shift the attended vectors right by one step and concatenate them as extra features
        prev_linear = torch.cat([prev_encoder_out, prev_attn_multiply, prev_attn_shift], dim=2)

        fol_attn_multiply = fol_encoder_out * fol_attn_vec
        fol_attn_shift = torch.cat((zero_tensor,
                                    fol_attn_multiply[:, :-1, :]), dim=1)
        # shift the attended vectors right by one step and concatenate them as extra features
        fol_linear = torch.cat([fol_encoder_out, fol_attn_multiply, fol_attn_shift], dim=2)

        prev_tag_logistics = self.policy_net(prev_linear)
        fol_tag_logistics = self.policy_net(fol_linear)

        # project to space
        prev_tag_prob = F.softmax(prev_tag_logistics, dim=2)
        prev_predict_labels = torch.argmax(prev_tag_prob, dim=2)

        fol_tag_prob = F.softmax(fol_tag_logistics, dim=2)
        fol_predict_labels = torch.argmax(fol_tag_prob, dim=2)

        predict_restate_str_list = []
        predict_restate_tag_list = []
        max_bleu_list = []

        # debug information
        _debug_batch_conflict_map = {}

        # using predict labels to cut utterance into span and fetch representations of span
        for batch_ind in range(batch_size):
            _debug_batch_conflict_map[batch_ind] = []

            # batch reference object
            batch_origin_obj = metadata[batch_ind]["origin_obj"]

            prev_start_end, fol_start_end = predict_span_start_end(
                prev_predict_labels[batch_ind, :sum(prev_mask[batch_ind])],
                fol_predict_labels[batch_ind, :sum(fol_mask[batch_ind])])

            # Phase 2: Predict actual fusion str via span start/end and similar gate
            predict_restate_str, predict_restate_tag \
                = self.predict_restate(batch_origin_obj,
                                       fol_start_end,
                                       prev_start_end,
                                       prev_forward_output,
                                       prev_backward_output,
                                       fol_forward_output,
                                       fol_backward_output,
                                       batch_ind,
                                       gpu_device,
                                       _debug_batch_conflict_map)

            # add it to batch
            predict_restate_str_list.append(predict_restate_str)
            predict_restate_tag_list.append(predict_restate_tag)

        batch_golden_restate_str = [" ".join(single_metadata["origin_obj"]["restate"].utterance)
                                    for single_metadata in metadata]

        batch_golden_restate_tag = [single_metadata["origin_obj"]["restate"].tags
                                    for single_metadata in metadata]
        output = {
            "probs": prev_tag_prob,
            "prev_labels": prev_predict_labels,
            "fol_labels": fol_predict_labels,
            "restate": predict_restate_str_list,
            "max_bleu": max_bleu_list
        }

        avg_bleu = self.metrics["bleu"](predict_restate_str_list, batch_golden_restate_str)
        avg_symbol = self.metrics["symbol"](predict_restate_tag_list, batch_golden_restate_tag)

        # overall measure
        self.metrics["overall"]([0.4 * avg_bleu + 0.6 * avg_symbol] * batch_size)

        conflict_confidences = []

        # losses are only computed in training mode
        if self.training:
            if prev_labels is not None:

                labels = torch.cat([prev_labels, fol_labels], dim=1)
                # Initialization pre-training with longest common string
                logistics = torch.cat([prev_tag_logistics, fol_tag_logistics], dim=1)
                mask = torch.cat([prev_mask, fol_mask], dim=1)
                loss_snippet = sequence_cross_entropy_with_logits(logistics, labels, mask,
                                                                  label_smoothing=0.2)

                # for pre-training, we regard them as optimal ground truth
                conflict_confidences = [1.0] * batch_size
            else:
                if DEBUG:
                    rl_sample_count = 1
                else:
                    rl_sample_count = 20

                batch_loss_snippet = []
                batch_sample_conflicts = []

                # Training Phase 2: train conflict model via margin loss
                for batch_ind in range(batch_size):

                    dynamic_conflicts = []
                    dynamic_confidence = []

                    # batch reference object
                    batch_origin_obj = metadata[batch_ind]["origin_obj"]

                    prev_mask_len = prev_mask[batch_ind].sum().view(1).data.cpu().numpy()[0]
                    fol_mask_len = fol_mask[batch_ind].sum().view(1).data.cpu().numpy()[0]

                    sample_data = []

                    for _ in range(rl_sample_count):
                        prev_multi = Categorical(logits=prev_tag_logistics[batch_ind])
                        fol_multi = Categorical(logits=fol_tag_logistics[batch_ind])

                        prev_label_tensor = prev_multi.sample()
                        prev_label_tensor.data[0].fill_(1)
                        prev_sample_label = prev_label_tensor.data.cpu().numpy().astype(int)[:prev_mask_len]

                        fol_label_tensor = fol_multi.sample()
                        fol_label_tensor.data[0].fill_(1)
                        fol_sample_label = fol_label_tensor.data.cpu().numpy().astype(int)[:fol_mask_len]

                        log_prob = torch.cat(
                            [prev_multi.log_prob(prev_label_tensor), fol_multi.log_prob(fol_label_tensor)],
                            dim=-1)

                        conflict_prob_mat = self.calculate_conflict_prob_matrix(prev_sample_label,
                                                                                fol_sample_label,
                                                                                batch_ind,
                                                                                prev_forward_output,
                                                                                prev_backward_output,
                                                                                fol_forward_output,
                                                                                fol_backward_output,
                                                                                gpu_device)
                        self.policy_net.saved_log_probs.append(log_prob)
                        sample_data.append((prev_sample_label, fol_sample_label, batch_origin_obj, conflict_prob_mat))

                    if DEBUG:
                        ret_data = [sample_action(row) for row in sample_data]
                    else:
                        # Parallel to speed up the sampling process
                        with ThreadPool(4) as p:
                            chunk_size = rl_sample_count // 4
                            ret_data = p.map(sample_action, sample_data, chunksize=chunk_size)

                    for conflict_confidence, reinforce_reward, conflict_pair in ret_data:
                        self.policy_net.rewards.append(reinforce_reward)
                        dynamic_conflicts.append(conflict_pair)
                        dynamic_confidence.append(conflict_confidence)

                    rewards = torch.tensor(self.policy_net.rewards, device=gpu_device).float()
                    self.metrics["reward"](self.policy_net.rewards)
                    rewards -= rewards.mean().detach()
                    self.metrics["reward_var"]([rewards.std().data.cpu().numpy()])

                    loss_snippet = []
                    # reward high, optimize it; reward low, reversal optimization
                    for log_prob, reward in zip(self.policy_net.saved_log_probs,
                                                rewards):
                        loss_snippet.append((- log_prob * reward).unsqueeze(0))

                    loss_snippet = torch.cat(loss_snippet).mean(dim=1).sum().view(1)
                    batch_loss_snippet.append(loss_snippet)

                    # random select one
                    best_conflict_id = choice(range(rl_sample_count))
                    # best_conflict_id = np.argmax(self.policy_net.rewards)
                    batch_sample_conflicts.append(dynamic_conflicts[best_conflict_id])
                    conflict_confidences.append(dynamic_confidence[best_conflict_id])

                    self.policy_net.reset()

                loss_snippet = torch.cat(batch_loss_snippet).mean()

                # according to confidence
                conflicts = []
                for conflict_batch_id in range(batch_size):
                    conflicts.append(batch_sample_conflicts[conflict_batch_id])

            # Training Phase 1: train snippet model
            total_loss = loss_snippet

            border = torch.tensor([0.0], device=gpu_device)
            pos_target = torch.tensor([1.0], device=gpu_device)
            neg_target = torch.tensor([-1.0], device=gpu_device)

            # Training Phase 2: train conflict model via margin loss

            loss_conflict = torch.tensor([0.0], device=gpu_device)[0]
            # random decision on which to use

            for batch_ind in range(0, batch_size):
                batch_conflict_list = conflicts[batch_ind]
                # use prediction results to conflict

                temp_loss_conflict = torch.tensor([0.0], device=gpu_device)[0]

                if batch_conflict_list:
                    for conflict in batch_conflict_list:
                        (prev_start, prev_end), (fol_start, fol_end), conflict_mode = conflict

                        fol_span_repr = get_span_repr(fol_forward_output[batch_ind],
                                                      fol_backward_output[batch_ind],
                                                      fol_start, fol_end)

                        prev_span_repr = get_span_repr(prev_forward_output[batch_ind],
                                                       prev_backward_output[batch_ind],
                                                       prev_start, prev_end)

                        inter_prob = self.cosine_similar(fol_span_repr, prev_span_repr).view(1)
                        # actual conflict
                        if conflict_mode == 1:
                            temp_loss_conflict += self.margin_loss(inter_prob,
                                                                   border,
                                                                   pos_target)
                        else:
                            temp_loss_conflict += self.margin_loss(inter_prob,
                                                                   border,
                                                                   neg_target)

                    temp_confidence = conflict_confidences[batch_ind]
                    loss_conflict += temp_confidence * temp_loss_conflict / len(batch_conflict_list)

            loss_conflict = loss_conflict / batch_size

            # for larger margin
            total_loss += loss_conflict

            output["loss"] = total_loss

        return output
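The pre-training branch above uses label-smoothed sequence cross-entropy, while the reinforcement branch samples tag sequences from a `Categorical` and weights their log-probabilities by a baseline-subtracted reward. A stripped-down sketch of that policy-gradient step, with random logits and rewards standing in for the model outputs and BLEU-based rewards:

import torch
from torch.distributions import Categorical

tag_logits = torch.randn(20, 6, 2, requires_grad=True)   # (samples, seq_len, num_tags)
dist = Categorical(logits=tag_logits)
sampled_tags = dist.sample()                              # (samples, seq_len)
log_probs = dist.log_prob(sampled_tags)                   # (samples, seq_len)

rewards = torch.rand(20)                                  # e.g. BLEU of each sampled restatement
rewards = rewards - rewards.mean()                        # subtract a mean baseline, as above

loss = (-log_probs.mean(dim=1) * rewards).sum()
loss.backward()                                           # gradients flow into tag_logits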
Example #37
0
    def forward(
            self,  # type: ignore
            tokens: Dict[str, torch.LongTensor],
            spans: torch.LongTensor,
            metadata: List[Dict[str, Any]],
            span_labels: torch.LongTensor = None) -> Dict[str, torch.Tensor]:
        # pylint: disable=arguments-differ
        """
        Parameters
        ----------
        tokens : Dict[str, torch.LongTensor], required
            The output of ``TextField.as_array()``, which should typically be passed directly to a
            ``TextFieldEmbedder``. This output is a dictionary mapping keys to ``TokenIndexer``
            tensors.  At its most basic, using a ``SingleIdTokenIndexer`` this is: ``{"tokens":
            Tensor(batch_size, num_tokens)}``. This dictionary will have the same keys as were used
            for the ``TokenIndexers`` when you created the ``TextField`` representing your
            sequence.  The dictionary is designed to be passed directly to a ``TextFieldEmbedder``,
            which knows how to combine different word representations into a single vector per
            token in your input.
        spans : ``torch.LongTensor``, required.
            A tensor of shape ``(batch_size, num_spans, 2)`` representing the
            inclusive start and end indices of all possible spans in the sentence.
        span_labels : ``torch.LongTensor``, optional (default = None)
            A torch tensor representing the integer gold class labels for all possible
            spans, of shape ``(batch_size, num_spans)``.
        metadata : List[Dict[str, Any]], required.
            A dictionary of metadata for each batch element which has keys:
                tokens : ``List[str]``, required.
                    The original string tokens in the sentence.
                gold_tree : ``nltk.Tree``, optional (default = None)
                    Gold NLTK trees for use in evaluation.

        Returns
        -------
        An output dictionary consisting of:
        class_probabilities : ``torch.FloatTensor``
            A tensor of shape ``(batch_size, num_spans, span_label_vocab_size)``
            representing a distribution over the label classes per span.
        spans : ``torch.LongTensor``
            The original spans tensor.
        tokens : ``List[List[str]]``, required.
            A list of tokens in the sentence for each element in the batch.
        num_spans : ``torch.LongTensor``, required.
            A tensor of shape (batch_size), representing the lengths of non-padded spans
            in ``enumerated_spans``.
        loss : ``torch.FloatTensor``, optional
            A scalar loss to be optimised.
        """

        embedded_text_input = self.text_field_embedder(tokens)
        mask = get_text_field_mask(tokens)
        # Looking at the span start index is enough to know if
        # this is padding or not. Shape: (batch_size, num_spans)
        span_mask = (spans[:, :, 0] >= 0).squeeze(-1).long()
        if span_mask.dim() == 1:
            # This happens if you use batch_size 1 and encounter
            # a length 1 sentence in PTB, which do exist. -.-
            span_mask = span_mask.unsqueeze(-1)

        num_spans = get_lengths_from_binary_sequence_mask(span_mask)

        encoded_text = self.encoder(embedded_text_input, mask)
        span_representations = self.span_extractor(encoded_text, spans, mask,
                                                   span_mask)
        if self.feedforward_layer is not None:
            span_representations = self.feedforward_layer(span_representations)
        logits = self.tag_projection_layer(span_representations)
        class_probabilities = last_dim_softmax(logits, span_mask.unsqueeze(-1))

        output_dict = {
            "class_probabilities": class_probabilities,
            "spans": spans,
            "tokens": [meta["tokens"] for meta in metadata],
            "num_spans": num_spans
        }
        if span_labels is not None:
            loss = sequence_cross_entropy_with_logits(logits, span_labels,
                                                      span_mask)
            for metric in self.metrics.values():
                metric(logits, span_labels, span_mask)
            output_dict["loss"] = loss

        # The evalb score is expensive to compute, so we only compute
        # it for the validation and test sets.
        batch_gold_trees = [meta.get("gold_tree") for meta in metadata]
        if all(batch_gold_trees) and self._evalb_score is not None and not self.training:
            # TODO(Mark): Predict POS and use here instead of using the gold ones.
            gold_pos_tags: List[List[str]] = [list(zip(*tree.pos()))[1]
                                              for tree in batch_gold_trees]
            predicted_trees = self.construct_trees(class_probabilities.cpu().data,
                                                   spans.cpu().data,
                                                   num_spans.data,
                                                   output_dict["tokens"],
                                                   gold_pos_tags)
            self._evalb_score(predicted_trees, batch_gold_trees)
            self._evalb_score(predicted_trees, batch_gold_trees)

        return output_dict
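
The span-mask trick above is not specific to this parser: any span tensor padded with -1 start indices can be turned into a mask and passed to ``sequence_cross_entropy_with_logits``. Below is a minimal, self-contained sketch with made-up shapes and values; it is an illustration, not code from the model, and assumes AllenNLP is installed.

import torch
from allennlp.nn.util import sequence_cross_entropy_with_logits

batch_size, num_spans, num_classes = 2, 4, 3
spans = torch.tensor([[[0, 1], [1, 2], [-1, -1], [-1, -1]],
                      [[0, 0], [0, 2], [1, 2], [2, 2]]])   # a -1 start index marks a padded span
span_mask = (spans[:, :, 0] >= 0).long()                   # (batch_size, num_spans)
logits = torch.randn(batch_size, num_spans, num_classes)   # (batch_size, num_spans, num_classes)
span_labels = torch.randint(0, num_classes, (batch_size, num_spans)) * span_mask
loss = sequence_cross_entropy_with_logits(logits, span_labels, span_mask)  # scalar; padded spans ignored
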
# Imports required by this standalone snippet (assumed: PyTorch, NumPy,
# HuggingFace transformers and AllenNLP are installed).
import numpy as np
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertConfig, EncoderDecoderConfig, EncoderDecoderModel
from allennlp.nn import util


def calculate_perplexity(
        batch_size=1,
        gpu_id=0,
        model_path='./BERT/model-10.pth'):
    # Make sure the model runs on the GPU selected by ``gpu_id``.
    device = torch.device(f"cuda:{gpu_id}")

    # ------------------------ LOAD MODEL ------------------------
    print('loading the model....')
    bert_encoder = BertConfig.from_pretrained('bert-base-uncased')
    bert_decoder = BertConfig.from_pretrained('bert-base-uncased', is_decoder=True)
    config = EncoderDecoderConfig.from_encoder_decoder_configs(bert_encoder, bert_decoder)
    model = EncoderDecoderModel(config)
    model = model.to(device)
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.eval()
    print('load success')
    # ------------------------ END LOAD MODEL --------------------

    # ------------------------ LOAD DATA --------------------------
    train_dataset = TensorDataset(*torch.load("/content/train_data.pth"))
    val_dataset = TensorDataset(*torch.load("/content/validate_data.pth"))
    test_dataset = TensorDataset(*torch.load("/content/test_data.pth"))

    train_dataloader = DataLoader(dataset=train_dataset, shuffle=False, batch_size=batch_size)
    val_dataloader = DataLoader(dataset=val_dataset, shuffle=False, batch_size=batch_size)
    test_dataloader = DataLoader(dataset=test_dataset, shuffle=False, batch_size=batch_size)
    # ------------------------ END LOAD DATA ----------------------

    # ------------------------ EVALUATION -------------------------
    def evaluate_perplexity(dataloader, split_name):
        # Per batch: token-averaged cross entropy -> exp() -> perplexity,
        # then average the per-batch perplexities over the whole split.
        perplexity = 0.0
        batch_count = 0
        print(f'start calculating the {split_name} perplexity....')
        with torch.no_grad():
            for batch in dataloader:
                encoder_input, decoder_input, mask_encoder_input, mask_decoder_input = [
                    item.to(device) for item in batch]

                outputs = model(input_ids=encoder_input,
                                attention_mask=mask_encoder_input,
                                decoder_input_ids=decoder_input,
                                decoder_attention_mask=mask_decoder_input)

                # Align the logits at step i with the target token at step i + 1.
                out = outputs[0][:, :-1].contiguous()
                target = decoder_input[:, 1:].contiguous()
                target_mask = mask_decoder_input[:, 1:].contiguous()

                loss = util.sequence_cross_entropy_with_logits(out, target, target_mask,
                                                               average="token")
                perplexity += np.exp(loss.item())
                batch_count += 1
        print(f'{split_name} perplexity: {perplexity / batch_count}')

    evaluate_perplexity(train_dataloader, 'train')
    evaluate_perplexity(val_dataloader, 'validate')
    evaluate_perplexity(test_dataloader, 'test')
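
For reference, here is a minimal sketch of the per-batch computation used above, with toy shapes: the token-averaged cross entropy from ``sequence_cross_entropy_with_logits`` is exponentiated to give a perplexity, and the script then averages these per-batch perplexities over a dataloader. Tensor sizes are illustrative and AllenNLP is assumed to be installed.

import numpy as np
import torch
from allennlp.nn import util

batch_size, seq_len, vocab_size = 2, 6, 10
decoder_input = torch.randint(0, vocab_size, (batch_size, seq_len))
decoder_mask = torch.ones(batch_size, seq_len, dtype=torch.long)
logits = torch.randn(batch_size, seq_len - 1, vocab_size)    # model outputs, one step shorter

target = decoder_input[:, 1:].contiguous()                    # compare step i's logits to token i + 1
target_mask = decoder_mask[:, 1:].contiguous()
loss = util.sequence_cross_entropy_with_logits(logits, target, target_mask, average="token")
batch_perplexity = np.exp(loss.item())
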
Example #39
    def forward(
            self,  # type: ignore
            tokens: Dict[str, torch.LongTensor],
            labels: torch.LongTensor = None,
            d_tags: torch.LongTensor = None,
            metadata: List[Dict[str, Any]] = None) -> Dict[str, torch.Tensor]:
        # pylint: disable=arguments-differ
        """
        Parameters
        ----------
        tokens : Dict[str, torch.LongTensor], required
            The output of ``TextField.as_array()``, which should typically be passed directly to a
            ``TextFieldEmbedder``. This output is a dictionary mapping keys to ``TokenIndexer``
            tensors.  At its most basic, using a ``SingleIdTokenIndexer`` this is: ``{"tokens":
            Tensor(batch_size, num_tokens)}``. This dictionary will have the same keys as were used
            for the ``TokenIndexers`` when you created the ``TextField`` representing your
            sequence.  The dictionary is designed to be passed directly to a ``TextFieldEmbedder``,
            which knows how to combine different word representations into a single vector per
            token in your input.
        labels : torch.LongTensor, optional (default = None)
            A torch tensor representing the sequence of integer gold class labels of shape
            ``(batch_size, num_tokens)``.
        d_tags : torch.LongTensor, optional (default = None)
            A torch tensor representing the sequence of integer gold class labels of shape
            ``(batch_size, num_tokens)``.
        metadata : ``List[Dict[str, Any]]``, optional, (default = None)
            metadata containing the original words in the sentence to be tagged under a 'words' key.

        Returns
        -------
        An output dictionary consisting of:
        logits : torch.FloatTensor
            A tensor of shape ``(batch_size, num_tokens, tag_vocab_size)`` representing
            unnormalised log probabilities of the tag classes.
        class_probabilities : torch.FloatTensor
            A tensor of shape ``(batch_size, num_tokens, tag_vocab_size)`` representing
            a distribution of the tag classes per word.
        loss : torch.FloatTensor, optional
            A scalar loss to be optimised.

        """
        encoded_text = self.text_field_embedder(tokens)
        batch_size, sequence_length, _ = encoded_text.size()
        mask = get_text_field_mask(tokens)
        logits_labels = self.tag_labels_projection_layer(
            self.predictor_dropout(encoded_text))
        logits_d = self.tag_detect_projection_layer(encoded_text)

        class_probabilities_labels = F.softmax(logits_labels, dim=-1).view(
            [batch_size, sequence_length, self.num_labels_classes])
        class_probabilities_d = F.softmax(logits_d, dim=-1).view(
            [batch_size, sequence_length, self.num_detect_classes])
        error_probs = class_probabilities_d[:, :, self.incorr_index] * mask
        incorr_prob = torch.max(error_probs, dim=-1)[0]

        if self.confidence > 0:
            probability_change = [self.confidence] + [0] * (self.num_labels_classes - 1)
            if self.use_cpu:
                class_probabilities_labels += torch.FloatTensor(probability_change).repeat(
                    (batch_size, sequence_length, 1))
            else:
                class_probabilities_labels += torch.cuda.FloatTensor(probability_change).repeat(
                    (batch_size, sequence_length, 1))

        output_dict = {
            "logits_labels": logits_labels,
            "logits_d_tags": logits_d,
            "class_probabilities_labels": class_probabilities_labels,
            "class_probabilities_d_tags": class_probabilities_d,
            "max_error_probability": incorr_prob
        }
        if labels is not None and d_tags is not None:
            loss_labels = sequence_cross_entropy_with_logits(
                logits_labels,
                labels,
                mask,
                label_smoothing=self.label_smoothing)
            loss_d = sequence_cross_entropy_with_logits(logits_d, d_tags, mask)
            for metric in self.metrics.values():
                metric(logits_labels, labels, mask.float())
                metric(logits_d, d_tags, mask.float())
            output_dict["loss"] = loss_labels + loss_d

        if metadata is not None:
            output_dict["words"] = [x["words"] for x in metadata]
        return output_dict
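
A minimal sketch of the two-part loss in the example above, with made-up shapes and a hypothetical smoothing value of 0.1 (the model reads the actual value from ``self.label_smoothing``): a label-smoothed loss over the correction labels is summed with an unsmoothed loss over the detection tags, both masked the same way.

import torch
from allennlp.nn.util import sequence_cross_entropy_with_logits

batch_size, num_tokens = 2, 5
num_label_classes, num_detect_classes = 8, 4
mask = torch.ones(batch_size, num_tokens, dtype=torch.long)

logits_labels = torch.randn(batch_size, num_tokens, num_label_classes)
logits_d = torch.randn(batch_size, num_tokens, num_detect_classes)
labels = torch.randint(0, num_label_classes, (batch_size, num_tokens))
d_tags = torch.randint(0, num_detect_classes, (batch_size, num_tokens))

loss_labels = sequence_cross_entropy_with_logits(logits_labels, labels, mask, label_smoothing=0.1)
loss_d = sequence_cross_entropy_with_logits(logits_d, d_tags, mask)
loss = loss_labels + loss_d
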
Example #40
    def forward(
        self,  # type: ignore
        tokens: Dict[str, torch.LongTensor],
        tags: torch.LongTensor = None,
        metadata: List[Dict[str, Any]] = None,
    ) -> Dict[str, torch.Tensor]:
        """
        Parameters
        ----------
        tokens : Dict[str, torch.LongTensor], required
            The output of ``TextField.as_array()``, which should typically be passed directly to a
            ``TextFieldEmbedder``. This output is a dictionary mapping keys to ``TokenIndexer``
            tensors.  At its most basic, using a ``SingleIdTokenIndexer`` this is: ``{"tokens":
            Tensor(batch_size, num_tokens)}``. This dictionary will have the same keys as were used
            for the ``TokenIndexers`` when you created the ``TextField`` representing your
            sequence.  The dictionary is designed to be passed directly to a ``TextFieldEmbedder``,
            which knows how to combine different word representations into a single vector per
            token in your input.
        tags : torch.LongTensor, optional (default = None)
            A torch tensor representing the sequence of integer gold class labels of shape
            ``(batch_size, num_tokens)``.
        metadata : ``List[Dict[str, Any]]``, optional, (default = None)
            metadata containing the original words in the sentence to be tagged under a 'words' key.

        Returns
        -------
        An output dictionary consisting of:
        logits : torch.FloatTensor
            A tensor of shape ``(batch_size, num_tokens, tag_vocab_size)`` representing
            unnormalised log probabilities of the tag classes.
        class_probabilities : torch.FloatTensor
            A tensor of shape ``(batch_size, num_tokens, tag_vocab_size)`` representing
            a distribution of the tag classes per word.
        loss : torch.FloatTensor, optional
            A scalar loss to be optimised.

        """
        embedded_text_input = self.text_field_embedder(tokens)
        batch_size, sequence_length, _ = embedded_text_input.size()
        mask = get_text_field_mask(tokens)
        encoded_text = self.encoder(embedded_text_input, mask)

        logits = self.tag_projection_layer(encoded_text)
        reshaped_log_probs = logits.view(-1, self.num_classes)
        class_probabilities = F.softmax(reshaped_log_probs, dim=-1).view(
            [batch_size, sequence_length, self.num_classes])

        output_dict = {
            "logits": logits,
            "class_probabilities": class_probabilities
        }

        if tags is not None:
            loss = sequence_cross_entropy_with_logits(logits, tags, mask)
            for metric in self.metrics.values():
                metric(logits, tags, mask.float())
            if self._f1_metric is not None:
                self._f1_metric(logits, tags, mask.float())
            output_dict["loss"] = loss

        if metadata is not None:
            output_dict["words"] = [x["words"] for x in metadata]
        return output_dict
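
As a usage note, the ``class_probabilities`` returned above can be decoded into per-token tag indices with an argmax, dropping padded positions via the mask. This is an illustrative sketch only, not code from the model; mapping indices back to tag strings would go through the model's vocabulary.

import torch

batch_size, num_tokens, num_classes = 2, 4, 3
class_probabilities = torch.softmax(torch.randn(batch_size, num_tokens, num_classes), dim=-1)
mask = torch.tensor([[1, 1, 1, 0],
                     [1, 1, 0, 0]])                      # 0 marks padding

predicted_indices = class_probabilities.argmax(dim=-1)   # (batch_size, num_tokens)
predicted_tags = [row[:length].tolist()
                  for row, length in zip(predicted_indices, mask.sum(dim=-1).tolist())]
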
Example #41
    def forward(
            self,
            source_tokens: TextFieldTensors,
            target_tokens: TextFieldTensors = None) -> Dict[str, torch.Tensor]:
        """
        Performs the forward step of Bart.

        # Parameters

        source_tokens : `TextFieldTensors`, required
            The source tokens for the encoder. We assume they are stored under the `tokens` key.
        target_tokens : `TextFieldTensors`, optional (default = `None`)
            The target tokens for the decoder. We assume they are stored under the `tokens` key. If no target
            tokens are given, the source tokens are shifted to the right by 1.


        # Returns

        `Dict[str, torch.Tensor]`
            During training, this dictionary contains the `decoder_logits` of shape `(batch_size,
            max_target_length, target_vocab_size)` and the `loss`. During inference, it contains `predictions`
            of shape `(batch_size, max_decoding_steps)` and `log_probabilities` of shape `(batch_size,)`.

        """
        inputs = source_tokens
        targets = target_tokens
        input_ids, input_mask = inputs["tokens"]["token_ids"], inputs["tokens"]["mask"]

        outputs = {}

        # If no target tokens are provided, fall back to the source tokens shifted by one
        # timestep. BART already performs this shift internally, but does not use it for
        # loss calculation.
        if targets is not None:
            target_ids, target_mask = targets["tokens"]["token_ids"], targets["tokens"]["mask"]
        else:
            target_ids = input_ids[:, 1:]
            target_mask = input_mask[:, 1:]

        if self.training:
            decoder_logits = self.bart(
                input_ids=input_ids,
                attention_mask=input_mask,
                decoder_input_ids=target_ids[:, :-1].contiguous(),
                decoder_attention_mask=target_mask[:, :-1].contiguous(),
                use_cache=False,
            )[0]

            outputs["decoder_logits"] = decoder_logits

            # The BART paper mentions label smoothing of 0.1 for sequence generation tasks
            outputs["loss"] = sequence_cross_entropy_with_logits(
                decoder_logits,
                target_ids[:, 1:].contiguous(),
                target_mask[:, 1:].contiguous(),
                label_smoothing=0.1,
                average="token",
            )
        else:
            # Use decoder start id and start of sentence to start decoder
            initial_decoder_ids = torch.tensor(
                [[self._decoder_start_id, self._start_id]],
                dtype=input_ids.dtype,
                device=input_ids.device,
            ).repeat(input_ids.shape[0], 1)

            initial_state = {
                "input_ids": input_ids,
                "input_mask": input_mask,
                "encoder_states": None,
            }
            beam_result = self._beam_search.search(initial_decoder_ids,
                                                   initial_state,
                                                   self.take_step)

            predictions = beam_result[0]
            max_pred_indices = (beam_result[1].argmax(dim=-1)
                                .view(-1, 1, 1)
                                .expand(-1, -1, predictions.shape[-1]))
            predictions = predictions.gather(dim=1, index=max_pred_indices).squeeze(dim=1)

            self._rouge(predictions, target_ids)
            self._bleu(predictions, target_ids)

            outputs["predictions"] = predictions
            outputs["log_probabilities"] = (beam_result[1].gather(
                dim=-1, index=max_pred_indices[..., 0]).squeeze(dim=-1))

            self.make_output_human_readable(outputs)

        return outputs
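
Finally, a toy-shape sketch of the training-branch loss above: the decoder consumes ``target_ids[:, :-1]`` and its logits are scored against ``target_ids[:, 1:]``, with label smoothing of 0.1 and token-level averaging. Shapes are made up and AllenNLP is assumed to be installed.

import torch
from allennlp.nn.util import sequence_cross_entropy_with_logits

batch_size, target_len, vocab_size = 2, 7, 50
target_ids = torch.randint(0, vocab_size, (batch_size, target_len))
target_mask = torch.ones(batch_size, target_len, dtype=torch.long)
decoder_logits = torch.randn(batch_size, target_len - 1, vocab_size)  # one step shorter than targets

loss = sequence_cross_entropy_with_logits(
    decoder_logits,
    target_ids[:, 1:].contiguous(),
    target_mask[:, 1:].contiguous(),
    label_smoothing=0.1,
    average="token",
)
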