def _get_loss(logits: torch.LongTensor,
              targets: torch.LongTensor,
              target_mask: torch.LongTensor) -> torch.Tensor:
    """
    Compute the masked cross-entropy loss between decoder logits and targets.

    ``logits`` (unnormalized outputs from the decoder) has size
    (batch_size, num_decoding_steps, num_classes), while ``targets`` and
    ``target_mask`` have size (batch_size, num_decoding_steps + 1). The targets
    are one step longer because the decoder never has to produce an output for
    the last timestep of ``targets``.

    During training the logit at timestep i should match the target token at
    timestep i + 1, so the first column of ``targets`` and ``target_mask`` is
    dropped before the comparison.

    Example with a 3-word target padded to 7 tokens::

        targets:  <S> w1  w2  w3  <E> <P> <P>
        mask:      1   1   1   1   1   0   0
        logits:   l1  l2  l3  l4  l5  l6

    What is actually compared::

        targets:  w1  w2  w3  <E> <P> <P>
        mask:      1   1   1   1   0   0
        logits:   l1  l2  l3  l4  l5  l6
        (inputs)  <S> w1  w2  w3  <E> <P>

    Returns
    -------
    The value returned by ``sequence_cross_entropy_with_logits`` for the
    aligned inputs.
    """
    # shape: (batch_size, num_decoding_steps)
    relevant_targets = targets[:, 1:].contiguous()
    # shape: (batch_size, num_decoding_steps)
    relevant_mask = target_mask[:, 1:].contiguous()
    loss = sequence_cross_entropy_with_logits(logits, relevant_targets, relevant_mask)
    return loss
def test_sequence_cross_entropy_with_logits_averages_batch_correctly(self):
    """
    The default (batch-averaged) loss must equal the sum of the un-averaged
    loss vector divided by the number of rows that contain any non-padded
    token.
    """
    # (row, first zeroed timestep): rows 0-2 get a zeroed tail, row 3 is
    # zeroed entirely, row 4 is left untouched.
    zeroed_from = ((0, 3), (1, 4), (2, 2), (3, 0))

    logits = torch.rand([5, 7, 4])
    for row, start in zeroed_from:
        logits[row, start:, :] = 0

    # A position is weighted 1 where its first logit is non-zero, 0 otherwise.
    weights = (logits != 0.0)[:, :, 0].long().squeeze(-1)
    targets = torch.LongTensor(numpy.random.randint(0, 3, [5, 7]))
    targets *= weights

    averaged = util.sequence_cross_entropy_with_logits(logits, targets, weights)
    unaveraged = util.sequence_cross_entropy_with_logits(logits, targets, weights,
                                                         average=None)

    # Batch has one completely padded row, so divide by 4.
    expected = unaveraged.data.sum() / 4
    assert averaged.data.numpy() == expected
def forward(self,  # type: ignore
            tokens: Dict[str, torch.LongTensor],
            tags: torch.LongTensor = None,
            metadata: List[Dict[str, Any]] = None) -> Dict[str, torch.Tensor]:
    # pylint: disable=arguments-differ
    """
    Predict a tag distribution for every token and, if gold tags are given,
    compute the loss and update the metrics.

    Parameters
    ----------
    tokens : Dict[str, torch.LongTensor], required
        The output of ``TextField.as_array()``. It is handed unchanged to
        ``self.text_field_embedder`` and to ``get_text_field_mask``. With a
        ``SingleIdTokenIndexer`` it looks like
        ``{"tokens": Tensor(batch_size, num_tokens)}``.
    tags : torch.LongTensor, optional (default = None)
        Integer gold class labels of shape ``(batch_size, num_tokens)``.
    metadata : ``List[Dict[str, Any]]``, optional, (default = None)
        Per-instance metadata; the value stored under each entry's
        ``'words'`` key is copied into the output.

    Returns
    -------
    An output dictionary consisting of:
    logits : torch.FloatTensor
        Shape ``(batch_size, num_tokens, tag_vocab_size)``; unnormalised
        log probabilities of the tag classes.
    class_probabilities : torch.FloatTensor
        Shape ``(batch_size, num_tokens, tag_vocab_size)``; softmax of
        ``logits`` over the class dimension.
    loss : torch.FloatTensor, optional
        A scalar loss to be optimised. Only present when ``tags`` is given.
    words : list, optional
        The ``'words'`` entries of ``metadata``. Only present when
        ``metadata`` is given.
    """
    embedded = self.text_field_embedder(tokens)
    batch_size, sequence_length, _ = embedded.size()
    mask = get_text_field_mask(tokens)

    logits = self.tag_projection_layer(self.encoder(embedded, mask))

    # Softmax over a flattened (batch * tokens, classes) view, then restore
    # the (batch, tokens, classes) layout.
    flat_probabilities = F.softmax(logits.view(-1, self.num_classes), dim=-1)
    class_probabilities = flat_probabilities.view(
        [batch_size, sequence_length, self.num_classes])

    output_dict = {"logits": logits, "class_probabilities": class_probabilities}

    if tags is not None:
        loss = sequence_cross_entropy_with_logits(logits, tags, mask)
        for metric in self.metrics.values():
            metric(logits, tags, mask.float())
        output_dict["loss"] = loss

    if metadata is not None:
        output_dict["words"] = [instance["words"] for instance in metadata]

    return output_dict
def test_sequence_cross_entropy_with_logits_averages_token_correctly(self):
    """
    The token-averaged loss must equal the per-sequence losses, each
    multiplied by that sequence's total weight, summed and divided by the
    total weight of the batch.
    """
    tensor = torch.rand([5, 7, 4])
    tensor[0, 3:, :] = 0
    tensor[1, 4:, :] = 0
    tensor[2, 2:, :] = 0
    tensor[3, :, :] = 0
    # A position is weighted 1 where its first logit is non-zero, 0 otherwise.
    weights = (tensor != 0.0)[:, :, 0].long().squeeze(-1)
    targets = torch.LongTensor(numpy.random.randint(0, 3, [5, 7]))
    targets *= weights

    loss = util.sequence_cross_entropy_with_logits(tensor, targets, weights,
                                                   average="token")

    # ``average=None`` requests the un-averaged per-sequence loss vector; this
    # is the same spelling the batch-average test uses, instead of mixing in
    # the legacy ``batch_average=False`` flag.
    vector_loss = util.sequence_cross_entropy_with_logits(tensor, targets, weights,
                                                          average=None)
    total_token_loss = (vector_loss * weights.float().sum(dim=-1)).sum()
    average_token_loss = (total_token_loss / weights.float().sum()).detach()

    # ``.item()`` extracts the Python number from a one-element tensor whether
    # it is 0-dimensional or of shape (1,); ``[0]`` fails on 0-dim tensors.
    assert_almost_equal(loss.detach().item(), average_token_loss.item())
def forward(self,
            sentence: Dict[str, torch.Tensor],
            labels: torch.Tensor = None) -> Dict[str, torch.Tensor]:
    """
    Produce tag logits for every token and, when labels are given, the loss.

    Parameters
    ----------
    sentence : Dict[str, torch.Tensor]
        Indexed text field tensors; passed to ``get_text_field_mask`` and to
        ``self.word_embeddings``.
    labels : torch.Tensor, optional (default = None)
        Gold tags. When present, ``self.accuracy`` is updated and a loss is
        added to the output.

    Returns
    -------
    A dictionary with ``"tag_logits"`` and, only when ``labels`` is given,
    ``"loss"``.
    """
    # The mask lets the encoder, metric and loss ignore padded positions.
    mask = get_text_field_mask(sentence)
    embeddings = self.word_embeddings(sentence)
    encoder_out = self.encoder(embeddings, mask)
    tag_logits = self.hidden2tag(encoder_out)
    output = {"tag_logits": tag_logits}
    if labels is not None:
        self.accuracy(tag_logits, labels, mask)
        output["loss"] = sequence_cross_entropy_with_logits(tag_logits, labels, mask)
    return output
def test_sequence_cross_entropy_with_logits_masks_loss_correctly(self):
    """
    Logits at masked positions must not influence the loss: a tensor holding
    arbitrary non-zero values there has to give the same loss as one holding
    zeros.
    """
    # (row, first masked timestep, junk value written into the masked region)
    masked_regions = ((0, 3, 2), (1, 4, 13), (2, 2, 234), (3, 0, 65))

    clean_logits = torch.rand([5, 7, 4])
    for row, start, _ in masked_regions:
        clean_logits[row, start:, :] = 0

    # A position is weighted 1 where its first logit is non-zero, 0 otherwise.
    weights = (clean_logits != 0.0)[:, :, 0].long().squeeze(-1)

    noisy_logits = clean_logits.clone()
    for row, start, junk in masked_regions:
        noisy_logits[row, start:, :] = junk

    targets = torch.LongTensor(numpy.random.randint(0, 3, [5, 7]))
    targets *= weights

    clean_loss = util.sequence_cross_entropy_with_logits(clean_logits, targets, weights)
    noisy_loss = util.sequence_cross_entropy_with_logits(noisy_logits, targets, weights)
    assert clean_loss.data.numpy() == noisy_loss.data.numpy()
def test_loss_is_computed_correctly(self):
    """
    ``self.model._get_loss`` must give the same loss as calling
    ``sequence_cross_entropy_with_logits`` on targets and mask with their
    first timestep removed.
    """
    batch_size = 5
    num_decoding_steps = 5
    num_classes = 10
    target_shape = (batch_size, num_decoding_steps)

    # Logits cover one step fewer than the targets.
    sample_logits = Variable(torch.randn(batch_size, num_decoding_steps-1, num_classes))
    random_targets = numpy.random.randint(0, num_classes, target_shape)
    sample_targets = Variable(torch.from_numpy(random_targets))
    # Mask should be either 0 or 1
    random_mask = numpy.random.randint(0, 2, target_shape)
    sample_mask = Variable(torch.from_numpy(random_mask))

    shifted_targets = sample_targets[:, 1:].contiguous()
    shifted_mask = sample_mask[:, 1:].contiguous()
    expected_loss = sequence_cross_entropy_with_logits(sample_logits,
                                                       shifted_targets,
                                                       shifted_mask)
    # pylint: disable=protected-access
    actual_loss = self.model._get_loss(sample_logits, sample_targets, sample_mask)
    assert numpy.equal(expected_loss.data.numpy(), actual_loss.data.numpy())
def test_sequence_cross_entropy_with_logits_smooths_labels_correctly(self):
    """
    With ``label_smoothing=0.1`` the loss must equal a hand-computed value in
    which the gold class gets weight 0.9 and a further 0.1 is spread evenly
    over all 4 classes, averaged over the 3 timesteps.
    """
    tensor = torch.rand([1, 3, 4])
    targets = torch.LongTensor(numpy.random.randint(0, 3, [1, 3]))

    # The weights must have the same (batch, sequence) shape as the targets;
    # a [2, 3] tensor only worked by accidental broadcasting over the batch.
    weights = torch.ones([1, 3])
    loss = util.sequence_cross_entropy_with_logits(tensor, targets, weights,
                                                   label_smoothing=0.1)

    correct_loss = 0.0
    for prediction, label in zip(tensor.squeeze(0), targets.squeeze(0)):
        prediction = torch.nn.functional.log_softmax(prediction, dim=-1)
        correct_loss += prediction[label] * 0.9
        # Smoothing mass: 0.1 shared equally between the 4 classes.
        correct_loss += prediction.sum() * 0.1/4
    # Average over sequence.
    correct_loss = - correct_loss / 3
    numpy.testing.assert_array_almost_equal(loss.data.numpy(), correct_loss.data.numpy())
def forward(self,
            sentence: Dict[str, torch.Tensor],
            labels: torch.Tensor = None) -> Dict[str, torch.Tensor]:
    """
    Produce tag logits for every token and, when labels are given, the loss.

    Parameters
    ----------
    sentence : Dict[str, torch.Tensor]
        Indexed text field tensors (each sentence a sequence of token ids);
        passed to ``get_text_field_mask`` and to ``self.word_embeddings``.
    labels : torch.Tensor, optional (default = None)
        Gold tags. Optional so the model can be run on unlabeled data.

    Returns
    -------
    A dictionary with ``"tag_logits"`` and, only when ``labels`` is given,
    ``"loss"``.
    """
    # Inputs are batched and shorter sequences are padded to a uniform shape,
    # so later computations need a mask to exclude the padding.
    # ``get_text_field_mask`` returns a tensor of 0s and 1s corresponding to
    # the padded and unpadded locations.
    mask = get_text_field_mask(sentence)

    # Convert each sentence into a sequence of embedded tensors.
    embeddings = self.word_embeddings(sentence)

    # Encode the embedded tensors (with the mask) into a sequence of outputs.
    encoder_out = self.encoder(embeddings, mask)

    # Project each encoded output to logits over the tags.
    tag_logits = self.hidden2tag(encoder_out)
    output = {"tag_logits": tag_logits}

    # With labels available, update the accuracy metric and compute the
    # "loss" entry of the output.
    if labels is not None:
        self.accuracy(tag_logits, labels, mask)
        output["loss"] = sequence_cross_entropy_with_logits(tag_logits, labels, mask)

    return output
def _get_loss(logits: torch.LongTensor,
              targets: torch.LongTensor,
              target_mask: torch.LongTensor) -> torch.Tensor:
    """
    Compute the masked sequence cross-entropy loss for decoder outputs.

    ``logits`` holds the unnormalized decoder outputs with size
    (batch_size, num_decoding_steps, num_classes). ``targets`` and
    ``target_mask`` both have size (batch_size, num_decoding_steps + 1): they
    are one step longer because no output is needed for the last timestep of
    ``targets``.

    The logit at timestep i is trained to match the target token at timestep
    i + 1, so the first column of the targets and of the mask is removed
    before the loss is computed.

    Example (3 target words, padded to 7 tokens)::

        targets:  <S> w1  w2  w3  <E> <P> <P>
        mask:      1   1   1   1   1   0   0
        logits:   l1  l2  l3  l4  l5  l6

    After alignment the comparison is::

        targets:  w1  w2  w3  <E> <P> <P>
        mask:      1   1   1   1   0   0
        logits:   l1  l2  l3  l4  l5  l6
        (inputs)  <S> w1  w2  w3  <E> <P>
    """
    # Drop the first timestep of both tensors.
    # shape after slicing: (batch_size, num_decoding_steps)
    shifted_targets = targets[:, 1:].contiguous()
    shifted_mask = target_mask[:, 1:].contiguous()

    loss = util.sequence_cross_entropy_with_logits(logits, shifted_targets, shifted_mask)
    return loss
def forward(self,  # type: ignore
            tokens: Dict[str, torch.LongTensor],
            verb_indicator: torch.LongTensor,
            tags: torch.LongTensor = None) -> Dict[str, torch.Tensor]:
    # pylint: disable=arguments-differ
    """
    Predict a tag distribution for every token given the verb indicator and,
    if gold tags are given, compute the loss and update the span metric.

    Parameters
    ----------
    tokens : Dict[str, torch.LongTensor], required
        The output of ``TextField.as_array()``. It is handed unchanged to
        ``self.text_field_embedder`` and to ``get_text_field_mask``. With a
        ``SingleIdTokenIndexer`` it looks like
        ``{"tokens": Tensor(batch_size, num_tokens)}``.
    verb_indicator: torch.LongTensor, required.
        An integer ``SequenceFeatureField`` representation of the position of
        the verb in the sentence, of shape (batch_size, num_tokens). It can be
        all zeros when the sentence has no verbal predicate.
    tags : torch.LongTensor, optional (default = None)
        Integer gold class labels of shape ``(batch_size, num_tokens)``.

    Returns
    -------
    An output dictionary consisting of:
    logits : torch.FloatTensor
        Shape ``(batch_size, num_tokens, tag_vocab_size)``; unnormalised
        log probabilities of the tag classes.
    class_probabilities : torch.FloatTensor
        Shape ``(batch_size, num_tokens, tag_vocab_size)``; softmax of
        ``logits`` over the class dimension.
    encoded_text : torch.FloatTensor
        The output of ``self.stacked_encoder``.
    mask : torch.LongTensor
        The padding mask computed from ``tokens``.
    loss : torch.FloatTensor, optional
        A scalar loss to be optimised. Only present when ``tags`` is given.

    Raises
    ------
    ConfigurationError
        If the encoder's input dimension differs from the last dimension of
        the embedded text concatenated with the verb-indicator embedding.
    """
    embedded_text_input = self.embedding_dropout(self.text_field_embedder(tokens))
    mask = get_text_field_mask(tokens)
    embedded_verb_indicator = self.binary_feature_embedding(verb_indicator.long())
    # Concatenate the verb feature onto the embedded text. This now
    # has shape (batch_size, sequence_length, embedding_dim + binary_feature_dim).
    embedded_text_with_verb_indicator = torch.cat(
        [embedded_text_input, embedded_verb_indicator], -1)
    batch_size, sequence_length, embedding_dim_with_binary_feature = \
        embedded_text_with_verb_indicator.size()

    if self.stacked_encoder.get_input_dim() != embedding_dim_with_binary_feature:
        raise ConfigurationError(
            "The SRL model uses an indicator feature, which makes "
            "the embedding dimension one larger than the value "
            "specified. Therefore, the 'input_dim' of the stacked_encoder "
            "must be equal to total_embedding_dim + 1.")

    encoded_text = self.stacked_encoder(embedded_text_with_verb_indicator, mask)

    logits = self.tag_projection_layer(encoded_text)
    reshaped_log_probs = logits.view(-1, self.num_classes)
    # Name the class dimension explicitly. Relying on softmax's implicit
    # dimension is deprecated; for this 2-D view the result is unchanged.
    class_probabilities = F.softmax(reshaped_log_probs, dim=-1).view(
        [batch_size, sequence_length, self.num_classes])

    output_dict = {
        "logits": logits,
        "class_probabilities": class_probabilities,
        "encoded_text": encoded_text,
    }
    if tags is not None:
        loss = sequence_cross_entropy_with_logits(logits, tags, mask)
        self.span_metric(class_probabilities, tags, mask)
        output_dict["loss"] = loss

    # We need to retain the mask in the output dictionary
    # so that we can crop the sequences to remove padding
    # when we do viterbi inference in self.decode.
    output_dict["mask"] = mask
    return output_dict
def calculate_perplexity(batch_size=1, gpu_id=0, decoder_path='decoder.pth'):
    """
    Print and return the test perplexity of a saved ``BertGPT`` checkpoint.

    The weights at ``decoder_path`` are loaded onto ``cuda:{gpu_id}`` and the
    dataset stored in ``validate_data.pth`` is iterated without shuffling.
    For every batch the token-averaged sequence cross entropy is computed
    between each decoder logit and the *next* decoder input token. The
    reported figure is the mean of the per-batch ``exp(loss)`` values (not
    ``exp`` of the mean loss).

    Parameters
    ----------
    batch_size : int
        Batch size handed to the ``DataLoader``.
    gpu_id : int
        Index of the CUDA device used for the checkpoint, model and batches.
    decoder_path : str
        Path of the saved state dict.

    Returns
    -------
    The mean per-batch perplexity that is also printed.

    Raises
    ------
    ZeroDivisionError
        If the data loader yields no batches.
    """
    # make sure your model is on GPU
    device = torch.device(f"cuda:{gpu_id}")

    # ------------------------LOAD MODEL-----------------
    print('load the model....')
    model = BertGPT()
    # map_location puts the checkpoint on the requested device even if it was
    # saved from a different one (e.g. another GPU index).
    model.load_state_dict(torch.load(decoder_path, map_location=device))
    print(f'load from {decoder_path}')
    model = model.to(device)
    model.eval()
    print('load success')
    # ------------------------END LOAD MODEL--------------

    # ------------------------LOAD VAL DATA------------------
    test_data = torch.load("validate_data.pth")
    test_dataset = MyDataset(*test_data)
    test_dataloader = DataLoader(dataset=test_dataset,
                                 shuffle=False,
                                 batch_size=batch_size,
                                 num_workers=2,
                                 collate_fn=collate_fn)
    # ------------------------END LOAD VAL DATA--------------

    # ------------------------START VAL-------------------
    perplexity = 0
    batch_count = 0
    print('start calculate the test perplexity....')

    with torch.no_grad():
        for batch in tqdm(test_dataloader):
            batch = [item.to(device) for item in batch]
            encoder_input, decoder_input, mask_encoder_input, mask_decoder_input = batch

            logits = model(encoder_input, mask_encoder_input,
                           decoder_input, mask_decoder_input)

            # Logit at position i is scored against the token at i + 1, so
            # drop the last logit and the first target/mask column.
            out = logits[:, :-1].contiguous()
            target = decoder_input[:, 1:].contiguous()
            target_mask = mask_decoder_input[:, 1:].contiguous()

            loss = util.sequence_cross_entropy_with_logits(out, target, target_mask,
                                                           average="token")
            perplexity += np.exp(loss.item())
            batch_count += 1

    mean_perplexity = perplexity / batch_count
    print(f'test perplexity: {mean_perplexity}')
    return mean_perplexity
def forward(self,  # type: ignore
            tokens: Dict[str, torch.LongTensor],
            verb_indicator: torch.LongTensor,
            tags: torch.LongTensor = None,
            metadata: List[Dict[str, Any]] = None) -> Dict[str, torch.Tensor]:
    # pylint: disable=arguments-differ
    """
    Predict a tag distribution for every token given the verb indicator and,
    if gold tags are given, compute the loss.

    Parameters
    ----------
    tokens : Dict[str, torch.LongTensor], required
        The output of ``TextField.as_array()``. It is handed unchanged to
        ``self.text_field_embedder`` and to ``get_text_field_mask``. With a
        ``SingleIdTokenIndexer`` it looks like
        ``{"tokens": Tensor(batch_size, num_tokens)}``.
    verb_indicator: torch.LongTensor, required.
        An integer ``SequenceFeatureField`` representation of the position of
        the verb in the sentence, of shape (batch_size, num_tokens). It can be
        all zeros when the sentence has no verbal predicate.
    tags : torch.LongTensor, optional (default = None)
        Integer gold class labels of shape ``(batch_size, num_tokens)``.
    metadata : ``List[Dict[str, Any]]``, optional, (default = None)
        Per-instance metadata holding the original words of the sentence and
        the verb to compute the frame for, under the 'words' and 'verb' keys.

    Returns
    -------
    An output dictionary consisting of:
    logits : torch.FloatTensor
        Shape ``(batch_size, num_tokens, tag_vocab_size)``; unnormalised
        log probabilities of the tag classes.
    class_probabilities : torch.FloatTensor
        Shape ``(batch_size, num_tokens, tag_vocab_size)``; softmax of
        ``logits`` over the class dimension.
    mask : torch.LongTensor
        The padding mask computed from ``tokens``.
    loss : torch.FloatTensor, optional
        A scalar loss to be optimised. Only present when ``tags`` is given.
    words, verb : list, optional
        The 'words' and 'verb' entries of ``metadata``. Only present when
        ``metadata`` is given.
    """
    embedded_text_input = self.embedding_dropout(self.text_field_embedder(tokens))
    mask = get_text_field_mask(tokens)
    embedded_verb_indicator = self.binary_feature_embedding(verb_indicator.long())
    # Concatenate the verb feature onto the embedded text. This now
    # has shape (batch_size, sequence_length, embedding_dim + binary_feature_dim).
    embedded_text_with_verb_indicator = torch.cat(
        [embedded_text_input, embedded_verb_indicator], -1)
    batch_size, sequence_length, _ = embedded_text_with_verb_indicator.size()

    encoded_text = self.encoder(embedded_text_with_verb_indicator, mask)

    logits = self.tag_projection_layer(encoded_text)
    reshaped_log_probs = logits.view(-1, self.num_classes)
    class_probabilities = F.softmax(reshaped_log_probs, dim=-1).view(
        [batch_size, sequence_length, self.num_classes])

    output_dict = {"logits": logits, "class_probabilities": class_probabilities}
    if tags is not None:
        loss = sequence_cross_entropy_with_logits(logits,
                                                  tags,
                                                  mask,
                                                  label_smoothing=self._label_smoothing)
        if not self.ignore_span_metric:
            self.span_metric(class_probabilities, tags, mask)
        output_dict["loss"] = loss

    # We need to retain the mask in the output dictionary
    # so that we can crop the sequences to remove padding
    # when we do viterbi inference in self.decode.
    output_dict["mask"] = mask

    # Only touch the metadata after the None check: iterating it
    # unconditionally raised TypeError for the default ``metadata=None``.
    if metadata is not None:
        output_dict["words"] = [x["words"] for x in metadata]
        output_dict["verb"] = [x["verb"] for x in metadata]
    return output_dict
def forward(self,  # type: ignore
            tokens: Dict[str, torch.LongTensor],
            verb_span: torch.LongTensor,
            entity_span: torch.LongTensor,
            state_change_type_labels: torch.LongTensor = None,
            state_change_tags: torch.LongTensor = None) -> Dict[str, torch.Tensor]:
    # pylint: disable=arguments-differ
    """
    Predict one state change type per datapoint and one tag per token, and
    compute a loss from whichever gold labels are supplied.

    Parameters
    ----------
    tokens : Dict[str, torch.LongTensor], required
        The output of ``TextField.as_array()``. It is handed unchanged to
        ``self.text_field_embedder`` and to ``get_text_field_mask``. With a
        ``SingleIdTokenIndexer`` it looks like
        ``{"tokens": Tensor(batch_size, num_tokens)}``.
    verb_span: torch.LongTensor, required.
        An integer ``SequenceLabelField`` representation of the position of
        the focus verb, of shape (batch_size, num_tokens). It can be all
        zeros when pre-processing could not extract a verbal predicate.
    entity_span: torch.LongTensor, required.
        An integer ``SequenceLabelField`` representation of the position of
        the focus entity, of shape (batch_size, num_tokens).
    state_change_type_labels: torch.LongTensor, optional (default = None)
        Gold state change type labels. They are flattened with ``view(-1)``
        for the loss and squeezed on the last dimension for the metrics,
        presumably from shape ``(batch_size, 1)`` (to be confirmed).
    state_change_tags : torch.LongTensor, optional (default = None)
        Integer gold tags of shape ``(batch_size, num_tokens)``.

    Returns
    -------
    An output dictionary consisting of:
    type_probs : torch.FloatTensor
        Softmax over the state change type logits, one distribution per
        datapoint.
    tags_class_probabilities : torch.FloatTensor, optional
        Shape ``(batch_size, num_tokens, num_tags)``; a distribution over
        tags per token. Only present when ``state_change_tags`` is given.
    loss : torch.FloatTensor, optional
        The sum of the losses that could be computed from the supplied
        labels. Absent when neither label tensor is given.
    """
    # Layer 1 = Word + Character embedding layer
    embedded_sentence = self.text_field_embedder(tokens)
    mask = get_text_field_mask(tokens).float()

    # Float views of the span indicators, computed once and reused below.
    verb_span_float = verb_span.float()
    entity_span_float = entity_span.float()
    verb_feature = verb_span_float.unsqueeze(-1)
    entity_feature = entity_span_float.unsqueeze(-1)

    # Layer 2 = Add positional bit to encode position of focus verb and entity
    embedded_sentence_verb_entity = torch.cat(
        [embedded_sentence, verb_feature, entity_feature], dim=-1)

    # Layer 3 = Contextual embedding layer using Bi-LSTM over the sentence
    contextual_embedding = self.seq2seq_encoder(embedded_sentence_verb_entity, mask)

    # Layer 4: Attention (Contextual embedding, BOW(verb span))
    # The 1e-13 keeps the normalisation finite for all-zero spans.
    verb_weight_matrix = verb_span_float / (verb_span_float.sum(-1).unsqueeze(-1) + 1e-13)
    verb_vector = weighted_sum(contextual_embedding * verb_feature, verb_weight_matrix)

    entity_weight_matrix = entity_span_float / (entity_span_float.sum(-1).unsqueeze(-1) + 1e-13)
    entity_vector = weighted_sum(contextual_embedding * entity_feature, entity_weight_matrix)

    verb_entity_vector = torch.cat([verb_vector, entity_vector], 1)

    batch_size, sequence_length = verb_span.size()

    # attention weights for type prediction
    attention_weights_types = self.attention_layer(verb_entity_vector, contextual_embedding)
    attention_output_vector = weighted_sum(contextual_embedding, attention_weights_types)

    # contextual embedding + positional vectors for tag prediction
    context_positional_tags = torch.cat(
        [contextual_embedding, verb_feature, entity_feature], dim=-1)

    # Layer 5 = Dense softmax layer to pick one state change type per datapoint,
    # and one tag per word in the sentence
    type_logits = self.aggregate_feedforward(attention_output_vector)
    type_probs = torch.nn.functional.softmax(type_logits, dim=-1)

    tags_logits = self.tag_projection_layer(context_positional_tags)
    reshaped_log_probs = tags_logits.view(-1, self.num_tags)
    tags_class_probabilities = F.softmax(reshaped_log_probs, dim=-1).view(
        [batch_size, sequence_length, self.num_tags])

    # Create output dictionary for the trainer
    # Compute loss and epoch metrics
    output_dict = {'type_probs': type_probs}
    losses = []

    if state_change_type_labels is not None:
        losses.append(self._loss(type_logits, state_change_type_labels.long().view(-1)))
        for type_label in self.type_labels_vocab.values():
            metric = self.type_f1_metrics["type_" + type_label]
            metric(type_probs, state_change_type_labels.squeeze(-1))
        self._type_accuracy(type_probs, state_change_type_labels.squeeze(-1))

    if state_change_tags is not None:
        losses.append(sequence_cross_entropy_with_logits(tags_logits, state_change_tags, mask))
        self.span_metric(tags_class_probabilities, state_change_tags, mask)
        output_dict["tags_class_probabilities"] = tags_class_probabilities

    # Sum only the losses that exist. Adding both unconditionally failed on
    # an unbound local whenever a label tensor was omitted (e.g. at
    # prediction time).
    if losses:
        total_loss = losses[0]
        for extra_loss in losses[1:]:
            total_loss = total_loss + extra_loss
        output_dict['loss'] = total_loss

    return output_dict
def forward(  # type: ignore
    self,
    tokens: TextFieldTensors,
    verb_indicator: torch.Tensor,
    frame_indicator: torch.Tensor,
    metadata: List[Any],
    tags: torch.LongTensor = None,
    frame_tags: torch.LongTensor = None,
):
    """
    Predict role tags for every wordpiece and a frame for every frame
    position, and compute both losses when gold tags are given.

    # Parameters

    tokens : `TextFieldTensors`, required
        The output of `TextField.as_array()`. For this model, this must be a
        `SingleIdTokenIndexer` which indexes wordpieces from the BERT
        vocabulary.
    verb_indicator: `torch.LongTensor`, required.
        An integer `SequenceFeatureField` representation of the position of
        the verb in the sentence, of shape (batch_size, num_tokens). It can be
        all zeros when the sentence has no verbal predicate. It is passed to
        the transformer as `token_type_ids`.
    frame_indicator: `torch.LongTensor`, required.
        An integer `SequenceFeatureField` representation of the position of
        the frame in the sentence, of shape (batch_size, num_tokens).
        Positions equal to 1 select the embeddings (and gold frame tags) used
        for frame prediction.
    metadata : `List[Dict[str, Any]]`, required
        Per-instance metadata. The 'words', 'verb', 'offsets' and 'lemmas'
        keys are always read; 'verb_index' and 'gold_tags' are read when the
        span metric is computed.
    tags : `torch.LongTensor`, optional (default = `None`)
        Integer gold class labels of shape `(batch_size, num_tokens)`.
    frame_tags : `torch.LongTensor`, optional (default = `None`)
        Gold frames of shape `(batch_size, num_tokens)`. Indexed whenever
        `tags` is given.

    # Returns

    An output dictionary consisting of:
    logits : `torch.FloatTensor`
        Shape `(batch_size, num_tokens, tag_vocab_size)`; unnormalised log
        probabilities of the tag classes.
    frame_logits, frame_probabilities : `torch.FloatTensor`
        Frame scores and their softmax, one row per frame position.
    class_probabilities : `torch.FloatTensor`
        Shape `(batch_size, num_tokens, tag_vocab_size)`; softmax of `logits`
        over the class dimension.
    mask, words, lemma, verb, wordpiece_offsets
        The padding mask and the values collected from `metadata`.
    frame_loss, role_loss, loss : `torch.FloatTensor`, optional
        Only present when `tags` is given; `loss` is the mean of the other
        two.
    """
    mask = get_text_field_mask(tokens)
    wordpiece_ids = util.get_token_ids_from_text_field_tensors(tokens)
    bert_embeddings, _ = self.transformer(
        input_ids=wordpiece_ids,
        token_type_ids=verb_indicator,
        attention_mask=mask,
        return_dict=False,
    )

    # extract embeddings
    embedded_text_input = self.embedding_dropout(bert_embeddings)
    frame_embeddings = embedded_text_input[frame_indicator == 1]
    # get sizes
    batch_size, sequence_length, _ = embedded_text_input.size()

    # outputs
    logits = self.tag_projection_layer(embedded_text_input)
    frame_logits = self.frame_projection_layer(frame_embeddings)

    flat_class_probabilities = F.softmax(logits.view(-1, self.num_classes), dim=-1)
    class_probabilities = flat_class_probabilities.view(
        [batch_size, sequence_length, self.num_classes])
    frame_probabilities = F.softmax(frame_logits, dim=-1)

    # We need to retain the mask in the output dictionary
    # so that we can crop the sequences to remove padding
    # when we do viterbi inference in self.make_output_human_readable.
    output_dict = {
        "logits": logits,
        "frame_logits": frame_logits,
        "class_probabilities": class_probabilities,
        "frame_probabilities": frame_probabilities,
        "mask": mask,
    }

    # We add in the offsets here so we can compute the un-wordpieced tags.
    words, verbs, offsets = zip(
        *[(entry["words"], entry["verb"], entry["offsets"]) for entry in metadata])
    # Lemmas are flattened across all instances into a single list.
    lemmas = [lemma for entry in metadata for lemma in entry["lemmas"]]
    output_dict["words"] = list(words)
    output_dict["lemma"] = list(lemmas)
    output_dict["verb"] = list(verbs)
    output_dict["wordpiece_offsets"] = list(offsets)

    # Without gold tags there is nothing further to compute.
    if tags is None:
        return output_dict

    # compute role loss
    role_loss = sequence_cross_entropy_with_logits(
        logits, tags, mask, label_smoothing=self._label_smoothing)
    # compute frame loss
    frame_tags_filtered = frame_tags[frame_indicator == 1]
    frame_loss = self.frame_criterion(frame_logits, frame_tags_filtered)

    if not self.ignore_span_metric and self.span_metric is not None and not self.training:
        batch_verb_indices = [entry["verb_index"] for entry in metadata]
        batch_sentences = [entry["words"] for entry in metadata]
        # Get the BIO tags from make_output_human_readable()
        readable_output = self.make_output_human_readable(output_dict)
        batch_bio_predicted_tags = readable_output.pop("tags")
        from allennlp_models.structured_prediction.models.srl import (
            convert_bio_tags_to_conll_format,
        )

        batch_conll_predicted_tags = [
            convert_bio_tags_to_conll_format(bio_tags)
            for bio_tags in batch_bio_predicted_tags
        ]
        batch_bio_gold_tags = [entry["gold_tags"] for entry in metadata]
        batch_conll_gold_tags = [
            convert_bio_tags_to_conll_format(bio_tags)
            for bio_tags in batch_bio_gold_tags
        ]
        self.span_metric(
            batch_verb_indices,
            batch_sentences,
            batch_conll_predicted_tags,
            batch_conll_gold_tags,
        )

    self.f1_frame_metric(frame_logits, frame_tags_filtered)
    output_dict["frame_loss"] = frame_loss
    output_dict["role_loss"] = role_loss
    output_dict["loss"] = (role_loss + frame_loss) / 2
    return output_dict
def _mean_batch_perplexity(model, dataloader, device):
    """Return the mean of the per-batch perplexities over ``dataloader``.

    Each batch is unpacked as ``(encoder_input, decoder_input,
    mask_encoder_input, mask_decoder_input)``.  The decoder logits at step
    ``i`` are scored against the decoder input token at step ``i + 1``, using
    a token-averaged masked cross entropy.

    NOTE(review): this averages ``exp(loss)`` per batch rather than
    exponentiating the corpus-level mean loss; that matches the original
    behaviour but is not the usual corpus perplexity -- confirm it is intended.

    Returns ``float('nan')`` when the dataloader yields no batches.
    """
    perplexity = 0
    batch_count = 0
    with torch.no_grad():
        for batch in dataloader:
            batch = [item.to(device) for item in batch]
            encoder_input, decoder_input, mask_encoder_input, mask_decoder_input = batch

            _, past = model.encoder(input_ids=encoder_input,
                                    attention_mask=mask_encoder_input)

            # The decoder attends over the encoder positions followed by its own.
            mask = torch.cat([mask_encoder_input, mask_decoder_input], dim=1)
            logits, _ = model.decoder(decoder_input, attention_mask=mask, past=list(past))

            # Shift by one: the logit at step i predicts the token at step i + 1.
            out = logits[:, :-1].contiguous()
            target = decoder_input[:, 1:].contiguous()
            target_mask = mask_decoder_input[:, 1:].contiguous()

            loss = util.sequence_cross_entropy_with_logits(out, target, target_mask,
                                                           average="token")
            perplexity += np.exp(loss.item())
            batch_count += 1

    if batch_count == 0:
        # Avoid ZeroDivisionError on an empty split.
        return float("nan")
    return perplexity / batch_count


def calculate_perplexity(
        batch_size=1,
        gpu_id=0,
        model_path='/content/GPT CheckPoints/model-9.pth',
        train_data_path="/content/train_data.pth",
        val_data_path="/content/validate_data.pth",
        test_data_path="/content/test_data.pth",
):
    """Print (and return) the perplexity of a saved model on train/validate/test data.

    Parameters
    ----------
    batch_size : int
        Batch size used for all three dataloaders.
    gpu_id : int
        Index of the CUDA device the model and batches are placed on.
    model_path : str
        Path of the ``state_dict`` checkpoint to load.
    train_data_path, val_data_path, test_data_path : str
        Paths of the ``torch.save``-d tensor tuples wrapped in a
        ``TensorDataset``.  The defaults are the paths that used to be
        hard-coded.

    Returns
    -------
    dict
        ``{"train": ..., "validate": ..., "test": ...}`` with the value that is
        printed for each split.
    """
    # make sure your model is on GPU
    device = torch.device(f"cuda:{gpu_id}")

    # ------------------------LOAD MODEL-----------------
    print('load the model....')
    model = EncoderDecoderModel.from_encoder_decoder_pretrained("gpt2", "gpt2", use_cache=False)
    # Map the checkpoint straight onto the requested device; a bare 'cuda'
    # would first materialise it on the current default GPU.
    model.load_state_dict(torch.load(model_path, map_location=device))
    model = model.to(device)
    model.eval()
    print('load success')
    # ------------------------END LOAD MODEL--------------

    # ------------------------LOAD DATA------------------
    # All splits are loaded up front so a missing file fails before any
    # evaluation work is done.  Order here is the reporting order.
    split_paths = [
        ("train", train_data_path),
        ("validate", val_data_path),
        ("test", test_data_path),
    ]
    dataloaders = []
    for split_name, data_path in split_paths:
        dataset = TensorDataset(*torch.load(data_path))
        loader = DataLoader(dataset=dataset, shuffle=False, batch_size=batch_size)
        dataloaders.append((split_name, loader))
    # ------------------------END LOAD DATA--------------

    # ------------------------START EVAL-------------------
    results = {}
    for split_name, loader in dataloaders:
        print(f'start calculate the {split_name} perplexity....')
        results[split_name] = _mean_batch_perplexity(model, loader, device)
        print(f'{split_name} perplexity: {results[split_name]}')
    return results
def _forward_loop(self,
                  state: Dict[str, torch.Tensor],
                  targets: Dict[str, torch.Tensor],
                  labels: torch.Tensor) -> Dict[str, torch.Tensor]:
    """Compute the reconstruction and claim-scoring losses with teacher forcing.

    At every step the *gold* target token is fed to the decoder (this is not
    greedy decoding), and the output logits and attention logits returned by
    ``self._prepare_output_projections`` are collected.

    Parameters
    ----------
    state : Dict[str, torch.Tensor]
        Decoder state; it is threaded through ``_prepare_output_projections``
        and must contain ``'claim_mask'`` after the last step.
    targets : Dict[str, torch.Tensor]
        Text-field tensors; ``targets['tokens']`` holds the target token ids.
    labels : torch.Tensor
        Per-claim labels, tiled across decoding steps before scoring.
        Presumably of shape (batch_size, num_claims) -- TODO confirm.

    Returns
    -------
    Dict[str, torch.Tensor]
        ``loss`` (reconstruction + ``self.beta`` * claim scoring),
        ``reconstruction_loss``, ``claim_scoring_loss`` and
        ``attention_logits``.
    """
    target_tokens = targets['tokens']
    # The last target token is never fed as input, hence one step fewer.
    num_decoding_steps = target_tokens.shape[1] - 1

    # Teacher-forced decoding phase.
    output_logit_list = []
    attention_logit_list = []
    for timestep in range(num_decoding_steps):
        # Feed the gold target token for this step as decoder input.
        decoder_input = target_tokens[:, timestep]
        output_logits, attention_logits, state = self._prepare_output_projections(
            decoder_input, state)

        # Keep a step dimension so the lists can be concatenated along dim 1.
        output_logit_list.append(output_logits.unsqueeze(1))
        attention_logit_list.append(attention_logits.unsqueeze(1))

    # Reconstruction loss: the logits of step i are compared with token i + 1.
    output_logit_tensor = torch.cat(output_logit_list, dim=1)
    relevant_target_tokens = target_tokens[:, 1:].contiguous()
    target_mask = util.get_text_field_mask(targets)[:, 1:].contiguous()
    reconstruction_loss = util.sequence_cross_entropy_with_logits(output_logit_tensor,
                                                                  relevant_target_tokens,
                                                                  target_mask)

    # Claim scoring loss. A loss is computed between **each** attention vector
    # and the true label. For that to work we need to:
    #   a. Tile the source labels (so that they are copied for each word).
    #   b. Mask out padding tokens - this requires taking the outer product of
    #      the target mask and the claim mask.
    attention_logit_tensor = torch.cat(attention_logit_list, dim=1)
    claim_level_mask = (state['claim_mask'].sum(-1) > 0).long()
    attention_mask = target_mask.unsqueeze(-1) * claim_level_mask.unsqueeze(1)
    labels = labels.unsqueeze(1).repeat(1, num_decoding_steps, 1).float()
    claim_scoring_loss = F.binary_cross_entropy_with_logits(attention_logit_tensor,
                                                            labels,
                                                            reduction='none')
    claim_scoring_loss = claim_scoring_loss * attention_mask.float()  # Apply mask

    # Apply the same 'batch' reduction as ``sequence_cross_entropy_with_logits``:
    # average over the innermost dimension, then over the next one, and so on.
    # After each reduction the denominator becomes "does this slice contain any
    # unmasked element", so fully padded slices do not count at the next level.
    denom = attention_mask
    for _ in range(3):
        denom = denom.sum(-1)
        claim_scoring_loss = claim_scoring_loss.sum(-1) / (denom.float() + 1e-13)
        denom = (denom > 0)

    total_loss = reconstruction_loss + self.beta * claim_scoring_loss

    # Update metrics
    self.avg_reconstruction_loss(reconstruction_loss)
    self.avg_claim_scoring_loss(claim_scoring_loss)

    output_dict = {
        "loss": total_loss,
        "reconstruction_loss": reconstruction_loss,
        "claim_scoring_loss": claim_scoring_loss,
        "attention_logits": attention_logit_tensor
    }
    return output_dict
def forward(self,  # type: ignore
            label_indices: torch.LongTensor,
            token_representations: torch.FloatTensor = None,
            raw_tokens: List[List[str]] = None,
            labels: torch.LongTensor = None,
            **kwargs) -> Dict[str, torch.Tensor]:
    """
    Predict a label for each selected token of each sentence in the batch.

    If ``token_representations`` is provided, ``raw_tokens`` is not required.
    If ``token_representations`` is ``None``, then ``raw_tokens`` is required
    and the model's own contextualizer is used to produce the representations.

    Parameters
    ----------
    label_indices : torch.LongTensor
        A tensor of shape (batch_size, max_num_adpositions) with the indices
        of the tokens to predict a label for, for each element (sentence) in
        the batch. Padding positions are expected to hold -1. The tensor is
        cast to ``long`` here.
    token_representations : torch.FloatTensor, optional (default = None)
        A tensor of shape (batch_size, sequence_length, representation_dim)
        with the representation of each token. If None, the contextualizer
        within this model is used to produce the token representations.
    raw_tokens : List[List[str]], optional (default = None)
        A batch of lists with the raw token strings. Only used to compute
        ``token_representations`` when those are not given.
    labels : torch.LongTensor, optional (default = None)
        A torch tensor representing the sequence of integer gold class labels
        of shape ``(batch_size, num_label_indices)``.

    Returns
    -------
    An output dictionary consisting of:
    logits : torch.FloatTensor
        A tensor of shape ``(batch_size, num_label_indices, num_classes)``
        representing unnormalized log probabilities of the classes.
    class_probabilities : torch.FloatTensor
        A tensor of shape ``(batch_size, num_label_indices, num_classes)``
        representing a distribution of the tag classes.
    loss : torch.FloatTensor, optional
        The loss to be optimized; only present when ``labels`` is given.

    Raises
    ------
    ValueError
        If ``label_indices`` is ``None``, or if ``raw_tokens`` is ``None``
        while the representations must be computed by the model.
    ConfigurationError
        If ``token_representations`` is ``None`` and the model has no
        contextualizer.
    """
    # Validate before the cast below: previously this check sat after
    # ``label_indices.long()`` and therefore could never fire.
    if label_indices is None:
        raise ValueError("label_indices is None --- it is required to select "
                         "the tokens to predict labels for.")

    # Convert to LongTensor
    # TODO: add PR to ArrayField to preserve array types.
    label_indices = label_indices.long()

    if token_representations is None:
        if self._contextualizer is None:
            raise ConfigurationError(
                "token_representation not provided as input to the model, and no "
                "contextualizer was specified. Either add a contextualizer to your "
                "dataset reader (preferred if your contextualizer is frozen) or to "
                "this model (if you wish to train your contextualizer).")
        if raw_tokens is None:
            raise ValueError(
                "Input raw_tokens is ``None`` --- make sure to set "
                "include_raw_tokens in the DatasetReader to True.")
        # Convert contextualizer output into a tensor
        # Shape: (batch_size, max_seq_len, representation_dim)
        token_representations, _ = pad_contextualizer_output(
            self._contextualizer(raw_tokens))

    # Move token representation to the same device as the
    # module (CPU or CUDA). TODO(nfliu): This only works if the module
    # is on one device.
    device = next(self._decoder._linear_layers[0].parameters()).device
    token_representations = token_representations.to(device)
    text_mask = get_text_mask_from_representations(token_representations)
    text_mask = text_mask.to(device)
    label_mask = self._get_label_mask_from_label_indices(label_indices)
    label_mask = label_mask.to(device)

    # Mask out the -1 padding in the label_indices, since that doesn't
    # work with indexing. Note that we can't 0 pad because 0 is actually
    # a valid label index, so we pad with -1 just for the purposes of
    # proper mask calculation and then convert to 0-padding by applying
    # the mask.
    label_indices = label_indices * label_mask

    # Encode the token representation.
    encoded_token_representations = self._encoder(token_representations, text_mask)

    batch_size = label_indices.size(0)
    # Index into the encoded_token_representations to get tensors corresponding
    # to the representations of the tokens to predict labels for.
    # Shape: (batch_size, num_label_indices, representation_dim)
    range_vector = get_range_vector(
        batch_size, get_device_of(label_indices)).unsqueeze(1)
    selected_token_representations = encoded_token_representations[
        range_vector, label_indices]
    selected_token_representations = selected_token_representations.contiguous()

    # Decode out a label from the token representation
    # Shape: (batch_size, num_label_indices, num_classes)
    logits = self._decoder(selected_token_representations)
    class_probabilities = F.softmax(logits, dim=-1)

    output_dict = {
        "logits": logits,
        "class_probabilities": class_probabilities
    }

    if labels is not None:
        loss = sequence_cross_entropy_with_logits(
            logits, labels, label_mask, average=self.loss_average)
        for name, metric in self.metrics.items():
            # When not running in error analysis mode, skip
            # metrics that start with "_"
            if not self.error_analysis and name.startswith("_"):
                continue
            metric(logits, labels, label_mask.float())
        output_dict["loss"] = loss
    return output_dict
def forward(self,  # type: ignore
            tokens: Dict[str, torch.Tensor],
            verb_indicator: torch.Tensor,
            metadata: List[Any],
            tags: torch.LongTensor = None):
    # pylint: disable=arguments-differ
    """
    Run the BERT encoder and tag projection, optionally computing loss and span metric.

    Parameters
    ----------
    tokens : Dict[str, torch.LongTensor], required
        The output of ``TextField.as_array()``. ``tokens["tokens"]`` is passed
        to the BERT model as ``input_ids``, so it must hold wordpiece ids from
        the BERT vocabulary.
    verb_indicator : torch.LongTensor, required
        Passed to the BERT model as ``token_type_ids``. Marks the position of
        the verb in the sentence; shape (batch_size, num_tokens). May be all
        zeros when the sentence has no verbal predicate.
    metadata : ``List[Dict[str, Any]]``, required
        One dict per instance with ``'words'``, ``'verb'`` and ``'offsets'``
        keys; ``'verb_index'`` and ``'gold_tags'`` are additionally read when
        the span metric is updated.
    tags : torch.LongTensor, optional (default = None)
        Integer gold class labels of shape ``(batch_size, num_tokens)``.

    Returns
    -------
    An output dictionary consisting of:
    logits : torch.FloatTensor
        Shape ``(batch_size, num_tokens, tag_vocab_size)``; unnormalised log
        probabilities of the tag classes.
    class_probabilities : torch.FloatTensor
        Same shape as ``logits``; a distribution over tag classes per token.
    mask, words, verb, wordpiece_offsets
        The padding mask and the per-instance metadata, kept for decoding.
    loss : torch.FloatTensor, optional
        Present only when ``tags`` is given.
    """
    padding_mask = get_text_field_mask(tokens)
    bert_output, _ = self.bert_model(
        input_ids=tokens["tokens"],
        token_type_ids=verb_indicator,
        attention_mask=padding_mask,
        output_all_encoded_layers=False,
    )

    dropped_embeddings = self.embedding_dropout(bert_output)
    num_rows, num_steps, _ = dropped_embeddings.size()

    logits = self.tag_projection_layer(dropped_embeddings)
    # Softmax over a flattened (rows * steps, classes) view, then restore shape.
    flat_probabilities = F.softmax(logits.view(-1, self.num_classes), dim=-1)
    class_probabilities = flat_probabilities.view(
        [num_rows, num_steps, self.num_classes])

    # The offsets are carried along so the un-wordpieced tags can be computed.
    per_instance = [(entry["words"], entry["verb"], entry["offsets"])
                    for entry in metadata]
    words, verbs, offsets = zip(*per_instance)

    # The mask is retained so that padding can be cropped away when viterbi
    # inference runs in self.decode.
    output_dict = {
        "logits": logits,
        "class_probabilities": class_probabilities,
        "mask": padding_mask,
        "words": list(words),
        "verb": list(verbs),
        "wordpiece_offsets": list(offsets),
    }

    if tags is None:
        return output_dict

    loss = sequence_cross_entropy_with_logits(
        logits, tags, padding_mask, label_smoothing=self._label_smoothing)

    skip_span_metric = (self.ignore_span_metric
                        or self.span_metric is None
                        or self.training)
    if not skip_span_metric:
        batch_verb_indices = [entry["verb_index"] for entry in metadata]
        batch_sentences = [entry["words"] for entry in metadata]

        # Get the BIO tags from decode()
        # TODO (nfliu): This is kind of a hack, consider splitting out part
        # of decode() to a separate function.
        predicted_bio = self.decode(output_dict).pop("tags")
        predicted_conll = [convert_bio_tags_to_conll_format(sequence)
                           for sequence in predicted_bio]

        gold_bio = [entry["gold_tags"] for entry in metadata]
        gold_conll = [convert_bio_tags_to_conll_format(sequence)
                      for sequence in gold_bio]

        self.span_metric(batch_verb_indices,
                         batch_sentences,
                         predicted_conll,
                         gold_conll)

    output_dict["loss"] = loss
    return output_dict
def forward(
    self,  # type: ignore
    tokens: TextFieldTensors,
    tags: torch.LongTensor = None,
    metadata: List[Dict[str, Any]] = None,
    ignore_loss_on_o_tags: bool = False,
) -> Dict[str, torch.Tensor]:
    """
    Embed, encode and project the tokens to per-token tag scores.

    # Parameters

    tokens : `TextFieldTensors`, required
        The output of `TextField.as_array()`; handed unchanged to the
        `TextFieldEmbedder` and to `get_text_field_mask`.
    tags : `torch.LongTensor`, optional (default = `None`)
        Integer gold class labels of shape `(batch_size, num_tokens)`.
    metadata : `List[Dict[str, Any]]`, optional, (default = `None`)
        Per-instance dicts holding the original words under a 'words' key.
    ignore_loss_on_o_tags : `bool`, optional (default = `False`)
        If True, positions whose gold tag is `O` are removed from the mask
        used for the loss (the metrics still use the full mask). Useful for
        computing gradients of the loss on a _single span_, for
        interpretation / attacking.

    # Returns

    An output dictionary consisting of:

    - `logits` (`torch.FloatTensor`) :
        Shape `(batch_size, num_tokens, tag_vocab_size)`; unnormalised log
        probabilities of the tag classes.
    - `class_probabilities` (`torch.FloatTensor`) :
        Same shape; a distribution over the tag classes per word.
    - `loss` (`torch.FloatTensor`, optional) :
        Present only when `tags` is given.
    - `words` (`List`, optional) :
        Present only when `metadata` is given.
    """
    embedded = self.text_field_embedder(tokens)
    num_rows, num_steps, _ = embedded.size()
    mask = get_text_field_mask(tokens)

    logits = self.tag_projection_layer(self.encoder(embedded, mask))

    # Normalise over a flattened (rows * steps, classes) view, then reshape back.
    flat_logits = logits.view(-1, self.num_classes)
    flat_probabilities = F.softmax(flat_logits, dim=-1)
    class_probabilities = flat_probabilities.view(
        [num_rows, num_steps, self.num_classes])

    output_dict = {
        "logits": logits,
        "class_probabilities": class_probabilities,
    }

    if tags is not None:
        loss_mask = mask
        if ignore_loss_on_o_tags:
            outside_index = self.vocab.get_token_index(
                "O", namespace=self.label_namespace)
            loss_mask = mask & (tags != outside_index)

        loss = sequence_cross_entropy_with_logits(logits, tags, loss_mask)

        # Metrics are always computed against the full padding mask.
        for metric in self.metrics.values():
            metric(logits, tags, mask)
        if self.calculate_span_f1:
            self._f1_metric(logits, tags, mask)

        output_dict["loss"] = loss

    if metadata is not None:
        output_dict["words"] = [entry["words"] for entry in metadata]
    return output_dict
def forward(self,  # type: ignore
            tokens: Dict[str, torch.LongTensor],
            spans: torch.LongTensor,
            metadata: List[Dict[str, Any]],
            pos_tags: Dict[str, torch.LongTensor] = None,
            span_labels: torch.LongTensor = None) -> Dict[str, torch.Tensor]:
    # pylint: disable=arguments-differ
    """
    Score every candidate span with a label distribution.

    Parameters
    ----------
    tokens : Dict[str, torch.LongTensor], required
        The output of ``TextField.as_array()``; handed unchanged to the
        ``TextFieldEmbedder`` and to ``get_text_field_mask``.
    spans : ``torch.LongTensor``, required.
        A tensor of shape ``(batch_size, num_spans, 2)`` with the inclusive
        start and end indices of all possible spans in the sentence. A
        negative start index marks a padding span.
    metadata : List[Dict[str, Any]], required.
        One dict per batch element with keys:
            tokens : ``List[str]``, required.
                The original string tokens in the sentence.
            gold_tree : ``nltk.Tree``, optional (default = None)
                Gold NLTK trees for use in evaluation.
            pos_tags : ``List[str]``, optional.
                The POS tags for the sentence; returned in the output so they
                can be used when constructing the tree.
    pos_tags : ``torch.LongTensor``, optional (default = None)
        The output of a ``SequenceLabelField`` containing POS tags. Required
        when the model has a POS tag embedding.
    span_labels : ``torch.LongTensor``, optional (default = None)
        Integer gold class labels for all possible spans, of shape
        ``(batch_size, num_spans)``.

    Returns
    -------
    An output dictionary consisting of:
    class_probabilities : ``torch.FloatTensor``
        Shape ``(batch_size, num_spans, span_label_vocab_size)``; a
        distribution over the label classes per span.
    spans : ``torch.LongTensor``
        The original spans tensor.
    tokens : ``List[List[str]]``
        The tokens of each sentence in the batch.
    pos_tags : ``List[List[str]]``
        The POS tags of each sentence in the batch (``None`` entries when the
        metadata has none).
    num_spans : ``torch.LongTensor``
        Shape (batch_size); the number of non-padded spans per element.
    loss : ``torch.FloatTensor``, optional
        Present only when ``span_labels`` is given.

    Raises
    ------
    ConfigurationError
        If the model has a POS tag embedding but ``pos_tags`` is ``None``.
    """
    text_embeddings = self.text_field_embedder(tokens)
    if self.pos_tag_embedding is not None:
        if pos_tags is None:
            raise ConfigurationError("Model uses a POS embedding, but no POS tags were passed.")
        pos_embeddings = self.pos_tag_embedding(pos_tags)
        text_embeddings = torch.cat([text_embeddings, pos_embeddings], -1)

    mask = get_text_field_mask(tokens)

    # The span start index alone tells padding from real spans.
    # Shape: (batch_size, num_spans)
    span_starts = spans[:, :, 0]
    span_mask = (span_starts >= 0).squeeze(-1).long()
    if span_mask.dim() == 1:
        # With batch_size 1 and a length 1 sentence (these do exist in PTB)
        # the squeeze above drops a dimension; put it back. -.-
        span_mask = span_mask.unsqueeze(-1)
    if span_labels is not None and span_labels.dim() == 1:
        span_labels = span_labels.unsqueeze(-1)

    num_spans = get_lengths_from_binary_sequence_mask(span_mask)

    encoded_text = self.encoder(text_embeddings, mask)
    span_representations = self.span_extractor(encoded_text, spans, mask, span_mask)
    if self.feedforward_layer is not None:
        span_representations = self.feedforward_layer(span_representations)

    logits = self.tag_projection_layer(span_representations)
    class_probabilities = last_dim_softmax(logits, span_mask.unsqueeze(-1))

    sentence_tokens = [entry["tokens"] for entry in metadata]
    sentence_pos_tags = [entry.get("pos_tags") for entry in metadata]
    output_dict = {
        "class_probabilities": class_probabilities,
        "spans": spans,
        "tokens": sentence_tokens,
        "pos_tags": sentence_pos_tags,
        "num_spans": num_spans
    }

    if span_labels is not None:
        loss = sequence_cross_entropy_with_logits(logits, span_labels, span_mask)
        self.tag_accuracy(class_probabilities, span_labels, span_mask)
        output_dict["loss"] = loss

    # The evalb score is expensive to compute, so it is only computed outside
    # of training and only when every instance carries a gold tree.
    gold_trees = [entry.get("gold_tree") for entry in metadata]
    if all(gold_trees) and self._evalb_score is not None and not self.training:
        gold_pos_tags: List[List[str]] = [list(zip(*gold_tree.pos()))[1]
                                          for gold_tree in gold_trees]
        predicted_trees = self.construct_trees(class_probabilities.cpu().data,
                                               spans.cpu().data,
                                               num_spans.data,
                                               output_dict["tokens"],
                                               gold_pos_tags)
        self._evalb_score(predicted_trees, gold_trees)

    return output_dict
def forward(self,  # type: ignore
            tokens: Dict[str, torch.LongTensor],
            labels: torch.LongTensor = None,
            d_tags: torch.LongTensor = None,
            metadata: List[Dict[str, Any]] = None) -> Dict[str, torch.Tensor]:
    # pylint: disable=arguments-differ
    """
    Score every token with an edit label and an error-detection tag.

    Parameters
    ----------
    tokens : Dict[str, torch.LongTensor], required
        The output of ``TextField.as_array()``; passed to the
        ``TextFieldEmbedder``. When gold tags are given the batch is shuffled
        here, which reads the ``'bert'``, ``'bert-offsets'`` and ``'mask'``
        entries. The caller's dictionary is not modified.
    labels : torch.LongTensor, optional (default = None)
        Integer gold edit labels of shape ``(batch_size, num_tokens)``.
    d_tags : torch.LongTensor, optional (default = None)
        Integer gold detection tags of shape ``(batch_size, num_tokens)``.
    metadata : ``List[Dict[str, Any]]``, optional, (default = None)
        Metadata containing the original words of each sentence under a
        'words' key.

    Returns
    -------
    An output dictionary consisting of:
    logits_labels, logits_d_tags : torch.FloatTensor
        Unnormalised scores of the two projection layers.
    class_probabilities_labels, class_probabilities_d_tags : torch.FloatTensor
        The corresponding softmax distributions. When ``self.confidence > 0``
        that value is added to class 0 of ``class_probabilities_labels``.
    max_error_probability : torch.FloatTensor
        Per sentence, the largest masked probability of the
        ``self.incorr_index`` detection class over its tokens.
    loss : torch.FloatTensor, optional
        Sum of both cross-entropy losses; only when ``labels`` and ``d_tags``
        are both given.
    words : list, optional
        Only when ``metadata`` is given.

    Notes
    -----
    This method prints diagnostics to stdout on every call that has gold tags.
    """
    import random

    has_gold = labels is not None and d_tags is not None

    # Manual in-batch shuffle (the original author found the external shuffle
    # not to work). It is restricted to calls that carry gold tags: at predict
    # time ``labels``/``d_tags`` are None, so indexing them used to crash, and
    # reordering would misalign the outputs with the caller's instances.
    if metadata and has_gold:
        order = list(range(len(metadata)))
        random.shuffle(order)
        print('\n')
        for _ in range(9):
            print(' ')
        print(order, '修改后的顺序是')

        # Shallow copy so the caller's dictionary is left untouched.
        tokens = dict(tokens)
        tokens['bert'] = tokens['bert'][order, :]
        tokens['bert-offsets'] = tokens['bert-offsets'][order, :]
        tokens['mask'] = tokens['mask'][order, :]
        labels = labels[order, :]
        d_tags = d_tags[order, :]
        metadata = [metadata[position] for position in order]

    # Step 1: pre-trained embedding of the input; every word is one token.
    encoded_text = self.text_field_embedder(tokens)
    batch_size, sequence_length, _ = encoded_text.size()
    # 1 for real tokens, 0 for padding positions.
    mask = get_text_field_mask(tokens)

    # Edit-label head (with dropout) and detection head (without).
    logits_labels = self.tag_labels_projection_layer(
        self.predictor_dropout(encoded_text))
    logits_d = self.tag_detect_projection_layer(encoded_text)

    # class_probabilities_labels is the core output: it alone is enough to
    # derive the final edits.
    class_probabilities_labels = F.softmax(logits_labels, dim=-1).view(
        [batch_size, sequence_length, self.num_labels_classes])
    class_probabilities_d = F.softmax(logits_d, dim=-1).view(
        [batch_size, sequence_length, self.num_detect_classes])

    # Padding positions contribute nothing to the error probability.
    error_probs = class_probabilities_d[:, :, self.incorr_index] * mask
    # A sentence's error rate is that of its most error-prone token.
    incorr_prob = torch.max(error_probs, dim=-1)[0]

    if self.confidence > 0:
        probability_change = [self.confidence] + [0] * (self.num_labels_classes - 1)
        # ``new_tensor`` keeps the bias on the same device/dtype as the
        # probabilities; a bare ``torch.FloatTensor`` lives on the CPU and
        # fails when the model runs on GPU. Broadcasting replaces ``repeat``.
        class_probabilities_labels = (
            class_probabilities_labels
            + class_probabilities_labels.new_tensor(probability_change))

    output_dict = {
        "logits_labels": logits_labels,
        "logits_d_tags": logits_d,
        "class_probabilities_labels": class_probabilities_labels,
        "class_probabilities_d_tags": class_probabilities_d,
        "max_error_probability": incorr_prob
    }

    # Everything below needs the ground-truth tags and is skipped at predict time.
    if has_gold:
        # sequence_cross_entropy_with_logits takes the unnormalised logits,
        # not the softmax output; the mask is used as the weights.
        loss_labels = sequence_cross_entropy_with_logits(
            logits_labels, labels, mask, label_smoothing=self.label_smoothing)
        loss_d = sequence_cross_entropy_with_logits(logits_d, d_tags, mask)
        for metric in self.metrics.values():
            metric(logits_labels, labels, mask.float())
            metric(logits_d, d_tags, mask.float())
        output_dict["loss"] = loss_labels + loss_d

        # The label vocabulary is only needed for the printout below; read it
        # once and keep it on the instance instead of re-reading every batch.
        vocab_tokens = getattr(self, "_diagnostic_label_vocab", None)
        if vocab_tokens is None:
            from train_finetune_latest2 import vocabdir
            with open(vocabdir) as vocab_file:
                vocab_tokens = [line.strip('\n') for line in vocab_file]
            self._diagnostic_label_vocab = vocab_tokens

        for _ in range(4):
            print('\n ------------------------------------------------\n')
        print('我们打印几个看看,目前策略是只打印最后2个,为了保证算法不会过多损耗性能.看看预测的结果是否是我们打的tag:')
        print('我们打印,最大分类标签和置信度.')

        # Only the last two sentences are shown, to keep the overhead small.
        top_probs, top_tags = torch.max(class_probabilities_labels, dim=-1)
        top_probs = top_probs[-2:]
        top_tags = top_tags[-2:]
        sample_metadata = list(metadata[-2:]) if metadata else []

        for row, (row_tags, row_probs) in enumerate(zip(top_tags, top_probs)):
            original = sample_metadata[row] if row < len(sample_metadata) else None
            print('原始句子为', original)
            # Label 0 is skipped in the printout; plain ints (not tensors)
            # are used to index the vocabulary so this also works on GPU.
            tag_ids = [tag_id for tag_id in row_tags.tolist() if tag_id != 0]
            print('输出的变换为', '\t'.join(vocab_tokens[tag_id] for tag_id in tag_ids))
            print('对应的概率为')
            print(row_probs[row_tags != 0])
        print('------------该epoch评测完毕.')

    if metadata is not None:
        output_dict["words"] = [x["words"] for x in metadata]
    return output_dict
def forward(self,  # type: ignore
            tokens: Dict[str, torch.LongTensor],
            tags: torch.LongTensor = None,
            domain: torch.LongTensor = None,
            intent: torch.LongTensor = None,
            metadata: List[Dict[str, Any]] = None,
            # pylint: disable=unused-argument
            **kwargs) -> Dict[str, torch.Tensor]:
    # pylint: disable=arguments-differ
    """
    Tag every token and classify the whole utterance by domain and intent.

    Parameters
    ----------
    tokens : ``Dict[str, torch.LongTensor]``, required
        The output of ``TextField.as_array()``; handed unchanged to the
        ``TextFieldEmbedder`` and to ``get_text_field_mask``.
    tags : ``torch.LongTensor``, optional (default = ``None``)
        Integer gold class labels of shape ``(batch_size, num_tokens)``.
    domain : ``torch.LongTensor``, optional (default = ``None``)
        Gold domain target, scored against the domain logits with
        ``self.ce_loss``. Presumably of shape ``(batch_size,)`` -- TODO confirm.
    intent : ``torch.LongTensor``, optional (default = ``None``)
        Gold intent target, scored against the intent logits with
        ``self.ce_loss``. Presumably of shape ``(batch_size,)`` -- TODO confirm.
    metadata : ``List[Dict[str, Any]]``, optional, (default = None)
        Metadata containing the original words of the sentence under a
        'words' key; ``'dialog_act'`` is additionally read when the dialog-act
        metric is updated.

    Returns
    -------
    An output dictionary consisting of:

    tag_logits : ``torch.FloatTensor``
        The logits that are the output of the ``tag_projection_layer``
    mask : ``torch.LongTensor``
        The text field mask for the input tokens
    tags : ``List[List[int]]``
        The predicted tags (Viterbi paths when the model has a CRF).
    domain_probs, intent_probs : ``torch.FloatTensor``
        Softmax distributions over domains and intents.
    loss : ``torch.FloatTensor``, optional
        Sum of the losses of whichever of ``tags``, ``domain`` and ``intent``
        were provided; absent when none of them is given.
    words : ``List``, optional
        Only when ``metadata`` is non-empty.
    """
    embedded_text_input = self.text_field_embedder(tokens)
    mask = util.get_text_field_mask(tokens)

    if self.dropout:
        embedded_text_input = self.dropout(embedded_text_input)

    encoded_text = self.encoder(embedded_text_input, mask)

    if self.dropout:
        encoded_text = self.dropout(encoded_text)

    # Utterance-level summary: final encoder states, optionally transformed.
    encoded_summary = util.get_final_encoder_states(
        encoded_text, mask, self.encoder.is_bidirectional())
    if self._feedforward is not None:
        encoded_summary = self._feedforward(encoded_summary)

    tag_logits = self.tag_projection_layer(encoded_text)
    if self.crf:
        best_paths = self.crf.viterbi_tags(tag_logits, mask)
        # Just get the tags and ignore the score.
        predicted_tags = [path for path, _ in best_paths]
    else:
        predicted_tags = self.get_predicted_tags(tag_logits)

    domain_logits = self.domain_projection_layer(encoded_summary)
    domain_probs = F.softmax(domain_logits, dim=-1)
    intent_logits = self.intent_projection_layer(encoded_summary)
    intent_probs = F.softmax(intent_logits, dim=-1)

    output = {
        "tag_logits": tag_logits,
        "mask": mask,
        "tags": predicted_tags,
        "domain_probs": domain_probs,
        "intent_probs": intent_probs
    }

    # Collect whichever loss terms have gold targets. Previously the domain
    # and intent terms were added with ``output["loss"] += ...`` and raised
    # KeyError when ``tags`` was not supplied.
    loss_terms = []
    if tags is not None:
        if self.crf:
            # Add negative log-likelihood as loss
            loss_terms.append(-self.crf(tag_logits, tags, mask))
        else:
            loss_terms.append(
                sequence_cross_entropy_with_logits(tag_logits, tags, mask))
        # NOTE: the tag accuracy / span F1 metrics are disabled in this model,
        # so the one-hot "class probabilities" that used to be built here for
        # them were dead work and have been dropped.
    if domain is not None:
        loss_terms.append(self.ce_loss(domain_logits, domain))
    if intent is not None:
        loss_terms.append(self.ce_loss(intent_logits, intent))

    if loss_terms:
        total_loss = loss_terms[0]
        for term in loss_terms[1:]:
            total_loss = total_loss + term
        output["loss"] = total_loss

    if metadata:
        output["words"] = [x["words"] for x in metadata]

    if tags is not None and metadata:
        # ``output["dialog_act"]`` is read right after this call, so decode()
        # is expected to add it to ``output`` in place.
        self.decode(output)
        self._dai_f1_metric(output["dialog_act"],
                            [x["dialog_act"] for x in metadata])

    return output
def _forward_loop(
        self,
        state: Dict[str, torch.Tensor],
        gold_mentions: torch.LongTensor,
        target_tokens: Dict[str, torch.LongTensor] = None
) -> Dict[str, torch.Tensor]:
    """
    Classify every gold-mention position and, when targets are given, compute the loss.

    The encoder outputs at the gold mention indices are gathered, passed through
    ``self._decode`` and then, one position at a time, concatenated with an
    attention summary over the full encoder output before being projected to
    class scores.

    Parameters
    ----------
    state : Dict[str, torch.Tensor], required
        Must contain ``"source_mask"`` of shape
        ``(batch_size, max_input_sequence_length)`` and ``"encoder_outputs"`` of
        shape ``(batch_size, max_input_sequence_length, embedding_dim)``.
    gold_mentions : torch.LongTensor, required
        Indices into the input sequence, one row per instance. It is
        right-padded with zeros up to ``max_input_sequence_length``.
    target_tokens : Dict[str, torch.LongTensor], optional (default = None)
        If given, ``target_tokens['tokens']`` holds the gold classes and all
        outputs are truncated to its length.

    Returns
    -------
    An output dictionary consisting of:
    predictions : torch.LongTensor
        The highest scoring class per position.
    class_probs : torch.FloatTensor
        The softmax over the classes per position.
    loss, logits, mention_mask : optional
        Only present when ``target_tokens`` is given.
    """
    # shape: (batch_size, max_input_sequence_length)
    source_mask = state["source_mask"]
    # shape: (batch_size, max_input_sequence_length, embedding_dim)
    encoder_outputs = state['encoder_outputs']
    batch_size = source_mask.size()[0]
    max_input_sequence_length = source_mask.size()[1]

    # Zero-pad gold_mentions to (batch_size, max_input_sequence_length).
    # The buffer is created on the device of the encoder outputs so the code
    # runs unchanged on CPU and on whichever GPU the model lives on.
    gold_mentions_expanded = torch.zeros(
        batch_size, max_input_sequence_length, device=encoder_outputs.device)
    gold_mentions_expanded[:, :gold_mentions.size()[1]] = gold_mentions

    # get_text_field_mask marks with 0/1 whether each position is valid.
    # shape: (batch_size, max_input_sequence_length)
    mention_mask = util.get_text_field_mask(
        {'gold_mentions': gold_mentions_expanded})

    # Select the encoder output of every mention; the remaining slots are
    # filled with the output at position 0. E.g. for gold_mention = [3, 5, 0, 0]
    # the rows are the outputs at positions 3, 5, 0 and 0.
    # A single gather replaces the per-instance index_select/cat loop.
    mention_indices = gold_mentions_expanded.long()
    gather_index = mention_indices.unsqueeze(-1).expand(
        -1, -1, encoder_outputs.size(-1))
    # shape: (batch_size, max_input_sequence_length, embedding_dim)
    encoder_resorted = encoder_outputs.gather(1, gather_index)

    # Run the decoder over the re-ordered encoder outputs.
    decoder_outputs = self._decode(encoder_resorted, mention_mask)

    # Loop-invariant: the attention mask is the same for every position.
    attention_mask = source_mask.float()

    # Score the positions one by one.
    token_logits = []
    token_predictions = []
    token_class_probs = []
    for i in range(max_input_sequence_length):
        encoder_slice = encoder_resorted[:, i, :]
        decoder_hidden = decoder_outputs[:, i, :]
        # TODO: decoder hidden should be concatenated with h_encoder_t
        encoder_weights = self._attention(decoder_hidden, encoder_outputs,
                                          attention_mask)
        # Attention-weighted sum of the encoder outputs.
        attended_output = util.weighted_sum(encoder_outputs, encoder_weights)
        hidden_attention_cat = torch.cat(
            (decoder_hidden, attended_output, encoder_slice), -1)
        # shape: (batch_size, num_classes)
        score = self._output_projection_layer(hidden_attention_cat)
        token_logits.append(score.unsqueeze(1))
        class_probabilities = F.softmax(score, dim=-1)
        token_class_probs.append(class_probabilities.unsqueeze(1))
        # shape (predicted_classes): (batch_size,)
        _, predicted_classes = torch.max(class_probabilities, 1)
        token_predictions.append(predicted_classes.unsqueeze(1))

    predictions = torch.cat(token_predictions, 1)
    class_probs = torch.cat(token_class_probs, 1)

    output_dict = {
        'predictions': predictions,
        'class_probs': class_probs.detach()
    }
    if target_tokens:
        targets = target_tokens['tokens']
        target_length = targets.size()[1]
        # The outputs span max_input_sequence_length positions while the
        # targets span target_length positions; the two differ, so everything
        # beyond the target length is cut off.
        output_dict['predictions'] = predictions[:, :target_length]
        output_dict['class_probs'] = class_probs[:, :target_length, :]

        logits = torch.cat(token_logits, 1)
        logits_slice = logits[:, :target_length, :].contiguous()
        targets = targets.contiguous()
        mention_mask = mention_mask[:, :target_length].contiguous()
        loss = util.sequence_cross_entropy_with_logits(
            logits_slice.float(), targets, mention_mask.float())
        output_dict['loss'] = loss
        output_dict['logits'] = logits_slice
        output_dict['mention_mask'] = mention_mask
    return output_dict
def forward(
        self,
        tokens: Dict[str, torch.LongTensor],
        tags: torch.LongTensor = None,
        relation_root_idxs: torch.LongTensor = None,
        relations: torch.LongTensor = None,
        binary_coref: torch.FloatTensor = None,
        spacy_patterns: torch.FloatTensor = None,
        coarse_tags: torch.LongTensor = None,
        modifier_tags: torch.LongTensor = None,
        metadata: List[Dict[str, Any]] = None) -> Dict[str, torch.Tensor]:
    # pylint: disable=arguments-differ,no-member
    """
    Jointly tag the tokens with a CRF and score relations between them.

    Parameters
    ----------
    tokens : Dict[str, torch.LongTensor], required
        The output of ``TextField.as_array()``, which should typically be passed directly to a
        ``TextFieldEmbedder``. This output is a dictionary mapping keys to ``TokenIndexer``
        tensors.  At its most basic, using a ``SingleIdTokenIndexer`` this is: ``{"tokens":
        Tensor(batch_size, num_tokens)}``. This dictionary will have the same keys as were used
        for the ``TokenIndexers`` when you created the ``TextField`` representing your
        sequence.  The dictionary is designed to be passed directly to a ``TextFieldEmbedder``,
        which knows how to combine different word representations into a single vector per
        token in your input.
    tags : torch.LongTensor, optional (default = None)
        An integer tensor containing the gold ner tag label indexes.
    relation_root_idxs : torch.LongTensor, optional (default = None)
        An integer tensor containing the gold relation head indexes for training.
    relations : torch.LongTensor, optional (default = None)
        An integer tensor containing the gold relation label indexes for training.
    binary_coref : torch.FloatTensor, optional (default = None)
        Extra per-token feature of shape ``(batch_size, num_tokens)``; it is appended to the
        embedded text as one additional dimension.
    spacy_patterns : torch.FloatTensor, optional (default = None)
        Extra features whose last two axes are swapped before they are concatenated to
        the embedded text, i.e. the token axis is expected to be last.
    coarse_tags : torch.LongTensor, optional (default = None)
        Gold labels for the auxiliary coarse tagger. Required together with ``tags`` when
        the model uses auxiliary NER labels.
    modifier_tags : torch.LongTensor, optional (default = None)
        Gold labels for the auxiliary modifier tagger. Required together with ``tags`` when
        the model uses auxiliary NER labels.
    metadata : ``List[Dict[str, Any]]``, optional, (default = None)
        Additional information such as the original words and the entity ids. Every key of
        the first entry is copied into the output, batched as a list.

    Returns
    -------
    An output dictionary consisting of:
    ner_logits : torch.FloatTensor
        A tensor of shape ``(batch_size, num_tokens, tag_vocab_size)`` representing
        unnormalised log probabilities of the tag classes.
    mask : torch.LongTensor
        The token mask.
    tags : List[List[int]]
        The Viterbi-decoded tag sequence of every instance.
    re_logits, relation_scores
        The ``logits`` and ``relation_scores`` entries returned by the relation scorer.
    ner_loss : torch.FloatTensor, optional
        The negative CRF log-likelihood; only present when ``tags`` is given.
    loss : torch.FloatTensor, optional
        A scalar loss to be optimised; only present when ``tags`` is given.
    """
    embedded_text_input = self.text_field_embedder(tokens)
    mask = get_text_field_mask(tokens)

    # Optional hand-crafted features are appended along the feature axis.
    encoder_input_tensors = [embedded_text_input]
    if binary_coref is not None:
        encoder_input_tensors.append(binary_coref.unsqueeze(2))
    if spacy_patterns is not None:
        encoder_input_tensors.append(spacy_patterns.permute(0, 2, 1))
    if len(encoder_input_tensors) > 1:
        encoder_input = torch.cat(encoder_input_tensors, dim=2)
    else:
        encoder_input = encoder_input_tensors[0]

    # Shape: batch x seq_len x emb_dim
    encoded_text = self.encoder(encoder_input, mask)
    ner_logits = self.tag_projection_layer(encoded_text)
    best_ner_paths = self.crf.viterbi_tags(ner_logits, mask)

    # Just get the tags and ignore the score. The tags are kept both as
    # python lists (for the output) and as a zero-padded tensor (as model input).
    predicted_ner_tags = []
    predicted_ner_tags_tensor = torch.zeros_like(mask)
    for batch_idx, (ner_path, _) in enumerate(best_ner_paths):
        predicted_ner_tags.append(ner_path)
        # One slice assignment per instance instead of one write per token.
        predicted_ner_tags_tensor[batch_idx, :len(ner_path)] = \
            predicted_ner_tags_tensor.new_tensor(ner_path)

    output_dict = {
        "ner_logits": ner_logits,
        "mask": mask,
        "tags": predicted_ner_tags
    }

    if self._use_aux_ner_labels:
        coarse_logits = self._coarse_projection_layer(encoded_text)
        modifier_logits = self._modifier_projection_layer(encoded_text)

    # The relation scorer sees the encoded text plus the predicted tags,
    # either embedded or as raw logits and tag ids.
    if self.ner_tag_embedder is not None:
        embedded_tags = self.ner_tag_embedder(predicted_ner_tags_tensor)
        encoded_sequence = torch.cat([encoded_text, embedded_tags], dim=2)
    else:
        encoded_sequence = torch.cat([
            encoded_text, ner_logits,
            predicted_ner_tags_tensor.unsqueeze(2).float()
        ], dim=2)

    re_output = self.relation_scorer(encoded_sequence, mask,
                                     relation_root_idxs, relations)
    # Add a prefix for relation extraction logits
    output_dict['re_logits'] = re_output['logits']
    output_dict['relation_scores'] = re_output['relation_scores']

    if tags is not None:
        # Add negative log-likelihood as loss
        log_likelihood = self.crf(ner_logits, tags, mask)
        # It's not clear why, but pylint seems to think `log_likelihood` is tuple
        # (in fact, it's a torch.Tensor), so we need a disable.
        output_dict["ner_loss"] = -log_likelihood  # pylint: disable=invalid-unary-operand-type

        # Represent viterbi tags as "class probabilities" that we can
        # feed into the metrics
        class_probabilities = torch.zeros_like(ner_logits)
        for i, instance_tags in enumerate(predicted_ner_tags):
            for j, tag_id in enumerate(instance_tags):
                class_probabilities[i, j, tag_id] = 1

        self.ner_accuracy(class_probabilities, tags, mask.float())
        self.ner_f1(class_probabilities, tags, mask.float())

        output_dict['loss'] = output_dict['ner_loss'] + \
            self._re_loss_weight * re_output['loss']

        if self._use_aux_ner_labels:
            assert coarse_tags is not None and modifier_tags is not None, 'Auxiliary losses require auxiliary input'
            self._coarse_acc(coarse_logits, coarse_tags, mask.float())
            self._modifier_acc(modifier_logits, modifier_tags, mask.float())
            coarse_loss = sequence_cross_entropy_with_logits(
                coarse_logits, coarse_tags, mask)
            modifier_loss = sequence_cross_entropy_with_logits(
                modifier_logits, modifier_tags, mask)
            output_dict['loss'] += self._aux_loss_weight * (coarse_loss + modifier_loss)

    # Attach metadata. An empty list has no first entry to take the keys
    # from, so it is treated like missing metadata.
    if metadata:
        for key in metadata[0]:
            output_dict[key] = [x[key] for x in metadata]
    return output_dict
def forward(
        self,  # type: ignore
        tokens: Dict[str, torch.LongTensor],
        verb_indicator: torch.LongTensor,
        target_index: torch.LongTensor,
        span_starts: torch.LongTensor,
        span_ends: torch.LongTensor,
        span_mask: torch.LongTensor,
        constituents: torch.LongTensor = None,
        tags: torch.LongTensor = None) -> Dict[str, torch.Tensor]:
    # pylint: disable=arguments-differ
    """
    Score every candidate span for SRL and for the constituent scaffold task.

    Parameters
    ----------
    tokens : Dict[str, torch.LongTensor], required
        The output of ``TextField.as_array()``, which should typically be passed directly to a
        ``TextFieldEmbedder``. This output is a dictionary mapping keys to ``TokenIndexer``
        tensors.  At its most basic, using a ``SingleIdTokenIndexer`` this is: ``{"tokens":
        Tensor(batch_size, num_tokens)}``. This dictionary will have the same keys as were used
        for the ``TokenIndexers`` when you created the ``TextField`` representing your
        sequence.  The dictionary is designed to be passed directly to a ``TextFieldEmbedder``,
        which knows how to combine different word representations into a single vector per
        token in your input.
    verb_indicator: torch.LongTensor, required.
        An integer ``SequenceFeatureField`` representation of the position of the verb
        in the sentence. This should have shape (batch_size, num_tokens) and importantly, can be
        all zeros, in the case that the sentence has no verbal predicate.
    target_index : torch.LongTensor, required.
        One index per instance; negative values are clamped to zero.
    span_starts : torch.LongTensor, required.
        shape ``(batch_size, num_spans)``; negative values are clamped to zero.
    span_ends : torch.LongTensor, required.
        shape ``(batch_size, num_spans)``; negative values are clamped to zero.
    span_mask : torch.LongTensor, required.
        Mask over the spans, used to weight the constituent loss.
    constituents : torch.LongTensor, optional (default = None)
        Gold constituent labels per span; reshaped to
        ``(batch_size, -1, max_span_width)``.
    tags : torch.LongTensor, optional (default = None)
        Gold SRL labels of shape ``(batch_size, num_spans)``; reshaped to
        ``(batch_size, -1, max_span_width)``.

    Returns
    -------
    An output dictionary consisting of:
    srl_logits, constit_logits : torch.FloatTensor
        Unnormalised scores of the SRL and the constituent labels per span.
    mask : torch.LongTensor
        The token mask.
    srl_tags, srl_tag_probabilities, constit_tags, constit_probabilities
        Decoded predictions; skipped while training in fast mode.
    srl_loss, constit_loss : torch.FloatTensor, optional
        Present when the respective gold labels are given.
    loss : torch.FloatTensor, optional
        A scalar loss to be optimised.
    """
    self.batch += 1
    embedded_text_input = self.embedding_dropout(
        self.text_field_embedder(tokens))
    batch_size = embedded_text_input.size(0)
    text_mask = util.get_text_field_mask(tokens)
    embedded_verb_indicator = self.binary_feature_embedding(
        verb_indicator.long())
    # Concatenate the verb feature onto the embedded text. This now
    # has shape (batch_size, sequence_length, embedding_dim + binary_feature_dim).
    embedded_text_with_verb_indicator = torch.cat(
        [embedded_text_input, embedded_verb_indicator], -1)
    embedding_dim_with_binary_feature = \
        embedded_text_with_verb_indicator.size()[2]

    if self.stacked_encoder.get_input_dim() != embedding_dim_with_binary_feature:
        raise ConfigurationError(
            "The SRL model uses an indicator feature, which makes "
            "the embedding dimension one larger than the value "
            "specified. Therefore, the 'input_dim' of the stacked_encoder "
            "must be equal to total_embedding_dim + 1.")

    encoded_text = self.stacked_encoder(embedded_text_with_verb_indicator,
                                        text_mask)

    # relu clamps negative indices to zero before they are used for indexing.
    span_starts = F.relu(span_starts.float()).long().view(batch_size, -1)
    span_ends = F.relu(span_ends.float()).long().view(batch_size, -1)
    target_index = F.relu(target_index.float()).long().view(batch_size)

    # shape (batch_size, sequence_length * max_span_width, embedding_dim)
    span_embeddings = span_srl_util.compute_span_representations(
        self.max_span_width, encoded_text, target_index, span_starts,
        span_ends, self.span_width_embedding,
        self.span_direction_embedding, self.span_distance_embedding,
        self.span_distance_bin, self.head_scorer)
    span_scores = self.span_feedforward(span_embeddings)

    srl_logits = self.srl_arg_projection_layer(span_scores)
    constit_logits = self.constit_arg_projection_layer(span_scores)
    output_dict = {
        "srl_logits": srl_logits,
        "constit_logits": constit_logits,
        "mask": text_mask
    }

    # Both label tensors are optional, so they are only reshaped (and only
    # fed to the metrics below) when they were actually provided.
    if tags is not None:
        tags = tags.view(batch_size, -1, self.max_span_width)
    if constituents is not None:
        constituents = constituents.view(batch_size, -1, self.max_span_width)

    # Viterbi decoding: always at evaluation time, and during training
    # unless fast mode is on.
    if not (self.training and self.fast_mode):
        srl_prediction, srl_probabilities = self.semi_crf.viterbi_tags(
            srl_logits, text_mask)
        output_dict["srl_tags"] = srl_prediction
        output_dict["srl_tag_probabilities"] = srl_probabilities
        if tags is not None:
            self.metrics["srl"](predictions=srl_prediction.view(
                batch_size, -1, self.max_span_width),
                                gold_labels=tags,
                                mask=text_mask)

        reshaped_constit_logits = constit_logits.view(
            -1, self.num_constit_tags)
        constit_probabilities = F.softmax(reshaped_constit_logits, dim=-1)
        constit_predictions = constit_probabilities.max(-1)[1]
        output_dict["constit_tags"] = constit_predictions
        output_dict["constit_probabilities"] = constit_probabilities

        if constituents is not None:
            constit_predictions = constit_predictions.view(
                batch_size, -1, self.max_span_width)
            self.metrics["constituents"](predictions=constit_predictions,
                                         gold_labels=constituents,
                                         mask=text_mask)

    # Loss computation: always during training, and at evaluation time
    # unless fast mode is on.
    if self.training or not self.fast_mode:
        if tags is not None:
            srl_log_likelihood, _ = self.semi_crf(srl_logits,
                                                  tags,
                                                  mask=text_mask)
            output_dict["srl_loss"] = -srl_log_likelihood
        if constituents is not None:
            # Flattening it out.
            constituents = constituents.view(batch_size, -1)
            constit_loss = util.sequence_cross_entropy_with_logits(
                constit_logits, constituents, span_mask)
            output_dict["constit_loss"] = constit_loss
        if tags is not None and constituents is not None:
            # The scaffold loss is only mixed in after the cutoff batch.
            if self.batch > self.cutoff_batch:
                output_dict["loss"] = - srl_log_likelihood + self.mixing_ratio * \
                    constit_loss
            else:
                output_dict["loss"] = -srl_log_likelihood

    if self.fast_mode and not self.training:
        # No loss is computed in fast evaluation; report a constant zero.
        output_dict["loss"] = Variable(torch.FloatTensor([0.00]))

    return output_dict
def forward(self,  # type: ignore
            question: Dict[str, torch.LongTensor],
            passage: Dict[str, torch.LongTensor],
            answer_impossible: torch.LongTensor = None,
            span_start: torch.IntTensor = None,
            span_end: torch.IntTensor = None,
            metadata: List[Dict[str, Any]] = None) -> Dict[str, torch.Tensor]:
    # pylint: disable=arguments-differ
    """
    Parameters
    ----------
    question : Dict[str, torch.LongTensor]
        From a ``TextField``.
    passage : Dict[str, torch.LongTensor]
        From a ``TextField``.  The model predicts the beginning and ending positions of the
        answer within the passage.
    answer_impossible : ``torch.LongTensor``, optional
        One value per instance, 1 when the question cannot be answered from the passage and
        0 otherwise.  If this is given, we will compute a loss that gets included in the
        output dictionary; for unanswerable instances all start/end targets are zero.
    span_start : ``torch.IntTensor``, optional
        From an ``IndexField``.  This is one of the things we are trying to predict - the
        beginning position of the answer with the passage.  This is an `inclusive` token index.
        It is used as a training target when ``answer_impossible`` is given.
    span_end : ``torch.IntTensor``, optional
        From an ``IndexField``.  This is one of the things we are trying to predict - the
        ending position of the answer with the passage.  This is an `inclusive` token index.
        It is used as a training target when ``answer_impossible`` is given.
    metadata : ``List[Dict[str, Any]]``, optional
        If present, this should contain the question ID, original passage text, and token
        offsets into the passage for each instance in the batch.  We use this for computing
        official metrics using the official SQuAD evaluation script.  The length of this list
        should be the batch size, and each dictionary should have the keys
        ``question_tokens``, ``passage_tokens``, ``original_passage``, and ``token_offsets``.
        ``answer_texts`` is optional and enables the SQuAD metrics.

    Returns
    -------
    An output dictionary consisting of:
    passage_question_attention : torch.FloatTensor
        The attention of every passage token over the question tokens.
    span_start_logits : torch.FloatTensor
        A tensor of shape ``(batch_size, passage_length)`` holding one unnormalized score per
        passage position for being the span start.
    span_start_probs : torch.FloatTensor
        The result of ``sigmoid(span_start_logits)``.
    span_end_logits : torch.FloatTensor
        A tensor of shape ``(batch_size, passage_length)`` holding one unnormalized score per
        passage position for being the span end (inclusive).
    span_end_probs : torch.FloatTensor
        The result of ``sigmoid(span_end_logits)``.
    best_span : torch.IntTensor
        The result of ``self.get_best_span`` over the start and end probabilities.  Shape is
        ``(batch_size, 2)`` and each offset is a token index; ``-1`` is treated as "no answer".
    loss : torch.FloatTensor, optional
        A scalar loss to be optimised.
    best_span_str : List[str]
        If sufficient metadata was provided for the instances in the batch, we also return the
        string from the original passage that the model thinks is the best answer to the
        question.  It has one entry per instance; the entry is empty when no span is predicted
        or the span could not be extracted.
    question_tokens, passage_tokens : List[List[str]]
        Copied from the metadata when it is given.
    """
    embedded_question = self._highway_layer(self._text_field_embedder(question))
    embedded_passage = self._highway_layer(self._text_field_embedder(passage))
    batch_size = embedded_question.size(0)
    passage_length = embedded_passage.size(1)
    question_mask = util.get_text_field_mask(question).float()
    passage_mask = util.get_text_field_mask(passage).float()
    question_lstm_mask = question_mask if self._mask_lstms else None
    passage_lstm_mask = passage_mask if self._mask_lstms else None

    encoded_question = self._dropout(self._phrase_layer(embedded_question, question_lstm_mask))
    encoded_passage = self._dropout(self._phrase_layer(embedded_passage, passage_lstm_mask))
    encoding_dim = encoded_question.size(-1)

    # Shape: (batch_size, passage_length, question_length)
    passage_question_similarity = self._matrix_attention(encoded_passage, encoded_question)
    # Shape: (batch_size, passage_length, question_length)
    passage_question_attention = util.last_dim_softmax(passage_question_similarity, question_mask)
    # Shape: (batch_size, passage_length, encoding_dim)
    passage_question_vectors = util.weighted_sum(encoded_question, passage_question_attention)

    # We replace masked values with something really negative here, so they don't affect the
    # max below.
    masked_similarity = util.replace_masked_values(passage_question_similarity,
                                                   question_mask.unsqueeze(1),
                                                   -1e7)
    # Shape: (batch_size, passage_length)
    question_passage_similarity = masked_similarity.max(dim=-1)[0].squeeze(-1)
    # Shape: (batch_size, passage_length)
    question_passage_attention = util.masked_softmax(question_passage_similarity, passage_mask)
    # Shape: (batch_size, encoding_dim)
    question_passage_vector = util.weighted_sum(encoded_passage, question_passage_attention)
    # Shape: (batch_size, passage_length, encoding_dim)
    tiled_question_passage_vector = question_passage_vector.unsqueeze(1).expand(batch_size,
                                                                                passage_length,
                                                                                encoding_dim)

    # Shape: (batch_size, passage_length, encoding_dim * 4)
    final_merged_passage = torch.cat([encoded_passage,
                                      passage_question_vectors,
                                      encoded_passage * passage_question_vectors,
                                      encoded_passage * tiled_question_passage_vector],
                                     dim=-1)

    modeled_passage = self._dropout(self._modeling_layer(final_merged_passage, passage_lstm_mask))
    modeling_dim = modeled_passage.size(-1)

    # Shape: (batch_size, passage_length, encoding_dim * 4 + modeling_dim))
    span_start_input = self._dropout(torch.cat([final_merged_passage, modeled_passage], dim=-1))
    # Shape: (batch_size, passage_length)
    span_start_logits = self._span_start_predictor(span_start_input).squeeze(-1)
    # Shape: (batch_size, passage_length)
    span_start_probs = sigmoid(span_start_logits)

    # Shape: (batch_size, modeling_dim)
    span_start_representation = util.weighted_sum(modeled_passage, span_start_probs)
    # Shape: (batch_size, passage_length, modeling_dim)
    tiled_start_representation = span_start_representation.unsqueeze(1).expand(batch_size,
                                                                               passage_length,
                                                                               modeling_dim)

    # Shape: (batch_size, passage_length, encoding_dim * 4 + modeling_dim * 3)
    span_end_representation = torch.cat([final_merged_passage,
                                         modeled_passage,
                                         tiled_start_representation,
                                         modeled_passage * tiled_start_representation],
                                        dim=-1)
    # Shape: (batch_size, passage_length, encoding_dim)
    encoded_span_end = self._dropout(self._span_end_encoder(span_end_representation,
                                                            passage_lstm_mask))
    # Shape: (batch_size, passage_length, encoding_dim * 4 + span_end_encoding_dim)
    span_end_input = self._dropout(torch.cat([final_merged_passage, encoded_span_end], dim=-1))
    span_end_logits = self._span_end_predictor(span_end_input).squeeze(-1)

    # Padding positions get a very negative logit, i.e. a probability of ~0.
    span_start_logits = util.replace_masked_values(span_start_logits, passage_mask, -1e7)
    span_end_logits = util.replace_masked_values(span_end_logits, passage_mask, -1e7)
    span_start_probs = sigmoid(span_start_logits)
    span_end_probs = sigmoid(span_end_logits)
    best_span = self.get_best_span(span_start_probs, span_end_probs)

    output_dict = {
        "passage_question_attention": passage_question_attention,
        "span_start_logits": span_start_logits,
        "span_start_probs": span_start_probs,
        "span_end_logits": span_end_logits,
        "span_end_probs": span_end_probs,
        "best_span": best_span,
    }

    # Compute the loss for training.
    if answer_impossible is not None:
        num_rows, num_positions = span_start_logits.size(0), span_start_logits.size(1)
        # Shape: (batch_size, passage_length); row-wise 0 .. passage_length - 1.
        # unsqueeze (not squeeze) keeps this two-dimensional even for a
        # passage of length one.
        positions = torch.arange(0, num_positions,
                                 device=span_start_logits.device,
                                 dtype=torch.long).unsqueeze(0).expand(num_rows, -1)
        # 1 for answerable instances, 0 for unanswerable ones, so that the
        # targets of unanswerable instances are all zero.
        answerable = (-1 * (answer_impossible - 1)).unsqueeze(1).expand(-1, num_positions)
        # One binary target per passage position.
        target_start = (positions == span_start).long() * answerable
        target_end = (positions == span_end).long() * answerable

        # Every position is a two-class problem with logits (-x, x).
        span_start_logits_for_loss = torch.stack([-1 * span_start_logits, span_start_logits],
                                                 dim=-1)
        loss = util.sequence_cross_entropy_with_logits(span_start_logits_for_loss,
                                                       target_start,
                                                       passage_mask)
        span_end_logits_for_loss = torch.stack([-1 * span_end_logits, span_end_logits], dim=-1)
        loss += util.sequence_cross_entropy_with_logits(span_end_logits_for_loss,
                                                        target_end,
                                                        passage_mask)

        self._span_start_accuracy((span_start_logits > 0).long(), target_start)
        self._span_end_accuracy((span_end_logits > 0).long(), target_end)
        # An instance is predicted unanswerable when both span ends are -1.
        self._answer_impossible_accuracy(
            ((best_span.narrow(1, 0, 1) == -1) * (best_span.narrow(1, 1, 1) == -1)).long(),
            answer_impossible)
        # self._span_accuracy(best_span, torch.stack([span_start, span_end], -1))
        output_dict["loss"] = loss

    # Compute the EM and F1 on SQuAD and add the tokenized input to the output.
    if metadata is not None:
        best_span_strings = []
        output_dict['best_span_str'] = best_span_strings
        question_tokens = []
        passage_tokens = []
        for i in range(batch_size):
            question_tokens.append(metadata[i]['question_tokens'])
            passage_tokens.append(metadata[i]['passage_tokens'])
            passage_str = metadata[i]['original_passage']
            offsets = metadata[i]['token_offsets']
            predicted_span = tuple(best_span[i].detach().cpu().numpy())
            try:
                if predicted_span[0] != -1:
                    start_offset = offsets[predicted_span[0]][0]
                else:
                    start_offset = -1
                if predicted_span[1] != -1:
                    end_offset = offsets[predicted_span[1]][1]
                else:
                    end_offset = -1
                if end_offset != -1 and start_offset != -1:
                    best_span_string = passage_str[start_offset:end_offset]
                else:
                    best_span_string = ""
                best_span_strings.append(best_span_string)
                answer_texts = metadata[i].get('answer_texts', [])
                if answer_texts:
                    self._squad_metrics(best_span_string, answer_texts)
            except Exception as e:  # pylint: disable=broad-except
                # Best effort: a broken instance must not abort the batch.
                print(str(e))
                # Keep one entry per instance so the strings stay aligned
                # with the batch when the failure happened before the append.
                if len(best_span_strings) <= i:
                    best_span_strings.append("")
        output_dict['question_tokens'] = question_tokens
        output_dict['passage_tokens'] = passage_tokens
    return output_dict
def forward(
    self,  # type: ignore
    tokens: TextFieldTensors,
    spans: torch.LongTensor,
    metadata: List[Dict[str, Any]],
    pos_tags: TextFieldTensors = None,
    span_labels: torch.LongTensor = None,
) -> Dict[str, torch.Tensor]:
    """
    Label every candidate span of the sentence and, at evaluation time, score the
    resulting trees against the gold trees.

    # Parameters

    tokens : `TextFieldTensors`, required
        The output of `TextField.as_array()`, a dictionary mapping the `TokenIndexer`
        keys of the `TextField` to tensors, e.g. `{"tokens": Tensor(batch_size, num_tokens)}`
        for a `SingleIdTokenIndexer`. It is handed to the `TextFieldEmbedder` unchanged.
    spans : `torch.LongTensor`, required.
        A tensor of shape `(batch_size, num_spans, 2)` with the inclusive start and end
        index of every possible span. A negative start index marks padding.
    metadata : `List[Dict[str, Any]]`, required.
        One dictionary per batch element with the keys:
            tokens : `List[str]`, required.
                The original string tokens in the sentence.
            gold_tree : `nltk.Tree`, optional (default = `None`)
                Gold NLTK trees for use in evaluation.
            pos_tags : `List[str]`, optional.
                The POS tags for the sentence. They may also be embedded by the model,
                but are passed here as well because the tree construction needs them.
    pos_tags : `torch.LongTensor`, optional (default = `None`)
        The output of a `SequenceLabelField` containing POS tags.
    span_labels : `torch.LongTensor`, optional (default = `None`)
        The integer gold class labels of all possible spans, of shape
        `(batch_size, num_spans)`.

    # Returns

    An output dictionary consisting of:

    class_probabilities : `torch.FloatTensor`
        A tensor of shape `(batch_size, num_spans, span_label_vocab_size)`
        holding a distribution over the label classes per span.
    spans : `torch.LongTensor`
        The original spans tensor.
    tokens : `List[List[str]]`, required.
        The tokens of every sentence in the batch.
    pos_tags : `List[List[str]]`, required.
        The POS tags of every sentence in the batch.
    num_spans : `torch.LongTensor`, required.
        A tensor of shape (batch_size) with the number of non-padded spans
        per batch element.
    loss : `torch.FloatTensor`, optional
        A scalar loss to be optimised.
    """
    text_embeddings = self.text_field_embedder(tokens)
    if self.pos_tag_embedding is not None:
        # A model built with a POS embedding cannot run without POS tags.
        if pos_tags is None:
            raise ConfigurationError("Model uses a POS embedding, but no POS tags were passed.")
        pos_embeddings = self.pos_tag_embedding(pos_tags)
        text_embeddings = torch.cat([text_embeddings, pos_embeddings], -1)

    token_mask = get_text_field_mask(tokens)

    # Padding spans carry a negative start index, so the start index alone
    # tells real spans from padding. Shape: (batch_size, num_spans)
    span_starts = spans[:, :, 0]
    span_mask = (span_starts >= 0).squeeze(-1)
    # With a batch of one single-token sentence the squeeze above removes a
    # dimension too many; restore it for the mask and for the labels.
    if span_mask.dim() == 1:
        span_mask = span_mask.unsqueeze(-1)
    if span_labels is not None and span_labels.dim() == 1:
        span_labels = span_labels.unsqueeze(-1)

    num_spans = get_lengths_from_binary_sequence_mask(span_mask)

    contextual_text = self.encoder(text_embeddings, token_mask)
    span_features = self.span_extractor(contextual_text, spans, token_mask, span_mask)
    if self.feedforward_layer is not None:
        span_features = self.feedforward_layer(span_features)

    logits = self.tag_projection_layer(span_features)
    class_probabilities = masked_softmax(logits, span_mask.unsqueeze(-1))

    sentence_tokens = [meta["tokens"] for meta in metadata]
    sentence_pos_tags = [meta.get("pos_tags") for meta in metadata]
    output_dict = {
        "class_probabilities": class_probabilities,
        "spans": spans,
        "tokens": sentence_tokens,
        "pos_tags": sentence_pos_tags,
        "num_spans": num_spans,
    }

    if span_labels is not None:
        output_dict["loss"] = sequence_cross_entropy_with_logits(logits, span_labels, span_mask)
        self.tag_accuracy(class_probabilities, span_labels, span_mask)

    # Evalb is costly, so it is restricted to validation and test runs that
    # come with a gold tree for every sentence.
    batch_gold_trees = [meta.get("gold_tree") for meta in metadata]
    can_score_trees = (
        all(batch_gold_trees) and self._evalb_score is not None and not self.training
    )
    if can_score_trees:
        gold_pos_tags: List[List[str]] = [
            list(zip(*tree.pos()))[1] for tree in batch_gold_trees
        ]
        predicted_trees = self.construct_trees(
            class_probabilities.cpu().data,
            spans.cpu().data,
            num_spans.data,
            sentence_tokens,
            gold_pos_tags,
        )
        self._evalb_score(predicted_trees, batch_gold_trees)

    return output_dict
def forward(self, source_tokens, target_tokens=None) -> Dict[str, torch.Tensor]:
    """
    Run the sequence-to-sequence model in training, validation or prediction mode.

    Parameters
    ----------
    source_tokens
        Must provide ``source_tokens["tokens"]["token_ids"]`` and
        ``source_tokens["tokens"]["mask"]``.
    target_tokens : optional (default = None)
        Same layout as ``source_tokens``. If missing, the source shifted by
        one position is used as target for the teacher-forced pass.

    Returns
    -------
    While training, or when targets are given: the output of ``self.plm``
    extended with ``decoder_logits`` and ``loss``. Otherwise a dictionary
    with the best beam ``predictions`` and their ``log_probabilities``,
    post-processed by ``self.make_output_human_readable``.
    """
    input_ids = source_tokens["tokens"]["token_ids"]
    input_mask = source_tokens["tokens"]["mask"]
    has_targets = target_tokens is not None

    # If no targets are provided, then shift input to right by 1. Bart already does this internally
    # but it does not use them for loss calculation.
    if has_targets:
        target_ids = target_tokens["tokens"]["token_ids"]
        target_mask = target_tokens["tokens"]["mask"]
    else:
        target_ids = input_ids[:, 1:]
        target_mask = input_mask[:, 1:]

    if self.training or has_targets:
        # Teacher-forced pass, shared by training and validation: the decoder
        # reads the targets without their last token and is scored against
        # the targets without their first token.
        outputs = self.plm(input_ids=input_ids,
                           attention_mask=input_mask,
                           decoder_input_ids=target_ids[:, :-1].contiguous(),
                           decoder_attention_mask=target_mask[:, :-1].contiguous(),
                           use_cache=False,
                           return_dict=True)
        outputs["decoder_logits"] = outputs.logits
        # NOTE(review): training averages the loss per token while validation
        # uses the default averaging of the loss function, so the two losses
        # are not directly comparable - confirm that this is intended.
        loss_options = {"average": "token"} if self.training else {}
        outputs["loss"] = sequence_cross_entropy_with_logits(
            outputs.logits,
            cast(torch.LongTensor, target_ids[:, 1:].contiguous()),
            cast(torch.BoolTensor, target_mask[:, 1:].contiguous()),
            label_smoothing=0.1,
            **loss_options,
        )
        if not self.training:
            # Validation: score the greedy decoding against the gold targets.
            greedy_predictions = torch.argmax(outputs.logits, -1)
            self._rouge(greedy_predictions, target_ids)
            self._bleu(greedy_predictions, target_ids)
        return outputs

    # Prediction: there are no gold targets, so decode with beam search.
    # Use decoder start id and start of sentence to start decoder
    initial_decoder_ids = torch.tensor(
        [[self._decoder_start_id]],
        dtype=input_ids.dtype,
        device=input_ids.device,
    ).repeat(input_ids.shape[0], 1)
    initial_state = {
        "input_ids": input_ids,
        "input_mask": input_mask,
    }
    beam_result = self._beam_search.search(
        initial_decoder_ids, initial_state, self.take_step
    )
    all_predictions, beam_log_probabilities = beam_result[0], beam_result[1]
    # Dumping whole tensors for every batch is debugging output.
    logger.debug("Beam search result: %s", beam_result)

    # Keep only the highest scoring beam of every instance.
    max_pred_indices = (
        beam_log_probabilities.argmax(dim=-1)
        .view(-1, 1, 1)
        .expand(-1, -1, all_predictions.shape[-1])
    )
    predictions = all_predictions.gather(dim=1, index=max_pred_indices).squeeze(dim=1)

    # ROUGE and BLEU are deliberately not updated here: without gold targets
    # the only available "reference" would be the shifted source text.
    outputs = {
        "predictions": predictions,
        "log_probabilities": (
            beam_log_probabilities.gather(dim=-1, index=max_pred_indices[..., 0]).squeeze(dim=-1)
        ),
    }
    self.make_output_human_readable(outputs)
    return outputs
def forward(
        self,
        token_sequence: Dict[str, torch.Tensor],
        label_sequence: torch.Tensor = None) -> Dict[str, torch.Tensor]:
    """
    Embed the tokens, encode them and predict one label per token.

    Parameters
    ----------
    token_sequence : Dict[str, torch.Tensor], required
        The indexed text. ``token_sequence['tokens']`` and/or
        ``token_sequence['characters']`` are read, depending on the
        configured embedders.
    label_sequence : torch.Tensor, optional (default = None)
        The gold labels. If given, the metrics are updated and the loss is
        added to the output.

    Returns
    -------
    An output dictionary consisting of:
    label_logits : torch.Tensor
        The output of the final linear layer.
    loss : torch.Tensor, optional
        The sequence cross entropy; only present when ``label_sequence`` is given.
    """
    mask = get_text_field_mask(token_sequence)

    if 'mixture' in self.config.embedding_strategy:
        word2vec_embedder = self.embedders[1]
        word2vec_embeddings = word2vec_embedder(
            {'tokens': token_sequence['tokens']})
        # Keep in mind to manage any custom models that don't produce 256-dim vectors
        if any(strat in self.config.embedding_strategy
               for strat in ('elmo_original', 'elmo_pubmed')):
            # These ELMo variants get the word2vec vector repeated four times
            # along the feature axis.
            word2vec_embeddings = torch.cat([word2vec_embeddings] * 4, dim=2)

        # Pad with zeros at BOS and EOS. new_zeros puts the padding on the
        # device (and in the dtype) of the embeddings themselves.
        batch_size, _, embedding_dim = word2vec_embeddings.shape
        zeros = word2vec_embeddings.new_zeros((batch_size, 1, embedding_dim))
        padded_word2vec_embeddings = torch.cat(
            [zeros, word2vec_embeddings, zeros], dim=1)

        elmo_embedder = self.embedders[0]
        embeddings = elmo_embedder(
            {'characters': token_sequence['characters']},
            word2vec_embeddings=padded_word2vec_embeddings)
    else:
        # Every embedder is fed the representation it was built for and the
        # results are concatenated along the feature axis.
        embedded_parts = []
        for embedder in self.embedders:
            if hasattr(embedder, 'token_embedder_characters'):
                embedded_parts.append(
                    embedder({'characters': token_sequence['characters']}))
            elif hasattr(embedder, 'token_embedder_tokens'):
                embedded_parts.append(
                    embedder({'tokens': token_sequence['tokens']}))
        embeddings = torch.cat(embedded_parts, dim=2)

    encoder_output = self.encoder(embeddings, mask)
    label_logits = self.linear_layer(encoder_output)

    output = {"label_logits": label_logits}
    # Labels are optional: without them (plain inference) there is nothing
    # to score and no loss to compute.
    if label_sequence is not None:
        self.accuracy(label_logits, label_sequence, mask)
        self.F1(label_logits, label_sequence, mask)
        output["loss"] = sequence_cross_entropy_with_logits(
            label_logits, label_sequence, mask)
    return output
def forward(self,  # type: ignore
            tokens: Dict[str, torch.LongTensor],
            tags: torch.LongTensor = None,
            metadata: List[Dict[str, Any]] = None) -> Dict[str, torch.Tensor]:
    # pylint: disable=arguments-differ
    """
    Parameters
    ----------
    tokens : Dict[str, torch.LongTensor], required
        The output of ``TextField.as_array()``, which should typically be passed directly to a
        ``TextFieldEmbedder``. This output is a dictionary mapping keys to ``TokenIndexer``
        tensors.  At its most basic, using a ``SingleIdTokenIndexer`` this is: ``{"tokens":
        Tensor(batch_size, num_tokens)}``. This dictionary will have the same keys as were used
        for the ``TokenIndexers`` when you created the ``TextField`` representing your
        sequence.  The dictionary is designed to be passed directly to a ``TextFieldEmbedder``,
        which knows how to combine different word representations into a single vector per
        token in your input.
    tags : torch.LongTensor, optional (default = None)
        A torch tensor representing the sequence of integer gold class labels of shape
        ``(batch_size, num_tokens)``.
    metadata : ``List[Dict[str, Any]]``, optional, (default = None)
        metadata containing the original words in the sentence to be tagged under a 'words' key.

    Returns
    -------
    An output dictionary consisting of:
    logits : torch.FloatTensor
        A tensor of shape ``(batch_size, num_tokens, tag_vocab_size)`` representing
        unnormalised log probabilities of the tag classes.
    class_probabilities : torch.FloatTensor
        A tensor of shape ``(batch_size, num_tokens, tag_vocab_size)`` representing
        a distribution of the tag classes per word.
    loss : torch.FloatTensor, optional
        A scalar loss to be optimised. When ``self.do_crossentropy_weighting`` is set,
        every token's contribution is scaled by the relative frequency of the *other*
        class (binary tags 0 / 1 only), so the rarer class weighs more.
    words : List, optional
        The ``'words'`` entry of every metadata dictionary, when ``metadata`` is given.
    """
    # Developer switch: set to True to print the tag vocabularies while debugging.
    print_vocab = False

    embedded_text_input = self.text_field_embedder(tokens)
    batch_size, sequence_length, _ = embedded_text_input.size()
    mask = get_text_field_mask(tokens)
    encoded_text = self.encoder(embedded_text_input, mask)

    logits = self.tag_projection_layer(encoded_text)
    reshaped_log_probs = logits.view(-1, self.num_classes)
    class_probabilities = F.softmax(reshaped_log_probs, dim=-1).view(
        [batch_size, sequence_length, self.num_classes])

    output_dict = {"logits": logits, "class_probabilities": class_probabilities}

    if tags is not None:
        if self.do_crossentropy_weighting:
            # Custom loss: weight tags == 1 against tags == 0.
            # Note this only works for binary tags at present.
            num_tag0 = self.Ntags0  # Should correspond to non-blanks
            num_tag1 = self.Ntags1  # Should correspond to blanks
            if not (num_tag0 and num_tag1):
                # Either count is unspecified: estimate both from this batch.
                # Padding positions are excluded so that padded tags (which
                # share index 0) do not inflate the count of class 0.
                is_real_token = mask != 0
                num_tag0 = float(((tags == 0) & is_real_token).sum())
                num_tag1 = float(((tags == 1) & is_real_token).sum())
            num_tag0 = float(num_tag0)
            num_tag1 = float(num_tag1)

            # The weights must live in a floating point tensor: writing them
            # into a copy of the integer / boolean mask would truncate them.
            # ``clone`` guarantees that ``mask`` itself is never modified.
            weights = mask.clone().float()
            total = num_tag0 + num_tag1
            if total > 0:
                # Weight is inversely proportional to the class frequency,
                # expressed as a percentage.
                tag0_weight = num_tag1 / total * 100  # t0 weighting percent
                tag1_weight = num_tag0 / total * 100  # t1 weighting percent
                weights[tags == 1] = weights[tags == 1] * tag1_weight
                weights[tags == 0] = weights[tags == 0] * tag0_weight
            # With no tag-0 / tag-1 token at all there is nothing to
            # re-weight, so the plain mask is used.
            loss = sequence_cross_entropy_with_logits(logits, tags, weights)
        else:
            # Default AllenNLP loss
            loss = sequence_cross_entropy_with_logits(logits, tags, mask)

        if print_vocab:
            # Sizes observed on the first 20 articles (useful for choosing
            # embedding dimensions): dependencies 47, ner 21, pos 51,
            # tokens 21902, labels 2.
            for namespace in ('pos', 'ner', 'dependencies'):
                index_to_token = self.vocab.get_index_to_token_vocabulary(namespace)
                print(set(index_to_token.values()))

        for metric in self.metrics.values():
            metric(logits, tags, mask.float())
        output_dict["loss"] = loss

    if metadata is not None:
        output_dict["words"] = [x["words"] for x in metadata]
    return output_dict
def forward(
        self,  # type: ignore
        tokens: Dict[str, torch.LongTensor],
        verb_indicator: torch.LongTensor,
        tags: torch.LongTensor = None,
        metadata: List[Dict[str, Any]] = None) -> Dict[str, torch.Tensor]:
    # pylint: disable=arguments-differ
    """
    Parameters
    ----------
    tokens : Dict[str, torch.LongTensor], required
        The output of ``TextField.as_array()``, which should typically be passed directly to a
        ``TextFieldEmbedder``. This output is a dictionary mapping keys to ``TokenIndexer``
        tensors.  At its most basic, using a ``SingleIdTokenIndexer`` this is: ``{"tokens":
        Tensor(batch_size, num_tokens)}``. This dictionary will have the same keys as were used
        for the ``TokenIndexers`` when you created the ``TextField`` representing your
        sequence.  The dictionary is designed to be passed directly to a ``TextFieldEmbedder``,
        which knows how to combine different word representations into a single vector per
        token in your input.
    verb_indicator: torch.LongTensor, required.
        An integer ``SequenceFeatureField`` representation of the position of the verb
        in the sentence. This should have shape (batch_size, num_tokens) and importantly, can be
        all zeros, in the case that the sentence has no verbal predicate.
    tags : torch.LongTensor, optional (default = None)
        A torch tensor representing the sequence of integer gold class labels
        of shape ``(batch_size, num_tokens)``
    metadata : ``List[Dict[str, Any]]``, optional, (default = None)
        metadata containing the original words in the sentence and the verb to compute the
        frame for, under 'words' and 'verb' keys, respectively.

    Returns
    -------
    An output dictionary consisting of:
    logits : torch.FloatTensor
        A tensor of shape ``(batch_size, num_tokens, tag_vocab_size)`` representing
        unnormalised log probabilities of the tag classes.
    class_probabilities : torch.FloatTensor
        A tensor of shape ``(batch_size, num_tokens, tag_vocab_size)`` representing
        a distribution of the tag classes per word.
    mask : torch.Tensor
        The text field mask, kept so that padding can be cropped at decoding time.
    loss : torch.FloatTensor, optional
        A scalar loss to be optimised.
    words, verb : List, optional
        The 'words' and 'verb' entries of every metadata dictionary, when
        ``metadata`` is given.
    """
    embedded_text_input = self.embedding_dropout(
        self.text_field_embedder(tokens))
    mask = get_text_field_mask(tokens)
    embedded_verb_indicator = self.binary_feature_embedding(
        verb_indicator.long())
    # Concatenate the verb feature onto the embedded text. This now
    # has shape (batch_size, sequence_length, embedding_dim + binary_feature_dim).
    embedded_text_with_verb_indicator = torch.cat(
        [embedded_text_input, embedded_verb_indicator], -1)
    batch_size, sequence_length, _ = embedded_text_with_verb_indicator.size()

    encoded_text = self.encoder(embedded_text_with_verb_indicator, mask)

    logits = self.tag_projection_layer(encoded_text)
    reshaped_log_probs = logits.view(-1, self.num_classes)
    class_probabilities = F.softmax(reshaped_log_probs, dim=-1).view(
        [batch_size, sequence_length, self.num_classes])
    output_dict = {
        "logits": logits,
        "class_probabilities": class_probabilities
    }
    if tags is not None:
        loss = sequence_cross_entropy_with_logits(
            logits, tags, mask, label_smoothing=self._label_smoothing)
        if not self.ignore_span_metric:
            self.span_metric(class_probabilities, tags, mask)
        output_dict["loss"] = loss

    # We need to retain the mask in the output dictionary
    # so that we can crop the sequences to remove padding
    # when we do viterbi inference in self.decode.
    output_dict["mask"] = mask

    # ``metadata`` is optional, so it may only be iterated inside the guard.
    # Two plain comprehensions also cope with an empty list, which a
    # ``zip(*...)`` unpacking into two names would not.
    if metadata is not None:
        output_dict["words"] = [x["words"] for x in metadata]
        output_dict["verb"] = [x["verb"] for x in metadata]
    return output_dict
def train_model(epochs=50,
                num_gradients_accumulation=4,
                batch_size=4,
                gpu_id=0,
                lr=1e-5,
                load_dir='decoder_model',
                max_grad_norm=1.0):
    """
    Train ``transformers_model`` and keep the checkpoint with the lowest
    validation perplexity.

    Training data is read from ``../train_data.pth`` and validation data from
    ``../validate_data.pth``. Every batch unpacks into
    ``(encoder_input, decoder_input, mask_encoder_input, mask_decoder_input)``.
    The logit at step ``i`` is compared with the decoder token at step
    ``i + 1`` using a token-averaged, masked cross entropy.

    Parameters
    ----------
    epochs : int
        Number of passes over the training set.
    num_gradients_accumulation : int
        Number of batches whose gradients are accumulated before one
        optimizer / scheduler step.
    batch_size : int
        Batch size of both data loaders.
    gpu_id : int
        Index of the CUDA device the model and the batches are moved to.
    lr : float
        Peak learning rate of AdamW (linear warm-up over the first 10% of the
        optimization steps, then linear decay).
    load_dir : str
        Directory, relative to the current working directory, that receives
        ``best_model.pth``.
    max_grad_norm : float
        Gradient norm threshold applied before every optimizer step.
    """
    # make sure your model is on GPU
    device = torch.device(f"cuda:{gpu_id}")

    #------------------------LOAD MODEL-----------------
    print('load the model....')
    model = transformers_model()
    model.to(device)
    print('load success')
    #------------------------END LOAD MODEL--------------

    #------------------------LOAD TRAIN DATA------------------
    train_data = torch.load("../train_data.pth")
    train_dataset = TensorDataset(*train_data)
    train_dataloader = DataLoader(dataset=train_dataset,
                                  shuffle=True,
                                  batch_size=batch_size)
    val_data = torch.load("../validate_data.pth")
    val_dataset = TensorDataset(*val_data)
    val_dataloader = DataLoader(dataset=val_dataset,
                                shuffle=True,
                                batch_size=batch_size)
    #------------------------END LOAD TRAIN DATA--------------

    #------------------------SET OPTIMIZER-------------------
    num_train_optimization_steps = len(
        train_dataset) * epochs // batch_size // num_gradients_accumulation

    # Biases and LayerNorm parameters are excluded from weight decay.
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [p for n, p in param_optimizer
                   if not any(nd in n for nd in no_decay)],
        'weight_decay': 0.01
    }, {
        'params': [p for n, p in param_optimizer
                   if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]

    optimizer = AdamW(
        optimizer_grouped_parameters,
        lr=lr,
        weight_decay=0.01,
    )
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=num_train_optimization_steps // 10,
        num_training_steps=num_train_optimization_steps
    )
    #------------------------END SET OPTIMIZER--------------

    def _token_loss(batch):
        """Run the model on one batch and return its token-averaged loss."""
        batch = [item.to(device) for item in batch]
        encoder_input, decoder_input, mask_encoder_input, mask_decoder_input = batch
        logits = model(encoder_input, mask_encoder_input, decoder_input, mask_decoder_input)

        # Drop the last logit and the first target so that step i predicts
        # token i + 1.
        out = logits[:, :-1].contiguous()
        target = decoder_input[:, 1:].contiguous()
        target_mask = mask_decoder_input[:, 1:].contiguous()
        return util.sequence_cross_entropy_with_logits(out, target, target_mask, average="token")

    direct_path = os.path.join(os.path.abspath('.'), load_dir)
    os.makedirs(direct_path, exist_ok=True)

    #------------------------START TRAINING-------------------
    update_count = 0
    lowest_perplexity = float('inf')

    start = time.time()
    print('start training....')
    for epoch in range(epochs):
        #------------------------training------------------------
        model.train()
        losses = 0
        times = 0
        for batch in train_dataloader:
            loss = _token_loss(batch)
            loss.backward()

            losses += loss.item()
            times += 1

            update_count += 1

            # Step once every ``num_gradients_accumulation`` batches; the
            # counter is incremented first, so the test is against 0.
            if update_count % num_gradients_accumulation == 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
        end = time.time()
        print('-' * 20 + f'epoch {epoch}' + '-' * 20)
        print(f'time: {(end - start)}')
        print(f'loss: {losses / times}')
        start = end

        #------------------------validate------------------------
        model.eval()

        perplexity = 0
        batch_count = 0
        print('start calculate the perplexity....')

        with torch.no_grad():
            for batch in val_dataloader:
                loss = _token_loss(batch)
                # The reported value is the mean of the per-batch perplexities.
                perplexity += np.exp(loss.item())
                batch_count += 1

        print(f'validate perplexity: {perplexity / batch_count}')
        validate_perplexity = perplexity / batch_count

        # Only the best model so far is kept on disk.
        if validate_perplexity < lowest_perplexity:
            lowest_perplexity = validate_perplexity
            torch.save(model.state_dict(), os.path.join(direct_path, "best_model.pth"))
def train_model(epochs=10,
                num_gradients_accumulation=4,
                batch_size=8,
                gpu_id=0,
                lr=1e-4,
                load_dir='decoder_model',
                decoder_model='original_pretrained_model_for_bertGPT.pth'):
    """
    Fine-tune ``BertGPT`` starting from a pretrained checkpoint and save the
    weights after every epoch.

    Training data is read from ``train_data.pth`` and validation data from
    ``validate_data.pth``. Every batch unpacks into
    ``(encoder_input, decoder_input, mask_encoder_input, mask_decoder_input)``.
    The logit at step ``i`` is compared with the decoder token at step
    ``i + 1`` using a token-averaged, masked cross entropy.

    Parameters
    ----------
    epochs : int
        Number of passes over the training set.
    num_gradients_accumulation : int
        Number of batches whose gradients are accumulated before one
        optimizer step.
    batch_size : int
        Batch size of both data loaders.
    gpu_id : int
        Index of the CUDA device the model and the batches are moved to.
    lr : float
        Learning rate handed to ``OpenAIAdam``.
    load_dir : str
        Directory, relative to the current working directory, that receives
        the ``<epoch>model.pth`` checkpoints.
    decoder_model : str
        Path of the state dict loaded into the model before training.
    """
    # make sure your model is on GPU
    device = torch.device(f"cuda:{gpu_id}")

    #------------------------LOAD MODEL-----------------
    print('load the model....')
    model = BertGPT()
    # Deserialize on the CPU first so the checkpoint loads regardless of the
    # device it was saved from; the model is moved to ``device`` right after.
    model.load_state_dict(torch.load(decoder_model, map_location="cpu"))
    model = model.to(device)
    print('load success')
    #------------------------END LOAD MODEL--------------

    #------------------------LOAD TRAIN DATA------------------
    train_data = torch.load("train_data.pth")
    train_dataset = MyDataset(*train_data)
    train_dataloader = DataLoader(dataset=train_dataset,
                                  shuffle=True,
                                  batch_size=batch_size,
                                  num_workers=2,
                                  collate_fn=collate_fn)
    val_data = torch.load("validate_data.pth")
    val_dataset = MyDataset(*val_data)
    val_dataloader = DataLoader(dataset=val_dataset,
                                shuffle=True,
                                batch_size=batch_size,
                                num_workers=2,
                                collate_fn=collate_fn)
    #------------------------END LOAD TRAIN DATA--------------

    #------------------------SET OPTIMIZER-------------------
    num_train_optimization_steps = len(
        train_dataset) * epochs // batch_size // num_gradients_accumulation

    # Only trainable parameters are optimized; biases and LayerNorm
    # parameters are excluded from weight decay.
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in param_optimizer
            if not any(nd in n for nd in no_decay) and p.requires_grad
        ],
        'weight_decay': 0.01
    }, {
        'params': [
            p for n, p in param_optimizer
            if any(nd in n for nd in no_decay) and p.requires_grad
        ],
        'weight_decay': 0.0
    }]
    print('train')
    print(len(optimizer_grouped_parameters[0]['params']))

    optimizer = OpenAIAdam(optimizer_grouped_parameters,
                           lr=lr,
                           warmup=0.01,
                           max_grad_norm=1.0,
                           weight_decay=0.01,
                           t_total=num_train_optimization_steps)
    #------------------------END SET OPTIMIZER--------------

    def _token_loss(batch):
        """Run the model on one batch and return its token-averaged loss."""
        batch = [item.to(device) for item in batch]
        encoder_input, decoder_input, mask_encoder_input, mask_decoder_input = batch
        logits = model(encoder_input, mask_encoder_input, decoder_input, mask_decoder_input)

        # Drop the last logit and the first target so that step i predicts
        # token i + 1.
        out = logits[:, :-1].contiguous()
        target = decoder_input[:, 1:].contiguous()
        target_mask = mask_decoder_input[:, 1:].contiguous()
        return util.sequence_cross_entropy_with_logits(out, target, target_mask, average="token")

    direct_path = os.path.join(os.path.abspath('.'), load_dir)
    os.makedirs(direct_path, exist_ok=True)

    #------------------------START TRAINING-------------------
    update_count = 0

    start = time.time()
    print('start training....')
    for epoch in range(epochs):
        #------------------------training------------------------
        model.train()
        losses = 0
        times = 0
        for batch in tqdm(train_dataloader, desc='dirs'):
            loss = _token_loss(batch)
            loss.backward()

            losses += loss.item()
            times += 1

            update_count += 1

            # Step once every ``num_gradients_accumulation`` batches; the
            # counter is incremented first, so the test is against 0.
            if update_count % num_gradients_accumulation == 0:
                optimizer.step()
                optimizer.zero_grad()
        end = time.time()
        print('-' * 20 + f'epoch {epoch}' + '-' * 20)
        print(f'time: {(end - start)}')
        print(f'loss: {losses / times}')
        start = end

        #------------------------validate------------------------
        model.eval()

        perplexity = 0
        batch_count = 0
        print('start calculate the perplexity....')

        with torch.no_grad():
            for batch in tqdm(val_dataloader):
                loss = _token_loss(batch)
                # The reported value is the mean of the per-batch perplexities.
                perplexity += np.exp(loss.item())
                batch_count += 1

        print(f'validate perplexity: {perplexity / batch_count}')

        # One checkpoint per epoch, named after the epoch index.
        torch.save(model.state_dict(),
                   os.path.join(direct_path, str(epoch) + "model.pth"))
def forward(
        self,  # type: ignore
        tokens: Dict[str, torch.LongTensor],
        spans: torch.LongTensor,
        span_labels: torch.LongTensor = None) -> Dict[str, torch.Tensor]:
    # pylint: disable=arguments-differ
    """
    Parameters
    ----------
    tokens : Dict[str, torch.LongTensor], required
        The output of ``TextField.as_array()``, which should typically be passed directly to a
        ``TextFieldEmbedder``. This output is a dictionary mapping keys to ``TokenIndexer``
        tensors.  At its most basic, using a ``SingleIdTokenIndexer`` this is: ``{"tokens":
        Tensor(batch_size, num_tokens)}``. This dictionary will have the same keys as were used
        for the ``TokenIndexers`` when you created the ``TextField`` representing your
        sequence.  The dictionary is designed to be passed directly to a ``TextFieldEmbedder``,
        which knows how to combine different word representations into a single vector per
        token in your input.
    spans : ``torch.LongTensor``, required.
        A tensor of shape ``(batch_size, num_spans, 2)`` representing the
        inclusive start and end indices of all possible spans in the sentence.
        Padding spans are recognised by a negative start index.
    span_labels : torch.LongTensor, optional (default = None)
        A torch tensor representing the integer gold class labels for all possible
        spans, of shape ``(batch_size, num_spans)``.

    Returns
    -------
    An output dictionary consisting of:
    class_probabilities : ``torch.FloatTensor``
        A tensor of shape ``(batch_size, num_spans, span_label_vocab_size)``
        representing a distribution over the label classes per span.
    spans : ``torch.LongTensor``
        The original spans tensor.
    tokens : ``torch.LongTensor``
        The ``"tokens"`` entry of the input dictionary.
    token_mask : ``torch.LongTensor``
        The text field mask of the input tokens.
    loss : ``torch.FloatTensor``, optional
        A scalar loss to be optimised.
    """
    embedded_text_input = self.text_field_embedder(tokens)
    mask = get_text_field_mask(tokens)
    # Looking at the span start index is enough to know if
    # this is padding or not. Shape: (batch_size, num_spans)
    # Indexing the last axis already yields a 2-D tensor; squeezing it would
    # drop the span axis whenever a batch contains exactly one span.
    span_mask = (spans[:, :, 0] >= 0).long()

    encoded_text = self.encoder(embedded_text_input, mask)

    span_representations = self.span_extractor(encoded_text, spans, mask, span_mask)

    if self.feedforward_layer is not None:
        span_representations = self.feedforward_layer(span_representations)

    logits = self.tag_projection_layer(span_representations)
    class_probabilities = last_dim_softmax(logits, span_mask.unsqueeze(-1))

    output_dict = {
        "class_probabilities": class_probabilities,
        "spans": spans,
        # TODO(Mark): This relies on having tokens represented with a SingleIdTokenIndexer...
        "tokens": tokens["tokens"],
        "token_mask": mask
    }
    if span_labels is not None:
        loss = sequence_cross_entropy_with_logits(logits, span_labels, span_mask)
        for metric in self.metrics.values():
            metric(logits, span_labels, span_mask)
        output_dict["loss"] = loss

    return output_dict
def forward(self, prev_tokens: Dict[str, torch.LongTensor],
            prev_tags: Dict[str, torch.LongTensor],
            fol_tokens: Dict[str, torch.LongTensor],
            fol_tags: Dict[str, torch.LongTensor],
            prev_labels: torch.Tensor = None,
            fol_labels: torch.Tensor = None,
            conflicts: List[Any] = None,
            metadata: List[Dict[str, Any]] = None) -> Dict[str, torch.Tensor]:
    """
    Tag the tokens of a previous ("prev") and a follow-up ("fol") utterance,
    build a restated utterance from the predicted tags and, in training
    mode, compute the loss.

    Parameters
    ----------
    prev_tokens, fol_tokens : Dict[str, torch.LongTensor]
        Indexed text fields of the two utterances; embedded with
        ``self.token_field_embedding``.
    prev_tags, fol_tags : Dict[str, torch.LongTensor]
        Indexed tag fields of the two utterances; embedded with
        ``self.char_field_embedding``.
    prev_labels, fol_labels : torch.Tensor, optional
        Gold per-token labels. When ``prev_labels`` is given in training
        mode the tagger is trained with a supervised cross entropy;
        otherwise label sequences are sampled and scored (REINFORCE-style).
    conflicts : List[Any], optional
        One list per batch element of
        ``((prev_start, prev_end), (fol_start, fol_end), conflict_mode)``
        triples. Read only on the supervised path; on the sampling path it is
        replaced by the conflicts returned from ``sample_action``.
    metadata : List[Dict[str, Any]]
        One dictionary per batch element. Its ``"origin_obj"`` entry is read
        unconditionally, and ``origin_obj["restate"]`` must expose
        ``utterance`` and ``tags``.
        NOTE(review): the default is ``None`` but the value is indexed on
        every call, so it is effectively required.

    Returns
    -------
    A dictionary with ``"probs"`` (softmax over the prev tag scores),
    ``"prev_labels"`` / ``"fol_labels"`` (argmax tags), ``"restate"``
    (predicted restated strings), ``"max_bleu"`` (always an empty list in this
    method) and, in training mode only, ``"loss"``.
    """
    prev_mask = get_text_field_mask(prev_tokens)
    # embedding sequence
    prev_embedding_seq = self.token_field_embedding(prev_tokens)
    # embedding tag
    prev_tag_embedding = self.char_field_embedding(prev_tags)

    fol_mask = get_text_field_mask(fol_tokens)
    # embedding sequence
    fol_embedding_seq = self.token_field_embedding(fol_tokens)
    # embedding tag
    fol_tag_embedding = self.char_field_embedding(fol_tags)

    batch_size, _ = prev_mask.size()

    # Create the helper tensors on the same device as the embeddings.
    gpu_device = prev_embedding_seq.device

    # One extra feature per token marks the utterance it belongs to:
    # 0.0 for the previous utterance, 1.0 for the follow-up.
    prev_phrase_tensor = torch.tensor([0.0], device=gpu_device)
    fol_phrase_tensor = torch.tensor([1.0], device=gpu_device)
    prev_phrase_embedding_seq = prev_phrase_tensor.repeat(
        prev_embedding_seq.size(0),
        prev_embedding_seq.size(1),
        1
    )
    fol_phrase_embedding_seq = fol_phrase_tensor.repeat(
        fol_embedding_seq.size(0),
        fol_embedding_seq.size(1),
        1
    )

    # Concatenate token embedding, utterance marker and tag embedding.
    prev_embedding_seq = torch.cat([prev_embedding_seq, prev_phrase_embedding_seq, prev_tag_embedding], dim=2)
    fol_embedding_seq = torch.cat([fol_embedding_seq, fol_phrase_embedding_seq, fol_tag_embedding], dim=2)

    prev_embedding_seq = self.projection_layer(prev_embedding_seq)
    fol_embedding_seq = self.projection_layer(fol_embedding_seq)

    if self.training:
        # Dropout is applied to both utterances in a single call: they are
        # joined along the sequence axis and split again afterwards.
        embedding = torch.cat([prev_embedding_seq, fol_embedding_seq], dim=1)
        embedding_var = self._variational_dropout(embedding)
        prev_mask_len = prev_mask.size(1)
        prev_embedding_seq_var = embedding_var[:, :prev_mask_len]
        fol_embedding_seq_var = embedding_var[:, prev_mask_len:]
    else:
        prev_embedding_seq_var = prev_embedding_seq
        fol_embedding_seq_var = fol_embedding_seq

    # encode sequence
    prev_encoder_out = self.tokens_encoder(prev_embedding_seq_var, prev_mask)
    fol_encoder_out = self.tokens_encoder(fol_embedding_seq_var, fol_mask)

    # The first ``hidden_size`` features are treated as the forward half of
    # the encoder output and the rest as the backward half
    # (assumes a bidirectional encoder -- TODO confirm).
    prev_forward_output = prev_encoder_out[:, :, :self.hidden_size]
    prev_backward_output = prev_encoder_out[:, :, self.hidden_size:]

    fol_forward_output = fol_encoder_out[:, :, :self.hidden_size]
    fol_backward_output = fol_encoder_out[:, :, self.hidden_size:]

    # Attention from every prev token over the fol tokens. The forward and
    # backward scores are summed and normalised with a masked softmax; the
    # pairwise mask is the outer product of the two token masks.
    prev_attn_mask = prev_mask.view(batch_size, -1, 1) * fol_mask.view(batch_size, 1, -1)
    prev_forward_attn_matrix = self._self_attention(prev_forward_output, fol_forward_output) / self._scaled_value
    prev_backward_attn_matrix = self._self_attention(prev_backward_output, fol_backward_output) / self._scaled_value
    prev_mean_pooling_attn = util.masked_softmax(prev_forward_attn_matrix + prev_backward_attn_matrix,
                                                 prev_attn_mask)
    prev_attn_vec = torch.matmul(prev_mean_pooling_attn, fol_encoder_out)

    # The same attention in the other direction: fol tokens over prev tokens.
    fol_attn_mask = fol_mask.view(batch_size, -1, 1) * prev_mask.view(batch_size, 1, -1)
    fol_forward_attn_matrix = self._self_attention(fol_forward_output, prev_forward_output) / self._scaled_value
    fol_backward_attn_matrix = self._self_attention(fol_backward_output, prev_backward_output) / self._scaled_value
    fol_mean_pooling_attn = util.masked_softmax(fol_forward_attn_matrix + fol_backward_attn_matrix,
                                                fol_attn_mask)
    fol_attn_vec = torch.matmul(fol_mean_pooling_attn, prev_encoder_out)

    # Element-wise interaction between every token and its attended vector.
    prev_attn_multiply = prev_encoder_out * prev_attn_vec
    # A single zero row (sized from the prev features) is reused for both
    # utterances below.
    zero_tensor = torch.zeros((batch_size, 1, prev_attn_multiply.size(2)), device=gpu_device, dtype=torch.float)
    # Shift the interaction features one position to the right (zero at the
    # front), so every position also receives its left neighbour's features.
    prev_attn_shift = torch.cat((zero_tensor, prev_attn_multiply[:, :-1, :]), dim=1)
    prev_linear = torch.cat([prev_encoder_out, prev_attn_multiply, prev_attn_shift], dim=2)

    fol_attn_multiply = fol_encoder_out * fol_attn_vec
    fol_attn_shift = torch.cat((zero_tensor, fol_attn_multiply[:, :-1, :]), dim=1)
    fol_linear = torch.cat([fol_encoder_out, fol_attn_multiply, fol_attn_shift], dim=2)

    # Per-token tag scores from the shared policy network.
    prev_tag_logistics = self.policy_net(prev_linear)
    fol_tag_logistics = self.policy_net(fol_linear)

    # Normalise the scores and take the most likely tag per token.
    prev_tag_prob = F.softmax(prev_tag_logistics, dim=2)
    prev_predict_labels = torch.argmax(prev_tag_prob, dim=2)

    fol_tag_prob = F.softmax(fol_tag_logistics, dim=2)
    fol_predict_labels = torch.argmax(fol_tag_prob, dim=2)

    predict_restate_str_list = []
    predict_restate_tag_list = []
    # Never filled in this method; returned as-is under "max_bleu".
    max_bleu_list = []

    # debug information
    _debug_batch_conflict_map = {}

    # Use the predicted labels to cut each utterance into spans and build the
    # restated utterance for every batch element.
    for batch_ind in range(batch_size):
        _debug_batch_conflict_map[batch_ind] = []

        # batch reference object
        batch_origin_obj = metadata[batch_ind]["origin_obj"]

        # Only the unpadded prefix of every label sequence is used.
        prev_start_end, fol_start_end = predict_span_start_end(
            prev_predict_labels[batch_ind, :sum(prev_mask[batch_ind])],
            fol_predict_labels[batch_ind, :sum(fol_mask[batch_ind])])

        # Phase 2: Predict actual fusion str via span start/end and similar gate
        predict_restate_str, predict_restate_tag \
            = self.predict_restate(batch_origin_obj,
                                   fol_start_end,
                                   prev_start_end,
                                   prev_forward_output,
                                   prev_backward_output,
                                   fol_forward_output,
                                   fol_backward_output,
                                   batch_ind,
                                   gpu_device,
                                   _debug_batch_conflict_map)

        # add it to batch
        predict_restate_str_list.append(predict_restate_str)
        predict_restate_tag_list.append(predict_restate_tag)

    # Gold restated utterances (joined with single spaces) and their tags.
    batch_golden_restate_str = [" ".join(single_metadata["origin_obj"]["restate"].utterance)
                                for single_metadata in metadata]
    batch_golden_restate_tag = [single_metadata["origin_obj"]["restate"].tags
                                for single_metadata in metadata]

    output = {
        "probs": prev_tag_prob,
        "prev_labels": prev_predict_labels,
        "fol_labels": fol_predict_labels,
        "restate": predict_restate_str_list,
        "max_bleu": max_bleu_list
    }

    avg_bleu = self.metrics["bleu"](predict_restate_str_list, batch_golden_restate_str)
    avg_symbol = self.metrics["symbol"](predict_restate_tag_list, batch_golden_restate_tag)

    # Overall measure: fixed 0.4 / 0.6 blend of the two scores, recorded once
    # per batch element.
    self.metrics["overall"]([0.4 * avg_bleu + 0.6 * avg_symbol] * batch_size)

    conflict_confidences = []
    # The loss is only computed in training mode.
    if self.training:
        if prev_labels is not None:
            # Supervised path: both utterances are joined along the sequence
            # axis and trained with a label-smoothed masked cross entropy.
            labels = torch.cat([prev_labels, fol_labels], dim=1)
            # Initialization pre-training with longest common string
            logistics = torch.cat([prev_tag_logistics, fol_tag_logistics], dim=1)
            mask = torch.cat([prev_mask, fol_mask], dim=1)
            loss_snippet = sequence_cross_entropy_with_logits(logistics, labels, mask, label_smoothing=0.2)
            # for pre-training, we regard them as optimal ground truth
            conflict_confidences = [1.0] * batch_size
        else:
            # Sampling path: draw label sequences from the policy and weight
            # their log-probabilities by mean-centred rewards.
            if DEBUG:
                rl_sample_count = 1
            else:
                rl_sample_count = 20

            batch_loss_snippet = []
            batch_sample_conflicts = []

            for batch_ind in range(batch_size):
                dynamic_conflicts = []
                dynamic_confidence = []

                # batch reference object
                batch_origin_obj = metadata[batch_ind]["origin_obj"]

                # Number of unpadded tokens, as plain numbers usable for slicing.
                prev_mask_len = prev_mask[batch_ind].sum().view(1).data.cpu().numpy()[0]
                fol_mask_len = fol_mask[batch_ind].sum().view(1).data.cpu().numpy()[0]

                sample_data = []

                for _ in range(rl_sample_count):
                    prev_multi = Categorical(logits=prev_tag_logistics[batch_ind])
                    fol_multi = Categorical(logits=fol_tag_logistics[batch_ind])

                    # The label of the first token is always overwritten with 1
                    # after sampling.
                    prev_label_tensor = prev_multi.sample()
                    prev_label_tensor.data[0].fill_(1)
                    prev_sample_label = prev_label_tensor.data.cpu().numpy().astype(int)[:prev_mask_len]

                    fol_label_tensor = fol_multi.sample()
                    fol_label_tensor.data[0].fill_(1)
                    fol_sample_label = fol_label_tensor.data.cpu().numpy().astype(int)[:fol_mask_len]

                    # Log-probabilities of the (overwritten) samples of both
                    # utterances, joined into one vector.
                    log_prob = torch.cat(
                        [prev_multi.log_prob(prev_label_tensor),
                         fol_multi.log_prob(fol_label_tensor)],
                        dim=-1)

                    conflict_prob_mat = self.calculate_conflict_prob_matrix(prev_sample_label,
                                                                            fol_sample_label,
                                                                            batch_ind,
                                                                            prev_forward_output,
                                                                            prev_backward_output,
                                                                            fol_forward_output,
                                                                            fol_backward_output,
                                                                            gpu_device)
                    self.policy_net.saved_log_probs.append(log_prob)

                    sample_data.append((prev_sample_label, fol_sample_label, batch_origin_obj, conflict_prob_mat))

                if DEBUG:
                    ret_data = [sample_action(row) for row in sample_data]
                else:
                    # Parallel to speed up the sampling process
                    with ThreadPool(4) as p:
                        chunk_size = rl_sample_count // 4
                        ret_data = p.map(sample_action, sample_data, chunksize=chunk_size)

                for conflict_confidence, reinforce_reward, conflict_pair in ret_data:
                    self.policy_net.rewards.append(reinforce_reward)
                    dynamic_conflicts.append(conflict_pair)
                    dynamic_confidence.append(conflict_confidence)

                rewards = torch.tensor(self.policy_net.rewards, device=gpu_device).float()
                self.metrics["reward"](self.policy_net.rewards)
                # Centre the rewards on their mean (acts as a baseline).
                rewards -= rewards.mean().detach()
                self.metrics["reward_var"]([rewards.std().data.cpu().numpy()])

                loss_snippet = []
                # reward high, optimize it; reward low, reversal optimization
                for log_prob, reward in zip(self.policy_net.saved_log_probs, rewards):
                    loss_snippet.append((- log_prob * reward).unsqueeze(0))

                # Average over tokens, sum over samples.
                loss_snippet = torch.cat(loss_snippet).mean(dim=1).sum().view(1)
                batch_loss_snippet.append(loss_snippet)

                # Pick one of the samples uniformly at random (not the one
                # with the highest reward) to supply the conflicts for the
                # margin loss below.
                best_conflict_id = choice(range(rl_sample_count))
                batch_sample_conflicts.append(dynamic_conflicts[best_conflict_id])
                conflict_confidences.append(dynamic_confidence[best_conflict_id])

                # Clear the stored log-probabilities and rewards before the
                # next batch element.
                self.policy_net.reset()

            loss_snippet = torch.cat(batch_loss_snippet).mean()

            # The sampled conflicts replace the ``conflicts`` argument.
            conflicts = []
            for conflict_batch_id in range(batch_size):
                conflicts.append(batch_sample_conflicts[conflict_batch_id])

        # Training Phase 1: train snippet model
        total_loss = loss_snippet

        # Inputs of the margin loss: the similarity is compared against 0 with
        # target +1 for real conflicts and -1 otherwise.
        border = torch.tensor([0.0], device=gpu_device)
        pos_target = torch.tensor([1.0], device=gpu_device)
        neg_target = torch.tensor([-1.0], device=gpu_device)

        # Training Phase 2: train conflict model via margin loss
        loss_conflict = torch.tensor([0.0], device=gpu_device)[0]

        for batch_ind in range(0, batch_size):
            batch_conflict_list = conflicts[batch_ind]

            # use prediction results to conflict
            temp_loss_conflict = torch.tensor([0.0], device=gpu_device)[0]
            # Batch elements without conflicts contribute nothing.
            if batch_conflict_list and len(batch_conflict_list) > 0:
                for conflict in batch_conflict_list:
                    (prev_start, prev_end), (fol_start, fol_end), conflict_mode = conflict
                    fol_span_repr = get_span_repr(fol_forward_output[batch_ind],
                                                  fol_backward_output[batch_ind],
                                                  fol_start, fol_end)
                    prev_span_repr = get_span_repr(prev_forward_output[batch_ind],
                                                   prev_backward_output[batch_ind],
                                                   prev_start, prev_end)
                    inter_prob = self.cosine_similar(fol_span_repr, prev_span_repr).view(1)
                    # actual conflict
                    if conflict_mode == 1:
                        temp_loss_conflict += self.margin_loss(inter_prob, border, pos_target)
                    else:
                        temp_loss_conflict += self.margin_loss(inter_prob, border, neg_target)

                # Average over the conflicts of this element, scaled by the
                # confidence attached to them.
                temp_confidence = conflict_confidences[batch_ind]
                loss_conflict += temp_confidence * temp_loss_conflict / len(batch_conflict_list)

        loss_conflict = loss_conflict / batch_size

        # for larger margin
        total_loss += loss_conflict

        output["loss"] = total_loss
    return output
def forward(self,  # type: ignore
            tokens: Dict[str, torch.LongTensor],
            spans: torch.LongTensor,
            metadata: List[Dict[str, Any]],
            span_labels: torch.LongTensor = None) -> Dict[str, torch.Tensor]:
    # pylint: disable=arguments-differ
    """
    Label every candidate span of each sentence and, outside of training,
    optionally score the resulting trees against gold trees.

    Parameters
    ----------
    tokens : Dict[str, torch.LongTensor], required
        The output of ``TextField.as_array()``; a dictionary mapping
        ``TokenIndexer`` keys to tensors, e.g.
        ``{"tokens": Tensor(batch_size, num_tokens)}`` for a
        ``SingleIdTokenIndexer``. It is handed unchanged to the
        ``TextFieldEmbedder``.
    spans : ``torch.LongTensor``, required.
        Shape ``(batch_size, num_spans, 2)``: inclusive start and end
        indices of all candidate spans. A negative start index marks a
        padded span.
    metadata : List[Dict[str, Any]], required.
        One dictionary per batch element with keys:
            tokens : ``List[str]``, required.
                The original string tokens of the sentence.
            gold_tree : ``nltk.Tree``, optional (default = None)
                Gold tree used for evaluation.
    span_labels : ``torch.LongTensor``, optional (default = None)
        Integer gold class labels of shape ``(batch_size, num_spans)``.

    Returns
    -------
    An output dictionary consisting of:
    class_probabilities : ``torch.FloatTensor``
        Shape ``(batch_size, num_spans, span_label_vocab_size)``; a
        distribution over the label classes per span.
    spans : ``torch.LongTensor``
        The ``spans`` tensor that was passed in.
    tokens : ``List[List[str]]``
        The tokens of each sentence in the batch, taken from ``metadata``.
    num_spans : ``torch.LongTensor``
        Shape ``(batch_size)``; the number of non-padded spans per element.
    loss : ``torch.FloatTensor``, optional
        A scalar loss, present only when ``span_labels`` is given.
    """
    embedded_text_input = self.text_field_embedder(tokens)
    mask = get_text_field_mask(tokens)

    # A span is real iff its start index is non-negative, so the start
    # column alone identifies padding. Shape: (batch_size, num_spans)
    span_starts = spans[:, :, 0]
    span_mask = (span_starts >= 0).squeeze(-1).long()
    if span_mask.dim() == 1:
        # Batch size 1 with a single-token sentence collapses the mask to
        # one dimension; restore the span axis.
        span_mask = span_mask.unsqueeze(-1)
    num_spans = get_lengths_from_binary_sequence_mask(span_mask)

    encoded_text = self.encoder(embedded_text_input, mask)
    span_representations = self.span_extractor(
        encoded_text, spans, mask, span_mask)
    if self.feedforward_layer is not None:
        span_representations = self.feedforward_layer(span_representations)

    logits = self.tag_projection_layer(span_representations)
    class_probabilities = last_dim_softmax(logits, span_mask.unsqueeze(-1))

    output_dict = {
        "class_probabilities": class_probabilities,
        "spans": spans,
        "tokens": [entry["tokens"] for entry in metadata],
        "num_spans": num_spans,
    }

    if span_labels is not None:
        loss = sequence_cross_entropy_with_logits(
            logits, span_labels, span_mask)
        for metric in self.metrics.values():
            metric(logits, span_labels, span_mask)
        output_dict["loss"] = loss

    # The evalb score is expensive to compute, so it is only produced for
    # the validation and test sets, and only when every batch element
    # carries a gold tree.
    batch_gold_trees = [entry.get("gold_tree") for entry in metadata]
    if self.training or self._evalb_score is None or not all(batch_gold_trees):
        return output_dict

    # TODO(Mark): Predict POS and use here instead of using the gold ones.
    gold_pos_tags: List[List[str]] = [
        list(zip(*tree.pos()))[1] for tree in batch_gold_trees
    ]
    predicted_trees = self.construct_trees(
        class_probabilities.cpu().data,
        spans.cpu().data,
        num_spans.data,
        output_dict["tokens"],
        gold_pos_tags)
    self._evalb_score(predicted_trees, batch_gold_trees)
    return output_dict
def _mean_batch_perplexity(model, dataloader, device):
    """Return the mean over batches of ``exp(token-averaged loss)``.

    Each batch is expected to unpack into
    ``(encoder_input, decoder_input, mask_encoder_input, mask_decoder_input)``.
    The decoder outputs are scored against the decoder inputs shifted left by
    one position, i.e. the output at step ``i`` is compared with the token at
    step ``i + 1``.

    Note that this is the average of per-batch perplexities, not the
    exponential of the average loss over the whole split.

    Returns ``float("nan")`` when ``dataloader`` yields no batches.
    """
    perplexity_sum = 0.0
    batch_count = 0
    with torch.no_grad():
        for batch in dataloader:
            batch = [item.to(device) for item in batch]
            encoder_input, decoder_input, mask_encoder_input, mask_decoder_input = batch
            model_output = model(
                input_ids=encoder_input,
                attention_mask=mask_encoder_input,
                decoder_input_ids=decoder_input,
                decoder_attention_mask=mask_decoder_input,
            )
            # Drop the final step of the output and the first target token so
            # that prediction i lines up with token i + 1.
            out = model_output[0][:, :-1].contiguous()
            target = decoder_input[:, 1:].contiguous()
            target_mask = mask_decoder_input[:, 1:].contiguous()
            loss = util.sequence_cross_entropy_with_logits(
                out, target, target_mask, average="token")
            perplexity_sum += np.exp(loss.item())
            batch_count += 1
    if batch_count == 0:
        # An empty split has no defined perplexity; report NaN instead of
        # dividing by zero so the remaining splits are still evaluated.
        return float("nan")
    return perplexity_sum / batch_count


def calculate_perplexity(
        batch_size=1,
        gpu_id=0,
        model_path='./BERT/model-10.pth'
):
    """Print the train, validate and test perplexity of a saved model.

    Builds a BERT encoder-decoder model from the ``bert-base-uncased``
    configuration, loads the weights stored at ``model_path``, and evaluates
    it on the three tensor datasets stored under ``/content``.

    Parameters
    ----------
    batch_size : int, optional (default = 1)
        Batch size used for all three dataloaders.
    gpu_id : int, optional (default = 0)
        Index of the CUDA device that holds the model and the batches.
    model_path : str, optional (default = './BERT/model-10.pth')
        Path of the saved ``state_dict`` to evaluate.

    Returns
    -------
    None. Results are reported with ``print``.
    """
    # make sure your model is on GPU
    device = torch.device(f"cuda:{gpu_id}")

    # ------------------------LOAD MODEL-----------------
    print('load the model....')
    bert_encoder = BertConfig.from_pretrained('bert-base-uncased')
    bert_decoder = BertConfig.from_pretrained('bert-base-uncased', is_decoder=True)
    config = EncoderDecoderConfig.from_encoder_decoder_configs(bert_encoder, bert_decoder)
    model = EncoderDecoderModel(config)
    model = model.to(device)
    # Map the checkpoint straight onto the requested device; a bare 'cuda'
    # would stage it on the default GPU even when gpu_id selects another one.
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.eval()
    print('load success')
    # ------------------------END LOAD MODEL--------------

    # ------------------------LOAD DATA------------------
    # Every split is loaded before any evaluation starts, so a missing file
    # is reported before time is spent on the earlier splits.
    split_paths = {
        "validate": "/content/validate_data.pth",
        "train": "/content/train_data.pth",
        "test": "/content/test_data.pth",
    }
    dataloaders = {}
    for split_name, data_path in split_paths.items():
        dataset = TensorDataset(*torch.load(data_path))
        dataloaders[split_name] = DataLoader(
            dataset=dataset, shuffle=False, batch_size=batch_size)
    # ------------------------END LOAD DATA--------------

    # ------------------------START VAL-------------------
    for split_name in ("train", "validate", "test"):
        print(f'start calculate the {split_name} perplexity....')
        split_perplexity = _mean_batch_perplexity(
            model, dataloaders[split_name], device)
        print(f'{split_name} perplexity: {split_perplexity}')
def forward(self,  # type: ignore
            tokens: Dict[str, torch.LongTensor],
            labels: torch.LongTensor = None,
            d_tags: torch.LongTensor = None,
            metadata: List[Dict[str, Any]] = None) -> Dict[str, torch.Tensor]:
    # pylint: disable=arguments-differ
    """
    Run the two tagging heads (labels and detection tags) over the embedded
    tokens and, when gold tags are supplied, compute the joint loss.

    Parameters
    ----------
    tokens : Dict[str, torch.LongTensor], required
        The output of ``TextField.as_array()``, which should typically be passed directly to a
        ``TextFieldEmbedder``. This output is a dictionary mapping keys to ``TokenIndexer``
        tensors. At its most basic, using a ``SingleIdTokenIndexer`` this is:
        ``{"tokens": Tensor(batch_size, num_tokens)}``.
    labels : torch.LongTensor, optional (default = None)
        A torch tensor representing the sequence of integer gold class labels of shape
        ``(batch_size, num_tokens)``; scored against the label head.
    d_tags : torch.LongTensor, optional (default = None)
        A torch tensor representing the sequence of integer gold class labels of shape
        ``(batch_size, num_tokens)``; scored against the detection head.
    metadata : ``List[Dict[str, Any]]``, optional, (default = None)
        metadata containing the original words in the sentence to be tagged under a 'words' key.

    Returns
    -------
    An output dictionary consisting of:
    logits_labels : torch.FloatTensor
        Unnormalised scores of the label head.
    logits_d_tags : torch.FloatTensor
        Unnormalised scores of the detection head.
    class_probabilities_labels : torch.FloatTensor
        A tensor of shape ``(batch_size, num_tokens, num_labels_classes)``: the softmax of
        ``logits_labels``, with ``self.confidence`` added to class index 0 when
        ``self.confidence > 0`` (so rows need not sum to one in that case).
    class_probabilities_d_tags : torch.FloatTensor
        A tensor of shape ``(batch_size, num_tokens, num_detect_classes)``: the softmax of
        ``logits_d_tags``.
    max_error_probability : torch.FloatTensor
        A tensor of shape ``(batch_size,)``: the largest masked probability of class
        ``self.incorr_index`` of the detection head over the tokens of each sequence.
    loss : torch.FloatTensor, optional
        Sum of the two heads' losses; present only when both ``labels`` and ``d_tags``
        are given.
    words : List, optional
        The ``"words"`` entry of every metadata dictionary; present only when
        ``metadata`` is given.
    """
    encoded_text = self.text_field_embedder(tokens)
    batch_size, sequence_length, _ = encoded_text.size()
    mask = get_text_field_mask(tokens)

    # Dropout is applied to the input of the label head only.
    logits_labels = self.tag_labels_projection_layer(
        self.predictor_dropout(encoded_text))
    logits_d = self.tag_detect_projection_layer(encoded_text)

    class_probabilities_labels = F.softmax(logits_labels, dim=-1).view(
        [batch_size, sequence_length, self.num_labels_classes])
    class_probabilities_d = F.softmax(logits_d, dim=-1).view(
        [batch_size, sequence_length, self.num_detect_classes])

    # Zero out padded positions before taking the per-sequence maximum.
    error_probs = class_probabilities_d[:, :, self.incorr_index] * mask
    incorr_prob = torch.max(error_probs, dim=-1)[0]

    if self.confidence > 0:
        probability_change = [self.confidence] + [0] * (self.num_labels_classes - 1)
        # Build the bias on the same device and dtype as the probabilities
        # rather than choosing between CPU and CUDA tensor types from a
        # separate flag; broadcasting over the last axis replaces the
        # explicit (batch_size, sequence_length, 1) repeat.
        confidence_bias = class_probabilities_labels.new_tensor(probability_change)
        # Out-of-place add: leaves the softmax output itself untouched.
        class_probabilities_labels = class_probabilities_labels + confidence_bias

    output_dict = {
        "logits_labels": logits_labels,
        "logits_d_tags": logits_d,
        "class_probabilities_labels": class_probabilities_labels,
        "class_probabilities_d_tags": class_probabilities_d,
        "max_error_probability": incorr_prob,
    }

    if labels is not None and d_tags is not None:
        loss_labels = sequence_cross_entropy_with_logits(
            logits_labels, labels, mask, label_smoothing=self.label_smoothing)
        loss_d = sequence_cross_entropy_with_logits(logits_d, d_tags, mask)
        # Every metric is updated with the predictions of both heads.
        for metric in self.metrics.values():
            metric(logits_labels, labels, mask.float())
            metric(logits_d, d_tags, mask.float())
        output_dict["loss"] = loss_labels + loss_d

    if metadata is not None:
        output_dict["words"] = [x["words"] for x in metadata]
    return output_dict
def forward(self,  # type: ignore
            tokens: Dict[str, torch.LongTensor],
            tags: torch.LongTensor = None,
            metadata: List[Dict[str, Any]] = None,
            ) -> Dict[str, torch.Tensor]:
    """
    Tag every token of the input and, when gold tags are given, compute the
    loss and update the metrics.

    Parameters
    ----------
    tokens : Dict[str, torch.LongTensor], required
        The output of ``TextField.as_array()``; a dictionary mapping
        ``TokenIndexer`` keys to tensors, e.g.
        ``{"tokens": Tensor(batch_size, num_tokens)}`` for a
        ``SingleIdTokenIndexer``. It is passed directly to the
        ``TextFieldEmbedder``.
    tags : torch.LongTensor, optional (default = None)
        Integer gold class labels of shape ``(batch_size, num_tokens)``.
    metadata : ``List[Dict[str, Any]]``, optional, (default = None)
        Metadata holding the original words of each sentence under a
        'words' key.

    Returns
    -------
    An output dictionary consisting of:
    logits : torch.FloatTensor
        A tensor of shape ``(batch_size, num_tokens, tag_vocab_size)`` with
        the unnormalised log probabilities of the tag classes.
    class_probabilities : torch.FloatTensor
        A tensor of shape ``(batch_size, num_tokens, tag_vocab_size)`` with
        a distribution over the tag classes per word.
    loss : torch.FloatTensor, optional
        A scalar loss to be optimised; present only when ``tags`` is given.
    words : List, optional
        The 'words' entry of each metadata dictionary; present only when
        ``metadata`` is given.
    """
    embeddings = self.text_field_embedder(tokens)
    n_batch, n_tokens, _ = embeddings.size()
    token_mask = get_text_field_mask(tokens)

    contextual = self.encoder(embeddings, token_mask)
    logits = self.tag_projection_layer(contextual)

    # Softmax is taken over a flattened (tokens, classes) view and then
    # folded back into (batch, tokens, classes).
    flat_probabilities = F.softmax(
        logits.view(-1, self.num_classes), dim=-1)
    output_dict = {
        "logits": logits,
        "class_probabilities": flat_probabilities.view(
            [n_batch, n_tokens, self.num_classes]),
    }

    if tags is not None:
        loss = sequence_cross_entropy_with_logits(logits, tags, token_mask)
        # The general metrics are updated first, then the optional F1 metric.
        scorers = list(self.metrics.values())
        if self._f1_metric is not None:
            scorers.append(self._f1_metric)
        for scorer in scorers:
            scorer(logits, tags, token_mask.float())
        output_dict["loss"] = loss

    if metadata is not None:
        output_dict["words"] = [entry["words"] for entry in metadata]
    return output_dict
def forward(
        self,
        source_tokens: TextFieldTensors,
        target_tokens: TextFieldTensors = None) -> Dict[str, torch.Tensor]:
    """
    Performs the forward step of Bart.

    # Parameters

    source_tokens : `TextFieldTensors`, required
        The source tokens for the encoder. They are read from the `tokens` key.
    target_tokens : `TextFieldTensors`, optional (default = `None`)
        The target tokens for the decoder. They are read from the `tokens` key.
        When omitted, the source tokens with their first position dropped are
        used as targets.

    # Returns

    `Dict[str, torch.Tensor]`
        During training, this dictionary contains the `decoder_logits` of shape
        `(batch_size, max_target_length, target_vocab_size)` and the `loss`.
        During inference, it contains `predictions` of shape
        `(batch_size, max_decoding_steps)` and `log_probabilities` of shape
        `(batch_size,)`.
    """
    source = source_tokens["tokens"]
    input_ids = source["token_ids"]
    input_mask = source["mask"]

    outputs = {}

    # Without explicit targets, fall back to the source sequence minus its
    # first position. Bart already shifts its inputs internally, but it does
    # not use them for loss calculation.
    if target_tokens is None:
        target_ids = input_ids[:, 1:]
        target_mask = input_mask[:, 1:]
    else:
        target = target_tokens["tokens"]
        target_ids = target["token_ids"]
        target_mask = target["mask"]

    if self.training:
        # Teacher forcing: the decoder sees targets[:-1] and is scored
        # against targets[1:].
        bart_output = self.bart(
            input_ids=input_ids,
            attention_mask=input_mask,
            decoder_input_ids=target_ids[:, :-1].contiguous(),
            decoder_attention_mask=target_mask[:, :-1].contiguous(),
            use_cache=False,
        )
        decoder_logits = bart_output[0]
        outputs["decoder_logits"] = decoder_logits

        # The BART paper mentions label smoothing of 0.1 for sequence generation tasks
        outputs["loss"] = sequence_cross_entropy_with_logits(
            decoder_logits,
            target_ids[:, 1:].contiguous(),
            target_mask[:, 1:].contiguous(),
            label_smoothing=0.1,
            average="token",
        )
        return outputs

    # Use decoder start id and start of sentence to start decoder
    batch_size = input_ids.shape[0]
    first_decoder_step = torch.tensor(
        [[self._decoder_start_id, self._start_id]],
        dtype=input_ids.dtype,
        device=input_ids.device,
    )
    initial_decoder_ids = first_decoder_step.repeat(batch_size, 1)

    initial_state = {
        "input_ids": input_ids,
        "input_mask": input_mask,
        "encoder_states": None,
    }
    beam_result = self._beam_search.search(
        initial_decoder_ids, initial_state, self.take_step)
    beam_predictions = beam_result[0]
    beam_log_probs = beam_result[1]

    # Index of the highest-scoring beam per batch element, expanded so it
    # can gather a whole predicted sequence along the beam axis.
    best_beam = beam_log_probs.argmax(dim=-1).view(-1, 1, 1)
    max_pred_indices = best_beam.expand(-1, -1, beam_predictions.shape[-1])
    predictions = beam_predictions.gather(
        dim=1, index=max_pred_indices).squeeze(dim=1)

    self._rouge(predictions, target_ids)
    self._bleu(predictions, target_ids)

    outputs["predictions"] = predictions
    outputs["log_probabilities"] = beam_log_probs.gather(
        dim=-1, index=max_pred_indices[..., 0]).squeeze(dim=-1)

    self.make_output_human_readable(outputs)
    return outputs