def test_elmo_contextualizer_without_grad_frozen_scalar_mix(self):
    weights_path = self.model_paths / "lm_weights.hdf5"
    options_path = self.model_paths / "options.json"
    params = Params({
        "type": "elmo_contextualizer",
        "batch_size": 2,
        "layer_num": 1,
        "freeze_scalar_mix": True,
        "elmo": {
            "options_file": options_path,
            "weight_file": weights_path,
            "dropout": 0.0,
            "num_output_representations": 1,
            "requires_grad": False
        }
    })
    elmo_contextualizer = Contextualizer.from_params(params)
    unpadded_representations = elmo_contextualizer([
        self.sentence_1, self.sentence_2, self.sentence_3])
    token_representations, mask = pad_contextualizer_output(
        unpadded_representations)
    loss = token_representations.sum()
    # Nothing in the contextualizer is requires_grad=True, so this
    # should be requires_grad=False and grad_fn should be None.
    assert loss.grad_fn is None
    assert loss.requires_grad is False
def test_elmo_contextualizer_with_grad(self):
    weights_path = self.model_paths / "lm_weights.hdf5"
    options_path = self.model_paths / "options.json"
    params = Params({
        "type": "elmo_contextualizer",
        "batch_size": 2,
        "elmo": {
            "options_file": options_path,
            "weight_file": weights_path,
            "dropout": 0.0,
            "num_output_representations": 1,
            "requires_grad": True
        }
    })
    elmo_contextualizer = Contextualizer.from_params(params)
    unpadded_representations = elmo_contextualizer([
        self.sentence_1, self.sentence_2, self.sentence_3])
    token_representations, mask = pad_contextualizer_output(
        unpadded_representations)
    loss = token_representations.sum()
    loss.backward()
    elmo_grads = [param.grad for name, param in
                  elmo_contextualizer.named_parameters()
                  if '_elmo_lstm' in name]
    assert all([grad is not None for grad in elmo_grads])
def test_elmo_contextualizer_with_grad_frozen_scalar_mix(self):
    weights_path = self.model_paths / "lm_weights.hdf5"
    options_path = self.model_paths / "options.json"
    params = Params({
        "type": "elmo_contextualizer",
        "batch_size": 2,
        "layer_num": 1,
        "freeze_scalar_mix": True,
        "elmo": {
            "options_file": options_path,
            "weight_file": weights_path,
            "dropout": 0.0,
            "num_output_representations": 1,
            "requires_grad": True
        }
    })
    elmo_contextualizer = Contextualizer.from_params(params)
    unpadded_representations = elmo_contextualizer([
        self.sentence_1, self.sentence_2, self.sentence_3])
    token_representations, mask = pad_contextualizer_output(
        unpadded_representations)
    loss = token_representations.sum()
    loss.backward()
    for name, param in elmo_contextualizer.named_parameters():
        if "scalar_mix" in name:
            assert param.grad is None, "Parameter {} should not have grad.".format(name)
        else:
            assert param.grad is not None, "Parameter {} should have grad.".format(name)
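# Sketch (assumption, not taken from the original source): the ``freeze_scalar_mix``
# option exercised by the two tests above is presumably implemented inside the
# contextualizer by turning off gradients for the scalar-mix parameters, roughly:
#
#     for name, param in self._elmo.named_parameters():
#         if "scalar_mix" in name:
#             param.requires_grad = False
#
# (``self._elmo`` is a hypothetical attribute name here.) This is what the
# assertions rely on: frozen scalar-mix parameters accumulate no ``.grad`` after
# ``loss.backward()``, while the ELMo LSTM weights still do.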
def test_pad_contextualizer_output(self):
    contextualizer_output = [
        torch.Tensor([[0.1, 0.2], [0.3, 0.4], [0.5, 0.6]]),
        torch.Tensor([[0.1, 0.2], [0.3, 0.4]]),
        torch.Tensor([[0.1, 0.2], [0.3, 0.4], [0.5, 0.6], [0.7, 0.8]]),
        torch.Tensor([[0.1, 0.2]])
    ]
    padded_output, mask = pad_contextualizer_output(contextualizer_output)
    assert_allclose(
        padded_output.cpu().numpy(),
        np.array([[[0.1, 0.2], [0.3, 0.4], [0.5, 0.6], [0, 0]],
                  [[0.1, 0.2], [0.3, 0.4], [0, 0], [0, 0]],
                  [[0.1, 0.2], [0.3, 0.4], [0.5, 0.6], [0.7, 0.8]],
                  [[0.1, 0.2], [0, 0], [0, 0], [0, 0]]]))
    assert_allclose(
        mask.cpu().numpy(),
        np.array([[1, 1, 1, 0],
                  [1, 1, 0, 0],
                  [1, 1, 1, 1],
                  [1, 0, 0, 0]]))
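# Illustrative sketch (an assumption, not the project's actual implementation):
# ``pad_contextualizer_output`` is expected to behave like the hypothetical helper
# below, right-padding a list of (seq_len, dim) tensors to the batch maximum and
# returning a 0/1 mask over real tokens. It relies on the ``torch`` import already
# used elsewhere in this module.
def _example_pad_contextualizer_output(tensors):
    """Hypothetical reference implementation, for illustration only."""
    # Pad all sequences to the length of the longest one with zeros.
    padded = torch.nn.utils.rnn.pad_sequence(tensors, batch_first=True)
    # Mask is 1 for real timesteps and 0 for padding.
    mask = padded.new_zeros(padded.size(0), padded.size(1))
    for i, tensor in enumerate(tensors):
        mask[i, :tensor.size(0)] = 1
    return padded, mask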
def test_glove_contextualizer_frozen(self):
    params = Params({
        "type": "glove_contextualizer",
        "glove_path": self.glove_path,
        "embedding_dim": self.representation_dim,
        "trainable": False
    })
    glove_contextualizer = Contextualizer.from_params(params)
    unpadded_representations = glove_contextualizer(
        [self.sentence_1, self.sentence_2, self.sentence_3])
    token_representations, mask = pad_contextualizer_output(
        unpadded_representations)
    loss = token_representations.sum()
    # Nothing in the contextualizer is requires_grad=True, so this
    # should be requires_grad=False and grad_fn should be None.
    assert loss.grad_fn is None
    assert loss.requires_grad is False
def test_glove_contextualizer_trainable(self):
    params = Params({
        "type": "glove_contextualizer",
        "glove_path": self.glove_path,
        "embedding_dim": self.representation_dim,
        "trainable": True
    })
    glove_contextualizer = Contextualizer.from_params(params)
    unpadded_representations = glove_contextualizer(
        [self.sentence_1, self.sentence_2, self.sentence_3])
    token_representations, mask = pad_contextualizer_output(
        unpadded_representations)
    loss = token_representations.sum()
    loss.backward()
    glove_grads = [
        param.grad for name, param in glove_contextualizer.named_parameters()
    ]
    assert all([grad is not None for grad in glove_grads])
def forward(self,  # type: ignore
            label_indices: torch.LongTensor,
            token_representations: torch.FloatTensor = None,
            raw_tokens: List[List[str]] = None,
            labels: torch.LongTensor = None,
            **kwargs) -> Dict[str, torch.Tensor]:
    """
    If ``token_representations`` is provided, ``raw_tokens`` is not required.
    If ``token_representations`` is ``None``, then ``raw_tokens`` is required.

    Parameters
    ----------
    label_indices : torch.LongTensor
        A LongTensor of shape (batch_size, max_num_adpositions) with the indices
        of the tokens to predict a label for, for each element (sentence) in
        the batch.
    token_representations : torch.FloatTensor, optional (default = None)
        A tensor of shape (batch_size, sequence_length, representation_dim) with
        the representations of the tokens. If None, we use a contextualizer
        within this model to produce the token representations.
    raw_tokens : List[List[str]], optional (default = None)
        A batch of lists with the raw token strings. Used to compute
        token_representations if it is None.
    labels : torch.LongTensor, optional (default = None)
        A torch tensor representing the sequence of integer gold class labels
        of shape ``(batch_size, num_label_indices)``.

    Returns
    -------
    An output dictionary consisting of:
    logits : torch.FloatTensor
        A tensor of shape ``(batch_size, num_label_indices, num_classes)``
        representing unnormalized log probabilities of the classes.
    class_probabilities : torch.FloatTensor
        A tensor of shape ``(batch_size, num_label_indices, num_classes)``
        representing a distribution of the tag classes.
    loss : torch.FloatTensor, optional
        A scalar loss to be optimized.
    """
    # Convert to LongTensor.
    # TODO: add PR to ArrayField to preserve array types.
    label_indices = label_indices.long()

    if token_representations is None:
        if self._contextualizer is None:
            raise ConfigurationError(
                "token_representations not provided as input to the model, and no "
                "contextualizer was specified. Either add a contextualizer to your "
                "dataset reader (preferred if your contextualizer is frozen) or to "
                "this model (if you wish to train your contextualizer).")
        if raw_tokens is None:
            raise ValueError(
                "Input raw_tokens is ``None`` --- make sure to set "
                "include_raw_tokens in the DatasetReader to True.")
        if label_indices is None:
            raise ValueError("Did not receive any token indices, needed "
                             "if the contextualizer is within the model.")
        # Convert contextualizer output into a tensor.
        # Shape: (batch_size, max_seq_len, representation_dim)
        token_representations, _ = pad_contextualizer_output(
            self._contextualizer(raw_tokens))

    # Move token representations to the same device as the
    # module (CPU or CUDA). TODO(nfliu): This only works if the module
    # is on one device.
    device = next(self._decoder._linear_layers[0].parameters()).device
    token_representations = token_representations.to(device)
    text_mask = get_text_mask_from_representations(token_representations)
    text_mask = text_mask.to(device)
    label_mask = self._get_label_mask_from_label_indices(label_indices)
    label_mask = label_mask.to(device)

    # Mask out the -1 padding in the label_indices, since that doesn't
    # work with indexing. Note that we can't 0-pad because 0 is actually
    # a valid label index, so we pad with -1 just for the purposes of
    # proper mask calculation and then convert to 0-padding by applying
    # the mask.
    label_indices = label_indices * label_mask

    # Encode the token representations.
    encoded_token_representations = self._encoder(token_representations,
                                                  text_mask)

    batch_size = label_indices.size(0)
    # Index into the encoded_token_representations to get tensors corresponding
    # to the representations of the tokens to predict labels for.
    # Shape: (batch_size, num_label_indices, representation_dim)
    range_vector = get_range_vector(
        batch_size, get_device_of(label_indices)).unsqueeze(1)
    selected_token_representations = encoded_token_representations[
        range_vector, label_indices]
    selected_token_representations = selected_token_representations.contiguous()

    # Decode out a label from the token representation.
    # Shape: (batch_size, num_label_indices, num_classes)
    logits = self._decoder(selected_token_representations)
    class_probabilities = F.softmax(logits, dim=-1)

    output_dict = {
        "logits": logits,
        "class_probabilities": class_probabilities
    }

    if labels is not None:
        loss = sequence_cross_entropy_with_logits(
            logits, labels, label_mask, average=self.loss_average)
        for name, metric in self.metrics.items():
            # When not running in error analysis mode, skip
            # metrics that start with "_".
            if not self.error_analysis and name.startswith("_"):
                continue
            metric(logits, labels, label_mask.float())
        output_dict["loss"] = loss
    return output_dict
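# Sketch (assumption): ``self._get_label_mask_from_label_indices`` used in the
# forward pass above is expected to derive the mask from the -1 padding described
# in the comments, e.g. something like:
#
#     def _get_label_mask_from_label_indices(self, label_indices):
#         # 1 for real label indices, 0 for -1 padding positions.
#         return (label_indices != -1).long()
#
# The advanced indexing ``encoded_token_representations[range_vector, label_indices]``
# then broadcasts the (batch_size, 1) range vector against the
# (batch_size, num_label_indices) label indices, selecting for each batch element i
# the rows ``label_indices[i]`` of ``encoded_token_representations[i]`` and yielding
# a (batch_size, num_label_indices, representation_dim) tensor.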
def forward(self,  # type: ignore
            token_representations: torch.FloatTensor = None,
            raw_tokens: List[List[str]] = None,
            labels: torch.LongTensor = None,
            **kwargs) -> Dict[str, torch.Tensor]:
    """
    Parameters
    ----------
    token_representations : torch.FloatTensor, optional (default = None)
        A padded tensor of shape (batch_size, seq_len, representation_dim),
        with the representations of the tokens. If None, we use a
        contextualizer within this model to produce the token representations.
    raw_tokens : List[List[str]], optional (default = None)
        A batch of lists with the raw token strings. Used to compute
        token_representations if it is None.
    labels : torch.LongTensor, optional (default = None)
        A torch tensor representing the sequence of integer gold class labels
        of shape ``(batch_size, num_tokens)``.

    Returns
    -------
    An output dictionary consisting of:
    logits : torch.FloatTensor
        A tensor of shape ``(batch_size, num_tokens, tag_vocab_size)``
        representing unnormalized log probabilities of the tag classes.
    class_probabilities : torch.FloatTensor
        A tensor of shape ``(batch_size, num_tokens, tag_vocab_size)``
        representing a distribution of the tag classes per word.
    loss : torch.FloatTensor, optional
        A scalar loss to be optimized.
    """
    if token_representations is None:
        if self._contextualizer is None:
            raise ConfigurationError(
                "token_representations not provided as input to the model, and no "
                "contextualizer was specified. Either add a contextualizer to your "
                "dataset reader (preferred if your contextualizer is frozen) or to "
                "this model (if you wish to train your contextualizer).")
        if raw_tokens is None:
            raise ValueError("Input raw_tokens is ``None`` and token representations "
                             "were not provided!")
        token_representations, mask = pad_contextualizer_output(
            self._contextualizer(raw_tokens))
        # Move token representations to the same device as the
        # module (CPU or CUDA). TODO(nfliu): This only works if the module
        # is on one device.
        device = next(self._decoder._module._linear_layers[0].parameters()).device
        token_representations = token_representations.to(device)
        mask = mask.to(device)
    else:
        mask = get_text_mask_from_representations(token_representations)

    batch_size, sequence_length = mask.size()

    # Encode the token representations.
    encoded_token_representations = self._encoder(token_representations, mask)
    logits = self._decoder(encoded_token_representations)

    output_dict = {}
    # Run the CRF if provided and calculate class_probabilities.
    if self._crf:
        best_paths = self._crf.viterbi_tags(logits, mask)
        # Just get the tags and ignore the score.
        predicted_tags = [x for x, y in best_paths]
        # Add tags to the output dict.
        output_dict["tags"] = predicted_tags
        # Get the class probabilities from the viterbi tags.
        class_probabilities = logits * 0.
        for i, instance_tags in enumerate(predicted_tags):
            for j, tag_id in enumerate(instance_tags):
                class_probabilities[i, j, tag_id] = 1
    else:
        reshaped_log_probs = logits.view(-1, self._num_classes)
        class_probabilities = F.softmax(reshaped_log_probs, dim=-1).view(
            [batch_size, sequence_length, self._num_classes])

    output_dict["logits"] = logits
    output_dict["mask"] = mask
    output_dict["class_probabilities"] = class_probabilities

    if labels is not None:
        if self._crf:
            # Add negative log-likelihood as the loss.
            log_likelihood = self._crf(logits, labels, mask)
            loss = -log_likelihood
        else:
            loss = sequence_cross_entropy_with_logits(logits, labels, mask,
                                                      average=self.loss_average)
        for name, metric in self.metrics.items():
            # When not running in error analysis mode, skip
            # metrics that start with "_".
            if not self.error_analysis and name.startswith("_"):
                continue
            if name == "perplexity":
                # The perplexity metric API is a bit different from the others.
                metric(loss, mask.float().sum())
            else:
                metric(class_probabilities, labels, mask.float())
        output_dict["loss"] = loss
    return output_dict
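# Sketch (assumption): ``get_text_mask_from_representations``, used by both
# ``forward`` methods above when precomputed representations are passed in, is
# expected to recover the padding mask from the padded representations themselves,
# e.g. by treating all-zero rows as padding:
#
#     def get_text_mask_from_representations(token_representations):
#         # (batch_size, seq_len): 1 where the representation is non-zero, else 0.
#         return (token_representations.abs().sum(dim=-1) != 0).long()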