import torch
import torch.nn as nn
# ConditionalRandomField here is assumed to be AllenNLP's implementation,
# which matches the constructor arguments and forward() signature used below.
from allennlp.modules import ConditionalRandomField


class InferenceLayer(nn.Module):
    def __init__(self, input_dim, n_classes, use_crf):
        super(InferenceLayer, self).__init__()
        self.use_crf = use_crf
        self.input_dim = input_dim
        self.output_dim = n_classes
        self.proj = nn.Linear(input_dim, n_classes)
        if self.use_crf:
            self.crf = ConditionalRandomField(
                n_classes,
                constraints=None,
                include_start_end_transitions=True)
        else:
            self.xent = nn.CrossEntropyLoss(reduction='mean')

    def crf_forward(self, logits, mask, target):
        mask = mask.long()
        # Negative log-likelihood from the CRF, averaged over unmasked tokens.
        loss = -self.crf.forward(logits, target, mask)
        loss = loss / torch.sum(mask)
        return loss, logits

    def fc_forward(self, logits, mask, target):
        if mask is not None:
            mask = mask.long()
            mask = mask.view(-1) == 1
            # Flatten and keep only unmasked positions for cross-entropy.
            logits_ = logits.view(-1, logits.size(-1))[mask]
            target_ = target.view(-1)[mask]
            loss = self.xent(logits_, target_)
        else:
            loss = self.xent(logits.view(-1, logits.size(-1)), target.view(-1))
        return loss, logits

    def forward(self, vectors, mask, targets):
        logits = self.proj(vectors)
        if self.use_crf:
            loss, logits = self.crf_forward(logits, mask, targets)
        else:
            loss, logits = self.fc_forward(logits, mask, targets)
        return loss, logits
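# Usage sketch (added for illustration, not part of the original snippet):
# how this inference head might sit on top of an encoder's output. The tensor
# names, shapes, and dimensions below are illustrative assumptions.
encoder_out = torch.randn(4, 20, 256)            # (batch, seq_len, input_dim)
mask = torch.ones(4, 20, dtype=torch.long)       # 1 for real tokens, 0 for padding
gold = torch.randint(0, 10, (4, 20))             # gold tag ids

head = InferenceLayer(input_dim=256, n_classes=10, use_crf=True)
loss, logits = head(encoder_out, mask, gold)     # mean NLL per unmasked token
loss.backward()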
from typing import Dict, Optional

import torch
from overrides import overrides
from torch.nn.modules.linear import Linear

from allennlp.common import Params
from allennlp.common.checks import ConfigurationError
from allennlp.data import Vocabulary
from allennlp.models.model import Model
from allennlp.modules import ConditionalRandomField, Seq2SeqEncoder, TextFieldEmbedder, TimeDistributed
from allennlp.nn import InitializerApplicator, RegularizerApplicator, util
from allennlp.training.metrics import SpanBasedF1Measure


class CrfTagger(Model):
    """
    The ``CrfTagger`` encodes a sequence of text with a ``Seq2SeqEncoder``,
    then uses a Conditional Random Field model to predict a tag for each token
    in the sequence.

    Parameters
    ----------
    vocab : ``Vocabulary``, required
        A Vocabulary, required in order to compute sizes for input/output projections.
    text_field_embedder : ``TextFieldEmbedder``, required
        Used to embed the ``tokens`` ``TextField`` we get as input to the model.
    encoder : ``Seq2SeqEncoder``
        The encoder that we will use in between embedding tokens and predicting output tags.
    label_namespace : ``str``, optional (default=``labels``)
        This is needed to compute the SpanBasedF1Measure metric. Unless you did
        something unusual, the default value should be what you want.
    initializer : ``InitializerApplicator``, optional (default=``InitializerApplicator()``)
        Used to initialize the model parameters.
    regularizer : ``RegularizerApplicator``, optional (default=``None``)
        If provided, will be used to calculate the regularization penalty during training.
    """
    def __init__(self, vocab: Vocabulary,
                 text_field_embedder: TextFieldEmbedder,
                 encoder: Seq2SeqEncoder,
                 label_namespace: str = "labels",
                 initializer: InitializerApplicator = InitializerApplicator(),
                 regularizer: Optional[RegularizerApplicator] = None) -> None:
        super().__init__(vocab, regularizer)

        self.label_namespace = label_namespace
        self.text_field_embedder = text_field_embedder
        self.num_tags = self.vocab.get_vocab_size(label_namespace)
        self.encoder = encoder
        self.tag_projection_layer = TimeDistributed(
            Linear(self.encoder.get_output_dim(), self.num_tags))
        self.crf = ConditionalRandomField(self.num_tags)

        self.span_metric = SpanBasedF1Measure(vocab, tag_namespace=label_namespace)

        if text_field_embedder.get_output_dim() != encoder.get_input_dim():
            raise ConfigurationError(
                "The output dimension of the text_field_embedder must match the "
                "input dimension of the encoder. Found {} and {}, "
                "respectively.".format(text_field_embedder.get_output_dim(),
                                       encoder.get_input_dim()))
        initializer(self)

    @overrides
    def forward(self,  # type: ignore
                tokens: Dict[str, torch.LongTensor],
                tags: torch.LongTensor = None) -> Dict[str, torch.Tensor]:
        # pylint: disable=arguments-differ
        """
        Parameters
        ----------
        tokens : ``Dict[str, torch.LongTensor]``, required
            The output of ``TextField.as_array()``, which should typically be passed directly to a
            ``TextFieldEmbedder``. This output is a dictionary mapping keys to ``TokenIndexer``
            tensors. At its most basic, using a ``SingleIdTokenIndexer`` this is:
            ``{"tokens": Tensor(batch_size, num_tokens)}``. This dictionary will have the same keys
            as were used for the ``TokenIndexers`` when you created the ``TextField`` representing
            your sequence. The dictionary is designed to be passed directly to a
            ``TextFieldEmbedder``, which knows how to combine different word representations into a
            single vector per token in your input.
        tags : ``torch.LongTensor``, optional (default = ``None``)
            A torch tensor representing the sequence of integer gold class labels of shape
            ``(batch_size, num_tokens)``.

        Returns
        -------
        An output dictionary consisting of:

        logits : ``torch.FloatTensor``
            The logits that are the output of the ``tag_projection_layer``.
        mask : ``torch.LongTensor``
            The text field mask for the input tokens.
        tags : ``List[List[int]]``
            The predicted tag ids, decoded using the Viterbi algorithm.
        loss : ``torch.FloatTensor``, optional
            A scalar loss to be optimised. Only computed if gold label ``tags`` are provided.
        """
        embedded_text_input = self.text_field_embedder(tokens)
        mask = util.get_text_field_mask(tokens)
        encoded_text = self.encoder(embedded_text_input, mask)
        logits = self.tag_projection_layer(encoded_text)
        predicted_tags = self.crf.viterbi_tags(logits, mask)

        output = {"logits": logits, "mask": mask, "tags": predicted_tags}

        if tags is not None:
            # Add negative log-likelihood as loss
            log_likelihood = self.crf.forward(logits, tags, mask)
            output["loss"] = -log_likelihood

            # Represent viterbi tags as "class probabilities" that we can
            # feed into the `span_metric`
            class_probabilities = logits * 0.
            for i, instance_tags in enumerate(predicted_tags):
                for j, tag_id in enumerate(instance_tags):
                    class_probabilities[i, j, tag_id] = 1

            self.span_metric(class_probabilities, tags, mask)

        return output

    @overrides
    def get_metrics(self, reset: bool = False) -> Dict[str, float]:
        metric_dict = self.span_metric.get_metric(reset=reset)
        return {x: y for x, y in metric_dict.items() if "overall" in x}

    @classmethod
    def from_params(cls, vocab: Vocabulary, params: Params) -> 'CrfTagger':
        embedder_params = params.pop("text_field_embedder")
        text_field_embedder = TextFieldEmbedder.from_params(vocab, embedder_params)
        encoder = Seq2SeqEncoder.from_params(params.pop("encoder"))
        label_namespace = params.pop("label_namespace", "labels")
        initializer = InitializerApplicator.from_params(params.pop('initializer', []))
        regularizer = RegularizerApplicator.from_params(params.pop('regularizer', []))

        params.assert_empty(cls.__name__)

        return cls(vocab=vocab,
                   text_field_embedder=text_field_embedder,
                   encoder=encoder,
                   label_namespace=label_namespace,
                   initializer=initializer,
                   regularizer=regularizer)
import torch
from torch.nn import Linear

from allennlp.common import Params
from allennlp.models.model import Model
from allennlp.modules import Seq2SeqEncoder, TextFieldEmbedder, TimeDistributed
# `CRF` is assumed to be AllenNLP's ConditionalRandomField, since the class
# relies on its `forward(logits, tags, mask)` and `viterbi_tags` interface.
from allennlp.modules import ConditionalRandomField as CRF
from allennlp.nn import InitializerApplicator
from allennlp.nn.initializers import Initializer
from allennlp.nn.util import get_text_field_mask
from allennlp.training.metrics import Metric, SpanBasedF1Measure


class BiLSTMCRFSequenceTagger(Model):
    def __init__(self,
                 vocab,
                 text_field_embedder,
                 hidden_size=128,
                 num_layers=2,
                 dropout=0.5,
                 tag_namespace='tags',
                 initializer=None,
                 metric=None):
        if initializer is None:
            initializer = InitializerApplicator()
        if metric is None:
            metric = SpanBasedF1Measure(vocab, tag_namespace=tag_namespace)

        super().__init__(vocab)
        self.text_field_embedder = text_field_embedder
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.dropout = dropout
        self.tag_namespace = tag_namespace
        self.initializer = initializer
        self.metric = metric

        self.seq2seq_encoder = Seq2SeqEncoder.from_params(Params({
            'type': 'lstm',
            'input_size': text_field_embedder.get_output_dim(),
            'hidden_size': hidden_size,
            'num_layers': num_layers,
            'dropout': dropout,
            'bidirectional': True,
        }))
        self.num_tags = vocab.get_vocab_size(tag_namespace)
        self.tags_projection_layer = TimeDistributed(
            Linear(self.seq2seq_encoder.get_output_dim(), self.num_tags))
        self.crf = CRF(self.num_tags)

        self.initializer(self)

    def forward(self, sentence, tags=None):
        """Forward computation.

        Arguments
        ---------
        sentence : Dict[str, Variable[torch.LongTensor]]
            Mapping from indexer name to a tensor of indices. The indices tensor can have a
            shape like ``(batch_size, num_tokens)`` if indexed by tokens, or
            ``(batch_size, num_tokens, num_chars)`` if indexed by characters.
        tags : Variable[torch.LongTensor]
            Tag indices for this batch. This should have a shape of ``(batch_size, num_tokens)``.

        Returns
        -------
        output : Dict[str, Variable]
            Output dictionary with keys ``logits``, ``mask``, and ``loss``.
        """
        mask = get_text_field_mask(sentence)
        embedded = self.text_field_embedder(sentence)   # (bsize, n_tokens, emb_dim)
        encoded = self.seq2seq_encoder(embedded, mask)  # (bsize, n_tokens, out_dim)
        logits = self.tags_projection_layer(encoded)    # (bsize, n_tokens, n_tags)
        output = {'logits': logits, 'mask': mask}
        if tags is not None:
            llh = self.crf.forward(logits, tags, mask=mask)
            output['loss'] = -llh
            self.metric(logits, tags, mask=mask)
        return output

    def decode(self, output):
        """Compute the best tag sequence.

        Arguments
        ---------
        output : Dict[str, Variable]
            Output dictionary returned by ``.forward()``.

        Returns
        -------
        output : Dict[str, Variable]
            The same dictionary given as input, updated with the keys
            ``predicted_tags`` and ``prediction_probs``.
        """
        predicted_tags = self.crf.viterbi_tags(output['logits'], output['mask'])
        # Represent the Viterbi tags as one-hot "probabilities" so they can be
        # consumed by span-based metrics.
        prediction_probs = output['logits'] * 0.
        for i, sentence_tags in enumerate(predicted_tags):
            for j, tag_id in enumerate(sentence_tags):
                prediction_probs[i, j, tag_id] = 1.
        output.update({'predicted_tags': predicted_tags,
                       'prediction_probs': prediction_probs})
        return output

    def get_metrics(self, reset=False):
        return self.metric.get_metric(reset)

    @classmethod
    def from_params(cls, vocab, params):
        text_field_embedder = TextFieldEmbedder.from_params(
            vocab, params.pop('text_field_embedder'))
        hidden_size = params.pop('hidden_size', 128)
        num_layers = params.pop('num_layers', 2)
        dropout = params.pop('dropout', 0.5)
        tag_namespace = params.pop('tag_namespace', 'tags')
        initializer = None
        initializer_params = params.pop('initializer', None)
        if initializer_params is not None:
            initializer = Initializer.from_params(initializer_params)
        metric = None
        metric_params = params.pop('metric', None)
        if metric_params is not None:
            metric = Metric.from_params(metric_params)
        params.assert_empty(cls.__name__)
        return cls(vocab, text_field_embedder,
                   hidden_size=hidden_size,
                   num_layers=num_layers,
                   dropout=dropout,
                   tag_namespace=tag_namespace,
                   initializer=initializer,
                   metric=metric)
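# Training-step sketch (added for illustration, not from the original source):
# `model`, `batch`, and `optimizer` are hypothetical objects assumed to have
# been built elsewhere (e.g. via from_params and a standard PyTorch optimizer).
output = model(batch['sentence'], tags=batch['tags'])
loss = output['loss']            # negative CRF log-likelihood
optimizer.zero_grad()
loss.backward()
optimizer.step()

decoded = model.decode(output)   # adds 'predicted_tags' and 'prediction_probs'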
class InferenceLayer(nn.Module):
    def __init__(self, input_dim, n_classes, use_crf):
        super(InferenceLayer, self).__init__()
        self.use_crf = use_crf
        self.input_dim = input_dim
        self.output_dim = n_classes
        self.proj = nn.Linear(input_dim, n_classes)
        if self.use_crf:
            self.crf = ConditionalRandomField(
                n_classes,
                constraints=None,
                include_start_end_transitions=True)
        else:
            self.xent = nn.CrossEntropyLoss(reduction='mean')

    def crf_forward(self, logits, mask, target):
        mask = mask.long()
        best_paths = self.crf.viterbi_tags(logits, mask)
        tags, viterbi_scores = zip(*best_paths)
        # Negative log-likelihood from the CRF, averaged over unmasked tokens.
        loss = -self.crf.forward(logits, target, mask)
        loss = loss / torch.sum(mask)
        return {'loss': loss,
                'logits': logits,
                'tags': tags,
                'path_scores': viterbi_scores}

    def fc_forward(self, logits, mask, target):
        assert len(logits.size()) == 3
        if mask is not None:
            mask = mask.long()
            # Greedy decoding: take the argmax tag at each position, then
            # truncate each sequence to its unmasked length.
            tags = torch.softmax(logits, dim=2).max(-1)
            tags = tags[1].cpu().tolist()
            for i in range(len(tags)):
                tags[i] = tags[i][:mask[i].sum().item()]
            mask = mask.view(-1) == 1
            logits_ = logits.view(-1, logits.size(-1))
            target_ = target.view(-1)
            loss = self.xent(logits_[mask], target_[mask])
        else:
            # No mask: keep the full predicted sequences.
            tags = torch.softmax(logits, dim=2).max(-1)
            tags = tags[1].cpu().tolist()
            for i in range(len(tags)):
                tags[i] = tags[i][:]
            logits_ = logits.view(-1, logits.size(-1))
            target_ = target.view(-1)
            loss = self.xent(logits_, target_)
        return {'loss': loss, 'logits': logits, 'tags': tags}

    def forward(self, vectors, mask, targets):
        logits = self.proj(vectors)
        if self.use_crf:
            results = self.crf_forward(logits, mask, targets)
        else:
            results = self.fc_forward(logits, mask, targets)
        results['mask'] = mask.data if mask is not None else None
        return results
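# Illustrative comparison (added, not from the original source): both decoding
# modes of this version return the same dictionary interface, so a caller can
# switch between CRF and softmax decoding with a single flag. Tensor shapes and
# dimensions are assumptions chosen for the example.
encoder_out = torch.randn(2, 8, 64)          # (batch, seq_len, input_dim)
mask = torch.ones(2, 8, dtype=torch.long)
gold = torch.randint(0, 5, (2, 8))

for use_crf in (True, False):
    head = InferenceLayer(input_dim=64, n_classes=5, use_crf=use_crf)
    results = head(encoder_out, mask, gold)
    print(use_crf, results['loss'].item(), results['tags'][0])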
import itertools
import math

import torch
from pytest import approx
from torch.autograd import Variable

from allennlp.common.testing import AllenNlpTestCase
from allennlp.modules import ConditionalRandomField


class TestConditionalRandomField(AllenNlpTestCase):
    def setUp(self):
        super().setUp()
        self.logits = Variable(torch.Tensor([
            [[0, 0, .5, .5, .2], [0, 0, .3, .3, .1], [0, 0, .9, 10, 1]],
            [[0, 0, .2, .5, .2], [0, 0, 3, .3, .1], [0, 0, .9, 1, 1]],
        ]))
        self.tags = Variable(torch.LongTensor([
            [2, 3, 4],
            [3, 2, 2]
        ]))

        self.transitions = torch.Tensor([
            [0.1, 0.2, 0.3, 0.4, 0.5],
            [0.8, 0.3, 0.1, 0.7, 0.9],
            [-0.3, 2.1, -5.6, 3.4, 4.0],
            [0.2, 0.4, 0.6, -0.3, -0.4],
            [1.0, 1.0, 1.0, 1.0, 1.0]
        ])

        self.transitions_from_start = torch.Tensor([0.1, 0.2, 0.3, 0.4, 0.6])
        self.transitions_to_end = torch.Tensor([-0.1, -0.2, 0.3, -0.4, -0.4])

        # Use the CRF module with fixed transitions to compute the log-likelihood.
        self.crf = ConditionalRandomField(5)
        self.crf.transitions = torch.nn.Parameter(self.transitions)
        self.crf.start_transitions = torch.nn.Parameter(self.transitions_from_start)
        self.crf.end_transitions = torch.nn.Parameter(self.transitions_to_end)

    def score(self, logits, tags):
        """
        Computes the unnormalized score for the given sequence of tags, given
        the provided logits (and the transition weights in the CRF model).
        """
        # Start with transitions from START and to END
        total = self.transitions_from_start[tags[0]] + self.transitions_to_end[tags[-1]]
        # Add in all the intermediate transitions
        for tag, next_tag in zip(tags, tags[1:]):
            total += self.transitions[tag, next_tag]
        # Add in the logits for the observed tags
        for logit, tag in zip(logits, tags):
            total += logit[tag]
        return total

    def test_forward_works_without_mask(self):
        log_likelihood = self.crf.forward(self.logits, self.tags).data[0]

        # Now compute the log-likelihood manually.
        manual_log_likelihood = 0.0

        # For each instance, manually compute the numerator
        # (which is just the score for the logits and actual tags)
        # and the denominator
        # (which is the log-sum-exp of the scores for the logits across all possible tags).
        for logits_i, tags_i in zip(self.logits, self.tags):
            numerator = self.score(logits_i.data, tags_i.data)
            all_scores = [self.score(logits_i.data, tags_j)
                          for tags_j in itertools.product(range(5), repeat=3)]
            denominator = math.log(sum(math.exp(score) for score in all_scores))
            # And include them in the manual calculation.
            manual_log_likelihood += numerator - denominator

        # The manually computed log-likelihood should equal the result of crf.forward.
        assert manual_log_likelihood == approx(log_likelihood)

    def test_forward_works_with_mask(self):
        # Use a non-trivial mask.
        mask = Variable(torch.LongTensor([
            [1, 1, 1],
            [1, 1, 0]
        ]))

        log_likelihood = self.crf.forward(self.logits, self.tags, mask).data[0]

        # Now compute the log-likelihood manually.
        manual_log_likelihood = 0.0

        # For each instance, manually compute the numerator
        # (which is just the score for the logits and actual tags)
        # and the denominator
        # (which is the log-sum-exp of the scores for the logits across all possible tags).
        for logits_i, tags_i, mask_i in zip(self.logits, self.tags, mask):
            # Find the sequence length for this input and only look at that much of each sequence.
            sequence_length = torch.sum(mask_i.data)
            logits_i = logits_i.data[:sequence_length]
            tags_i = tags_i.data[:sequence_length]

            numerator = self.score(logits_i, tags_i)
            all_scores = [self.score(logits_i, tags_j)
                          for tags_j in itertools.product(range(5), repeat=sequence_length)]
            denominator = math.log(sum(math.exp(score) for score in all_scores))
            # And include them in the manual calculation.
            manual_log_likelihood += numerator - denominator

        # The manually computed log-likelihood should equal the result of crf.forward.
        assert manual_log_likelihood == approx(log_likelihood)

    def test_viterbi_tags(self):
        mask = Variable(torch.LongTensor([
            [1, 1, 1],
            [1, 1, 0]
        ]))

        viterbi_tags = self.crf.viterbi_tags(self.logits, mask)

        # Check that the Viterbi tags are what I think they should be.
        assert viterbi_tags == [
            [2, 4, 3],
            [4, 2]
        ]

        # We can also iterate over all possible tag sequences and use self.score
        # to check the likelihood of each. The most likely sequence should be the
        # same as what we get from viterbi_tags.
        most_likely_tags = []
        for logit, mas in zip(self.logits, mask):
            sequence_length = torch.sum(mas.data)
            most_likely, most_likelihood = None, -float('inf')
            for tags in itertools.product(range(5), repeat=sequence_length):
                score = self.score(logit.data, tags)
                if score > most_likelihood:
                    most_likely, most_likelihood = tags, score
            # Convert tuple to list; otherwise == complains.
            most_likely_tags.append(list(most_likely))

        assert viterbi_tags == most_likely_tags
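# Recap of the quantity exercised above (added for clarity, not part of the
# original tests): for emission logits x_1..x_n and tags y_1..y_n, the CRF scores
#     score(y) = start[y_1] + sum_i x_i[y_i] + sum_i T[y_i, y_{i+1}] + end[y_n]
# and the per-instance log-likelihood returned by crf.forward() is
#     log p(y | x) = score(y) - logsumexp over all tag sequences y' of score(y'),
# which is exactly what `score()` plus the brute-force denominator reproduce,
# and what `viterbi_tags()` maximizes over y'.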