    def _joint_likelihood(
        self, logits: torch.Tensor, tags: torch.Tensor, mask: torch.BoolTensor
    ) -> torch.Tensor:
        """
        Computes the numerator term for the log-likelihood, which is just score(inputs, tags)
        """
        batch_size, sequence_length, _ = logits.shape

        # Transpose batch size and sequence dimensions:
        logits = logits.transpose(0, 1).contiguous()
        mask = mask.transpose(0, 1).contiguous()
        tags = tags.transpose(0, 1).contiguous()

        start_index = 1 if (~mask[0]).all() else 0  # skip the [CLS] token used by BERT, whose position is fully masked

        # Start with the transition scores from start_tag to the first tag in each input
        if self.include_start_end_transitions:
            score = self.start_transitions.index_select(0, tags[start_index] * mask[start_index])
        else:
            score = 0.0

        # Add up the scores for the observed transitions and all the inputs but the last
        current_valid_tag = tags[start_index] * mask[start_index]
        for i in range(start_index, sequence_length - 1):
            # Each is shape (batch_size,)
            current_tag, next_tag = tags[i], tags[i + 1]

            if i > start_index:  # masked subword positions keep the previous valid tag
                current_valid_tag = torch.where(mask[i], current_tag, current_valid_tag)

            # The scores for transitioning from current_tag to next_tag
            transition_score = self.transitions[current_valid_tag.view(-1), next_tag.view(-1) * mask[i + 1]]

            # The score for using current_tag
            emit_score = logits[i].gather(1, (current_tag * mask[i]).view(batch_size, 1)).squeeze(1)

            # Include transition score if next element is unmasked,
            # input_score if this element is unmasked.
            score = score + transition_score * mask[i + 1] + emit_score * mask[i]

        # Transition from last state to "stop" state. The stock implementation gathers the
        # last tag at index mask.sum(0) - 1; here current_valid_tag already tracks the last
        # valid tag for each instance.
        last_tags = current_valid_tag

        # Compute score of transitioning to `stop_tag` from each "last tag".
        if self.include_start_end_transitions:
            last_transition_score = self.end_transitions.index_select(0, last_tags)
        else:
            last_transition_score = 0.0

        # Unlike the stock implementation, the emission score for the final timestep is not
        # added: with BERT inputs the last position ([SEP] or padding) is expected to be masked.
        score = score + last_transition_score

        return score
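
A toy invocation of the subword-aware variant above, assuming a hypothetical `crf` instance of this class with `num_tags = 3` (names and sizes are illustrative only). The [CLS] column and continuation-subword positions are masked, which is what `start_index` and `current_valid_tag` rely on:

import torch

batch_size, seq_len, num_tags = 2, 5, 3
logits = torch.randn(batch_size, seq_len, num_tags)
tags = torch.randint(num_tags, (batch_size, seq_len))
# Column 0 ([CLS]) is masked everywhere; position 3 of the first instance is a
# masked continuation subword, so its tag is carried over via current_valid_tag.
mask = torch.tensor([[False, True, True, False, True],
                     [False, True, True, True, False]])
score = crf._joint_likelihood(logits, tags, mask)  # shape: (batch_size,)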
Example #2
    def _joint_likelihood(
        self, logits: torch.Tensor, tags: torch.Tensor, mask: torch.BoolTensor
    ) -> torch.Tensor:
        """
        Computes the numerator term for the log-likelihood, which is just score(inputs, tags)
        """
        batch_size, sequence_length, _ = logits.shape

        # Transpose batch size and sequence dimensions:
        logits = logits.transpose(0, 1)
        mask = mask.transpose(0, 1).float()
        tags = tags.transpose(0, 1)

        # Start with the transition scores from start_tag to the first tag in each input
        if self.include_start_end_transitions:
            score = self.start_transitions.index_select(0, tags[0])
        else:
            score = 0.0

        # Add up the scores for the observed transitions and all the inputs but the last
        for i in range(sequence_length - 1):
            # Each is shape (batch_size,)
            current_tag, next_tag = tags[i], tags[i + 1]

            # The scores for transitioning from current_tag to next_tag
            transition_score = self.transitions[current_tag, next_tag]

            # The score for using current_tag
            emit_score = logits[i].gather(1, current_tag.view(batch_size, 1)).squeeze(1)

            # Include transition score if next element is unmasked,
            # input_score if this element is unmasked.
            score = score + transition_score * mask[i + 1] + emit_score * mask[i]

        # Transition from last state to "stop" state. To start with, we need to find the last tag
        # for each instance.
        last_tag_index = mask.sum(0).long() - 1
        last_tags = tags.gather(0, last_tag_index.view(1, batch_size)).squeeze(0)

        # Compute score of transitioning to `stop_tag` from each "last tag".
        if self.include_start_end_transitions:
            last_transition_score = self.end_transitions.index_select(0, last_tags)
        else:
            last_transition_score = 0.0

        # Add the last input if it's not masked.
        last_inputs = logits[-1]  # (batch_size, num_tags)
        last_input_score = last_inputs.gather(1, last_tags.view(-1, 1)).squeeze(1)  # (batch_size,)

        score = score + last_transition_score + last_input_score * mask[-1]

        return score
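
The `_input_likelihood` below relies on a `logsumexp` helper (AllenNLP imports it from `allennlp.nn.util`). A minimal numerically stable sketch, matching the calls `logsumexp(inner, 1)` and `logsumexp(stops)`:

import torch

def logsumexp(tensor: torch.Tensor, dim: int = -1) -> torch.Tensor:
    # Stable log(sum(exp(tensor))) along `dim`: subtract the max
    # before exponentiating to avoid overflow for large scores.
    max_score, _ = tensor.max(dim)
    return max_score + (tensor - max_score.unsqueeze(dim)).exp().sum(dim).log()

PyTorch's built-in `torch.logsumexp` would also work here.
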
    def _input_likelihood(self, logits: torch.Tensor, mask: torch.BoolTensor) -> torch.Tensor:
        """
        Computes the (batch_size,) denominator term for the log-likelihood, which is the
        sum of the likelihoods across all possible state sequences.
        """
        batch_size, sequence_length, num_tags = logits.size()

        # Transpose batch size and sequence dimensions
        mask = mask.transpose(0, 1).contiguous()
        logits = logits.transpose(0, 1).contiguous()

        start_index = 1 if (~mask[0]).all() else 0  # skip the [CLS] token used by BERT, whose position is fully masked

        # Initial alpha is the (batch_size, num_tags) tensor of likelihoods combining the
        # transitions to the initial states and the logits for the first timestep.
        if self.include_start_end_transitions:
            alpha = self.start_transitions.view(1, num_tags) + logits[start_index]
        else:
            alpha = logits[start_index]

        # For each i we compute logits for the transitions from timestep i-1 to timestep i.
        # We do so in a (batch_size, num_tags, num_tags) tensor where the axes are
        # (instance, current_tag, next_tag)
        # alpha already includes logits[start_index], so the recursion starts one step later.
        for i in range(start_index + 1, sequence_length):
            # The emit scores are for time i ("next_tag") so we broadcast along the current_tag axis.
            emit_scores = logits[i].view(batch_size, 1, num_tags)
            # Transition scores are (current_tag, next_tag) so we broadcast along the instance axis.
            transition_scores = self.transitions.view(1, num_tags, num_tags)
            # Alpha is for the current_tag, so we broadcast along the next_tag axis.
            broadcast_alpha = alpha.view(batch_size, num_tags, 1)

            # Add all the scores together and logsumexp over the current_tag axis.
            inner = broadcast_alpha + emit_scores + transition_scores

            # In valid positions (mask == True) we want to take the logsumexp over the current_tag dimension
            # of `inner`. Otherwise (mask == False) we want to retain the previous alpha.
            alpha = logsumexp(inner, 1) * mask[i].view(batch_size, 1) + alpha * (
                ~mask[i]
            ).view(batch_size, 1)

        # Every sequence needs to end with a transition to the stop_tag.
        if self.include_start_end_transitions:
            stops = alpha + self.end_transitions.view(1, num_tags)
        else:
            stops = alpha

        # Finally we log_sum_exp along the num_tags dim, result is (batch_size,)
        return logsumexp(stops)
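
Putting the two pieces together: the CRF log-likelihood of a tag sequence is the numerator from `_joint_likelihood` minus the denominator from `_input_likelihood`. A minimal `forward` sketch in the style of AllenNLP's `ConditionalRandomField` (method placement and summing over the batch are assumptions, not part of the snippets above):

    def forward(
        self, inputs: torch.Tensor, tags: torch.Tensor, mask: torch.BoolTensor = None
    ) -> torch.Tensor:
        # log P(tags | inputs) = score(inputs, tags) - log Z(inputs), summed over the batch.
        if mask is None:
            mask = torch.ones(*tags.size(), dtype=torch.bool, device=tags.device)
        log_denominator = self._input_likelihood(inputs, mask)  # (batch_size,)
        log_numerator = self._joint_likelihood(inputs, tags, mask)  # (batch_size,)
        return torch.sum(log_numerator - log_denominator)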