def test_loading_from_pretrained_weights_using_model_name(
        pretrained_name, relevant_module):

    torch.manual_seed(1234)
    module = T5Attention.from_pretrained_module(
        pretrained_name, relevant_module=relevant_module)

    torch.manual_seed(1234)
    pretrained_module = dict(
        AutoModel.from_pretrained(
            pretrained_name).named_modules())[relevant_module]

    batch_size = 2
    seq_len = 3
    dim = module.query.in_features
    hidden_states = torch.randn(batch_size, seq_len, dim)
    attention_mask = torch.tensor([[1, 1, 0], [1, 0, 1]])[:, None, None, :]

    # setting to eval mode to avoid non-deterministic dropout.
    module = module.eval()
    pretrained_module = pretrained_module.eval()

    torch.manual_seed(1234)
    output = module(hidden_states, mask=attention_mask.squeeze()).hidden_states

    # The attn_mask is processed outside the self attention module in HF bert models.
    attention_mask = (~(attention_mask == 1)) * min_value_of_dtype(
        hidden_states.dtype)
    torch.manual_seed(1234)
    hf_output = pretrained_module(hidden_states, mask=attention_mask)[0]

    assert torch.allclose(output, hf_output)
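For reference, a minimal sketch of the additive-mask conversion used in the test above, written with plain torch and with torch.finfo(...).min standing in for min_value_of_dtype (the tensors are illustrative):

import torch

mask = torch.tensor([[1, 1, 0], [1, 0, 1]])[:, None, None, :]  # (batch, 1, 1, seq)
additive_mask = (~(mask == 1)) * torch.finfo(torch.float32).min
# 0.0 where mask == 1, the float32 minimum where mask == 0; adding this to the raw
# attention scores drives masked positions to ~0 probability after the softmax.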
Example #2
def replace_masked_values_with_big_negative_number(x: torch.Tensor,
                                                   mask: torch.Tensor):
    """
    Replace the masked values in a tensor with something really negative so that they won't
    affect a max operation.
    """
    return replace_masked_values(x, mask, min_value_of_dtype(x.dtype))
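A minimal usage sketch, assuming replace_masked_values and min_value_of_dtype are imported from allennlp.nn.util as in the surrounding snippets (the tensors are illustrative):

x = torch.tensor([[1.0, 5.0, 3.0]])
mask = torch.tensor([[True, False, True]])  # position 1 is padding
masked = replace_masked_values_with_big_negative_number(x, mask)
best = masked.max(dim=-1).values  # tensor([3.]); the masked 5.0 is ignored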
Example #3
 def __init__(self,
              k: int = 1,
              temperature: float = 1.0,
              filter_val: float = -float("inf")):
     assert k >= 1, "k must be >= 1"
     self.k = k
     self.temperature = temperature or 1.0
     self.filter_val = min_value_of_dtype(torch.float)
Example #4
 def __init__(self,
              p: float = 0.9,
              temperature: float = 1.0,
              filter_val: float = -float("inf")):
     assert p <= 1.0, "p must be <= 1.0"
     self.p = p
     self.temperature = temperature or 1.0
     self.filter_val = min_value_of_dtype(torch.float)
Example #5
    def sample_nodes(
            self, log_probs: torch.Tensor, per_node_beam_size: int,
            state: StateType) -> Tuple[torch.Tensor, torch.Tensor, StateType]:
        if not per_node_beam_size <= log_probs.size()[1]:
            raise ValueError(
                "per_node_beam_size cannot be greater than vocabulary size")

        # First apply temperature coefficient:
        if self.temperature != 1.0:
            _log_probs = torch.nn.functional.log_softmax(log_probs /
                                                         self.temperature,
                                                         dim=-1)
        else:
            _log_probs = log_probs

        # Sort the probabilities in descending order to then find cumulative sum
        log_probs_descending, sorting_indices = torch.sort(_log_probs,
                                                           descending=True)

        # shape: (batch_size, num_classes)
        probabilities_descending = log_probs_descending.exp()
        probabilities_summed = torch.cumsum(probabilities_descending, dim=-1)

        # Create a mask for filtering out probabilities that don't make the top `p`.
        # shape: (batch_size, num_classes)
        exclusion_mask = probabilities_summed >= self.p

        # We want to include the first index where probabilities_summed >= p, so we shift over one.
        exclusion_mask[..., 1:] = exclusion_mask[..., :-1].clone()
        exclusion_mask[..., 0] = False

        # Make sure there's at least `per_node_beam_size` options to be selected.
        if not self.with_replacement:
            exclusion_mask[..., :per_node_beam_size] = False

        log_probs_descending[exclusion_mask] = min_value_of_dtype(
            log_probs.dtype)

        # Now re-normalize the included log probs.
        # shape: (batch_size, num_classes)
        filtered_probabilities = torch.nn.functional.softmax(
            log_probs_descending, dim=-1)

        # Sample from the re-normalized subset.
        # NOTE: These indices are not indices into `log_probs`, they are indices into `log_probs_descending`.
        # shape: (batch_size, per_node_beam_size)
        sampled_indices = torch.multinomial(filtered_probabilities,
                                            per_node_beam_size,
                                            replacement=self.with_replacement)

        # Convert `sampled_indices` back to indices in the original `log_probs` tensor.
        # shape: (batch_size, per_node_beam_size)
        selected_indices = sorting_indices.gather(-1, sampled_indices)

        # Return the selected log probabilities and classes, along with the (unchanged) state.
        # shape (both): (batch_size, per_node_beam_size)
        return torch.gather(log_probs, 1,
                            selected_indices), selected_indices, state
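The same top-p (nucleus) filtering idea, sketched standalone with plain torch so it runs without the surrounding sampler class; the names, shapes, and p value are illustrative, not part of the original code:

import torch

p = 0.9
log_probs = torch.log_softmax(torch.randn(2, 10), dim=-1)

# Sort descending and accumulate probability mass.
sorted_log_probs, sorted_indices = torch.sort(log_probs, descending=True)
cumulative = sorted_log_probs.exp().cumsum(dim=-1)

# Exclude everything after the first index where the cumulative mass reaches p.
exclude = cumulative >= p
exclude[..., 1:] = exclude[..., :-1].clone()
exclude[..., 0] = False

sorted_log_probs[exclude] = torch.finfo(log_probs.dtype).min
filtered = torch.softmax(sorted_log_probs, dim=-1)

# Sample one class per row and map back to the original vocabulary indices.
sampled = sorted_indices.gather(-1, torch.multinomial(filtered, 1))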
Example #6
    def test_loading_from_pretrained_weights_using_model_name(self, pretrained_name):

        torch.manual_seed(1234)
        pretrained = cached_transformers.get(pretrained_name, False)

        if "distilbert" in pretrained_name:
            encoder = pretrained.transformer
        else:
            encoder = pretrained.encoder
        # Hacky way to get a bert layer.
        for i, pretrained_module in enumerate(encoder.layer.modules()):
            if i == 1:
                break

        # Get the self attention layer.
        if "distilbert" in pretrained_name:
            pretrained_module = pretrained_module.attention
        else:
            pretrained_module = pretrained_module.attention.self

        torch.manual_seed(1234)
        module = SelfAttention.from_pretrained_module(pretrained_name)
        mapping = {
            val: key
            for key, val in module._construct_default_mapping(
                pretrained_module, "huggingface", {}
            ).items()
        }
        assert_equal_parameters(pretrained_module, module, mapping=mapping)

        batch_size = 2
        seq_len = 3
        dim = module.query.in_features
        hidden_states = torch.randn(batch_size, seq_len, dim)
        attention_mask = torch.randint(0, 2, (batch_size, 1, 1, seq_len))

        # setting to eval mode to avoid non-deterministic dropout.
        module = module.eval()
        pretrained_module = pretrained_module.eval()

        torch.manual_seed(1234)
        output = module.forward(hidden_states, attention_mask=attention_mask.squeeze())[0]
        if "distilbert" in pretrained_name:
            torch.manual_seed(1234)
            hf_output = pretrained_module.forward(
                hidden_states, hidden_states, hidden_states, mask=attention_mask
            )[0]
        else:
            # The attn_mask is processed outside the self attention module in HF bert models.
            attention_mask = (~(attention_mask == 1)) * min_value_of_dtype(hidden_states.dtype)
            torch.manual_seed(1234)
            hf_output = pretrained_module.forward(hidden_states, attention_mask=attention_mask)[0]

        assert torch.allclose(output, hf_output)
Example #7
def attention(query, key, value, mask=None, dropout=None):
    "Compute 'Scaled Dot Product Attention'"
    d_k = query.size(-1)
    scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(d_k)
    if mask is not None:
        scores = scores.masked_fill(mask == 0,
                                    ai2_util.min_value_of_dtype(scores.dtype))
    p_attn = F.softmax(scores, dim=-1)
    if dropout is not None:
        p_attn = dropout(p_attn)
    # return torch.matmul(p_attn, value), scores.squeeze(1).squeeze(1)
    return torch.matmul(p_attn, value), p_attn
Example #8
def attention(
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    mask: torch.BoolTensor = None,
    dropout: Callable = None,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """Compute 'Scaled Dot Product Attention'"""
    d_k = query.size(-1)
    scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(d_k)
    if mask is not None:
        scores = scores.masked_fill(~mask, util.min_value_of_dtype(scores.dtype))
    p_attn = F.softmax(scores, dim=-1)
    if dropout is not None:
        p_attn = dropout(p_attn)
    return torch.matmul(p_attn, value), p_attn
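A hypothetical call, assuming the function above and its imports (torch, math, F, and AllenNLP's util) are in scope; the boolean mask broadcasts over the head and query dimensions, so padded key positions receive the dtype minimum before the softmax:

batch_size, num_heads, seq_len, d_k = 2, 4, 5, 16
query = torch.randn(batch_size, num_heads, seq_len, d_k)
key = torch.randn(batch_size, num_heads, seq_len, d_k)
value = torch.randn(batch_size, num_heads, seq_len, d_k)
# True = real token, False = padding; broadcasts against (batch, heads, query_len, key_len).
mask = torch.tensor([[1, 1, 1, 1, 0],
                     [1, 1, 0, 0, 0]]).bool()[:, None, None, :]
context, p_attn = attention(query, key, value, mask=mask)
# context: (2, 4, 5, 16); p_attn: (2, 4, 5, 5) with ~0 weight on padded key positions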
Example #9
def attention_with_relations(query,
                             key,
                             value,
                             relation_k,
                             relation_v,
                             mask=None,
                             dropout=None):
    "Compute 'Scaled Dot Product Attention'"
    d_k = query.size(-1)
    scores = relative_attention_logits(query, key, relation_k)
    if mask is not None:
        scores = scores.masked_fill(mask == 0,
                                    ai2_util.min_value_of_dtype(scores.dtype))
    p_attn_orig = F.softmax(scores, dim=-1)
    # Default to the undropped probabilities so `p_attn` is defined even when dropout is None.
    p_attn = p_attn_orig
    if dropout is not None:
        p_attn = dropout(p_attn_orig)
    return relative_attention_values(p_attn, value, relation_v), p_attn_orig
Example #10
    def test_repeated_ngram_blocking_constraint_apply(self):
        ngram_size = 3
        batch_size = 2
        beam_size = 2
        num_classes = 10
        constraint = RepeatedNGramBlockingConstraint(ngram_size)

        state = [
            [
                {
                    "current_prefix": [0, 1],
                    "seen_ngrams": {}
                },
                {
                    "current_prefix": [2, 3],
                    "seen_ngrams": {
                        (2, 3): [4]
                    }
                },
            ],
            [
                {
                    "current_prefix": [4, 5],
                    "seen_ngrams": {
                        (8, 9): []
                    }
                },
                {
                    "current_prefix": [6, 7],
                    "seen_ngrams": {
                        (6, 7): [0, 1, 2]
                    }
                },
            ],
        ]
        log_probabilities = torch.rand(batch_size, beam_size, num_classes)
        constraint.apply(state, log_probabilities)

        disallowed_locations = torch.nonzero(
            log_probabilities == min_value_of_dtype(
                log_probabilities.dtype)).tolist()
        assert len(disallowed_locations) == 4
        assert [0, 1, 4] in disallowed_locations
        assert [1, 1, 0] in disallowed_locations
        assert [1, 1, 1] in disallowed_locations
        assert [1, 1, 2] in disallowed_locations
Example #11
    def _get_best_span_yesno_followup(
        span_start_logits: torch.Tensor,
        span_end_logits: torch.Tensor,
        span_yesno_logits: torch.Tensor,
        span_followup_logits: torch.Tensor,
        max_span_length: int,
    ) -> torch.Tensor:
        # Returns the index of the highest-scoring span that is not longer than max_span_length tokens, as well as
        # yesno prediction bit and followup prediction bit from the predicted span end token.
        if span_start_logits.dim() != 2 or span_end_logits.dim() != 2:
            raise ValueError(
                "Input shapes must be (batch_size, passage_length)")
        batch_size, passage_length = span_start_logits.size()
        max_span_log_prob = [util.min_value_of_dtype(span_start_logits.dtype)
                             ] * batch_size
        span_start_argmax = [0] * batch_size

        best_word_span = span_start_logits.new_zeros((batch_size, 4),
                                                     dtype=torch.long)

        span_start_logits = span_start_logits.data.cpu().numpy()
        span_end_logits = span_end_logits.data.cpu().numpy()
        span_yesno_logits = span_yesno_logits.data.cpu().numpy()
        span_followup_logits = span_followup_logits.data.cpu().numpy()
        for b_i in range(batch_size):
            for j in range(passage_length):
                val1 = span_start_logits[b_i, span_start_argmax[b_i]]
                if val1 < span_start_logits[b_i, j]:
                    span_start_argmax[b_i] = j
                    val1 = span_start_logits[b_i, j]
                val2 = span_end_logits[b_i, j]
                if val1 + val2 > max_span_log_prob[b_i]:
                    if j - span_start_argmax[b_i] > max_span_length:
                        continue
                    best_word_span[b_i, 0] = span_start_argmax[b_i]
                    best_word_span[b_i, 1] = j
                    max_span_log_prob[b_i] = val1 + val2
        for b_i in range(batch_size):
            j = best_word_span[b_i, 1]
            yesno_pred = np.argmax(span_yesno_logits[b_i, j])
            followup_pred = np.argmax(span_followup_logits[b_i, j])
            best_word_span[b_i, 2] = int(yesno_pred)
            best_word_span[b_i, 3] = int(followup_pred)
        return best_word_span
Example #12
    def forward(self,
                hidden_states,
                context,
                attention_mask=None,
                output_attentions=False):
        mixed_query_layer = self.query(hidden_states)
        mixed_key_layer = self.key(context)
        mixed_value_layer = self.value(context)

        query_layer = self.transpose_for_scores(mixed_query_layer)
        key_layer = self.transpose_for_scores(mixed_key_layer)
        value_layer = self.transpose_for_scores(mixed_value_layer)

        # Take the dot product between "query" and "key" to get the raw attention scores.
        attention_scores = torch.matmul(query_layer,
                                        key_layer.transpose(-1, -2))
        attention_scores = attention_scores / math.sqrt(
            self.attention_head_size)
        # Apply the attention mask (precomputed for all layers in the BertModel forward() function)
        if attention_mask is not None:
            # print(attention_scores.shape)
            # print(attention_mask.shape)
            attention_mask = attention_mask.bool().unsqueeze(1).unsqueeze(1)
            attention_scores = attention_scores.clone().masked_fill(
                ~attention_mask,
                ai2_util.min_value_of_dtype(attention_scores.dtype))
            # attention_scores = attention_scores + attention_mask

        # Normalize the attention scores to probabilities.
        attention_probs = nn.Softmax(dim=-1)(attention_scores)

        # This is actually dropping out entire tokens to attend to, which might
        # seem a bit unusual, but is taken from the original Transformer paper.
        attention_probs = self.dropout(attention_probs)

        context_layer = torch.matmul(attention_probs, value_layer)
        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (
            self.head_size, )
        context_layer = context_layer.view(*new_context_layer_shape)

        outputs = ((context_layer, attention_probs) if output_attentions else
                   (context_layer, ))
        return outputs
Example #13
def test_loading_from_pretrained_weights_using_model_name(
        pretrained_name, relevant_module):
    torch.manual_seed(1234)
    module = SelfAttention.from_pretrained_module(
        pretrained_name, relevant_module=relevant_module)

    torch.manual_seed(1234)
    pretrained_module = dict(
        AutoModel.from_pretrained(pretrained_name).named_modules()
    )[
        # Module name will exclude the top-level part (e.g. 'bert.', 'electra.') for some reason.
        relevant_module[relevant_module.index(".") + 1:]]

    batch_size = 2
    seq_len = 3
    dim = module.query.in_features
    hidden_states = torch.randn(batch_size, seq_len, dim)
    attention_mask = torch.tensor([[1, 1, 0], [1, 0, 1]])[:, None, None, :]

    # setting to eval mode to avoid non-deterministic dropout.
    module = module.eval()
    pretrained_module = pretrained_module.eval()

    torch.manual_seed(1234)
    output = module(hidden_states,
                    attention_mask=attention_mask.squeeze()).hidden_states
    if "distilbert" in pretrained_name:
        torch.manual_seed(1234)
        hf_output = pretrained_module(hidden_states,
                                      hidden_states,
                                      hidden_states,
                                      mask=attention_mask)[0]
    else:
        # The attn_mask is processed outside the self attention module in HF bert models.
        attention_mask = (~(attention_mask == 1)) * min_value_of_dtype(
            hidden_states.dtype)
        torch.manual_seed(1234)
        hf_output = pretrained_module(hidden_states,
                                      attention_mask=attention_mask)[0]

    assert torch.allclose(output, hf_output)
Example #14
def test_forward_against_huggingface_output(params_dict):
    hidden_states = torch.randn(2, 3, 6)
    attention_mask = torch.tensor([[0, 1, 0], [1, 1, 0]])

    hf_kwargs = {
        "d_model": params_dict["hidden_size"],
        "d_kv": params_dict["key_value_proj_dim"],
        "num_heads": params_dict["num_heads"],
        "relative_attention_num_buckets": params_dict["relative_attention_num_buckets"],
        "dropout_rate": params_dict["dropout"],
    }

    torch.manual_seed(1234)
    hf_module = HFT5Attention(T5Config(**hf_kwargs),
                              has_relative_attention_bias=False)

    torch.manual_seed(1234)

    params = copy.deepcopy(params_dict)
    params["normalize"] = False  # only for this test, as HF does not normalize.
    t5_attention = T5Attention(**params)

    # setting to eval mode to avoid non-deterministic dropout.
    t5_attention = t5_attention.eval()
    hf_module = hf_module.eval()

    output = t5_attention.forward(hidden_states, mask=attention_mask)
    attention_mask_hf = (attention_mask == 0).view((2, 1, 1, 3)).expand(
        2, 2, 3, 3) * min_value_of_dtype(hidden_states.dtype)
    hf_output = hf_module.forward(hidden_states, mask=attention_mask_hf)

    hs = output.hidden_states

    assert torch.allclose(hs, hf_output[0])
Example #15
def apply_mask(
    values: torch.FloatTensor, mask: Union[torch.BoolTensor, torch.IntTensor,
                                           torch.FloatTensor]
) -> torch.FloatTensor:
    """
    # Parameters

    values : `torch.FloatTensor`
        Shape `batch_size x num_attention_heads x source_seq_len x target_seq_len`
    mask : `torch.BoolTensor`
        Shape `batch_size x target_seq_len` OR `batch_size x 1 x 1 x target_seq_len`
    """
    # We create a 4D attention mask from a 2D or 3D tensor mask.
    if mask.dim() == 2:
        # The shape is `batch_size x 1 x 1 x target_seq_len` which is broadcast
        # to `batch_size x num_attention_heads x source_seq_len x target_seq_len`
        mask = mask[:, None, None, :]
    elif mask.dim() == 3:
        mask = mask[:, None, :, :]
    mask = mask.to(values.dtype)
    mask = (1.0 - mask) * min_value_of_dtype(values.dtype)
    return values + mask
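A small usage sketch (names and values are illustrative, assuming min_value_of_dtype is imported as in the other snippets): a 2D padding mask is lifted to the 4D additive form, so masked target positions pick up the dtype minimum and end up with ~0 weight after a softmax:

batch_size, num_heads, source_len, target_len = 2, 4, 3, 3
scores = torch.randn(batch_size, num_heads, source_len, target_len)
mask = torch.tensor([[1, 1, 0],
                     [1, 0, 0]])          # (batch_size, target_len), 1 = keep
masked_scores = apply_mask(scores, mask)  # dtype minimum added at masked positions
attention_weights = torch.softmax(masked_scores, dim=-1)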
Example #16
def replace_masked_values_with_big_negative_number(x: torch.Tensor,
                                                   mask: torch.Tensor):
    """
    mask.dim() should be equal to x.dim()
    """
    return replace_masked_values(x, mask, min_value_of_dtype(x.dtype))
Example #17
    def forward(
        self,  # type: ignore
        tokens: TextFieldTensors,
        pos_tags: torch.LongTensor = None,
        metadata: List[Dict[str, Any]] = None,
        arc_tags: torch.LongTensor = None,
    ) -> Dict[str, torch.Tensor]:
        """
        # Parameters

        tokens : TextFieldTensors, required
            The output of `TextField.as_array()`.
        pos_tags : torch.LongTensor, optional (default = None)
            The output of a `SequenceLabelField` containing POS tags.
        metadata : List[Dict[str, Any]], optional (default = None)
            A dictionary of metadata for each batch element which has keys:
                tokens : `List[str]`, required.
                    The original string tokens in the sentence.
        arc_tags : torch.LongTensor, optional (default = None)
            A torch tensor representing the sequence of integer indices denoting the parent of every
            word in the dependency parse. Has shape `(batch_size, sequence_length, sequence_length)`.

        # Returns

        An output dictionary.
        """
        embedded_text_input = self.text_field_embedder(tokens)
        if pos_tags is not None and self._pos_tag_embedding is not None:
            embedded_pos_tags = self._pos_tag_embedding(pos_tags)
            embedded_text_input = torch.cat(
                [embedded_text_input, embedded_pos_tags], -1)
        elif self._pos_tag_embedding is not None:
            raise ConfigurationError(
                "Model uses a POS embedding, but no POS tags were passed.")

        mask = get_text_field_mask(tokens)
        embedded_text_input = self._input_dropout(embedded_text_input)
        encoded_text = self.encoder(embedded_text_input, mask)

        batch_size, _, encoding_dim = encoded_text.size()

        head_sentinel = self._head_sentinel.expand(batch_size, 1, encoding_dim)
        # Concatenate the head sentinel onto the sentence representation.
        encoded_text = torch.cat([head_sentinel, encoded_text], 1)
        mask = torch.cat([mask.new_ones(batch_size, 1), mask], 1)

        encoded_text = self._dropout(encoded_text)

        # shape (batch_size, sequence_length, arc_representation_dim)
        head_arc_representation = self._dropout(
            self.head_arc_feedforward(encoded_text))
        child_arc_representation = self._dropout(
            self.child_arc_feedforward(encoded_text))

        # shape (batch_size, sequence_length, tag_representation_dim)
        head_tag_representation = self._dropout(
            self.head_tag_feedforward(encoded_text))
        child_tag_representation = self._dropout(
            self.child_tag_feedforward(encoded_text))
        # shape (batch_size, sequence_length, sequence_length)
        arc_scores = self.arc_attention(head_arc_representation,
                                        child_arc_representation)
        # shape (batch_size, num_tags, sequence_length, sequence_length)
        arc_tag_logits = self.tag_bilinear(head_tag_representation,
                                           child_tag_representation)
        # Switch to (batch_size, sequence_length, sequence_length, num_tags)
        arc_tag_logits = arc_tag_logits.permute(0, 2, 3, 1).contiguous()

        # Since we'll be doing some additions, using the full min value would underflow once the masks are added, so we divide it by 10.
        minus_mask = ~mask * min_value_of_dtype(arc_scores.dtype) / 10
        arc_scores = arc_scores + minus_mask.unsqueeze(
            2) + minus_mask.unsqueeze(1)

        arc_probs, arc_tag_probs = self._greedy_decode(arc_scores,
                                                       arc_tag_logits, mask)

        output_dict = {
            "arc_probs": arc_probs,
            "arc_tag_probs": arc_tag_probs,
            "mask": mask
        }

        if metadata:
            output_dict["tokens"] = [meta["tokens"] for meta in metadata]

        if arc_tags is not None:
            arc_nll, tag_nll = self._construct_loss(
                arc_scores=arc_scores,
                arc_tag_logits=arc_tag_logits,
                arc_tags=arc_tags,
                mask=mask)
            output_dict["loss"] = arc_nll + tag_nll
            output_dict["arc_loss"] = arc_nll
            output_dict["tag_loss"] = tag_nll

            # Make the arc tags not have negative values anywhere
            # (by default, no edge is indicated with -1).
            arc_indices = (arc_tags != -1).float()
            tag_mask = mask.unsqueeze(1) & mask.unsqueeze(2)
            one_minus_arc_probs = 1 - arc_probs
            # We stack scores here because the f1 measure expects a
            # distribution, rather than a single value.
            self._unlabelled_f1(
                torch.stack([one_minus_arc_probs, arc_probs], -1), arc_indices,
                tag_mask)

        return output_dict
Example #18
def replace_masked_values_with_big_negative_number(x: torch.Tensor,
                                                   mask: torch.Tensor):
    return replace_masked_values(x, mask, min_value_of_dtype(x.dtype))
Example #19
    def forward(
        self,  # type: ignore
        tokens: TextFieldTensors,
        lemmas: torch.LongTensor = None,
        upos: torch.LongTensor = None,
        xpos: torch.LongTensor = None,
        feats: torch.LongTensor = None,
        deprels: torch.LongTensor = None,
        heads: torch.LongTensor = None,
        enhanced_tags: torch.LongTensor = None,
        metadata: List[Dict[str, Any]] = None,
    ) -> Dict[str, torch.Tensor]:
        """
        # Parameters
        tokens : TextFieldTensors, required
            The output of `TextField.as_array()`.
        upos : torch.LongTensor, optional (default = None)
            The output of a `SequenceLabelField` containing universal POS tags.
        metadata : List[Dict[str, Any]], optional (default = None)
            A dictionary of metadata for each batch element which has keys:
                tokens : `List[str]`, required.
                    The original string tokens in the sentence.
        enhanced_tags : torch.LongTensor, optional (default = None)
            A torch tensor representing the sequence of integer indices denoting the parent of every
            word in the dependency parse. Has shape ``(batch_size, sequence_length, sequence_length)``.

        # Returns

        An output dictionary.
        """
        embedded_text_input = self.text_field_embedder(tokens)
        concatenated_input = [embedded_text_input]
        if upos is not None and self._upos_tag_embedding is not None:
            concatenated_input.append(self._upos_tag_embedding(upos))
        elif self._upos_tag_embedding is not None:
            raise ConfigurationError(
                "Model uses a POS embedding, but no POS tags were passed.")

        if lemmas is not None and self._lemma_tag_embedding is not None:
            concatenated_input.append(self._lemma_tag_embedding(lemmas))
        if xpos is not None and self._xpos_tag_embedding is not None:
            concatenated_input.append(self._xpos_tag_embedding(xpos))
        if feats is not None and self._feats_tag_embedding is not None:
            batch_size, sequence_len, max_len = feats.size()
            # shape: (batch, seq_len, max_len)
            feats_mask = (feats != -1).long()
            feats = feats * feats_mask
            # tensor corresponding to the number of active components, e.g. morphological features
            number_active_components = feats_mask.sum(-1)
            # a padding token's summed vector will be filled with 0s and when this is divided by 0
            # it will return a NaN, so we replace 0s with 1s in the denominator tensor to avoid this.
            number_active_components[number_active_components == 0] = 1

            feats_embeddings = []
            # shape: (seq_len, max_len)
            for feat_tensor in feats:
                # shape: (seq_len, max_len, emb_dim)
                embedded_feats = self._feats_tag_embedding(feat_tensor)
                feats_embeddings.append(embedded_feats)
            # shape: (batch, seq_len, max_len, emb_dim)
            stacked_feats_tensor = torch.stack(feats_embeddings)
            tag_embedding_dim = stacked_feats_tensor.size(-1)
            feats_mask_expanded = feats_mask.unsqueeze_(-1).expand(
                batch_size, sequence_len, max_len, tag_embedding_dim)
            # shape: (batch, seq_len, max_len, emb_dim)
            masked_feats = stacked_feats_tensor * feats_mask_expanded
            # shape: (batch, seq_len, tag_embedding_dim)
            combined_masked_feats = masked_feats.sum(2)
            expanded_number_active_components = number_active_components.unsqueeze(
                -1).expand(batch_size, sequence_len, tag_embedding_dim)
            # divide the summed feats vectors by the number of non-padded elements
            averaged_feats = combined_masked_feats / expanded_number_active_components
            concatenated_input.append(averaged_feats)

        if deprels is not None and self._head_tag_embedding is not None:
            concatenated_input.append(self._head_tag_embedding(deprels))

        # TODO BASIC TREE
        if heads is not None and self._head_information_embedding is not None:
            batch_size, sequence_len, max_len = heads.size()
            # shape: (batch, seq_len, max_len)
            head_information_mask = (heads != -1).long()
            heads = heads * head_information_mask
            # tensor corresponding to the number of active components, e.g. morphological features
            number_active_components = head_information_mask.sum(-1)
            # a padding token's summed vector will be filled with 0s and when this is divided by 0
            # it will return a NaN, so we replace 0s with 1s in the denominator tensor to avoid this.
            number_active_components[number_active_components == 0] = 1

            head_information_embeddings = []
            # shape: (seq_len, max_len)
            for head_information_tensor in heads:
                # shape: (seq_len, max_len, emb_dim)
                embedded_head_information = self._head_information_embedding(
                    head_information_tensor)
                head_information_embeddings.append(embedded_head_information)
            # shape: (batch, seq_len, max_len, emb_dim)
            stacked_head_information_tensor = torch.stack(
                head_information_embeddings)
            tag_embedding_dim = stacked_head_information_tensor.size(-1)
            head_information_mask_expanded = head_information_mask.unsqueeze_(
                -1).expand(batch_size, sequence_len, max_len,
                           tag_embedding_dim)
            # shape: (batch, seq_len, max_len, emb_dim)
            masked_head_information = stacked_head_information_tensor * head_information_mask_expanded
            # shape: (batch, seq_len, tag_embedding_dim)
            combined_masked_head_information = masked_head_information.sum(2)
            expanded_number_active_components = number_active_components.unsqueeze(
                -1).expand(batch_size, sequence_len, tag_embedding_dim)
            # divide the summed head information vectors by the number of non-padded elements
            averaged_head_information = combined_masked_head_information / expanded_number_active_components
            concatenated_input.append(averaged_head_information)

        if len(concatenated_input) > 1:
            embedded_text_input = torch.cat(concatenated_input, -1)

        mask = get_text_field_mask(tokens)
        embedded_text_input = self._input_dropout(embedded_text_input)
        encoded_text = self.encoder(embedded_text_input, mask)

        batch_size, _, encoding_dim = encoded_text.size()

        head_sentinel = self._head_sentinel.expand(batch_size, 1, encoding_dim)
        # Concatenate the head sentinel onto the sentence representation.
        encoded_text = torch.cat([head_sentinel, encoded_text], 1)
        mask = torch.cat([mask.new_ones(batch_size, 1), mask], 1)
        encoded_text = self._dropout(encoded_text)

        # shape (batch_size, sequence_length, arc_representation_dim)
        head_arc_representation = self._dropout(
            self.head_arc_feedforward(encoded_text))
        child_arc_representation = self._dropout(
            self.child_arc_feedforward(encoded_text))

        # shape (batch_size, sequence_length, tag_representation_dim)
        head_tag_representation = self._dropout(
            self.head_tag_feedforward(encoded_text))
        child_tag_representation = self._dropout(
            self.child_tag_feedforward(encoded_text))

        # shape (batch_size, sequence_length, sequence_length)
        arc_scores = self.arc_attention(head_arc_representation,
                                        child_arc_representation)

        # shape (batch_size, num_tags, sequence_length, sequence_length)
        arc_tag_logits = self.tag_bilinear(head_tag_representation,
                                           child_tag_representation)

        # Switch to (batch_size, sequence_length, sequence_length, num_tags)
        arc_tag_logits = arc_tag_logits.permute(0, 2, 3, 1).contiguous()

        # Since we'll be doing some additions, using the full min value would underflow once the masks are added, so we divide it by 10.
        minus_mask = ~mask * min_value_of_dtype(arc_scores.dtype) / 10
        arc_scores = arc_scores + minus_mask.unsqueeze(
            2) + minus_mask.unsqueeze(1)

        arc_probs, arc_tag_probs = self._greedy_decode(arc_scores,
                                                       arc_tag_logits, mask)

        output_dict = {
            "arc_probs": arc_probs,
            "arc_tag_probs": arc_tag_probs,
            "mask": mask
        }

        if metadata:
            output_dict["conllu_metadata"] = [
                meta["conllu_metadata"] for meta in metadata
            ]
            output_dict["ids"] = [meta["ids"] for meta in metadata]
            output_dict["tokens"] = [meta["tokens"] for meta in metadata]
            output_dict["lemmas"] = [meta["lemmas"] for meta in metadata]
            output_dict["upos"] = [meta["upos_tags"] for meta in metadata]
            output_dict["xpos"] = [meta["xpos_tags"] for meta in metadata]
            output_dict["feats"] = [meta["feats"] for meta in metadata]
            output_dict["head_tags"] = [meta["head_tags"] for meta in metadata]
            output_dict["head_indices"] = [
                meta["head_indices"] for meta in metadata
            ]
            output_dict["original_to_new_indices"] = [
                meta["original_to_new_indices"] for meta in metadata
            ]
            output_dict["misc"] = [meta["misc"] for meta in metadata]
            output_dict["multiword_ids"] = [
                x["multiword_ids"] for x in metadata if "multiword_ids" in x
            ]
            output_dict["multiword_forms"] = [
                x["multiword_forms"] for x in metadata
                if "multiword_forms" in x
            ]

        if enhanced_tags is not None:
            arc_nll, tag_nll = self._construct_loss(
                arc_scores=arc_scores,
                arc_tag_logits=arc_tag_logits,
                enhanced_tags=enhanced_tags,
                mask=mask)

            output_dict["loss"] = arc_nll + tag_nll
            output_dict["arc_loss"] = arc_nll
            output_dict["tag_loss"] = tag_nll

            # get human readable output to computed enhanced graph metrics
            output_dict = self.make_output_human_readable(output_dict)

            # predicted arcs, arc_tags
            predicted_arcs = output_dict["arcs"]
            predicted_arc_tags = output_dict["arc_tags"]
            predicted_labeled_arcs = output_dict["labeled_arcs"]

            # gold arcs, arc_tags
            gold_arcs = [meta["arc_indices"] for meta in metadata]
            gold_arc_tags = [meta["arc_tags"] for meta in metadata]
            gold_labeled_arcs = [meta["labeled_arcs"] for meta in metadata]

            tag_mask = mask.unsqueeze(1) & mask.unsqueeze(2)
            self._enhanced_attachment_scores(predicted_arcs, predicted_arc_tags, predicted_labeled_arcs, \
                                             gold_arcs, gold_arc_tags, gold_labeled_arcs, tag_mask)

        return output_dict
Example #20
    def forward(
        self,  # type: ignore
        tokens: TextFieldTensors,
        label: torch.LongTensor = None,
    ) -> Dict[str, torch.Tensor]:

        """
        # Parameters

        tokens : `TextFieldTensors`, required
            The output of `TextField.as_array()`.
        label : `torch.LongTensor`, optional (default = `None`)
            A variable representing the label for each instance in the batch.

        # Returns

        An output dictionary consisting of:
             - `class_probabilities` (`torch.FloatTensor`) :
                 A tensor of shape `(batch_size, num_classes)` representing a
                 distribution over the label classes for each instance.
             - `loss` (`torch.FloatTensor`, optional) :
                 A scalar loss to be optimised.
        """
        text_mask = util.get_text_field_mask(tokens)
        # Pop elmo tokens, since elmo embedder should not be present.
        elmo_tokens = tokens.pop("elmo", None)
        if tokens:
            embedded_text = self._text_field_embedder(tokens)
        else:
            # only using "elmo" for input
            embedded_text = None

        # Add the "elmo" key back to "tokens" if not None, since the tests and the
        # subsequent training epochs rely not being modified during forward()
        if elmo_tokens is not None:
            tokens["elmo"] = elmo_tokens

        # Create ELMo embeddings if applicable
        if self._elmo:
            if elmo_tokens is not None:
                elmo_representations = self._elmo(elmo_tokens["elmo_tokens"])[
                    "elmo_representations"
                ]
                # Pop from the end is more performant with list
                if self._use_integrator_output_elmo:
                    integrator_output_elmo = elmo_representations.pop()
                if self._use_input_elmo:
                    input_elmo = elmo_representations.pop()
                assert not elmo_representations
            else:
                raise ConfigurationError(
                    "Model was built to use Elmo, but input text is not tokenized for Elmo."
                )

        if self._use_input_elmo:
            if embedded_text is not None:
                embedded_text = torch.cat([embedded_text, input_elmo], dim=-1)
            else:
                embedded_text = input_elmo

        dropped_embedded_text = self._embedding_dropout(embedded_text)
        pre_encoded_text = self._pre_encode_feedforward(dropped_embedded_text)
        encoded_tokens = self._encoder(pre_encoded_text, text_mask)

        # Compute biattention. This is a special case since the inputs are the same.
        attention_logits = encoded_tokens.bmm(encoded_tokens.permute(0, 2, 1).contiguous())
        attention_weights = util.masked_softmax(attention_logits, text_mask)
        encoded_text = util.weighted_sum(encoded_tokens, attention_weights)

        # Build the input to the integrator
        integrator_input = torch.cat(
            [encoded_tokens, encoded_tokens - encoded_text, encoded_tokens * encoded_text], 2
        )
        integrated_encodings = self._integrator(integrator_input, text_mask)

        # Concatenate ELMo representations to integrated_encodings if specified
        if self._use_integrator_output_elmo:
            integrated_encodings = torch.cat([integrated_encodings, integrator_output_elmo], dim=-1)

        # Simple Pooling layers
        max_masked_integrated_encodings = util.replace_masked_values(
            integrated_encodings,
            text_mask.unsqueeze(2),
            util.min_value_of_dtype(integrated_encodings.dtype),
        )
        max_pool = torch.max(max_masked_integrated_encodings, 1)[0]
        min_masked_integrated_encodings = util.replace_masked_values(
            integrated_encodings,
            text_mask.unsqueeze(2),
            util.max_value_of_dtype(integrated_encodings.dtype),
        )
        min_pool = torch.min(min_masked_integrated_encodings, 1)[0]
        mean_pool = torch.sum(integrated_encodings, 1) / torch.sum(text_mask, 1, keepdim=True)

        # Self-attentive pooling layer
        # Run through linear projection. Shape: (batch_size, sequence length, 1)
        # Then remove the last dimension to get the proper attention shape (batch_size, sequence length).
        self_attentive_logits = self._self_attentive_pooling_projection(
            integrated_encodings
        ).squeeze(2)
        self_weights = util.masked_softmax(self_attentive_logits, text_mask)
        self_attentive_pool = util.weighted_sum(integrated_encodings, self_weights)

        pooled_representations = torch.cat([max_pool, min_pool, mean_pool, self_attentive_pool], 1)
        pooled_representations_dropped = self._integrator_dropout(pooled_representations)

        logits = self._output_layer(pooled_representations_dropped)
        class_probabilities = F.softmax(logits, dim=-1)

        output_dict = {"logits": logits, "class_probabilities": class_probabilities}
        if label is not None:
            loss = self.loss(logits, label)
            for metric in self.metrics.values():
                metric(logits, label)
            output_dict["loss"] = loss

        return output_dict
Example #21
    def forward(self, tokens: torch.Tensor, mask: torch.BoolTensor):
        if mask is not None:
            tokens = tokens * mask.unsqueeze(-1)
        else:
            # If mask doesn't exist create one of shape (batch_size, num_tokens)
            mask = torch.ones(tokens.shape[0],
                              tokens.shape[1],
                              device=tokens.device).bool()

        # Our input is expected to have shape `(batch_size, num_tokens, embedding_dim)`.  The
        # convolution layers expect input of shape `(batch_size, in_channels, sequence_length)`,
        # where the conv layer `in_channels` is our `embedding_dim`.  We thus need to transpose the
        # tensor first.
        tokens = torch.transpose(tokens, 1, 2)
        # Each convolution layer returns output of size `(batch_size, num_filters, pool_length)`,
        # where `pool_length = num_tokens - ngram_size + 1`.  We then do an activation function,
        # masking, then do max pooling over each filter for the whole input sequence.
        # Because our max pooling is simple, we just use `torch.max`.  The resultant tensor has shape
        # `(batch_size, num_conv_layers * num_filters)`, which then gets projected using the
        # projection layer, if requested.

        # To ensure the cnn_encoder respects masking we add a large negative value to
        # the activations of all filters that convolved over a masked token. We do this by
        # first enumerating all filters for a given convolution size (torch.arange())
        # then by comparing it to an index of the last filter that does not involve a masked
        # token (.ge()) and finally adjusting dimensions to allow for addition and multiplying
        # by a large negative value (.unsqueeze())
        filter_outputs = []
        batch_size = tokens.shape[0]
        # shape: (batch_size, 1)
        last_unmasked_tokens = mask.sum(dim=1).unsqueeze(dim=-1)
        for i in range(len(self._convolution_layers)):
            convolution_layer = getattr(self, "conv_layer_{}".format(i))
            pool_length = tokens.shape[2] - convolution_layer.kernel_size[0] + 1

            # Forward pass of the convolutions.
            # shape: (batch_size, num_filters, pool_length)
            activations = self._activation(convolution_layer(tokens))

            # Create activation mask.
            # shape: (batch_size, pool_length)
            indices = (torch.arange(
                pool_length, device=activations.device).unsqueeze(0).expand(
                    batch_size, pool_length))
            # shape: (batch_size, pool_length)
            activations_mask = indices.ge(last_unmasked_tokens -
                                          convolution_layer.kernel_size[0] + 1)
            # shape: (batch_size, num_filters, pool_length)
            activations_mask = activations_mask.unsqueeze(1).expand_as(
                activations)

            # Replace masked out values with smallest possible value of the dtype so
            # that max pooling will ignore these activations.
            # shape: (batch_size, num_filters, pool_length)
            activations = activations + (activations_mask *
                                         min_value_of_dtype(activations.dtype))

            # Pick out the max filters
            filter_outputs.append(activations.max(dim=2)[0])

        # Now we have a list of `num_conv_layers` tensors of shape `(batch_size, num_filters)`.
        # Concatenating them gives us a tensor of shape `(batch_size, num_filters * num_conv_layers)`.
        maxpool_output = (torch.cat(filter_outputs, dim=1)
                          if len(filter_outputs) > 1 else filter_outputs[0])

        # Replace the maxpool activations that picked up the masks with 0s
        maxpool_output[maxpool_output == min_value_of_dtype(
            maxpool_output.dtype)] = 0.0

        if self.projection_layer:
            result = self.projection_layer(maxpool_output)
        else:
            result = maxpool_output
        return result
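The forward pass above appears to match AllenNLP's CnnEncoder; assuming that class is available, a minimal usage sketch (tensor values are illustrative):

import torch
from allennlp.modules.seq2vec_encoders import CnnEncoder

encoder = CnnEncoder(embedding_dim=8, num_filters=4, ngram_filter_sizes=(2, 3))
tokens = torch.randn(2, 6, 8)  # (batch_size, num_tokens, embedding_dim)
mask = torch.tensor([[1, 1, 1, 1, 0, 0],
                     [1, 1, 1, 0, 0, 0]]).bool()  # padding is masked out
encoded = encoder(tokens, mask)  # (batch_size, num_filters * len(ngram_filter_sizes)) = (2, 8)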
Example #22
    def _compute_coreference_scores(
        self,
        top_span_embeddings: torch.FloatTensor,
        top_antecedent_embeddings: torch.FloatTensor,
        top_partial_coreference_scores: torch.FloatTensor,
        top_antecedent_mask: torch.BoolTensor,
        top_antecedent_offsets: torch.FloatTensor,
    ) -> torch.FloatTensor:
        """
        Computes scores for every pair of spans. Additionally, a dummy label is included,
        representing the decision that the span is not coreferent with anything. For the dummy
        label, the score is always zero. For the true antecedent spans, the score consists of
        the pairwise antecedent score and the unary mention scores for the span and its
        antecedent. The factoring allows the model to blame many of the absent links on bad
        spans, enabling the pruning strategy used in the forward pass.

        # Parameters

        top_span_embeddings : `torch.FloatTensor`, required.
            Embedding representations of the kept spans. Has shape
            (batch_size, num_spans_to_keep, embedding_size)
        top_antecedent_embeddings: `torch.FloatTensor`, required.
            The embeddings of antecedents for each span candidate. Has shape
            (batch_size, num_spans_to_keep, max_antecedents, embedding_size)
        top_partial_coreference_scores : `torch.FloatTensor`, required.
            Sum of span mention score and antecedent mention score. The coarse to fine settings
            has an additional term which is the coarse bilinear score.
            (batch_size, num_spans_to_keep, max_antecedents).
        top_antecedent_mask : `torch.BoolTensor`, required.
            The mask for valid antecedents.
            (batch_size, num_spans_to_keep, max_antecedents).
        top_antecedent_offsets : `torch.FloatTensor`, required.
            The distance between the span and each of its antecedents in terms of the number
            of considered spans (i.e not the word distance between the spans).
            (batch_size, num_spans_to_keep, max_antecedents).

        # Returns

        coreference_scores : `torch.FloatTensor`
            A tensor of shape (batch_size, num_spans_to_keep, max_antecedents + 1),
            representing the unnormalised score for each (span, antecedent) pair
            we considered.

        """
        # Shape: (batch_size, num_spans_to_keep, max_antecedents, embedding_size)
        span_pair_embeddings = self._compute_span_pair_embeddings(
            top_span_embeddings, top_antecedent_embeddings, top_antecedent_offsets
        )

        # Shape: (batch_size, num_spans_to_keep, max_antecedents)
        antecedent_scores = self._antecedent_scorer(
            self._antecedent_feedforward(span_pair_embeddings)
        ).squeeze(-1)
        antecedent_scores += top_partial_coreference_scores
        antecedent_scores = util.replace_masked_values(
            antecedent_scores, top_antecedent_mask, util.min_value_of_dtype(antecedent_scores.dtype)
        )

        # Shape: (batch_size, num_spans_to_keep, 1)
        shape = [antecedent_scores.size(0), antecedent_scores.size(1), 1]
        dummy_scores = antecedent_scores.new_zeros(*shape)

        # Shape: (batch_size, num_spans_to_keep, max_antecedents + 1)
        coreference_scores = torch.cat([dummy_scores, antecedent_scores], -1)
        return coreference_scores
Example #23
 def __init__(self,
              temperature: float = 1.0,
              filter_val: float = -float("inf")) -> None:
     self.temperature = temperature
     self.filter_val = min_value_of_dtype(torch.float)