    def test_indices_to_tokens(self):
        allennlp_tokenizer = PretrainedTransformerTokenizer(
            "bert-base-uncased")
        indexer_max_length = PretrainedTransformerIndexer(
            model_name="bert-base-uncased", max_length=4)
        indexer_no_max_length = PretrainedTransformerIndexer(
            model_name="bert-base-uncased")
        string_no_specials = "AllenNLP is great"

        allennlp_tokens = allennlp_tokenizer.tokenize(string_no_specials)
        vocab = Vocabulary()
        indexed = indexer_no_max_length.tokens_to_indices(
            allennlp_tokens, vocab)
        tokens_from_indices = indexer_no_max_length.indices_to_tokens(
            indexed, vocab)

        self._assert_tokens_equal(allennlp_tokens, tokens_from_indices)

        indexed = indexer_max_length.tokens_to_indices(allennlp_tokens, vocab)
        tokens_from_indices = indexer_max_length.indices_to_tokens(
            indexed, vocab)

        # For now we are not removing special tokens introduced from max_length
        sep_cls = [allennlp_tokens[-1], allennlp_tokens[0]]
        expected = (allennlp_tokens[:3] + sep_cls + allennlp_tokens[3:5] +
                    sep_cls + allennlp_tokens[5:])

        self._assert_tokens_equal(expected, tokens_from_indices)
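
The expectation built above (re-inserting [SEP] [CLS] at every fold point) can be reproduced without the indexer. A minimal sketch with a hypothetical fold_with_specials helper (not part of AllenNLP) that folds an already special-token-wrapped id sequence into max_length windows:

def fold_with_specials(ids, max_length):
    # `ids` is assumed to start with CLS and end with SEP; every folded window
    # gets both specials again, leaving max_length - 2 slots for inner ids.
    cls_id, sep_id = ids[0], ids[-1]
    inner = ids[1:-1]
    chunk = max_length - 2
    folded = []
    for start in range(0, len(inner), chunk):
        folded += [cls_id] + inner[start:start + chunk] + [sep_id]
    return folded

# [CLS] w1 w2 w3 w4 w5 [SEP] folded with max_length=4, as in the test above:
folded = fold_with_specials([101, 1, 2, 3, 4, 5, 102], max_length=4)
assert folded == [101, 1, 2, 102, 101, 3, 4, 102, 101, 5, 102]
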
Example #2
    def test_mask(self):
        # We try these two models, because BERT pads tokens with 0, but RoBERTa pads tokens with 1.
        for model in ["bert-base-uncased", "roberta-base"]:
            allennlp_tokenizer = PretrainedTransformerTokenizer(model)
            indexer = PretrainedTransformerIndexer(model_name=model)
            string_no_specials = "AllenNLP is great"
            allennlp_tokens = allennlp_tokenizer.tokenize(string_no_specials)
            vocab = Vocabulary()
            indexed = indexer.tokens_to_indices(allennlp_tokens, vocab)
            expected_masks = [1] * len(indexed["token_ids"])
            assert indexed["mask"] == expected_masks
            max_length = 10
            padding_lengths = {key: max_length for key in indexed.keys()}
            padded_tokens = indexer.as_padded_tensor_dict(
                indexed, padding_lengths)
            padding_length = max_length - len(indexed["mask"])
            expected_masks = expected_masks + ([0] * padding_length)
            assert len(padded_tokens["mask"]) == max_length
            assert padded_tokens["mask"].tolist() == expected_masks

            assert len(padded_tokens["token_ids"]) == max_length
            padding_suffix = [allennlp_tokenizer.tokenizer.pad_token_id] * padding_length
            assert padded_tokens["token_ids"][-padding_length:].tolist() == padding_suffix
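
The padding in test_mask is plain right-padding to max_length with the model's pad id; a minimal, library-free sketch (BERT's pad id is 0, RoBERTa's is 1, which is why the test loops over both models):

def right_pad(ids, max_length, pad_id):
    # Pad the id sequence with the model's pad id and build the matching 0/1 mask.
    pad_count = max_length - len(ids)
    return ids + [pad_id] * pad_count, [1] * len(ids) + [0] * pad_count

token_ids, mask = right_pad([101, 2054, 102], max_length=5, pad_id=0)
assert token_ids == [101, 2054, 102, 0, 0]
assert mask == [1, 1, 1, 0, 0]
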
    def test_as_array_produces_token_sequence(self):
        tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
        indexer = PretrainedTransformerIndexer(model_name="bert-base-uncased")
        tokens = tokenizer.tokenize("AllenNLP is great")
        expected_ids = tokenizer.convert_tokens_to_ids(tokens)
        allennlp_tokens = [Token(token) for token in tokens]
        vocab = Vocabulary()
        indexed = indexer.tokens_to_indices(allennlp_tokens, vocab, "key")
        assert indexed["key"] == expected_ids

        tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
        indexer = PretrainedTransformerIndexer(model_name="bert-base-cased")
        tokens = tokenizer.tokenize("AllenNLP is great")
        expected_ids = tokenizer.convert_tokens_to_ids(tokens)
        allennlp_tokens = [Token(token) for token in tokens]
        vocab = Vocabulary()
        indexed = indexer.tokens_to_indices(allennlp_tokens, vocab, "key")
        assert indexed["key"] == expected_ids
 def test_as_array_produces_token_sequence(self):
     tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased',
                                               do_lowercase=True)
     indexer = PretrainedTransformerIndexer(model_name='bert-base-uncased',
                                            do_lowercase=True)
     tokens = tokenizer.tokenize('AllenNLP is great')
     expected_ids = tokenizer.convert_tokens_to_ids(tokens)
     allennlp_tokens = [Token(token) for token in tokens]
     vocab = Vocabulary()
     indexed = indexer.tokens_to_indices(allennlp_tokens, vocab, 'key')
     assert indexed['key'] == expected_ids
Example #5
 def test_as_array_produces_token_sequence_roberta_sentence_pair(self):
     tokenizer = AutoTokenizer.from_pretrained("roberta-base")
     allennlp_tokenizer = PretrainedTransformerTokenizer("roberta-base")
     indexer = PretrainedTransformerIndexer(model_name="roberta-base")
     default_format = "<s> AllenNLP is great! </s> </s> Really it is! </s>"
     tokens = tokenizer.tokenize(default_format)
     expected_ids = tokenizer.convert_tokens_to_ids(tokens)
     allennlp_tokens = allennlp_tokenizer.tokenize_sentence_pair(
         "AllenNLP is great!", "Really it is!")
     vocab = Vocabulary()
     indexed = indexer.tokens_to_indices(allennlp_tokens, vocab, "key")
     assert indexed["key"] == expected_ids
Example #6
 def test_as_array_produces_token_sequence_bert_cased_sentence_pair(self):
     tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
     allennlp_tokenizer = PretrainedTransformerTokenizer("bert-base-cased")
     indexer = PretrainedTransformerIndexer(model_name="bert-base-cased")
     default_format = "[CLS] AllenNLP is great! [SEP] Really it is! [SEP]"
     tokens = tokenizer.tokenize(default_format)
     expected_ids = tokenizer.convert_tokens_to_ids(tokens)
     allennlp_tokens = allennlp_tokenizer.tokenize_sentence_pair(
         "AllenNLP is great!", "Really it is!")
     vocab = Vocabulary()
     indexed = indexer.tokens_to_indices(allennlp_tokens, vocab)
     assert indexed["token_ids"] == expected_ids
Example #7
 def test_as_array_produces_token_sequence_roberta(self):
     tokenizer = AutoTokenizer.from_pretrained("roberta-base")
     allennlp_tokenizer = PretrainedTransformerTokenizer("roberta-base")
     indexer = PretrainedTransformerIndexer(model_name="roberta-base")
     string_specials = "<s> AllenNLP is great </s>"
     string_no_specials = "AllenNLP is great"
     tokens = tokenizer.tokenize(string_specials)
     expected_ids = tokenizer.convert_tokens_to_ids(tokens)
     # tokens tokenized with our pretrained tokenizer have indices in them
     allennlp_tokens = allennlp_tokenizer.tokenize(string_no_specials)
     vocab = Vocabulary()
     indexed = indexer.tokens_to_indices(allennlp_tokens, vocab, "key")
     assert indexed["key"] == expected_ids
Example #8
 def test_transformers_vocabs_added_correctly(self):
     namespace, model_name = "tags", "roberta-base"
     tokenizer = AutoTokenizer.from_pretrained(model_name)
     allennlp_tokenizer = PretrainedTransformerTokenizer(model_name)
     indexer = PretrainedTransformerIndexer(model_name=model_name,
                                            namespace=namespace)
     allennlp_tokens = allennlp_tokenizer.tokenize("AllenNLP is great!")
     vocab = Vocabulary()
     # here we copy entire transformers vocab
     indexed = indexer.tokens_to_indices(allennlp_tokens, vocab)
     del indexed
     assert vocab.get_token_to_index_vocabulary(
         namespace=namespace) == tokenizer.encoder
Example #9
 def check_vocab_size(model_name: str):
     namespace = "tags"
     tokenizer = AutoTokenizer.from_pretrained(model_name)
     allennlp_tokenizer = PretrainedTransformerTokenizer(model_name)
     indexer = PretrainedTransformerIndexer(model_name=model_name,
                                            namespace=namespace)
     allennlp_tokens = allennlp_tokenizer.tokenize("AllenNLP is great!")
     vocab = Vocabulary()
     # here we copy entire transformers vocab
     indexed = indexer.tokens_to_indices(allennlp_tokens, vocab)
     del indexed
     assert vocab.get_vocab_size(
         namespace=namespace) == tokenizer.vocab_size
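
The "here we copy entire transformers vocab" comments refer to the indexer writing the Hugging Face vocabulary into the AllenNLP namespace, preserving the transformer's own ids, on the first call to tokens_to_indices. A rough hand-rolled equivalent, shown only as a sketch (it pokes at Vocabulary's private dictionaries purely for illustration):

from allennlp.data import Vocabulary
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("roberta-base")
vocab = Vocabulary()
# Mirror the full token -> id mapping into the non-padded "tags" namespace,
# keeping the transformer's indices intact.
for token, idx in tokenizer.get_vocab().items():
    vocab._token_to_index["tags"][token] = idx
    vocab._index_to_token["tags"][idx] = token
# Holds for a stock checkpoint with no extra added tokens.
assert vocab.get_vocab_size("tags") == tokenizer.vocab_size
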
Example #10
 def test_as_array_produces_token_sequence_bert_cased(self):
     tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
     allennlp_tokenizer = PretrainedTransformerTokenizer("bert-base-cased")
     indexer = PretrainedTransformerIndexer(model_name="bert-base-cased")
     string_specials = "[CLS] AllenNLP is great [SEP]"
     string_no_specials = "AllenNLP is great"
     tokens = tokenizer.tokenize(string_specials)
     expected_ids = tokenizer.convert_tokens_to_ids(tokens)
     # tokens tokenized with our pretrained tokenizer have indices in them
     allennlp_tokens = allennlp_tokenizer.tokenize(string_no_specials)
     vocab = Vocabulary()
     indexed = indexer.tokens_to_indices(allennlp_tokens, vocab)
     assert indexed["token_ids"] == expected_ids
 def test_transformers_vocab_sizes(self, model_name):
     namespace = "tags"
     tokenizer = cached_transformers.get_tokenizer(model_name)
     allennlp_tokenizer = PretrainedTransformerTokenizer(model_name)
     indexer = PretrainedTransformerIndexer(model_name=model_name,
                                            namespace=namespace)
     allennlp_tokens = allennlp_tokenizer.tokenize("AllenNLP is great!")
     vocab = Vocabulary()
     # here we copy entire transformers vocab
     indexed = indexer.tokens_to_indices(allennlp_tokens, vocab)
     del indexed
     assert vocab.get_vocab_size(
         namespace=namespace) == tokenizer.vocab_size
Example #12
 def test_as_array_produces_token_sequence_roberta_sentence_pair(self):
     tokenizer = cached_transformers.get_tokenizer("roberta-base")
     allennlp_tokenizer = PretrainedTransformerTokenizer(
         "roberta-base", add_special_tokens=False)
     indexer = PretrainedTransformerIndexer(model_name="roberta-base")
     default_format = "<s> AllenNLP is great! </s> </s> Really it is! </s>"
     tokens = tokenizer.tokenize(default_format)
     expected_ids = tokenizer.convert_tokens_to_ids(tokens)
     allennlp_tokens = allennlp_tokenizer.add_special_tokens(
         allennlp_tokenizer.tokenize("AllenNLP is great!"),
         allennlp_tokenizer.tokenize("Really it is!"),
     )
     vocab = Vocabulary()
     indexed = indexer.tokens_to_indices(allennlp_tokens, vocab)
     assert indexed["token_ids"] == expected_ids
    def test_type_ids_when_folding(self):
        allennlp_tokenizer = PretrainedTransformerTokenizer(
            "bert-base-uncased", add_special_tokens=False)
        indexer = PretrainedTransformerIndexer(model_name="bert-base-uncased",
                                               max_length=6)
        first_string = "How do trees get online?"
        second_string = "They log in!"

        tokens = allennlp_tokenizer.add_special_tokens(
            allennlp_tokenizer.tokenize(first_string),
            allennlp_tokenizer.tokenize(second_string))
        vocab = Vocabulary()
        indexed = indexer.tokens_to_indices(tokens, vocab)
        assert min(indexed["type_ids"]) == 0
        assert max(indexed["type_ids"]) == 1
 def test_mask(self):
     allennlp_tokenizer = PretrainedTransformerTokenizer(
         "bert-base-uncased")
     indexer = PretrainedTransformerIndexer(model_name="bert-base-uncased")
     string_no_specials = "AllenNLP is great"
     allennlp_tokens = allennlp_tokenizer.tokenize(string_no_specials)
     vocab = Vocabulary()
     indexed = indexer.tokens_to_indices(allennlp_tokens, vocab)
     expected_masks = [1] * len(indexed["token_ids"])
     assert indexed["mask"] == expected_masks
     max_length = 10
     padding_lengths = {"token_ids": max_length, "mask": max_length}
     padded_tokens = indexer.as_padded_tensor_dict(indexed, padding_lengths)
     padding_length = max_length - len(indexed["mask"])
     expected_masks = expected_masks + ([0] * padding_length)
     assert len(padded_tokens["mask"]) == max_length
     assert padded_tokens["mask"].tolist() == expected_masks
Example #15
    def test_long_sequence_splitting(self):
        tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
        allennlp_tokenizer = PretrainedTransformerTokenizer(
            "bert-base-uncased")
        indexer = PretrainedTransformerIndexer(model_name="bert-base-uncased",
                                               max_length=4)
        string_specials = "[CLS] AllenNLP is great [SEP]"
        string_no_specials = "AllenNLP is great"
        tokens = tokenizer.tokenize(string_specials)
        expected_ids = tokenizer.convert_tokens_to_ids(tokens)
        assert len(expected_ids) == 7  # just to make sure it's what we're expecting
        cls_id, sep_id = expected_ids[0], expected_ids[-1]
        expected_ids = (expected_ids[:3] + [sep_id, cls_id] +
                        expected_ids[3:5] + [sep_id, cls_id] +
                        expected_ids[5:])

        allennlp_tokens = allennlp_tokenizer.tokenize(string_no_specials)
        vocab = Vocabulary()
        indexed = indexer.tokens_to_indices(allennlp_tokens, vocab)
        assert indexed["token_ids"] == expected_ids
        assert indexed["segment_concat_mask"] == [1] * len(expected_ids)
        assert indexed["mask"] == [1] * 7  # original length
class PretrainedTransformerMismatchedIndexer(TokenIndexer):
    """
    Use this indexer when (for whatever reason) you are not using a corresponding
    `PretrainedTransformerTokenizer` on your input. We assume that you used a tokenizer that splits
    strings into words, while the transformer expects wordpieces as input. This indexer splits the
    words into wordpieces and flattens them out. You should use the corresponding
    `PretrainedTransformerMismatchedEmbedder` to embed these wordpieces and then pull out a single
    vector for each original word.

    # Parameters

    model_name : `str`
        The name of the `transformers` model to use.
    namespace : `str`, optional (default=`tags`)
        We will add the tokens from the transformers vocabulary to this vocabulary namespace.
        We use a somewhat confusing default value of `tags` so that we do not add padding or UNK
        tokens to this namespace, which would break on loading because we wouldn't find our default
        OOV token.
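
    # Example

    A minimal usage sketch (illustrative only; it mirrors the indexer tests above):

        indexer = PretrainedTransformerMismatchedIndexer("bert-base-uncased")
        tokens = [Token("AllenNLP"), Token("is"), Token("great")]  # word-level tokens
        vocab = Vocabulary()
        indexed = indexer.tokens_to_indices(tokens, vocab)
        # indexed["offsets"][i] spans the wordpieces of the i-th original word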
    """
    def __init__(self,
                 model_name: str,
                 namespace: str = "tags",
                 **kwargs) -> None:
        super().__init__(**kwargs)
        # The underlying "matched" indexer that this mismatched indexer delegates to.
        self._matched_indexer = PretrainedTransformerIndexer(
            model_name, namespace, **kwargs)

        # add_special_tokens=False since we don't want wordpieces to be surrounded by special tokens
        self._allennlp_tokenizer = PretrainedTransformerTokenizer(
            model_name, add_special_tokens=False)
        self._tokenizer = self._allennlp_tokenizer.tokenizer

        (
            self._num_added_start_tokens,
            self._num_added_end_tokens,
        ) = self._determine_num_special_tokens_added()

    @overrides
    def count_vocab_items(self, token: Token, counter: Dict[str, Dict[str, int]]):
        return self._matched_indexer.count_vocab_items(token, counter)

    @overrides
    def tokens_to_indices(self, tokens: List[Token],
                          vocabulary: Vocabulary) -> IndexedTokenList:
        orig_token_mask = [1] * len(tokens)
        tokens, offsets = self._intra_word_tokenize(tokens)

        # {"token_ids": ..., "mask": ...}
        output = self._matched_indexer.tokens_to_indices(tokens, vocabulary)

        # Insert type ids for the special tokens.
        output["type_ids"] = self._tokenizer.create_token_type_ids_from_sequences(
            output["token_ids"])
        # Insert the special tokens themselves.
        output["token_ids"] = self._tokenizer.build_inputs_with_special_tokens(
            output["token_ids"])
        output["mask"] = orig_token_mask
        output["offsets"] = [(start + self._num_added_start_tokens,
                              end + self._num_added_start_tokens)
                             for start, end in offsets]
        output["wordpiece_mask"] = [1] * len(output["token_ids"])
        return output

    @overrides
    def get_empty_token_list(self) -> IndexedTokenList:
        output = self._matched_indexer.get_empty_token_list()
        output["offsets"] = []
        output["wordpiece_mask"] = []
        return output

    @overrides
    def as_padded_tensor_dict(
            self, tokens: IndexedTokenList,
            padding_lengths: Dict[str, int]) -> Dict[str, torch.Tensor]:
        tokens = tokens.copy()
        padding_lengths = padding_lengths.copy()

        offsets_tokens = tokens.pop("offsets")
        offsets_padding_lengths = padding_lengths.pop("offsets")

        tensor_dict = self._matched_indexer.as_padded_tensor_dict(
            tokens, padding_lengths)
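        # Offsets are (start, end) pairs rather than scalar ids, so they are padded
        # separately below with a (0, 0) filler instead of the scalar padding value.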
        tensor_dict["offsets"] = torch.LongTensor(
            pad_sequence_to_length(offsets_tokens,
                                   offsets_padding_lengths,
                                   default_value=lambda: (0, 0)))
        return tensor_dict

    def __eq__(self, other):
        if isinstance(other, PretrainedTransformerMismatchedIndexer):
            for key in self.__dict__:
                if key == "tokenizer":
                    # This is a reference to a function in the huggingface code, which we can't
                    # really modify to make this clean.  So we special-case it.
                    continue
                if self.__dict__[key] != other.__dict__[key]:
                    return False
            return True
        return NotImplemented

    def _intra_word_tokenize(
            self,
            tokens: List[Token]) -> Tuple[List[Token], List[Tuple[int, int]]]:
        """
        Tokenizes each word into wordpieces separately. Also calculates offsets such that
        wordpieces[offsets[i][0]:offsets[i][1] + 1] corresponds to the original i-th token.
        Does not insert special tokens.
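
        For example (hypothetical wordpieces), tokens ["AllenNLP", "is"] might become
        wordpieces ["allen", "##nl", "##p", "is"] with offsets == [(0, 2), (3, 3)]:
        wordpieces[0:3] then covers "AllenNLP" and wordpieces[3:4] covers "is".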
        """
        wordpieces: List[Token] = []
        offsets = []
        cumulative = 0
        for token in tokens:
            subword_wordpieces = self._allennlp_tokenizer.tokenize(token.text)
            wordpieces.extend(subword_wordpieces)

            start_offset = cumulative
            cumulative += len(subword_wordpieces)
            end_offset = cumulative - 1  # inclusive
            offsets.append((start_offset, end_offset))

        return wordpieces, offsets

    def _determine_num_special_tokens_added(self) -> Tuple[int, int]:
        """
        Determines the number of tokens `self._tokenizer` adds at the start and end of a
        sequence (sequence pairs are currently not considered).

        # Returns
        The number of tokens (`int`) inserted at the start and at the end of a sequence.
        """
        # Use a fairly high dummy id so the tokenizer does not mistake it for one of
        # its own special tokens, which tend to have low ids.
        dummy = [1000]
        inserted = self._tokenizer.build_inputs_with_special_tokens(dummy)
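        # For bert-base-uncased, for example, `inserted` is [101, 1000, 102]
        # ([CLS], dummy, [SEP]), so the loop below counts num_start = num_end = 1.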

        num_start = num_end = 0
        seen_dummy = False
        for idx in inserted:
            if idx == dummy[0]:
                if seen_dummy:  # seeing it twice
                    raise ValueError(
                        "Cannot auto-determine the number of special tokens added."
                    )
                seen_dummy = True
                continue

            if not seen_dummy:
                num_start += 1
            else:
                num_end += 1

        assert num_start + num_end == self._tokenizer.num_added_tokens()
        return num_start, num_end