def test_max_length(self):
     tokenizer = PretrainedTransformerTokenizer("bert-base-cased",
                                                max_length=10,
                                                add_special_tokens=False)
     tokens = tokenizer.tokenize(
         "hi there, this should be at least 10 tokens, but some will be truncated"
     )
     assert len(tokens) == 10
Example #2
 def test_no_max_length(self):
     tokenizer = PretrainedTransformerTokenizer("bert-base-cased",
                                                max_length=None,
                                                add_special_tokens=False)
     # Even though the bert model has a max input length of 512, when we tokenize
     # with `max_length = None`, we should not get any truncation.
     tokens = tokenizer.tokenize(" ".join(["a"] * 550))
     assert len(tokens) == 550
Example #3
    def test_end_to_end(self):
        tokenizer = PretrainedTransformerTokenizer(model_name="bert-base-uncased")
        token_indexer = PretrainedTransformerIndexer(model_name="bert-base-uncased")

        sentence1 = "A, AllenNLP sentence."
        tokens1 = tokenizer.tokenize(sentence1)
        expected_tokens1 = ["[CLS]", "a", ",", "allen", "##nl", "##p", "sentence", ".", "[SEP]"]
        assert [t.text for t in tokens1] == expected_tokens1

        sentence2 = "AllenNLP is great"
        tokens2 = tokenizer.tokenize(sentence2)
        expected_tokens2 = ["[CLS]", "allen", "##nl", "##p", "is", "great", "[SEP]"]
        assert [t.text for t in tokens2] == expected_tokens2

        vocab = Vocabulary()

        params = Params(
            {
                "token_embedders": {
                    "bert": {"type": "pretrained_transformer", "model_name": "bert-base-uncased"}
                }
            }
        )
        token_embedder = BasicTextFieldEmbedder.from_params(vocab=vocab, params=params)

        instance1 = Instance({"tokens": TextField(tokens1, {"bert": token_indexer})})
        instance2 = Instance({"tokens": TextField(tokens2, {"bert": token_indexer})})

        batch = Batch([instance1, instance2])
        batch.index_instances(vocab)

        padding_lengths = batch.get_padding_lengths()
        tensor_dict = batch.as_tensor_dict(padding_lengths)
        tokens = tensor_dict["tokens"]
        max_length = max(len(tokens1), len(tokens2))

        assert tokens["bert"]["token_ids"].shape == (2, max_length)

        assert tokens["bert"]["mask"].tolist() == [
            [True, True, True, True, True, True, True, True, True],
            [True, True, True, True, True, True, True, False, False],
        ]

        # The embedder uses the mask above as its attention mask.
        bert_vectors = token_embedder(tokens)
        assert bert_vectors.size() == (2, 9, 768)
Example #4
 def test_splits_into_wordpieces(self):
     tokenizer = PretrainedTransformerTokenizer('bert-base-cased',
                                                do_lowercase=False)
     sentence = "A, [MASK] AllenNLP sentence."
     tokens = [t.text for t in tokenizer.tokenize(sentence)]
     expected_tokens = [
         "[CLS]", "A", ",", "[MASK]", "Allen", "##NL", "##P", "sentence",
         ".", "[SEP]"
     ]
     assert tokens == expected_tokens
Example #5
def main():
    tokenizer = PretrainedTransformerTokenizer(model_name=BERT_MODEL,
                                               add_special_tokens=False)
    result = tokenizer.tokenize('The best movie ever!')
    print(result)
    reader = SnliReader(tokenizer=tokenizer)
    for instance in reader.read(
            'https://realworldnlpbook.s3.amazonaws.com/data/snli/snli_1.0_dev.jsonl'
    ):
        print(instance)
Example #6
 def test_token_idx_wikipedia(self):
     sentence = (
         "Tokyo (東京 Tōkyō, English: /ˈtoʊkioʊ/,[7] Japanese: [toːkʲoː]), officially "
         "Tokyo Metropolis (東京都 Tōkyō-to), is one of the 47 prefectures of Japan."
     )
     for tokenizer_name in [
             "roberta-base", "bert-base-uncased", "bert-base-cased"
     ]:
         tokenizer = PretrainedTransformerTokenizer(tokenizer_name)
         tokenized = tokenizer.tokenize(sentence)
         assert tokenized[-2].text == "."
         assert tokenized[-2].idx == len(sentence) - 1
Example #7
 def test_token_idx_wikipedia(self):
     # This will produce lots of problems with the index calculation. We check whether it catches back up at the
     # end.
     sentence = "Tokyo (東京 Tōkyō, English: /ˈtoʊkioʊ/,[7] Japanese: [toːkʲoː]), officially Tokyo Metropolis (東京都 Tōkyō-to), is one of the 47 prefectures of Japan."
     for tokenizer_name in [
             "roberta-base", "bert-base-uncased", "bert-base-cased"
     ]:
         tokenizer = PretrainedTransformerTokenizer(
             tokenizer_name, calculate_character_offsets=True)
         tokenized = tokenizer.tokenize(sentence)
         assert tokenized[-2].text == "."
         assert tokenized[-2].idx == len(sentence) - 1
Example #8
 def test_transformers_vocab_sizes(self, model_name):
     namespace = "tags"
     tokenizer = cached_transformers.get_tokenizer(model_name)
     allennlp_tokenizer = PretrainedTransformerTokenizer(model_name)
     indexer = PretrainedTransformerIndexer(model_name=model_name,
                                            namespace=namespace)
     allennlp_tokens = allennlp_tokenizer.tokenize("AllenNLP is great!")
     vocab = Vocabulary()
     # here we copy entire transformers vocab
     indexed = indexer.tokens_to_indices(allennlp_tokens, vocab)
     del indexed
     assert vocab.get_vocab_size(
         namespace=namespace) == tokenizer.vocab_size
Example #9
 def test_as_array_produces_token_sequence_roberta(self):
     tokenizer = AutoTokenizer.from_pretrained("roberta-base")
     allennlp_tokenizer = PretrainedTransformerTokenizer("roberta-base")
     indexer = PretrainedTransformerIndexer(model_name="roberta-base")
     string_specials = "<s> AllenNLP is great </s>"
     string_no_specials = "AllenNLP is great"
     tokens = tokenizer.tokenize(string_specials)
     expected_ids = tokenizer.convert_tokens_to_ids(tokens)
     # tokens tokenized with our pretrained tokenizer have indices in them
     allennlp_tokens = allennlp_tokenizer.tokenize(string_no_specials)
     vocab = Vocabulary()
     indexed = indexer.tokens_to_indices(allennlp_tokens, vocab, "key")
     assert indexed["key"] == expected_ids
Example #10
 def test_transformers_vocabs_added_correctly(self):
     namespace, model_name = "tags", "roberta-base"
     tokenizer = AutoTokenizer.from_pretrained(model_name)
     allennlp_tokenizer = PretrainedTransformerTokenizer(model_name)
     indexer = PretrainedTransformerIndexer(model_name=model_name,
                                            namespace=namespace)
     allennlp_tokens = allennlp_tokenizer.tokenize("AllenNLP is great!")
     vocab = Vocabulary()
     # here we copy entire transformers vocab
     indexed = indexer.tokens_to_indices(allennlp_tokens, vocab)
     del indexed
     assert vocab.get_token_to_index_vocabulary(
         namespace=namespace) == tokenizer.encoder
Example #11
 def check_vocab_size(model_name: str):
     namespace = "tags"
     tokenizer = AutoTokenizer.from_pretrained(model_name)
     allennlp_tokenizer = PretrainedTransformerTokenizer(model_name)
     indexer = PretrainedTransformerIndexer(model_name=model_name,
                                            namespace=namespace)
     allennlp_tokens = allennlp_tokenizer.tokenize("AllenNLP is great!")
     vocab = Vocabulary()
     # here we copy entire transformers vocab
     indexed = indexer.tokens_to_indices(allennlp_tokens, vocab)
     del indexed
     assert vocab.get_vocab_size(
         namespace=namespace) == tokenizer.vocab_size
Example #12
 def test_as_array_produces_token_sequence_bert_cased(self):
     tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
     allennlp_tokenizer = PretrainedTransformerTokenizer("bert-base-cased")
     indexer = PretrainedTransformerIndexer(model_name="bert-base-cased")
     string_specials = "[CLS] AllenNLP is great [SEP]"
     string_no_specials = "AllenNLP is great"
     tokens = tokenizer.tokenize(string_specials)
     expected_ids = tokenizer.convert_tokens_to_ids(tokens)
     # tokens tokenized with our pretrained tokenizer have indices in them
     allennlp_tokens = allennlp_tokenizer.tokenize(string_no_specials)
     vocab = Vocabulary()
     indexed = indexer.tokens_to_indices(allennlp_tokens, vocab)
     assert indexed["token_ids"] == expected_ids
Example #13
class MockOldDatasetReader(DatasetReader):
    def __init__(self,
                 model: str = "epwalsh/bert-xsmall-dummy",
                 **kwargs) -> None:
        super().__init__(**kwargs)
        self.tokenizer = PretrainedTransformerTokenizer(model)
        self.token_indexers = {"tokens": PretrainedTransformerIndexer(model)}

    def _read(self, file_path: str):
        for i in range(10):
            source = f"Hi there, I'm the {i}th instance"
            target = f"Hello, {i}th instance!"
            yield self.text_to_instance(source, target)

    def text_to_instance(self,
                         source: str,
                         target: str = None) -> Instance:  # type: ignore
        fields = {}
        fields["source"] = TextField(self.tokenizer.tokenize(source),
                                     self.token_indexers)  # type: ignore
        if target is not None:
            fields["target"] = TextField(self.tokenizer.tokenize(target),
                                         self.token_indexers)  # type: ignore
        return Instance(fields)  # type: ignore
Example #14
 def test_tokenizer_kwargs_default(self):
     text = "Hello there! General Kenobi."
     tokenizer = PretrainedTransformerTokenizer("bert-base-cased")
     original_tokens = [
         "[CLS]",
         "Hello",
         "there",
         "!",
         "General",
         "Ken",
         "##ob",
         "##i",
         ".",
         "[SEP]",
     ]
     tokenized = [token.text for token in tokenizer.tokenize(text)]
     assert tokenized == original_tokens
Example #15
 def test_splits_uncased_bert(self):
     sentence = "A, [MASK] AllenNLP sentence."
     expected_tokens = [
         "[CLS]",
         "a",
         ",",
         "[MASK]",
         "allen",
         "##nl",
         "##p",
         "sentence",
         ".",
         "[SEP]",
     ]
     tokenizer = PretrainedTransformerTokenizer("bert-base-uncased")
     tokens = [t.text for t in tokenizer.tokenize(sentence)]
     assert tokens == expected_tokens
Example #16
 def test_mask(self):
     allennlp_tokenizer = PretrainedTransformerTokenizer(
         "bert-base-uncased")
     indexer = PretrainedTransformerIndexer(model_name="bert-base-uncased")
     string_no_specials = "AllenNLP is great"
     allennlp_tokens = allennlp_tokenizer.tokenize(string_no_specials)
     vocab = Vocabulary()
     indexed = indexer.tokens_to_indices(allennlp_tokens, vocab)
     expected_masks = [1] * len(indexed["token_ids"])
     assert indexed["mask"] == expected_masks
     max_length = 10
     padding_lengths = {"token_ids": max_length, "mask": max_length}
     padded_tokens = indexer.as_padded_tensor_dict(indexed, padding_lengths)
     padding_length = max_length - len(indexed["mask"])
     expected_masks = expected_masks + ([0] * padding_length)
     assert len(padded_tokens["mask"]) == max_length
     assert padded_tokens["mask"].tolist() == expected_masks
Example #17
    def test_splits_roberta(self):
        tokenizer = PretrainedTransformerTokenizer("roberta-base")

        sentence = "A, <mask> AllenNLP sentence."
        expected_tokens = [
            "<s>",
            "ĠA",
            ",",
            "<mask>",
            "ĠAllen",
            "N",
            "LP",
            "Ġsentence",
            ".",
            "</s>",
        ]
        tokens = [t.text for t in tokenizer.tokenize(sentence)]
        assert tokens == expected_tokens
Example #18
    def test_splits_cased_bert(self):
        tokenizer = PretrainedTransformerTokenizer("bert-base-cased")

        sentence = "A, [MASK] AllenNLP sentence."
        expected_tokens = [
            "[CLS]",
            "A",
            ",",
            "[MASK]",
            "Allen",
            "##NL",
            "##P",
            "sentence",
            ".",
            "[SEP]",
        ]
        tokens = [t.text for t in tokenizer.tokenize(sentence)]
        assert tokens == expected_tokens

        # sentence pair
        sentence_1 = "A, [MASK] AllenNLP sentence."
        sentence_2 = "A sentence."
        expected_tokens = [
            "[CLS]",
            "A",
            ",",
            "[MASK]",
            "Allen",
            "##NL",
            "##P",
            "sentence",
            ".",
            "[SEP]",
            "A",
            "sentence",
            ".",
            "[SEP]",
        ]
        tokens = [
            t.text
            for t in tokenizer.tokenize_sentence_pair(sentence_1, sentence_2)
        ]
        assert tokens == expected_tokens
Example #19
class TransformersTokenizer(Tokenizer):
    """This tokenizer uses the pretrained tokenizers from huggingface's transformers library.

    This means the output will typically be word pieces, depending on the specified pretrained model.

    Parameters
    ----------
    config
        A `TokenizerConfiguration` object
    """

    def __init__(self, config):
        super().__init__(config)
        self.pretrained_tokenizer = PretrainedTransformerTokenizer(
            **config.transformers_kwargs
        )

    def tokenize_document(self, document: List[str]) -> List[List[Token]]:
        texts = [
            self.text_cleaning(text)[: self.config.truncate_input] for text in document
        ]
        if not self.config.segment_sentences:
            return list(map(self._tokenize, texts[: self.config.max_nr_of_sentences]))

        sentences = [
            sentence.text.strip()[: self.config.truncate_sentence]
            for doc in self.__nlp__.pipe(texts)
            for sentence in doc.sents
            if (
                self.config.min_sentence_length
                < len(sentence.text.strip())
                < self.config.max_sentence_length
            )
        ]
        return list(map(self._tokenize, sentences[: self.config.max_nr_of_sentences]))

    def _tokenize(self, text: str) -> List[Token]:
        return self.pretrained_tokenizer.tokenize(text)

    @property
    def nlp(self) -> Language:
        raise NotImplementedError("The TransformersTokenizer does not use a spaCy nlp pipeline")
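Since this class is a thin wrapper, a minimal sketch of the underlying AllenNLP tokenizer illustrates the word-piece output (the model name and sentence are illustrative, not taken from the configuration above):

from allennlp.data.tokenizers import PretrainedTransformerTokenizer

# Minimal sketch: the underlying HuggingFace tokenizer splits words into word
# pieces and adds the model's special tokens by default.
tokenizer = PretrainedTransformerTokenizer("bert-base-uncased")
print([t.text for t in tokenizer.tokenize("AllenNLP is great")])
# -> ['[CLS]', 'allen', '##nl', '##p', 'is', 'great', '[SEP]']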
Example #20
 def test_tokenizer_kwargs_forced_lowercase(self):
     text = "Hello there! General Kenobi."
     forced_lowercase_tokenizer = PretrainedTransformerTokenizer(
         "bert-base-cased", tokenizer_kwargs={"do_lower_case": True})
     assert forced_lowercase_tokenizer._tokenizer_lowercases
     tokenized = [
         token.text for token in forced_lowercase_tokenizer.tokenize(text)
     ]
     lowercase_tokens = [
         "[CLS]",
         "hello",
         "there",
         "!",
         "general",
         "k",
         "##eno",
         "##bi",
         ".",
         "[SEP]",
     ]
     assert tokenized == lowercase_tokens
Example #21
 def test_token_idx_bert_uncased(self):
     sentence = "A, naïve [MASK] AllenNLP sentence."
     expected_tokens = [
         "[CLS]",
         "a",
         ",",
         "naive",  # BERT normalizes this away
         "[MASK]",
         "allen",
         "##nl",
         "##p",
         "sentence",
         ".",
         "[SEP]",
     ]
     expected_idxs = [None, 0, 1, 3, 9, 16, 21, 23, 25, 33, None]
     tokenizer = PretrainedTransformerTokenizer("bert-base-uncased")
     tokenized = tokenizer.tokenize(sentence)
     tokens = [t.text for t in tokenized]
     assert tokens == expected_tokens
     idxs = [t.idx for t in tokenized]
     assert idxs == expected_idxs
Example #22
 def test_token_idx_roberta(self):
     sentence = "A, naïve <mask> AllenNLP sentence."
     expected_tokens = [
         "<s>",
         "ĠA",
         ",",
         "Ġnaïve",  # RoBERTa mangles this. Or maybe it "encodes"?
         "<mask>",
         "ĠAllen",
         "N",
         "LP",
         "Ġsentence",
         ".",
         "</s>",
     ]
     expected_idxs = [None, 0, 1, 3, 9, 16, 21, 22, 25, 33, None]
     tokenizer = PretrainedTransformerTokenizer("roberta-base")
     tokenized = tokenizer.tokenize(sentence)
     tokens = [t.text for t in tokenized]
     assert tokens == expected_tokens
     idxs = [t.idx for t in tokenized]
     assert idxs == expected_idxs
Example #23
class CustomTextDatasetReader(DatasetReader):
    def __init__(self,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 balance_classes=False,
                 **kwargs):

        self.line = 0
        super().__init__(**kwargs)
        # max_length ensures that we truncate the input
        self._tokenizer = PretrainedTransformerTokenizer(
            model_name="roberta-base", max_length=TRANSFORMER_WORDPIECE_LIMIT)
        self._token_indexers = token_indexers
        self.balance_classes = balance_classes

    @overrides
    def text_to_instance(self, doc, label=None):
        # self.line += 1
        fields: Dict[str, Field] = {}
        tokens = self._tokenizer.tokenize(doc)
        # Check for None first, otherwise len(None) would raise a TypeError.
        if tokens is None or len(tokens) == 0:
            raise ValueError("Data contains empty examples, needs fixing...")
        fields["tokens"] = TextField(tokens,
                                     token_indexers=self._token_indexers)
        if label is not None:
            fields["label"] = LabelField(label)
        return Instance(fields)

    @overrides
    def _read(self, filepath):
        with open(filepath) as f:
            data = pd.read_csv(f, header=None, names=['reviews', 'labels'])
            for i, (idx, row) in enumerate(data.iterrows()):
                doc = row['reviews']
                label = str(row['labels'])
                instance = self.text_to_instance(doc, label)
                if instance is not None:
                    yield instance
Example #24
    def test_long_sequence_splitting(self):
        tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
        allennlp_tokenizer = PretrainedTransformerTokenizer(
            "bert-base-uncased")
        indexer = PretrainedTransformerIndexer(model_name="bert-base-uncased",
                                               max_length=4)
        string_specials = "[CLS] AllenNLP is great [SEP]"
        string_no_specials = "AllenNLP is great"
        tokens = tokenizer.tokenize(string_specials)
        expected_ids = tokenizer.convert_tokens_to_ids(tokens)
        assert len(
            expected_ids) == 7  # just to make sure it's what we're expecting
        cls_id, sep_id = expected_ids[0], expected_ids[-1]
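        # With max_length=4, the indexer splits the wordpiece ids into segments of at
        # most 4 ids (special tokens included) and re-inserts [SEP]/[CLS] at each
        # boundary, which is the layout reconstructed below.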
        expected_ids = (expected_ids[:3] + [sep_id, cls_id] +
                        expected_ids[3:5] + [sep_id, cls_id] +
                        expected_ids[5:])

        allennlp_tokens = allennlp_tokenizer.tokenize(string_no_specials)
        vocab = Vocabulary()
        indexed = indexer.tokens_to_indices(allennlp_tokens, vocab)
        assert indexed["token_ids"] == expected_ids
        assert indexed["segment_concat_mask"] == [1] * len(expected_ids)
        assert indexed["mask"] == [1] * 7  # original length
Example #25
 def test_token_idx_roberta(self):
     sentence = "A, naïve <mask> AllenNLP sentence."
     expected_tokens = [
         "<s>",
         "A",
         ",",
         "Ġnaïve",  # RoBERTa has a funny way of encoding combining characters.
         "<mask>",
         "Allen",
         "N",
         "LP",
         "Ġsentence",
         ".",
         "</s>",
     ]
     expected_idxs = [None, 0, 1, None, 9, 16, 21, 22, 25, 33, None]
     tokenizer = PretrainedTransformerTokenizer(
         "roberta-base", calculate_character_offsets=True)
     tokenized = tokenizer.tokenize(sentence)
     tokens = [t.text for t in tokenized]
     assert tokens == expected_tokens
     idxs = [t.idx for t in tokenized]
     assert idxs == expected_idxs
Example #26
 def test_token_idx_bert_cased(self):
     sentence = "A, naïve [MASK] AllenNLP sentence."
     expected_tokens = [
         "[CLS]",
         "A",
         ",",
         "na",
         "##ï",
         "##ve",
         "[MASK]",
         "Allen",
         "##NL",
         "##P",
         "sentence",
         ".",
         "[SEP]",
     ]
     expected_idxs = [None, 0, 1, 3, 5, 6, 9, 16, 21, 23, 25, 33, None]
     tokenizer = PretrainedTransformerTokenizer("bert-base-cased")
     tokenized = tokenizer.tokenize(sentence)
     tokens = [t.text for t in tokenized]
     assert tokens == expected_tokens
     idxs = [t.idx for t in tokenized]
     assert idxs == expected_idxs
Example #27
class PretrainedTransformerIndexer(TokenIndexer):
    """
    This `TokenIndexer` assumes that Tokens already have their indexes in them (see `text_id` field).
    We still require `model_name` because we want to form the AllenNLP vocabulary from the pretrained one.
    This `Indexer` is only really appropriate to use if you've also used a
    corresponding :class:`PretrainedTransformerTokenizer` to tokenize your input.  Otherwise you'll
    have a mismatch between your tokens and your vocabulary, and you'll get a lot of UNK tokens.

    Registered as a `TokenIndexer` with name "pretrained_transformer".

    # Parameters

    model_name : `str`
        The name of the `transformers` model to use.
    namespace : `str`, optional (default=`tags`)
        We will add the tokens from the transformers vocabulary to this vocabulary namespace.
        We use a somewhat confusing default value of `tags` so that we do not add padding or UNK
        tokens to this namespace, which would break on loading because we wouldn't find our default
        OOV token.
    max_length : `int`, optional (default = `None`)
        If not None, split the document into segments of this many tokens (including special tokens)
        before feeding into the embedder. The embedder embeds these segments independently and
        concatenates the results to get the original document representation. Should be set to
        the same value as the `max_length` option on the `PretrainedTransformerEmbedder`.
    """
    def __init__(self,
                 model_name: str,
                 namespace: str = "tags",
                 max_length: int = None,
                 **kwargs) -> None:
        super().__init__(**kwargs)
        self._namespace = namespace
        self._allennlp_tokenizer = PretrainedTransformerTokenizer(model_name)
        self._tokenizer = self._allennlp_tokenizer.tokenizer
        self._added_to_vocabulary = False

        self._num_added_start_tokens = len(
            self._allennlp_tokenizer.single_sequence_start_tokens)
        self._num_added_end_tokens = len(
            self._allennlp_tokenizer.single_sequence_end_tokens)

        self._max_length = max_length
        if self._max_length is not None:
            num_added_tokens = len(self._allennlp_tokenizer.tokenize("a")) - 1
            self._effective_max_length = (  # we need to take into account special tokens
                self._max_length - num_added_tokens)
            if self._effective_max_length <= 0:
                raise ValueError(
                    "max_length needs to be greater than the number of special tokens inserted."
                )

    def _add_encoding_to_vocabulary_if_needed(self, vocab: Vocabulary) -> None:
        """
        Copies tokens from ```transformers``` model's vocab to the specified namespace.
        """
        if self._added_to_vocabulary:
            return

        try:
            vocab_items = self._tokenizer.get_vocab().items()
        except NotImplementedError:
            vocab_items = ((self._tokenizer.convert_ids_to_tokens(idx), idx)
                           for idx in range(self._tokenizer.vocab_size))
        for word, idx in vocab_items:
            vocab._token_to_index[self._namespace][word] = idx
            vocab._index_to_token[self._namespace][idx] = word

        self._added_to_vocabulary = True

    @overrides
    def count_vocab_items(self, token: Token, counter: Dict[str, Dict[str,
                                                                      int]]):
        # If we only use pretrained models, we don't need to do anything here.
        pass

    @overrides
    def tokens_to_indices(self, tokens: List[Token],
                          vocabulary: Vocabulary) -> IndexedTokenList:
        self._add_encoding_to_vocabulary_if_needed(vocabulary)

        indices, type_ids = self._extract_token_and_type_ids(tokens)
        # The mask has 1 for real tokens and 0 for padding tokens. Only real tokens are attended to.
        output: IndexedTokenList = {
            "token_ids": indices,
            "mask": [True] * len(indices),
            "type_ids": type_ids,
        }

        return self._postprocess_output(output)

    @overrides
    def indices_to_tokens(self, indexed_tokens: IndexedTokenList,
                          vocabulary: Vocabulary) -> List[Token]:
        token_ids = indexed_tokens["token_ids"]
        type_ids = indexed_tokens.get("type_ids")

        return [
            Token(
                text=vocabulary.get_token_from_index(token_ids[i],
                                                     self._namespace),
                text_id=token_ids[i],
                type_id=type_ids[i] if type_ids is not None else None,
            ) for i in range(len(token_ids))
        ]

    def _extract_token_and_type_ids(
            self,
            tokens: List[Token]) -> Tuple[List[int], Optional[List[int]]]:
        """
        Roughly equivalent to `zip(*[(token.text_id, token.type_id) for token in tokens])`,
        with some checks.
        """
        indices: List[int] = []
        type_ids: List[int] = []
        for token in tokens:
            if getattr(token, "text_id", None) is not None:
                # `text_id` being set on the token means that we aren't using the vocab, we just use
                # this id instead. Id comes from the pretrained vocab.
                # It is computed in PretrainedTransformerTokenizer.
                indices.append(token.text_id)
            else:
                raise KeyError(
                    "Using PretrainedTransformerIndexer but field text_id is not set"
                    f" for the following token: {token.text}")

            if type_ids is not None and getattr(token, "type_id",
                                                None) is not None:
                type_ids.append(token.type_id)
            else:
                type_ids.append(0)

        return indices, type_ids

    def _postprocess_output(self,
                            output: IndexedTokenList) -> IndexedTokenList:
        """
        Takes an IndexedTokenList about to be returned by `tokens_to_indices()` and adds any
        necessary postprocessing, e.g. long sequence splitting.

        The input should have a `"token_ids"` key corresponding to the token indices. They should
        have special tokens already inserted.
        """
        if self._max_length is not None:
            # We prepare long indices by converting them to (assuming max_length == 5)
            # [CLS] A B C [SEP] [CLS] D E F [SEP] ...
            # Embedder is responsible for folding this 1-d sequence to 2-d and feed to the
            # transformer model.
            # TODO(zhaofengw): we aren't respecting word boundaries when segmenting wordpieces.

            indices = output["token_ids"]
            # Strips original special tokens
            indices = indices[self._num_added_start_tokens:-self.
                              _num_added_end_tokens]
            # Folds indices
            folded_indices = [
                indices[i:i + self._effective_max_length]
                for i in range(0, len(indices), self._effective_max_length)
            ]
            # Adds special tokens to each segment
            folded_indices = [
                self._tokenizer.build_inputs_with_special_tokens(segment)
                for segment in folded_indices
            ]
            # Flattens
            indices = [i for segment in folded_indices for i in segment]

            output["token_ids"] = indices
            output["type_ids"] = [0] * len(indices)
            output["segment_concat_mask"] = [True] * len(indices)

        return output

    @overrides
    def get_empty_token_list(self) -> IndexedTokenList:
        output: IndexedTokenList = {
            "token_ids": [],
            "mask": [],
            "type_ids": []
        }
        if self._max_length is not None:
            output["segment_concat_mask"] = []
        return output

    @overrides
    def as_padded_tensor_dict(
            self, tokens: IndexedTokenList,
            padding_lengths: Dict[str, int]) -> Dict[str, torch.Tensor]:
        tensor_dict = {}
        for key, val in tokens.items():
            if key == "type_ids":
                padding_value = 0
                mktensor = torch.LongTensor
            elif key == "mask" or key == "wordpiece_mask":
                padding_value = False
                mktensor = torch.BoolTensor
            elif len(val) > 0 and isinstance(val[0], bool):
                padding_value = False
                mktensor = torch.BoolTensor
            else:
                padding_value = self._tokenizer.pad_token_id
                if padding_value is None:
                    padding_value = (
                        0  # Some tokenizers don't have padding tokens and rely on the mask only.
                    )
                mktensor = torch.LongTensor

            tensor = mktensor(
                pad_sequence_to_length(val,
                                       padding_lengths[key],
                                       default_value=lambda: padding_value))

            tensor_dict[key] = tensor
        return tensor_dict

    def __eq__(self, other):
        if isinstance(other, PretrainedTransformerIndexer):
            for key in self.__dict__:
                if key == "_tokenizer":
                    # This is a reference to a function in the huggingface code, which we can't
                    # really modify to make this clean.  So we special-case it.
                    continue
                if self.__dict__[key] != other.__dict__[key]:
                    return False
            return True
        return NotImplemented
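A short, hedged usage sketch of the indexer above (model name and sentence are only illustrative): with max_length set, the output additionally carries a segment_concat_mask covering the concatenated segments, while mask still covers only the original wordpieces.

from allennlp.data import Vocabulary
from allennlp.data.tokenizers import PretrainedTransformerTokenizer
from allennlp.data.token_indexers import PretrainedTransformerIndexer

# Tokenizer and indexer must share the model name so that the token ids match the
# pretrained vocabulary copied into the "tags" namespace.
tokenizer = PretrainedTransformerTokenizer("bert-base-uncased")
indexer = PretrainedTransformerIndexer(model_name="bert-base-uncased", max_length=4)
indexed = indexer.tokens_to_indices(tokenizer.tokenize("AllenNLP is great"), Vocabulary())
print(sorted(indexed.keys()))  # ['mask', 'segment_concat_mask', 'token_ids', 'type_ids']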
Example #28
class TweetCandidateSpanDatasetReader(DatasetReader):
    def __init__(
        self,
        lazy: bool = False,
        cache_directory: Optional[str] = None,
        max_instances: Optional[int] = None,
        min_num_candidate: int = 3,
        max_num_candidate: int = 5,
        transformer_model_name_or_archive_path: str = "bert-base-uncased",
    ) -> None:
        super().__init__(lazy=lazy,
                         cache_directory=cache_directory,
                         max_instances=max_instances)
        if "tar.gz" in transformer_model_name_or_archive_path:
            config = extract_config_from_archive(
                transformer_model_name_or_archive_path)
            model_name = config.as_dict(
            )["dataset_reader"]["tokenizer"]["model_name"]
        else:
            model_name = transformer_model_name_or_archive_path
        self._tokenizer = PretrainedTransformerTokenizer(
            model_name=model_name, add_special_tokens=False)
        self._tokenindexer = PretrainedTransformerIndexer(
            model_name=model_name)
        self._min_num_candidate = min_num_candidate
        self._max_num_candidate = max_num_candidate

    def _read(self, file_path: str) -> Iterable[Instance]:
        file_path = cached_path(file_path)
        df = pd.read_json(file_path, lines=True)
        for record in df.to_dict("records"):
            if record["selected_text"]:
                text = record["text"]
                if not isinstance(text, str):
                    continue
                elif text.strip() == "":
                    continue
                elif len(record["candidate_spans"]) < self._min_num_candidate:
                    continue
                else:
                    yield self.text_to_instance(
                        " " + text.strip(),
                        record["sentiment"],
                        record["candidate_spans"],
                        record["textID"],
                        record.get("selected_text"),
                        record.get("selected_text_span"),
                    )

    def text_to_instance(
        self,
        text: str,
        sentiment: str,
        candidate_spans: list,
        text_id: Optional[str] = None,
        selected_text: Optional[str] = None,
        selected_text_span: Optional[tuple] = None,
    ) -> Instance:
        fields = {}
        text_tokens = self._tokenizer.tokenize(text)
        sentiment_tokens = self._tokenizer.tokenize(sentiment)
        text_with_sentiment_tokens = self._tokenizer.add_special_tokens(
            text_tokens, sentiment_tokens)
        fields["text_with_sentiment"] = TextField(
            text_with_sentiment_tokens, {"tokens": self._tokenindexer})
        candidate_spans = [
            tuple(i) for i in candidate_spans[:self._max_num_candidate]
        ]
        additional_metadata = {}
        if selected_text_span is not None:
            selected_text_span = tuple(selected_text_span)
            additional_metadata["selected_text_span"] = selected_text_span
            if selected_text_span not in candidate_spans:
                candidate_spans.append(selected_text_span)
                fields["label"] = LabelField(len(candidate_spans) - 1,
                                             skip_indexing=True)
                have_truth = False
            else:
                fields["label"] = LabelField(
                    candidate_spans.index(selected_text_span),
                    skip_indexing=True)
                have_truth = True
            additional_metadata["have_truth"] = have_truth
            additional_metadata["candidate_num"] = len(candidate_spans)
        fields["candidate_span_pairs"] = SpanPairsField(
            candidate_spans, fields["text_with_sentiment"])
        metadata = {
            "text": text,
            "sentiment": sentiment,
            "selected_text": selected_text,
            "text_with_sentiment_tokens": text_with_sentiment_tokens
        }
        if text_id is not None:
            metadata["text_id"] = text_id
        if additional_metadata:
            metadata.update(additional_metadata)
        fields["metadata"] = MetadataField(metadata)
        return Instance(fields)

    def span_to_str(self, text, span_start, span_end):
        text_tokens = self._tokenizer.tokenize(text)
        text_tokens = self._tokenizer.add_special_tokens(text_tokens)
        return span_tokens_to_text(text, text_tokens, span_start, span_end)
Example #29
class TransformerSuperGlueRteReader(DatasetReader):
    """
    Dataset reader for the SuperGLUE Recognizing Textual Entailment task, to be used with a transformer
    model such as RoBERTa. The dataset is in the JSON Lines format.

    It will generate `Instances` with the following fields:

     * `tokens`, a `TextField` that contains the concatenation of premise and hypothesis,
     * `label`, a `LabelField` containing the label, if one exists.
     * `metadata`, a `MetadataField` that stores the instance's index in the file, the original premise,
       the original hypothesis, both of these in tokenized form, and the gold label, accessible as
       `metadata['index']`, `metadata['premise']`, `metadata['hypothesis']`, `metadata['tokens']`,
       and `metadata['label']`.

    # Parameters

    transformer_model_name : `str`, optional (default=`"roberta-base"`)
        This reader chooses the tokenizer and token indexer according to this setting.
    """

    def __init__(
        self,
        transformer_model_name: str = "roberta-base",
        tokenizer_kwargs: Dict[str, Any] = None,
        **kwargs
    ) -> None:
        super().__init__(
            manual_distributed_sharding=True, manual_multiprocess_sharding=True, **kwargs
        )
        self._tokenizer = PretrainedTransformerTokenizer(
            transformer_model_name,
            add_special_tokens=False,
            tokenizer_kwargs=tokenizer_kwargs,
        )
        self._token_indexers = {
            "tokens": PretrainedTransformerIndexer(
                transformer_model_name, tokenizer_kwargs=tokenizer_kwargs, max_length=512
            )
        }

    @overrides
    def _read(self, file_path: str):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path, extract_archive=True)

        logger.info("Reading file at %s", file_path)
        yielded_relation_count = 0
        from allennlp.common.file_utils import json_lines_from_file

        for relation in self.shard_iterable(json_lines_from_file(file_path)):
            premise = relation["premise"]
            hypothesis = relation["hypothesis"]
            if "label" in relation:
                label = relation["label"]
            else:
                label = None
            index = relation["idx"]

            # todo: see if we even need this to be in a separate method
            instance = self.text_to_instance(index, label, premise, hypothesis)

            yield instance
            yielded_relation_count += 1

    @overrides
    def text_to_instance(
        self,
        index: int,
        label: str,
        premise: str,
        hypothesis: str,
    ) -> Instance:
        tokenized_premise = self._tokenizer.tokenize(premise)
        tokenized_hypothesis = self._tokenizer.tokenize(hypothesis)

        fields = {}

        premise_and_hypothesis = TextField(
            self._tokenizer.add_special_tokens(tokenized_premise, tokenized_hypothesis),
        )
        fields["tokens"] = TextField(premise_and_hypothesis)

        # make the metadata
        metadata = {
            "premise": premise,
            "premise_tokens": tokenized_premise,
            "hypothesis": hypothesis,
            "hypothesis_tokens": tokenized_hypothesis,
            "index": index,
        }
        if label:
            fields["label"] = LabelField(label)
            metadata["label"] = label

        fields["metadata"] = MetadataField(metadata)

        return Instance(fields)

    @overrides
    def apply_token_indexers(self, instance: Instance) -> None:
        instance["tokens"].token_indexers = self._token_indexers
Example #30
class TransformerMCReader(DatasetReader):
    """
    Read input data for the TransformerMC model. This is the base class for all readers that produce
    data for TransformerMC.

    Instances have three fields:
     * `alternatives`, a `ListField` of `TextField`
     * `correct_alternative`, `IndexField` with the correct answer among `alternatives`
     * `qid`, a `MetadataField` containing question ids

    Parameters
    ----------
    transformer_model_name : `str`, optional (default=`"roberta-large"`)
        This reader chooses tokenizer and token indexer according to this setting.
    length_limit : `int`, optional (default=`512`)
        We will make sure that the length of an alternative never exceeds this many word pieces.
    """
    def __init__(self,
                 transformer_model_name: str = "roberta-large",
                 length_limit: int = 512,
                 **kwargs) -> None:
        super().__init__(**kwargs)
        from allennlp.data.tokenizers import PretrainedTransformerTokenizer

        self._tokenizer = PretrainedTransformerTokenizer(
            transformer_model_name, add_special_tokens=False)
        from allennlp.data.token_indexers import PretrainedTransformerIndexer

        self._token_indexers = {
            "tokens": PretrainedTransformerIndexer(transformer_model_name)
        }
        self.length_limit = length_limit

    def text_to_instance(
        self,  # type: ignore
        qid: str,
        start: str,
        alternatives: List[str],
        label: Optional[int] = None,
    ) -> Instance:
        # tokenize
        start = self._tokenizer.tokenize(start)

        sequences = []
        for alternative in alternatives:
            alternative = self._tokenizer.tokenize(alternative)
            length_for_start = (self.length_limit - len(alternative) -
                                self._tokenizer.num_special_tokens_for_pair())
            if length_for_start < 0:
                # If the alternative is too long by itself, we take the beginning and add no tokens from the start.
                alternative = alternative[:length_for_start]
                length_for_start = 0
            sequences.append(
                self._tokenizer.add_special_tokens(start[:length_for_start],
                                                   alternative))

        # make fields
        from allennlp.data.fields import TextField

        sequences = [
            TextField(sequence, self._token_indexers) for sequence in sequences
        ]
        from allennlp.data.fields import ListField

        sequences = ListField(sequences)

        from allennlp.data.fields import MetadataField

        fields = {
            "alternatives": sequences,
            "qid": MetadataField(qid),
        }

        if label is not None:
            if label < 0 or label >= len(sequences):
                raise ValueError("Alternative %d does not exist", label)
            from allennlp.data.fields import IndexField

            fields["correct_alternative"] = IndexField(label, sequences)

        return Instance(fields)
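Finally, a hedged sketch of building a single instance with the base reader above (the qid and texts are made-up placeholders):

# Illustrative only: qid, start, and alternatives are placeholders.
reader = TransformerMCReader(transformer_model_name="roberta-large")
instance = reader.text_to_instance(
    qid="q0",
    start="The bottle was knocked over, so",
    alternatives=["the water spilled.", "the water stayed put."],
    label=0,
)
print(instance["alternatives"], instance["correct_alternative"])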