Example #1
def tmp():
    config = "configs/bert_pretrain.jsonnet"
    serialization_dir = "models"
    output_dir = "bert_out"
    tokenizer_conllu_path = "data/coptic/converted/train"
    documents = read_conllu_files(tokenizer_conllu_path)
    sentences = []
    for document in documents:
        for sentence in document:
            sentences.append(" ".join([t['form'] for t in sentence]))
    print("Training tokenizer...")
    os.environ["TOKENIZER_PATH"] = output_dir

    t = train_bert_tokenizer(sentences,
                             serialize_path=output_dir,
                             vocab_size=6000)
    tok = PretrainedTransformerTokenizer("./bert_out/")
    idx = PretrainedTransformerMismatchedIndexer("./bert_out/")
    vocab = Vocabulary()
    vocab.set_from_file("bert_out/vocab.txt",
                        oov_token="[UNK]",
                        is_padded=True)
    s = tok.tokenize(sentences[1])
    i = idx.tokens_to_indices(s, vocab)
    print(i)
    print(t)
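
This snippet omits its imports and relies on two project-specific helpers (read_conllu_files and train_bert_tokenizer) that are not shown here. A minimal sketch of the AllenNLP imports it assumes, using the 1.x/2.x module layout:

import os

from allennlp.data import Vocabulary
from allennlp.data.tokenizers import PretrainedTransformerTokenizer
from allennlp.data.token_indexers import PretrainedTransformerMismatchedIndexer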
Example #2
    def __init__(self,
                 transformer_model_name: str = "bert-base-cased",
                 length_limit: int = 384,
                 question_length_limit: int = 64,
                 stride: int = 128,
                 raise_errors: bool = False,
                 tokenizer_kwargs: Dict[str, Any] = None,
                 one_instance_per_query: bool = False,
                 max_instances: int = None,
                 **kwargs) -> None:
        """
        Initialize the RecordTaskReader.
        """
        super(RecordTaskReader,
              self).__init__(manual_distributed_sharding=True, **kwargs)

        # Save the values passed to __init__ to protected attributes
        self._tokenizer = PretrainedTransformerTokenizer(
            transformer_model_name,
            add_special_tokens=False,
            tokenizer_kwargs=tokenizer_kwargs,
        )
        self._token_indexers = {
            "tokens":
            PretrainedTransformerIndexer(transformer_model_name,
                                         tokenizer_kwargs=tokenizer_kwargs)
        }
        self._length_limit = length_limit
        self._query_len_limit = question_length_limit
        self._stride = stride
        self._raise_errors = raise_errors
        self._cls_token = '@placeholder'
        self._max_instances = max_instances
        self._one_instance_per_query = one_instance_per_query

    def __init__(self, model_path=None, cuda_device=1):
        # model_path = model_path or LSTM_MODEL_PATH
        model_path = model_path or ROBERTA_MODEL_PATH
        self.predictor = Predictor.from_path(model_path,
                                             cuda_device=cuda_device)

        _tokenizer = PretrainedTransformerTokenizer(
            model_name="roberta-base", max_length=TRANSFORMER_WORDPIECE_LIMIT)
        class_name_mapper = {"0": "Negative", "1": "Positive"}
        _model = self.predictor._model
        _label_namespace = _model._label_namespace
        class_names = [
            class_name_mapper[_model.vocab.get_index_to_token_vocabulary(
                _label_namespace).get(0)],
            class_name_mapper[_model.vocab.get_index_to_token_vocabulary(
                _label_namespace).get(1)]
        ]
        # reset the tokenizer to remove separators
        self.tokenizer = lambda s: [
            t.text.replace("Ġ", "").replace('Ċ', '').replace('ĉ', "")
            for t in _tokenizer.tokenize(s)
        ][1:-1]
        self.explainer_lime = LimeTextExplainer(
            class_names=class_names, split_expression=self.tokenizer)
        self.explainer_integrate = IntegratedGradient(self.predictor)
        self.explainer_simple = SimpleGradient(self.predictor)

    def __init__(self,
                 pretrained_model: str,
                 max_pieces: int = 512,
                 num_choices: int = 4,
                 add_prefix: Dict[str, str] = None,
                 sample: int = -1) -> None:
        super().__init__()

        self._tokenizer = PretrainedTransformerTokenizer(pretrained_model)
        self._tokenizer_no_special_tokens = PretrainedTransformerTokenizer(
            pretrained_model, add_special_tokens=False)
        # self._tokenizer_internal = self._tokenizer._tokenizer
        self._tokenizer_internal = self._tokenizer.tokenizer
        token_indexer = PretrainedTransformerIndexer(pretrained_model)
        self._token_indexers = {'tokens': token_indexer}

        self._max_pieces = max_pieces
        self._sample = sample
        self._num_choices = num_choices
        self._add_prefix = add_prefix or {}

        for model in [
                "roberta", "bert", "openai-gpt", "gpt2", "transfo-xl", "xlnet",
                "xlm"
        ]:
            if model in pretrained_model:
                self._model_type = model
                break
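
These readers all follow the same pattern: a PretrainedTransformerTokenizer produces wordpiece Tokens, and a PretrainedTransformerIndexer built from the same model name maps them onto the pretrained vocabulary ids. A minimal sketch of that round trip (the model name and sentence are arbitrary choices, not taken from the readers above):

from allennlp.data import Vocabulary
from allennlp.data.tokenizers import PretrainedTransformerTokenizer
from allennlp.data.token_indexers import PretrainedTransformerIndexer

tokenizer = PretrainedTransformerTokenizer("bert-base-uncased")
indexer = PretrainedTransformerIndexer(model_name="bert-base-uncased")

tokens = tokenizer.tokenize("AllenNLP is great")           # [CLS] ... [SEP] wordpieces
indexed = indexer.tokens_to_indices(tokens, Vocabulary())  # "token_ids", "mask", "type_ids"
print(indexed["token_ids"])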
Example #5
    def __init__(self,
                 model_name: str,
                 namespace: str = "tags",
                 max_length: int = None,
                 **kwargs) -> None:
        super().__init__(**kwargs)
        self._namespace = namespace
        self._allennlp_tokenizer = PretrainedTransformerTokenizer(model_name)
        self._tokenizer = self._allennlp_tokenizer.tokenizer
        self._added_to_vocabulary = False

        self._num_added_start_tokens = len(
            self._allennlp_tokenizer.single_sequence_start_tokens)
        self._num_added_end_tokens = len(
            self._allennlp_tokenizer.single_sequence_end_tokens)

        self._max_length = max_length
        if self._max_length is not None:
            num_added_tokens = len(self._allennlp_tokenizer.tokenize("a")) - 1
            self._effective_max_length = (  # we need to take into account special tokens
                self._max_length - num_added_tokens)
            if self._effective_max_length <= 0:
                raise ValueError(
                    "max_length needs to be greater than the number of special tokens inserted."
                )

    def __init__(self,
                 transformer_model_name: str = "bert-base-cased",
                 length_limit: int = 384,
                 stride: int = 128,
                 skip_invalid_examples: bool = False,
                 max_query_length: int = 64,
                 **kwargs) -> None:
        super().__init__(**kwargs)
        self._tokenizer = PretrainedTransformerTokenizer(
            transformer_model_name,
            add_special_tokens=False,
            calculate_character_offsets=True)
        self._token_indexers = {
            "tokens": PretrainedTransformerIndexer(transformer_model_name)
        }
        self.length_limit = length_limit
        self.stride = stride
        self.skip_invalid_examples = skip_invalid_examples
        self.max_query_length = max_query_length
        self.non_content_type_id = max(
            self._tokenizer.tokenizer.encode_plus(
                "left", "right", return_token_type_ids=True)["token_type_ids"])

        # workaround for a bug in the transformers library
        if "distilbert" in transformer_model_name:
            self.non_content_type_id = 0
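
The len(self._allennlp_tokenizer.tokenize("a")) - 1 line above is a small trick for counting the special tokens the tokenizer wraps around a single-wordpiece input; a quick sketch of the same idea (for bert-base-uncased the count is 2, one [CLS] and one [SEP]):

from allennlp.data.tokenizers import PretrainedTransformerTokenizer

tokenizer = PretrainedTransformerTokenizer("bert-base-uncased")
num_special = len(tokenizer.tokenize("a")) - 1  # "a" is a single wordpiece
print(num_special)                              # 2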
Example #7
 def __init__(
     self,
     sent1_col: str,
     sent2_col: str = None,
     label_col: str = 'label',
     bert_model: str = 'bert-base-uncased',
     max_sequence_length: int = 500,
     skip_label_indexing: bool = False,
     lower: bool = True,
     lazy: bool = False,
 ) -> None:
     super().__init__(lazy=lazy)
     self._sent1_col = sent1_col
     self._sent2_col = sent2_col
     self._label_col = label_col
     self._tokenizer = PretrainedTransformerTokenizer(
         bert_model,
         add_special_tokens=False,
         max_length=max_sequence_length
     )  # type: PretrainedTransformerTokenizer
     self._max_sequence_length = max_sequence_length
     self._skip_label_indexing = skip_label_indexing
     self._lower = lower
     self._token_indexers = {
         "tokens": PretrainedTransformerIndexer(model_name=bert_model)
     }
Example #8
    def test_end_to_end(self, train_parameters: bool, last_layer_only: bool):
        tokenizer = PretrainedTransformerTokenizer(
            model_name="bert-base-uncased")
        token_indexer = PretrainedTransformerIndexer(
            model_name="bert-base-uncased")

        sentence1 = "A, AllenNLP sentence."
        tokens1 = tokenizer.tokenize(sentence1)
        expected_tokens1 = [
            "[CLS]", "a", ",", "allen", "##nl", "##p", "sentence", ".", "[SEP]"
        ]
        assert [t.text for t in tokens1] == expected_tokens1

        sentence2 = "AllenNLP is great"
        tokens2 = tokenizer.tokenize(sentence2)
        expected_tokens2 = [
            "[CLS]", "allen", "##nl", "##p", "is", "great", "[SEP]"
        ]
        assert [t.text for t in tokens2] == expected_tokens2

        vocab = Vocabulary()

        params = Params({
            "token_embedders": {
                "bert": {
                    "type": "pretrained_transformer",
                    "model_name": "bert-base-uncased",
                    "train_parameters": train_parameters,
                    "last_layer_only": last_layer_only,
                }
            }
        })
        token_embedder = BasicTextFieldEmbedder.from_params(vocab=vocab,
                                                            params=params)

        instance1 = Instance(
            {"tokens": TextField(tokens1, {"bert": token_indexer})})
        instance2 = Instance(
            {"tokens": TextField(tokens2, {"bert": token_indexer})})

        batch = Batch([instance1, instance2])
        batch.index_instances(vocab)

        padding_lengths = batch.get_padding_lengths()
        tensor_dict = batch.as_tensor_dict(padding_lengths)
        tokens = tensor_dict["tokens"]
        max_length = max(len(tokens1), len(tokens2))

        assert tokens["bert"]["token_ids"].shape == (2, max_length)

        assert tokens["bert"]["mask"].tolist() == [
            [True, True, True, True, True, True, True, True, True],
            [True, True, True, True, True, True, True, False, False],
        ]

        # Attention mask
        bert_vectors = token_embedder(tokens)
        assert bert_vectors.size() == (2, 9, 768)
        assert bert_vectors.requires_grad == (train_parameters
                                              or not last_layer_only)

    def test_indices_to_tokens(self):
        allennlp_tokenizer = PretrainedTransformerTokenizer(
            "bert-base-uncased")
        indexer_max_length = PretrainedTransformerIndexer(
            model_name="bert-base-uncased", max_length=4)
        indexer_no_max_length = PretrainedTransformerIndexer(
            model_name="bert-base-uncased")
        string_no_specials = "AllenNLP is great"

        allennlp_tokens = allennlp_tokenizer.tokenize(string_no_specials)
        vocab = Vocabulary()
        indexed = indexer_no_max_length.tokens_to_indices(
            allennlp_tokens, vocab)
        tokens_from_indices = indexer_no_max_length.indices_to_tokens(
            indexed, vocab)

        self._assert_tokens_equal(allennlp_tokens, tokens_from_indices)

        indexed = indexer_max_length.tokens_to_indices(allennlp_tokens, vocab)
        tokens_from_indices = indexer_max_length.indices_to_tokens(
            indexed, vocab)

        # For now we are not removing special tokens introduced from max_length
        sep_cls = [allennlp_tokens[-1], allennlp_tokens[0]]
        expected = (allennlp_tokens[:3] + sep_cls + allennlp_tokens[3:5] +
                    sep_cls + allennlp_tokens[5:])

        self._assert_tokens_equal(expected, tokens_from_indices)
Example #10
    def test_mask(self):
        # We try these two models, because BERT pads tokens with 0, but RoBERTa pads tokens with 1.
        for model in ["bert-base-uncased", "roberta-base"]:
            allennlp_tokenizer = PretrainedTransformerTokenizer(model)
            indexer = PretrainedTransformerIndexer(model_name=model)
            string_no_specials = "AllenNLP is great"
            allennlp_tokens = allennlp_tokenizer.tokenize(string_no_specials)
            vocab = Vocabulary()
            indexed = indexer.tokens_to_indices(allennlp_tokens, vocab)
            expected_masks = [1] * len(indexed["token_ids"])
            assert indexed["mask"] == expected_masks
            max_length = 10
            padding_lengths = {key: max_length for key in indexed.keys()}
            padded_tokens = indexer.as_padded_tensor_dict(
                indexed, padding_lengths)
            padding_length = max_length - len(indexed["mask"])
            expected_masks = expected_masks + ([0] * padding_length)
            assert len(padded_tokens["mask"]) == max_length
            assert padded_tokens["mask"].tolist() == expected_masks

            assert len(padded_tokens["token_ids"]) == max_length
            padding_suffix = [allennlp_tokenizer.tokenizer.pad_token_id] * padding_length
            assert padded_tokens["token_ids"][-padding_length:].tolist() == padding_suffix
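
The two models are chosen because their padding ids differ, which is exactly what as_padded_tensor_dict has to respect; a quick check of the underlying HuggingFace pad ids for these two checkpoints:

from allennlp.data.tokenizers import PretrainedTransformerTokenizer

bert = PretrainedTransformerTokenizer("bert-base-uncased").tokenizer
roberta = PretrainedTransformerTokenizer("roberta-base").tokenizer
print(bert.pad_token_id)     # 0 -> [PAD]
print(roberta.pad_token_id)  # 1 -> <pad>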
Example #11
 def __init__(self,
              token_indexers: Dict[str, TokenIndexer] = None,
              lazy: bool = False) -> None:
     super().__init__(lazy)
     self.transformer_model = "bert-base-uncased"
     self.tokenizer = PretrainedTransformerTokenizer(
         model_name=self.transformer_model,
         add_special_tokens=False,
         max_length=512)
     self.token_indexer = PretrainedTransformerIndexer(
         model_name=self.transformer_model, max_length=512)

    def __init__(self,
                 pretrained_model: str = None,
                 tokenizer: Optional[Tokenizer] = None,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 max_pieces: int = 512,
                 add_prefix: bool = False,
                 combine_input_fields: bool = True,
                 sample: int = -1) -> None:
        super().__init__()

        if pretrained_model is not None:
            self._tokenizer = PretrainedTransformerTokenizer(
                pretrained_model, max_length=max_pieces)
            token_indexer = PretrainedTransformerIndexer(pretrained_model)
            self._token_indexers = {'tokens': token_indexer}
        else:
            self._tokenizer = tokenizer or SpacyTokenizer()
            self._token_indexers = token_indexers or {
                "tokens": SingleIdTokenIndexer()
            }

        self._sample = sample
        self._add_prefix = add_prefix
        self._combine_input_fields = combine_input_fields
        self._debug_prints = -1

 def __init__(
     self,
     lazy: bool = False,
     cache_directory: Optional[str] = None,
     max_instances: Optional[int] = None,
     min_num_candidate: int = 3,
     max_num_candidate: int = 5,
     transformer_model_name_or_archive_path: str = "bert-base-uncased",
 ) -> None:
     super().__init__(lazy=lazy,
                      cache_directory=cache_directory,
                      max_instances=max_instances)
     if "tar.gz" in transformer_model_name_or_archive_path:
         config = extract_config_from_archive(
             transformer_model_name_or_archive_path)
          model_name = config.as_dict()["dataset_reader"]["tokenizer"]["model_name"]
     else:
         model_name = transformer_model_name_or_archive_path
     self._tokenizer = PretrainedTransformerTokenizer(
         model_name=model_name, add_special_tokens=False)
     self._tokenindexer = PretrainedTransformerIndexer(
         model_name=model_name)
     self._min_num_candidate = min_num_candidate
     self._max_num_candidate = max_num_candidate
Example #14
 def test_splits_reformer_small(self):
     sentence = "A, [MASK] AllenNLP sentence."
     expected_tokens = [
         "▁A",
         ",",
         "▁",
         "<unk>",
         "M",
         "A",
         "S",
         "K",
         "<unk>",
         "▁A",
         "ll",
         "en",
         "N",
         "L",
         "P",
         "▁s",
         "ent",
         "en",
         "ce",
         ".",
     ]
     tokenizer = PretrainedTransformerTokenizer(
         "google/reformer-crime-and-punishment")
     tokens = [t.text for t in tokenizer.tokenize(sentence)]
     assert tokens == expected_tokens
Example #15
 def test_token_idx_bert_uncased(self):
     sentence = "A, naïve [MASK] AllenNLP sentence."
     expected_tokens = [
         "[CLS]",
         "a",
         ",",
         "naive",  # It normalizes the accent.
         "[MASK]",
         "allen",
         "##nl",
         "##p",
         "sentence",
         ".",
         "[SEP]",
     ]
     expected_idxs = [
         None,
         0,
         1,
         None,  # It can't find this one because of the normalized accent.
         9,
         16,
         21,
         23,
         25,
         33,
         None,
     ]
     tokenizer = PretrainedTransformerTokenizer(
         "bert-base-uncased", calculate_character_offsets=True)
     tokenized = tokenizer.tokenize(sentence)
     tokens = [t.text for t in tokenized]
     assert tokens == expected_tokens
     idxs = [t.idx for t in tokenized]
     assert idxs == expected_idxs
Example #16
    def test_splits_roberta(self):
        tokenizer = PretrainedTransformerTokenizer("roberta-base")

        sentence = "A, <mask> AllenNLP sentence."
        expected_tokens = ["<s>", "A", ",", "<mask>", "Allen", "N", "LP", "Ġsentence", ".", "</s>"]
        tokens = [t.text for t in tokenizer.tokenize(sentence)]
        assert tokens == expected_tokens

        # sentence pair
        sentence_1 = "A, <mask> AllenNLP sentence."
        sentence_2 = "A sentence."
        expected_tokens = [
            "<s>",
            "A",
            ",",
            "<mask>",
            "Allen",
            "N",
            "LP",
            "Ġsentence",
            ".",
            "</s>",
            "</s>",
            "A",
            "Ġsentence",
            ".",
            "</s>",
        ]
        tokens = [t.text for t in tokenizer.tokenize_sentence_pair(sentence_1, sentence_2)]
        assert tokens == expected_tokens
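
In the RoBERTa output above, the leading "Ġ" on "Ġsentence" is the byte-level BPE marker for a preceding space (the same marker the LimeTextExplainer tokenizer earlier strips out). A minimal single-sentence sketch, consistent with the expected tokens in this test:

from allennlp.data.tokenizers import PretrainedTransformerTokenizer

tokenizer = PretrainedTransformerTokenizer("roberta-base")
print([t.text for t in tokenizer.tokenize("A sentence.")])
# ['<s>', 'A', 'Ġsentence', '.', '</s>']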
Example #17
 def test_token_idx_bert_cased(self):
     sentence = "A, naïve [MASK] AllenNLP sentence."
     expected_tokens = [
         "[CLS]",
         "A",
         ",",
         "na",
         "##ï",  # Does not normalize the accent
         "##ve",
         "[MASK]",
         "Allen",
         "##NL",
         "##P",
         "sentence",
         ".",
         "[SEP]",
     ]
     expected_idxs = [None, 0, 1, 3, 5, 6, 9, 16, 21, 23, 25, 33, None]
     tokenizer = PretrainedTransformerTokenizer(
         "bert-base-cased", calculate_character_offsets=True)
     tokenized = tokenizer.tokenize(sentence)
     tokens = [t.text for t in tokenized]
     assert tokens == expected_tokens
     idxs = [t.idx for t in tokenized]
     assert idxs == expected_idxs
Example #18
    def test_end_to_end_t5(
        self,
        train_parameters: bool,
        last_layer_only: bool,
        gradient_checkpointing: bool,
    ):
        tokenizer = PretrainedTransformerTokenizer(model_name="patrickvonplaten/t5-tiny-random")
        token_indexer = PretrainedTransformerIndexer(model_name="patrickvonplaten/t5-tiny-random")

        sentence1 = "A, AllenNLP sentence."
        tokens1 = tokenizer.tokenize(sentence1)
        expected_tokens1 = ["▁A", ",", "▁Allen", "N", "LP", "▁sentence", ".", "</s>"]
        assert [t.text for t in tokens1] == expected_tokens1

        sentence2 = "AllenNLP is great"
        tokens2 = tokenizer.tokenize(sentence2)
        expected_tokens2 = ["▁Allen", "N", "LP", "▁is", "▁great", "</s>"]
        assert [t.text for t in tokens2] == expected_tokens2

        vocab = Vocabulary()

        params = Params(
            {
                "token_embedders": {
                    "bert": {
                        "type": "pretrained_transformer",
                        "model_name": "patrickvonplaten/t5-tiny-random",
                        "train_parameters": train_parameters,
                        "last_layer_only": last_layer_only,
                        "gradient_checkpointing": gradient_checkpointing,
                        "sub_module": "encoder",
                    }
                }
            }
        )
        token_embedder = BasicTextFieldEmbedder.from_params(vocab=vocab, params=params)

        instance1 = Instance({"tokens": TextField(tokens1, {"bert": token_indexer})})
        instance2 = Instance({"tokens": TextField(tokens2, {"bert": token_indexer})})

        batch = Batch([instance1, instance2])
        batch.index_instances(vocab)

        padding_lengths = batch.get_padding_lengths()
        tensor_dict = batch.as_tensor_dict(padding_lengths)
        tokens = tensor_dict["tokens"]
        max_length = max(len(tokens1), len(tokens2))

        assert tokens["bert"]["token_ids"].shape == (2, max_length)

        assert tokens["bert"]["mask"].tolist() == [
            [True, True, True, True, True, True, True, True],
            [True, True, True, True, True, True, False, False],
        ]

        # Attention mask
        bert_vectors = token_embedder(tokens)
        assert bert_vectors.size() == (2, 8, 64)
        assert bert_vectors.requires_grad == (train_parameters or not last_layer_only)
Example #19
 def __init__(self,
              model: str = "epwalsh/bert-xsmall-dummy",
              **kwargs) -> None:
     super().__init__(manual_distributed_sharding=True,
                      manual_multiprocess_sharding=True,
                      **kwargs)
     self.tokenizer = PretrainedTransformerTokenizer(model)
     self.token_indexers = {"tokens": PretrainedTransformerIndexer(model)}
Example #20
 def test_from_params_kwargs(self):
     PretrainedTransformerTokenizer.from_params(
         Params({
             "model_name": "bert-base-uncased",
             "tokenizer_kwargs": {
                 "max_len": 10
             }
         }))
Example #21
 def test_from_params_kwargs(self):
     PretrainedTransformerTokenizer.from_params(
         Params({
             "model_name": "bert-base-uncased",
             "tokenizer_kwargs": {
                 "do_lower_case": True
             }
         }))
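
The from_params calls above are just the configuration-file route to the constructor; a direct-construction sketch equivalent to the call just above (tokenizer_kwargs is forwarded to the underlying HuggingFace tokenizer):

from allennlp.data.tokenizers import PretrainedTransformerTokenizer

tokenizer = PretrainedTransformerTokenizer(
    "bert-base-uncased", tokenizer_kwargs={"do_lower_case": True})
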
 def test_no_max_length(self):
     tokenizer = PretrainedTransformerTokenizer("bert-base-cased",
                                                max_length=None,
                                                add_special_tokens=False)
     # Even though the bert model has a max input length of 512, when we tokenize
     # with `max_length = None`, we should not get any truncation.
     tokens = tokenizer.tokenize(" ".join(["a"] * 550))
     assert len(tokens) == 550

 def test_max_length(self):
     tokenizer = PretrainedTransformerTokenizer("bert-base-cased",
                                                max_length=10,
                                                add_special_tokens=False)
     tokens = tokenizer.tokenize(
         "hi there, this should be at least 10 tokens, but some will be truncated"
     )
     assert len(tokens) == 10

    def test_long_sequence_splitting_end_to_end(self):
        # Mostly the same as the end_to_end test (except for adding max_length=4),
        # because we don't want this splitting behavior to change input/output format.

        tokenizer = PretrainedTransformerTokenizer(
            model_name="bert-base-uncased")
        token_indexer = PretrainedTransformerIndexer(
            model_name="bert-base-uncased", max_length=4)

        sentence1 = "A, AllenNLP sentence."
        tokens1 = tokenizer.tokenize(sentence1)
        sentence2 = "AllenNLP is great"
        tokens2 = tokenizer.tokenize(sentence2)

        vocab = Vocabulary()

        params = Params({
            "token_embedders": {
                "bert": {
                    "type": "pretrained_transformer",
                    "model_name": "bert-base-uncased",
                    "max_length": 4,
                }
            }
        })
        token_embedder = BasicTextFieldEmbedder.from_params(vocab=vocab,
                                                            params=params)

        instance1 = Instance(
            {"tokens": TextField(tokens1, {"bert": token_indexer})})
        instance2 = Instance(
            {"tokens": TextField(tokens2, {"bert": token_indexer})})

        batch = Batch([instance1, instance2])
        batch.index_instances(vocab)

        padding_lengths = batch.get_padding_lengths()
        tensor_dict = batch.as_tensor_dict(padding_lengths)
        tokens = tensor_dict["tokens"]
        max_length = max(len(tokens1), len(tokens2))

        # Adds n_segments * 2 special tokens
        segment_concat_length = int(math.ceil(max_length / 4)) * 2 + max_length
        assert tokens["bert"]["token_ids"].shape == (2, segment_concat_length)

        assert tokens["bert"]["mask"].tolist() == [
            [1, 1, 1, 1, 1, 1, 1, 1, 1],
            [1, 1, 1, 1, 1, 1, 1, 0, 0],
        ]
        assert tokens["bert"]["segment_concat_mask"].tolist() == [
            [1] * segment_concat_length,
            [1] * (segment_concat_length - 4) +
            [0] * 4,  # 4 is hard-coded length difference
        ]

        # Attention mask
        bert_vectors = token_embedder(tokens)
        assert bert_vectors.size() == (2, 9, 768)
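
For these inputs max_length is 9 wordpieces, so with the indexer's max_length=4 the sequence is folded into ceil(9 / 4) = 3 segments, and each segment is re-wrapped with its own special tokens. A worked instance of the formula in the assertion above:

import math

max_length = 9  # len of ["[CLS]", "a", ",", "allen", "##nl", "##p", "sentence", ".", "[SEP]"]
segment_concat_length = int(math.ceil(max_length / 4)) * 2 + max_length
print(segment_concat_length)  # 15
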
    def test_end_to_end(self):
        tokenizer = PretrainedTransformerTokenizer(
            model_name="bert-base-uncased")
        token_indexer = PretrainedTransformerIndexer(
            model_name="bert-base-uncased")

        sentence1 = "A, AllenNLP sentence."
        tokens1 = tokenizer.tokenize(sentence1)
        expected_tokens1 = [
            "[CLS]", "a", ",", "allen", "##nl", "##p", "sentence", ".", "[SEP]"
        ]
        assert [t.text for t in tokens1] == expected_tokens1

        sentence2 = "AllenNLP is great"
        tokens2 = tokenizer.tokenize(sentence2)
        expected_tokens2 = [
            "[CLS]", "allen", "##nl", "##p", "is", "great", "[SEP]"
        ]
        assert [t.text for t in tokens2] == expected_tokens2

        vocab = Vocabulary()

        params = Params({
            "token_embedders": {
                "bert": {
                    "type": "pretrained_transformer",
                    "model_name": "bert-base-uncased"
                }
            },
            "embedder_to_indexer_map": {
                "bert": ["bert", "mask"]
            },
            "allow_unmatched_keys": True,
        })
        token_embedder = BasicTextFieldEmbedder.from_params(vocab=vocab,
                                                            params=params)

        instance1 = Instance(
            {"tokens": TextField(tokens1, {"bert": token_indexer})})
        instance2 = Instance(
            {"tokens": TextField(tokens2, {"bert": token_indexer})})

        batch = Batch([instance1, instance2])
        batch.index_instances(vocab)

        padding_lengths = batch.get_padding_lengths()
        tensor_dict = batch.as_tensor_dict(padding_lengths)
        tokens = tensor_dict["tokens"]
        max_length = max(len(tokens1), len(tokens2))

        assert tokens["bert"].shape == (2, max_length)

        assert tokens["mask"].tolist() == [[1, 1, 1, 1, 1, 1, 1, 1, 1],
                                           [1, 1, 1, 1, 1, 1, 1, 0, 0]]

        # Attention mask
        bert_vectors = token_embedder(tokens)
        assert bert_vectors.size() == (2, 9, 768)

class QuestionGenerationDatasetReader(DatasetReader):
    def __init__(self, model_name: str, lazy: bool = False):
        super().__init__(lazy=lazy)
        self.tokenizer = PretrainedTransformerTokenizer(model_name)
        self.token_indexers = {
            'tokens': PretrainedTransformerIndexer(model_name,
                                                   namespace='tokens')
        }

        # Add the tokens which will mark the answer span
        self.tokenizer.tokenizer.add_tokens([SPAN_START_TOKEN, SPAN_END_TOKEN])

    @overrides
    def _read(self, file_path: str) -> Iterable[Instance]:
        with open(file_path, 'r') as f:
            for line in f:
                data = json.loads(line)
                context = data['context']
                start = data['answer_start']
                end = data['answer_end']
                question = data.pop('question', None)
                metadata = data.pop('metadata', {})
                yield self.text_to_instance(context, start, end, question,
                                            metadata)

    def _insert_span_symbols(self, context: str, start: int, end: int) -> str:
        return f'{context[:start]}{SPAN_START_TOKEN} {context[start:end]} {SPAN_END_TOKEN}{context[end:]}'

    @overrides
    def text_to_instance(self,
                         context: str,
                         start: int,
                         end: int,
                         question: Optional[str] = None,
                         metadata: Dict[str, Any] = None) -> Instance:
        fields = {}
        metadata = metadata or {}

        answer = context[start:end]
        marked_context = self._insert_span_symbols(context, start, end)
        source_tokens = self.tokenizer.tokenize(marked_context)
        fields['source_tokens'] = TextField(source_tokens, self.token_indexers)
        metadata['answer'] = answer
        metadata['answer_start'] = start
        metadata['answer_end'] = end
        metadata['context'] = context
        metadata['marked_context'] = marked_context
        metadata['source_tokens'] = source_tokens

        if question is not None:
            target_tokens = self.tokenizer.tokenize(question)
            fields['target_tokens'] = TextField(target_tokens,
                                                self.token_indexers)
            metadata['question'] = question
            metadata['target_tokens'] = target_tokens

        fields['metadata'] = MetadataField(metadata)
        return Instance(fields)
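
A toy call to this reader (the model name and character offsets here are hypothetical, and SPAN_START_TOKEN / SPAN_END_TOKEN are constants defined elsewhere in the same project):

reader = QuestionGenerationDatasetReader("bert-base-uncased")  # hypothetical model choice
context = "AllenNLP is an open-source NLP library."
instance = reader.text_to_instance(context, start=0, end=8,
                                   question="What is AllenNLP?")
print(instance["source_tokens"])  # the context with the answer span marked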
Example #27
def main():
    tokenizer = PretrainedTransformerTokenizer(model_name=BERT_MODEL,
                                               add_special_tokens=False)
    result = tokenizer.tokenize('The best movie ever!')
    print(result)
    reader = SnliReader(tokenizer=tokenizer)
    for instance in reader.read(
            'https://realworldnlpbook.s3.amazonaws.com/data/snli/snli_1.0_dev.jsonl'
    ):
        print(instance)
Example #28
 def __init__(self,
              token_indexers: Dict[str, TokenIndexer] = None,
              balance_classes=False,
              **kwargs):
     super().__init__(**kwargs)
     # max_length ensures that we truncate the input
     self._tokenizer = PretrainedTransformerTokenizer(
         model_name="roberta-base", max_length=TRANSFORMER_WORDPIECE_LIMIT)
     self._token_indexers = token_indexers
     self.balance_classes = balance_classes
Example #29
 def test_splits_into_wordpieces(self):
     tokenizer = PretrainedTransformerTokenizer('bert-base-cased',
                                                do_lowercase=False)
     sentence = "A, [MASK] AllenNLP sentence."
     tokens = [t.text for t in tokenizer.tokenize(sentence)]
     expected_tokens = [
         "[CLS]", "A", ",", "[MASK]", "Allen", "##NL", "##P", "sentence",
         ".", "[SEP]"
     ]
     assert tokens == expected_tokens