def tmp():
    config = "configs/bert_pretrain.jsonnet"
    serialization_dir = "models"
    output_dir = "bert_out"
    tokenizer_conllu_path = "data/coptic/converted/train"

    # Collect whitespace-joined sentences from the CoNLL-U training documents
    documents = read_conllu_files(tokenizer_conllu_path)
    sentences = []
    for document in documents:
        for sentence in document:
            sentences.append(" ".join([t['form'] for t in sentence]))

    print("Training tokenizer...")
    os.environ["TOKENIZER_PATH"] = output_dir
    t = train_bert_tokenizer(sentences, serialize_path=output_dir, vocab_size=6000)

    # Load the freshly trained tokenizer and index a sample sentence
    tok = PretrainedTransformerTokenizer("./bert_out/")
    idx = PretrainedTransformerMismatchedIndexer("./bert_out/")
    vocab = Vocabulary()
    vocab.set_from_file("bert_out/vocab.txt", oov_token="[UNK]", is_padded=True)
    s = tok.tokenize(sentences[1])
    i = idx.tokens_to_indices(s, vocab)
    print(i)
    print(t)
def __init__(self, transformer_model_name: str = "bert-base-cased", length_limit: int = 384, question_length_limit: int = 64, stride: int = 128, raise_errors: bool = False, tokenizer_kwargs: Dict[str, Any] = None, one_instance_per_query: bool = False, max_instances: int = None, **kwargs) -> None: """ Initialize the RecordTaskReader. """ super(RecordTaskReader, self).__init__(manual_distributed_sharding=True, **kwargs) # Save the values passed to __init__ to protected attributes self._tokenizer = PretrainedTransformerTokenizer( transformer_model_name, add_special_tokens=False, tokenizer_kwargs=tokenizer_kwargs, ) self._token_indexers = { "tokens": PretrainedTransformerIndexer(transformer_model_name, tokenizer_kwargs=tokenizer_kwargs) } self._length_limit = length_limit self._query_len_limit = question_length_limit self._stride = stride self._raise_errors = raise_errors self._cls_token = '@placeholder' self._max_instances = max_instances self._one_instance_per_query = one_instance_per_query
def __init__(self, model_path=None, cuda_device=1):
    # model_path = model_path or LSTM_MODEL_PATH
    model_path = model_path or ROBERTA_MODEL_PATH
    self.predictor = Predictor.from_path(model_path, cuda_device=cuda_device)
    _tokenizer = PretrainedTransformerTokenizer(
        model_name="roberta-base", max_length=TRANSFORMER_WORDPIECE_LIMIT)
    class_name_mapper = {"0": "Negative", "1": "Positive"}
    _model = self.predictor._model
    _label_namespace = _model._label_namespace
    class_names = [
        class_name_mapper[_model.vocab.get_index_to_token_vocabulary(
            _label_namespace).get(0)],
        class_name_mapper[_model.vocab.get_index_to_token_vocabulary(
            _label_namespace).get(1)]
    ]
    # reset the tokenizer to remove separators
    self.tokenizer = lambda s: [
        t.text.replace("Ġ", "").replace('Ċ', '').replace('ĉ', "")
        for t in _tokenizer.tokenize(s)
    ][1:-1]
    self.explainer_lime = LimeTextExplainer(
        class_names=class_names, split_expression=self.tokenizer)
    self.explainer_integrate = IntegratedGradient(self.predictor)
    self.explainer_simple = SimpleGradient(self.predictor)
def __init__(self,
             pretrained_model: str,
             max_pieces: int = 512,
             num_choices: int = 4,
             add_prefix: Dict[str, str] = None,
             sample: int = -1) -> None:
    super().__init__()

    self._tokenizer = PretrainedTransformerTokenizer(pretrained_model)
    self._tokenizer_no_special_tokens = PretrainedTransformerTokenizer(
        pretrained_model, add_special_tokens=False)
    # self._tokenizer_internal = self._tokenizer._tokenizer
    self._tokenizer_internal = self._tokenizer.tokenizer
    token_indexer = PretrainedTransformerIndexer(pretrained_model)
    self._token_indexers = {'tokens': token_indexer}

    self._max_pieces = max_pieces
    self._sample = sample
    self._num_choices = num_choices
    self._add_prefix = add_prefix or {}

    # Infer the model family from the pretrained model name
    for model in ["roberta", "bert", "openai-gpt", "gpt2", "transfo-xl", "xlnet", "xlm"]:
        if model in pretrained_model:
            self._model_type = model
            break
def __init__(self,
             model_name: str,
             namespace: str = "tags",
             max_length: int = None,
             **kwargs) -> None:
    super().__init__(**kwargs)
    self._namespace = namespace
    self._allennlp_tokenizer = PretrainedTransformerTokenizer(model_name)
    self._tokenizer = self._allennlp_tokenizer.tokenizer
    self._added_to_vocabulary = False

    self._num_added_start_tokens = len(self._allennlp_tokenizer.single_sequence_start_tokens)
    self._num_added_end_tokens = len(self._allennlp_tokenizer.single_sequence_end_tokens)

    self._max_length = max_length
    if self._max_length is not None:
        num_added_tokens = len(self._allennlp_tokenizer.tokenize("a")) - 1
        self._effective_max_length = (  # we need to take into account special tokens
            self._max_length - num_added_tokens)
        if self._effective_max_length <= 0:
            raise ValueError(
                "max_length needs to be greater than the number of special tokens inserted.")
def __init__(self, transformer_model_name: str = "bert-base-cased", length_limit: int = 384, stride: int = 128, skip_invalid_examples: bool = False, max_query_length: int = 64, **kwargs) -> None: super().__init__(**kwargs) self._tokenizer = PretrainedTransformerTokenizer( transformer_model_name, add_special_tokens=False, calculate_character_offsets=True) self._token_indexers = { "tokens": PretrainedTransformerIndexer(transformer_model_name) } self.length_limit = length_limit self.stride = stride self.skip_invalid_examples = skip_invalid_examples self.max_query_length = max_query_length self.non_content_type_id = max( self._tokenizer.tokenizer.encode_plus( "left", "right", return_token_type_ids=True)["token_type_ids"]) # workaround for a bug in the transformers library if "distilbert" in transformer_model_name: self.non_content_type_id = 0
def __init__(
    self,
    sent1_col: str,
    sent2_col: str = None,
    label_col: str = 'label',
    bert_model: str = 'bert-base-uncased',
    max_sequence_length: int = 500,
    skip_label_indexing: bool = False,
    lower: bool = True,
    lazy: bool = False,
) -> None:
    super().__init__(lazy=lazy)
    self._sent1_col = sent1_col
    self._sent2_col = sent2_col
    self._label_col = label_col
    self._tokenizer = PretrainedTransformerTokenizer(
        bert_model,
        add_special_tokens=False,
        max_length=max_sequence_length
    )  # type: PretrainedTransformerTokenizer
    self._max_sequence_length = max_sequence_length
    self._skip_label_indexing = skip_label_indexing
    self._lower = lower
    self._token_indexers = {
        "tokens": PretrainedTransformerIndexer(model_name=bert_model)
    }
def test_end_to_end(self, train_parameters: bool, last_layer_only: bool):
    tokenizer = PretrainedTransformerTokenizer(model_name="bert-base-uncased")
    token_indexer = PretrainedTransformerIndexer(model_name="bert-base-uncased")

    sentence1 = "A, AllenNLP sentence."
    tokens1 = tokenizer.tokenize(sentence1)
    expected_tokens1 = ["[CLS]", "a", ",", "allen", "##nl", "##p", "sentence", ".", "[SEP]"]
    assert [t.text for t in tokens1] == expected_tokens1

    sentence2 = "AllenNLP is great"
    tokens2 = tokenizer.tokenize(sentence2)
    expected_tokens2 = ["[CLS]", "allen", "##nl", "##p", "is", "great", "[SEP]"]
    assert [t.text for t in tokens2] == expected_tokens2

    vocab = Vocabulary()

    params = Params({
        "token_embedders": {
            "bert": {
                "type": "pretrained_transformer",
                "model_name": "bert-base-uncased",
                "train_parameters": train_parameters,
                "last_layer_only": last_layer_only,
            }
        }
    })
    token_embedder = BasicTextFieldEmbedder.from_params(vocab=vocab, params=params)

    instance1 = Instance({"tokens": TextField(tokens1, {"bert": token_indexer})})
    instance2 = Instance({"tokens": TextField(tokens2, {"bert": token_indexer})})

    batch = Batch([instance1, instance2])
    batch.index_instances(vocab)

    padding_lengths = batch.get_padding_lengths()
    tensor_dict = batch.as_tensor_dict(padding_lengths)
    tokens = tensor_dict["tokens"]
    max_length = max(len(tokens1), len(tokens2))

    assert tokens["bert"]["token_ids"].shape == (2, max_length)

    assert tokens["bert"]["mask"].tolist() == [
        [True, True, True, True, True, True, True, True, True],
        [True, True, True, True, True, True, True, False, False],
    ]  # Attention mask

    bert_vectors = token_embedder(tokens)
    assert bert_vectors.size() == (2, 9, 768)
    assert bert_vectors.requires_grad == (train_parameters or not last_layer_only)
def test_indices_to_tokens(self):
    allennlp_tokenizer = PretrainedTransformerTokenizer("bert-base-uncased")
    indexer_max_length = PretrainedTransformerIndexer(
        model_name="bert-base-uncased", max_length=4)
    indexer_no_max_length = PretrainedTransformerIndexer(model_name="bert-base-uncased")
    string_no_specials = "AllenNLP is great"

    allennlp_tokens = allennlp_tokenizer.tokenize(string_no_specials)
    vocab = Vocabulary()
    indexed = indexer_no_max_length.tokens_to_indices(allennlp_tokens, vocab)
    tokens_from_indices = indexer_no_max_length.indices_to_tokens(indexed, vocab)

    self._assert_tokens_equal(allennlp_tokens, tokens_from_indices)

    indexed = indexer_max_length.tokens_to_indices(allennlp_tokens, vocab)
    tokens_from_indices = indexer_max_length.indices_to_tokens(indexed, vocab)

    # For now we are not removing special tokens introduced from max_length
    sep_cls = [allennlp_tokens[-1], allennlp_tokens[0]]
    expected = (allennlp_tokens[:3] + sep_cls + allennlp_tokens[3:5] + sep_cls +
                allennlp_tokens[5:])

    self._assert_tokens_equal(expected, tokens_from_indices)
def test_mask(self):
    # We try these two models, because BERT pads tokens with 0, but RoBERTa pads tokens with 1.
    for model in ["bert-base-uncased", "roberta-base"]:
        allennlp_tokenizer = PretrainedTransformerTokenizer(model)
        indexer = PretrainedTransformerIndexer(model_name=model)
        string_no_specials = "AllenNLP is great"
        allennlp_tokens = allennlp_tokenizer.tokenize(string_no_specials)
        vocab = Vocabulary()
        indexed = indexer.tokens_to_indices(allennlp_tokens, vocab)
        expected_masks = [1] * len(indexed["token_ids"])
        assert indexed["mask"] == expected_masks

        max_length = 10
        padding_lengths = {key: max_length for key in indexed.keys()}
        padded_tokens = indexer.as_padded_tensor_dict(indexed, padding_lengths)
        padding_length = max_length - len(indexed["mask"])
        expected_masks = expected_masks + ([0] * padding_length)
        assert len(padded_tokens["mask"]) == max_length
        assert padded_tokens["mask"].tolist() == expected_masks

        assert len(padded_tokens["token_ids"]) == max_length
        padding_suffix = [allennlp_tokenizer.tokenizer.pad_token_id] * padding_length
        assert padded_tokens["token_ids"][-padding_length:].tolist() == padding_suffix
def __init__(self,
             token_indexers: Dict[str, TokenIndexer] = None,
             lazy: bool = False) -> None:
    super().__init__(lazy)
    self.transformer_model = "bert-base-uncased"
    self.tokenizer = PretrainedTransformerTokenizer(
        model_name=self.transformer_model, add_special_tokens=False, max_length=512)
    self.token_indexer = PretrainedTransformerIndexer(
        model_name=self.transformer_model, max_length=512)
def __init__(self,
             pretrained_model: str = None,
             tokenizer: Optional[Tokenizer] = None,
             token_indexers: Dict[str, TokenIndexer] = None,
             max_pieces: int = 512,
             add_prefix: bool = False,
             combine_input_fields: bool = True,
             sample: int = -1) -> None:
    super().__init__()

    if pretrained_model is not None:
        self._tokenizer = PretrainedTransformerTokenizer(
            pretrained_model, max_length=max_pieces)
        token_indexer = PretrainedTransformerIndexer(pretrained_model)
        self._token_indexers = {'tokens': token_indexer}
    else:
        self._tokenizer = tokenizer or SpacyTokenizer()
        self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}

    self._sample = sample
    self._add_prefix = add_prefix
    self._combine_input_fields = combine_input_fields
    self._debug_prints = -1
def __init__(
    self,
    lazy: bool = False,
    cache_directory: Optional[str] = None,
    max_instances: Optional[int] = None,
    min_num_candidate: int = 3,
    max_num_candidate: int = 5,
    transformer_model_name_or_archive_path: str = "bert-base-uncased",
) -> None:
    super().__init__(lazy=lazy, cache_directory=cache_directory, max_instances=max_instances)
    if "tar.gz" in transformer_model_name_or_archive_path:
        config = extract_config_from_archive(transformer_model_name_or_archive_path)
        model_name = config.as_dict()["dataset_reader"]["tokenizer"]["model_name"]
    else:
        model_name = transformer_model_name_or_archive_path
    self._tokenizer = PretrainedTransformerTokenizer(
        model_name=model_name, add_special_tokens=False)
    self._tokenindexer = PretrainedTransformerIndexer(model_name=model_name)
    self._min_num_candidate = min_num_candidate
    self._max_num_candidate = max_num_candidate
def test_splits_reformer_small(self):
    sentence = "A, [MASK] AllenNLP sentence."
    expected_tokens = [
        "▁A", ",", "▁", "<unk>", "M", "A", "S", "K", "<unk>",
        "▁A", "ll", "en", "N", "L", "P", "▁s", "ent", "en", "ce", ".",
    ]
    tokenizer = PretrainedTransformerTokenizer("google/reformer-crime-and-punishment")
    tokens = [t.text for t in tokenizer.tokenize(sentence)]
    assert tokens == expected_tokens
def test_token_idx_bert_uncased(self):
    sentence = "A, naïve [MASK] AllenNLP sentence."
    expected_tokens = [
        "[CLS]",
        "a",
        ",",
        "naive",  # It normalizes the accent.
        "[MASK]",
        "allen",
        "##nl",
        "##p",
        "sentence",
        ".",
        "[SEP]",
    ]
    expected_idxs = [
        None,
        0,
        1,
        None,  # It can't find this one because of the normalized accent.
        9,
        16,
        21,
        23,
        25,
        33,
        None,
    ]
    tokenizer = PretrainedTransformerTokenizer(
        "bert-base-uncased", calculate_character_offsets=True)
    tokenized = tokenizer.tokenize(sentence)
    tokens = [t.text for t in tokenized]
    assert tokens == expected_tokens
    idxs = [t.idx for t in tokenized]
    assert idxs == expected_idxs
def test_splits_roberta(self):
    tokenizer = PretrainedTransformerTokenizer("roberta-base")

    sentence = "A, <mask> AllenNLP sentence."
    expected_tokens = ["<s>", "A", ",", "<mask>", "Allen", "N", "LP", "Ġsentence", ".", "</s>"]
    tokens = [t.text for t in tokenizer.tokenize(sentence)]
    assert tokens == expected_tokens

    # sentence pair
    sentence_1 = "A, <mask> AllenNLP sentence."
    sentence_2 = "A sentence."
    expected_tokens = [
        "<s>", "A", ",", "<mask>", "Allen", "N", "LP", "Ġsentence", ".", "</s>",
        "</s>", "A", "Ġsentence", ".", "</s>",
    ]
    tokens = [t.text for t in tokenizer.tokenize_sentence_pair(sentence_1, sentence_2)]
    assert tokens == expected_tokens
def test_token_idx_bert_cased(self):
    sentence = "A, naïve [MASK] AllenNLP sentence."
    expected_tokens = [
        "[CLS]",
        "A",
        ",",
        "na",
        "##ï",  # Does not normalize the accent
        "##ve",
        "[MASK]",
        "Allen",
        "##NL",
        "##P",
        "sentence",
        ".",
        "[SEP]",
    ]
    expected_idxs = [None, 0, 1, 3, 5, 6, 9, 16, 21, 23, 25, 33, None]
    tokenizer = PretrainedTransformerTokenizer(
        "bert-base-cased", calculate_character_offsets=True)
    tokenized = tokenizer.tokenize(sentence)
    tokens = [t.text for t in tokenized]
    assert tokens == expected_tokens
    idxs = [t.idx for t in tokenized]
    assert idxs == expected_idxs
def test_end_to_end_t5(
    self,
    train_parameters: bool,
    last_layer_only: bool,
    gradient_checkpointing: bool,
):
    tokenizer = PretrainedTransformerTokenizer(model_name="patrickvonplaten/t5-tiny-random")
    token_indexer = PretrainedTransformerIndexer(model_name="patrickvonplaten/t5-tiny-random")

    sentence1 = "A, AllenNLP sentence."
    tokens1 = tokenizer.tokenize(sentence1)
    expected_tokens1 = ["▁A", ",", "▁Allen", "N", "LP", "▁sentence", ".", "</s>"]
    assert [t.text for t in tokens1] == expected_tokens1

    sentence2 = "AllenNLP is great"
    tokens2 = tokenizer.tokenize(sentence2)
    expected_tokens2 = ["▁Allen", "N", "LP", "▁is", "▁great", "</s>"]
    assert [t.text for t in tokens2] == expected_tokens2

    vocab = Vocabulary()

    params = Params(
        {
            "token_embedders": {
                "bert": {
                    "type": "pretrained_transformer",
                    "model_name": "patrickvonplaten/t5-tiny-random",
                    "train_parameters": train_parameters,
                    "last_layer_only": last_layer_only,
                    "gradient_checkpointing": gradient_checkpointing,
                    "sub_module": "encoder",
                }
            }
        }
    )
    token_embedder = BasicTextFieldEmbedder.from_params(vocab=vocab, params=params)

    instance1 = Instance({"tokens": TextField(tokens1, {"bert": token_indexer})})
    instance2 = Instance({"tokens": TextField(tokens2, {"bert": token_indexer})})

    batch = Batch([instance1, instance2])
    batch.index_instances(vocab)

    padding_lengths = batch.get_padding_lengths()
    tensor_dict = batch.as_tensor_dict(padding_lengths)
    tokens = tensor_dict["tokens"]
    max_length = max(len(tokens1), len(tokens2))

    assert tokens["bert"]["token_ids"].shape == (2, max_length)

    assert tokens["bert"]["mask"].tolist() == [
        [True, True, True, True, True, True, True, True],
        [True, True, True, True, True, True, False, False],
    ]  # Attention mask

    bert_vectors = token_embedder(tokens)
    assert bert_vectors.size() == (2, 8, 64)
    assert bert_vectors.requires_grad == (train_parameters or not last_layer_only)
def __init__(self, model: str = "epwalsh/bert-xsmall-dummy", **kwargs) -> None:
    super().__init__(manual_distributed_sharding=True,
                     manual_multiprocess_sharding=True,
                     **kwargs)
    self.tokenizer = PretrainedTransformerTokenizer(model)
    self.token_indexers = {"tokens": PretrainedTransformerIndexer(model)}
def test_from_params_kwargs(self): PretrainedTransformerTokenizer.from_params( Params({ "model_name": "bert-base-uncased", "tokenizer_kwargs": { "max_len": 10 } }))
def test_from_params_kwargs(self): PretrainedTransformerTokenizer.from_params( Params({ "model_name": "bert-base-uncased", "tokenizer_kwargs": { "do_lower_case": True } }))
def test_no_max_length(self):
    tokenizer = PretrainedTransformerTokenizer(
        "bert-base-cased", max_length=None, add_special_tokens=False)
    # Even though the bert model has a max input length of 512, when we tokenize
    # with `max_length = None`, we should not get any truncation.
    tokens = tokenizer.tokenize(" ".join(["a"] * 550))
    assert len(tokens) == 550
def test_max_length(self):
    tokenizer = PretrainedTransformerTokenizer(
        "bert-base-cased", max_length=10, add_special_tokens=False)
    tokens = tokenizer.tokenize(
        "hi there, this should be at least 10 tokens, but some will be truncated")
    assert len(tokens) == 10
def test_long_sequence_splitting_end_to_end(self):
    # Mostly the same as the end_to_end test (except for adding max_length=4),
    # because we don't want this splitting behavior to change input/output format.
    tokenizer = PretrainedTransformerTokenizer(model_name="bert-base-uncased")
    token_indexer = PretrainedTransformerIndexer(model_name="bert-base-uncased", max_length=4)

    sentence1 = "A, AllenNLP sentence."
    tokens1 = tokenizer.tokenize(sentence1)
    sentence2 = "AllenNLP is great"
    tokens2 = tokenizer.tokenize(sentence2)

    vocab = Vocabulary()

    params = Params({
        "token_embedders": {
            "bert": {
                "type": "pretrained_transformer",
                "model_name": "bert-base-uncased",
                "max_length": 4,
            }
        }
    })
    token_embedder = BasicTextFieldEmbedder.from_params(vocab=vocab, params=params)

    instance1 = Instance({"tokens": TextField(tokens1, {"bert": token_indexer})})
    instance2 = Instance({"tokens": TextField(tokens2, {"bert": token_indexer})})

    batch = Batch([instance1, instance2])
    batch.index_instances(vocab)

    padding_lengths = batch.get_padding_lengths()
    tensor_dict = batch.as_tensor_dict(padding_lengths)
    tokens = tensor_dict["tokens"]
    max_length = max(len(tokens1), len(tokens2))

    # Adds n_segments * 2 special tokens
    segment_concat_length = int(math.ceil(max_length / 4)) * 2 + max_length
    assert tokens["bert"]["token_ids"].shape == (2, segment_concat_length)

    assert tokens["bert"]["mask"].tolist() == [
        [1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 0, 0],
    ]
    assert tokens["bert"]["segment_concat_mask"].tolist() == [
        [1] * segment_concat_length,
        [1] * (segment_concat_length - 4) + [0] * 4,  # 4 is hard-coded length difference
    ]  # Attention mask

    bert_vectors = token_embedder(tokens)
    assert bert_vectors.size() == (2, 9, 768)
def test_end_to_end(self):
    tokenizer = PretrainedTransformerTokenizer(model_name="bert-base-uncased")
    token_indexer = PretrainedTransformerIndexer(model_name="bert-base-uncased")

    sentence1 = "A, AllenNLP sentence."
    tokens1 = tokenizer.tokenize(sentence1)
    expected_tokens1 = ["[CLS]", "a", ",", "allen", "##nl", "##p", "sentence", ".", "[SEP]"]
    assert [t.text for t in tokens1] == expected_tokens1

    sentence2 = "AllenNLP is great"
    tokens2 = tokenizer.tokenize(sentence2)
    expected_tokens2 = ["[CLS]", "allen", "##nl", "##p", "is", "great", "[SEP]"]
    assert [t.text for t in tokens2] == expected_tokens2

    vocab = Vocabulary()

    params = Params({
        "token_embedders": {
            "bert": {
                "type": "pretrained_transformer",
                "model_name": "bert-base-uncased"
            }
        },
        "embedder_to_indexer_map": {
            "bert": ["bert", "mask"]
        },
        "allow_unmatched_keys": True,
    })
    token_embedder = BasicTextFieldEmbedder.from_params(vocab=vocab, params=params)

    instance1 = Instance({"tokens": TextField(tokens1, {"bert": token_indexer})})
    instance2 = Instance({"tokens": TextField(tokens2, {"bert": token_indexer})})

    batch = Batch([instance1, instance2])
    batch.index_instances(vocab)

    padding_lengths = batch.get_padding_lengths()
    tensor_dict = batch.as_tensor_dict(padding_lengths)
    tokens = tensor_dict["tokens"]
    max_length = max(len(tokens1), len(tokens2))

    assert tokens["bert"].shape == (2, max_length)

    assert tokens["mask"].tolist() == [
        [1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 0, 0],
    ]  # Attention mask

    bert_vectors = token_embedder(tokens)
    assert bert_vectors.size() == (2, 9, 768)
class QuestionGenerationDatasetReader(DatasetReader):
    def __init__(self, model_name: str, lazy: bool = False):
        super().__init__(lazy=lazy)
        self.tokenizer = PretrainedTransformerTokenizer(model_name)
        self.token_indexers = {
            'tokens': PretrainedTransformerIndexer(model_name, namespace='tokens')
        }
        # Add the tokens which will mark the answer span
        self.tokenizer.tokenizer.add_tokens([SPAN_START_TOKEN, SPAN_END_TOKEN])

    @overrides
    def _read(self, file_path: str) -> Iterable[Instance]:
        with open(file_path, 'r') as f:
            for line in f:
                data = json.loads(line)
                context = data['context']
                start = data['answer_start']
                end = data['answer_end']
                question = data.pop('question', None)
                metadata = data.pop('metadata', {})
                yield self.text_to_instance(context, start, end, question, metadata)

    def _insert_span_symbols(self, context: str, start: int, end: int) -> str:
        return f'{context[:start]}{SPAN_START_TOKEN} {context[start:end]} {SPAN_END_TOKEN}{context[end:]}'

    @overrides
    def text_to_instance(self,
                         context: str,
                         start: int,
                         end: int,
                         question: Optional[str] = None,
                         metadata: Dict[str, Any] = None) -> Instance:
        fields = {}
        metadata = metadata or {}
        answer = context[start:end]
        marked_context = self._insert_span_symbols(context, start, end)
        source_tokens = self.tokenizer.tokenize(marked_context)
        fields['source_tokens'] = TextField(source_tokens, self.token_indexers)
        metadata['answer'] = answer
        metadata['answer_start'] = start
        metadata['answer_end'] = end
        metadata['context'] = context
        metadata['marked_context'] = marked_context
        metadata['source_tokens'] = source_tokens
        if question is not None:
            target_tokens = self.tokenizer.tokenize(question)
            fields['target_tokens'] = TextField(target_tokens, self.token_indexers)
            metadata['question'] = question
            metadata['target_tokens'] = target_tokens
        fields['metadata'] = MetadataField(metadata)
        return Instance(fields)
def main():
    tokenizer = PretrainedTransformerTokenizer(model_name=BERT_MODEL, add_special_tokens=False)

    result = tokenizer.tokenize('The best movie ever!')
    print(result)

    reader = SnliReader(tokenizer=tokenizer)
    for instance in reader.read(
            'https://realworldnlpbook.s3.amazonaws.com/data/snli/snli_1.0_dev.jsonl'):
        print(instance)
def __init__(self,
             token_indexers: Dict[str, TokenIndexer] = None,
             balance_classes=False,
             **kwargs):
    super().__init__(**kwargs)
    # max_length ensures that we truncate the input
    self._tokenizer = PretrainedTransformerTokenizer(
        model_name="roberta-base", max_length=TRANSFORMER_WORDPIECE_LIMIT)
    self._token_indexers = token_indexers
    self.balance_classes = balance_classes
def test_splits_into_wordpieces(self):
    tokenizer = PretrainedTransformerTokenizer('bert-base-cased', do_lowercase=False)
    sentence = "A, [MASK] AllenNLP sentence."
    tokens = [t.text for t in tokenizer.tokenize(sentence)]
    expected_tokens = [
        "[CLS]", "A", ",", "[MASK]", "Allen", "##NL", "##P", "sentence", ".", "[SEP]"
    ]
    assert tokens == expected_tokens
def __init__(self, model_name: str, lazy: bool = False):
    super().__init__(lazy=lazy)
    self.tokenizer = PretrainedTransformerTokenizer(model_name)
    self.token_indexers = {
        'tokens': PretrainedTransformerIndexer(model_name, namespace='tokens')
    }
    # Add the tokens which will mark the answer span
    self.tokenizer.tokenizer.add_tokens([SPAN_START_TOKEN, SPAN_END_TOKEN])