def test_max_length(self):
    """Setting `max_length` truncates the output to exactly that many tokens."""
    tokenizer = PretrainedTransformerTokenizer(
        "bert-base-cased", max_length=10, add_special_tokens=False
    )
    text = "hi there, this should be at least 10 tokens, but some will be truncated"
    assert len(tokenizer.tokenize(text)) == 10
def test_no_max_length(self):
    """With `max_length=None` nothing is truncated, even past BERT's 512 limit."""
    tokenizer = PretrainedTransformerTokenizer(
        "bert-base-cased", max_length=None, add_special_tokens=False
    )
    # Even though the bert model has a max input length of 512, when we tokenize
    # with `max_length = None`, we should not get any truncation.
    tokens = tokenizer.tokenize(" ".join(["a"] * 550))
    assert len(tokens) == 550
def test_end_to_end(self):
    """Tokenize, index, batch, and embed two sentences end to end."""
    tokenizer = PretrainedTransformerTokenizer(model_name="bert-base-uncased")
    token_indexer = PretrainedTransformerIndexer(model_name="bert-base-uncased")

    sentence1 = "A, AllenNLP sentence."
    sentence2 = "AllenNLP is great"
    tokens1 = tokenizer.tokenize(sentence1)
    tokens2 = tokenizer.tokenize(sentence2)
    # The tokenizer inserts the special tokens itself.
    assert [t.text for t in tokens1] == [
        "[CLS]", "a", ",", "allen", "##nl", "##p", "sentence", ".", "[SEP]"
    ]
    assert [t.text for t in tokens2] == [
        "[CLS]", "allen", "##nl", "##p", "is", "great", "[SEP]"
    ]

    vocab = Vocabulary()
    params = Params(
        {
            "token_embedders": {
                "bert": {"type": "pretrained_transformer", "model_name": "bert-base-uncased"}
            }
        }
    )
    token_embedder = BasicTextFieldEmbedder.from_params(vocab=vocab, params=params)

    batch = Batch(
        [
            Instance({"tokens": TextField(tokens1, {"bert": token_indexer})}),
            Instance({"tokens": TextField(tokens2, {"bert": token_indexer})}),
        ]
    )
    batch.index_instances(vocab)
    tensor_dict = batch.as_tensor_dict(batch.get_padding_lengths())

    tokens = tensor_dict["tokens"]
    longest = max(len(tokens1), len(tokens2))
    assert tokens["bert"]["token_ids"].shape == (2, longest)
    # Attention mask: the shorter instance is padded with two False positions.
    assert tokens["bert"]["mask"].tolist() == [
        [True] * 9,
        [True] * 7 + [False, False],
    ]
    bert_vectors = token_embedder(tokens)
    assert bert_vectors.size() == (2, 9, 768)
def test_splits_into_wordpieces(self):
    """Cased BERT wordpiece splitting keeps case and special tokens intact."""
    tokenizer = PretrainedTransformerTokenizer('bert-base-cased', do_lowercase=False)
    sentence = "A, [MASK] AllenNLP sentence."
    expected_tokens = [
        "[CLS]", "A", ",", "[MASK]", "Allen", "##NL", "##P", "sentence", ".", "[SEP]"
    ]
    assert [t.text for t in tokenizer.tokenize(sentence)] == expected_tokens
def main():
    """Demonstrate transformer tokenization, then read the SNLI dev set."""
    tokenizer = PretrainedTransformerTokenizer(model_name=BERT_MODEL, add_special_tokens=False)
    print(tokenizer.tokenize('The best movie ever!'))

    reader = SnliReader(tokenizer=tokenizer)
    snli_dev_url = 'https://realworldnlpbook.s3.amazonaws.com/data/snli/snli_1.0_dev.jsonl'
    for instance in reader.read(snli_dev_url):
        print(instance)
def test_token_idx_wikipedia(self):
    """Character offsets recover by the end of hard-to-align unicode text."""
    sentence = (
        "Tokyo (東京 Tōkyō, English: /ˈtoʊkioʊ/,[7] Japanese: [toːkʲoː]), officially "
        "Tokyo Metropolis (東京都 Tōkyō-to), is one of the 47 prefectures of Japan."
    )
    for tokenizer_name in ["roberta-base", "bert-base-uncased", "bert-base-cased"]:
        tokenized = PretrainedTransformerTokenizer(tokenizer_name).tokenize(sentence)
        # The final "." (just before the end-of-sequence special token) must
        # map back to the last character of the input.
        final_period = tokenized[-2]
        assert final_period.text == "."
        assert final_period.idx == len(sentence) - 1
def test_token_idx_wikipedia(self):
    # This will produce lots of problems with the index calculation. We check whether it catches back up at the
    # end.
    sentence = "Tokyo (東京 Tōkyō, English: /ˈtoʊkioʊ/,[7] Japanese: [toːkʲoː]), officially Tokyo Metropolis (東京都 Tōkyō-to), is one of the 47 prefectures of Japan."
    for tokenizer_name in ["roberta-base", "bert-base-uncased", "bert-base-cased"]:
        tokenizer = PretrainedTransformerTokenizer(
            tokenizer_name, calculate_character_offsets=True
        )
        # The token before the trailing special token must be the final period,
        # aligned to the last character of the raw string.
        last_real_token = tokenizer.tokenize(sentence)[-2]
        assert last_real_token.text == "."
        assert last_real_token.idx == len(sentence) - 1
def test_transformers_vocab_sizes(self, model_name):
    """After indexing once, the namespace holds the full transformers vocab."""
    namespace = "tags"
    tokenizer = cached_transformers.get_tokenizer(model_name)
    allennlp_tokenizer = PretrainedTransformerTokenizer(model_name)
    indexer = PretrainedTransformerIndexer(model_name=model_name, namespace=namespace)
    vocab = Vocabulary()
    # here we copy entire transformers vocab
    indexed = indexer.tokens_to_indices(
        allennlp_tokenizer.tokenize("AllenNLP is great!"), vocab
    )
    del indexed
    assert vocab.get_vocab_size(namespace=namespace) == tokenizer.vocab_size
def test_as_array_produces_token_sequence_roberta(self):
    """Our indexer reproduces transformers' own ids for a RoBERTa sequence."""
    tokenizer = AutoTokenizer.from_pretrained("roberta-base")
    allennlp_tokenizer = PretrainedTransformerTokenizer("roberta-base")
    indexer = PretrainedTransformerIndexer(model_name="roberta-base")
    # Reference ids come from tokenizing with the specials spelled out.
    expected_ids = tokenizer.convert_tokens_to_ids(
        tokenizer.tokenize("<s> AllenNLP is great </s>")
    )
    # tokens tokenized with our pretrained tokenizer have indices in them
    allennlp_tokens = allennlp_tokenizer.tokenize("AllenNLP is great")
    indexed = indexer.tokens_to_indices(allennlp_tokens, Vocabulary(), "key")
    assert indexed["key"] == expected_ids
def test_transformers_vocabs_added_correctly(self):
    """After indexing, the namespace mirrors RoBERTa's token->id encoder."""
    namespace, model_name = "tags", "roberta-base"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    allennlp_tokenizer = PretrainedTransformerTokenizer(model_name)
    indexer = PretrainedTransformerIndexer(model_name=model_name, namespace=namespace)
    vocab = Vocabulary()
    # here we copy entire transformers vocab
    indexed = indexer.tokens_to_indices(
        allennlp_tokenizer.tokenize("AllenNLP is great!"), vocab
    )
    del indexed
    assert vocab.get_token_to_index_vocabulary(namespace=namespace) == tokenizer.encoder
def check_vocab_size(model_name: str):
    """Assert that indexing copies `model_name`'s full vocab into the namespace."""
    namespace = "tags"
    reference_tokenizer = AutoTokenizer.from_pretrained(model_name)
    allennlp_tokenizer = PretrainedTransformerTokenizer(model_name)
    indexer = PretrainedTransformerIndexer(model_name=model_name, namespace=namespace)
    vocab = Vocabulary()
    # here we copy entire transformers vocab
    indexed = indexer.tokens_to_indices(
        allennlp_tokenizer.tokenize("AllenNLP is great!"), vocab
    )
    del indexed
    assert vocab.get_vocab_size(namespace=namespace) == reference_tokenizer.vocab_size
def test_as_array_produces_token_sequence_bert_cased(self):
    """Our indexer reproduces transformers' own ids for a cased BERT sequence."""
    tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
    allennlp_tokenizer = PretrainedTransformerTokenizer("bert-base-cased")
    indexer = PretrainedTransformerIndexer(model_name="bert-base-cased")
    # Reference ids come from tokenizing with the specials spelled out.
    expected_ids = tokenizer.convert_tokens_to_ids(
        tokenizer.tokenize("[CLS] AllenNLP is great [SEP]")
    )
    # tokens tokenized with our pretrained tokenizer have indices in them
    allennlp_tokens = allennlp_tokenizer.tokenize("AllenNLP is great")
    indexed = indexer.tokens_to_indices(allennlp_tokens, Vocabulary())
    assert indexed["token_ids"] == expected_ids
class MockOldDatasetReader(DatasetReader):
    """Toy seq2seq reader that fabricates ten source/target instance pairs."""

    def __init__(self, model: str = "epwalsh/bert-xsmall-dummy", **kwargs) -> None:
        super().__init__(**kwargs)
        self.tokenizer = PretrainedTransformerTokenizer(model)
        self.token_indexers = {"tokens": PretrainedTransformerIndexer(model)}

    def _read(self, file_path: str):
        # The file path is ignored; instances are generated on the fly.
        for i in range(10):
            yield self.text_to_instance(
                f"Hi there, I'm the {i}th instance",
                f"Hello, {i}th instance!",
            )

    def text_to_instance(self, source: str, target: str = None) -> Instance:  # type: ignore
        fields = {
            "source": TextField(self.tokenizer.tokenize(source), self.token_indexers)  # type: ignore
        }
        if target is not None:
            fields["target"] = TextField(self.tokenizer.tokenize(target), self.token_indexers)  # type: ignore
        return Instance(fields)  # type: ignore
def test_tokenizer_kwargs_default(self):
    """Without tokenizer_kwargs, a cased model preserves capitalization."""
    text = "Hello there! General Kenobi."
    tokenizer = PretrainedTransformerTokenizer("bert-base-cased")
    expected = [
        "[CLS]",
        "Hello",
        "there",
        "!",
        "General",
        "Ken",
        "##ob",
        "##i",
        ".",
        "[SEP]",
    ]
    assert [token.text for token in tokenizer.tokenize(text)] == expected
def test_splits_uncased_bert(self):
    """Uncased BERT lowercases everything except special tokens like [MASK]."""
    sentence = "A, [MASK] AllenNLP sentence."
    tokenizer = PretrainedTransformerTokenizer("bert-base-uncased")
    produced = [t.text for t in tokenizer.tokenize(sentence)]
    assert produced == [
        "[CLS]",
        "a",
        ",",
        "[MASK]",
        "allen",
        "##nl",
        "##p",
        "sentence",
        ".",
        "[SEP]",
    ]
def test_mask(self):
    """Mask is all ones for real tokens and padded with zeros to max length."""
    allennlp_tokenizer = PretrainedTransformerTokenizer("bert-base-uncased")
    indexer = PretrainedTransformerIndexer(model_name="bert-base-uncased")
    allennlp_tokens = allennlp_tokenizer.tokenize("AllenNLP is great")
    indexed = indexer.tokens_to_indices(allennlp_tokens, Vocabulary())

    num_real = len(indexed["token_ids"])
    assert indexed["mask"] == [1] * num_real

    max_length = 10
    padded_tokens = indexer.as_padded_tensor_dict(
        indexed, {"token_ids": max_length, "mask": max_length}
    )
    # Padding extends the mask with zeros out to max_length.
    expected = [1] * num_real + [0] * (max_length - len(indexed["mask"]))
    assert len(padded_tokens["mask"]) == max_length
    assert padded_tokens["mask"].tolist() == expected
def test_splits_roberta(self):
    """RoBERTa BPE marks leading spaces with 'Ġ' and keeps <mask> whole."""
    tokenizer = PretrainedTransformerTokenizer("roberta-base")
    sentence = "A, <mask> AllenNLP sentence."
    produced = [t.text for t in tokenizer.tokenize(sentence)]
    assert produced == [
        "<s>",
        "ĠA",
        ",",
        "<mask>",
        "ĠAllen",
        "N",
        "LP",
        "Ġsentence",
        ".",
        "</s>",
    ]
def test_splits_cased_bert(self):
    """Cased BERT wordpiece splitting, for a single sentence and a pair."""
    tokenizer = PretrainedTransformerTokenizer("bert-base-cased")

    # single sentence
    sentence = "A, [MASK] AllenNLP sentence."
    expected_tokens = [
        "[CLS]",
        "A",
        ",",
        "[MASK]",
        "Allen",
        "##NL",
        "##P",
        "sentence",
        ".",
        "[SEP]",
    ]
    assert [t.text for t in tokenizer.tokenize(sentence)] == expected_tokens

    # sentence pair
    sentence_1 = "A, [MASK] AllenNLP sentence."
    sentence_2 = "A sentence."
    expected_tokens = [
        "[CLS]",
        "A",
        ",",
        "[MASK]",
        "Allen",
        "##NL",
        "##P",
        "sentence",
        ".",
        "[SEP]",
        "A",
        "sentence",
        ".",
        "[SEP]",
    ]
    pair_tokens = tokenizer.tokenize_sentence_pair(sentence_1, sentence_2)
    assert [t.text for t in pair_tokens] == expected_tokens
class TransformersTokenizer(Tokenizer):
    """This tokenizer uses the pretrained tokenizers from huggingface's
    transformers library.

    This means the output will very likely be word pieces depending on the
    specified pretrained model.

    Parameters
    ----------
    config
        A `TokenizerConfiguration` object
    """

    def __init__(self, config):
        super().__init__(config)
        # Wrapped allennlp tokenizer; construction kwargs (e.g. the model
        # name) come straight from the configuration.
        self.pretrained_tokenizer = PretrainedTransformerTokenizer(
            **config.transformers_kwargs
        )

    def tokenize_document(self, document: List[str]) -> List[List[Token]]:
        """Tokenize every text in `document` into one token list each.

        Texts are cleaned and truncated first. When the configuration requests
        sentence segmentation, texts are split into sentences with spaCy and
        length-filtered before tokenization.
        """
        # Clean each text and cap its length at `truncate_input` characters.
        texts = [
            self.text_cleaning(text)[: self.config.truncate_input]
            for text in document
        ]
        if not self.config.segment_sentences:
            # One token list per cleaned text, capped at `max_nr_of_sentences`.
            return list(map(self._tokenize, texts[: self.config.max_nr_of_sentences]))
        # Segment with spaCy; keep only sentences whose stripped length lies
        # strictly between the configured bounds, and truncate each kept
        # sentence to `truncate_sentence` characters.
        sentences = [
            sentence.text.strip()[: self.config.truncate_sentence]
            for doc in self.__nlp__.pipe(texts)
            for sentence in doc.sents
            if (
                self.config.min_sentence_length
                < len(sentence.text.strip())
                < self.config.max_sentence_length
            )
        ]
        return list(map(self._tokenize, sentences[: self.config.max_nr_of_sentences]))

    def _tokenize(self, text: str) -> List[Token]:
        """Tokenize a single text with the wrapped pretrained tokenizer."""
        return self.pretrained_tokenizer.tokenize(text)

    @property
    def nlp(self) -> Language:
        # Tokenization is delegated to transformers, so there is no spaCy
        # pipeline to expose here.
        raise NotImplementedError("For the TransformerTokenizer we have no spaCy nlp")
def test_tokenizer_kwargs_forced_lowercase(self):
    """Passing do_lower_case=True forces lowercasing on a cased model."""
    text = "Hello there! General Kenobi."
    forced_lowercase_tokenizer = PretrainedTransformerTokenizer(
        "bert-base-cased", tokenizer_kwargs={"do_lower_case": True}
    )
    # The wrapper should detect that the underlying tokenizer lowercases.
    assert forced_lowercase_tokenizer._tokenizer_lowercases
    lowercase_tokens = [
        "[CLS]",
        "hello",
        "there",
        "!",
        "general",
        "k",
        "##eno",
        "##bi",
        ".",
        "[SEP]",
    ]
    produced = [token.text for token in forced_lowercase_tokenizer.tokenize(text)]
    assert produced == lowercase_tokens
def test_token_idx_bert_uncased(self):
    """Character offsets for uncased BERT, which strips accents."""
    sentence = "A, naïve [MASK] AllenNLP sentence."
    expected_tokens = [
        "[CLS]",
        "a",
        ",",
        "naive",  # BERT normalizes this away
        "[MASK]",
        "allen",
        "##nl",
        "##p",
        "sentence",
        ".",
        "[SEP]",
    ]
    # Special tokens carry no offset (None); wordpieces point into `sentence`.
    expected_idxs = [None, 0, 1, 3, 9, 16, 21, 23, 25, 33, None]
    tokenized = PretrainedTransformerTokenizer("bert-base-uncased").tokenize(sentence)
    assert [t.text for t in tokenized] == expected_tokens
    assert [t.idx for t in tokenized] == expected_idxs
def test_token_idx_roberta(self):
    """Character offsets for RoBERTa's byte-level BPE."""
    sentence = "A, naïve <mask> AllenNLP sentence."
    expected_tokens = [
        "<s>",
        "ĠA",
        ",",
        "Ġnaïve",  # RoBERTa mangles this. Or maybe it "encodes"?
        "<mask>",
        "ĠAllen",
        "N",
        "LP",
        "Ġsentence",
        ".",
        "</s>",
    ]
    # Special tokens carry no offset (None); wordpieces point into `sentence`.
    expected_idxs = [None, 0, 1, 3, 9, 16, 21, 22, 25, 33, None]
    tokenized = PretrainedTransformerTokenizer("roberta-base").tokenize(sentence)
    assert [t.text for t in tokenized] == expected_tokens
    assert [t.idx for t in tokenized] == expected_idxs
class CustomTextDatasetReader(DatasetReader):
    """Reads a headerless two-column CSV (reviews, labels) into instances.

    Text is tokenized with RoBERTa; `max_length` on the tokenizer ensures
    overly long inputs are truncated to the transformer's wordpiece limit.
    """

    def __init__(self, token_indexers: Dict[str, TokenIndexer] = None, balance_classes=False, **kwargs):
        self.line = 0
        super().__init__(**kwargs)
        # max_length ensures that we truncate the input
        self._tokenizer = PretrainedTransformerTokenizer(
            model_name="roberta-base", max_length=TRANSFORMER_WORDPIECE_LIMIT
        )
        self._token_indexers = token_indexers
        self.balance_classes = balance_classes

    @overrides
    def text_to_instance(self, doc, label=None):
        """Tokenize one document and wrap it (and the optional label) in fields.

        Raises if tokenization yields nothing, since an empty TextField would
        break downstream batching.
        """
        fields: Dict[str, Field] = {}
        tokens = self._tokenizer.tokenize(doc)
        # BUG FIX: the original checked `len(tokens) == 0 or tokens is None`,
        # which calls len(None) and raises TypeError before the None check can
        # run. `not tokens` covers both None and the empty list.
        if not tokens:
            print("Data contains empty examples, needs fixing...")
            raise Exception
        fields["tokens"] = TextField(tokens, token_indexers=self._token_indexers)
        if label is not None:
            fields["label"] = LabelField(label)
        return Instance(fields)

    @overrides
    def _read(self, filepath):
        """Yield one instance per CSV row (columns: reviews, labels)."""
        # Read the whole file eagerly so the handle is closed before we start
        # yielding; a generator that yields inside `with` would otherwise hold
        # the file open for as long as iteration is suspended.
        with open(filepath) as f:
            data = pd.read_csv(f, header=None, names=['reviews', 'labels'])
        for i, (idx, row) in enumerate(data.iterrows()):
            doc = row['reviews']
            label = str(row['labels'])
            instance = self.text_to_instance(doc, label)
            if instance is not None:
                yield instance
def test_long_sequence_splitting(self):
    """Sequences longer than max_length are split into [CLS]...[SEP] segments."""
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    allennlp_tokenizer = PretrainedTransformerTokenizer("bert-base-uncased")
    indexer = PretrainedTransformerIndexer(model_name="bert-base-uncased", max_length=4)

    expected_ids = tokenizer.convert_tokens_to_ids(
        tokenizer.tokenize("[CLS] AllenNLP is great [SEP]")
    )
    assert len(expected_ids) == 7  # just to make sure it's what we're expecting
    cls_id, sep_id = expected_ids[0], expected_ids[-1]
    # With max_length=4, each segment carries 2 real wordpieces plus the
    # surrounding specials, so the 5 real pieces fold into 3 segments.
    expected_ids = (
        expected_ids[:3]
        + [sep_id, cls_id]
        + expected_ids[3:5]
        + [sep_id, cls_id]
        + expected_ids[5:]
    )

    allennlp_tokens = allennlp_tokenizer.tokenize("AllenNLP is great")
    indexed = indexer.tokens_to_indices(allennlp_tokens, Vocabulary())
    assert indexed["token_ids"] == expected_ids
    assert indexed["segment_concat_mask"] == [1] * len(expected_ids)
    assert indexed["mask"] == [1] * 7  # original length
def test_token_idx_roberta(self):
    """Offsets with explicit character-offset calculation for RoBERTa."""
    sentence = "A, naïve <mask> AllenNLP sentence."
    expected_tokens = [
        "<s>",
        "A",
        ",",
        "Ġnaïve",  # RoBERTa has a funny way of encoding combining characters.
        "<mask>",
        "Allen",
        "N",
        "LP",
        "Ġsentence",
        ".",
        "</s>",
    ]
    # The mangled token gets no offset (None); the rest map into `sentence`.
    expected_idxs = [None, 0, 1, None, 9, 16, 21, 22, 25, 33, None]
    tokenizer = PretrainedTransformerTokenizer(
        "roberta-base", calculate_character_offsets=True
    )
    tokenized = tokenizer.tokenize(sentence)
    assert [t.text for t in tokenized] == expected_tokens
    assert [t.idx for t in tokenized] == expected_idxs
def test_token_idx_bert_cased(self):
    """Character offsets for cased BERT, which splits accented characters."""
    sentence = "A, naïve [MASK] AllenNLP sentence."
    expected_tokens = [
        "[CLS]",
        "A",
        ",",
        "na",
        "##ï",
        "##ve",
        "[MASK]",
        "Allen",
        "##NL",
        "##P",
        "sentence",
        ".",
        "[SEP]",
    ]
    # Special tokens carry no offset (None); wordpieces point into `sentence`.
    expected_idxs = [None, 0, 1, 3, 5, 6, 9, 16, 21, 23, 25, 33, None]
    tokenized = PretrainedTransformerTokenizer("bert-base-cased").tokenize(sentence)
    assert [t.text for t in tokenized] == expected_tokens
    assert [t.idx for t in tokenized] == expected_idxs
class PretrainedTransformerIndexer(TokenIndexer):
    """
    This `TokenIndexer` assumes that Tokens already have their indexes in them (see `text_id` field).
    We still require `model_name` because we want to form allennlp vocabulary from pretrained one.
    This `Indexer` is only really appropriate to use if you've also used a corresponding
    :class:`PretrainedTransformerTokenizer` to tokenize your input. Otherwise you'll
    have a mismatch between your tokens and your vocabulary, and you'll get a lot of UNK tokens.

    Registered as a `TokenIndexer` with name "pretrained_transformer".

    # Parameters

    model_name : `str`
        The name of the `transformers` model to use.
    namespace : `str`, optional (default=`tags`)
        We will add the tokens in the pytorch_transformer vocabulary to this vocabulary namespace.
        We use a somewhat confusing default value of `tags` so that we do not add padding or UNK
        tokens to this namespace, which would break on loading because we wouldn't find our default
        OOV token.
    max_length : `int`, optional (default = `None`)
        If not None, split the document into segments of this many tokens (including special tokens)
        before feeding into the embedder. The embedder embeds these segments independently and
        concatenate the results to get the original document representation. Should be set to
        the same value as the `max_length` option on the `PretrainedTransformerEmbedder`.
    """

    def __init__(self,
                 model_name: str,
                 namespace: str = "tags",
                 max_length: int = None,
                 **kwargs) -> None:
        super().__init__(**kwargs)
        self._namespace = namespace
        self._allennlp_tokenizer = PretrainedTransformerTokenizer(model_name)
        self._tokenizer = self._allennlp_tokenizer.tokenizer
        # Set lazily: the transformers vocab is copied into the allennlp
        # Vocabulary on the first call to tokens_to_indices().
        self._added_to_vocabulary = False
        self._num_added_start_tokens = len(
            self._allennlp_tokenizer.single_sequence_start_tokens)
        self._num_added_end_tokens = len(
            self._allennlp_tokenizer.single_sequence_end_tokens)
        self._max_length = max_length
        if self._max_length is not None:
            # Tokenizing a one-character input reveals how many special tokens
            # the tokenizer wraps around a single sequence.
            num_added_tokens = len(self._allennlp_tokenizer.tokenize("a")) - 1
            self._effective_max_length = (  # we need to take into account special tokens
                self._max_length - num_added_tokens)
            if self._effective_max_length <= 0:
                raise ValueError(
                    "max_length needs to be greater than the number of special tokens inserted."
                )

    def _add_encoding_to_vocabulary_if_needed(self, vocab: Vocabulary) -> None:
        """
        Copies tokens from ```transformers``` model's vocab to the specified namespace.
        """
        if self._added_to_vocabulary:
            return
        try:
            vocab_items = self._tokenizer.get_vocab().items()
        except NotImplementedError:
            # Some tokenizers don't support get_vocab(); reconstruct the
            # (token, id) pairs one id at a time instead.
            vocab_items = ((self._tokenizer.convert_ids_to_tokens(idx), idx)
                           for idx in range(self._tokenizer.vocab_size))
        for word, idx in vocab_items:
            # Write directly into the vocabulary's internals so the pretrained
            # ids are preserved exactly (no re-numbering).
            vocab._token_to_index[self._namespace][word] = idx
            vocab._index_to_token[self._namespace][idx] = word
        self._added_to_vocabulary = True

    @overrides
    def count_vocab_items(self, token: Token, counter: Dict[str, Dict[str, int]]):
        # If we only use pretrained models, we don't need to do anything here.
        pass

    @overrides
    def tokens_to_indices(self, tokens: List[Token],
                          vocabulary: Vocabulary) -> IndexedTokenList:
        self._add_encoding_to_vocabulary_if_needed(vocabulary)
        indices, type_ids = self._extract_token_and_type_ids(tokens)
        # The mask has 1 for real tokens and 0 for padding tokens. Only real tokens are attended to.
        output: IndexedTokenList = {
            "token_ids": indices,
            "mask": [True] * len(indices),
            "type_ids": type_ids,
        }
        return self._postprocess_output(output)

    @overrides
    def indices_to_tokens(self, indexed_tokens: IndexedTokenList,
                          vocabulary: Vocabulary) -> List[Token]:
        """Inverse of `tokens_to_indices`: rebuild Token objects from ids."""
        token_ids = indexed_tokens["token_ids"]
        type_ids = indexed_tokens.get("type_ids")
        return [
            Token(
                text=vocabulary.get_token_from_index(token_ids[i], self._namespace),
                text_id=token_ids[i],
                type_id=type_ids[i] if type_ids is not None else None,
            ) for i in range(len(token_ids))
        ]

    def _extract_token_and_type_ids(
            self, tokens: List[Token]) -> Tuple[List[int], Optional[List[int]]]:
        """
        Roughly equivalent to `zip(*[(token.text_id, token.type_id) for token in tokens])`,
        with some checks.
        """
        indices: List[int] = []
        type_ids: List[int] = []
        for token in tokens:
            if getattr(token, "text_id", None) is not None:
                # `text_id` being set on the token means that we aren't using the vocab, we just use
                # this id instead. Id comes from the pretrained vocab.
                # It is computed in PretrainedTransformerTokenizer.
                indices.append(token.text_id)
            else:
                raise KeyError(
                    "Using PretrainedTransformerIndexer but field text_id is not set"
                    f" for the following token: {token.text}")
            if type_ids is not None and getattr(token, "type_id", None) is not None:
                type_ids.append(token.type_id)
            else:
                # Tokens with no type_id default to segment 0.
                type_ids.append(0)
        return indices, type_ids

    def _postprocess_output(self, output: IndexedTokenList) -> IndexedTokenList:
        """
        Takes an IndexedTokenList about to be returned by `tokens_to_indices()` and adds any
        necessary postprocessing, e.g. long sequence splitting.

        The input should have a `"token_ids"` key corresponding to the token indices. They should
        have special tokens already inserted.
        """
        if self._max_length is not None:
            # We prepare long indices by converting them to (assuming max_length == 5)
            # [CLS] A B C [SEP] [CLS] D E F [SEP] ...
            # Embedder is responsible for folding this 1-d sequence to 2-d and feed to the
            # transformer model.
            # TODO(zhaofengw): we aren't respecting word boundaries when segmenting wordpieces.
            indices = output["token_ids"]
            # Strips original special tokens
            # NOTE(review): when _num_added_end_tokens is 0, `[-0]` slices to
            # an empty list — presumably every supported tokenizer adds at
            # least one end token; confirm.
            indices = indices[self._num_added_start_tokens:-self._num_added_end_tokens]
            # Folds indices
            folded_indices = [
                indices[i:i + self._effective_max_length]
                for i in range(0, len(indices), self._effective_max_length)
            ]
            # Adds special tokens to each segment
            folded_indices = [
                self._tokenizer.build_inputs_with_special_tokens(segment)
                for segment in folded_indices
            ]
            # Flattens
            indices = [i for segment in folded_indices for i in segment]
            output["token_ids"] = indices
            output["type_ids"] = [0] * len(indices)
            output["segment_concat_mask"] = [True] * len(indices)
        return output

    @overrides
    def get_empty_token_list(self) -> IndexedTokenList:
        output: IndexedTokenList = {"token_ids": [], "mask": [], "type_ids": []}
        if self._max_length is not None:
            output["segment_concat_mask"] = []
        return output

    @overrides
    def as_padded_tensor_dict(
            self, tokens: IndexedTokenList,
            padding_lengths: Dict[str, int]) -> Dict[str, torch.Tensor]:
        """Pad every key to its requested length and convert to tensors."""
        tensor_dict = {}
        for key, val in tokens.items():
            # Choose a padding value and tensor type per key: masks pad with
            # False as bools, type ids with 0, everything else with the
            # tokenizer's pad id as longs.
            if key == "type_ids":
                padding_value = 0
                mktensor = torch.LongTensor
            elif key == "mask" or key == "wordpiece_mask":
                padding_value = False
                mktensor = torch.BoolTensor
            elif len(val) > 0 and isinstance(val[0], bool):
                padding_value = False
                mktensor = torch.BoolTensor
            else:
                padding_value = self._tokenizer.pad_token_id
                if padding_value is None:
                    padding_value = (
                        0  # Some tokenizers don't have padding tokens and rely on the mask only.
                    )
                mktensor = torch.LongTensor
            tensor = mktensor(
                pad_sequence_to_length(val, padding_lengths[key],
                                       default_value=lambda: padding_value))
            tensor_dict[key] = tensor
        return tensor_dict

    def __eq__(self, other):
        if isinstance(other, PretrainedTransformerIndexer):
            for key in self.__dict__:
                if key == "_tokenizer":
                    # This is a reference to a function in the huggingface code, which we can't
                    # really modify to make this clean. So we special-case it.
                    continue
                if self.__dict__[key] != other.__dict__[key]:
                    return False
            return True
        return NotImplemented
class TweetCandidateSpanDatasetReader(DatasetReader):
    """Reads tweet records (JSON Lines) with candidate selected-text spans.

    Each instance pairs the tweet text with its sentiment as one transformer
    sequence, attaches the candidate spans, and — when the gold span is
    available — a label pointing at the correct candidate.
    """

    def __init__(
        self,
        lazy: bool = False,
        cache_directory: Optional[str] = None,
        max_instances: Optional[int] = None,
        min_num_candidate: int = 3,
        max_num_candidate: int = 5,
        transformer_model_name_or_archive_path: str = "bert-base-uncased",
    ) -> None:
        super().__init__(lazy=lazy,
                         cache_directory=cache_directory,
                         max_instances=max_instances)
        if "tar.gz" in transformer_model_name_or_archive_path:
            # A trained-model archive was given: recover the tokenizer's model
            # name from the archived config rather than treating the path
            # itself as a model name.
            config = extract_config_from_archive(
                transformer_model_name_or_archive_path)
            model_name = config.as_dict(
            )["dataset_reader"]["tokenizer"]["model_name"]
        else:
            model_name = transformer_model_name_or_archive_path
        # Special tokens are added later by add_special_tokens() when the text
        # and sentiment are combined into one sequence.
        self._tokenizer = PretrainedTransformerTokenizer(
            model_name=model_name, add_special_tokens=False)
        self._tokenindexer = PretrainedTransformerIndexer(
            model_name=model_name)
        self._min_num_candidate = min_num_candidate
        self._max_num_candidate = max_num_candidate

    def _read(self, file_path: str) -> Iterable[Instance]:
        """Yield instances for rows that have a non-empty selected_text and
        at least `min_num_candidate` candidate spans."""
        file_path = cached_path(file_path)
        df = pd.read_json(file_path, lines=True)
        for record in df.to_dict("records"):
            if record["selected_text"]:
                text = record["text"]
                if not isinstance(text, str):
                    continue
                elif text.strip() == "":
                    continue
                elif len(record["candidate_spans"]) < self._min_num_candidate:
                    continue
                else:
                    # NOTE(review): the leading space presumably keeps
                    # character offsets consistent with how the candidate
                    # spans were computed — confirm against the span source.
                    yield self.text_to_instance(
                        " " + text.strip(),
                        record["sentiment"],
                        record["candidate_spans"],
                        record["textID"],
                        record.get("selected_text"),
                        record.get("selected_text_span"),
                    )

    def text_to_instance(
        self,
        text: str,
        sentiment: str,
        candidate_spans: list,
        text_id: Optional[str] = None,
        selected_text: Optional[str] = None,
        selected_text_span: Optional[tuple] = None,
    ) -> Instance:
        """Build one instance: text+sentiment tokens, candidate spans,
        optional gold label, and metadata for evaluation."""
        fields = {}
        text_tokens = self._tokenizer.tokenize(text)
        sentiment_tokens = self._tokenizer.tokenize(sentiment)
        # Join text and sentiment into a single special-token-delimited pair.
        text_with_sentiment_tokens = self._tokenizer.add_special_tokens(
            text_tokens, sentiment_tokens)
        fields["text_with_sentiment"] = TextField(
            text_with_sentiment_tokens, {"tokens": self._tokenindexer})
        # Keep at most `max_num_candidate` spans; convert to tuples so they
        # compare equal to the (tuple-converted) gold span below.
        candidate_spans = [
            tuple(i) for i in candidate_spans[:self._max_num_candidate]
        ]
        additional_metadata = {}
        if selected_text_span is not None:
            selected_text_span = tuple(selected_text_span)
            additional_metadata["selected_text_span"] = selected_text_span
            if selected_text_span not in candidate_spans:
                # Gold span missed by the candidate generator: append it so it
                # remains selectable, and record that the candidates alone did
                # not contain the truth.
                candidate_spans.append(selected_text_span)
                fields["label"] = LabelField(len(candidate_spans) - 1,
                                             skip_indexing=True)
                have_truth = False
            else:
                fields["label"] = LabelField(
                    candidate_spans.index(selected_text_span),
                    skip_indexing=True)
                have_truth = True
            additional_metadata["have_truth"] = have_truth
            additional_metadata["candidate_num"] = len(candidate_spans)
        fields["candidate_span_pairs"] = SpanPairsField(
            candidate_spans, fields["text_with_sentiment"])
        metadata = {
            "text": text,
            "sentiment": sentiment,
            "selected_text": selected_text,
            "text_with_sentiment_tokens": text_with_sentiment_tokens
        }
        if text_id is not None:
            metadata["text_id"] = text_id
        if additional_metadata:
            metadata.update(additional_metadata)
        fields["metadata"] = MetadataField(metadata)
        return Instance(fields)

    def span_to_str(self, text, span_start, span_end):
        """Map a token-level span back into a substring of the original text."""
        text_tokens = self._tokenizer.tokenize(text)
        text_tokens = self._tokenizer.add_special_tokens(text_tokens)
        return span_tokens_to_text(text, text_tokens, span_start, span_end)
class TransformerSuperGlueRteReader(DatasetReader):
    """
    Dataset reader for the SuperGLUE Recognizing Textual Entailment task, to be used with a
    transformer model such as RoBERTa. The dataset is in the JSON Lines format.

    It will generate `Instances` with the following fields:

     * `tokens`, a `TextField` that contains the concatenation of premise and hypothesis,
     * `label`, a `LabelField` containing the label, if one exists.
     * `metadata`, a `MetadataField` that stores the instance's index in the file, the original premise,
       the original hypothesis, both of these in tokenized form, and the gold label, accessible as
       `metadata['index']`, `metadata['premise']`, `metadata['hypothesis']`, `metadata['tokens']`,
       and `metadata['label']`.

    # Parameters

    type : `str`, optional (default=`'roberta-base'`)
        This reader chooses tokenizer according to this setting.
    """

    def __init__(
        self,
        transformer_model_name: str = "roberta-base",
        tokenizer_kwargs: Dict[str, Any] = None,
        **kwargs
    ) -> None:
        super().__init__(
            manual_distributed_sharding=True, manual_multiprocess_sharding=True, **kwargs
        )
        # Special tokens are added explicitly in text_to_instance() when the
        # premise and hypothesis are combined.
        self._tokenizer = PretrainedTransformerTokenizer(
            transformer_model_name,
            add_special_tokens=False,
            tokenizer_kwargs=tokenizer_kwargs,
        )
        self._token_indexers = {
            "tokens": PretrainedTransformerIndexer(
                transformer_model_name, tokenizer_kwargs=tokenizer_kwargs, max_length=512
            )
        }

    @overrides
    def _read(self, file_path: str):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path, extract_archive=True)

        logger.info("Reading file at %s", file_path)
        yielded_relation_count = 0

        from allennlp.common.file_utils import json_lines_from_file

        for relation in self.shard_iterable(json_lines_from_file(file_path)):
            premise = relation["premise"]
            hypothesis = relation["hypothesis"]
            if "label" in relation:
                label = relation["label"]
            else:
                label = None
            index = relation["idx"]

            # todo: see if we even need this to be in a separate method
            instance = self.text_to_instance(index, label, premise, hypothesis)

            yield instance
            yielded_relation_count += 1

    @overrides
    def text_to_instance(
        self,
        index: int,
        label: str,
        premise: str,
        hypothesis: str,
    ) -> Instance:
        """Build one instance from a premise/hypothesis pair and optional label."""
        tokenized_premise = self._tokenizer.tokenize(premise)
        tokenized_hypothesis = self._tokenizer.tokenize(hypothesis)

        fields = {}

        premise_and_hypothesis = TextField(
            self._tokenizer.add_special_tokens(tokenized_premise, tokenized_hypothesis),
        )
        # BUG FIX: the original wrapped the TextField in another TextField
        # (`TextField(premise_and_hypothesis)`), which would hand a Field —
        # not a token list — to the inner TextField. Assign the already-built
        # field directly.
        fields["tokens"] = premise_and_hypothesis

        # make the metadata
        metadata = {
            "premise": premise,
            "premise_tokens": tokenized_premise,
            "hypothesis": hypothesis,
            "hypothesis_tokens": tokenized_hypothesis,
            "index": index,
        }
        if label:
            fields["label"] = LabelField(label)
            metadata["label"] = label

        fields["metadata"] = MetadataField(metadata)

        return Instance(fields)

    @overrides
    def apply_token_indexers(self, instance: Instance) -> None:
        # Indexers are attached lazily so instances stay cheap to pickle
        # across worker processes.
        instance["tokens"].token_indexers = self._token_indexers
class TransformerMCReader(DatasetReader):
    """
    Read input data for the TransformerMC model. This is the base class for all
    readers that produce data for TransformerMC.

    Instances have two fields:
     * `alternatives`, a `ListField` of `TextField`
     * `correct_alternative`, `IndexField` with the correct answer among `alternatives`
     * `qid`, a `MetadataField` containing question ids

    Parameters
    ----------
    transformer_model_name : `str`, optional (default=`"roberta-large"`)
        This reader chooses tokenizer and token indexer according to this setting.
    length_limit : `int`, optional (default=`512`)
        We will make sure that the length of an alternative never exceeds this many word pieces.
    """

    def __init__(self,
                 transformer_model_name: str = "roberta-large",
                 length_limit: int = 512,
                 **kwargs) -> None:
        super().__init__(**kwargs)
        from allennlp.data.tokenizers import PretrainedTransformerTokenizer
        # Special tokens are added manually in text_to_instance() when the
        # start text and an alternative are paired up.
        self._tokenizer = PretrainedTransformerTokenizer(
            transformer_model_name, add_special_tokens=False)
        from allennlp.data.token_indexers import PretrainedTransformerIndexer
        self._token_indexers = {
            "tokens": PretrainedTransformerIndexer(transformer_model_name)
        }
        self.length_limit = length_limit

    def text_to_instance(
            self,  # type: ignore
            qid: str,
            start: str,
            alternatives: List[str],
            label: Optional[int] = None,
    ) -> Instance:
        """Build one multiple-choice instance: each alternative is paired with
        the (possibly truncated) start text, within `length_limit` wordpieces."""
        # tokenize
        start = self._tokenizer.tokenize(start)

        sequences = []
        for alternative in alternatives:
            alternative = self._tokenizer.tokenize(alternative)
            # Budget left for the start text after the alternative and the
            # pair's special tokens are accounted for.
            length_for_start = (self.length_limit - len(alternative) -
                                self._tokenizer.num_special_tokens_for_pair())
            if length_for_start < 0:
                # If the alternative is too long by itself, we take the beginning and add no tokens from the start.
                # (A negative slice bound trims the alternative from the end.)
                alternative = alternative[:length_for_start]
                length_for_start = 0
            sequences.append(
                self._tokenizer.add_special_tokens(start[:length_for_start],
                                                   alternative))

        # make fields
        from allennlp.data.fields import TextField
        sequences = [
            TextField(sequence, self._token_indexers) for sequence in sequences
        ]
        from allennlp.data.fields import ListField
        sequences = ListField(sequences)

        from allennlp.data.fields import MetadataField
        fields = {
            "alternatives": sequences,
            "qid": MetadataField(qid),
        }

        if label is not None:
            if label < 0 or label >= len(sequences):
                # BUG FIX: the original passed `("Alternative %d does not
                # exist", label)` as two args, logging-style, so the message
                # was never formatted. Format it before raising.
                raise ValueError("Alternative %d does not exist" % label)
            from allennlp.data.fields import IndexField
            fields["correct_alternative"] = IndexField(label, sequences)

        return Instance(fields)