def __init__(self,
             word_indexer: Optional[TokenIndexer] = None,
             is_bert: bool = False,
             conceptnet_path: Optional[Path] = None):
    super().__init__(lazy=False)

    self.pos_indexers = {"pos_tokens": PosTagIndexer()}
    self.ner_indexers = {"ner_tokens": NerTagIndexer()}
    self.rel_indexers = {
        "rel_tokens": SingleIdTokenIndexer(namespace='rel_tokens')
    }

    if is_bert:
        splitter = BertBasicWordSplitter()
    else:
        splitter = SpacyWordSplitter()
    self.tokeniser = WordTokenizer(word_splitter=splitter)

    word_splitter = SpacyWordSplitter(pos_tags=True, ner=True, parse=True)
    self.word_tokeniser = WordTokenizer(word_splitter=word_splitter)
    bert_splitter = BertBasicWordSplitter()
    self.bert_tokeniser = WordTokenizer(word_splitter=bert_splitter)

    # Choose a default indexer only if none was supplied, then register it.
    if word_indexer is None:
        if is_bert:
            word_indexer = PretrainedBertIndexer(
                pretrained_model='bert-base-uncased',
                truncate_long_sequences=False)
        else:
            word_indexer = SingleIdTokenIndexer(lowercase_tokens=True)
    self.word_indexers = {'tokens': word_indexer}

    self.conceptnet = ConceptNet(conceptnet_path=conceptnet_path)
def test_sliding_window_with_batch(self):
    tokenizer = WordTokenizer(word_splitter=BertBasicWordSplitter())
    sentence = "the quickest quick brown fox jumped over the lazy dog"
    tokens = tokenizer.tokenize(sentence)

    vocab = Vocabulary()
    vocab_path = self.FIXTURES_ROOT / 'bert' / 'vocab.txt'
    token_indexer = PretrainedBertIndexer(str(vocab_path),
                                          truncate_long_sequences=False,
                                          max_pieces=8)

    config_path = self.FIXTURES_ROOT / 'bert' / 'config.json'
    config = BertConfig(str(config_path))
    bert_model = BertModel(config)
    token_embedder = BertEmbedder(bert_model, max_pieces=8)

    instance = Instance({"tokens": TextField(tokens, {"bert": token_indexer})})
    instance2 = Instance({"tokens": TextField(tokens + tokens + tokens,
                                              {"bert": token_indexer})})

    batch = Batch([instance, instance2])
    batch.index_instances(vocab)

    padding_lengths = batch.get_padding_lengths()
    tensor_dict = batch.as_tensor_dict(padding_lengths)
    tokens = tensor_dict["tokens"]
    bert_vectors = token_embedder(tokens["bert"], offsets=tokens["bert-offsets"])
    assert bert_vectors is not None
def test_padding_for_equal_length_indices(self):
    tokenizer = WordTokenizer(word_splitter=BertBasicWordSplitter())

    #            2   3     5     6   8      9    2   14   12
    sentence = "the quick brown fox jumped over the lazy dog"
    tokens = tokenizer.tokenize(sentence)

    vocab = Vocabulary()

    instance = Instance({"tokens": TextField(tokens, {"bert": self.token_indexer})})

    batch = Batch([instance])
    batch.index_instances(vocab)

    padding_lengths = batch.get_padding_lengths()
    tensor_dict = batch.as_tensor_dict(padding_lengths)
    tokens = tensor_dict["tokens"]

    assert tokens["bert"].tolist() == [
        [16, 2, 3, 5, 6, 8, 9, 2, 14, 12, 17]
    ]
    assert tokens["bert-offsets"].tolist() == [
        [1, 2, 3, 4, 5, 6, 7, 8, 9]
    ]
def test_sliding_window(self):
    tokenizer = WordTokenizer(word_splitter=BertBasicWordSplitter())
    sentence = "the quickest quick brown [SEP] jumped over the lazy dog"
    tokens = tokenizer.tokenize(sentence)

    vocab = Vocabulary()
    vocab_path = self.FIXTURES_ROOT / 'bert' / 'vocab.txt'
    token_indexer = PretrainedBertIndexer(str(vocab_path),
                                          truncate_long_sequences=False,
                                          use_starting_offsets=False,
                                          max_pieces=10)

    indexed_tokens = token_indexer.tokens_to_indices(tokens, vocab, "bert")

    # 16 = [CLS], 17 = [SEP]
    # 1 full window + 1 half window with start/end tokens
    assert indexed_tokens["bert"] == [
        # [CLS] the quick est quick brown [SEP] jumped over [SEP]
        16, 2, 3, 4, 3, 5, 17, 8, 9, 17,
        # [CLS] brown [SEP] jumped over the lazy dog [SEP]
        16, 5, 17, 8, 9, 2, 14, 12, 17
    ]
    assert indexed_tokens["bert-offsets"] == [1, 3, 4, 5, 6, 7, 8, 9, 10, 11]
    # The extra [SEP]s shouldn't pollute the token-type-ids
    assert indexed_tokens["bert-type-ids"] == [
        # [CLS] the quick est quick brown [SEP] jumped over [SEP]
        0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
        # [CLS] brown [SEP] jumped over the lazy dog [SEP]
        0, 0, 0, 1, 1, 1, 1, 1, 1
    ]
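The expected ids above follow directly from the indexer's windowing: with max_pieces=10, each window holds max_pieces - 2 wordpieces (leaving room for [CLS] and [SEP]) and consecutive windows overlap by half a window. A standalone sketch of that arithmetic, written to match the expected lists in the test; this is my own helper under that assumption, not library code:

def sliding_windows(pieces, max_pieces):
    # Reserve two slots per window for [CLS] and [SEP]; windows overlap
    # by half their length, matching the expected ids in the test above.
    length = max_pieces - 2
    stride = length // 2
    windows = []
    for start in range(0, len(pieces), stride):
        windows.append(['[CLS]'] + pieces[start:start + length] + ['[SEP]'])
        if start + length >= len(pieces):
            break
    return windows

pieces = "the quick ##est quick brown [SEP] jumped over the lazy dog".split()
assert sliding_windows(pieces, max_pieces=10) == [
    ['[CLS]', 'the', 'quick', '##est', 'quick', 'brown', '[SEP]',
     'jumped', 'over', '[SEP]'],
    ['[CLS]', 'brown', '[SEP]', 'jumped', 'over', 'the', 'lazy', 'dog',
     '[SEP]'],
]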
def batch_to_ids(stncs, tgt_flag=False):
    """
    Convert a list of texts into ids that the embedder accepts.

    :param stncs: [['I', 'Like', 'you'], ['Yes']]
    :param tgt_flag: indicates whether the input is a target sentence;
        if so, use only the previous words as context and neglect the
        last word
    :return ids: indices to feed into the embedder
    """
    tokenizer = WordTokenizer(word_splitter=BertBasicWordSplitter())
    tokens = tokenizer.tokenize(stncs)
    vocab = Vocabulary()
    vocab_path = ""
    token_indexer = PretrainedBertIndexer(str(vocab_path))
    ids = token_indexer.tokens_to_indices(tokens, vocab, "bert")
    if tgt_flag:
        ids = ids[:, :-1, :]  # neglect the last word
        b_size, _len, dim = ids.shape
        expand_ids = torch.zeros(b_size * _len, _len, dim, dtype=torch.long)
        for i in range(1, _len + 1):
            expand_ids[b_size * (i - 1):b_size * i, :i, :] = ids[:, :i, :]
        return expand_ids
    return ids
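A toy illustration of the tgt_flag expansion above, assuming ids is already a (batch, length, dim) LongTensor: row block i of expand_ids keeps only the first i timesteps, giving one causally-masked copy per target position.

import torch

ids = torch.arange(1, 7, dtype=torch.long).reshape(1, 3, 2)  # (b_size=1, _len=3, dim=2)
b_size, _len, dim = ids.shape
expand_ids = torch.zeros(b_size * _len, _len, dim, dtype=torch.long)
for i in range(1, _len + 1):
    # Block i holds the first i timesteps; later positions stay zero.
    expand_ids[b_size * (i - 1):b_size * i, :i, :] = ids[:, :i, :]
# expand_ids[0] contains only timestep 0; expand_ids[2] contains all three.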
def test_truncate_window(self):
    tokenizer = WordTokenizer(word_splitter=BertBasicWordSplitter())
    sentence = "the quickest quick brown fox jumped over the lazy dog"
    tokens = tokenizer.tokenize(sentence)

    vocab = Vocabulary()
    vocab_path = self.FIXTURES_ROOT / 'bert' / 'vocab.txt'
    token_indexer = PretrainedBertIndexer(str(vocab_path),
                                          truncate_long_sequences=True,
                                          use_starting_offsets=True,
                                          max_pieces=10)

    indexed_tokens = token_indexer.tokens_to_indices(tokens, vocab, "bert")

    # 16 = [CLS], 17 = [SEP]; the sequence is truncated to max_pieces
    assert indexed_tokens["bert"] == [16, 2, 3, 4, 3, 5, 6, 8, 9, 17]
    assert indexed_tokens["bert-offsets"] == [1, 2, 4, 5, 6, 7, 8]
    assert indexed_tokens["bert-type-ids"] == [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

    token_indexer = PretrainedBertIndexer(str(vocab_path),
                                          truncate_long_sequences=True,
                                          use_starting_offsets=False,
                                          max_pieces=10)

    indexed_tokens = token_indexer.tokens_to_indices(tokens, vocab, "bert")

    assert indexed_tokens["bert"] == [16, 2, 3, 4, 3, 5, 6, 8, 9, 17]
    assert indexed_tokens["bert-offsets"] == [1, 3, 4, 5, 6, 7, 8]
def test_starting_ending_offsets(self):
    tokenizer = WordTokenizer(word_splitter=BertBasicWordSplitter())

    #            2   3     5     6   8      9    2   15 10 11  14  1
    sentence = "the quick brown fox jumped over the laziest lazy elmo"
    tokens = tokenizer.tokenize(sentence)

    vocab = Vocabulary()
    vocab_path = self.FIXTURES_ROOT / 'bert' / 'vocab.txt'
    token_indexer = PretrainedBertIndexer(str(vocab_path))
    indexed_tokens = token_indexer.tokens_to_indices(tokens, vocab, "bert")

    # 16 = [CLS], 17 = [SEP]
    assert indexed_tokens["bert"] == [
        16, 2, 3, 5, 6, 8, 9, 2, 15, 10, 11, 14, 1, 17
    ]
    assert indexed_tokens["bert-offsets"] == [
        1, 2, 3, 4, 5, 6, 7, 10, 11, 12
    ]

    token_indexer = PretrainedBertIndexer(str(vocab_path),
                                          use_starting_offsets=True)
    indexed_tokens = token_indexer.tokens_to_indices(tokens, vocab, "bert")

    assert indexed_tokens["bert"] == [
        16, 2, 3, 5, 6, 8, 9, 2, 15, 10, 11, 14, 1, 17
    ]
    assert indexed_tokens["bert-offsets"] == [
        1, 2, 3, 4, 5, 6, 7, 8, 11, 12
    ]
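The two modes differ only for multi-piece words: with the default ending offsets, "laziest" (la ##zie ##st, pieces 8-10 above) is recorded at its last piece (10); with use_starting_offsets=True it is recorded at its first (8). A small sketch of the mapping, written to match the expected lists above; my own helper, not library code:

def offsets(piece_lengths, use_starting_offsets):
    # piece_lengths[i] = number of wordpieces for original token i;
    # position 0 is [CLS], so pieces start at index 1.
    out, cursor = [], 1
    for n in piece_lengths:
        out.append(cursor if use_starting_offsets else cursor + n - 1)
        cursor += n
    return out

# "laziest" is the only 3-piece word; both results match the test above.
assert offsets([1, 1, 1, 1, 1, 1, 1, 3, 1, 1], True) == [1, 2, 3, 4, 5, 6, 7, 8, 11, 12]
assert offsets([1, 1, 1, 1, 1, 1, 1, 3, 1, 1], False) == [1, 2, 3, 4, 5, 6, 7, 10, 11, 12]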
def test_read(self, lazy):
    reader = GLUESST2DatasetReader(
        tokenizer=WordTokenizer(word_splitter=BertBasicWordSplitter()),
        token_indexers={'bert': PretrainedBertIndexer(
            pretrained_model=self.BERT_VOCAB_PATH)},
        skip_label_indexing=False)
    instances = reader.read(str(self.FIXTURES_ROOT / 'dev.tsv'))
    instances = ensure_list(instances)

    example = instances[0]
    tokens = [t.text for t in example.fields['tokens']]
    label = example.fields['label'].label
    print(label)
    print(tokens)

    batch = Batch(instances)
    vocab = Vocabulary.from_instances(instances)
    batch.index_instances(vocab)
    padding_lengths = batch.get_padding_lengths()
    tensor_dict = batch.as_tensor_dict(padding_lengths)
    tokens = tensor_dict["tokens"]
    print(tokens['mask'].tolist()[0])
    print(tokens["bert"].tolist()[0])
    print([vocab.get_token_from_index(i, "bert")
           for i in tokens["bert"].tolist()[0]])
    print(len(tokens['bert'][0]))
    print(tokens["bert-offsets"].tolist()[0])
    print(tokens['bert-type-ids'].tolist()[0])
class Conll04BERTBinaryReader(Conll04SpaCyBinaryReader):
    splitter = BertBasicWordSplitter()

    @classmethod
    def match(cls, index, tokens, split_tokens):
        tk_idx = 0
        tk_tkn = tokens[tk_idx].lower()
        st_idx = 0
        st_tkn = split_tokens[st_idx].text.lower()

        matched_tokens = []
        while True:
            if index[0] <= tk_idx < index[1] and st_idx not in matched_tokens:
                matched_tokens.append(st_idx)
            if len(tk_tkn) < len(st_tkn):
                assert st_tkn.startswith(tk_tkn)
                st_tkn = st_tkn[len(tk_tkn):]
                tk_idx += 1
                tk_tkn = tokens[tk_idx].lower()
            elif len(tk_tkn) > len(st_tkn):
                assert tk_tkn.startswith(st_tkn)
                tk_tkn = tk_tkn[len(st_tkn):]
                st_idx += 1
                st_tkn = split_tokens[st_idx].text.lower()
            else:
                assert st_tkn == tk_tkn
                tk_idx += 1
                st_idx += 1
                if tk_idx == len(tokens):
                    assert st_idx == len(split_tokens)
                    break
                tk_tkn = tokens[tk_idx].lower()
                st_tkn = split_tokens[st_idx].text.lower()
        return matched_tokens

    @cls.textfield('word')
    def update_sentence_raw(self, fields, tokens) -> Field:
        indexers = {
            'word': PretrainedBertIndexer(pretrained_model='bert-base-uncased')
        }
        textfield = TextField(tokens, indexers)
        return textfield

    @cls.field('cancidate')
    def update_relation_cancidate(self, fields: Dict, raw_sample) -> Field:
        tokens, labels, relations = raw_sample
        if relations is None:
            return None
        relation_indices = []
        for relation_type, src_token, dst_token in relations:
            relation_indices.append(
                (src_token[self._entity_index], dst_token[self._entity_index]))
        return AdjacencyField(relation_indices,
                              fields[self.get_fieldname('word')],
                              padding_value=0)
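A worked example of the match alignment above, assuming the class (and the framework behind its @cls decorators) is importable; the sentence and span are my own. For a span covering original tokens 1-2, the returned list holds the indices of the split tokens consumed while inside that span:

tokens = ['New', 'York', 'City', 'mayor']
split_tokens = Conll04BERTBinaryReader.splitter.split_words('New York City mayor')
# Both sides are lowercased inside match, and BertBasicWordSplitter keeps
# these words whole, so the span tokens[1:3] maps to split indices [1, 2].
assert Conll04BERTBinaryReader.match((1, 3), tokens, split_tokens) == [1, 2]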
def test_end_to_end(self):
    tokenizer = WordTokenizer(word_splitter=BertBasicWordSplitter())

    # 2 3 4 3 5 6 8 9 2 14 12
    sentence1 = "The quickest quick brown fox jumped over the lazy dog"
    tokens1 = tokenizer.tokenize(sentence1)
    # 2 3 5 6 8 9 2 15 10 11 14 1
    sentence2 = "The quick brown fox jumped over the laziest lazy elmo"
    tokens2 = tokenizer.tokenize(sentence2)

    assert len(tokens1) == 10
    assert len(tokens2) == 10

    tokens = [Token('[CLS]')] + tokens1 + [Token('[SEP]')] + tokens2
    assert len(tokens) == 22

    vocab = Vocabulary()

    instance = Instance(
        {"sentence_pair": TextField(tokens, {"bert": self.token_indexer})})

    batch = Batch([instance])
    batch.index_instances(vocab)

    padding_lengths = batch.get_padding_lengths()
    tensor_dict = batch.as_tensor_dict(padding_lengths)
    tokens = tensor_dict["sentence_pair"]

    assert tokens['mask'].tolist()[0] == [1] * 22
    assert tokens["bert"].tolist()[0] == [
        101, 1996, 4248, 4355, 4248, 2829, 4419, 5598, 2058, 1996, 13971,
        3899, 102, 1996, 4248, 2829, 4419, 5598, 2058, 1996, 2474, 14272,
        3367, 13971, 17709, 2080
    ]
    assert [vocab.get_token_from_index(i, "bert")
            for i in tokens["bert"].tolist()[0]] == [
        '[CLS]', 'the', 'quick', '##est', 'quick', 'brown', 'fox',
        'jumped', 'over', 'the', 'lazy', 'dog', '[SEP]', 'the', 'quick',
        'brown', 'fox', 'jumped', 'over', 'the', 'la', '##zie', '##st',
        'lazy', 'elm', '##o'
    ]
    assert len(tokens['bert'][0]) == 26
    assert tokens["bert-offsets"].tolist()[0] == [
        0, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
        19, 22, 23, 25
    ]
    assert tokens['bert-type-ids'].tolist()[0] == [0] * 13 + [1] * 13

    bert_vectors = self.token_embedder(tokens["bert"],
                                       offsets=tokens["bert-offsets"],
                                       token_type_ids=tokens['bert-type-ids'])
    assert list(bert_vectors.shape) == [1, 22, 768]
def get_tokenizer(embedding_type: str, xlnet_vocab_file: Path) -> WordTokenizer:
    if embedding_type == 'bert':
        splitter = BertBasicWordSplitter()
    elif embedding_type == 'glove':
        splitter = SpacyWordSplitter()
    elif embedding_type == 'xlnet':
        splitter = XLNetWordSplitter(vocab_file=str(xlnet_vocab_file))
    else:
        raise ValueError(f'Embedding type {embedding_type} not available.')
    return WordTokenizer(word_splitter=splitter)
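A minimal usage sketch for the factory above (the sentence is illustrative, and the xlnet vocab path is unused on the 'bert' branch):

tokenizer = get_tokenizer('bert', xlnet_vocab_file=Path('unused.model'))
tokens = tokenizer.tokenize("The quick brown Fox.")
# BertBasicWordSplitter lowercases by default and splits off punctuation:
# ['the', 'quick', 'brown', 'fox', '.']
print([t.text for t in tokens])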
def __init__(self, word_indexer: Optional[TokenIndexer] = None):
    super().__init__(lazy=False)

    splitter = BertBasicWordSplitter()
    self.tokeniser = WordTokenizer(word_splitter=splitter)

    if word_indexer is None:
        word_indexer = PretrainedBertIndexer(
            pretrained_model='bert-base-uncased',
            truncate_long_sequences=False)
    self.word_indexers = {'tokens': word_indexer}
def test_end_to_end(self):
    tokenizer = WordTokenizer(word_splitter=BertBasicWordSplitter())

    # 2 3 4 3 5 6 8 9 2 14 12
    sentence1 = "the quickest quick brown fox jumped over the lazy dog"
    tokens1 = tokenizer.tokenize(sentence1)
    # 2 3 5 6 8 9 2 15 10 11 14 1
    sentence2 = "the quick brown fox jumped over the laziest lazy elmo"
    tokens2 = tokenizer.tokenize(sentence2)

    vocab = Vocabulary()

    instance1 = Instance(
        {"tokens": TextField(tokens1, {"bert": self.token_indexer})})
    instance2 = Instance(
        {"tokens": TextField(tokens2, {"bert": self.token_indexer})})

    batch = Batch([instance1, instance2])
    batch.index_instances(vocab)

    padding_lengths = batch.get_padding_lengths()
    tensor_dict = batch.as_tensor_dict(padding_lengths)
    tokens = tensor_dict["tokens"]

    # 16 = [CLS], 17 = [SEP]
    assert tokens["bert"].tolist() == [
        [16, 2, 3, 4, 3, 5, 6, 8, 9, 2, 14, 12, 17, 0],
        [16, 2, 3, 5, 6, 8, 9, 2, 15, 10, 11, 14, 1, 17],
    ]
    assert tokens["bert-offsets"].tolist() == [
        [1, 3, 4, 5, 6, 7, 8, 9, 10, 11],
        [1, 2, 3, 4, 5, 6, 7, 10, 11, 12],
    ]

    # No offsets, should get 14 vectors back ([CLS] + 12 token wordpieces + [SEP])
    bert_vectors = self.token_embedder(tokens["bert"])
    assert list(bert_vectors.shape) == [2, 14, 12]

    # Offsets, should get 10 vectors back.
    bert_vectors = self.token_embedder(tokens["bert"],
                                       offsets=tokens["bert-offsets"])
    assert list(bert_vectors.shape) == [2, 10, 12]

    # Now try top_layer_only = True
    tlo_embedder = BertEmbedder(self.bert_model, top_layer_only=True)
    bert_vectors = tlo_embedder(tokens["bert"])
    assert list(bert_vectors.shape) == [2, 14, 12]

    bert_vectors = tlo_embedder(tokens["bert"], offsets=tokens["bert-offsets"])
    assert list(bert_vectors.shape) == [2, 10, 12]
def __init__(self,
             max_instances: int = None,
             min_abstract_len: int = 10,
             lazy: bool = False,
             tokenizer: Tokenizer = None,
             token_indexers: Dict[str, TokenIndexer] = None) -> None:
    super().__init__(lazy)
    self.max_instances = max_instances
    self.min_abstract_len = min_abstract_len
    # Default to a WordTokenizer over BERT's basic splitter (a bare
    # WordSplitter is not a Tokenizer and has no tokenize method).
    self._tokenizer = tokenizer or WordTokenizer(
        word_splitter=BertBasicWordSplitter())
    self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
def test_sliding_window(self):
    tokenizer = WordTokenizer(word_splitter=BertBasicWordSplitter())
    sentence = "the quickest quick brown fox jumped over the lazy dog"
    tokens = tokenizer.tokenize(sentence)

    vocab = Vocabulary()
    vocab_path = self.FIXTURES_ROOT / "bert" / "vocab.txt"
    token_indexer = PretrainedBertIndexer(str(vocab_path),
                                          truncate_long_sequences=False,
                                          max_pieces=8)

    config_path = self.FIXTURES_ROOT / "bert" / "config.json"
    config = BertConfig(str(config_path))
    bert_model = BertModel(config)
    token_embedder = BertEmbedder(bert_model, max_pieces=8)

    instance = Instance({"tokens": TextField(tokens, {"bert": token_indexer})})

    batch = Batch([instance])
    batch.index_instances(vocab)

    padding_lengths = batch.get_padding_lengths()
    tensor_dict = batch.as_tensor_dict(padding_lengths)
    tokens = tensor_dict["tokens"]

    # 16 = [CLS], 17 = [SEP]
    # Overlapping windows of at most max_pieces, each wrapped in [CLS]/[SEP]
    assert tokens["bert"].tolist() == [[
        16, 2, 3, 4, 3, 5, 6, 17,
        16, 3, 5, 6, 8, 9, 2, 17,
        16, 8, 9, 2, 14, 12, 17
    ]]
    assert tokens["bert-offsets"].tolist() == [[
        1, 3, 4, 5, 6, 7, 8, 9, 10, 11
    ]]

    bert_vectors = token_embedder(tokens["bert"])
    assert list(bert_vectors.shape) == [1, 13, 12]

    # Testing without token_type_ids
    bert_vectors = token_embedder(tokens["bert"], offsets=tokens["bert-offsets"])
    assert list(bert_vectors.shape) == [1, 10, 12]

    # Testing with token_type_ids
    bert_vectors = token_embedder(tokens["bert"],
                                  offsets=tokens["bert-offsets"],
                                  token_type_ids=tokens["bert-type-ids"])
    assert list(bert_vectors.shape) == [1, 10, 12]
def test_end_to_end_with_higher_order_inputs(self):
    tokenizer = WordTokenizer(word_splitter=BertBasicWordSplitter())

    # 2 3 4 3 5 6 8 9 2 14 12
    sentence1 = "the quickest quick brown fox jumped over the lazy dog"
    tokens1 = tokenizer.tokenize(sentence1)
    text_field1 = TextField(tokens1, {"bert": self.token_indexer})

    # 2 3 5 6 8 9 2 15 10 11 14 1
    sentence2 = "the quick brown fox jumped over the laziest lazy elmo"
    tokens2 = tokenizer.tokenize(sentence2)
    text_field2 = TextField(tokens2, {"bert": self.token_indexer})

    # 2 5 15 10 11 6
    sentence3 = "the brown laziest fox"
    tokens3 = tokenizer.tokenize(sentence3)
    text_field3 = TextField(tokens3, {"bert": self.token_indexer})

    vocab = Vocabulary()

    instance1 = Instance({"tokens": ListField([text_field1])})
    instance2 = Instance({"tokens": ListField([text_field2, text_field3])})

    batch = Batch([instance1, instance2])
    batch.index_instances(vocab)

    padding_lengths = batch.get_padding_lengths()
    tensor_dict = batch.as_tensor_dict(padding_lengths, verbose=True)
    tokens = tensor_dict["tokens"]

    # No offsets, should get 12 vectors back.
    bert_vectors = self.token_embedder(tokens["bert"])
    assert list(bert_vectors.shape) == [2, 2, 12, 12]

    # Offsets, should get 10 vectors back.
    bert_vectors = self.token_embedder(tokens["bert"],
                                       offsets=tokens["bert-offsets"])
    assert list(bert_vectors.shape) == [2, 2, 10, 12]

    # Now try top_layer_only = True
    tlo_embedder = BertEmbedder(self.bert_model, top_layer_only=True)
    bert_vectors = tlo_embedder(tokens["bert"])
    assert list(bert_vectors.shape) == [2, 2, 12, 12]

    bert_vectors = tlo_embedder(tokens["bert"], offsets=tokens["bert-offsets"])
    assert list(bert_vectors.shape) == [2, 2, 10, 12]
def __init__(self,
             label_field: str,
             text_field: str,
             paper_lookup_path: str,
             sent_max_len: int = 256,
             lazy: bool = False,
             tokenizer: Tokenizer = None,
             token_indexers: Dict[str, TokenIndexer] = None) -> None:
    super().__init__(lazy)
    # Default to a WordTokenizer over BERT's basic splitter, matching the
    # declared Tokenizer type.
    self._tokenizer = tokenizer or WordTokenizer(
        word_splitter=BertBasicWordSplitter())
    self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
    self._label_field = label_field
    self._text_field = text_field
    self._sent_max_len = sent_max_len
    self._paper_lookup = self.load_paper_lookup(paper_lookup_path)
class TestBertBasicWordSplitter(AllenNlpTestCase):
    def setUp(self):
        super().setUp()
        self.word_splitter = BertBasicWordSplitter()

    def test_never_split(self):
        sentence = "[unused0] [UNK] [SEP] [PAD] [CLS] [MASK]"
        expected_tokens = ["[", "unused0", "]", "[UNK]", "[SEP]", "[PAD]",
                           "[CLS]", "[MASK]"]
        tokens = [token.text
                  for token in self.word_splitter.split_words(sentence)]
        assert tokens == expected_tokens

    def test_do_lower_case(self):
        # BertBasicWordSplitter lowercases every token not in `never_split`
        # by default; overriding never_split drops the default special tokens.
        word_splitter = BertBasicWordSplitter(never_split=["[UNUSED0]"])
        sentence = "[UNUSED0] [UNK] [unused0]"
        expected_tokens = ["[UNUSED0]", "[", "unk", "]", "[", "unused0", "]"]
        tokens = [token.text
                  for token in word_splitter.split_words(sentence)]
        assert tokens == expected_tokens
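For contrast with the tests above, a sketch of the non-lowercasing configuration (my own example, not part of the test class):

splitter = BertBasicWordSplitter(do_lower_case=False)
tokens = [token.text for token in splitter.split_words("Hello [SEP] World")]
# Case is preserved, and [SEP] survives intact because it is in the
# default never_split list.
assert tokens == ["Hello", "[SEP]", "World"]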
def test_truncate_window_dont_split_wordpieces(self):
    """
    Tests that a truncated sentence is not cut inside a word that
    consists of 2 or more wordpieces.
    """
    tokenizer = WordTokenizer(word_splitter=BertBasicWordSplitter())
    sentence = "the quickest quick brown fox jumped over the quickest dog"
    tokens = tokenizer.tokenize(sentence)

    vocab = Vocabulary()
    vocab_path = self.FIXTURES_ROOT / 'bert' / 'vocab.txt'
    token_indexer = PretrainedBertIndexer(str(vocab_path),
                                          truncate_long_sequences=True,
                                          use_starting_offsets=True,
                                          max_pieces=12)

    indexed_tokens = token_indexer.tokens_to_indices(tokens, vocab, "bert")

    # 16 = [CLS], 17 = [SEP]
    assert indexed_tokens["bert"] == [16, 2, 3, 4, 3, 5, 6, 8, 9, 2, 17]
    # We could fit one more piece here, but we don't, so as not to cut
    # in the middle of a word.
    assert indexed_tokens["bert-offsets"] == [1, 2, 4, 5, 6, 7, 8, 9]
    assert indexed_tokens["bert-type-ids"] == [
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
    ]

    token_indexer = PretrainedBertIndexer(str(vocab_path),
                                          truncate_long_sequences=True,
                                          use_starting_offsets=False,
                                          max_pieces=12)

    indexed_tokens = token_indexer.tokens_to_indices(tokens, vocab, "bert")

    assert indexed_tokens["bert"] == [16, 2, 3, 4, 3, 5, 6, 8, 9, 2, 17]
    assert indexed_tokens["bert-offsets"] == [1, 3, 4, 5, 6, 7, 8, 9]
def test_truncate_window_fit_two_wordpieces(self):
    """
    Tests that both `use_starting_offsets` options work properly when the
    last word in the truncated sentence consists of two wordpieces.
    """
    tokenizer = WordTokenizer(word_splitter=BertBasicWordSplitter())
    sentence = "the quickest quick brown fox jumped over the quickest dog"
    tokens = tokenizer.tokenize(sentence)

    vocab = Vocabulary()
    vocab_path = self.FIXTURES_ROOT / 'bert' / 'vocab.txt'
    token_indexer = PretrainedBertIndexer(str(vocab_path),
                                          truncate_long_sequences=True,
                                          use_starting_offsets=True,
                                          max_pieces=13)

    indexed_tokens = token_indexer.tokens_to_indices(tokens, vocab, "bert")

    # 16 = [CLS], 17 = [SEP]
    assert indexed_tokens["bert"] == [
        16, 2, 3, 4, 3, 5, 6, 8, 9, 2, 3, 4, 17
    ]
    assert indexed_tokens["bert-offsets"] == [1, 2, 4, 5, 6, 7, 8, 9, 10]
    assert indexed_tokens["bert-type-ids"] == [
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
    ]

    token_indexer = PretrainedBertIndexer(str(vocab_path),
                                          truncate_long_sequences=True,
                                          use_starting_offsets=False,
                                          max_pieces=13)

    indexed_tokens = token_indexer.tokens_to_indices(tokens, vocab, "bert")

    assert indexed_tokens["bert"] == [
        16, 2, 3, 4, 3, 5, 6, 8, 9, 2, 3, 4, 17
    ]
    assert indexed_tokens["bert-offsets"] == [1, 3, 4, 5, 6, 7, 8, 9, 11]
def test_read(self, lazy):
    reader = SnliReader(
        tokenizer=WordTokenizer(word_splitter=BertBasicWordSplitter()),
        token_indexers={
            'bert': PretrainedBertIndexer(pretrained_model=self.BERT_VOCAB_PATH)
        },
    )
    instances = reader.read(str(self.FIXTURES_ROOT / 'snli_1.0_sample.jsonl'))
    instances = ensure_list(instances)

    example = instances[0]
    tokens = [t.text for t in example.fields['tokens'].tokens]
    label = example.fields['label'].label
    weight = example.fields['weight'].weight
    assert label == 'neutral'
    assert weight == 1
    assert instances[1].fields['weight'].weight == 0.5
    assert instances[2].fields['weight'].weight == 1
    assert tokens == [
        'a', 'person', 'on', 'a', 'horse', 'jumps', 'over', 'a', 'broken',
        'down', 'airplane', '.', '[SEP]', 'a', 'person', 'is', 'training',
        'his', 'horse', 'for', 'a', 'competition', '.'
    ]

    batch = Batch(instances)
    vocab = Vocabulary.from_instances(instances)
    batch.index_instances(vocab)
    padding_lengths = batch.get_padding_lengths()
    tensor_dict = batch.as_tensor_dict(padding_lengths)
    tokens = tensor_dict["tokens"]
    print(tokens['mask'].tolist()[0])
    print(tokens["bert"].tolist()[0])
    print([vocab.get_token_from_index(i, "bert")
           for i in tokens["bert"].tolist()[0]])
    print(len(tokens['bert'][0]))
    print(tokens["bert-offsets"].tolist()[0])
    print(tokens['bert-type-ids'].tolist()[0])
def test_max_length(self):
    config = BertConfig(len(self.token_indexer.vocab))
    model = BertModel(config)
    embedder = BertEmbedder(model)

    tokenizer = WordTokenizer(word_splitter=BertBasicWordSplitter())
    sentence = "the " * 1000
    tokens = tokenizer.tokenize(sentence)

    vocab = Vocabulary()
    instance = Instance({"tokens": TextField(tokens, {"bert": self.token_indexer})})

    batch = Batch([instance])
    batch.index_instances(vocab)

    padding_lengths = batch.get_padding_lengths()
    tensor_dict = batch.as_tensor_dict(padding_lengths)
    tokens = tensor_dict["tokens"]
    embedder(tokens["bert"], tokens["bert-offsets"])
def set_values(max_sequence_length: Optional[int] = -1,
               concat_title_abstract: Optional[bool] = None,
               data_source: Optional[str] = None,
               included_text_fields: Optional[str] = None) -> None:
    # Set global values.
    # Note: a class with __init__ would have been a better design; we use
    # this structure for efficiency, to support multiprocessing, since
    # multiprocessing with class methods is slower.
    global _tokenizer
    global _token_indexers
    global _token_indexer_author_id
    global _token_indexer_author_position
    global _token_indexer_venue
    global _token_indexer_id
    global _max_sequence_length
    global _concat_title_abstract
    global _data_source
    global _included_text_fields

    if _tokenizer is None:
        # If not initialized, initialize the tokenizer and token indexers.
        _tokenizer = WordTokenizer(word_splitter=BertBasicWordSplitter(
            do_lower_case=bert_params["do_lowercase"]))
        _token_indexers = {
            "bert": PretrainedBertIndexer.from_params(Params(bert_params))
        }
        _token_indexer_author_id = {
            "tokens": SingleIdTokenIndexer(namespace='author')
        }
        _token_indexer_author_position = {
            "tokens": SingleIdTokenIndexer(namespace='author_positions')
        }
        _token_indexer_venue = {
            "tokens": SingleIdTokenIndexer(namespace='venue')
        }
        _token_indexer_id = {"tokens": SingleIdTokenIndexer(namespace='id')}

    _max_sequence_length = max_sequence_length
    _concat_title_abstract = concat_title_abstract
    _data_source = data_source
    _included_text_fields = included_text_fields
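bert_params here is a module-level dict in the original source; a plausible minimal shape, offered purely as an assumption (keys mirror PretrainedBertIndexer's constructor arguments):

# Assumed structure of the module-level bert_params consumed above.
bert_params = {
    "pretrained_model": "bert-base-uncased",  # illustrative choice
    "do_lowercase": True,
    "use_starting_offsets": True,
}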
def word_embeddings(self):
    words = re.split(r'\W+', self.text)
    text = ' '.join(words)
    tokenizer = WordTokenizer(word_splitter=BertBasicWordSplitter())
    tokens = tokenizer.tokenize(text)
    vocab = Vocabulary()
    token_indexer = PretrainedBertIndexer('bert-base-uncased')
    instance = Instance({"tokens": TextField(tokens, {'bert': token_indexer})})
    batch = Batch([instance])
    batch.index_instances(vocab)
    padding_lengths = batch.get_padding_lengths()
    tensor_dict = batch.as_tensor_dict(padding_lengths)
    tokens = tensor_dict["tokens"]
    model = PretrainedBertEmbedder('bert-base-uncased')
    bert_vectors = model(tokens["bert"])
    return bert_vectors
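A minimal driver for the method above; the host class is an assumption inferred from the self.text reference:

class Document:
    """Hypothetical host class; only the text attribute is required."""

    def __init__(self, text: str):
        self.text = text

# Bind the function above as a method of the hypothetical class.
Document.word_embeddings = word_embeddings

vectors = Document("The quick brown fox.").word_embeddings()
print(vectors.shape)  # (1, wordpieces incl. [CLS]/[SEP], 768) for bert-base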
def __init__(self,
             text_lookup_path: str,
             embedded_text: str = 'title',
             use_bos_eos: bool = True,
             lazy: bool = False,
             sent_len_limit: int = None,
             abstract_tokenizer: Tokenizer = None,
             abstract_indexers: Dict[str, TokenIndexer] = None,
             sequence_tokenizer: Tokenizer = None,
             sequence_indexers: Dict[str, TokenIndexer] = None) -> None:
    super().__init__(lazy)
    if embedded_text == 'title':
        with open(text_lookup_path) as f:
            self.data_lookup = {
                line[0]: {'abstract': line[2]}
                for line in map(lambda x: x.strip().split('\t'), f.readlines())
                if len(line) > 2
            }
    elif embedded_text == 'abstract':
        with jsonlines.open(text_lookup_path) as reader:
            self.data_lookup = {item['paper_id']: item for item in reader}

    # Add these now so we can find them in the lookup, then replace with
    # [unused0] and [unused1] in the text_to_instance method.
    self.data_lookup['<s>'] = {'abstract': '<s>'}

    self._sent_len_limit = sent_len_limit
    # Default to WordTokenizers so the fallbacks match the declared
    # Tokenizer types.
    self._abstract_tokenizer = abstract_tokenizer or WordTokenizer(
        word_splitter=BertBasicWordSplitter())
    self._abstract_indexers = abstract_indexers
    self._sequence_tokenizer = sequence_tokenizer or WordTokenizer(
        word_splitter=JustSpacesWordSplitter())
    self._sequence_indexers = sequence_indexers or {
        "tokens": SingleIdTokenIndexer()
    }
def __init__(self,
             is_bert: bool,
             conceptnet_path: Path,
             word_indexer: Optional[TokenIndexer] = None):
    super().__init__(lazy=False)

    if is_bert:
        splitter = BertBasicWordSplitter()
    else:
        splitter = SpacyWordSplitter()
    self.tokeniser = WordTokenizer(word_splitter=splitter)

    if word_indexer is None:
        if is_bert:
            word_indexer = PretrainedBertIndexer(
                pretrained_model='bert-base-uncased',
                truncate_long_sequences=True)
        else:
            word_indexer = SingleIdTokenIndexer(lowercase_tokens=True)
    self.word_indexers = {'tokens': word_indexer}

    # self.rel_indexers = {
    #     "rel_tokens": SingleIdTokenIndexer(namespace='rel_tokens')}

    self.conceptnet = ConceptNet(conceptnet_path=conceptnet_path)
        if label == pred_label:
            correct += 1
    print(f'Accuracy: {correct}/{len(labels)} = {correct/len(labels)*100:.2f}%')


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='CSQA using BERT NSP model')
    parser.add_argument('--input', help='input dataset')
    parser.add_argument('--bert-vocab', help='bert vocab file')
    parser.add_argument('--bert-model', help='pretrained bert model')
    parser.add_argument('--batch-size', type=int, default=8,
                        help='batch size for BERT')
    parser.add_argument('--gpu-id', '-g', type=int, default=0, help='GPU ID')
    args = parser.parse_args()

    print('Initialize BERT model...')
    TOKENIZER = WordTokenizer(word_splitter=BertBasicWordSplitter())
    WORD_INDEXER = PretrainedBertIndexer(pretrained_model=args.bert_vocab)
    VOCAB = Vocabulary()
    GPU_ID = args.gpu_id
    BERT_NEXT_SENTENCE = BertForNextSentencePrediction.from_pretrained(
        args.bert_model).to(torch.device(f"cuda:{GPU_ID}"))
    BERT_NEXT_SENTENCE.eval()

    main()
class BertMCQAReader(DatasetReader):
    """
    Reads a file from the AllenAI-V1-Feb2018 dataset in Json format.  This data
    is formatted as jsonl, one json-formatted instance per line.  An example of
    the json in the data is:

        {"id": "MCAS_2000_4_6",
         "question": {
             "stem": "Which technology was developed most recently?",
             "choices": [
                 {"text": "cellular telephone", "label": "A"},
                 {"text": "television", "label": "B"},
                 {"text": "refrigerator", "label": "C"},
                 {"text": "airplane", "label": "D"}
             ]},
         "answerKey": "A"}

    Parameters
    ----------
    tokenizer : ``Tokenizer``, optional (default=``WordTokenizer()``)
        We use this ``Tokenizer`` for both the premise and the hypothesis.
        See :class:`Tokenizer`.
    token_indexers : ``Dict[str, TokenIndexer]``, optional
        (default=``{"tokens": SingleIdTokenIndexer()}``)
        We similarly use this for both the premise and the hypothesis.
        See :class:`TokenIndexer`.
    """

    def __init__(self,
                 pretrained_model: str,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 max_pieces: int = 512,
                 num_choices: int = 5,
                 answer_only: bool = False,
                 restrict_num_choices: int = None,
                 ignore_context: bool = False,
                 sample: int = -1,
                 random_seed: int = 0) -> None:
        super().__init__()
        self._token_indexers = token_indexers or {
            'tokens': SingleIdTokenIndexer()
        }
        lower_case = '-cased' not in pretrained_model
        self._word_splitter = BertBasicWordSplitter(do_lower_case=lower_case)
        self._max_pieces = max_pieces
        self._sample = sample
        self._num_choices = num_choices
        self._answer_only = answer_only
        self._restrict_num_choices = restrict_num_choices
        self._ignore_context = ignore_context
        self._random_seed = random_seed

    @overrides
    def _read(self, file_path: str):
        # if `file_path` is a URL, redirect to the cache
        file_path = cached_path(file_path)
        counter = self._sample + 1
        debug = 5

        with open(file_path, 'r') as data_file:
            logger.info("Reading QA instances from jsonl dataset at: %s",
                        file_path)
            instances = []
            for line in data_file:
                counter -= 1
                debug -= 1
                if counter == 0:
                    break
                item_json = json.loads(line.strip())

                if debug > 0:
                    logger.info(item_json)

                item_id = item_json["id"]
                context = item_json.get("para")
                if self._ignore_context:
                    context = None
                question_text = item_json["question"]["stem"]
                if self._answer_only:
                    question_text = ""

                choice_label_to_id = {}
                choice_text_list = []
                choice_context_list = []
                any_correct = False
                choice_id_correction = 0

                for choice_id, choice_item in enumerate(
                        item_json["question"]["choices"]):
                    if self._restrict_num_choices and len(
                            choice_text_list) == self._restrict_num_choices:
                        if not any_correct:
                            choice_text_list.pop(-1)
                            choice_context_list.pop(-1)
                            choice_id_correction += 1
                        else:
                            break

                    choice_label = choice_item["label"]
                    choice_label_to_id[
                        choice_label] = choice_id - choice_id_correction
                    choice_text = choice_item["text"]
                    choice_context = choice_item.get("para")
                    if self._ignore_context:
                        choice_context = None

                    choice_text_list.append(choice_text)
                    choice_context_list.append(choice_context)

                    if item_json.get('answerKey') == choice_label:
                        if any_correct:
                            raise ValueError(
                                f"More than one correct answer found for {item_json}!")
                        any_correct = True

                if self._restrict_num_choices \
                        and len(choice_text_list) == self._restrict_num_choices \
                        and not any_correct:
                    continue

                if not any_correct and 'answerKey' in item_json:
                    raise ValueError(
                        f"No correct answer found for {item_json}!")

                answer_id = choice_label_to_id[item_json["answerKey"]]

                # Pad choices with empty strings if not right number
                if len(choice_text_list) != self._num_choices:
                    choice_text_list = (choice_text_list +
                                        self._num_choices * [''])[:self._num_choices]
                    choice_context_list = (choice_context_list +
                                           self._num_choices * [None])[:self._num_choices]
                    if answer_id >= self._num_choices:
                        logger.warning(
                            f"Skipping question with more than "
                            f"{self._num_choices} answers: {item_json}")
                        continue

                instances.append(
                    self.text_to_instance(item_id, question_text,
                                          choice_text_list, answer_id, context,
                                          choice_context_list, debug))

            random.seed(self._random_seed)
            random.shuffle(instances)
            for instance in instances:
                yield instance

    @overrides
    def text_to_instance(self,  # type: ignore
                         item_id: str,
                         question: str,
                         choice_list: List[str],
                         answer_id: int = None,
                         context: str = None,
                         choice_context_list: List[str] = None,
                         debug: int = -1) -> Instance:
        # pylint: disable=arguments-differ
        fields: Dict[str, Field] = {}

        pair_fields = []
        pair_tokens_list = []
        choice1_index_fields = []
        choice2_index_fields = []
        for index1, index2 in itertools.permutations(range(len(choice_list)), 2):
            choice1, choice2 = (choice_list[index1], choice_list[index2])

            # TODO: What to do if contexts are not none?
            assert context is None
            if choice_context_list is not None:
                assert all(map(lambda x: x is None, choice_context_list))

            pair_tokens = self.bert_features_from_q_2a(question, choice1, choice2)
            pair_field = TextField(pair_tokens, self._token_indexers)
            choice1_index_field = LabelField(index1, skip_indexing=True)
            choice2_index_field = LabelField(index2, skip_indexing=True)

            pair_fields.append(pair_field)
            pair_tokens_list.append(pair_tokens)
            choice1_index_fields.append(choice1_index_field)
            choice2_index_fields.append(choice2_index_field)

            if debug > 0:
                logger.info(f"qa_tokens = {pair_tokens}")

        fields['question'] = ListField(pair_fields)
        fields['choice1_indexes'] = ListField(choice1_index_fields)
        fields['choice2_indexes'] = ListField(choice2_index_fields)

        if answer_id is not None:
            fields['label'] = LabelField(answer_id, skip_indexing=True)

        metadata = {
            "id": item_id,
            "question_text": question,
            "choice_text_list": choice_list,
            "correct_answer_index": answer_id,
            "question_tokens_list": pair_tokens_list,
        }

        if debug > 0:
            logger.info(f"answer_id = {answer_id}")

        fields["metadata"] = MetadataField(metadata)

        return Instance(fields)

    @staticmethod
    def _truncate_tokens(tokens_a, tokens_b, tokens_c, max_length):
        """
        Truncate 'a' from the start, 'b' from the start, and 'c' from the end
        until the total length is no greater than max_length.  At each step,
        truncate the longest one.
        """
        while len(tokens_a) + len(tokens_b) + len(tokens_c) > max_length:
            reduction_candidate = numpy.argmax(
                [len(tokens_a), len(tokens_b), len(tokens_c)])
            if reduction_candidate == 0:  # 'a' is the longest
                tokens_a.pop(0)
            elif reduction_candidate == 1:  # 'b' is the longest
                tokens_b.pop(0)
            else:  # 'c' is the longest
                tokens_c.pop()
        return tokens_a, tokens_b, tokens_c

    def bert_features_from_q_2a(self,
                                question: str,
                                answer1: str,
                                answer2: str,
                                context: str = None):
        # TODO: What should we do if context is not None (where to append it?)
        assert context is None
        sep_token = Token("[SEP]")
        question_tokens = self._word_splitter.split_words(question)
        choice1_tokens = self._word_splitter.split_words(answer1)
        choice2_tokens = self._word_splitter.split_words(answer2)
        question_tokens, choice1_tokens, choice2_tokens = self._truncate_tokens(
            question_tokens, choice1_tokens, choice2_tokens,
            self._max_pieces - 2)

        tokens = choice1_tokens + [sep_token] + question_tokens + \
            [sep_token] + choice2_tokens
        return tokens
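A hedged instantiation sketch for the reader above (the data path is illustrative; sample=100 caps the number of lines read via the counter logic in _read):

reader = BertMCQAReader(pretrained_model='bert-base-uncased',
                        max_pieces=256,
                        num_choices=4,
                        sample=100)
instances = list(reader.read('questions.jsonl'))  # illustrative path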