def test_sliding_window_with_batch(self):
    """Embed a batch of two differently sized instances with ``max_pieces=8``."""
    words = WordTokenizer(word_splitter=BertBasicWordSplitter()).tokenize(
        "the quickest quick brown fox jumped over the lazy dog"
    )
    vocabulary = Vocabulary()

    bert_fixtures = self.FIXTURES_ROOT / 'bert'
    indexer = PretrainedBertIndexer(
        str(bert_fixtures / 'vocab.txt'),
        truncate_long_sequences=False,
        max_pieces=8,
    )
    embedder = BertEmbedder(
        BertModel(BertConfig(str(bert_fixtures / 'config.json'))),
        max_pieces=8,
    )

    # One instance holds the sentence once, the other holds it three times,
    # so the batch mixes two different sequence lengths.
    single = Instance({"tokens": TextField(words, {"bert": indexer})})
    tripled = Instance({"tokens": TextField(words + words + words, {"bert": indexer})})

    batch = Batch([single, tripled])
    batch.index_instances(vocabulary)
    tensors = batch.as_tensor_dict(batch.get_padding_lengths())["tokens"]

    vectors = embedder(tensors["bert"], offsets=tensors["bert-offsets"])
    assert vectors is not None
def test_do_lower_case(self):
    """Only the token listed in ``never_split`` keeps its original form."""
    # By default BertBasicWordSplitter lowercases every token that is not
    # in `never_split`.
    splitter = BertBasicWordSplitter(never_split=["[UNUSED0]"])
    produced = [
        piece.text
        for piece in splitter.split_words("[UNUSED0] [UNK] [unused0]")
    ]
    assert produced == ["[UNUSED0]", "[", "unk", "]", "[", "unused0", "]"]
def batch_to_ids(stncs, tgt_flag=False):
    """
    Tokenize ``stncs`` with the BERT basic word splitter and index the result
    with a ``PretrainedBertIndexer``.

    NOTE(review): the original docstring described this as producing "ids that
    elmo accepts", but the code below only uses BERT components -- confirm
    which model this helper is meant to feed.

    :param stncs: [['I', 'Like', 'you'],['Yes'] ]
    :param tgt_flag: indicates if the input is a target sentence; if it is,
        the last word is dropped and the ids are expanded so that each group
        of rows only sees a growing prefix of the sequence
    :return ids: the indexer output, or the expanded tensor when ``tgt_flag``
        is set
    """
    tokenizer = WordTokenizer(word_splitter=BertBasicWordSplitter())
    # NOTE(review): `stncs` is documented as a list of token lists but is
    # passed straight to `tokenize` -- confirm the expected input type.
    tokens = tokenizer.tokenize(stncs)
    vocab = Vocabulary()
    # NOTE(review): an empty string is passed as the pretrained model / vocab
    # location -- looks like a placeholder; confirm.
    vocab_path = ""
    token_indexer = PretrainedBertIndexer(str(vocab_path))
    ids = token_indexer.tokens_to_indices(tokens, vocab, "bert")
    if tgt_flag:
        # NOTE(review): `ids` is sliced like a 3-D tensor here, while other
        # code in this file reads the result of `tokens_to_indices` as a dict
        # (e.g. result["bert"]) -- verify this branch can actually run.
        ids = ids[:, :-1, :]  # neglect the last word
        b_size, _len, dim = ids.shape
        expand_ids = torch.zeros(b_size * _len, _len, dim, dtype=torch.long)
        # Row group i (b_size rows) receives the first i positions of every
        # sequence; the remaining positions stay zero.
        for i in range(1, _len + 1):
            expand_ids[b_size * (i - 1):b_size * i, :i, :] = ids[:, :i, :]
        return expand_ids
    return ids
def test_padding_for_equal_length_indices(self):
    """A sentence with one wordpiece per word yields offsets 1..9."""
    # expected wordpiece ids: 2 3 5 6 8 9 2 14 12
    words = WordTokenizer(word_splitter=BertBasicWordSplitter()).tokenize(
        "the quick brown fox jumped over the lazy dog"
    )
    vocabulary = Vocabulary()

    batch = Batch([Instance({"tokens": TextField(words, {"bert": self.token_indexer})})])
    batch.index_instances(vocabulary)
    tensors = batch.as_tensor_dict(batch.get_padding_lengths())["tokens"]

    assert tensors["bert"].tolist() == [
        [16, 2, 3, 5, 6, 8, 9, 2, 14, 12, 17],
    ]
    assert tensors["bert-offsets"].tolist() == [
        [1, 2, 3, 4, 5, 6, 7, 8, 9],
    ]
def test_starting_ending_offsets(self):
    """Compare the offsets produced by the default indexer and by ``use_starting_offsets=True``."""
    # expected wordpiece ids: 2 3 5 6 8 9 2 15 10 11 14 1
    words = WordTokenizer(word_splitter=BertBasicWordSplitter()).tokenize(
        "the quick brown fox jumped over the laziest lazy elmo"
    )
    vocabulary = Vocabulary()
    vocab_file = str(self.FIXTURES_ROOT / 'bert' / 'vocab.txt')

    # 16 = [CLS], 17 = [SEP]
    expected_pieces = [16, 2, 3, 5, 6, 8, 9, 2, 15, 10, 11, 14, 1, 17]
    # Each case: extra indexer keyword arguments and the offsets they should yield.
    cases = [
        ({}, [1, 2, 3, 4, 5, 6, 7, 10, 11, 12]),
        ({"use_starting_offsets": True}, [1, 2, 3, 4, 5, 6, 7, 8, 11, 12]),
    ]
    for indexer_kwargs, expected_offsets in cases:
        indexer = PretrainedBertIndexer(vocab_file, **indexer_kwargs)
        indexed = indexer.tokens_to_indices(words, vocabulary, "bert")
        assert indexed["bert"] == expected_pieces
        assert indexed["bert-offsets"] == expected_offsets
def test_truncate_window(self):
    """With truncation and ``max_pieces=10`` both offset modes yield the same ten wordpieces."""
    words = WordTokenizer(word_splitter=BertBasicWordSplitter()).tokenize(
        "the quickest quick brown fox jumped over the lazy dog"
    )
    vocabulary = Vocabulary()
    vocab_file = str(self.FIXTURES_ROOT / 'bert' / 'vocab.txt')

    def index_with(starting_offsets):
        # Build a truncating indexer for the requested offset convention.
        indexer = PretrainedBertIndexer(
            vocab_file,
            truncate_long_sequences=True,
            use_starting_offsets=starting_offsets,
            max_pieces=10,
        )
        return indexer.tokens_to_indices(words, vocabulary, "bert")

    # 16 = [CLS], 17 = [SEP]
    truncated_pieces = [16, 2, 3, 4, 3, 5, 6, 8, 9, 17]

    from_start = index_with(True)
    assert from_start["bert"] == truncated_pieces
    assert from_start["bert-offsets"] == [1, 2, 4, 5, 6, 7, 8]
    assert from_start["bert-type-ids"] == [0] * 10

    from_end = index_with(False)
    assert from_end["bert"] == truncated_pieces
    assert from_end["bert-offsets"] == [1, 3, 4, 5, 6, 7, 8]
def test_read(self, lazy):
    """Read the SST-2 dev fixture with BERT indexing and print the first instance's tensors."""
    reader = GLUESST2DatasetReader(
        tokenizer=WordTokenizer(word_splitter=BertBasicWordSplitter()),
        token_indexers={
            'bert': PretrainedBertIndexer(pretrained_model=self.BERT_VOCAB_PATH),
        },
        skip_label_indexing=False,
    )
    instances = ensure_list(reader.read(str(self.FIXTURES_ROOT / 'dev.tsv')))

    first = instances[0]
    words = [token.text for token in first.fields['tokens']]
    gold_label = first.fields['label'].label
    print(gold_label)
    print(words)

    batch = Batch(instances)
    vocabulary = Vocabulary.from_instances(instances)
    batch.index_instances(vocabulary)
    tensors = batch.as_tensor_dict(batch.get_padding_lengths())["tokens"]

    # Everything below is diagnostic output for the first row of the batch.
    print(tensors['mask'].tolist()[0])
    print(tensors["bert"].tolist()[0])
    print([
        vocabulary.get_token_from_index(piece_id, "bert")
        for piece_id in tensors["bert"].tolist()[0]
    ])
    print(len(tensors['bert'][0]))
    print(tensors["bert-offsets"].tolist()[0])
    print(tensors['bert-type-ids'].tolist()[0])
def test_sliding_window(self):
    """Sliding windows over a sentence containing ``[SEP]`` keep consistent type ids."""
    words = WordTokenizer(word_splitter=BertBasicWordSplitter()).tokenize(
        "the quickest quick brown [SEP] jumped over the lazy dog"
    )
    vocabulary = Vocabulary()
    indexer = PretrainedBertIndexer(
        str(self.FIXTURES_ROOT / 'bert' / 'vocab.txt'),
        truncate_long_sequences=False,
        use_starting_offsets=False,
        max_pieces=10,
    )
    indexed = indexer.tokens_to_indices(words, vocabulary, "bert")

    # 16 = [CLS], 17 = [SEP]
    # 1 full window + 1 half window with start/end tokens
    expected_pieces = [
        # [CLS] the quick est quick brown [SEP] jumped over [SEP]
        16, 2, 3, 4, 3, 5, 17, 8, 9, 17,
        # [CLS] brown [SEP] jumped over the lazy dog [SEP]
        16, 5, 17, 8, 9, 2, 14, 12, 17,
    ]
    assert indexed["bert"] == expected_pieces
    assert indexed["bert-offsets"] == [1, 3, 4, 5, 6, 7, 8, 9, 10, 11]

    # The extra [SEP]s shouldn't pollute the token-type-ids
    expected_type_ids = [
        # [CLS] the quick est quick brown [SEP] jumped over [SEP]
        0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
        # [CLS] brown [SEP] jumped over the lazy dog [SEP]
        0, 0, 0, 1, 1, 1, 1, 1, 1,
    ]
    assert indexed["bert-type-ids"] == expected_type_ids
def test_end_to_end(self):
    """Index and embed a hand-assembled ``[CLS] s1 [SEP] s2`` sentence pair."""
    tokenizer = WordTokenizer(word_splitter=BertBasicWordSplitter())

    # 2 3 4 3 5 6 8 9 2 14 12
    first_words = tokenizer.tokenize(
        "The quickest quick brown fox jumped over the lazy dog")
    # 2 3 5 6 8 9 2 15 10 11 14 1
    second_words = tokenizer.tokenize(
        "The quick brown fox jumped over the laziest lazy elmo")
    assert len(first_words) == 10
    assert len(second_words) == 10

    pair = [Token('[CLS]')] + first_words + [Token('[SEP]')] + second_words
    assert len(pair) == 22

    vocabulary = Vocabulary()
    batch = Batch([
        Instance({"sentence_pair": TextField(pair, {"bert": self.token_indexer})}),
    ])
    batch.index_instances(vocabulary)
    tensors = batch.as_tensor_dict(batch.get_padding_lengths())["sentence_pair"]

    assert tensors['mask'].tolist()[0] == [1] * 22
    assert tensors["bert"].tolist()[0] == [
        101, 1996, 4248, 4355, 4248, 2829, 4419, 5598, 2058, 1996, 13971,
        3899, 102,
        1996, 4248, 2829, 4419, 5598, 2058, 1996, 2474, 14272, 3367, 13971,
        17709, 2080,
    ]

    decoded = [
        vocabulary.get_token_from_index(piece_id, "bert")
        for piece_id in tensors["bert"].tolist()[0]
    ]
    assert decoded == [
        '[CLS]', 'the', 'quick', '##est', 'quick', 'brown', 'fox', 'jumped',
        'over', 'the', 'lazy', 'dog', '[SEP]',
        'the', 'quick', 'brown', 'fox', 'jumped', 'over', 'the', 'la',
        '##zie', '##st', 'lazy', 'elm', '##o',
    ]
    assert len(tensors['bert'][0]) == 26
    assert tensors["bert-offsets"].tolist()[0] == [
        0, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
        22, 23, 25,
    ]
    assert tensors['bert-type-ids'].tolist()[0] == [0] * 13 + [1] * 13

    vectors = self.token_embedder(
        tensors["bert"],
        offsets=tensors["bert-offsets"],
        token_type_ids=tensors['bert-type-ids'],
    )
    assert list(vectors.shape) == [1, 22, 768]
class Conll04BERTBinaryReader(Conll04SpaCyBinaryReader):
    """Variant of ``Conll04SpaCyBinaryReader`` that uses the BERT basic word
    splitter and a pretrained BERT indexer for the ``word`` text field."""

    # Class-level splitter instance, created once when the class is defined.
    splitter = BertBasicWordSplitter()

    @classmethod
    def match(cls, index, tokens, split_tokens):
        """Return the indices of ``split_tokens`` that overlap a span of ``tokens``.

        ``tokens`` (plain strings) and ``split_tokens`` (objects with a
        ``.text`` attribute) are walked in lock-step, comparing lowercased
        text. Every split-token index visited while the original-token
        cursor lies in ``[index[0], index[1])`` is collected once.

        The asserts require both sequences to spell the same lowercased
        character stream and to end together; an ``AssertionError`` is raised
        otherwise.
        """
        tk_idx = 0
        tk_tkn = tokens[tk_idx].lower()
        st_idx = 0
        st_tkn = split_tokens[st_idx].text.lower()
        matched_tokens = []
        while True:
            # Record the current split token if the original-token cursor is
            # inside the requested span and it has not been recorded yet.
            if index[0] <= tk_idx and tk_idx < index[1] and st_idx not in matched_tokens:
                matched_tokens.append(st_idx)
            if len(tk_tkn) < len(st_tkn):
                # The split token covers more text: consume the original
                # token and keep the remainder of the split token.
                assert st_tkn.startswith(tk_tkn)
                st_tkn = st_tkn[len(tk_tkn):]
                tk_idx += 1
                tk_tkn = tokens[tk_idx].lower()
            elif len(tk_tkn) > len(st_tkn):
                # The original token covers more text: consume the split
                # token and keep the remainder of the original token.
                assert tk_tkn.startswith(st_tkn)
                tk_tkn = tk_tkn[len(st_tkn):]
                st_idx += 1
                st_tkn = split_tokens[st_idx].text.lower()
            else:
                # Both remainders end at the same place: advance both cursors.
                assert st_tkn == tk_tkn
                tk_idx += 1
                st_idx += 1
                if tk_idx == len(tokens):
                    # Both sequences must be exhausted at the same time.
                    assert st_idx == len(split_tokens)
                    break
                tk_tkn = tokens[tk_idx].lower()
                st_tkn = split_tokens[st_idx].text.lower()
        return matched_tokens

    # NOTE(review): `cls` in the decorators below is resolved at class-body
    # level, so it must be a name from the enclosing module scope (not visible
    # in this chunk) -- it is not the `cls` parameter of `match`.
    @cls.textfield('word')
    def update_sentence_raw(self, fields, tokens) -> Field:
        """Wrap ``tokens`` in a ``TextField`` indexed with 'bert-base-uncased'."""
        indexers = {
            'word': PretrainedBertIndexer(pretrained_model='bert-base-uncased')
        }
        textfield = TextField(tokens, indexers)
        return textfield

    @cls.field('cancidate')
    def update_relation_cancidate(self, fields: Dict, raw_sample) -> Field:
        """Build an ``AdjacencyField`` over the word field from the sample's relations.

        Returns ``None`` when the sample carries no relations. Each relation
        contributes the pair of entries selected by ``self._entity_index``
        from its source and destination tokens.
        """
        tokens, labels, relations = raw_sample
        if relations is None:
            return None
        relation_indices = []
        for relation_type, src_token, dst_token in relations:
            relation_indices.append(
                (src_token[self._entity_index], dst_token[self._entity_index]))
        return AdjacencyField(relation_indices,
                              fields[self.get_fieldname('word')],
                              padding_value=0)
def get_tokenizer(embedding_type: str, xlnet_vocab_file: Path) -> WordTokenizer:
    """Build the word tokenizer that matches an embedding type.

    Args:
        embedding_type: One of ``'bert'``, ``'glove'`` or ``'xlnet'``.
        xlnet_vocab_file: Vocabulary file handed to the XLNet splitter; only
            read when ``embedding_type`` is ``'xlnet'``.

    Returns:
        A ``WordTokenizer`` wrapping the splitter selected for the embedding
        type. (The annotation previously claimed ``WordSplitter``, which is
        the type of the wrapped object, not of the returned one.)

    Raises:
        ValueError: If ``embedding_type`` is not one of the supported names.
    """
    if embedding_type == 'bert':
        splitter = BertBasicWordSplitter()
    elif embedding_type == 'glove':
        splitter = SpacyWordSplitter()
    elif embedding_type == 'xlnet':
        splitter = XLNetWordSplitter(vocab_file=str(xlnet_vocab_file))
    else:
        raise ValueError(f'Embedding type {embedding_type} not available.')

    return WordTokenizer(word_splitter=splitter)
def __init__(self, word_indexer: Optional[TokenIndexer] = None):
    """Set up a BERT-based tokeniser and the indexer used for the ``tokens`` key.

    When ``word_indexer`` is ``None`` a non-truncating
    ``PretrainedBertIndexer`` for 'bert-base-uncased' is created.
    """
    super().__init__(lazy=False)
    self.tokeniser = WordTokenizer(word_splitter=BertBasicWordSplitter())

    indexer = (
        word_indexer
        if word_indexer is not None
        else PretrainedBertIndexer(
            pretrained_model='bert-base-uncased',
            truncate_long_sequences=False,
        )
    )
    self.word_indexers = {'tokens': indexer}
def __init__(self,
             max_instances: int = None,
             min_abstract_len: int = 10,
             lazy: bool = False,
             tokenizer: Tokenizer = None,
             token_indexers: Dict[str, TokenIndexer] = None,
             ) -> None:
    """Store the reader limits and fall back to default tokenizer / indexers.

    A falsy ``tokenizer`` is replaced by a ``BertBasicWordSplitter`` and a
    falsy ``token_indexers`` by a single ``SingleIdTokenIndexer`` under the
    ``"tokens"`` key.
    """
    super().__init__(lazy)

    # NOTE(review): the fallback is a word splitter although the parameter
    # is annotated as a Tokenizer -- confirm callers use the matching API.
    if not tokenizer:
        tokenizer = BertBasicWordSplitter()
    if not token_indexers:
        token_indexers = {"tokens": SingleIdTokenIndexer()}

    self.max_instances = max_instances
    self.min_abstract_len = min_abstract_len
    self._tokenizer = tokenizer
    self._token_indexers = token_indexers
def test_end_to_end(self):
    """Index a two-sentence batch and check embedder output shapes with and without offsets."""
    tokenizer = WordTokenizer(word_splitter=BertBasicWordSplitter())

    # expected wordpiece ids: 2 3 4 3 5 6 8 9 2 14 12
    first_words = tokenizer.tokenize(
        "the quickest quick brown fox jumped over the lazy dog")
    # expected wordpiece ids: 2 3 5 6 8 9 2 15 10 11 14 1
    second_words = tokenizer.tokenize(
        "the quick brown fox jumped over the laziest lazy elmo")

    vocabulary = Vocabulary()
    batch = Batch([
        Instance({"tokens": TextField(first_words, {"bert": self.token_indexer})}),
        Instance({"tokens": TextField(second_words, {"bert": self.token_indexer})}),
    ])
    batch.index_instances(vocabulary)
    tensors = batch.as_tensor_dict(batch.get_padding_lengths())["tokens"]
    wordpieces = tensors["bert"]
    offsets = tensors["bert-offsets"]

    # 16 = [CLS], 17 = [SEP]
    assert wordpieces.tolist() == [
        [16, 2, 3, 4, 3, 5, 6, 8, 9, 2, 14, 12, 17, 0],
        [16, 2, 3, 5, 6, 8, 9, 2, 15, 10, 11, 14, 1, 17],
    ]
    assert offsets.tolist() == [
        [1, 3, 4, 5, 6, 7, 8, 9, 10, 11],
        [1, 2, 3, 4, 5, 6, 7, 10, 11, 12],
    ]

    def embedded_shape(embedder, **kwargs):
        # Shape of the embedder output for the indexed batch.
        return list(embedder(wordpieces, **kwargs).shape)

    # No offsets, should get 14 vectors back ([CLS] + 12 token wordpieces + [SEP])
    assert embedded_shape(self.token_embedder) == [2, 14, 12]
    # Offsets, should get 10 vectors back.
    assert embedded_shape(self.token_embedder, offsets=offsets) == [2, 10, 12]

    # Now try top_layer_only = True
    top_layer_embedder = BertEmbedder(self.bert_model, top_layer_only=True)
    assert embedded_shape(top_layer_embedder) == [2, 14, 12]
    assert embedded_shape(top_layer_embedder, offsets=offsets) == [2, 10, 12]
def test_sliding_window(self):
    """Check window layout and embedder shapes when the input exceeds ``max_pieces=8``."""
    words = WordTokenizer(word_splitter=BertBasicWordSplitter()).tokenize(
        "the quickest quick brown fox jumped over the lazy dog"
    )
    vocabulary = Vocabulary()

    bert_fixtures = self.FIXTURES_ROOT / "bert"
    indexer = PretrainedBertIndexer(
        str(bert_fixtures / "vocab.txt"),
        truncate_long_sequences=False,
        max_pieces=8,
    )
    embedder = BertEmbedder(
        BertModel(BertConfig(str(bert_fixtures / "config.json"))),
        max_pieces=8,
    )

    batch = Batch([Instance({"tokens": TextField(words, {"bert": indexer})})])
    batch.index_instances(vocabulary)
    tensors = batch.as_tensor_dict(batch.get_padding_lengths())["tokens"]
    wordpieces = tensors["bert"]
    offsets = tensors["bert-offsets"]

    # 16 = [CLS], 17 = [SEP]
    # 1 full window + 1 half window with start/end tokens
    assert wordpieces.tolist() == [[
        16, 2, 3, 4, 3, 5, 6, 17,
        16, 3, 5, 6, 8, 9, 2, 17,
        16, 8, 9, 2, 14, 12, 17,
    ]]
    assert offsets.tolist() == [[1, 3, 4, 5, 6, 7, 8, 9, 10, 11]]

    without_offsets = embedder(wordpieces)
    assert list(without_offsets.shape) == [1, 13, 12]

    # Testing without token_type_ids
    with_offsets = embedder(wordpieces, offsets=offsets)
    assert list(with_offsets.shape) == [1, 10, 12]

    # Testing with token_type_ids
    with_type_ids = embedder(
        wordpieces,
        offsets=offsets,
        token_type_ids=tensors["bert-type-ids"],
    )
    assert list(with_type_ids.shape) == [1, 10, 12]
def test_end_to_end_with_higher_order_inputs(self):
    """Embed ``ListField``-wrapped text fields and check the extra batch dimension."""
    tokenizer = WordTokenizer(word_splitter=BertBasicWordSplitter())

    def as_text_field(sentence):
        # Tokenize one sentence and wrap it with the shared BERT indexer.
        return TextField(tokenizer.tokenize(sentence), {"bert": self.token_indexer})

    # 2 3 4 3 5 6 8 9 2 14 12
    field_one = as_text_field("the quickest quick brown fox jumped over the lazy dog")
    # 2 3 5 6 8 9 2 15 10 11 14 1
    field_two = as_text_field("the quick brown fox jumped over the laziest lazy elmo")
    # 2 5 15 10 11 6
    field_three = as_text_field("the brown laziest fox")

    vocabulary = Vocabulary()
    batch = Batch([
        Instance({"tokens": ListField([field_one])}),
        Instance({"tokens": ListField([field_two, field_three])}),
    ])
    batch.index_instances(vocabulary)
    tensors = batch.as_tensor_dict(batch.get_padding_lengths(), verbose=True)["tokens"]
    wordpieces = tensors["bert"]
    offsets = tensors["bert-offsets"]

    def embedded_shape(embedder, **kwargs):
        # Shape of the embedder output for the indexed batch.
        return list(embedder(wordpieces, **kwargs).shape)

    # No offsets, should get 12 vectors back.
    assert embedded_shape(self.token_embedder) == [2, 2, 12, 12]
    # Offsets, should get 10 vectors back.
    assert embedded_shape(self.token_embedder, offsets=offsets) == [2, 2, 10, 12]

    # Now try top_layer_only = True
    top_layer_embedder = BertEmbedder(self.bert_model, top_layer_only=True)
    assert embedded_shape(top_layer_embedder) == [2, 2, 12, 12]
    assert embedded_shape(top_layer_embedder, offsets=offsets) == [2, 2, 10, 12]
def __init__(self,
             label_field: str,
             text_field: str,
             paper_lookup_path: str,
             sent_max_len: int = 256,
             lazy: bool = False,
             tokenizer: Tokenizer = None,
             token_indexers: Dict[str, TokenIndexer] = None,
             ) -> None:
    """Store field names and limits, pick default tokenizer / indexers, load the paper lookup.

    A falsy ``tokenizer`` is replaced by a ``BertBasicWordSplitter`` and a
    falsy ``token_indexers`` by a single ``SingleIdTokenIndexer`` under the
    ``"tokens"`` key. The lookup is loaded last, via
    ``self.load_paper_lookup(paper_lookup_path)``.
    """
    super().__init__(lazy)

    # NOTE(review): the fallback is a word splitter although the parameter
    # is annotated as a Tokenizer -- confirm callers use the matching API.
    if not tokenizer:
        tokenizer = BertBasicWordSplitter()
    if not token_indexers:
        token_indexers = {"tokens": SingleIdTokenIndexer()}

    self._tokenizer = tokenizer
    self._token_indexers = token_indexers
    self._label_field = label_field
    self._text_field = text_field
    self._sent_max_len = sent_max_len
    self._paper_lookup = self.load_paper_lookup(paper_lookup_path)
def test_truncate_window_dont_split_wordpieces(self):
    """
    Tests if the sentence is not truncated inside of the word with 2 or
    more wordpieces.
    """
    words = WordTokenizer(word_splitter=BertBasicWordSplitter()).tokenize(
        "the quickest quick brown fox jumped over the quickest dog"
    )
    vocabulary = Vocabulary()
    vocab_file = str(self.FIXTURES_ROOT / 'bert' / 'vocab.txt')

    def index_with(starting_offsets):
        # Build a truncating indexer for the requested offset convention.
        indexer = PretrainedBertIndexer(
            vocab_file,
            truncate_long_sequences=True,
            use_starting_offsets=starting_offsets,
            max_pieces=12,
        )
        return indexer.tokens_to_indices(words, vocabulary, "bert")

    # 16 = [CLS], 17 = [SEP]
    # One more piece would fit under max_pieces, but it is left out so the
    # cut does not fall in the middle of a word.
    truncated_pieces = [16, 2, 3, 4, 3, 5, 6, 8, 9, 2, 17]

    from_start = index_with(True)
    assert from_start["bert"] == truncated_pieces
    assert from_start["bert-offsets"] == [1, 2, 4, 5, 6, 7, 8, 9]
    assert from_start["bert-type-ids"] == [0] * 11

    from_end = index_with(False)
    assert from_end["bert"] == truncated_pieces
    assert from_end["bert-offsets"] == [1, 3, 4, 5, 6, 7, 8, 9]
def test_truncate_window_fit_two_wordpieces(self):
    """
    Tests if the both `use_starting_offsets` options work properly when
    last word in the truncated sentence consists of two wordpieces.
    """
    words = WordTokenizer(word_splitter=BertBasicWordSplitter()).tokenize(
        "the quickest quick brown fox jumped over the quickest dog"
    )
    vocabulary = Vocabulary()
    vocab_file = str(self.FIXTURES_ROOT / 'bert' / 'vocab.txt')

    def index_with(starting_offsets):
        # Build a truncating indexer for the requested offset convention.
        indexer = PretrainedBertIndexer(
            vocab_file,
            truncate_long_sequences=True,
            use_starting_offsets=starting_offsets,
            max_pieces=13,
        )
        return indexer.tokens_to_indices(words, vocabulary, "bert")

    # 16 = [CLS], 17 = [SEP]
    truncated_pieces = [16, 2, 3, 4, 3, 5, 6, 8, 9, 2, 3, 4, 17]

    from_start = index_with(True)
    assert from_start["bert"] == truncated_pieces
    assert from_start["bert-offsets"] == [1, 2, 4, 5, 6, 7, 8, 9, 10]
    assert from_start["bert-type-ids"] == [0] * 13

    from_end = index_with(False)
    assert from_end["bert"] == truncated_pieces
    assert from_end["bert-offsets"] == [1, 3, 4, 5, 6, 7, 8, 9, 11]
def test_read(self, lazy):
    """Read the SNLI sample with BERT indexing, check the first instances, print the tensors."""
    reader = SnliReader(
        tokenizer=WordTokenizer(word_splitter=BertBasicWordSplitter()),
        token_indexers={
            'bert': PretrainedBertIndexer(pretrained_model=self.BERT_VOCAB_PATH),
        },
    )
    instances = ensure_list(
        reader.read(str(self.FIXTURES_ROOT / 'snli_1.0_sample.jsonl')))

    first = instances[0]
    words = [token.text for token in first.fields['tokens'].tokens]
    gold_label = first.fields['label'].label
    first_weight = first.fields['weight'].weight

    assert gold_label == 'neutral'
    assert first_weight == 1
    assert instances[1].fields['weight'].weight == 0.5
    assert instances[2].fields['weight'].weight == 1
    # Premise and hypothesis are joined by a single [SEP] token.
    assert words == [
        'a', 'person', 'on', 'a', 'horse', 'jumps', 'over', 'a', 'broken',
        'down', 'airplane', '.',
        '[SEP]',
        'a', 'person', 'is', 'training', 'his', 'horse', 'for', 'a',
        'competition', '.',
    ]

    batch = Batch(instances)
    vocabulary = Vocabulary.from_instances(instances)
    batch.index_instances(vocabulary)
    tensors = batch.as_tensor_dict(batch.get_padding_lengths())["tokens"]

    # Everything below is diagnostic output for the first row of the batch.
    print(tensors['mask'].tolist()[0])
    print(tensors["bert"].tolist()[0])
    print([
        vocabulary.get_token_from_index(piece_id, "bert")
        for piece_id in tensors["bert"].tolist()[0]
    ])
    print(len(tensors['bert'][0]))
    print(tensors["bert-offsets"].tolist()[0])
    print(tensors['bert-type-ids'].tolist()[0])
def test_max_length(self):
    """Run a 1000-word input through an embedder built from a fresh config."""
    embedder = BertEmbedder(
        BertModel(BertConfig(len(self.token_indexer.vocab))))

    words = WordTokenizer(word_splitter=BertBasicWordSplitter()).tokenize(
        "the " * 1000)
    vocabulary = Vocabulary()

    batch = Batch([
        Instance({"tokens": TextField(words, {"bert": self.token_indexer})}),
    ])
    batch.index_instances(vocabulary)
    tensors = batch.as_tensor_dict(batch.get_padding_lengths())["tokens"]

    # There is no assertion: the test passes when this call does not raise.
    embedder(tensors["bert"], tensors["bert-offsets"])
def set_values(max_sequence_length: Optional[int] = -1,
               concat_title_abstract: Optional[bool] = None,
               data_source: Optional[str] = None,
               included_text_fields: Optional[str] = None) -> None:
    """Initialise the module-level tokenizer / indexers and store the run settings.

    The tokenizer and the token indexers are created only while the global
    ``_tokenizer`` is still ``None``; the four settings passed as arguments
    are written to their globals on every call.

    NOTE(review): ``bert_params`` is read from module scope and is not
    visible in this chunk -- confirm it is defined before the first call.
    """
    # set global values
    # note: a class with __init__ would have been a better design
    # we have this structure for efficiency reasons: to support multiprocessing
    # as multiprocessing with class methods is slower
    global _tokenizer
    global _token_indexers
    global _token_indexer_author_id
    global _token_indexer_author_position
    global _token_indexer_venue
    global _token_indexer_id
    global _max_sequence_length
    global _concat_title_abstract
    global _data_source
    global _included_text_fields

    if _tokenizer is None:  # if not initialized, initialize the tokenizers and token indexers
        # `bert_params["do_lowercase"]` is read here, before the same dict
        # is wrapped in Params below.
        # NOTE(review): presumably from_params may consume keys of the
        # wrapped dict, so keep this order -- confirm.
        _tokenizer = WordTokenizer(word_splitter=BertBasicWordSplitter(
            do_lower_case=bert_params["do_lowercase"]))
        _token_indexers = {
            "bert": PretrainedBertIndexer.from_params(Params(bert_params))
        }
        # One single-id indexer per metadata namespace.
        _token_indexer_author_id = {
            "tokens": SingleIdTokenIndexer(namespace='author')
        }
        _token_indexer_author_position = {
            "tokens": SingleIdTokenIndexer(namespace='author_positions')
        }
        _token_indexer_venue = {
            "tokens": SingleIdTokenIndexer(namespace='venue')
        }
        _token_indexer_id = {"tokens": SingleIdTokenIndexer(namespace='id')}
    # The run settings are overwritten on every call.
    _max_sequence_length = max_sequence_length
    _concat_title_abstract = concat_title_abstract
    _data_source = data_source
    _included_text_fields = included_text_fields
def word_embeddings(self):
    """Return the 'bert-base-uncased' embedder output for ``self.text``.

    The text is split on runs of non-word characters and re-joined with
    single spaces before tokenization. Tokenizer, indexer and embedder are
    all created anew on every call.
    """
    cleaned_text = ' '.join(re.split(r'\W+', self.text))
    word_tokens = WordTokenizer(
        word_splitter=BertBasicWordSplitter()).tokenize(cleaned_text)

    vocabulary = Vocabulary()
    indexer = PretrainedBertIndexer('bert-base-uncased')
    batch = Batch([
        Instance({"tokens": TextField(word_tokens, {'bert': indexer})}),
    ])
    batch.index_instances(vocabulary)
    tensors = batch.as_tensor_dict(batch.get_padding_lengths())["tokens"]

    embedder = PretrainedBertEmbedder('bert-base-uncased')
    return embedder(tensors["bert"])
def __init__(self,
             pretrained_model: str,
             token_indexers: Dict[str, TokenIndexer] = None,
             max_pieces: int = 512,
             num_choices: int = 5,
             answer_only: bool = False,
             restrict_num_choices: int = None,
             ignore_context: bool = False,
             sample: int = -1,
             random_seed: int = 0) -> None:
    """Store the reader options and build the BERT word splitter.

    A falsy ``token_indexers`` is replaced by a single
    ``SingleIdTokenIndexer`` under the ``'tokens'`` key. The splitter
    lowercases its input unless ``pretrained_model`` contains ``'-cased'``.
    """
    super().__init__()

    if not token_indexers:
        token_indexers = {'tokens': SingleIdTokenIndexer()}
    self._token_indexers = token_indexers

    self._word_splitter = BertBasicWordSplitter(
        do_lower_case='-cased' not in pretrained_model)

    self._max_pieces = max_pieces
    self._sample = sample
    self._num_choices = num_choices
    self._answer_only = answer_only
    self._restrict_num_choices = restrict_num_choices
    self._ignore_context = ignore_context
    self._random_seed = random_seed
def __init__(self,
             text_lookup_path: str,
             embedded_text: str = 'title',
             use_bos_eos: bool = True,
             lazy: bool = False,
             sent_len_limit: int = None,
             abstract_tokenizer: Tokenizer = None,
             abstract_indexers: Dict[str, TokenIndexer] = None,
             sequence_tokenizer: Tokenizer = None,
             sequence_indexers: Dict[str, TokenIndexer] = None) -> None:
    """Load the text lookup table and configure tokenizers / indexers.

    For ``embedded_text == 'title'`` the lookup file is read as
    tab-separated lines; lines with more than two columns map column 0 to
    ``{'abstract': <column 2>}``. For ``'abstract'`` it is read with
    ``jsonlines`` and keyed by each item's ``'paper_id'``.

    NOTE(review): ``use_bos_eos`` is not read anywhere in this constructor,
    and any other ``embedded_text`` value leaves ``self.data_lookup`` unset
    before it is indexed below -- confirm both are intended.
    """
    super().__init__(lazy)

    if embedded_text == 'title':
        with open(text_lookup_path) as lookup_file:
            rows = (raw.strip().split('\t') for raw in lookup_file.readlines())
            self.data_lookup = {
                row[0]: {'abstract': row[2]}
                for row in rows
                if len(row) > 2
            }
    elif embedded_text == 'abstract':
        with jsonlines.open(text_lookup_path) as records:
            self.data_lookup = {record['paper_id']: record for record in records}

    # Add these now so we can find them in the lookup, then replace with
    # [unused0] and [unused1] in text_to_instance method
    self.data_lookup['<s>'] = {'abstract': '<s>'}

    if not abstract_tokenizer:
        abstract_tokenizer = BertBasicWordSplitter()
    if not sequence_tokenizer:
        sequence_tokenizer = JustSpacesWordSplitter()
    if not sequence_indexers:
        sequence_indexers = {"tokens": SingleIdTokenIndexer()}

    self._sent_len_limit = sent_len_limit
    self._abstract_tokenizer = abstract_tokenizer
    # No fallback here: stays None when the caller passes nothing.
    self._abstract_indexers = abstract_indexers
    self._sequence_tokenizer = sequence_tokenizer
    self._sequence_indexers = sequence_indexers
def __init__(self,
             is_bert: bool,
             conceptnet_path: Path,
             word_indexer: Optional[TokenIndexer] = None):
    """Choose BERT or spaCy tokenisation / indexing and load ConceptNet.

    ``is_bert`` selects both the word splitter and, when ``word_indexer``
    is ``None``, the default indexer: a truncating 'bert-base-uncased'
    indexer, or a lowercasing ``SingleIdTokenIndexer`` otherwise.
    """
    super().__init__(lazy=False)

    splitter = BertBasicWordSplitter() if is_bert else SpacyWordSplitter()
    self.tokeniser = WordTokenizer(word_splitter=splitter)

    # Build a default indexer only when the caller did not supply one.
    if word_indexer is None and is_bert:
        word_indexer = PretrainedBertIndexer(
            pretrained_model='bert-base-uncased',
            truncate_long_sequences=True,
        )
    elif word_indexer is None:
        word_indexer = SingleIdTokenIndexer(lowercase_tokens=True)
    self.word_indexers = {'tokens': word_indexer}

    # self.rel_indexers = {
    #     "rel_tokens": SingleIdTokenIndexer(namespace='rel_tokens')}
    self.conceptnet = ConceptNet(conceptnet_path=conceptnet_path)
from allennlp.data.dataset import Batch
from allennlp.data.fields import TextField
from allennlp.data.instance import Instance
from allennlp.data.token_indexers.wordpiece_indexer import PretrainedBertIndexer
from allennlp.data.tokenizers import WordTokenizer
from allennlp.data.tokenizers.word_splitter import BertBasicWordSplitter
from allennlp.data.vocabulary import Vocabulary
from allennlp.modules.token_embedders.bert_token_embedder import PretrainedBertEmbedder
import re

# Module-level singletons: created once at import time and shared by every
# `preprocessing` instance.
tokenizer = WordTokenizer(word_splitter=BertBasicWordSplitter())
vocab = Vocabulary()
token_indexer = PretrainedBertIndexer('bert-base-uncased')
model = PretrainedBertEmbedder('bert-base-uncased')


class preprocessing(object):
    """Holds one piece of text and prepares it for BERT embedding."""

    def __init__(self, text):
        """Store the raw ``text`` to be processed."""
        self.text = text

    def bert_vector(self):
        """Tokenize ``self.text`` and index it as a one-instance batch.

        NOTE(review): as visible here the method stops after indexing the
        batch, never calls ``model`` and returns nothing -- confirm whether
        the remainder of the method was cut off.
        """
        # Split on runs of non-word characters and re-join with single
        # spaces, which drops punctuation before tokenization.
        words = re.split(r'\W+', self.text)
        Text = ' '.join(words)
        tokens = tokenizer.tokenize(Text)
        instance = Instance(
            {"tokens": TextField(tokens, {'bert': token_indexer})})
        batch = Batch([instance])
        batch.index_instances(vocab)
def setUp(self):
    """Run the parent ``setUp`` and create the ``BertBasicWordSplitter`` stored on ``self.word_splitter``."""
    super().setUp()
    self.word_splitter = BertBasicWordSplitter()
if label == pred_label: correct += 1 print( f'Accuracy: {correct}/{len(labels)} = {correct/len(labels)*100:.2f}%') if __name__ == '__main__': parser = argparse.ArgumentParser(description='CSQA using BERT NSP model') parser.add_argument('--input', help='input dataset') parser.add_argument('--bert-vocab', help='bert vocab file') parser.add_argument('--bert-model', help='pretrained bert model') parser.add_argument('--batch-size', type=int, default=8, help='batch size for BERT') parser.add_argument('--gpu-id', '-g', type=int, default=0, help='GPU ID') args = parser.parse_args() print('Initialize BERT model...') TOKENIZER = WordTokenizer(word_splitter=BertBasicWordSplitter()) WORD_INDEXER = PretrainedBertIndexer(pretrained_model=args.bert_vocab) VOCAB = Vocabulary() GPU_ID = args.gpu_id BERT_NEXT_SENTENCE = BertForNextSentencePrediction.from_pretrained( args.bert_model).to(torch.device(f"cuda:{GPU_ID}")) BERT_NEXT_SENTENCE.eval() main()