def test_truncate_window_fit_two_wordpieces(self):
    """
    Tests whether both `use_starting_offsets` options work properly when the
    last word in the truncated sentence consists of two wordpieces.
    """
    tokenizer = BertPreTokenizer()

    sentence = "the quickest quick brown fox jumped over the quickest dog"
    tokens = tokenizer.tokenize(sentence)

    vocab = Vocabulary()
    vocab_path = self.FIXTURES_ROOT / "bert" / "vocab.txt"
    token_indexer = PretrainedBertIndexer(
        str(vocab_path), truncate_long_sequences=True, use_starting_offsets=True, max_pieces=13
    )

    indexed_tokens = token_indexer.tokens_to_indices(tokens, vocab)

    # 16 = [CLS], 17 = [SEP]
    # The sequence is truncated to max_pieces; the final word's two wordpieces fit exactly.
    assert indexed_tokens["input_ids"] == [16, 2, 3, 4, 3, 5, 6, 8, 9, 2, 3, 4, 17]
    assert indexed_tokens["offsets"] == [1, 2, 4, 5, 6, 7, 8, 9, 10]
    assert indexed_tokens["token_type_ids"] == [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

    token_indexer = PretrainedBertIndexer(
        str(vocab_path), truncate_long_sequences=True, use_starting_offsets=False, max_pieces=13
    )

    indexed_tokens = token_indexer.tokens_to_indices(tokens, vocab)

    # 16 = [CLS], 17 = [SEP]
    # The sequence is truncated to max_pieces; the final word's two wordpieces fit exactly.
    assert indexed_tokens["input_ids"] == [16, 2, 3, 4, 3, 5, 6, 8, 9, 2, 3, 4, 17]
    assert indexed_tokens["offsets"] == [1, 3, 4, 5, 6, 7, 8, 9, 11]
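# A minimal sketch (hypothetical helper, not part of the test suite) of how
# starting vs. ending offsets relate to wordpiece counts: each word maps to
# the index of either its first or its last wordpiece, with position 0 taken
# by [CLS].
def demo_offsets(pieces_per_word, use_starting_offsets):
    offsets = []
    cursor = 1  # first wordpiece sits at position 1, right after [CLS]
    for n in pieces_per_word:
        if use_starting_offsets:
            offsets.append(cursor)  # index of the word's first wordpiece
        else:
            offsets.append(cursor + n - 1)  # index of the word's last wordpiece
        cursor += n
    return offsets

# "the quickest quick" -> "quickest" is two wordpieces, so the counts are
# [1, 2, 1], matching the first three offsets asserted above:
assert demo_offsets([1, 2, 1], use_starting_offsets=True) == [1, 2, 4]
assert demo_offsets([1, 2, 1], use_starting_offsets=False) == [1, 3, 4]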
def test_truncate_window(self):
    tokenizer = WordTokenizer(word_splitter=BertBasicWordSplitter())

    sentence = "the quickest quick brown fox jumped over the lazy dog"
    tokens = tokenizer.tokenize(sentence)

    vocab = Vocabulary()
    vocab_path = self.FIXTURES_ROOT / 'bert' / 'vocab.txt'
    token_indexer = PretrainedBertIndexer(str(vocab_path),
                                          truncate_long_sequences=True,
                                          use_starting_offsets=True,
                                          max_pieces=10)

    indexed_tokens = token_indexer.tokens_to_indices(tokens, vocab, "bert")

    # 16 = [CLS], 17 = [SEP]
    # The sequence is truncated to max_pieces, keeping [CLS] and [SEP].
    assert indexed_tokens["bert"] == [16, 2, 3, 4, 3, 5, 6, 8, 9, 17]
    assert indexed_tokens["bert-offsets"] == [1, 2, 4, 5, 6, 7, 8]
    assert indexed_tokens["bert-type-ids"] == [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

    token_indexer = PretrainedBertIndexer(str(vocab_path),
                                          truncate_long_sequences=True,
                                          use_starting_offsets=False,
                                          max_pieces=10)

    indexed_tokens = token_indexer.tokens_to_indices(tokens, vocab, "bert")

    # 16 = [CLS], 17 = [SEP]
    # The sequence is truncated to max_pieces, keeping [CLS] and [SEP].
    assert indexed_tokens["bert"] == [16, 2, 3, 4, 3, 5, 6, 8, 9, 17]
    assert indexed_tokens["bert-offsets"] == [1, 3, 4, 5, 6, 7, 8]
def test_do_lowercase(self):
    # Our default tokenizer doesn't handle lowercasing.
    tokenizer = WordTokenizer()

    # Quick is UNK because of capitalization
    #            2   1     5     6   8      9    2  15 10 11 14   1
    sentence = "the Quick brown fox jumped over the laziest lazy elmo"
    tokens = tokenizer.tokenize(sentence)

    vocab = Vocabulary()
    vocab_path = self.FIXTURES_ROOT / 'bert' / 'vocab.txt'
    token_indexer = PretrainedBertIndexer(str(vocab_path), do_lowercase=False)

    indexed_tokens = token_indexer.tokens_to_indices(tokens, vocab, "bert")

    # Quick should get 1 == OOV
    assert indexed_tokens["bert"] == [16, 2, 1, 5, 6, 8, 9, 2, 15, 10, 11, 14, 1, 17]

    # Does lowercasing by default
    token_indexer = PretrainedBertIndexer(str(vocab_path))
    indexed_tokens = token_indexer.tokens_to_indices(tokens, vocab, "bert")

    # Now Quick should get indexed correctly as 3 ( == "quick")
    assert indexed_tokens["bert"] == [16, 2, 3, 5, 6, 8, 9, 2, 15, 10, 11, 14, 1, 17]
def test_never_lowercase(self):
    # Our default tokenizer doesn't handle lowercasing.
    tokenizer = WordTokenizer()

    #            2   15 10 11  6
    sentence = "the laziest fox"
    tokens = tokenizer.tokenize(sentence)
    tokens.append(Token("[PAD]"))  # have to do this b/c the tokenizer splits it in three

    vocab = Vocabulary()
    vocab_path = self.FIXTURES_ROOT / 'bert' / 'vocab.txt'
    token_indexer = PretrainedBertIndexer(str(vocab_path), do_lowercase=True)

    indexed_tokens = token_indexer.tokens_to_indices(tokens, vocab, "bert")

    # [PAD] should get recognized (id 0) and not lowercased
    assert indexed_tokens["bert"] == [16, 2, 15, 10, 11, 6, 0, 17]

    # Unless we manually override the set of tokens that are never lowercased
    token_indexer = PretrainedBertIndexer(str(vocab_path), do_lowercase=True, never_lowercase=())
    indexed_tokens = token_indexer.tokens_to_indices(tokens, vocab, "bert")

    # Now [PAD] should get lowercased and become UNK (id 1)
    assert indexed_tokens["bert"] == [16, 2, 15, 10, 11, 6, 1, 17]
def test_starting_ending_offsets(self):
    tokenizer = WordTokenizer(word_splitter=BertBasicWordSplitter())

    #            2   3     5     6   8      9    2  15 10 11 14   1
    sentence = "the quick brown fox jumped over the laziest lazy elmo"
    tokens = tokenizer.tokenize(sentence)

    vocab = Vocabulary()
    vocab_path = self.FIXTURES_ROOT / 'bert' / 'vocab.txt'
    token_indexer = PretrainedBertIndexer(str(vocab_path))

    indexed_tokens = token_indexer.tokens_to_indices(tokens, vocab, "bert")

    # 16 = [CLS], 17 = [SEP]
    assert indexed_tokens["bert"] == [16, 2, 3, 5, 6, 8, 9, 2, 15, 10, 11, 14, 1, 17]
    assert indexed_tokens["bert-offsets"] == [1, 2, 3, 4, 5, 6, 7, 10, 11, 12]

    token_indexer = PretrainedBertIndexer(str(vocab_path), use_starting_offsets=True)
    indexed_tokens = token_indexer.tokens_to_indices(tokens, vocab, "bert")

    assert indexed_tokens["bert"] == [16, 2, 3, 5, 6, 8, 9, 2, 15, 10, 11, 14, 1, 17]
    assert indexed_tokens["bert-offsets"] == [1, 2, 3, 4, 5, 6, 7, 8, 11, 12]
import torch

from allennlp.data import Instance, Token, Vocabulary
from allennlp.data.dataset import Batch
from allennlp.data.fields import TextField
from allennlp.data.token_indexers.elmo_indexer import ELMoTokenCharactersIndexer


def batch_to_ids(stncs, tgt_flag=False):
    """
    Convert a list of tokenized sentences into ids that ELMo accepts.
    :param stncs: [['I', 'Like', 'you'], ['Yes']]
    :param tgt_flag: indicates if the inputs are target sentences; if they are,
        use only the previous words as context and neglect the last word
    :return ids: indices to feed into ELMo, shape (batch, max_len, 50)
    """
    # ELMo consumes character ids, so each sentence is indexed with
    # ELMoTokenCharactersIndexer and padded/batched through a Batch.
    indexer = ELMoTokenCharactersIndexer()
    instances = []
    for sentence in stncs:
        tokens = [Token(word) for word in sentence]
        field = TextField(tokens, {"character_ids": indexer})
        instances.append(Instance({"elmo": field}))

    dataset = Batch(instances)
    vocab = Vocabulary()
    dataset.index_instances(vocab)
    ids = dataset.as_tensor_dict()["elmo"]["character_ids"]

    if tgt_flag:
        ids = ids[:, :-1, :]  # neglect the last word
        b_size, _len, dim = ids.shape
        # Row block i of expand_ids holds the batch with only the first i
        # words visible, so each target position sees just its preceding
        # words as context.
        expand_ids = torch.zeros(b_size * _len, _len, dim, dtype=torch.long)
        for i in range(1, _len + 1):
            expand_ids[b_size * (i - 1):b_size * i, :i, :] = ids[:, :i, :]
        return expand_ids

    return ids
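# Example usage of the helper above (assuming an AllenNLP environment with the
# imports shown): two tokenized sentences are padded to the length of the
# longer one, and every token becomes a vector of 50 character ids.
example_ids = batch_to_ids([["I", "Like", "you"], ["Yes"]])
assert example_ids.shape == (2, 3, 50)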
def test_sliding_window(self):
    tokenizer = WordTokenizer(word_splitter=BertBasicWordSplitter())

    sentence = "the quickest quick brown [SEP] jumped over the lazy dog"
    tokens = tokenizer.tokenize(sentence)

    vocab = Vocabulary()
    vocab_path = self.FIXTURES_ROOT / 'bert' / 'vocab.txt'
    token_indexer = PretrainedBertIndexer(str(vocab_path),
                                          truncate_long_sequences=False,
                                          use_starting_offsets=False,
                                          max_pieces=10)

    indexed_tokens = token_indexer.tokens_to_indices(tokens, vocab, "bert")

    # 16 = [CLS], 17 = [SEP]
    # 1 full window + 1 half window with start/end tokens
    #                                 [CLS] the quick est quick brown [SEP] jumped over [SEP]
    assert indexed_tokens["bert"] == [16, 2, 3, 4, 3, 5, 17, 8, 9, 17,
                                      # [CLS] brown [SEP] jumped over the lazy dog [SEP]
                                      16, 5, 17, 8, 9, 2, 14, 12, 17]
    assert indexed_tokens["bert-offsets"] == [1, 3, 4, 5, 6, 7, 8, 9, 10, 11]
    # The extra [SEP]s shouldn't pollute the token-type-ids
    #                                          [CLS] the quick est quick brown [SEP] jumped over [SEP]
    assert indexed_tokens["bert-type-ids"] == [0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
                                               # [CLS] brown [SEP] jumped over the lazy dog [SEP]
                                               0, 0, 0, 1, 1, 1, 1, 1, 1]
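# A rough sketch (hypothetical helper, not the library implementation) of the
# sliding-window behaviour exercised above: wordpieces that do not fit into
# max_pieces are re-emitted in half-overlapping windows, each wrapped in its
# own [CLS]/[SEP] pair.
def demo_sliding_windows(pieces, max_pieces, cls_id=16, sep_id=17):
    window = max_pieces - 2  # room left after [CLS] and [SEP]
    stride = window // 2     # consecutive windows overlap by half
    windows = []
    start = 0
    while True:
        windows.append([cls_id] + pieces[start:start + window] + [sep_id])
        if start + window >= len(pieces):
            break
        start += stride
    return windows

# The eleven wordpieces of the sentence above split exactly as the test expects:
pieces = [2, 3, 4, 3, 5, 17, 8, 9, 2, 14, 12]
assert demo_sliding_windows(pieces, max_pieces=10) == [
    [16, 2, 3, 4, 3, 5, 17, 8, 9, 17],
    [16, 5, 17, 8, 9, 2, 14, 12, 17],
]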
def test_truncate_window_dont_split_wordpieces(self):
    """
    Tests that a long sentence is not truncated in the middle of a word
    consisting of two or more wordpieces.
    """
    tokenizer = WordTokenizer(word_splitter=BertBasicWordSplitter())

    sentence = "the quickest quick brown fox jumped over the quickest dog"
    tokens = tokenizer.tokenize(sentence)

    vocab = Vocabulary()
    vocab_path = self.FIXTURES_ROOT / 'bert' / 'vocab.txt'
    token_indexer = PretrainedBertIndexer(str(vocab_path),
                                          truncate_long_sequences=True,
                                          use_starting_offsets=True,
                                          max_pieces=12)

    indexed_tokens = token_indexer.tokens_to_indices(tokens, vocab, "bert")

    # 16 = [CLS], 17 = [SEP]
    # The sequence is truncated to max_pieces, keeping [CLS] and [SEP].
    assert indexed_tokens["bert"] == [16, 2, 3, 4, 3, 5, 6, 8, 9, 2, 17]
    # We could fit one more piece here, but we don't, so as not to cut
    # in the middle of a word
    assert indexed_tokens["bert-offsets"] == [1, 2, 4, 5, 6, 7, 8, 9]
    assert indexed_tokens["bert-type-ids"] == [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

    token_indexer = PretrainedBertIndexer(str(vocab_path),
                                          truncate_long_sequences=True,
                                          use_starting_offsets=False,
                                          max_pieces=12)

    indexed_tokens = token_indexer.tokens_to_indices(tokens, vocab, "bert")

    # 16 = [CLS], 17 = [SEP]
    # The sequence is truncated to max_pieces, keeping [CLS] and [SEP].
    assert indexed_tokens["bert"] == [16, 2, 3, 4, 3, 5, 6, 8, 9, 2, 17]
    # We could fit one more piece here, but we don't, so as not to cut
    # in the middle of a word
    assert indexed_tokens["bert-offsets"] == [1, 3, 4, 5, 6, 7, 8, 9]
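# A small sketch (hypothetical helper) of truncation at a word boundary: whole
# words are kept while their wordpieces fit the budget, and a word is never
# split even when part of it would still fit.
def demo_truncate(pieces_per_word, budget):
    kept_words = 0
    used_pieces = 0
    for n in pieces_per_word:
        if used_pieces + n > budget:
            break
        used_pieces += n
        kept_words += 1
    return kept_words, used_pieces

# max_pieces=12 leaves a budget of 10 after [CLS]/[SEP]; the second "quickest"
# (two pieces) is dropped entirely, so only 9 of the 10 slots are used:
assert demo_truncate([1, 2, 1, 1, 1, 1, 1, 1, 2, 1], budget=10) == (8, 9)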
def test_indexes_empty_sequence(self):
    vocab = Vocabulary()
    vocab_path = self.FIXTURES_ROOT / "bert" / "vocab.txt"
    token_indexer = PretrainedBertIndexer(str(vocab_path))

    indexed_tokens = token_indexer.tokens_to_indices([], vocab)
    assert indexed_tokens == {
        "input_ids": [16, 17],      # [CLS], [SEP]
        "offsets": [],              # no tokens => no offsets
        "token_type_ids": [0, 0],   # just 0s for start and end
        "mask": [],                 # no tokens => no mask
    }
def test_indexes_empty_sequence(self):
    vocab = Vocabulary()
    vocab_path = self.FIXTURES_ROOT / 'bert' / 'vocab.txt'
    token_indexer = PretrainedBertIndexer(str(vocab_path))

    indexed_tokens = token_indexer.tokens_to_indices([], vocab, "bert")
    assert indexed_tokens == {
        'bert': [16, 17],          # [CLS], [SEP]
        'bert-offsets': [],        # no tokens => no offsets
        'bert-type-ids': [0, 0],   # just 0s for start and end
        'mask': []                 # no tokens => no mask
    }
def test_starting_ending_offsets(self):
    tokenizer = WordTokenizer(word_splitter=BertBasicWordSplitter())

    #            2   3     5     6   8      9    2  15 10 11 14   1
    sentence = "the quick brown fox jumped over the laziest lazy elmo"
    tokens = tokenizer.tokenize(sentence)

    vocab = Vocabulary()
    vocab_path = self.FIXTURES_ROOT / 'bert' / 'vocab.txt'
    token_indexer = PretrainedBertIndexer(str(vocab_path))

    indexed_tokens = token_indexer.tokens_to_indices(tokens, vocab, "bert")

    assert indexed_tokens["bert"] == [2, 3, 5, 6, 8, 9, 2, 15, 10, 11, 14, 1]
    assert indexed_tokens["bert-offsets"] == [0, 1, 2, 3, 4, 5, 6, 9, 10, 11]

    token_indexer = PretrainedBertIndexer(str(vocab_path), use_starting_offsets=True)
    indexed_tokens = token_indexer.tokens_to_indices(tokens, vocab, "bert")

    assert indexed_tokens["bert"] == [2, 3, 5, 6, 8, 9, 2, 15, 10, 11, 14, 1]
    assert indexed_tokens["bert-offsets"] == [0, 1, 2, 3, 4, 5, 6, 7, 10, 11]
def test_starting_ending_offsets(self):
    tokenizer = BertPreTokenizer()

    #            2   3     5     6   8      9    2  15 10 11 14   1
    sentence = "the quick brown fox jumped over the laziest lazy elmo"
    tokens = tokenizer.tokenize(sentence)

    vocab = Vocabulary()
    vocab_path = self.FIXTURES_ROOT / "bert" / "vocab.txt"
    token_indexer = PretrainedBertIndexer(str(vocab_path))

    indexed_tokens = token_indexer.tokens_to_indices(tokens, vocab)

    # 16 = [CLS], 17 = [SEP]
    assert indexed_tokens["input_ids"] == [16, 2, 3, 5, 6, 8, 9, 2, 15, 10, 11, 14, 1, 17]
    assert indexed_tokens["offsets"] == [1, 2, 3, 4, 5, 6, 7, 10, 11, 12]

    token_indexer = PretrainedBertIndexer(str(vocab_path), use_starting_offsets=True)
    indexed_tokens = token_indexer.tokens_to_indices(tokens, vocab)

    assert indexed_tokens["input_ids"] == [16, 2, 3, 5, 6, 8, 9, 2, 15, 10, 11, 14, 1, 17]
    assert indexed_tokens["offsets"] == [1, 2, 3, 4, 5, 6, 7, 8, 11, 12]
def test_token_type_ids(self):
    tokenizer = WordTokenizer()

    sentence = "the laziest fox"
    tokens = tokenizer.tokenize(sentence)

    #            2  15 10 11  6      17     2  15 10 11  6
    #           the laziest fox    [SEP]   the laziest fox
    tokens = tokens + [Token("[SEP]")] + tokens  # have to do this b/c tokenizer splits `[SEP]` in three

    vocab = Vocabulary()
    vocab_path = self.FIXTURES_ROOT / 'bert' / 'vocab.txt'
    token_indexer = PretrainedBertIndexer(str(vocab_path))

    indexed_tokens = token_indexer.tokens_to_indices(tokens, vocab, "bert")

    # [CLS] 2, 15, 10, 11, 6, 17, 2, 15, 10, 11, 6, [SEP]
    assert indexed_tokens["bert-type-ids"] == [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]  # pylint: disable=bad-whitespace
def test_token_type_ids(self):
    tokenizer = SpacyTokenizer()

    sentence = "the laziest fox"
    tokens = tokenizer.tokenize(sentence)

    #            2  15 10 11  6      17     2  15 10 11  6
    #           the laziest fox    [SEP]   the laziest fox
    tokens = (
        tokens + [Token("[SEP]")] + tokens
    )  # have to do this b/c tokenizer splits `[SEP]` in three

    vocab = Vocabulary()
    vocab_path = self.FIXTURES_ROOT / "bert" / "vocab.txt"
    token_indexer = PretrainedBertIndexer(str(vocab_path))

    indexed_tokens = token_indexer.tokens_to_indices(tokens, vocab)

    # [CLS] 2, 15, 10, 11, 6, 17, 2, 15, 10, 11, 6, [SEP]
    assert indexed_tokens["token_type_ids"] == [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]
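# A toy sketch (hypothetical helper) of how the token type ids above come
# about: the segment index increments after every [SEP], so each [SEP] keeps
# the type of the segment it closes.
def demo_token_type_ids(piece_ids, sep_id=17):
    type_ids = []
    segment = 0
    for pid in piece_ids:
        type_ids.append(segment)
        if pid == sep_id:
            segment += 1
    return type_ids

# [CLS] the laziest fox [SEP] the laziest fox [SEP]
assert demo_token_type_ids([16, 2, 15, 10, 11, 6, 17, 2, 15, 10, 11, 6, 17]) == [
    0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
]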
def __init__(
    self,
    vocab: Vocabulary,
    text_field_embedder: TextFieldEmbedder,
    doc_encoder: Seq2VecEncoder,
    const_path: str,
    tokens_namespace: str,
    use_sim: bool = True,
    use_classifier: bool = True,
) -> None:
    super().__init__(vocab)
    self.vocab = vocab
    self.num_tags = vocab.get_vocab_size("labels")

    self._token_embedder = text_field_embedder
    self._doc_encoder = doc_encoder

    if not use_sim:
        raise Exception("use_sim option is false, but it must be true for this to work")
    if use_classifier:
        print("Warning: use_classifier option does nothing now...")

    self.use_sim = use_sim
    self.use_classifier = use_classifier

    # I actually want to use the one from the config, but not sure how to do that.
    _spacy_word_splitter = SpacyWordSplitter()
    token_indexer = PretrainedBertIndexer(
        "bert-base-cased", do_lowercase=False, use_starting_offsets=True
    )

    jc = JsonConverter()
    const, links = jc._read_const(const_path)

    # The extra 1 is for the "unmatched" label.
    print(vocab.get_token_to_index_vocabulary("labels"))
    print(const.keys())
    assert self.num_tags == len(const) + 1, (
        "Num tags ({}) doesn't match the size of the constitution+1 ({})".format(
            self.num_tags, len(const) + 1
        )
    )

    if self.use_sim:
        # Create the constitution matrix. Every element is one of the groups.
        tagmap = self.vocab.get_index_to_token_vocabulary("labels")
        self.const_dict = {}

        indices = []
        for i in range(self.num_tags):
            tagname = tagmap[i]
            if tagname != "unmatched":
                const_text = const[tagname]
            else:
                const_text = "@@pad@@"
            const_toks = _spacy_word_splitter.split_words(const_text)
            # Truncate so BERT is happy.
            const_toks = const_toks[:250]
            const_indices = token_indexer.tokens_to_indices(const_toks, vocab, tokens_namespace)
            indices.append(const_indices)

        max_len = max(map(lambda j: len(j[tokens_namespace]), indices))
        max_offset_len = max(map(lambda j: len(j["tokens-offsets"]), indices))

        const_tensor = torch.zeros(self.num_tags, max_len).long()
        const_tensor_offsets = torch.zeros(self.num_tags, max_offset_len).long()
        const_tensor_mask = torch.zeros(self.num_tags, max_offset_len).long()
        for i, ind in enumerate(indices):
            toks = ind[tokens_namespace]
            mask = ind["mask"]
            const_tensor[i, :len(toks)] = torch.LongTensor(toks)
            const_tensor_offsets[i, :len(ind["tokens-offsets"])] = torch.LongTensor(
                ind["tokens-offsets"]
            )
            const_tensor_mask[i, :len(mask)] = torch.LongTensor(mask)

        const_tokens = {
            tokens_namespace: const_tensor,
            "tokens-offsets": const_tensor_offsets,
            "mask": const_tensor_mask,
        }

        print("Embedding the constitution... this could take a minute...")
        self.const_mask = util.get_text_field_mask(const_tokens)
        self.const_emb = self._token_embedder(const_tokens).detach()
        print("Done embedding the constitution.")

        if torch.cuda.is_available():
            self.const_emb = self.const_emb.cuda()
            self.const_mask = self.const_mask.cuda()

    self.vectorf1 = VectorF1(
        unmatched_index=self.vocab.get_token_index("unmatched", namespace="labels")
    )

    # self.metric = F1Measure(positive_label=1)
    # self.ff = FeedForward(doc_encoder.get_output_dim(), num_layers=4,
    #                       hidden_dims=100,
    #                       activations=Activation.by_name("relu")())
    # self.tag_projection_layer = Linear(self.ff.get_output_dim(), self.num_tags)
    # self.choice_projection_layer = Linear(self.ff.get_output_dim(), 2)

    self.sim_ff = TimeDistributed(
        FeedForward(
            doc_encoder.get_output_dim(),
            num_layers=1,
            hidden_dims=2,
            activations=Activation.by_name("relu")(),
        )
    )