Example #1
    def test_sliding_window_with_batch(self):
        tokenizer = WordTokenizer(word_splitter=BertBasicWordSplitter())

        sentence = "the quickest quick brown fox jumped over the lazy dog"
        tokens = tokenizer.tokenize(sentence)

        vocab = Vocabulary()

        vocab_path = self.FIXTURES_ROOT / 'bert' / 'vocab.txt'
        token_indexer = PretrainedBertIndexer(str(vocab_path), truncate_long_sequences=False, max_pieces=8)

        config_path = self.FIXTURES_ROOT / 'bert' / 'config.json'
        config = BertConfig(str(config_path))
        bert_model = BertModel(config)
        token_embedder = BertEmbedder(bert_model, max_pieces=8)

        instance = Instance({"tokens": TextField(tokens, {"bert": token_indexer})})
        instance2 = Instance({"tokens": TextField(tokens + tokens + tokens, {"bert": token_indexer})})

        batch = Batch([instance, instance2])
        batch.index_instances(vocab)

        padding_lengths = batch.get_padding_lengths()
        tensor_dict = batch.as_tensor_dict(padding_lengths)
        tokens = tensor_dict["tokens"]
        bert_vectors = token_embedder(tokens["bert"], offsets=tokens["bert-offsets"])
        assert bert_vectors is not None
Example #2
 def test_do_lower_case(self):
     # By default, BertBasicWordSplitter lowercases every token that is not in `never_split`
     word_splitter = BertBasicWordSplitter(never_split=["[UNUSED0]"])
     sentence = "[UNUSED0] [UNK] [unused0]"
     expected_tokens = ["[UNUSED0]", "[", "unk", "]", "[", "unused0", "]"]
     tokens = [token.text for token in word_splitter.split_words(sentence)]
     assert tokens == expected_tokens
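
The two knobs this splitter exposes in these examples are do_lower_case and never_split. A minimal sketch (not part of the original test; the exact output is an assumption) of switching lowercasing off entirely:

from allennlp.data.tokenizers.word_splitter import BertBasicWordSplitter

# Assumption: with do_lower_case=False the original casing is preserved,
# while punctuation splitting still applies to anything not in never_split.
cased_splitter = BertBasicWordSplitter(do_lower_case=False)
print([t.text for t in cased_splitter.split_words("Keep [UNUSED0] Cased")])
# expected (assumed): ['Keep', '[', 'UNUSED0', ']', 'Cased']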
Example #3
 def batch_to_ids(stncs, tgt_flag=False):
     """
     Convert a list of sentences into ids that elmo accepts.
     :param stncs: [['I', 'Like', 'you'], ['Yes']]
     :param tgt_flag: indicates whether the input is a target sentence; if it is,
                      use only the previous words as context and neglect the last word
     :return ids: indices to feed into elmo
     """
     tokenizer = WordTokenizer(word_splitter=BertBasicWordSplitter())
     tokens = tokenizer.tokenize(stncs)
     vocab = Vocabulary()
     vocab_path = ""
     token_indexer = PretrainedBertIndexer(str(vocab_path))
     ids = token_indexer.tokens_to_indices(tokens, vocab, "bert")
     if tgt_flag:
         ids = ids[:, :-1, :]  # neglect the last word
         b_size, _len, dim = ids.shape
         expand_ids = torch.zeros(b_size * _len,
                                  _len,
                                  dim,
                                  dtype=torch.long)
         for i in range(1, _len + 1):
             expand_ids[b_size * (i - 1):b_size * i, :i, :] = ids[:, :i, :]
         return expand_ids
     return ids
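
The tgt_flag branch above builds, for every prefix length i, a copy of the batch that keeps only the first i positions. A minimal self-contained sketch of that expansion, assuming ids is already a (batch, length, dim) LongTensor rather than the dict that tokens_to_indices actually returns:

import torch

def expand_prefixes(ids: torch.Tensor) -> torch.Tensor:
    # ids: (batch, length, dim) LongTensor of wordpiece ids
    b_size, _len, dim = ids.shape
    expand_ids = torch.zeros(b_size * _len, _len, dim, dtype=torch.long)
    for i in range(1, _len + 1):
        # rows [b_size * (i - 1), b_size * i) hold the length-i prefixes,
        # zero-padded on the right
        expand_ids[b_size * (i - 1):b_size * i, :i, :] = ids[:, :i, :]
    return expand_ids

# toy check: 2 "sentences" of length 3 with a 1-dimensional id per position
toy = torch.arange(6, dtype=torch.long).view(2, 3, 1)
assert expand_prefixes(toy).shape == (2 * 3, 3, 1)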
Example #4
    def test_padding_for_equal_length_indices(self):
        tokenizer = WordTokenizer(word_splitter=BertBasicWordSplitter())

        #            2   3     5     6   8      9    2   14   12
        sentence = "the quick brown fox jumped over the lazy dog"
        tokens = tokenizer.tokenize(sentence)

        vocab = Vocabulary()

        instance = Instance({"tokens": TextField(tokens, {"bert": self.token_indexer})})

        batch = Batch([instance])
        batch.index_instances(vocab)

        padding_lengths = batch.get_padding_lengths()
        tensor_dict = batch.as_tensor_dict(padding_lengths)
        tokens = tensor_dict["tokens"]

        assert tokens["bert"].tolist() == [
                [16, 2, 3, 5, 6, 8, 9, 2, 14, 12, 17]
        ]

        assert tokens["bert-offsets"].tolist() == [
                [1, 2, 3, 4, 5, 6, 7, 8, 9]
        ]
Example #5
    def test_starting_ending_offsets(self):
        tokenizer = WordTokenizer(word_splitter=BertBasicWordSplitter())

        #           2   3     5     6   8      9    2  15 10 11 14   1
        sentence = "the quick brown fox jumped over the laziest lazy elmo"
        tokens = tokenizer.tokenize(sentence)

        vocab = Vocabulary()
        vocab_path = self.FIXTURES_ROOT / 'bert' / 'vocab.txt'
        token_indexer = PretrainedBertIndexer(str(vocab_path))

        indexed_tokens = token_indexer.tokens_to_indices(tokens, vocab, "bert")

        # 16 = [CLS], 17 = [SEP]
        assert indexed_tokens["bert"] == [
            16, 2, 3, 5, 6, 8, 9, 2, 15, 10, 11, 14, 1, 17
        ]
        assert indexed_tokens["bert-offsets"] == [
            1, 2, 3, 4, 5, 6, 7, 10, 11, 12
        ]

        token_indexer = PretrainedBertIndexer(str(vocab_path),
                                              use_starting_offsets=True)

        indexed_tokens = token_indexer.tokens_to_indices(tokens, vocab, "bert")

        assert indexed_tokens["bert"] == [
            16, 2, 3, 5, 6, 8, 9, 2, 15, 10, 11, 14, 1, 17
        ]
        assert indexed_tokens["bert-offsets"] == [
            1, 2, 3, 4, 5, 6, 7, 8, 11, 12
        ]
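
The assertions above show the difference between ending offsets (the default) and starting offsets for multi-piece words such as "laziest". A minimal sketch, not AllenNLP's actual implementation, of how such an offsets tensor can be used to pull one vector per original token out of the per-wordpiece BERT output:

import torch

def select_token_vectors(wordpiece_vectors: torch.Tensor,
                         offsets: torch.Tensor) -> torch.Tensor:
    # wordpiece_vectors: (batch, num_wordpieces, dim) from the BERT model
    # offsets:           (batch, num_original_tokens) LongTensor, as produced
    #                    by the indexer under "bert-offsets"
    expanded = offsets.unsqueeze(-1).expand(-1, -1, wordpiece_vectors.size(-1))
    # gather picks, for every original token, the vector of its chosen piece
    return wordpiece_vectors.gather(1, expanded)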
Example #6
    def test_truncate_window(self):
        tokenizer = WordTokenizer(word_splitter=BertBasicWordSplitter())

        sentence = "the quickest quick brown fox jumped over the lazy dog"
        tokens = tokenizer.tokenize(sentence)

        vocab = Vocabulary()
        vocab_path = self.FIXTURES_ROOT / 'bert' / 'vocab.txt'
        token_indexer = PretrainedBertIndexer(str(vocab_path),
                                              truncate_long_sequences=True,
                                              use_starting_offsets=True,
                                              max_pieces=10)

        indexed_tokens = token_indexer.tokens_to_indices(tokens, vocab, "bert")

        # 16 = [CLS], 17 = [SEP]
        # 1 full window + 1 half window with start/end tokens
        assert indexed_tokens["bert"] == [16, 2, 3, 4, 3, 5, 6, 8, 9, 17]
        assert indexed_tokens["bert-offsets"] == [1, 2, 4, 5, 6, 7, 8]
        assert indexed_tokens["bert-type-ids"] == [
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0
        ]

        token_indexer = PretrainedBertIndexer(str(vocab_path),
                                              truncate_long_sequences=True,
                                              use_starting_offsets=False,
                                              max_pieces=10)

        indexed_tokens = token_indexer.tokens_to_indices(tokens, vocab, "bert")

        # 16 = [CLS], 17 = [SEP]
        # 1 full window + 1 half window with start/end tokens
        assert indexed_tokens["bert"] == [16, 2, 3, 4, 3, 5, 6, 8, 9, 17]
        assert indexed_tokens["bert-offsets"] == [1, 3, 4, 5, 6, 7, 8]
Example #7
    def test_read(self, lazy):
        reader = GLUESST2DatasetReader(
            tokenizer=WordTokenizer(word_splitter=BertBasicWordSplitter()),
            token_indexers={'bert': PretrainedBertIndexer(
                pretrained_model=self.BERT_VOCAB_PATH)},
            skip_label_indexing=False
        )
        instances = reader.read(
            str(self.FIXTURES_ROOT / 'dev.tsv'))
        instances = ensure_list(instances)
        example = instances[0]
        tokens = [t.text for t in example.fields['tokens']]
        label = example.fields['label'].label
        print(label)
        print(tokens)
        batch = Batch(instances)
        vocab = Vocabulary.from_instances(instances)
        batch.index_instances(vocab)
        padding_lengths = batch.get_padding_lengths()
        tensor_dict = batch.as_tensor_dict(padding_lengths)
        tokens = tensor_dict["tokens"]

        print(tokens['mask'].tolist()[0])
        print(tokens["bert"].tolist()[0])
        print([vocab.get_token_from_index(i, "bert")
               for i in tokens["bert"].tolist()[0]])
        print(len(tokens['bert'][0]))
        print(tokens["bert-offsets"].tolist()[0])
        print(tokens['bert-type-ids'].tolist()[0])
Example #8
    def test_sliding_window(self):
        tokenizer = WordTokenizer(word_splitter=BertBasicWordSplitter())

        sentence = "the quickest quick brown [SEP] jumped over the lazy dog"
        tokens = tokenizer.tokenize(sentence)

        vocab = Vocabulary()
        vocab_path = self.FIXTURES_ROOT / 'bert' / 'vocab.txt'
        token_indexer = PretrainedBertIndexer(str(vocab_path),
                                              truncate_long_sequences=False,
                                              use_starting_offsets=False,
                                              max_pieces=10)

        indexed_tokens = token_indexer.tokens_to_indices(tokens, vocab, "bert")

        # 16 = [CLS], 17 = [SEP]
        # 1 full window + 1 half window with start/end tokens
                                        # [CLS] the quick est quick brown [SEP] jumped over [SEP]
        assert indexed_tokens["bert"] == [16,   2,  3,    4,  3,    5,    17,   8,     9,   17,
                                        # [CLS] brown [SEP] jumped over the lazy dog [SEP]
                                          16,   5,    17,   8,     9,   2,  14,  12, 17]
        assert indexed_tokens["bert-offsets"] == [1, 3, 4, 5, 6, 7, 8, 9, 10, 11]

        # The extra [SEP]s shouldn't pollute the token-type-ids
                                                 # [CLS] the quick est quick brown [SEP] jumped over [SEP]
        assert indexed_tokens["bert-type-ids"] == [0,    0,  0,    0,  0,    0,    0,    1,     1,   1,
                                                 # [CLS] brown [SEP] jumped over the lazy dog [SEP]
                                                   0,    0,    0,    1,     1,   1,  1,   1,  1]
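
For sequences longer than max_pieces the indexer falls back to the sliding-window behaviour asserted above instead of truncating. A generic, non-AllenNLP sketch of that idea; the window size and stride below are illustrative assumptions, not the library's exact values:

def window_wordpieces(pieces, cls_id=16, sep_id=17, max_pieces=10):
    body = max_pieces - 2      # room left once [CLS] and [SEP] are added
    stride = body // 2         # let consecutive windows overlap by half
    windows = []
    start = 0
    while True:
        chunk = pieces[start:start + body]
        windows.extend([cls_id] + chunk + [sep_id])
        if start + body >= len(pieces):
            break
        start += stride
    return windows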
Example #9
    def test_end_to_end(self):
        tokenizer = WordTokenizer(word_splitter=BertBasicWordSplitter())

        #            2   3    4   3     5     6   8      9    2   14   12
        sentence1 = "The quickest quick brown fox jumped over the lazy dog"
        tokens1 = tokenizer.tokenize(sentence1)

        #            2   3     5     6   8      9    2  15 10 11 14   1
        sentence2 = "The quick brown fox jumped over the laziest lazy elmo"
        tokens2 = tokenizer.tokenize(sentence2)

        assert len(tokens1) == 10
        assert len(tokens2) == 10

        tokens = [Token('[CLS]')] + tokens1 + [Token('[SEP]')] + tokens2

        assert len(tokens) == 22

        vocab = Vocabulary()

        instance = Instance(
            {"sentence_pair": TextField(tokens, {"bert": self.token_indexer})})

        batch = Batch([instance])
        batch.index_instances(vocab)

        padding_lengths = batch.get_padding_lengths()

        tensor_dict = batch.as_tensor_dict(padding_lengths)

        tokens = tensor_dict["sentence_pair"]
        assert tokens['mask'].tolist()[0] == [1] * 22
        assert tokens["bert"].tolist()[0] == [
            101, 1996, 4248, 4355, 4248, 2829, 4419, 5598, 2058, 1996, 13971,
            3899, 102, 1996, 4248, 2829, 4419, 5598, 2058, 1996, 2474, 14272,
            3367, 13971, 17709, 2080
        ]
        assert [
            vocab.get_token_from_index(i, "bert")
            for i in tokens["bert"].tolist()[0]
        ] == [
            '[CLS]', 'the', 'quick', '##est', 'quick', 'brown', 'fox',
            'jumped', 'over', 'the', 'lazy', 'dog', '[SEP]', 'the', 'quick',
            'brown', 'fox', 'jumped', 'over', 'the', 'la', '##zie', '##st',
            'lazy', 'elm', '##o'
        ]
        assert len(tokens['bert'][0]) == 26
        assert tokens["bert-offsets"].tolist()[0] == [
            0, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
            22, 23, 25
        ]
        assert tokens['bert-type-ids'].tolist()[0] == [0] * 13 + [1] * 13

        bert_vectors = self.token_embedder(
            tokens["bert"],
            offsets=tokens["bert-offsets"],
            token_type_ids=tokens['bert-type-ids'])
        assert list(bert_vectors.shape) == [1, 22, 768]
Example #10
class Conll04BERTBinaryReader(Conll04SpaCyBinaryReader):
    splitter = BertBasicWordSplitter()

    @classmethod
    def match(cls, index, tokens, split_tokens):
        tk_idx = 0
        tk_tkn = tokens[tk_idx].lower()
        st_idx = 0
        st_tkn = split_tokens[st_idx].text.lower()

        matched_tokens = []
        while True:
            if index[0] <= tk_idx < index[1] and st_idx not in matched_tokens:
                matched_tokens.append(st_idx)

            if len(tk_tkn) < len(st_tkn):
                assert st_tkn.startswith(tk_tkn)
                st_tkn = st_tkn[len(tk_tkn):]
                tk_idx += 1
                tk_tkn = tokens[tk_idx].lower()
            elif len(tk_tkn) > len(st_tkn):
                assert tk_tkn.startswith(st_tkn)
                tk_tkn = tk_tkn[len(st_tkn):]
                st_idx += 1
                st_tkn = split_tokens[st_idx].text.lower()
            else:
                assert st_tkn == tk_tkn
                tk_idx += 1
                st_idx += 1
                if tk_idx == len(tokens):
                    assert st_idx == len(split_tokens)
                    break
                tk_tkn = tokens[tk_idx].lower()
                st_tkn = split_tokens[st_idx].text.lower()

        return matched_tokens

    @cls.textfield('word')
    def update_sentence_raw(self, fields, tokens) -> Field:
        indexers = {
            'word': PretrainedBertIndexer(pretrained_model='bert-base-uncased')
        }
        textfield = TextField(tokens, indexers)
        return textfield

    @cls.field('cancidate')
    def update_relation_cancidate(self, fields: Dict, raw_sample) -> Field:
        tokens, labels, relations = raw_sample
        if relations is None:
            return None
        relation_indices = []
        for relation_type, src_token, dst_token in relations:
            relation_indices.append(
                (src_token[self._entity_index], dst_token[self._entity_index]))
        return AdjacencyField(relation_indices,
                              fields[self.get_fieldname('word')],
                              padding_value=0)
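
The match classmethod above aligns the annotation-side tokens with the output of BertBasicWordSplitter character by character. A hypothetical illustration (the data is made up for the example) of what it computes:

from allennlp.data.tokenizers.word_splitter import BertBasicWordSplitter

splitter = BertBasicWordSplitter()
tokens = ["U.S.", "officials"]                         # annotation-side tokens
split_tokens = splitter.split_words("U.S. officials")
print([t.text for t in split_tokens])                  # ['u', '.', 's', '.', 'officials']
# match((0, 1), tokens, split_tokens) walks both lists character by character
# and returns [0, 1, 2, 3]: the indices of the split tokens covering "U.S.".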
Example #11
def get_tokenizer(embedding_type: str, xlnet_vocab_file: Path) -> WordTokenizer:
    if embedding_type == 'bert':
        splitter = BertBasicWordSplitter()
    elif embedding_type == 'glove':
        splitter = SpacyWordSplitter()
    elif embedding_type == 'xlnet':
        splitter = XLNetWordSplitter(vocab_file=str(xlnet_vocab_file))
    else:
        raise ValueError(f'Embedding type {embedding_type} not available.')
    return WordTokenizer(word_splitter=splitter)
Example #12
    def __init__(self, word_indexer: Optional[TokenIndexer] = None):
        super().__init__(lazy=False)

        splitter = BertBasicWordSplitter()
        self.tokeniser = WordTokenizer(word_splitter=splitter)

        if word_indexer is None:
            word_indexer = PretrainedBertIndexer(
                pretrained_model='bert-base-uncased',
                truncate_long_sequences=False)
        self.word_indexers = {'tokens': word_indexer}
Example #13
 def __init__(self,
              max_instances: int = None,
              min_abstract_len: int = 10,
              lazy: bool = False,
              tokenizer: Tokenizer = None,
              token_indexers: Dict[str, TokenIndexer] = None,
              ) -> None:
     super().__init__(lazy)
     self.max_instances = max_instances
     self.min_abstract_len = min_abstract_len
     self._tokenizer = tokenizer or WordTokenizer(
         word_splitter=BertBasicWordSplitter())
     self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
Example #14
    def test_end_to_end(self):
        tokenizer = WordTokenizer(word_splitter=BertBasicWordSplitter())

        #            2   3    4   3     5     6   8      9    2   14   12
        sentence1 = "the quickest quick brown fox jumped over the lazy dog"
        tokens1 = tokenizer.tokenize(sentence1)

        #            2   3     5     6   8      9    2  15 10 11 14   1
        sentence2 = "the quick brown fox jumped over the laziest lazy elmo"
        tokens2 = tokenizer.tokenize(sentence2)

        vocab = Vocabulary()

        instance1 = Instance(
            {"tokens": TextField(tokens1, {"bert": self.token_indexer})})
        instance2 = Instance(
            {"tokens": TextField(tokens2, {"bert": self.token_indexer})})

        batch = Batch([instance1, instance2])
        batch.index_instances(vocab)

        padding_lengths = batch.get_padding_lengths()
        tensor_dict = batch.as_tensor_dict(padding_lengths)
        tokens = tensor_dict["tokens"]

        # 16 = [CLS], 17 = [SEP]
        assert tokens["bert"].tolist() == [
            [16, 2, 3, 4, 3, 5, 6, 8, 9, 2, 14, 12, 17, 0],
            [16, 2, 3, 5, 6, 8, 9, 2, 15, 10, 11, 14, 1, 17],
        ]

        assert tokens["bert-offsets"].tolist() == [
            [1, 3, 4, 5, 6, 7, 8, 9, 10, 11],
            [1, 2, 3, 4, 5, 6, 7, 10, 11, 12],
        ]

        # No offsets, should get 14 vectors back ([CLS] + 12 token wordpieces + [SEP])
        bert_vectors = self.token_embedder(tokens["bert"])
        assert list(bert_vectors.shape) == [2, 14, 12]

        # Offsets, should get 10 vectors back.
        bert_vectors = self.token_embedder(tokens["bert"],
                                           offsets=tokens["bert-offsets"])
        assert list(bert_vectors.shape) == [2, 10, 12]

        # Now try top_layer_only = True
        tlo_embedder = BertEmbedder(self.bert_model, top_layer_only=True)
        bert_vectors = tlo_embedder(tokens["bert"])
        assert list(bert_vectors.shape) == [2, 14, 12]

        bert_vectors = tlo_embedder(tokens["bert"],
                                    offsets=tokens["bert-offsets"])
        assert list(bert_vectors.shape) == [2, 10, 12]
Example #15
    def test_sliding_window(self):
        tokenizer = WordTokenizer(word_splitter=BertBasicWordSplitter())

        sentence = "the quickest quick brown fox jumped over the lazy dog"
        tokens = tokenizer.tokenize(sentence)

        vocab = Vocabulary()

        vocab_path = self.FIXTURES_ROOT / "bert" / "vocab.txt"
        token_indexer = PretrainedBertIndexer(str(vocab_path),
                                              truncate_long_sequences=False,
                                              max_pieces=8)

        config_path = self.FIXTURES_ROOT / "bert" / "config.json"
        config = BertConfig(str(config_path))
        bert_model = BertModel(config)
        token_embedder = BertEmbedder(bert_model, max_pieces=8)

        instance = Instance(
            {"tokens": TextField(tokens, {"bert": token_indexer})})

        batch = Batch([instance])
        batch.index_instances(vocab)

        padding_lengths = batch.get_padding_lengths()
        tensor_dict = batch.as_tensor_dict(padding_lengths)
        tokens = tensor_dict["tokens"]

        # 16 = [CLS], 17 = [SEP]
        # 1 full window + 1 half window with start/end tokens
        assert tokens["bert"].tolist() == [[
            16, 2, 3, 4, 3, 5, 6, 17, 16, 3, 5, 6, 8, 9, 2, 17, 16, 8, 9, 2,
            14, 12, 17
        ]]
        assert tokens["bert-offsets"].tolist() == [[
            1, 3, 4, 5, 6, 7, 8, 9, 10, 11
        ]]

        bert_vectors = token_embedder(tokens["bert"])
        assert list(bert_vectors.shape) == [1, 13, 12]

        # Testing without token_type_ids
        bert_vectors = token_embedder(tokens["bert"],
                                      offsets=tokens["bert-offsets"])
        assert list(bert_vectors.shape) == [1, 10, 12]

        # Testing with token_type_ids
        bert_vectors = token_embedder(tokens["bert"],
                                      offsets=tokens["bert-offsets"],
                                      token_type_ids=tokens["bert-type-ids"])
        assert list(bert_vectors.shape) == [1, 10, 12]
Example #16
    def test_end_to_end_with_higher_order_inputs(self):
        tokenizer = WordTokenizer(word_splitter=BertBasicWordSplitter())

        #            2   3    4   3     5     6   8      9    2   14   12
        sentence1 = "the quickest quick brown fox jumped over the lazy dog"
        tokens1 = tokenizer.tokenize(sentence1)
        text_field1 = TextField(tokens1, {"bert": self.token_indexer})

        #            2   3     5     6   8      9    2  15 10 11 14   1
        sentence2 = "the quick brown fox jumped over the laziest lazy elmo"
        tokens2 = tokenizer.tokenize(sentence2)
        text_field2 = TextField(tokens2, {"bert": self.token_indexer})

        #            2   5    15 10 11 6
        sentence3 = "the brown laziest fox"
        tokens3 = tokenizer.tokenize(sentence3)
        text_field3 = TextField(tokens3, {"bert": self.token_indexer})

        vocab = Vocabulary()

        instance1 = Instance({"tokens": ListField([text_field1])})
        instance2 = Instance({"tokens": ListField([text_field2, text_field3])})

        batch = Batch([instance1, instance2])
        batch.index_instances(vocab)

        padding_lengths = batch.get_padding_lengths()
        tensor_dict = batch.as_tensor_dict(padding_lengths, verbose=True)
        tokens = tensor_dict["tokens"]

        # No offsets, should get 12 vectors back.
        bert_vectors = self.token_embedder(tokens["bert"])
        assert list(bert_vectors.shape) == [2, 2, 12, 12]

        # Offsets, should get 10 vectors back.
        bert_vectors = self.token_embedder(tokens["bert"],
                                           offsets=tokens["bert-offsets"])
        assert list(bert_vectors.shape) == [2, 2, 10, 12]

        # Now try top_layer_only = True
        tlo_embedder = BertEmbedder(self.bert_model, top_layer_only=True)
        bert_vectors = tlo_embedder(tokens["bert"])
        assert list(bert_vectors.shape) == [2, 2, 12, 12]

        bert_vectors = tlo_embedder(tokens["bert"],
                                    offsets=tokens["bert-offsets"])
        assert list(bert_vectors.shape) == [2, 2, 10, 12]
Example #17
    def __init__(self,
                 label_field: str,
                 text_field: str,
                 paper_lookup_path: str,
                 sent_max_len: int = 256,
                 lazy: bool = False,
                 tokenizer: Tokenizer = None,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 ) -> None:
        super().__init__(lazy)
        self._tokenizer = tokenizer or WordTokenizer(
            word_splitter=BertBasicWordSplitter())
        self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
        self._label_field = label_field
        self._text_field = text_field
        self._sent_max_len = sent_max_len

        self._paper_lookup = self.load_paper_lookup(paper_lookup_path)
Example #18
    def test_truncate_window_dont_split_wordpieces(self):
        """
        Tests that the sentence is not truncated inside a word consisting of
        two or more wordpieces.
        """

        tokenizer = WordTokenizer(word_splitter=BertBasicWordSplitter())

        sentence = "the quickest quick brown fox jumped over the quickest dog"
        tokens = tokenizer.tokenize(sentence)

        vocab = Vocabulary()
        vocab_path = self.FIXTURES_ROOT / 'bert' / 'vocab.txt'
        token_indexer = PretrainedBertIndexer(str(vocab_path),
                                              truncate_long_sequences=True,
                                              use_starting_offsets=True,
                                              max_pieces=12)

        indexed_tokens = token_indexer.tokens_to_indices(tokens, vocab, "bert")

        # 16 = [CLS], 17 = [SEP]
        # 1 full window + 1 half window with start/end tokens
        assert indexed_tokens["bert"] == [16, 2, 3, 4, 3, 5, 6, 8, 9, 2, 17]
        # We could fit one more piece here, but we don't, so that we never
        # cut in the middle of a word
        assert indexed_tokens["bert-offsets"] == [1, 2, 4, 5, 6, 7, 8, 9]
        assert indexed_tokens["bert-type-ids"] == [
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
        ]

        token_indexer = PretrainedBertIndexer(str(vocab_path),
                                              truncate_long_sequences=True,
                                              use_starting_offsets=False,
                                              max_pieces=12)

        indexed_tokens = token_indexer.tokens_to_indices(tokens, vocab, "bert")

        # 16 = [CLS], 17 = [SEP]
        # 1 full window + 1 half window with start/end tokens
        assert indexed_tokens["bert"] == [16, 2, 3, 4, 3, 5, 6, 8, 9, 2, 17]
        # We could fit one more piece here, but we don't, so that we never
        # cut in the middle of a word
        assert indexed_tokens["bert-offsets"] == [1, 3, 4, 5, 6, 7, 8, 9]
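
A generic sketch, not AllenNLP's code, of the rule these assertions check: when truncating, stop at a word boundary so the cut never lands inside a multi-piece word. The offset convention below (0-based starting offsets into the wordpiece list) is an assumption for illustration:

def truncate_at_word_boundary(pieces, starting_offsets, budget):
    keep = 0
    for i, start in enumerate(starting_offsets):
        end = (starting_offsets[i + 1]
               if i + 1 < len(starting_offsets) else len(pieces))
        if end > budget:       # this word would not fit completely
            break
        keep = end             # keep everything up to the end of this word
    return pieces[:keep]

# e.g. with a budget of 10 pieces, a word whose pieces would end past index 10
# is dropped entirely instead of being cut in the middle.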
Example #19
    def test_truncate_window_fit_two_wordpieces(self):
        """
        Tests that both `use_starting_offsets` options work properly when the
        last word in the truncated sentence consists of two wordpieces.
        """

        tokenizer = WordTokenizer(word_splitter=BertBasicWordSplitter())

        sentence = "the quickest quick brown fox jumped over the quickest dog"
        tokens = tokenizer.tokenize(sentence)

        vocab = Vocabulary()
        vocab_path = self.FIXTURES_ROOT / 'bert' / 'vocab.txt'
        token_indexer = PretrainedBertIndexer(str(vocab_path),
                                              truncate_long_sequences=True,
                                              use_starting_offsets=True,
                                              max_pieces=13)

        indexed_tokens = token_indexer.tokens_to_indices(tokens, vocab, "bert")

        # 16 = [CLS], 17 = [SEP]
        # 1 full window + 1 half window with start/end tokens
        assert indexed_tokens["bert"] == [
            16, 2, 3, 4, 3, 5, 6, 8, 9, 2, 3, 4, 17
        ]
        assert indexed_tokens["bert-offsets"] == [1, 2, 4, 5, 6, 7, 8, 9, 10]
        assert indexed_tokens["bert-type-ids"] == [
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
        ]

        token_indexer = PretrainedBertIndexer(str(vocab_path),
                                              truncate_long_sequences=True,
                                              use_starting_offsets=False,
                                              max_pieces=13)

        indexed_tokens = token_indexer.tokens_to_indices(tokens, vocab, "bert")

        # 16 = [CLS], 17 = [SEP]
        # 1 full window + 1 half window with start/end tokens
        assert indexed_tokens["bert"] == [
            16, 2, 3, 4, 3, 5, 6, 8, 9, 2, 3, 4, 17
        ]
        assert indexed_tokens["bert-offsets"] == [1, 3, 4, 5, 6, 7, 8, 9, 11]
Example #20
File: snli_test.py  Project: Shuailong/SPM
    def test_read(self, lazy):
        reader = SnliReader(
            tokenizer=WordTokenizer(word_splitter=BertBasicWordSplitter()),
            token_indexers={
                'bert':
                PretrainedBertIndexer(pretrained_model=self.BERT_VOCAB_PATH)
            },
        )

        instances = reader.read(
            str(self.FIXTURES_ROOT / 'snli_1.0_sample.jsonl'))
        instances = ensure_list(instances)
        example = instances[0]
        tokens = [t.text for t in example.fields['tokens'].tokens]
        label = example.fields['label'].label
        weight = example.fields['weight'].weight
        assert label == 'neutral'
        assert weight == 1
        assert instances[1].fields['weight'].weight == 0.5
        assert instances[2].fields['weight'].weight == 1
        assert tokens == [
            'a', 'person', 'on', 'a', 'horse', 'jumps', 'over', 'a', 'broken',
            'down', 'airplane', '.', '[SEP]', 'a', 'person', 'is', 'training',
            'his', 'horse', 'for', 'a', 'competition', '.'
        ]
        batch = Batch(instances)
        vocab = Vocabulary.from_instances(instances)
        batch.index_instances(vocab)

        padding_lengths = batch.get_padding_lengths()
        tensor_dict = batch.as_tensor_dict(padding_lengths)
        tokens = tensor_dict["tokens"]

        print(tokens['mask'].tolist()[0])
        print(tokens["bert"].tolist()[0])
        print([
            vocab.get_token_from_index(i, "bert")
            for i in tokens["bert"].tolist()[0]
        ])
        print(len(tokens['bert'][0]))
        print(tokens["bert-offsets"].tolist()[0])
        print(tokens['bert-type-ids'].tolist()[0])
Example #21
    def test_max_length(self):
        config = BertConfig(len(self.token_indexer.vocab))
        model = BertModel(config)
        embedder = BertEmbedder(model)

        tokenizer = WordTokenizer(word_splitter=BertBasicWordSplitter())
        sentence = "the " * 1000
        tokens = tokenizer.tokenize(sentence)

        vocab = Vocabulary()

        instance = Instance({"tokens": TextField(tokens, {"bert": self.token_indexer})})

        batch = Batch([instance])
        batch.index_instances(vocab)

        padding_lengths = batch.get_padding_lengths()
        tensor_dict = batch.as_tensor_dict(padding_lengths)
        tokens = tensor_dict["tokens"]
        embedder(tokens["bert"], tokens["bert-offsets"])
Example #22
def set_values(max_sequence_length: Optional[int] = -1,
               concat_title_abstract: Optional[bool] = None,
               data_source: Optional[str] = None,
               included_text_fields: Optional[str] = None) -> None:
    # set global values
    # note: a class with __init__ would have been a better design;
    # we use this structure for efficiency, to support multiprocessing,
    # since multiprocessing with class methods is slower
    global _tokenizer
    global _token_indexers
    global _token_indexer_author_id
    global _token_indexer_author_position
    global _token_indexer_venue
    global _token_indexer_id
    global _max_sequence_length
    global _concat_title_abstract
    global _data_source
    global _included_text_fields

    if _tokenizer is None:  # if not initialized, initialize the tokenizers and token indexers
        _tokenizer = WordTokenizer(word_splitter=BertBasicWordSplitter(
            do_lower_case=bert_params["do_lowercase"]))
        _token_indexers = {
            "bert": PretrainedBertIndexer.from_params(Params(bert_params))
        }
        _token_indexer_author_id = {
            "tokens": SingleIdTokenIndexer(namespace='author')
        }
        _token_indexer_author_position = {
            "tokens": SingleIdTokenIndexer(namespace='author_positions')
        }
        _token_indexer_venue = {
            "tokens": SingleIdTokenIndexer(namespace='venue')
        }
        _token_indexer_id = {"tokens": SingleIdTokenIndexer(namespace='id')}
    _max_sequence_length = max_sequence_length
    _concat_title_abstract = concat_title_abstract
    _data_source = data_source
    _included_text_fields = included_text_fields
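
A hypothetical sketch of how module-level globals like these are typically consumed: each worker process runs set_values once through the pool initializer, so the tokenizer and indexers are built per process rather than pickled with every task. The worker function, the initargs values, and the surrounding module state (bert_params, the _tokenizer global) are assumptions for illustration:

from multiprocessing import Pool

def _tokenize_title(title: str):
    # uses the process-local globals initialized by set_values
    return [t.text for t in _tokenizer.tokenize(title)]

if __name__ == "__main__":
    with Pool(processes=4,
              initializer=set_values,
              initargs=(512, True, "s2", "title")) as pool:
        pieces = pool.map(_tokenize_title, ["a paper title", "another title"])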
Example #23
 def word_embeddings(self):
     words = re.split(r'\W+', self.text)
     Text = ' '.join(words)
     
     tokenizer = WordTokenizer(word_splitter=BertBasicWordSplitter())
     
     tokens = tokenizer.tokenize(Text)
     vocab = Vocabulary()
     token_indexer = PretrainedBertIndexer('bert-base-uncased')
     
     instance = Instance({"tokens": TextField(tokens, {'bert': token_indexer})})
     batch = Batch([instance])
     batch.index_instances(vocab)
     
     padding_lengths = batch.get_padding_lengths()
     tensor_dict = batch.as_tensor_dict(padding_lengths)
     
     Tokens = tensor_dict["tokens"]
     
     model = PretrainedBertEmbedder('bert-base-uncased')
     bert_vectors = model(Tokens["bert"])
     return bert_vectors
Example #24
    def __init__(self,
                 pretrained_model: str,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 max_pieces: int = 512,
                 num_choices: int = 5,
                 answer_only: bool = False,
                 restrict_num_choices: int = None,
                 ignore_context: bool = False,
                 sample: int = -1,
                 random_seed: int = 0) -> None:
        super().__init__()

        self._token_indexers = token_indexers or {
            'tokens': SingleIdTokenIndexer()
        }
        # e.g. 'bert-base-uncased' -> True, 'bert-base-cased' -> False
        lower_case = '-cased' not in pretrained_model
        self._word_splitter = BertBasicWordSplitter(do_lower_case=lower_case)
        self._max_pieces = max_pieces
        self._sample = sample
        self._num_choices = num_choices
        self._answer_only = answer_only
        self._restrict_num_choices = restrict_num_choices
        self._ignore_context = ignore_context
        self._random_seed = random_seed
Example #25
    def __init__(self,
                 text_lookup_path: str,
                 embedded_text: str = 'title',
                 use_bos_eos: bool = True,
                 lazy: bool = False,
                 sent_len_limit: int = None,
                 abstract_tokenizer: Tokenizer = None,
                 abstract_indexers: Dict[str, TokenIndexer] = None,
                 sequence_tokenizer: Tokenizer = None,
                 sequence_indexers: Dict[str, TokenIndexer] = None) -> None:
        super().__init__(lazy)
        if embedded_text == 'title':
            with open(text_lookup_path) as f:
                self.data_lookup = {
                    line[0]: {
                        'abstract': line[2]
                    }
                    for line in map(lambda x: x.strip().split('\t'),
                                    f.readlines()) if len(line) > 2
                }
        elif embedded_text == 'abstract':
            with jsonlines.open(text_lookup_path) as reader:
                self.data_lookup = {item['paper_id']: item for item in reader}
        # Add these now so we can find them in the lookup, then replace with
        # [unused0] and [unused1] in text_to_instance method
        self.data_lookup['<s>'] = {'abstract': '<s>'}
        self._sent_len_limit = sent_len_limit
        self._abstract_tokenizer = abstract_tokenizer or BertBasicWordSplitter(
        )
        self._abstract_indexers = abstract_indexers

        self._sequence_tokenizer = sequence_tokenizer or JustSpacesWordSplitter(
        )
        self._sequence_indexers = sequence_indexers or {
            "tokens": SingleIdTokenIndexer()
        }
Example #26
    def __init__(self,
                 is_bert: bool,
                 conceptnet_path: Path,
                 word_indexer: Optional[TokenIndexer] = None):
        super().__init__(lazy=False)

        if is_bert:
            splitter = BertBasicWordSplitter()
        else:
            splitter = SpacyWordSplitter()
        self.tokeniser = WordTokenizer(word_splitter=splitter)

        if word_indexer is None:
            if is_bert:
                word_indexer = PretrainedBertIndexer(
                    pretrained_model='bert-base-uncased',
                    truncate_long_sequences=True)
            else:
                word_indexer = SingleIdTokenIndexer(lowercase_tokens=True)
        self.word_indexers = {'tokens': word_indexer}

        # self.rel_indexers = {
        #     "rel_tokens": SingleIdTokenIndexer(namespace='rel_tokens')}
        self.conceptnet = ConceptNet(conceptnet_path=conceptnet_path)
Example #27
from allennlp.data.dataset import Batch
from allennlp.data.fields import TextField
from allennlp.data.instance import Instance
from allennlp.data.token_indexers.wordpiece_indexer import PretrainedBertIndexer
from allennlp.data.tokenizers import WordTokenizer
from allennlp.data.tokenizers.word_splitter import BertBasicWordSplitter
from allennlp.data.vocabulary import Vocabulary
from allennlp.modules.token_embedders.bert_token_embedder import PretrainedBertEmbedder
import re

tokenizer = WordTokenizer(word_splitter=BertBasicWordSplitter())
vocab = Vocabulary()
token_indexer = PretrainedBertIndexer('bert-base-uncased')
model = PretrainedBertEmbedder('bert-base-uncased')


class preprocessing(object):
    def __init__(self, text):
        self.text = text

    def bert_vector(self):
        words = re.split(r'\W+', self.text)
        Text = ' '.join(words)

        tokens = tokenizer.tokenize(Text)

        instance = Instance(
            {"tokens": TextField(tokens, {'bert': token_indexer})})
        batch = Batch([instance])
        batch.index_instances(vocab)
Example #28
 def setUp(self):
     super().setUp()
     self.word_splitter = BertBasicWordSplitter()
Example #29
        if label == pred_label:
            correct += 1
    print(
        f'Accuracy: {correct}/{len(labels)} = {correct/len(labels)*100:.2f}%')


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='CSQA using BERT NSP model')
    parser.add_argument('--input', help='input dataset')
    parser.add_argument('--bert-vocab', help='bert vocab file')
    parser.add_argument('--bert-model', help='pretrained bert model')
    parser.add_argument('--batch-size',
                        type=int,
                        default=8,
                        help='batch size for BERT')
    parser.add_argument('--gpu-id', '-g', type=int, default=0, help='GPU ID')

    args = parser.parse_args()

    print('Initialize BERT model...')

    TOKENIZER = WordTokenizer(word_splitter=BertBasicWordSplitter())
    WORD_INDEXER = PretrainedBertIndexer(pretrained_model=args.bert_vocab)
    VOCAB = Vocabulary()
    GPU_ID = args.gpu_id
    BERT_NEXT_SENTENCE = BertForNextSentencePrediction.from_pretrained(
        args.bert_model).to(torch.device(f"cuda:{GPU_ID}"))
    BERT_NEXT_SENTENCE.eval()

    main()
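
A hypothetical sketch (not the original scoring code) of how one question/candidate pair could be scored with the objects set up above; the torch and Token imports are assumed from the surrounding script, and in pytorch-pretrained-bert the NSP head returns a (batch, 2) score tensor whose index 0 corresponds to "is the next sentence":

def nsp_score(question: str, candidate: str) -> float:
    # join the pair with an explicit [SEP]; the indexer adds [CLS]/[SEP]
    # around the whole sequence and produces the segment ids
    tokens = (TOKENIZER.tokenize(question)
              + [Token('[SEP]')]
              + TOKENIZER.tokenize(candidate))
    indexed = WORD_INDEXER.tokens_to_indices(tokens, VOCAB, 'bert')
    input_ids = torch.tensor([indexed['bert']], device=f'cuda:{GPU_ID}')
    type_ids = torch.tensor([indexed['bert-type-ids']],
                            device=f'cuda:{GPU_ID}')
    with torch.no_grad():
        scores = BERT_NEXT_SENTENCE(input_ids, token_type_ids=type_ids)
    return scores[0, 0].item()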