    def test_long_sequence_splitting(self):
        tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
        indexer = PretrainedTransformerMismatchedIndexer("bert-base-uncased",
                                                         max_length=4)
        text = ["AllenNLP", "is", "great"]
        tokens = tokenizer.tokenize(" ".join(["[CLS]"] + text + ["[SEP]"]))
        expected_ids = tokenizer.convert_tokens_to_ids(tokens)
        assert len(expected_ids) == 7  # just to make sure it's what we're expecting
        cls_id, sep_id = expected_ids[0], expected_ids[-1]
        expected_ids = (expected_ids[:3] + [sep_id, cls_id] +
                        expected_ids[3:5] + [sep_id, cls_id] +
                        expected_ids[5:])

        vocab = Vocabulary()
        indexed = indexer.tokens_to_indices([Token(word) for word in text],
                                            vocab)

        assert indexed["token_ids"] == expected_ids
        # [CLS] allen ##nl [SEP] [CLS] ##p is [SEP] [CLS] great [SEP]
        assert indexed["segment_concat_mask"] == [1] * len(expected_ids)
        # allennlp is great
        assert indexed["mask"] == [1] * len(text)
        # [CLS] allen ##nl ##p is great [SEP]
        assert indexed["wordpiece_mask"] == [1] * 7
Example #2
    def test_bert(self):
        tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
        indexer = PretrainedTransformerMismatchedIndexer("bert-base-cased")
        text = ["AllenNLP", "is", "great"]
        tokens = tokenizer.tokenize(" ".join(["[CLS]"] + text + ["[SEP]"]))
        expected_ids = tokenizer.convert_tokens_to_ids(tokens)
        vocab = Vocabulary()
        indexed = indexer.tokens_to_indices([Token(word) for word in text],
                                            vocab)
        assert indexed["token_ids"] == expected_ids
        assert indexed["mask"] == [1] * len(text)
        # Hardcoding a few things because we know how BERT tokenization works
        assert indexed["offsets"] == [(1, 3), (4, 4), (5, 5)]
        assert indexed["wordpiece_mask"] == [1] * len(expected_ids)

        keys = indexed.keys()
        assert indexer.get_empty_token_list() == {key: [] for key in keys}

        max_length = 10
        padding_lengths = {key: max_length for key in keys}
        padded_tokens = indexer.as_padded_tensor_dict(indexed, padding_lengths)
        for key in keys:
            padding_length = max_length - len(indexed[key])
            padding = (0, 0) if key == "offsets" else 0
            expected_value = indexed[key] + ([padding] * padding_length)
            assert len(padded_tokens[key]) == max_length
            if key == "offsets":
                expected_value = [list(t) for t in expected_value]
            assert padded_tokens[key].tolist() == expected_value
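
# A minimal sketch (not part of the test above): because the indexer is
# registered under the name "pretrained_transformer_mismatched", an equivalent
# instance can also be built from configuration via from_params.
from allennlp.common import Params
from allennlp.data.token_indexers import TokenIndexer

indexer_from_config = TokenIndexer.from_params(Params({
    "type": "pretrained_transformer_mismatched",
    "model_name": "bert-base-cased",
}))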
Example #3
File: main.py Project: lgessler/embur
def tmp():
    config = "configs/bert_pretrain.jsonnet"
    serialization_dir = "models"
    output_dir = "bert_out"
    tokenizer_conllu_path = "data/coptic/converted/train"
    documents = read_conllu_files(tokenizer_conllu_path)
    sentences = []
    for document in documents:
        for sentence in document:
            sentences.append(" ".join([t['form'] for t in sentence]))
    print("Training tokenizer...")
    os.environ["TOKENIZER_PATH"] = output_dir

    t = train_bert_tokenizer(sentences,
                             serialize_path=output_dir,
                             vocab_size=6000)
    tok = PretrainedTransformerTokenizer("./bert_out/")
    idx = PretrainedTransformerMismatchedIndexer("./bert_out/")
    vocab = Vocabulary()
    vocab.set_from_file("bert_out/vocab.txt",
                        oov_token="[UNK]",
                        is_padded=True)
    s = tok.tokenize(sentences[1])
    i = idx.tokens_to_indices(s, vocab)
    print(i)
    print(t)
Example #4
def test_tokenized_sentences_dataset():
    sentences = [
        "this is a simple sentence .".split(),
        "Claudio Monteverdi ( 15 May 1567 – 29 November 1643 ) was an Italian composer .".split(),
        "string player and maestro di cappella .".split(),
        "A composer of both secular and sacred music , and a pioneer in the development of opera , he is considered a transitional figure .".split(),
        "Between the Renaissance and the Baroque periods of music history .".split(),
        "He was a court musician in Mantua ( c. 1590 – 1613 ) , and then maestro di cappella at St Mark's Basilica in the Republic of Venice .".split(),
        "His surviving music includes nine books of madrigals , in the tradition of earlier".split(),
    ]
    indexer = PretrainedTransformerMismatchedIndexer("bert-large-cased")
    dataset = TokenizedSentencesDataset(sentences, indexer)
    dataset.index_with(Vocabulary())
    iterator = get_bucket_iterator(dataset, 1000, is_trainingset=False)

    for elem in iterator:
        print(elem)
    model = MultilayerPretrainedTransformerMismatchedEmbedder(
        "bert-large-cased", [-1])
    iterator = get_bucket_iterator(dataset, 1000, is_trainingset=False)
    for batch in iterator:
        print(model(**batch["tokens"]["tokens"]))
Example #5
    def test_end_to_end_for_first_sub_token_embedding(self, sub_token_mode: str):
        token_indexer = PretrainedTransformerMismatchedIndexer(
            "bert-base-uncased")

        sentence1 = ["A", ",", "AllenNLP", "sentence", "."]
        sentence2 = ["AllenNLP", "is", "open", "source", "NLP", "library"]

        tokens1 = [Token(word) for word in sentence1]
        tokens2 = [Token(word) for word in sentence2]

        vocab = Vocabulary()

        params = Params({
            "token_embedders": {
                "bert": {
                    "type": "pretrained_transformer_mismatched",
                    "model_name": "bert-base-uncased",
                    "sub_token_mode": sub_token_mode,
                }
            }
        })
        token_embedder = BasicTextFieldEmbedder.from_params(vocab=vocab,
                                                            params=params)

        instance1 = Instance(
            {"tokens": TextField(tokens1, {"bert": token_indexer})})
        instance2 = Instance(
            {"tokens": TextField(tokens2, {"bert": token_indexer})})

        batch = Batch([instance1, instance2])
        batch.index_instances(vocab)

        padding_lengths = batch.get_padding_lengths()
        tensor_dict = batch.as_tensor_dict(padding_lengths)
        tokens = tensor_dict["tokens"]

        assert tokens["bert"]["mask"].tolist() == [
            [True, True, True, True, True, False],
            [True, True, True, True, True, True],
        ]

        assert tokens["bert"]["offsets"].tolist() == [
            [[1, 1], [2, 2], [3, 5], [6, 6], [7, 7], [0, 0]],
            [[1, 3], [4, 4], [5, 5], [6, 6], [7, 8], [9, 9]],
        ]

        # Attention mask
        bert_vectors = token_embedder(tokens)

        assert bert_vectors.size() == (2, max(len(sentence1),
                                              len(sentence2)), 768)
        assert not torch.isnan(bert_vectors).any()
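
# Note on the `sub_token_mode` argument above: the decorator that supplies it is
# not shown in this snippet. Presumably the original test harness parametrizes
# it along these lines (an assumption, not visible in the source):
#
#     @pytest.mark.parametrize("sub_token_mode", ["first"])
#     def test_end_to_end_for_first_sub_token_embedding(self, sub_token_mode: str):
#         ...
#
# "first" keeps only the first wordpiece of each word; "avg" (the embedder's
# default) averages all wordpieces belonging to the word.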
Example #6
 def __init__(self,
              my_device=torch.device('cuda:2'),
              model_name='roberta.hdf5',
              model_path=current_directory_path +
              '/external_pretrained_models/'):
     self.answ = "UNKNOWN ERROR"
     self.model_name = model_name
     self.model_path = model_path
     self.first_object = ''
     self.second_object = ''
     self.predicates = ''
     self.aspects = ''
     cuda_device = my_device
     self.spans = []  # we can't use a set here because a span is a dict and dicts are unhashable; add_span() keeps the list free of duplicates
     try:
         print(self.model_path + self.model_name)
         print(model_path + "vocab_dir")
         vocab = Vocabulary.from_files(model_path + "vocab_dir")
         BERT_MODEL = 'google/electra-base-discriminator'
         embedder = PretrainedTransformerMismatchedEmbedder(
             model_name=BERT_MODEL)
         text_field_embedder = BasicTextFieldEmbedder({'tokens': embedder})
         seq2seq_encoder = PassThroughEncoder(
             input_dim=embedder.get_output_dim())
         print("encoder loaded")
         self.indexer = PretrainedTransformerMismatchedIndexer(
             model_name=BERT_MODEL)
         print("indexer loaded")
         self.model = SimpleTagger(
             text_field_embedder=text_field_embedder,
             vocab=vocab,
             encoder=seq2seq_encoder,
             calculate_span_f1=True,
             label_encoding='IOB1').cuda(device=cuda_device)
         self.model.load_state_dict(
             torch.load(self.model_path + self.model_name))
         print("model loaded")
         self.reader = Conll2003DatasetReader(
             token_indexers={'tokens': self.indexer})
         print("reader loaded")
     except Exception as e:
         print("exception while mapping to GPU in extractor:", e)
         raise RuntimeError(
             "Init extractor: can't map to gpu. Maybe it is OOM")
     try:
         self.predictor = SentenceTaggerPredictor(self.model, self.reader)
     except Exception as e:
         print("exception while creating predictor:", e)
         raise RuntimeError("Init extractor: can't create the predictor")
Example #7
    def load_dataset(self, data_folder):
        corpus = ColumnCorpus(data_folder, {
            0: 'text',
            1: 'ner'
        },
                              train_file='train.txt',
                              test_file='test.txt',
                              dev_file='dev.txt')  # We do not need dev set

        indexer = PretrainedTransformerMismatchedIndexer(
            model_name=self.config.BERT_MODEL_TYPE)
        self.reader = GeniaDatasetReader(token_indexers={'tokens': indexer},
                                         tag_label='ner')
        self.train_dataset_for_voc = self.reader.read(data_folder +
                                                      '/train.txt')

        # Creating tag dictionaries
        self.idx2tag, self.tag2idx = make_bert_tag_dict_from_flair_corpus(
            corpus)
        self.tags = list(
            set((tag.split('-')[1] for tag in self.idx2tag
                 if len(tag.split('-')) > 1)))
        print('Tags:', self.tags)

        # Convert into the format suitable for training
        self.X_train, self.y_train = prepare_corpus(corpus.train)

        self.X_test, self.y_test = prepare_corpus(corpus.test)
        #         print(self.X_test)
        #         print('------------------')
        #         print(self.y_test)
        #         sent = CSentence(tokens, sentences[i])
        #         entities = format_entities(sent, result[i])
        #         tokenizer = ProcessorTokenizerRu()
        #         splitter = ProcessorSentenceSplitter()
        #         for x, y in zip(self.X_test, self.y_test):
        #             print(x, y)
        #             abc = format_entities(x, y)
        #             if abc:
        #                 print(abc, '------------')

        # Convert into the format suitable for visualization
        # self.y_train_dict = convert_y_to_dict_format(self.X_train, self.y_train)
        self.X_helper = create_helper(self.X_train)
        print("self.X_train[:1]:")
        print(self.X_train[:1])
        print("self.X_helper.head():")
        print(self.X_helper.head())

        self.y_seed_dict = [None for _ in range(len(self.X_helper))]
Example #8
    def __init__(
        self,
        token_indexers: Dict[str, TokenIndexer] = None,
        domain_identifier: str = None,
        bert_model_name: str = None,
        with_spans: bool = False,
        mismatched_tokens: bool = False,
        random_sample: bool = False,
        random_seed: int = None,
        limit: int = -1,
        print_violations: bool = False,
        label_namespace: str = "labels",
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)
        self._token_indexers = token_indexers or {
            "tokens": PretrainedTransformerIndexer(model_name=bert_model_name)
        }
        if mismatched_tokens:
            self._token_indexers = {
                "tokens":
                PretrainedTransformerMismatchedIndexer(
                    model_name=bert_model_name)
            }
            self._dummy_indexer = {"dummy_tokens": SingleIdTokenIndexer()}
        self._domain_identifier = domain_identifier
        self._with_spans = with_spans
        self._mismatched_tokens = mismatched_tokens
        self._limit = limit
        self._random_sample = random_sample
        self._random_seed = random_seed
        self._max_sequence_length = 0
        self._print_violations = print_violations
        self._label_namespace = label_namespace

        if bert_model_name is not None:
            self.bert_tokenizer = AutoTokenizer.from_pretrained(
                bert_model_name)
            if with_spans:
                self.bert_tokenizer_allennlp = PretrainedTransformerTokenizer(
                    model_name=bert_model_name)
            self.lowercase_input = "uncased" in bert_model_name
            self.bert_config = AutoConfig.from_pretrained(bert_model_name)
        else:
            self.bert_tokenizer = None
            self.lowercase_input = False
Example #9
 def __init__(
     self,
     wordpiece_tokenizer: Tokenizer = None,
     token_indexers: Dict[str, TokenIndexer] = None,
     combine_input_fields: bool = None,
     input_parsed: bool = None,
     parser: StanzaPipeline = None,
     input_fields: List = None,
     **kwargs,
 ) -> None:
     #super().__init__(manual_distributed_sharding=True, **kwargs)
     super(NLIGraphReader, self).__init__(**kwargs)
     self._wordpiece_tokenizer = wordpiece_tokenizer or PretrainedTransformerTokenizer(config.TRANSFORMER_NAME)
     self._token_indexers = token_indexers or {"tokens": PretrainedTransformerMismatchedIndexer(config.TRANSFORMER_NAME)}
     self._combine_input_fields = combine_input_fields or False
     self._input_parsed = input_parsed if (input_parsed is not None) else True 
     self._parser = parser or None
     self.f = input_fields or config.default_fields
Example #10
    def test_token_without_wordpieces(self):
        token_indexer = PretrainedTransformerMismatchedIndexer(
            "bert-base-uncased")

        sentence1 = ["A", "", "AllenNLP", "sentence", "."]
        sentence2 = ["AllenNLP", "", "great"]
        tokens1 = [Token(word) for word in sentence1]
        tokens2 = [Token(word) for word in sentence2]
        vocab = Vocabulary()
        params = Params({
            "token_embedders": {
                "bert": {
                    "type": "pretrained_transformer_mismatched",
                    "model_name": "bert-base-uncased",
                }
            }
        })
        token_embedder = BasicTextFieldEmbedder.from_params(vocab=vocab,
                                                            params=params)

        instance1 = Instance(
            {"tokens": TextField(tokens1, {"bert": token_indexer})})
        instance2 = Instance(
            {"tokens": TextField(tokens2, {"bert": token_indexer})})

        batch = Batch([instance1, instance2])
        batch.index_instances(vocab)

        padding_lengths = batch.get_padding_lengths()
        tensor_dict = batch.as_tensor_dict(padding_lengths)
        tokens = tensor_dict["tokens"]

        assert tokens["bert"]["offsets"].tolist() == [
            [[1, 1], [-1, -1], [2, 4], [5, 5], [6, 6]],
            [[1, 3], [-1, -1], [4, 4], [0, 0], [0, 0]],
        ]

        bert_vectors = token_embedder(tokens)
        assert bert_vectors.size() == (2, max(len(sentence1),
                                              len(sentence2)), 768)
        assert not torch.isnan(bert_vectors).any()
        assert all(bert_vectors[0, 1] == 0)
        assert all(bert_vectors[1, 1] == 0)
Example #11
    def test_throws_error_on_incorrect_sub_token_mode(self, sub_token_mode: str):
        token_indexer = PretrainedTransformerMismatchedIndexer(
            "bert-base-uncased")

        sentence1 = ["A", ",", "AllenNLP", "sentence", "."]
        sentence2 = ["AllenNLP", "is", "open", "source", "NLP", "library"]

        tokens1 = [Token(word) for word in sentence1]
        tokens2 = [Token(word) for word in sentence2]

        vocab = Vocabulary()

        params = Params({
            "token_embedders": {
                "bert": {
                    "type": "pretrained_transformer_mismatched",
                    "model_name": "bert-base-uncased",
                    "sub_token_mode": sub_token_mode,
                }
            }
        })
        token_embedder = BasicTextFieldEmbedder.from_params(vocab=vocab,
                                                            params=params)

        instance1 = Instance(
            {"tokens": TextField(tokens1, {"bert": token_indexer})})
        instance2 = Instance(
            {"tokens": TextField(tokens2, {"bert": token_indexer})})

        batch = Batch([instance1, instance2])
        batch.index_instances(vocab)

        padding_lengths = batch.get_padding_lengths()
        tensor_dict = batch.as_tensor_dict(padding_lengths)
        tokens = tensor_dict["tokens"]

        with pytest.raises(ConfigurationError):
            token_embedder(tokens)
Example #12
    def test_exotic_tokens_no_nan_grads(self):
        token_indexer = PretrainedTransformerMismatchedIndexer(
            "bert-base-uncased")

        sentence1 = ["A", "", "AllenNLP", "sentence", "."]
        sentence2 = [
            "A", "\uf732\uf730\uf730\uf733", "AllenNLP", "sentence", "."
        ]

        tokens1 = [Token(word) for word in sentence1]
        tokens2 = [Token(word) for word in sentence2]
        vocab = Vocabulary()

        token_embedder = BasicTextFieldEmbedder({
            "bert":
            PretrainedTransformerMismatchedEmbedder("bert-base-uncased")
        })

        instance1 = Instance(
            {"tokens": TextField(tokens1, {"bert": token_indexer})})
        instance2 = Instance(
            {"tokens": TextField(tokens2, {"bert": token_indexer})})

        batch = Batch([instance1, instance2])
        batch.index_instances(vocab)

        padding_lengths = batch.get_padding_lengths()
        tensor_dict = batch.as_tensor_dict(padding_lengths)
        tokens = tensor_dict["tokens"]

        bert_vectors = token_embedder(tokens)
        test_loss = bert_vectors.mean()

        test_loss.backward()

        for name, param in token_embedder.named_parameters():
            grad = param.grad
            assert (grad is None) or (not torch.any(torch.isnan(grad)).item())
Example #13
from allennlp.data import Vocabulary, DataLoader
from allennlp.data.token_indexers import PretrainedTransformerMismatchedIndexer

from build.lib.nlp_tools.nlp_models.multilayer_pretrained_transformer_mismatched_embedder import \
    MultilayerPretrainedTransformerMismatchedEmbedder
from nlp_tools.data_io.datasets import TokenizedSentencesDataset

if __name__ == "__main__":
    encoder_name = "bert-large-cased"
    print("loading indexer")
    indexer = PretrainedTransformerMismatchedIndexer(encoder_name)
    print("loading embedder")
    embedder = MultilayerPretrainedTransformerMismatchedEmbedder(encoder_name,
                                                                 layers_to_merge=[-1, -2, -3, -4])

    s1 = "When the New York Stock Exchange heard the announcement , equities plummeted , causing a chain reaction of bank runs and failures throughout the United States that signaled the arrival of the Panic of 1873 to American shores .".split()
    s2 = "On the north bank of the river , WIS 42 turns northwest onto North Water Street and follows the river in a northerly direction to Forestville in Door County , where it becomes Forestville Avenue .".split()
    sentences = [s1, s2]
    dataset = TokenizedSentencesDataset(sentences, indexer)
    dataset.index_with(Vocabulary())
    # iterator = get_bucket_iterator(dataset, 1000, is_trainingset=False,
    #                                device=torch.device("cuda"))
    iterator = DataLoader(dataset, batch_size=2)
    outputs = list()
    import numpy as np
    for batch in iterator:
        batch_output = list()
        net_out = embedder(**batch["tokens"]["tokens"])
        a = net_out[0][18].detach().numpy()
        b = net_out[1][3].detach().numpy()
        a = a / np.linalg.norm(a, keepdims=True)
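        # A possible continuation (an assumption, not part of the original
        # snippet): normalize the second vector the same way and compare the
        # two occurrences of "bank" (token 18 of s1, token 3 of s2) by cosine
        # similarity.
        b = b / np.linalg.norm(b, keepdims=True)
        print("cosine similarity of the two 'bank' tokens:", float(a @ b))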
Example #14
import logging
import shutil
import sys

import numpy as np

import torch
import torch.optim as optim
from torch.utils.data import DataLoader

# Imports below are assumed for the AllenNLP classes used further down in this
# snippet (the CoNLL-2003 reader lives in the allennlp-models package).
from allennlp.data import Vocabulary
from allennlp.data.batch import Batch
from allennlp.data.token_indexers import PretrainedTransformerMismatchedIndexer
from allennlp.modules.token_embedders import PretrainedTransformerMismatchedEmbedder
from allennlp_models.tagging.dataset_readers.conll2003 import Conll2003DatasetReader

#  CONFIG SECTION
expname = "exp1"

logger = logging.getLogger(__name__)

model_name = "bert-base-cased"
indexers = {"bert" : PretrainedTransformerMismatchedIndexer(model_name, namespace="bert")}

reader = Conll2003DatasetReader(token_indexers = indexers)
train_dataset = reader.read("conll2003/eng.train")
validation_dataset = reader.read("conll2003/eng.testa")
test_dataset = reader.read("conll2003/eng.testb")

all_insts = train_dataset + validation_dataset + test_dataset


vocab = Vocabulary.from_instances(all_insts)

dataset = Batch(all_insts)
dataset.index_instances(vocab)

embedder = PretrainedTransformerMismatchedEmbedder(model_name, last_layer_only=True)
Example #15
# (Imports below are assumed for this snippet: it uses AllenNLP's data and
# token-embedder APIs.)
from allennlp.data import Vocabulary
from allennlp.data.fields import TextField
from allennlp.data.token_indexers import PretrainedTransformerMismatchedIndexer
from allennlp.data.tokenizers import Token
from allennlp.modules.token_embedders import PretrainedTransformerMismatchedEmbedder

# This pattern is typically used in cases where your input data is already
# tokenized, so we're showing that here.
text_tokens = ["This", "is", "some", "frandibulous", "text", "."]
tokens = [Token(x) for x in text_tokens]
print(tokens)

# We're using a very small transformer here so that it runs quickly in binder. You
# can change this to any transformer model name supported by Hugging Face.
transformer_model = 'google/reformer-crime-and-punishment'

# Represents the list of word tokens with a sequence of wordpieces as determined
# by the transformer's tokenizer.  This actually results in a pretty complex data
# type, which you can see by running this.  It's complicated because we need to
# know how to combine the wordpieces back into words after running the
# transformer.
indexer = PretrainedTransformerMismatchedIndexer(model_name=transformer_model)

text_field = TextField(tokens, {'transformer': indexer})
text_field.index(Vocabulary())
token_tensor = text_field.as_tensor(text_field.get_padding_lengths())

# There are two key things to notice in this output.  First, there are two masks:
# `mask` is a word-level mask that gets used in the utility functions described in
# the last section of this chapter.  `wordpiece_mask` gets used by the `Embedder`
# itself.  Second, there is an `offsets` tensor that gives start and end wordpiece
# indices for the original tokens.  In the embedder, we grab these, average all of
# the wordpieces for each token, and return the result.
print("Indexed tensors:", token_tensor)

embedding = PretrainedTransformerMismatchedEmbedder(model_name=transformer_model)
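
# A sketch of how this embedder is typically applied (an assumed continuation,
# not part of the original snippet): wrap it in a BasicTextFieldEmbedder keyed
# by the same name used in the TextField above, batch the indexed tensors, and
# embed them. One vector per original token comes back, with wordpieces pooled
# according to `offsets`.
from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder

embedder = BasicTextFieldEmbedder(token_embedders={'transformer': embedding})
tensor_dict = text_field.batch_tensors([token_tensor])
embedded_tokens = embedder(tensor_dict)
print("Embedded tokens size:", embedded_tokens.size())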
Example #16
 def test_auto_determining_num_tokens_added(self):
     indexer = PretrainedTransformerMismatchedIndexer("bert-base-cased")
     assert indexer._determine_num_special_tokens_added() == (1, 1)
Example #17
File: mlm.py Project: lgessler/embur
import torch
from allennlp.data import Vocabulary, Instance, Token
from allennlp.data.fields import TextField
from allennlp.data.token_indexers import PretrainedTransformerMismatchedIndexer
from allennlp.nn.util import get_token_ids_from_text_field_tensors
from transformers import BertTokenizer, DataCollatorForWholeWordMask

tokenizer = BertTokenizer.from_pretrained('./bert_out')
vocab = Vocabulary(non_padded_namespaces=["tokens"])
vocab.add_transformer_vocab(tokenizer, "tokens")

vocab.get_token_index("[PAD]", "tokens")


idx = PretrainedTransformerMismatchedIndexer("./bert_out", namespace="tokens")
def prepare_instance(s):
    tokens = [Token(t) for t in s.split(" ")]
    indexed = idx.tokens_to_indices(tokens, vocab)
    print([vocab.get_token_from_index(i) for i in indexed['token_ids']])
    return Instance({"tokens": TextField(tokens, {"tokens": idx})})

instances = [prepare_instance("ϩⲙⲡⲣⲁⲛ ⲙⲡⲛⲟⲩⲧⲉ ⲛϣⲟⲣⲡ ⲁⲛⲟⲕ"), prepare_instance("ϩⲙⲡⲣⲁⲛ ⲙⲡⲛⲟⲩⲧⲉ ⲛϣⲟⲣⲡ ⲁⲛⲟⲕ")]
for i in instances:
    i["tokens"].index(vocab)

tensors = [i.as_tensor_dict() for i in instances]

collator = DataCollatorForWholeWordMask(tokenizer=tokenizer)
ids = torch.cat([tensors[0]['tokens']['tokens']['token_ids'].unsqueeze(0),
                 tensors[1]['tokens']['tokens']['token_ids'].unsqueeze(0)], dim=0)
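
# A possible continuation (an assumption, not from the original file): hand the
# stacked ids to the whole-word-mask collator as a list of {"input_ids": ...}
# mappings and inspect the masked inputs and MLM labels it returns.
batch = collator([{"input_ids": row} for row in ids])
print(batch["input_ids"])
print(batch["labels"])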