Example #1
 def __init__(self,
              my_device=torch.device('cuda:2'),
              model_name='roberta.hdf5',
              model_path=current_directory_path +
              '/external_pretrained_models/'):
     self.answ = "UNKNOWN ERROR"
     self.model_name = model_name
     self.model_path = model_path
     self.first_object = ''
     self.second_object = ''
     self.predicates = ''
     self.aspects = ''
     cuda_device = my_device
     # A set can't be used here: each span is a dict and dicts are unhashable,
     # so the add_span() helper keeps the list free of duplicates instead
     # (see the sketch after this example).
     self.spans = []
     try:
         print(self.model_path + self.model_name)
         print(model_path + "vocab_dir")
         vocab = Vocabulary.from_files(model_path + "vocab_dir")
         BERT_MODEL = 'google/electra-base-discriminator'
         embedder = PretrainedTransformerMismatchedEmbedder(
             model_name=BERT_MODEL)
         text_field_embedder = BasicTextFieldEmbedder({'tokens': embedder})
         seq2seq_encoder = PassThroughEncoder(
             input_dim=embedder.get_output_dim())
         print("encoder loaded")
         self.indexer = PretrainedTransformerMismatchedIndexer(
             model_name=BERT_MODEL)
         print("indexer loaded")
         self.model = SimpleTagger(
             text_field_embedder=text_field_embedder,
             vocab=vocab,
             encoder=seq2seq_encoder,
             calculate_span_f1=True,
             label_encoding='IOB1').cuda(device=cuda_device)
         self.model.load_state_dict(
             torch.load(self.model_path + self.model_name))
         print("model loaded")
         self.reader = Conll2003DatasetReader(
             token_indexers={'tokens': self.indexer})
         print("reader loaded")
     except Exception:
         e = sys.exc_info()[0]
         print("exception while mapping to gpu in extractor ", e)
         raise RuntimeError(
             "Init extractor: can't map to gpu. Maybe it is OOM")
     try:
         self.predictor = SentenceTaggerPredictor(self.model, self.reader)
     except Exception:
         e = sys.exc_info()[0]
         print("exception in creating predictor ", e)
         raise RuntimeError(
             "Init extractor: can't create the sentence tagger predictor")
Example #2
        def model_ctor():
            # model = BertForTokenClassificationCustom.from_pretrained(self._bert_model_type,
            #                                                          cache_dir=self._cache_dir,
            #                                                          num_labels=len(self._tag2idx)).cuda()
            #
            # seq_tagger = SequenceTaggerBert(model, self._bert_tokenizer, idx2tag=self._idx2tag,
            #                                 tag2idx=self._tag2idx, pred_batch_size=self._ebs)

            embedder = PretrainedTransformerMismatchedEmbedder(
                model_name=self._bert_model_type)
            text_field_embedder = BasicTextFieldEmbedder({'tokens': embedder})

            seq2seq_encoder = PassThroughEncoder(
                input_dim=embedder.get_output_dim())

            tagger = SimpleTagger(text_field_embedder=text_field_embedder,
                                  vocab=self.vocab,
                                  encoder=seq2seq_encoder,
                                  calculate_span_f1=True,
                                  label_encoding='IOB1').cuda()

            return tagger
Example #3
    def test_exotic_tokens_no_nan_grads(self):
        token_indexer = PretrainedTransformerMismatchedIndexer(
            "bert-base-uncased")

        sentence1 = ["A", "", "AllenNLP", "sentence", "."]
        sentence2 = [
            "A", "\uf732\uf730\uf730\uf733", "AllenNLP", "sentence", "."
        ]

        tokens1 = [Token(word) for word in sentence1]
        tokens2 = [Token(word) for word in sentence2]
        vocab = Vocabulary()

        token_embedder = BasicTextFieldEmbedder({
            "bert":
            PretrainedTransformerMismatchedEmbedder("bert-base-uncased")
        })

        instance1 = Instance(
            {"tokens": TextField(tokens1, {"bert": token_indexer})})
        instance2 = Instance(
            {"tokens": TextField(tokens2, {"bert": token_indexer})})

        batch = Batch([instance1, instance2])
        batch.index_instances(vocab)

        padding_lengths = batch.get_padding_lengths()
        tensor_dict = batch.as_tensor_dict(padding_lengths)
        tokens = tensor_dict["tokens"]

        bert_vectors = token_embedder(tokens)
        test_loss = bert_vectors.mean()

        test_loss.backward()

        # Gradients may legitimately be None for parameters the embedder never
        # uses (e.g. the pooler); any gradient that does exist must be NaN-free.
        for name, param in token_embedder.named_parameters():
            grad = param.grad
            assert (grad is None) or (not torch.any(torch.isnan(grad)).item())
Example #4
indexers = {"bert" : PretrainedTransformerMismatchedIndexer(model_name, namespace="bert")}

reader = Conll2003DatasetReader(token_indexers=indexers)
train_dataset = reader.read("conll2003/eng.train")
validation_dataset = reader.read("conll2003/eng.testa")
test_dataset = reader.read("conll2003/eng.testb")

# reader.read() may return a dataset object rather than a plain list, so
# materialize the instances before concatenating them.
all_insts = list(train_dataset) + list(validation_dataset) + list(test_dataset)


vocab = Vocabulary.from_instances(all_insts)

dataset = Batch(all_insts)
dataset.index_instances(vocab)

embedder = PretrainedTransformerMismatchedEmbedder(model_name, last_layer_only=True)
token_embedder = BasicTextFieldEmbedder({"bert": embedder})
embedding_dim = embedder.get_output_dim()  # 768 for bert-base-* models
encoder = PassThroughEncoder(input_dim=embedding_dim)

model = SimpleTagger(vocab=vocab,
                     text_field_embedder=token_embedder,
                     encoder=encoder,
                     calculate_span_f1=True,
                     label_encoding="IOB1")

optimizer = optim.Adam(model.parameters(), lr=3e-05)

if torch.cuda.is_available():
    print("Using GPU")
    cuda_device = 0
    model = model.cuda(cuda_device)
else:
    cuda_device = -1

# We're using a very small transformer here so that it runs quickly in binder. You
# can change this to any transformer model name supported by Hugging Face.
transformer_model = 'google/reformer-crime-and-punishment'

# Represents the list of word tokens with sequences of wordpieces as determined
# by the transformer's tokenizer.  This actually results in a pretty complex data
# type, which you can see by running this.  It's complicated because we need to
# know how to combine the wordpieces back into words after running the
# transformer.
indexer = PretrainedTransformerMismatchedIndexer(model_name=transformer_model)

# The original snippet defines `tokens` earlier; a minimal stand-in so this runs:
tokens = [Token(word) for word in ["The", "quick", "brown", "fox", "."]]
text_field = TextField(tokens, {'transformer': indexer})
text_field.index(Vocabulary())
token_tensor = text_field.as_tensor(text_field.get_padding_lengths())

# There are two key things to notice in this output.  First, there are two masks:
# `mask` is a word-level mask that gets used in the utility functions described in
# the last section of this chapter.  `wordpiece_mask` gets used by the `Embedder`
# itself.  Second, there is an `offsets` tensor that gives start and end wordpiece
# indices for the original tokens.  In the embedder, we grab these, average all of
# the wordpieces for each token, and return the result.
print("Indexed tensors:", token_tensor)

embedding = PretrainedTransformerMismatchedEmbedder(model_name=transformer_model)

embedder = BasicTextFieldEmbedder(token_embedders={'transformer': embedding})

tensor_dict = text_field.batch_tensors([token_tensor])
embedded_tokens = embedder(tensor_dict)
print("Embedded tokens size:", embedded_tokens.size())
print("Embedded tokens:", embedded_tokens)