Example #1
 def ensure_batch_predictions_are_consistent(self):
     self.model.eval()
     single_predictions = []
     for i, instance in enumerate(self.instances):
         dataset = Batch([instance])
         tensors = dataset.as_tensor_dict(dataset.get_padding_lengths())
         result = self.model(**tensors)
         single_predictions.append(result)
     full_dataset = Batch(self.instances)
     batch_tensors = full_dataset.as_tensor_dict(full_dataset.get_padding_lengths())
     batch_predictions = self.model(**batch_tensors)
     for i, instance_predictions in enumerate(single_predictions):
         for key, single_predicted in instance_predictions.items():
             tolerance = 1e-6
             if key == 'loss':
                 # Loss is particularly unstable; we'll just be satisfied if everything else is
                 # close.
                 continue
             single_predicted = single_predicted[0]
             batch_predicted = batch_predictions[key][i]
             if isinstance(single_predicted, torch.Tensor):
                 if single_predicted.size() != batch_predicted.size():
                     slices = tuple(slice(0, size) for size in single_predicted.size())
                     batch_predicted = batch_predicted[slices]
                 assert_allclose(single_predicted.data.numpy(),
                                 batch_predicted.data.numpy(),
                                 atol=tolerance,
                                 err_msg=key)
             else:
                 assert single_predicted == batch_predicted, key
Example #2
 def ensure_batch_predictions_are_consistent(self):
     self.model.eval()
     single_predictions = []
     for i, instance in enumerate(self.instances):
         dataset = Batch([instance])
         tensors = dataset.as_tensor_dict(dataset.get_padding_lengths(), for_training=False)
         result = self.model(**tensors)
         single_predictions.append(result)
     full_dataset = Batch(self.instances)
     batch_tensors = full_dataset.as_tensor_dict(full_dataset.get_padding_lengths(), for_training=False)
     batch_predictions = self.model(**batch_tensors)
     for i, instance_predictions in enumerate(single_predictions):
         for key, single_predicted in instance_predictions.items():
             tolerance = 1e-6
             if key == 'loss':
                 # Loss is particularly unstable; we'll just be satisfied if everything else is
                 # close.
                 continue
             single_predicted = single_predicted[0]
             batch_predicted = batch_predictions[key][i]
             if isinstance(single_predicted, torch.autograd.Variable):
                 if single_predicted.size() != batch_predicted.size():
                     slices = tuple(slice(0, size) for size in single_predicted.size())
                     batch_predicted = batch_predicted[slices]
                 assert_allclose(single_predicted.data.numpy(),
                                 batch_predicted.data.numpy(),
                                 atol=tolerance,
                                 err_msg=key)
             else:
                 assert single_predicted == batch_predicted, key
Example #3
    def test_offsets_with_tokenized_text_base(self, transformer_name):
        token_indexer = TransformerIndexer(model_name=transformer_name,
                                           do_lowercase=False)
        sent0 = "the quickest quick brown fox jumped over the lazy dog"
        sent1 = "the quick brown fox jumped over the laziest lazy elmo"

        sent0 = sent0.split()
        sent1 = sent1.split()

        tokens0 = [Token(token) for token in sent0]
        tokens1 = [Token(token) for token in sent1]

        vocab = Vocabulary()

        instance1 = Instance(
            {"tokens": TextField(tokens0, {"transformer": token_indexer})})
        instance2 = Instance(
            {"tokens": TextField(tokens1, {"transformer": token_indexer})})

        batch = Batch([instance1, instance2])
        batch.index_instances(vocab)

        padding_lengths = batch.get_padding_lengths()
        tensor_dict = batch.as_tensor_dict(padding_lengths)
        tokens = tensor_dict["tokens"]

        # Each token should be represented by exactly one sub-word as its feature
        assert len(tokens['transformer-offsets'][0]) == len(tokens0)
        assert len(tokens['transformer-offsets'][1]) == len(tokens1)
Example #4
    def test_encode_decode_with_raw_text_base(self, transformer_name):
        token_indexer = TransformerIndexer(model_name=transformer_name,
                                           do_lowercase=False)
        sent0 = "the quickest quick brown fox jumped over the lazy dog"
        sent1 = "the quick brown fox jumped over the laziest lazy elmo"

        vocab = Vocabulary()

        instance1 = Instance({
            "tokens":
            TextField([Token(sent0)], {"transformer": token_indexer})
        })
        instance2 = Instance({
            "tokens":
            TextField([Token(sent1)], {"transformer": token_indexer})
        })

        batch = Batch([instance1, instance2])
        batch.index_instances(vocab)

        padding_lengths = batch.get_padding_lengths()
        tensor_dict = batch.as_tensor_dict(padding_lengths)
        tokens = tensor_dict["tokens"]

        input_ids = tokens['transformer']
        input_ids_0 = [id.item() for id in input_ids[0]]
        input_ids_1 = [id.item() for id in input_ids[1]]
        # The original sentence should match the sentence decoded after indexing
        assert sent0 == token_indexer.tokenizer.decode(
            input_ids_0, skip_special_tokens=True)
        assert sent1 == token_indexer.tokenizer.decode(
            input_ids_1, skip_special_tokens=True)
Example #5
 def forward(self, tree: Tree,
             label: torch.LongTensor = None) -> Dict[str, torch.Tensor]:
     str_phase_holder = []
     self.collect_phase(tree, str_phase_holder)
     # tokenize and elmo tokenize
     instances = [self.text_to_instance(phase) for phase in str_phase_holder]
     idx, instances = sort_by_padding(instances, [("tokens", "num_tokens")], self.vocab)
     batch = Batch(instances)
     pad_lengths = batch.get_padding_lengths()
     tensor_dict = batch.as_tensor_dict(pad_lengths)
     tensor_dict = move_to_device(tensor_dict, 0)
     output = self.biattentive_cell(**tensor_dict)
     # NOTE: reshape the outputs to [batch, label, component, gaussian]; this is admittedly ugly
     batch_size, labels = output['weight'].size()
     labels = labels // self.component_num
     output['weight'] = output['weight'].reshape(batch_size, labels, self.component_num)
     output['mu'] = output['mu'].reshape(batch_size, labels, self.component_num, self.gaussian_dim)
     output['var'] = output['var'].reshape(batch_size, labels, self.component_num, self.gaussian_dim)
     # resort output result
     new_idx = [i for i in range(len(instances))]
     for pos, name in enumerate(idx):
         new_idx[name] = pos
     for name, tensor in output.items():
         output[name] = torch.stack([tensor[i] for i in new_idx])
     return output
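
The un-sorting step above builds the inverse permutation of `idx`, so the outputs return to the original instance order. A minimal, self-contained illustration of the idea (the values are made up for demonstration):

    idx = [2, 0, 1]              # sorted position 0 holds original instance 2, etc.
    new_idx = [0] * len(idx)
    for pos, original in enumerate(idx):
        new_idx[original] = pos
    assert new_idx == [1, 2, 0]  # original instance 0 is found at sorted position 1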
Example #6
    def remove_tokens(self, attentions, metadata, threshold, labels):
        attentions_cpu = attentions.cpu().data.numpy()
        sentences = [x["tokens"] for x in metadata]
        instances = []
        for b in range(attentions_cpu.shape[0]):
            sentence = [x for x in sentences[b]]
            always_keep_mask = metadata[b]['always_keep_mask']
            attn = attentions_cpu[b][:len(sentence)] + always_keep_mask * -10000
            max_length = math.ceil((1 - always_keep_mask).sum() * threshold)

            top_ind = np.argsort(attn)[:-max_length]
            new_tokens = [
                x for i, x in enumerate(sentence)
                if i in top_ind or always_keep_mask[i] == 1
            ]
            instances += metadata[0]["convert_tokens_to_instance"](new_tokens, None)

        batch = Batch(instances)
        batch.index_instances(self._vocabulary)
        padding_lengths = batch.get_padding_lengths()

        batch = batch.as_tensor_dict(padding_lengths)
        return {
            k: v.to(attentions.device)
            for k, v in batch["document"].items()
        }
Example #7
    def test_forward(self):
        batch_dialogues = Batch(self.instances)

        res = self.model.forward(**batch_dialogues.as_tensor_dict(
            batch_dialogues.get_padding_lengths()))

        print(res)
Example #8
    def test_padding_for_equal_length_indices(self):
        tokenizer = WordTokenizer(word_splitter=BertBasicWordSplitter())

        #            2   3     5     6   8      9    2   14   12
        sentence = "the quick brown fox jumped over the lazy dog"
        tokens = tokenizer.tokenize(sentence)

        vocab = Vocabulary()

        instance = Instance({"tokens": TextField(tokens, {"bert": self.token_indexer})})

        batch = Batch([instance])
        batch.index_instances(vocab)

        padding_lengths = batch.get_padding_lengths()
        tensor_dict = batch.as_tensor_dict(padding_lengths)
        tokens = tensor_dict["tokens"]

        assert tokens["bert"].tolist() == [
                [16, 2, 3, 5, 6, 8, 9, 2, 14, 12, 17]
        ]

        assert tokens["bert-offsets"].tolist() == [
                [1, 2, 3, 4, 5, 6, 7, 8, 9]
        ]
Example #9
    def test_embeddings(self, transformer_name, gold_offsets: torch.LongTensor, use_starting_offsets):
        self.token_indexer = TransformerIndexer(model_name=transformer_name, do_lowercase=False,
                                                use_starting_offsets=use_starting_offsets)
        self.transformer_embedder = TransformerEmbedder(model_name=transformer_name, trainable=False)

        sent0 = "the quickest quick brown fox jumped over the lazy dog"
        sent1 = "the quick brown fox jumped over the laziest lazy elmo"
        tokens0 = sent0.split()
        tokens1 = sent1.split()
        tokens0 = [Token(token) for token in tokens0]
        tokens1 = [Token(token) for token in tokens1]
        vocab = Vocabulary()

        instance0 = Instance({"tokens": TextField(tokens0, {"transformer": self.token_indexer})})
        instance1 = Instance({"tokens": TextField(tokens1, {"transformer": self.token_indexer})})

        batch = Batch([instance0, instance1])
        batch.index_instances(vocab)

        padding_lengths = batch.get_padding_lengths()
        tensor_dict = batch.as_tensor_dict(padding_lengths)
        tokens = tensor_dict["tokens"]

        input_ids = tokens['transformer']
        offsets = tokens['transformer-offsets']
        transformer_mask = tokens['transformer-mask']

        test_select_embeddings = self.transformer_embedder(input_ids, offsets, transformer_mask)
        transformer_vectors = self.transformer_embedder(token_ids=input_ids, mask=transformer_mask)
        gold_select_embeddings = get_select_embedding(transformer_vectors, gold_offsets)
        assert gold_select_embeddings.equal(test_select_embeddings)
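
`get_select_embedding` is not defined in this snippet. A plausible sketch of such a helper, assuming `gold_offsets` indexes the wordpiece dimension of the transformer output (an assumption, not the original implementation):

    import torch

    def get_select_embedding(embeddings: torch.Tensor,
                             offsets: torch.LongTensor) -> torch.Tensor:
        # embeddings: (batch, num_wordpieces, dim); offsets: (batch, num_tokens).
        # Gather one wordpiece vector per original token.
        batch_size, _, dim = embeddings.size()
        index = offsets.unsqueeze(-1).expand(batch_size, offsets.size(1), dim)
        return embeddings.gather(1, index)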
Example #10
    def test_padding_for_equal_length_indices(self):
        tokenizer = WordTokenizer(word_splitter=BertBasicWordSplitter())

        #            2   3     5     6   8      9    2   14   12
        sentence = "the quick brown fox jumped over the lazy dog"
        tokens = tokenizer.tokenize(sentence)

        vocab = Vocabulary()

        instance = Instance({"tokens": TextField(tokens, {"bert": self.token_indexer})})

        batch = Batch([instance])
        batch.index_instances(vocab)

        padding_lengths = batch.get_padding_lengths()
        tensor_dict = batch.as_tensor_dict(padding_lengths)
        tokens = tensor_dict["tokens"]

        assert tokens["bert"].tolist() == [
                [2, 3, 5, 6, 8, 9, 2, 14, 12]
        ]

        assert tokens["bert-offsets"].tolist() == [
                [0, 1, 2, 3, 4, 5, 6, 7, 8]
        ]
Example #11
    def test_sliding_window_with_batch(self):
        tokenizer = WordTokenizer(word_splitter=BertBasicWordSplitter())

        sentence = "the quickest quick brown fox jumped over the lazy dog"
        tokens = tokenizer.tokenize(sentence)

        vocab = Vocabulary()

        vocab_path = self.FIXTURES_ROOT / 'bert' / 'vocab.txt'
        token_indexer = PretrainedBertIndexer(str(vocab_path), truncate_long_sequences=False, max_pieces=8)

        config_path = self.FIXTURES_ROOT / 'bert' / 'config.json'
        config = BertConfig(str(config_path))
        bert_model = BertModel(config)
        token_embedder = BertEmbedder(bert_model, max_pieces=8)

        instance = Instance({"tokens": TextField(tokens, {"bert": token_indexer})})
        instance2 = Instance({"tokens": TextField(tokens + tokens + tokens, {"bert": token_indexer})})

        batch = Batch([instance, instance2])
        batch.index_instances(vocab)

        padding_lengths = batch.get_padding_lengths()
        tensor_dict = batch.as_tensor_dict(padding_lengths)
        tokens = tensor_dict["tokens"]
        bert_vectors = token_embedder(tokens["bert"], offsets=tokens["bert-offsets"])
        assert bert_vectors is not None
Example #12
    def _regenerate_tokens(self, metadata, sample_z):
        sample_z_cpu = sample_z.cpu().data.numpy()
        tokens = [m["tokens"] for m in metadata]

        assert len(tokens) == len(sample_z_cpu)
        assert max([len(x) for x in tokens]) == sample_z_cpu.shape[1]

        instances = []
        new_tokens = []
        for words, mask, meta in zip(tokens, sample_z_cpu, metadata):
            mask = mask[:len(words)]
            new_words = [
                w for i, (w, m) in enumerate(zip(words, mask))
                if i == 0 or m == 1
            ]

            new_tokens.append(new_words)
            meta["new_tokens"] = new_tokens
            instance = metadata[0]["convert_tokens_to_instance"](new_words,
                                                                 None)
            instances += instance

        batch = Batch(instances)
        batch.index_instances(self._vocabulary)
        padding_lengths = batch.get_padding_lengths()

        batch = batch.as_tensor_dict(padding_lengths)
        return {k: v.to(sample_z.device) for k, v in batch["document"].items()}
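
`convert_tokens_to_instance` arrives through `metadata` as a callable that turns a token list back into AllenNLP Instances. A plausible shape for it, assuming it closes over a dataset reader (an assumption rather than the original implementation):

    def make_convert_tokens_to_instance(reader):
        def convert(tokens, labels=None):
            # Re-wrap the modified tokens as a one-element list of Instances.
            return [reader.text_to_instance(" ".join(tokens), labels)]
        return convert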
Example #13
def predict(instances: List[Instance]) -> List[float]:
    """Output BERT NSP next sentence probability for a list of instances.

    Parameters
    ----------
    instances : List[Instance]

    Returns
    -------
    List[float]
        BERT NSP scores in range [0, 1].
    """
    scores = []
    for batch_instance in tqdm(batch(instances, batch_size=args.batch_size),
                               total=math.ceil(
                                   len(instances) / args.batch_size),
                               desc='Predicting'):
        batch_ins = Batch(batch_instance)
        batch_ins.index_instances(VOCAB)
        tensor_dict = batch_ins.as_tensor_dict(batch_ins.get_padding_lengths())
        tokens = tensor_dict["tokens"]
        input_ids = tokens['bert'].to(torch.device(f'cuda:{GPU_ID}'))
        token_type_ids = tokens['bert-type-ids'].to(
            torch.device(f'cuda:{GPU_ID}'))
        input_mask = (input_ids != 0).long()
        cls_out = BERT_NEXT_SENTENCE.forward(input_ids=input_ids,
                                             token_type_ids=token_type_ids,
                                             attention_mask=input_mask)
        probs = F.softmax(cls_out, dim=-1)
        next_sentence_score = probs[:, 0].detach().cpu().numpy().tolist()
        scores += next_sentence_score

    return scores
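
The `batch` helper used in the loop is not shown; it is presumably a simple chunking utility along these lines (a sketch, not the original code):

    from typing import Iterator, List, Sequence

    def batch(items: Sequence, batch_size: int) -> Iterator[List]:
        # Yield successive fixed-size chunks of `items`.
        for start in range(0, len(items), batch_size):
            yield list(items[start:start + batch_size])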
Example #14
def instances_to_batch(instances, model, for_training, cuda_device=0):
    batch = Batch(instances)
    batch.index_instances(model.vocab)
    padding_lengths = batch.get_padding_lengths()
    return batch.as_tensor_dict(padding_lengths,
                                cuda_device=cuda_device,
                                for_training=for_training)
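
A minimal usage sketch for this helper, assuming a trained AllenNLP model and a list of instances are already in scope (and an older AllenNLP whose `as_tensor_dict` still accepts `cuda_device` and `for_training`):

    tensor_dict = instances_to_batch(instances, model, for_training=False, cuda_device=-1)
    output = model(**tensor_dict)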
Example #15
def read_squad_allennlp(file_path):
    '''Read data, build the vocab, batch, pad, and convert to indices.
    Args:
        file_path -- raw SQuAD json file
    Returns:
        None
    '''
    token_indexers = {
            "tokens": SingleIdTokenIndexer(namespace="token_ids"),
            "chars": TokenCharactersIndexer(namespace="token_chars")}
    reader = SquadReader(token_indexers=token_indexers)
    instances = reader.read(file_path)
    for instance in instances:
        question = instance.fields['question']
        print(question)
        print(type(question))
        break
    vocab = Vocabulary.from_instances(instances)
    word2idx = vocab.get_index_to_token_vocabulary("token_ids")
    char2idx = vocab.get_index_to_token_vocabulary("token_chars")
    # print(word2idx)
    print(len(word2idx))
    print(len(char2idx))
    print(char2idx)
    batch = Batch(instances)
    batch.index_instances(vocab)
    padding_lengths = batch.get_padding_lengths()
    print(padding_lengths)
    tensor_dict = batch.as_tensor_dict(padding_lengths)
    print(tensor_dict['passage']['tokens'].shape)
    print(tensor_dict['passage']['chars'].shape)
    print(tensor_dict['question']['tokens'].shape)
    print(tensor_dict['question']['chars'].shape)
    print(tensor_dict['span_start'].shape)
    print(tensor_dict['span_end'].shape)
Example #16
def read_squad_word_char(file_path):
    token_indexers = {
        "tokens": SingleIdTokenIndexer(namespace="token_ids"),
        "chars": TokenCharactersIndexer(namespace="token_chars")
    }
    reader = SquadReader(token_indexers=token_indexers)
    instances = reader.read(file_path)
    vocab = Vocabulary.from_instances(instances)
    word2idx = vocab.get_index_to_token_vocabulary("token_ids")
    char2idx = vocab.get_index_to_token_vocabulary("token_chars")
    #print (word2idx)
    print(len(word2idx))
    print(len(char2idx))
    print(char2idx)
    batch = Batch(instances)
    batch.index_instances(vocab)
    padding_lengths = batch.get_padding_lengths()
    print(padding_lengths)
    tensor_dict = batch.as_tensor_dict(padding_lengths)
    print(tensor_dict['passage']['tokens'].shape)
    print(tensor_dict['passage']['chars'].shape)
    print(tensor_dict['question']['tokens'].shape)
    print(tensor_dict['question']['chars'].shape)
    print(tensor_dict['span_start'].shape)
    print(tensor_dict['span_end'].shape)
Example #17
    def test_read(self, lazy):
        reader = GLUESST2DatasetReader(
            tokenizer=WordTokenizer(word_splitter=BertBasicWordSplitter()),
            token_indexers={'bert': PretrainedBertIndexer(
                pretrained_model=self.BERT_VOCAB_PATH)},
            skip_label_indexing=False
        )
        instances = reader.read(
            str(self.FIXTURES_ROOT / 'dev.tsv'))
        instances = ensure_list(instances)
        example = instances[0]
        tokens = [t.text for t in example.fields['tokens']]
        label = example.fields['label'].label
        print(label)
        print(tokens)
        batch = Batch(instances)
        vocab = Vocabulary.from_instances(instances)
        batch.index_instances(vocab)
        padding_lengths = batch.get_padding_lengths()
        tensor_dict = batch.as_tensor_dict(padding_lengths)
        tokens = tensor_dict["tokens"]

        print(tokens['mask'].tolist()[0])
        print(tokens["bert"].tolist()[0])
        print([vocab.get_token_from_index(i, "bert")
               for i in tokens["bert"].tolist()[0]])
        print(len(tokens['bert'][0]))
        print(tokens["bert-offsets"].tolist()[0])
        print(tokens['bert-type-ids'].tolist()[0])
Example #18
    def test_squad_with_unwordpieceable_passage(self):

        tokenizer = SpacyTokenizer()

        token_indexer = PretrainedBertIndexer("bert-base-uncased")

        passage1 = (
            "There were four major HDTV systems tested by SMPTE in the late 1970s, "
            "and in 1979 an SMPTE study group released A Study of High Definition Television Systems:"
        )
        question1 = "Who released A Study of High Definition Television Systems?"

        passage2 = (
            "Broca, being what today would be called a neurosurgeon, "
            "had taken an interest in the pathology of speech. He wanted "
            "to localize the difference between man and the other animals, "
            "which appeared to reside in speech. He discovered the speech "
            "center of the human brain, today called Broca's area after him. "
            "His interest was mainly in Biological anthropology, but a German "
            "philosopher specializing in psychology, Theodor Waitz, took up the "
            "theme of general and social anthropology in his six-volume work, "
            "entitled Die Anthropologie der Naturvölker, 1859–1864. The title was "
            """soon translated as "The Anthropology of Primitive Peoples". """
            "The last two volumes were published posthumously.")
        question2 = "What did Broca discover in the human brain?"

        from allennlp.data.dataset_readers.reading_comprehension.util import (
            make_reading_comprehension_instance, )

        instance1 = make_reading_comprehension_instance(
            tokenizer.tokenize(question1),
            tokenizer.tokenize(passage1),
            {"bert": token_indexer},
            passage1,
        )

        instance2 = make_reading_comprehension_instance(
            tokenizer.tokenize(question2),
            tokenizer.tokenize(passage2),
            {"bert": token_indexer},
            passage2,
        )

        vocab = Vocabulary()

        batch = Batch([instance1, instance2])
        batch.index_instances(vocab)

        padding_lengths = batch.get_padding_lengths()
        tensor_dict = batch.as_tensor_dict(padding_lengths)
        qtokens = tensor_dict["question"]
        ptokens = tensor_dict["passage"]

        config = BertConfig(len(token_indexer.vocab))
        model = BertModel(config)
        embedder = BertEmbedder(model)

        _ = embedder(ptokens["bert"], offsets=ptokens["bert-offsets"])
        _ = embedder(qtokens["bert"], offsets=qtokens["bert-offsets"])
Example #19
    def ensure_batch_predictions_are_consistent(
        self, keys_to_ignore: Iterable[str] = ()):
        """
        Ensures that the model performs the same on a batch of instances as on individual instances.
        Ignores metrics matching the regexp .*loss.* and those specified explicitly.

        Parameters
        ----------
        keys_to_ignore : ``Iterable[str]``, optional (default=())
            Names of metrics that should not be taken into account, e.g. "batch_weight".
        """
        self.model.eval()
        single_predictions = []
        for i, instance in enumerate(self.instances):
            dataset = Batch([instance])
            tensors = dataset.as_tensor_dict(dataset.get_padding_lengths())
            result = self.model(**tensors)
            single_predictions.append(result)
        full_dataset = Batch(self.instances)
        batch_tensors = full_dataset.as_tensor_dict(
            full_dataset.get_padding_lengths())
        batch_predictions = self.model(**batch_tensors)
        for i, instance_predictions in enumerate(single_predictions):
            for key, single_predicted in instance_predictions.items():
                tolerance = 1e-6
                if "loss" in key:
                    # Loss is particularly unstable; we'll just be satisfied if everything else is
                    # close.
                    continue
                if key in keys_to_ignore:
                    continue
                single_predicted = single_predicted[0]
                batch_predicted = batch_predictions[key][i]
                if isinstance(single_predicted, torch.Tensor):
                    if single_predicted.size() != batch_predicted.size():
                        slices = tuple(
                            slice(0, size) for size in single_predicted.size())
                        batch_predicted = batch_predicted[slices]
                    assert_allclose(
                        single_predicted.data.numpy(),
                        batch_predicted.data.numpy(),
                        atol=tolerance,
                        err_msg=key,
                    )
                else:
                    assert single_predicted == batch_predicted, key
Example #20
    def test_end_to_end(self):
        tokenizer = WordTokenizer(word_splitter=BertBasicWordSplitter())

        #            2   3    4   3     5     6   8      9    2   14   12
        sentence1 = "The quickest quick brown fox jumped over the lazy dog"
        tokens1 = tokenizer.tokenize(sentence1)

        #            2   3     5     6   8      9    2  15 10 11 14   1
        sentence2 = "The quick brown fox jumped over the laziest lazy elmo"
        tokens2 = tokenizer.tokenize(sentence2)

        assert len(tokens1) == 10
        assert len(tokens2) == 10

        tokens = [Token('[CLS]')] + tokens1 + [Token('[SEP]')] + tokens2

        assert len(tokens) == 22

        vocab = Vocabulary()

        instance = Instance(
            {"sentence_pair": TextField(tokens, {"bert": self.token_indexer})})

        batch = Batch([instance])
        batch.index_instances(vocab)

        padding_lengths = batch.get_padding_lengths()

        tensor_dict = batch.as_tensor_dict(padding_lengths)

        tokens = tensor_dict["sentence_pair"]
        assert tokens['mask'].tolist()[0] == [1] * 22
        assert tokens["bert"].tolist()[0] == [
            101, 1996, 4248, 4355, 4248, 2829, 4419, 5598, 2058, 1996, 13971,
            3899, 102, 1996, 4248, 2829, 4419, 5598, 2058, 1996, 2474, 14272,
            3367, 13971, 17709, 2080
        ]
        assert [
            vocab.get_token_from_index(i, "bert")
            for i in tokens["bert"].tolist()[0]
        ] == [
            '[CLS]', 'the', 'quick', '##est', 'quick', 'brown', 'fox',
            'jumped', 'over', 'the', 'lazy', 'dog', '[SEP]', 'the', 'quick',
            'brown', 'fox', 'jumped', 'over', 'the', 'la', '##zie', '##st',
            'lazy', 'elm', '##o'
        ]
        assert len(tokens['bert'][0]) == 26
        assert tokens["bert-offsets"].tolist()[0] == [
            0, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
            22, 23, 25
        ]
        assert tokens['bert-type-ids'].tolist()[0] == [0] * 13 + [1] * 13

        bert_vectors = self.token_embedder(
            tokens["bert"],
            offsets=tokens["bert-offsets"],
            token_type_ids=tokens['bert-type-ids'])
        assert list(bert_vectors.shape) == [1, 22, 768]
Example #21
    def test_end_to_end(self):
        tokenizer = PretrainedTransformerTokenizer(
            model_name="bert-base-uncased")
        token_indexer = PretrainedTransformerIndexer(
            model_name="bert-base-uncased")

        sentence1 = "A, AllenNLP sentence."
        tokens1 = tokenizer.tokenize(sentence1)
        expected_tokens1 = [
            "[CLS]", "a", ",", "allen", "##nl", "##p", "sentence", ".", "[SEP]"
        ]
        assert [t.text for t in tokens1] == expected_tokens1

        sentence2 = "AllenNLP is great"
        tokens2 = tokenizer.tokenize(sentence2)
        expected_tokens2 = [
            "[CLS]", "allen", "##nl", "##p", "is", "great", "[SEP]"
        ]
        assert [t.text for t in tokens2] == expected_tokens2

        vocab = Vocabulary()

        params = Params({
            "token_embedders": {
                "bert": {
                    "type": "pretrained_transformer",
                    "model_name": "bert-base-uncased"
                }
            },
            "embedder_to_indexer_map": {
                "bert": ["bert", "mask"]
            },
            "allow_unmatched_keys": True,
        })
        token_embedder = BasicTextFieldEmbedder.from_params(vocab=vocab,
                                                            params=params)

        instance1 = Instance(
            {"tokens": TextField(tokens1, {"bert": token_indexer})})
        instance2 = Instance(
            {"tokens": TextField(tokens2, {"bert": token_indexer})})

        batch = Batch([instance1, instance2])
        batch.index_instances(vocab)

        padding_lengths = batch.get_padding_lengths()
        tensor_dict = batch.as_tensor_dict(padding_lengths)
        tokens = tensor_dict["tokens"]
        max_length = max(len(tokens1), len(tokens2))

        assert tokens["bert"].shape == (2, max_length)

        assert tokens["mask"].tolist() == [[1, 1, 1, 1, 1, 1, 1, 1, 1],
                                           [1, 1, 1, 1, 1, 1, 1, 0, 0]]

        # Attention mask
        bert_vectors = token_embedder(tokens)
        assert bert_vectors.size() == (2, 9, 768)
Example #22
    def ensure_batch_predictions_are_consistent(
            self,
            keys_to_ignore: Iterable[str] = ()):
        """
        Ensures that the model performs the same on a batch of instances as on individual instances.
        Ignores metrics matching the regexp .*loss.* and those specified explicitly.

        Parameters
        ----------
        keys_to_ignore : ``Iterable[str]``, optional (default=())
            Names of metrics that should not be taken into account, e.g. "batch_weight".
        """
        self.model.eval()
        single_predictions = []
        for i, instance in enumerate(self.instances):
            dataset = Batch([instance])
            tensors = dataset.as_tensor_dict(dataset.get_padding_lengths())
            result = self.model(**tensors)
            single_predictions.append(result)
        full_dataset = Batch(self.instances)
        batch_tensors = full_dataset.as_tensor_dict(full_dataset.get_padding_lengths())
        batch_predictions = self.model(**batch_tensors)
        for i, instance_predictions in enumerate(single_predictions):
            for key, single_predicted in instance_predictions.items():
                tolerance = 1e-6
                if 'loss' in key:
                    # Loss is particularly unstable; we'll just be satisfied if everything else is
                    # close.
                    continue
                if key in keys_to_ignore:
                    continue
                single_predicted = single_predicted[0]
                batch_predicted = batch_predictions[key][i]
                if isinstance(single_predicted, torch.Tensor):
                    if single_predicted.size() != batch_predicted.size():
                        slices = tuple(slice(0, size) for size in single_predicted.size())
                        batch_predicted = batch_predicted[slices]
                    assert_allclose(single_predicted.data.numpy(),
                                    batch_predicted.data.numpy(),
                                    atol=tolerance,
                                    err_msg=key)
                else:
                    assert single_predicted == batch_predicted, key
Example #23
    def test_as_tensor_dict(self):
        dataset = Batch(self.instances)
        dataset.index_instances(self.vocab)
        padding_lengths = dataset.get_padding_lengths()
        tensors = dataset.as_tensor_dict(padding_lengths)
        text1 = tensors["text1"]["tokens"].detach().cpu().numpy()
        text2 = tensors["text2"]["tokens"].detach().cpu().numpy()

        numpy.testing.assert_array_almost_equal(text1, numpy.array([[2, 3, 4, 5, 6],
                                                                    [1, 3, 4, 5, 6]]))
        numpy.testing.assert_array_almost_equal(text2, numpy.array([[2, 3, 4, 1, 5, 6],
                                                                    [2, 3, 1, 0, 0, 0]]))
Example #24
 def test_padding_lengths_uses_max_instance_lengths(self):
     dataset = Batch(self.instances)
     dataset.index_instances(self.vocab)
     padding_lengths = dataset.get_padding_lengths()
     assert padding_lengths == {
         u"text1": {
             u"num_tokens": 5
         },
         u"text2": {
             u"num_tokens": 6
         }
     }
Example #25
    def test_as_tensor_dict(self):
        dataset = Batch(self.instances)
        dataset.index_instances(self.vocab)
        padding_lengths = dataset.get_padding_lengths()
        tensors = dataset.as_tensor_dict(padding_lengths)
        text1 = tensors[u"text1"][u"tokens"].detach().cpu().numpy()
        text2 = tensors[u"text2"][u"tokens"].detach().cpu().numpy()

        numpy.testing.assert_array_almost_equal(
            text1, numpy.array([[2, 3, 4, 5, 6], [1, 3, 4, 5, 6]]))
        numpy.testing.assert_array_almost_equal(
            text2, numpy.array([[2, 3, 4, 1, 5, 6], [2, 3, 1, 0, 0, 0]]))
Example #26
 def preprocess(self, data, result_flag=0):
     xdata = []
     for review in data:
         if result_flag:
             xlabel = self.get_label(review)
         else:  # fake label when predicting
             xlabel = -1
         xdata.append(self.datareader.text_to_instance(review['text'], xlabel))
     data_batch = Batch(xdata)
     data_batch.index_instances(self.vocab)
     data_tensors = data_batch.as_tensor_dict(data_batch.get_padding_lengths())
     return data_tensors
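
A minimal calling sketch, assuming an object exposing this method (here called `clf`) and a list of review dicts; both names are illustrative:

    reviews = [{"text": "great fit and fast shipping"}]
    tensors = clf.preprocess(reviews)   # result_flag=0 -> fake labels for prediction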
Example #27
    def test_end_to_end(self):
        tokenizer = BertPreTokenizer()

        #            2   3    4   3     5     6   8      9    2   14   12
        sentence1 = "the quickest quick brown fox jumped over the lazy dog"
        tokens1 = tokenizer.tokenize(sentence1)

        #            2   3     5     6   8      9    2  15 10 11 14   1
        sentence2 = "the quick brown fox jumped over the laziest lazy elmo"
        tokens2 = tokenizer.tokenize(sentence2)

        vocab = Vocabulary()

        instance1 = Instance(
            {"tokens": TextField(tokens1, {"bert": self.token_indexer})})
        instance2 = Instance(
            {"tokens": TextField(tokens2, {"bert": self.token_indexer})})

        batch = Batch([instance1, instance2])
        batch.index_instances(vocab)

        padding_lengths = batch.get_padding_lengths()
        tensor_dict = batch.as_tensor_dict(padding_lengths)
        tokens = tensor_dict["tokens"]

        # 16 = [CLS], 17 = [SEP]
        assert tokens["bert"].tolist() == [
            [16, 2, 3, 4, 3, 5, 6, 8, 9, 2, 14, 12, 17, 0],
            [16, 2, 3, 5, 6, 8, 9, 2, 15, 10, 11, 14, 1, 17],
        ]

        assert tokens["bert-offsets"].tolist() == [
            [1, 3, 4, 5, 6, 7, 8, 9, 10, 11],
            [1, 2, 3, 4, 5, 6, 7, 10, 11, 12],
        ]

        # No offsets, should get 14 vectors back ([CLS] + 12 token wordpieces + [SEP])
        bert_vectors = self.token_embedder(tokens["bert"])
        assert list(bert_vectors.shape) == [2, 14, 12]

        # Offsets, should get 10 vectors back.
        bert_vectors = self.token_embedder(tokens["bert"],
                                           offsets=tokens["bert-offsets"])
        assert list(bert_vectors.shape) == [2, 10, 12]

        # Now try top_layer_only = True
        tlo_embedder = BertEmbedder(self.bert_model, top_layer_only=True)
        bert_vectors = tlo_embedder(tokens["bert"])
        assert list(bert_vectors.shape) == [2, 14, 12]

        bert_vectors = tlo_embedder(tokens["bert"],
                                    offsets=tokens["bert-offsets"])
        assert list(bert_vectors.shape) == [2, 10, 12]
Example #28
    def regenerate_tokens(self, tokens_list, metadata, device):
        instances = []
        for words in tokens_list:
            instance = metadata[0]["convert_tokens_to_instance"](words)
            instances.append(instance)

        batch = Batch(instances)
        batch.index_instances(self._model["model"]._vocabulary)
        padding_lengths = batch.get_padding_lengths()

        batch = batch.as_tensor_dict(padding_lengths)
        return {k: v.to(device) for k, v in batch["document"].items()}
Example #29
 def ensure_batch_predictions_are_consistent(self):
     self.model.eval()
     single_predictions = []
     for i, instance in enumerate(self.instances):
         dataset = Batch([instance])
         tensors = dataset.as_tensor_dict(dataset.get_padding_lengths(),
                                          for_training=False)
         result = self.model(**tensors)
         single_predictions.append(result)
     full_dataset = Batch(self.instances)
     batch_tensors = full_dataset.as_tensor_dict(
         full_dataset.get_padding_lengths(), for_training=False)
     batch_predictions = self.model(**batch_tensors)
     for i, instance_predictions in enumerate(single_predictions):
         for key, single_predicted in instance_predictions.items():
             tolerance = 1e-6
             if key == 'loss':
                 # Loss is particularly unstable; we'll just be satisfied if everything else is
                 # close.
                 continue
             single_predicted = single_predicted[0]
             batch_predicted = batch_predictions[key][i]
             if isinstance(single_predicted, torch.autograd.Variable):
                 if single_predicted.size() != batch_predicted.size():
                     # This is probably a sequence model, and our output shape has some padded
                     # elements in the batched case.  Fixing this in general is complicated;
                     # we'll just fix some easy cases that we actually have, for now.
                     num_tokens = single_predicted.size(0)
                     if batch_predicted.dim() == 1:
                         batch_predicted = batch_predicted[:num_tokens]
                     elif batch_predicted.dim() == 2:
                         batch_predicted = batch_predicted[:num_tokens, :]
                     else:
                         raise NotImplementedError
                 assert_allclose(single_predicted.data.numpy(),
                                 batch_predicted.data.numpy(),
                                 atol=tolerance,
                                 err_msg=key)
             else:
                 assert single_predicted == batch_predicted, key
Example #30
    def generate_tokens(self, new_tokens, metadata, labels):
        instances = []
        for tokens, instance_labels in zip(new_tokens, labels):
            instances += metadata[0]["convert_tokens_to_instance"](
                tokens, [instance_labels[k] for k in ["A", "B", "C", "D", "E"]]
            )

        batch = Batch(instances)
        batch.index_instances(self._vocabulary)
        padding_lengths = batch.get_padding_lengths()

        batch = batch.as_tensor_dict(padding_lengths)
        return {k: v.to(self._vector.device) for k, v in batch["document"].items()}
Example #31
    def test_squad_with_unwordpieceable_passage(self):
        # pylint: disable=line-too-long
        tokenizer = WordTokenizer()

        token_indexer = PretrainedBertIndexer("bert-base-uncased")

        passage1 = ("There were four major HDTV systems tested by SMPTE in the late 1970s, "
                    "and in 1979 an SMPTE study group released A Study of High Definition Television Systems:")
        question1 = "Who released A Study of High Definition Television Systems?"

        passage2 = ("Broca, being what today would be called a neurosurgeon, "
                    "had taken an interest in the pathology of speech. He wanted "
                    "to localize the difference between man and the other animals, "
                    "which appeared to reside in speech. He discovered the speech "
                    "center of the human brain, today called Broca's area after him. "
                    "His interest was mainly in Biological anthropology, but a German "
                    "philosopher specializing in psychology, Theodor Waitz, took up the "
                    "theme of general and social anthropology in his six-volume work, "
                    "entitled Die Anthropologie der Naturvölker, 1859–1864. The title was "
                    """soon translated as "The Anthropology of Primitive Peoples". """
                    "The last two volumes were published posthumously.")
        question2 = "What did Broca discover in the human brain?"

        from allennlp.data.dataset_readers.reading_comprehension.util import make_reading_comprehension_instance

        instance1 = make_reading_comprehension_instance(tokenizer.tokenize(question1),
                                                        tokenizer.tokenize(passage1),
                                                        {"bert": token_indexer},
                                                        passage1)

        instance2 = make_reading_comprehension_instance(tokenizer.tokenize(question2),
                                                        tokenizer.tokenize(passage2),
                                                        {"bert": token_indexer},
                                                        passage2)

        vocab = Vocabulary()

        batch = Batch([instance1, instance2])
        batch.index_instances(vocab)

        padding_lengths = batch.get_padding_lengths()
        tensor_dict = batch.as_tensor_dict(padding_lengths)
        qtokens = tensor_dict["question"]
        ptokens = tensor_dict["passage"]

        config = BertConfig(len(token_indexer.vocab))
        model = BertModel(config)
        embedder = BertEmbedder(model)

        _ = embedder(ptokens["bert"], offsets=ptokens["bert-offsets"])
        _ = embedder(qtokens["bert"], offsets=qtokens["bert-offsets"])
Example #32
    def test_sliding_window(self):
        tokenizer = BertPreTokenizer()

        sentence = "the quickest quick brown fox jumped over the lazy dog"
        tokens = tokenizer.tokenize(sentence)

        vocab = Vocabulary()

        vocab_path = self.FIXTURES_ROOT / "bert" / "vocab.txt"
        token_indexer = PretrainedBertIndexer(str(vocab_path),
                                              truncate_long_sequences=False,
                                              max_pieces=8)

        config_path = self.FIXTURES_ROOT / "bert" / "config.json"
        config = BertConfig(str(config_path))
        bert_model = BertModel(config)
        token_embedder = BertEmbedder(bert_model, max_pieces=8)

        instance = Instance(
            {"tokens": TextField(tokens, {"bert": token_indexer})})

        batch = Batch([instance])
        batch.index_instances(vocab)

        padding_lengths = batch.get_padding_lengths()
        tensor_dict = batch.as_tensor_dict(padding_lengths)
        tokens = tensor_dict["tokens"]

        # 16 = [CLS], 17 = [SEP]
        # 1 full window + 1 half window with start/end tokens
        assert tokens["bert"].tolist() == [[
            16, 2, 3, 4, 3, 5, 6, 17, 16, 3, 5, 6, 8, 9, 2, 17, 16, 8, 9, 2,
            14, 12, 17
        ]]
        assert tokens["bert-offsets"].tolist() == [[
            1, 3, 4, 5, 6, 7, 8, 9, 10, 11
        ]]

        bert_vectors = token_embedder(tokens["bert"])
        assert list(bert_vectors.shape) == [1, 13, 12]

        # Testing without token_type_ids
        bert_vectors = token_embedder(tokens["bert"],
                                      offsets=tokens["bert-offsets"])
        assert list(bert_vectors.shape) == [1, 10, 12]

        # Testing with token_type_ids
        bert_vectors = token_embedder(tokens["bert"],
                                      offsets=tokens["bert-offsets"],
                                      token_type_ids=tokens["bert-type-ids"])
        assert list(bert_vectors.shape) == [1, 10, 12]
Example #33
    def test_end_to_end(self):
        tokenizer = WordTokenizer(word_splitter=BertBasicWordSplitter())

        #            2   3    4   3     5     6   8      9    2   14   12
        sentence1 = "the quickest quick brown fox jumped over the lazy dog"
        tokens1 = tokenizer.tokenize(sentence1)

        #            2   3     5     6   8      9    2  15 10 11 14   1
        sentence2 = "the quick brown fox jumped over the laziest lazy elmo"
        tokens2 = tokenizer.tokenize(sentence2)

        vocab = Vocabulary()

        instance1 = Instance({"tokens": TextField(tokens1, {"bert": self.token_indexer})})
        instance2 = Instance({"tokens": TextField(tokens2, {"bert": self.token_indexer})})

        batch = Batch([instance1, instance2])
        batch.index_instances(vocab)

        padding_lengths = batch.get_padding_lengths()
        tensor_dict = batch.as_tensor_dict(padding_lengths)
        tokens = tensor_dict["tokens"]

        assert tokens["bert"].tolist() == [
                [2, 3, 4, 3, 5, 6, 8, 9, 2, 14, 12, 0],
                [2, 3, 5, 6, 8, 9, 2, 15, 10, 11, 14, 1]
        ]

        assert tokens["bert-offsets"].tolist() == [
                [0, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                [0, 1, 2, 3, 4, 5, 6, 9, 10, 11]
        ]

        # No offsets, should get 12 vectors back.
        bert_vectors = self.token_embedder(tokens["bert"])
        assert list(bert_vectors.shape) == [2, 12, 12]

        # Offsets, should get 10 vectors back.
        bert_vectors = self.token_embedder(tokens["bert"], offsets=tokens["bert-offsets"])
        assert list(bert_vectors.shape) == [2, 10, 12]

        ## Now try top_layer_only = True
        tlo_embedder = BertEmbedder(self.bert_model, top_layer_only=True)
        bert_vectors = tlo_embedder(tokens["bert"])
        assert list(bert_vectors.shape) == [2, 12, 12]

        bert_vectors = tlo_embedder(tokens["bert"], offsets=tokens["bert-offsets"])
        assert list(bert_vectors.shape) == [2, 10, 12]
Example #34
    def test_end_to_end_with_higher_order_inputs(self):
        tokenizer = WordTokenizer(word_splitter=BertBasicWordSplitter())

        #            2   3    4   3     5     6   8      9    2   14   12
        sentence1 = "the quickest quick brown fox jumped over the lazy dog"
        tokens1 = tokenizer.tokenize(sentence1)
        text_field1 = TextField(tokens1, {"bert": self.token_indexer})

        #            2   3     5     6   8      9    2  15 10 11 14   1
        sentence2 = "the quick brown fox jumped over the laziest lazy elmo"
        tokens2 = tokenizer.tokenize(sentence2)
        text_field2 = TextField(tokens2, {"bert": self.token_indexer})

        #            2   5    15 10 11 6
        sentence3 = "the brown laziest fox"
        tokens3 = tokenizer.tokenize(sentence3)
        text_field3 = TextField(tokens3, {"bert": self.token_indexer})

        vocab = Vocabulary()

        instance1 = Instance({"tokens": ListField([text_field1])})
        instance2 = Instance({"tokens": ListField([text_field2, text_field3])})

        batch = Batch([instance1, instance2])
        batch.index_instances(vocab)

        padding_lengths = batch.get_padding_lengths()
        tensor_dict = batch.as_tensor_dict(padding_lengths, verbose=True)
        tokens = tensor_dict["tokens"]

        # No offsets, should get 12 vectors back.
        bert_vectors = self.token_embedder(tokens["bert"])
        assert list(bert_vectors.shape) == [2, 2, 12, 12]

        # Offsets, should get 10 vectors back.
        bert_vectors = self.token_embedder(tokens["bert"],
                                           offsets=tokens["bert-offsets"])
        assert list(bert_vectors.shape) == [2, 2, 10, 12]

        ## Now try top_layer_only = True
        tlo_embedder = BertEmbedder(self.bert_model, top_layer_only=True)
        bert_vectors = tlo_embedder(tokens["bert"])
        assert list(bert_vectors.shape) == [2, 2, 12, 12]

        bert_vectors = tlo_embedder(tokens["bert"],
                                    offsets=tokens["bert-offsets"])
        assert list(bert_vectors.shape) == [2, 2, 10, 12]
Example #35
    def bert_vector(self):
        words = re.split(r'\W+', self.text)
        text = ' '.join(words)

        tokens = tokenizer.tokenize(text)

        instance = Instance(
            {"tokens": TextField(tokens, {'bert': token_indexer})})
        batch = Batch([instance])
        batch.index_instances(vocab)

        padding_lengths = batch.get_padding_lengths()
        tensor_dict = batch.as_tensor_dict(padding_lengths)

        tokens = tensor_dict["tokens"]

        bert_vectors = model(tokens["bert"])
        return bert_vectors
Example #36
    def test_read(self, lazy):
        reader = SnliReader(
            tokenizer=WordTokenizer(word_splitter=BertBasicWordSplitter()),
            token_indexers={
                'bert':
                PretrainedBertIndexer(pretrained_model=self.BERT_VOCAB_PATH)
            },
        )

        instances = reader.read(
            str(self.FIXTURES_ROOT / 'snli_1.0_sample.jsonl'))
        instances = ensure_list(instances)
        example = instances[0]
        tokens = [t.text for t in example.fields['tokens'].tokens]
        label = example.fields['label'].label
        weight = example.fields['weight'].weight
        assert label == 'neutral'
        assert weight == 1
        assert instances[1].fields['weight'].weight == 0.5
        assert instances[2].fields['weight'].weight == 1
        assert tokens == [
            'a', 'person', 'on', 'a', 'horse', 'jumps', 'over', 'a', 'broken',
            'down', 'airplane', '.', '[SEP]', 'a', 'person', 'is', 'training',
            'his', 'horse', 'for', 'a', 'competition', '.'
        ]
        batch = Batch(instances)
        vocab = Vocabulary.from_instances(instances)
        batch.index_instances(vocab)

        padding_lengths = batch.get_padding_lengths()
        tensor_dict = batch.as_tensor_dict(padding_lengths)
        tokens = tensor_dict["tokens"]

        print(tokens['mask'].tolist()[0])
        print(tokens["bert"].tolist()[0])
        print([
            vocab.get_token_from_index(i, "bert")
            for i in tokens["bert"].tolist()[0]
        ])
        print(len(tokens['bert'][0]))
        print(tokens["bert-offsets"].tolist()[0])
        print(tokens['bert-type-ids'].tolist()[0])
Example #37
    def test_max_length(self):
        config = BertConfig(len(self.token_indexer.vocab))
        model = BertModel(config)
        embedder = BertEmbedder(model)

        tokenizer = WordTokenizer(word_splitter=BertBasicWordSplitter())
        sentence = "the " * 1000
        tokens = tokenizer.tokenize(sentence)

        vocab = Vocabulary()

        instance = Instance({"tokens": TextField(tokens, {"bert": self.token_indexer})})

        batch = Batch([instance])
        batch.index_instances(vocab)

        padding_lengths = batch.get_padding_lengths()
        tensor_dict = batch.as_tensor_dict(padding_lengths)
        tokens = tensor_dict["tokens"]
        embedder(tokens["bert"], tokens["bert-offsets"])
Example #38
    if create_video_training:
        pf.create_image_weights_epoch(model, video_fotograms_folder2, i)
        pf.create_Bayesian_analysis_charts_simplified(model, train_dataset, validation_dataset,
                                                      tr_data_loss, val_data_loss, KL_loss,
                                                      video_fotograms_folder4, i + 1)

"""
############## Use the trained model ######################
We use an already-implemented predictor that takes the model and knows how to preprocess the data
"""

name_exmaple = "Eat my motherfucking jeans"
name_exmaple = "Carlos Sanchez"
tokens_list = [name_exmaple[i] for i in range(len(name_exmaple))]
Instance_test = reader.generate_instance(tokens_list,None)
batch = Batch([Instance_test])
batch.index_instances(vocab)

padding_lengths = batch.get_padding_lengths()
tensor_dict = batch.as_tensor_dict(padding_lengths)

model.eval()
tag_logits = model(tensor_dict["text_field"])['tag_logits'].detach().cpu().numpy()
tag_ids = np.argmax(tag_logits, axis=-1)
print([model.vocab.get_token_from_index(i, 'tags_country') for i in tag_ids])
Example #39
 def test_padding_lengths_uses_max_instance_lengths(self):
     dataset = Batch(self.instances)
     dataset.index_instances(self.vocab)
     padding_lengths = dataset.get_padding_lengths()
     assert padding_lengths == {"text1": {"num_tokens": 5, "tokens_length": 5},
                                "text2": {"num_tokens": 6, "tokens_length": 6}}