Example #1
    def test_forward_pass_runs_correctly(self):
        """
        Check to make sure a forward pass on an ensemble of two identical copies of a model yields the same
        results as the model itself.
        """
        bidaf_ensemble = BidafEnsemble([self.model, self.model])

        batch = Batch(self.instances)
        batch.index_instances(self.vocab)
        training_tensors = batch.as_tensor_dict()

        bidaf_output_dict = self.model(**training_tensors)
        ensemble_output_dict = bidaf_ensemble(**training_tensors)

        metrics = self.model.get_metrics(reset=True)

        # We've set up the data such that there's a fake answer that consists of the whole
        # paragraph.  _Any_ valid prediction for that question should produce an F1 of greater than
        # zero, while if we somehow haven't been able to load the evaluation data, or there was an
        # error with using the evaluation script, this will fail.  This makes sure that we've
        # loaded the evaluation data correctly and have hooked things up to the official evaluation
        # script.
        assert metrics['f1'] > 0
        assert torch.equal(ensemble_output_dict['best_span'], bidaf_output_dict['best_span'])
        assert ensemble_output_dict['best_span_str'] == bidaf_output_dict['best_span_str']
Example #2
    def forward_on_instances(self,
                             instances: List[Instance],
                             cuda_device: int) -> List[Dict[str, numpy.ndarray]]:
        """
        Takes a list of :class:`~allennlp.data.instance.Instance`s, converts that text into
        arrays using this model's :class:`Vocabulary`, passes those arrays through
        :func:`self.forward()` and :func:`self.decode()` (which by default does nothing)
        and returns the result.  Before returning the result, we convert any
        ``torch.autograd.Variables`` or ``torch.Tensors`` into numpy arrays and separate the
        batched output into a list of individual dicts per instance. Note that typically
        this will be faster on a GPU (and conditionally, on a CPU) than repeated calls to
        :func:`forward_on_instance`.
        """
        dataset = Batch(instances)
        dataset.index_instances(self.vocab)
        model_input = dataset.as_tensor_dict(cuda_device=cuda_device, for_training=False)
        outputs = self.decode(self(**model_input))

        instance_separated_output: List[Dict[str, numpy.ndarray]] = [{} for _ in dataset.instances]
        for name, output in list(outputs.items()):
            if isinstance(output, torch.autograd.Variable):
                output = output.data.cpu().numpy()
            outputs[name] = output
            for instance_output, batch_element in zip(instance_separated_output, output):
                instance_output[name] = batch_element
        return instance_separated_output
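For orientation, a hedged sketch of how this older signature would be invoked; `model` and `instances` are assumed to already exist, and the explicit cuda_device argument reflects the early AllenNLP API shown above.

# Illustrative call only; -1 selects the CPU.
outputs = model.forward_on_instances(instances, cuda_device=-1)
for instance_output in outputs:
    print({name: array.shape for name, array in instance_output.items()})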
Example #3
    def test_padding_for_equal_length_indices(self):
        tokenizer = WordTokenizer(word_splitter=BertBasicWordSplitter())

        #            2   3     5     6   8      9    2   14   12
        sentence = "the quick brown fox jumped over the lazy dog"
        tokens = tokenizer.tokenize(sentence)

        vocab = Vocabulary()

        instance = Instance({"tokens": TextField(tokens, {"bert": self.token_indexer})})

        batch = Batch([instance])
        batch.index_instances(vocab)

        padding_lengths = batch.get_padding_lengths()
        tensor_dict = batch.as_tensor_dict(padding_lengths)
        tokens = tensor_dict["tokens"]

        assert tokens["bert"].tolist() == [
                [2, 3, 5, 6, 8, 9, 2, 14, 12]
        ]

        assert tokens["bert-offsets"].tolist() == [
                [0, 1, 2, 3, 4, 5, 6, 7, 8]
        ]
Example #4
def batch_to_ids(batch: List[List[str]]) -> torch.Tensor:
    """
    Converts a batch of tokenized sentences to a tensor representing the sentences with encoded characters
    (len(batch), max sentence length, max word length).

    Parameters
    ----------
    batch : ``List[List[str]]``, required
        A list of tokenized sentences.

    Returns
    -------
        A tensor of padded character ids.
    """
    instances = []
    indexer = ELMoTokenCharactersIndexer()
    for sentence in batch:
        tokens = [Token(token) for token in sentence]
        field = TextField(tokens,
                          {'character_ids': indexer})
        instance = Instance({"elmo": field})
        instances.append(instance)

    dataset = Batch(instances)
    vocab = Vocabulary()
    dataset.index_instances(vocab)
    return dataset.as_tensor_dict()['elmo']['character_ids']
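A minimal usage sketch for batch_to_ids, assuming a working AllenNLP 0.x install; the sentences are illustrative. ELMoTokenCharactersIndexer encodes every token as 50 character ids, so the result is padded to the longest sentence with a fixed word width.

character_ids = batch_to_ids([["First", "sentence", "."], ["Another", "."]])
print(character_ids.shape)  # torch.Size([2, 3, 50]): batch, max tokens, 50 char ids per token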
Example #5
    def test_as_tensor_dict(self):
        dataset = Batch(self.instances)
        dataset.index_instances(self.vocab)
        padding_lengths = dataset.get_padding_lengths()
        tensors = dataset.as_tensor_dict(padding_lengths)
        text1 = tensors["text1"]["tokens"].detach().cpu().numpy()
        text2 = tensors["text2"]["tokens"].detach().cpu().numpy()

        numpy.testing.assert_array_almost_equal(text1, numpy.array([[2, 3, 4, 5, 6],
                                                                    [1, 3, 4, 5, 6]]))
        numpy.testing.assert_array_almost_equal(text2, numpy.array([[2, 3, 4, 1, 5, 6],
                                                                    [2, 3, 1, 0, 0, 0]]))
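The recurring pattern in all of these examples is Batch -> index_instances -> as_tensor_dict. Here is a self-contained sketch of that pipeline, assuming an AllenNLP 0.x environment where Batch lives in allennlp.data.dataset; the field name and sentences are illustrative.

from allennlp.data import Instance, Token, Vocabulary
from allennlp.data.dataset import Batch
from allennlp.data.fields import TextField
from allennlp.data.token_indexers import SingleIdTokenIndexer

indexers = {"tokens": SingleIdTokenIndexer()}
instances = [
    Instance({"text1": TextField([Token(t) for t in "this is a sentence".split()], indexers)}),
    Instance({"text1": TextField([Token(t) for t in "this is another".split()], indexers)}),
]

vocab = Vocabulary.from_instances(instances)   # build the vocabulary from the data itself
batch = Batch(instances)
batch.index_instances(vocab)                   # replace tokens with integer ids in-place

padding_lengths = batch.get_padding_lengths()  # e.g. {'text1': {'num_tokens': 4}}
tensors = batch.as_tensor_dict(padding_lengths)
print(tensors["text1"]["tokens"].shape)        # (2, 4); the shorter sentence is padded with 0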
Example #6
    def test_tagger_with_elmo_token_embedder_forward_pass_runs_correctly(self):
        dataset = Batch(self.instances)
        dataset.index_instances(self.vocab)
        training_tensors = dataset.as_tensor_dict()
        output_dict = self.model(**training_tensors)
        tags = output_dict['tags']
        assert len(tags) == 2
        assert len(tags[0]) == 7
        assert len(tags[1]) == 7
        for example_tags in tags:
            for tag_id in example_tags:
                tag = self.model.vocab.get_token_from_index(tag_id, namespace="labels")
                assert tag in {'O', 'I-ORG', 'I-PER', 'I-LOC'}
Example #7
    def test_squad_with_unwordpieceable_passage(self):
        # pylint: disable=line-too-long
        tokenizer = WordTokenizer()

        token_indexer = PretrainedBertIndexer("bert-base-uncased")

        passage1 = ("There were four major HDTV systems tested by SMPTE in the late 1970s, "
                    "and in 1979 an SMPTE study group released A Study of High Definition Television Systems:")
        question1 = "Who released A Study of High Definition Television Systems?"

        passage2 = ("Broca, being what today would be called a neurosurgeon, "
                    "had taken an interest in the pathology of speech. He wanted "
                    "to localize the difference between man and the other animals, "
                    "which appeared to reside in speech. He discovered the speech "
                    "center of the human brain, today called Broca's area after him. "
                    "His interest was mainly in Biological anthropology, but a German "
                    "philosopher specializing in psychology, Theodor Waitz, took up the "
                    "theme of general and social anthropology in his six-volume work, "
                    "entitled Die Anthropologie der Naturvölker, 1859–1864. The title was "
                    """soon translated as "The Anthropology of Primitive Peoples". """
                    "The last two volumes were published posthumously.")
        question2 = "What did Broca discover in the human brain?"

        from allennlp.data.dataset_readers.reading_comprehension.util import make_reading_comprehension_instance

        instance1 = make_reading_comprehension_instance(tokenizer.tokenize(question1),
                                                        tokenizer.tokenize(passage1),
                                                        {"bert": token_indexer},
                                                        passage1)

        instance2 = make_reading_comprehension_instance(tokenizer.tokenize(question2),
                                                        tokenizer.tokenize(passage2),
                                                        {"bert": token_indexer},
                                                        passage2)

        vocab = Vocabulary()

        batch = Batch([instance1, instance2])
        batch.index_instances(vocab)

        padding_lengths = batch.get_padding_lengths()
        tensor_dict = batch.as_tensor_dict(padding_lengths)
        qtokens = tensor_dict["question"]
        ptokens = tensor_dict["passage"]

        config = BertConfig(len(token_indexer.vocab))
        model = BertModel(config)
        embedder = BertEmbedder(model)

        _ = embedder(ptokens["bert"], offsets=ptokens["bert-offsets"])
        _ = embedder(qtokens["bert"], offsets=qtokens["bert-offsets"])
Example #8
    def forward_on_instances(self,
                             instances: List[Instance]) -> List[Dict[str, numpy.ndarray]]:
        """
        Takes a list of :class:`~allennlp.data.instance.Instance`s, converts that text into
        arrays using this model's :class:`Vocabulary`, passes those arrays through
        :func:`self.forward()` and :func:`self.decode()` (which by default does nothing)
        and returns the result.  Before returning the result, we convert any
        ``torch.Tensors`` into numpy arrays and separate the
        batched output into a list of individual dicts per instance. Note that typically
        this will be faster on a GPU (and conditionally, on a CPU) than repeated calls to
        :func:`forward_on_instance`.

        Parameters
        ----------
        instances : List[Instance], required
            The instances to run the model on.

        Returns
        -------
        A list of the model's output for each instance.
        """
        batch_size = len(instances)
        with torch.no_grad():
            cuda_device = self._get_prediction_device()
            dataset = Batch(instances)
            dataset.index_instances(self.vocab)
            model_input = dataset.as_tensor_dict(cuda_device=cuda_device)
            outputs = self.decode(self(**model_input))

            instance_separated_output: List[Dict[str, numpy.ndarray]] = [{} for _ in dataset.instances]
            for name, output in list(outputs.items()):
                if isinstance(output, torch.Tensor):
                    # NOTE(markn): This is a hack because 0-dim pytorch tensors are not iterable.
                    # This occurs with batch size 1, because we still want to include the loss in that case.
                    if output.dim() == 0:
                        output = output.unsqueeze(0)

                    if output.size(0) != batch_size:
                        self._maybe_warn_for_unseparable_batches(name)
                        continue
                    output = output.detach().cpu().numpy()
                elif len(output) != batch_size:
                    self._maybe_warn_for_unseparable_batches(name)
                    continue
                outputs[name] = output
                for instance_output, batch_element in zip(instance_separated_output, output):
                    instance_output[name] = batch_element
            return instance_separated_output
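To see why the zero-dimensional hack above is needed, here is a standalone PyTorch illustration (the value is made up):

import torch

loss = torch.tensor(2.5)           # a 0-dim tensor, as a scalar loss would be
# Iterating a 0-dim tensor raises a TypeError, so it is unsqueezed first.
print(loss.unsqueeze(0).tolist())  # [2.5]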
Example #9
def dry_run_from_params(params: Params, serialization_dir: str) -> None:
    prepare_environment(params)

    vocab_params = params.pop("vocabulary", {})
    os.makedirs(serialization_dir, exist_ok=True)
    vocab_dir = os.path.join(serialization_dir, "vocabulary")

    if os.path.isdir(vocab_dir) and os.listdir(vocab_dir):
        raise ConfigurationError("The 'vocabulary' directory in the provided "
                                 "serialization directory is non-empty")

    all_datasets = datasets_from_params(params)
    datasets_for_vocab_creation = set(params.pop("datasets_for_vocab_creation", all_datasets))

    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(f"invalid 'dataset_for_vocab_creation' {dataset}")

    logger.info("From dataset instances, %s will be considered for vocabulary creation.",
                ", ".join(datasets_for_vocab_creation))

    instances = [instance for key, dataset in all_datasets.items()
                 for instance in dataset
                 if key in datasets_for_vocab_creation]

    vocab = Vocabulary.from_params(vocab_params, instances)
    dataset = Batch(instances)
    dataset.index_instances(vocab)
    dataset.print_statistics()
    vocab.print_statistics()

    logger.info(f"writing the vocabulary to {vocab_dir}.")
    vocab.save_to_files(vocab_dir)

    model = Model.from_params(vocab=vocab, params=params.pop('model'))
    trainer_params = params.pop("trainer")
    no_grad_regexes = trainer_params.pop("no_grad", ())
    for name, parameter in model.named_parameters():
        if any(re.search(regex, name) for regex in no_grad_regexes):
            parameter.requires_grad_(False)

    frozen_parameter_names, tunable_parameter_names = \
                   get_frozen_and_tunable_parameter_names(model)
    logger.info("Following parameters are Frozen  (without gradient):")
    for name in frozen_parameter_names:
        logger.info(name)
    logger.info("Following parameters are Tunable (with gradient):")
    for name in tunable_parameter_names:
        logger.info(name)
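A hedged invocation sketch for the dry run; the config path and serialization directory are illustrative.

from allennlp.common import Params

params = Params.from_file("experiment.jsonnet")
dry_run_from_params(params, serialization_dir="/tmp/dry_run")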
Example #10
    def _sentences_to_ids(self, sentences):
        indexer = ELMoTokenCharactersIndexer()

        # For each sentence, first create a TextField, then create an instance
        instances = []
        for sentence in sentences:
            tokens = [Token(token) for token in sentence]
            field = TextField(tokens, {'character_ids': indexer})
            instance = Instance({'elmo': field})
            instances.append(instance)

        dataset = Batch(instances)
        vocab = Vocabulary()
        dataset.index_instances(vocab)
        return dataset.as_tensor_dict()['elmo']['character_ids']
Example #11
    def collate(self, instances_l):
        batch = Batch([x[0] for x in instances_l])
        batch.index_instances(self.vocab)

        batch_dict = {
            k: v['tokens']
            for k, v in batch.as_tensor_dict().items()
        }

        batch_dict['story_tokens'] = [
            instance[0].fields['story'].tokens for instance in instances_l
        ]
        batch_dict['story_full'] = [x[1] + x[2] for x in instances_l]
        batch_dict['items'] = [x[3] for x in instances_l]
        return batch_dict
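A sketch of how a collate method like this is typically wired into a PyTorch DataLoader; `dataset` (yielding the (instance, ...) tuples expected above) and `wrapper` are assumed to exist.

from torch.utils.data import DataLoader

loader = DataLoader(dataset, batch_size=32, collate_fn=wrapper.collate)
for batch_dict in loader:
    pass  # each batch_dict holds indexed token tensors plus the raw story fields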
Example #12
    def _sentences_to_ids(self, sentences):
        indexer = ELMoTokenCharactersIndexer()

        # For each sentence, first create a TextField, then create an instance
        instances = []
        for sentence in sentences:
            tokens = [Token(token) for token in sentence]
            field = TextField(tokens, {'character_ids': indexer})
            instance = Instance({'elmo': field})
            instances.append(instance)

        dataset = Batch(instances)
        vocab = Vocabulary()
        dataset.index_instances(vocab)
        return dataset.as_tensor_dict()['elmo']['character_ids']
Example #13
    def generate_tokens(self, new_tokens, metadata, labels):
        instances = []
        for tokens in new_tokens:
            instances += metadata[0]["convert_tokens_to_instance"](tokens,
                                                                   None)

        batch = Batch(instances)
        batch.index_instances(self._vocabulary)
        padding_lengths = batch.get_padding_lengths()

        batch = batch.as_tensor_dict(padding_lengths)
        return {
            k: v.to(self._vector.device)
            for k, v in batch["document"].items()
        }
Example #14
    def test_tagger_with_openai_token_embedder_forward_pass_runs_correctly(self):
        dataset = Batch(self.instances)
        dataset.index_instances(self.vocab)
        training_tensors = dataset.as_tensor_dict()
        output_dict = self.model(**training_tensors)
        tags = output_dict['tags']
        assert len(tags) == 2
        assert len(tags[0]) == 7
        assert len(tags[1]) == 7
        for example_tags in tags:
            for tag_id in example_tags:
                tag = self.model.vocab.get_token_from_index(tag_id,
                                                            namespace="labels")
                assert tag in {'O', 'I-ORG', 'I-PER', 'I-LOC'}
Example #15
    def forward_on_instances(
        self,
        instances: List[Instance],
        partial_labels: Optional[List[List[Tuple[int, int]]]] = None
    ) -> List[List[str]]:
        if partial_labels is not None:
            assert len(instances) == len(partial_labels)

        batch = Batch(instances)
        batch.index_instances(self.vocab)
        cuda_device = self._get_prediction_device()
        model_input = util.move_to_device(batch.as_tensor_dict(), cuda_device)
        output_dict = self.decode(self.forward(**model_input), partial_labels)

        return output_dict["tags"]
Example #16
    def get_gradients(
            self, instances: List[Instance]
    ) -> Tuple[Dict[str, Any], Dict[str, Any]]:
        """
        Gets the gradients of the loss with respect to the model inputs.

        Parameters
        ----------
        instances: List[Instance]

        Returns
        -------
        Tuple[Dict[str, Any], Dict[str, Any]]
        The first item is a Dict of gradient entries for each input.
        The keys have the form  ``{grad_input_1: ..., grad_input_2: ... }``
        up to the number of inputs given. The second item is the model's output.

        Notes
        -----
        Takes a list of :class:`~allennlp.data.instance.Instance`s, sends these through
        the model's :func:`forward` function after registering hooks on the embedding
        layer of the model, calls :func:`backward` on the loss, and then removes the
        hooks.
        """
        embedding_gradients: List[Tensor] = []
        hooks: List[RemovableHandle] = self._register_embedding_gradient_hooks(
            embedding_gradients)

        dataset = Batch(instances)
        dataset.index_instances(self._model.vocab)
        outputs = self._model.decode(
            self._model.forward(**dataset.as_tensor_dict())  # type: ignore
        )

        loss = outputs["loss"]
        self._model.zero_grad()
        loss.backward()

        for hook in hooks:
            hook.remove()

        grad_dict = dict()
        for idx, grad in enumerate(embedding_gradients):
            key = "grad_input_" + str(idx + 1)
            grad_dict[key] = grad.detach().cpu().numpy()

        return grad_dict, outputs
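Assuming `interpreter` exposes the method above and the instances carry labels (so a loss exists), the gradients can be inspected like this:

grad_dict, outputs = interpreter.get_gradients(instances)
for key, grad in grad_dict.items():
    print(key, grad.shape)  # e.g. grad_input_1 with shape (batch, seq_len, embedding_dim)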
Example #17
    def test_end_to_end(self):
        tokenizer = WordTokenizer(word_splitter=BertBasicWordSplitter())

        #            2   3    4   3     5     6   8      9    2   14   12
        sentence1 = "the quickest quick brown fox jumped over the lazy dog"
        tokens1 = tokenizer.tokenize(sentence1)

        #            2   3     5     6   8      9    2  15 10 11 14   1
        sentence2 = "the quick brown fox jumped over the laziest lazy elmo"
        tokens2 = tokenizer.tokenize(sentence2)

        vocab = Vocabulary()

        instance1 = Instance({"tokens": TextField(tokens1, {"bert": self.token_indexer})})
        instance2 = Instance({"tokens": TextField(tokens2, {"bert": self.token_indexer})})

        batch = Batch([instance1, instance2])
        batch.index_instances(vocab)

        padding_lengths = batch.get_padding_lengths()
        tensor_dict = batch.as_tensor_dict(padding_lengths)
        tokens = tensor_dict["tokens"]

        assert tokens["bert"].tolist() == [
                [2, 3, 4, 3, 5, 6, 8, 9, 2, 14, 12, 0],
                [2, 3, 5, 6, 8, 9, 2, 15, 10, 11, 14, 1]
        ]

        assert tokens["bert-offsets"].tolist() == [
                [0, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                [0, 1, 2, 3, 4, 5, 6, 9, 10, 11]
        ]

        # No offsets, should get 12 vectors back.
        bert_vectors = self.token_embedder(tokens["bert"])
        assert list(bert_vectors.shape) == [2, 12, 12]

        # Offsets, should get 10 vectors back.
        bert_vectors = self.token_embedder(tokens["bert"], offsets=tokens["bert-offsets"])
        assert list(bert_vectors.shape) == [2, 10, 12]

        # Now try top_layer_only = True
        tlo_embedder = BertEmbedder(self.bert_model, top_layer_only=True)
        bert_vectors = tlo_embedder(tokens["bert"])
        assert list(bert_vectors.shape) == [2, 12, 12]

        bert_vectors = tlo_embedder(tokens["bert"], offsets=tokens["bert-offsets"])
        assert list(bert_vectors.shape) == [2, 10, 12]
Example #18
    def forward_on_instances(self, instances: List[Instance]) -> List[Dict[str, numpy.ndarray]]:
        """
        Takes a list of :class:`~allennlp.data.instance.Instance`s, converts that text into
        arrays using this model's :class:`Vocabulary`, passes those arrays through
        :func:`self.forward()` and :func:`self.decode()` (which by default does nothing)
        and returns the result.  Before returning the result, we convert any
        ``torch.Tensors`` into numpy arrays and separate the
        batched output into a list of individual dicts per instance. Note that typically
        this will be faster on a GPU (and conditionally, on a CPU) than repeated calls to
        :func:`forward_on_instance`.

        Parameters
        ----------
        instances : List[Instance], required
            The instances to run the model on.

        Returns
        -------
        A list of the model's output for each instance.
        """
        batch_size = len(instances)
        with torch.no_grad():
            cuda_device = self._get_prediction_device()
            dataset = Batch(instances)
            dataset.index_instances(self.vocab)
            model_input = util.move_to_device(dataset.as_tensor_dict(), cuda_device)
            outputs = self.decode(self(**model_input))

            instance_separated_output: List[Dict[str, numpy.ndarray]] = [
                {} for _ in dataset.instances
            ]
            for name, output in list(outputs.items()):
                if isinstance(output, torch.Tensor):
                    # NOTE(markn): This is a hack because 0-dim pytorch tensors are not iterable.
                    # This occurs with batch size 1, because we still want to include the loss in that case.
                    if output.dim() == 0:
                        output = output.unsqueeze(0)

                    if output.size(0) != batch_size:
                        self._maybe_warn_for_unseparable_batches(name)
                        continue
                    output = output.detach().cpu().numpy()
                elif len(output) != batch_size:
                    self._maybe_warn_for_unseparable_batches(name)
                    continue
                for instance_output, batch_element in zip(instance_separated_output, output):
                    instance_output[name] = batch_element
            return instance_separated_output
Example #19
    def get_vocab_and_both_elmo_indexed_ids(batch: List[List[str]]):
        instances = []
        indexer = ELMoTokenCharactersIndexer()
        indexer2 = SingleIdTokenIndexer()
        for sentence in batch:
            tokens = [Token(token) for token in sentence]
            field = TextField(tokens,
                              {'character_ids': indexer,
                               'tokens': indexer2})
            instance = Instance({"elmo": field})
            instances.append(instance)

        dataset = Batch(instances)
        vocab = Vocabulary.from_instances(instances)
        dataset.index_instances(vocab)
        return vocab, dataset.as_tensor_dict()["elmo"]
Example #20
    def get_vocab_and_both_elmo_indexed_ids(batch: List[List[str]]):
        instances = []
        indexer = ELMoTokenCharactersIndexer()
        indexer2 = SingleIdTokenIndexer()
        for sentence in batch:
            tokens = [Token(token) for token in sentence]
            field = TextField(tokens,
                              {'character_ids': indexer,
                               'tokens': indexer2})
            instance = Instance({"elmo": field})
            instances.append(instance)

        dataset = Batch(instances)
        vocab = Vocabulary.from_instances(instances)
        dataset.index_instances(vocab)
        return vocab, dataset.as_tensor_dict()["elmo"]
Example #21
def batch_to_ids(batch):
    """
    Given a batch (as list of tokenized sentences), return a batch
    of padded character ids.
    """
    instances = []
    indexer = ELMoTokenCharactersIndexer()
    for sentence in batch:
        tokens = [Token(token) for token in sentence]
        field = TextField(tokens, {'character_ids': indexer})
        instance = Instance({"elmo": field})
        instances.append(instance)

    dataset = Batch(instances)
    vocab = Vocabulary()
    dataset.index_instances(vocab)
    return dataset.as_tensor_dict()['elmo']['character_ids']
Example #22
    def test_end_to_end_with_higher_order_inputs(self):
        tokenizer = WordTokenizer(word_splitter=BertBasicWordSplitter())

        #            2   3    4   3     5     6   8      9    2   14   12
        sentence1 = "the quickest quick brown fox jumped over the lazy dog"
        tokens1 = tokenizer.tokenize(sentence1)
        text_field1 = TextField(tokens1, {"bert": self.token_indexer})

        #            2   3     5     6   8      9    2  15 10 11 14   1
        sentence2 = "the quick brown fox jumped over the laziest lazy elmo"
        tokens2 = tokenizer.tokenize(sentence2)
        text_field2 = TextField(tokens2, {"bert": self.token_indexer})

        #            2   5    15 10 11 6
        sentence3 = "the brown laziest fox"
        tokens3 = tokenizer.tokenize(sentence3)
        text_field3 = TextField(tokens3, {"bert": self.token_indexer})

        vocab = Vocabulary()

        instance1 = Instance({"tokens": ListField([text_field1])})
        instance2 = Instance({"tokens": ListField([text_field2, text_field3])})

        batch = Batch([instance1, instance2])
        batch.index_instances(vocab)

        padding_lengths = batch.get_padding_lengths()
        tensor_dict = batch.as_tensor_dict(padding_lengths, verbose=True)
        tokens = tensor_dict["tokens"]

        # No offsets, should get 12 vectors back.
        bert_vectors = self.token_embedder(tokens["bert"])
        assert list(bert_vectors.shape) == [2, 2, 12, 12]

        # Offsets, should get 10 vectors back.
        bert_vectors = self.token_embedder(tokens["bert"],
                                           offsets=tokens["bert-offsets"])
        assert list(bert_vectors.shape) == [2, 2, 10, 12]

        # Now try top_layer_only = True
        tlo_embedder = BertEmbedder(self.bert_model, top_layer_only=True)
        bert_vectors = tlo_embedder(tokens["bert"])
        assert list(bert_vectors.shape) == [2, 2, 12, 12]

        bert_vectors = tlo_embedder(tokens["bert"],
                                    offsets=tokens["bert-offsets"])
        assert list(bert_vectors.shape) == [2, 2, 10, 12]
Example #23
    def predict_instance(self,
                         instances: Tuple[Instance, Instance],
                         num_samples: int = 100) -> JsonDict:
        conditioning_instance, generative_instance = instances

        self._model.eval()
        self._sampler.eval()

        # Draw a sample
        sampler_output = self._sampler.sample(**conditioning_instance)
        sampler_logp = sampler_output['logp']
        sample = sampler_output['sample']

        with torch.no_grad():
            # TODO: Make this a parameter somewhere
            num_samples = 100

            # Duplicate instances (to sample in parallel)
            cuda_device = self._model._get_prediction_device()
            conditioning_batch = Batch([conditioning_instance] * num_samples)
            conditioning_batch.index_instances(self._model.vocab)
            conditioning_batch = util.move_to_device(
                conditioning_batch.as_tensor_dict(), cuda_device)

            generative_batch = Batch([generative_instance] * num_samples)
            generative_batch.index_instances(self._model.vocab)
            generative_batch = util.move_to_device(
                generative_batch.as_tensor_dict(), cuda_device)

            # Sample annotations and generate next token
            self._model._use_shortlist = True
            conditioning_output = self._model.sample(**conditioning_batch,
                                                     emit_tokens=False)
            logger.debug('clears condition generation')
            # self._model(**conditioning_output)  # Shouldn't need to do this, but just in case
            # logger.debug('clears reconditioning')
            generative_output = self._model.sample(**generative_batch,
                                                   emit_tokens=True)
            logger.debug('clears generation')
            del conditioning_batch, generative_batch

            aggregate_word_probs = self._aggregate_word_probs(
                generative_output)
            logger.debug('clears word probs')

            return aggregate_word_probs
Example #24
    def forward(self, inputs, elmo_lstm_output):
        texts = self.inputs_to_texts(inputs)
        instances = self.texts_to_instances(texts)
        dataset = Batch(instances)
        dataset.index_instances(self.model.vocab)
        cp_inputs = util.move_to_device(dataset.as_tensor_dict(),
                                        self.cuda_device)
        words, pos_tags = cp_inputs['tokens'], cp_inputs['pos_tags']

        mask = get_text_field_mask(words)

        layer_activations = elmo_lstm_output['activations']
        mask_with_bos_eos = elmo_lstm_output['mask']

        # compute the elmo representations
        representations = []
        for i in range(len(self._scalar_mixes)):
            scalar_mix = getattr(self, 'scalar_mix_{}'.format(i))
            representation_with_bos_eos = scalar_mix(layer_activations,
                                                     mask_with_bos_eos)
            if self._keep_sentence_boundaries:
                processed_representation = representation_with_bos_eos
                processed_mask = mask_with_bos_eos
            else:
                representation_without_bos_eos, mask_without_bos_eos = remove_sentence_boundaries(
                    representation_with_bos_eos, mask_with_bos_eos)
                processed_representation = representation_without_bos_eos
                processed_mask = mask_without_bos_eos
            representations.append(self._dropout(processed_representation))

        # reshape if necessary
        mask = processed_mask
        elmo_representations = representations

        embedded_text_input = elmo_representations[0]

        if pos_tags is not None and self.model.pos_tag_embedding is not None:
            embedded_pos_tags = self.model.pos_tag_embedding(pos_tags)
            embedded_text_input = torch.cat(
                [embedded_text_input, embedded_pos_tags], -1)
        elif self.model.pos_tag_embedding is not None:
            raise ConfigurationError(
                "Model uses a POS embedding, but no POS tags were passed.")

        encoded_text = self.model.encoder(embedded_text_input, mask)
        return encoded_text.detach()
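The per-layer mixing above relies on AllenNLP's ScalarMix; here is a minimal standalone sketch, with illustrative shapes standing in for real ELMo activations:

import torch
from allennlp.modules.scalar_mix import ScalarMix

layer_activations = [torch.randn(2, 7, 32) for _ in range(3)]  # three fake layers
mask = torch.ones(2, 7)
scalar_mix = ScalarMix(mixture_size=3)
mixed = scalar_mix(layer_activations, mask)  # learned softmax-weighted sum of the layers
print(mixed.shape)  # torch.Size([2, 7, 32])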
Example #25
    def _backprop(self,
                  instances: List[Instance],
                  ind: torch.Tensor,
                  register_forward_hooks: bool = True):
        embedding_gradients: List[torch.Tensor] = []
        grad_hooks: List[
            RemovableHandle] = self._register_embedding_gradient_hooks(
                embedding_gradients)

        embedding_values: List[torch.Tensor] = []
        if register_forward_hooks:
            val_hooks: List[
                RemovableHandle] = self._register_embedding_value_hooks(
                    embedding_values)

        model = self.predictor._model

        cuda_device = model._get_prediction_device()

        dataset = Batch(instances)
        dataset.index_instances(model.vocab)

        model_input = util.move_to_device(dataset.as_tensor_dict(),
                                          cuda_device)

        output = model.decode(
            model.forward(**model_input))  # type: ignore

        if self.output_getter is not None:
            output = self.output_getter(output)

        grad_out = output.data.clone()
        grad_out.fill_(0.0)
        grad_out.scatter_(1, ind.unsqueeze(0).t(), 1.0)

        model.zero_grad()
        output.backward(grad_out)

        for hook in grad_hooks:
            hook.remove()

        if register_forward_hooks:
            for hook in val_hooks:
                hook.remove()

        return embedding_values, embedding_gradients
Example #26
    def preprocess(self, token_batch):
        seq_lens = [len(sequence) for sequence in token_batch if sequence]
        if not seq_lens:
            return []
        max_len = min(max(seq_lens), self.max_len)
        batches = []
        for indexer in self.indexers:
            batch = []
            for sequence in token_batch:
                tokens = sequence[:max_len]
                tokens = [Token(token) for token in ['$START'] + tokens]
                batch.append(Instance({'tokens': TextField(tokens, indexer)}))
            batch = Batch(batch)
            batch.index_instances(self.vocab)
            batches.append(batch)

        return batches
Example #27
    def test_sliding_window(self):
        tokenizer = BertPreTokenizer()

        sentence = "the quickest quick brown fox jumped over the lazy dog"
        tokens = tokenizer.tokenize(sentence)

        vocab = Vocabulary()

        vocab_path = self.FIXTURES_ROOT / "bert" / "vocab.txt"
        token_indexer = PretrainedBertIndexer(
            str(vocab_path), truncate_long_sequences=False, max_pieces=8
        )

        config_path = self.FIXTURES_ROOT / "bert" / "config.json"
        config = BertConfig(str(config_path))
        bert_model = BertModel(config)
        token_embedder = BertEmbedder(bert_model, max_pieces=8)

        instance = Instance({"tokens": TextField(tokens, {"bert": token_indexer})})

        batch = Batch([instance])
        batch.index_instances(vocab)

        padding_lengths = batch.get_padding_lengths()
        tensor_dict = batch.as_tensor_dict(padding_lengths)
        tokens = tensor_dict["tokens"]

        # 16 = [CLS], 17 = [SEP]
        # 1 full window + 1 half window with start/end tokens
        assert tokens["bert"].tolist() == [
            [16, 2, 3, 4, 3, 5, 6, 17, 16, 3, 5, 6, 8, 9, 2, 17, 16, 8, 9, 2, 14, 12, 17]
        ]
        assert tokens["bert-offsets"].tolist() == [[1, 3, 4, 5, 6, 7, 8, 9, 10, 11]]

        bert_vectors = token_embedder(tokens["bert"])
        assert list(bert_vectors.shape) == [1, 13, 12]

        # Testing without token_type_ids
        bert_vectors = token_embedder(tokens["bert"], offsets=tokens["bert-offsets"])
        assert list(bert_vectors.shape) == [1, 10, 12]

        # Testing with token_type_ids
        bert_vectors = token_embedder(
            tokens["bert"], offsets=tokens["bert-offsets"], token_type_ids=tokens["bert-type-ids"]
        )
        assert list(bert_vectors.shape) == [1, 10, 12]
Example #28
    def test_end_to_end(self):
        tokenizer = PretrainedTransformerTokenizer(model_name="bert-base-uncased")
        token_indexer = PretrainedTransformerIndexer(model_name="bert-base-uncased")

        sentence1 = "A, AllenNLP sentence."
        tokens1 = tokenizer.tokenize(sentence1)
        expected_tokens1 = ["[CLS]", "a", ",", "allen", "##nl", "##p", "sentence", ".", "[SEP]"]
        assert [t.text for t in tokens1] == expected_tokens1

        sentence2 = "AllenNLP is great"
        tokens2 = tokenizer.tokenize(sentence2)
        expected_tokens2 = ["[CLS]", "allen", "##nl", "##p", "is", "great", "[SEP]"]
        assert [t.text for t in tokens2] == expected_tokens2

        vocab = Vocabulary()

        params = Params(
            {
                "token_embedders": {
                    "bert": {"type": "pretrained_transformer", "model_name": "bert-base-uncased"}
                },
                "embedder_to_indexer_map": {"bert": ["bert", "mask"]},
                "allow_unmatched_keys": True,
            }
        )
        token_embedder = BasicTextFieldEmbedder.from_params(vocab, params)

        instance1 = Instance({"tokens": TextField(tokens1, {"bert": token_indexer})})
        instance2 = Instance({"tokens": TextField(tokens2, {"bert": token_indexer})})

        batch = Batch([instance1, instance2])
        batch.index_instances(vocab)

        padding_lengths = batch.get_padding_lengths()
        tensor_dict = batch.as_tensor_dict(padding_lengths)
        tokens = tensor_dict["tokens"]
        max_length = max(len(tokens1), len(tokens2))

        assert tokens["bert"].shape == (2, max_length)

        assert tokens["mask"].tolist() == [[1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 0, 0]]

        # Attention mask
        bert_vectors = token_embedder(tokens)
        assert bert_vectors.size() == (2, 9, 768)
Example #29
    def test_end_to_end(self):
        tokenizer = WordTokenizer(word_splitter=BertBasicWordSplitter())

        #            2   3    4   3     5     6   8      9    2   14   12
        sentence1 = "the quickest quick brown fox jumped over the lazy dog"
        tokens1 = tokenizer.tokenize(sentence1)

        #            2   3     5     6   8      9    2  15 10 11 14   1
        sentence2 = "the quick brown fox jumped over the laziest lazy elmo"
        tokens2 = tokenizer.tokenize(sentence2)

        vocab = Vocabulary()

        instance1 = Instance({"tokens": TextField(tokens1, {"bert": self.token_indexer})})
        instance2 = Instance({"tokens": TextField(tokens2, {"bert": self.token_indexer})})

        batch = Batch([instance1, instance2])
        batch.index_instances(vocab)

        padding_lengths = batch.get_padding_lengths()
        tensor_dict = batch.as_tensor_dict(padding_lengths)
        tokens = tensor_dict["tokens"]

        # 16 = [CLS], 17 = [SEP]
        assert tokens["bert"].tolist() == [[16, 2, 3, 4, 3, 5, 6, 8, 9, 2, 14, 12, 17, 0],
                                           [16, 2, 3, 5, 6, 8, 9, 2, 15, 10, 11, 14, 1, 17]]

        assert tokens["bert-offsets"].tolist() == [[1, 3, 4, 5, 6, 7, 8, 9, 10, 11],
                                                   [1, 2, 3, 4, 5, 6, 7, 10, 11, 12]]

        # No offsets, should get 14 vectors back ([CLS] + 12 token wordpieces + [SEP])
        bert_vectors = self.token_embedder(tokens["bert"])
        assert list(bert_vectors.shape) == [2, 14, 12]

        # Offsets, should get 10 vectors back.
        bert_vectors = self.token_embedder(tokens["bert"], offsets=tokens["bert-offsets"])
        assert list(bert_vectors.shape) == [2, 10, 12]

        # Now try top_layer_only = True
        tlo_embedder = BertEmbedder(self.bert_model, top_layer_only=True)
        bert_vectors = tlo_embedder(tokens["bert"])
        assert list(bert_vectors.shape) == [2, 14, 12]

        bert_vectors = tlo_embedder(tokens["bert"], offsets=tokens["bert-offsets"])
        assert list(bert_vectors.shape) == [2, 10, 12]
Example #30
    def bert_vector(self):
        words = re.split(r'\W+', self.text)
        text = ' '.join(words)

        tokens = tokenizer.tokenize(text)

        instance = Instance(
            {"tokens": TextField(tokens, {'bert': token_indexer})})
        batch = Batch([instance])
        batch.index_instances(vocab)

        padding_lengths = batch.get_padding_lengths()
        tensor_dict = batch.as_tensor_dict(padding_lengths)

        tokens = tensor_dict["tokens"]

        bert_vectors = model(tokens["bert"])
        return bert_vectors
Example #31
def elmo(sentences):
    elmo = Elmo(options_file, weight_file, 1, dropout=0)
    elmo = elmo.cuda()
    indexer = ELMoTokenCharactersIndexer()
    instances = []
    for sentence in sentences:
        tokens = [Token(token) for token in sentence]
        field = TextField(tokens, {'character_ids': indexer})
        instance = Instance({'elmo': field})
        instances.append(instance)
    dataset = Batch(instances)
    vocab = Vocabulary()
    dataset.index_instances(vocab)

    padding_lengths = {'elmo': {'num_tokens': 15}}
    character_ids = dataset.as_tensor_dict(padding_lengths)['elmo']['character_ids']
    character_ids = character_ids.cuda()
    return elmo(character_ids)['elmo_representations']
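An illustrative call, assuming options_file and weight_file point at real ELMo files and a GPU is available; with the fixed padding above, every sentence is padded to 15 tokens.

representations = elmo([["The", "cat", "sat", "."], ["Hello", "world"]])
print(representations[0].shape)  # torch.Size([2, 15, 1024]) for the standard ELMo weights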
Example #32
    def forward(self, inputs):
        texts = self.inputs_to_texts(inputs)
        instances = self.texts_to_instances(texts)
        dataset = Batch(instances)
        dataset.index_instances(self.model.vocab)
        srl_inputs = util.move_to_device(dataset.as_tensor_dict(),
                                         self.cuda_device)
        tokens = srl_inputs['tokens']
        verb_indicator = srl_inputs['verb_indicator']
        embedded_text_input = self.model.text_field_embedder(tokens)
        mask = get_text_field_mask(tokens)
        embedded_verb_indicator = self.model.binary_feature_embedding(
            verb_indicator.long())
        embedded_text_with_verb_indicator = torch.cat(
            [embedded_text_input, embedded_verb_indicator], -1)
        encoded_text = self.model.encoder(embedded_text_with_verb_indicator,
                                          mask)

        return encoded_text.detach()
Example #33
    def test_forward_pass_runs_correctly(self):
        batch = Batch(self.instances)
        batch.index_instances(self.vocab)
        training_tensors = batch.as_tensor_dict()
        output_dict = self.model(**training_tensors)

        metrics = self.model.get_metrics(reset=True)

        # TODO: Check whether we can do something similar:
        # We've set up the data such that there's a fake answer that consists of the whole
        # paragraph.  _Any_ valid prediction for that question should produce an F1 of greater than
        # zero, while if we somehow haven't been able to load the evaluation data, or there was an
        # error with using the evaluation script, this will fail.  This makes sure that we've
        # loaded the evaluation data correctly and have hooked things up to the official evaluation
        # script.
        # assert metrics['f1'] > 0

        span_start_probs = output_dict['span_start_probs'][0].data.numpy()
        assert_almost_equal(numpy.sum(span_start_probs, -1), 1, decimal=6)
Example #34
    def forward_on_instances(
            self, instances: List[Instance],
            cuda_device: int) -> List[Dict[str, numpy.ndarray]]:
        """
        Takes a list of :class:`~allennlp.data.instance.Instance`s, converts that text into
        arrays using this model's :class:`Vocabulary`, passes those arrays through
        :func:`self.forward()` and :func:`self.decode()` (which by default does nothing)
        and returns the result.  Before returning the result, we convert any
        ``torch.autograd.Variables`` or ``torch.Tensors`` into numpy arrays and separate the
        batched output into a list of individual dicts per instance. Note that typically
        this will be faster on a GPU (and conditionally, on a CPU) than repeated calls to
        :func:`forward_on_instance`.
        """
        model_input = {}
        dataset = Batch(instances)
        dataset.index_instances(self.vocab)
        model_input.update({
            'raw':
            dataset.as_tensor_dict(cuda_device=cuda_device, for_training=False)
        })
        #extend
        extend_vocab = Vocabulary.from_instances(dataset.instances)
        self.vocab.extend_from(extend_vocab)
        dataset.index_instances(self.vocab)
        model_input.update({
            'extended':
            dataset.as_tensor_dict(cuda_device=cuda_device, for_training=False)
        })
        #input
        model_input.update({'predict': True})
        outputs = self.decode(self(**model_input))

        instance_separated_output: List[Dict[str, numpy.ndarray]] = [
            {} for _ in dataset.instances
        ]
        for name, output in list(outputs.items()):
            if isinstance(output, torch.autograd.Variable):
                output = output.data.cpu().numpy()
            outputs[name] = output
            for instance_output, batch_element in zip(
                    instance_separated_output, output):
                instance_output[name] = batch_element
        return instance_separated_output
Example #35
    def forward(self, inputs):
        texts = self.inputs_to_texts(inputs)
        instances = self.texts_to_instances(texts)
        dataset = Batch(instances)
        dataset.index_instances(self.model.vocab)
        cp_inputs = util.move_to_device(dataset.as_tensor_dict(),
                                        self.cuda_device)
        words, pos_tags = cp_inputs['tokens'], cp_inputs['pos_tags']
        embedded_text_input = self.model.text_field_embedder(words)
        if pos_tags is not None and self.model.pos_tag_embedding is not None:
            embedded_pos_tags = self.model.pos_tag_embedding(pos_tags)
            embedded_text_input = torch.cat(
                [embedded_text_input, embedded_pos_tags], -1)
        elif self.model.pos_tag_embedding is not None:
            raise ConfigurationError(
                "Model uses a POS embedding, but no POS tags were passed.")
        mask = get_text_field_mask(words)
        encoded_text = self.model.encoder(embedded_text_input, mask)
        return encoded_text.detach()
Example #36
    def test_sliding_window_with_batch(self):
        tokenizer = BertPreTokenizer()

        sentence = "the quickest quick brown fox jumped over the lazy dog"
        tokens = tokenizer.tokenize(sentence)

        vocab = Vocabulary()

        vocab_path = self.FIXTURES_ROOT / "bert" / "vocab.txt"
        token_indexer = PretrainedBertIndexer(str(vocab_path),
                                              truncate_long_sequences=False,
                                              max_pieces=8)

        config_path = self.FIXTURES_ROOT / "bert" / "config.json"
        config = BertConfig(str(config_path))
        bert_model = BertModel(config)
        token_embedder = BertEmbedder(bert_model, max_pieces=8)

        instance = Instance(
            {"tokens": TextField(tokens, {"bert": token_indexer})})
        instance2 = Instance({
            "tokens":
            TextField(tokens + tokens + tokens, {"bert": token_indexer})
        })

        batch = Batch([instance, instance2])
        batch.index_instances(vocab)

        padding_lengths = batch.get_padding_lengths()
        tensor_dict = batch.as_tensor_dict(padding_lengths)
        tokens = tensor_dict["tokens"]

        # Testing without token_type_ids
        bert_vectors = token_embedder(tokens["bert"],
                                      offsets=tokens["bert-offsets"])
        assert bert_vectors is not None

        # Testing with token_type_ids
        bert_vectors = token_embedder(tokens["bert"],
                                      offsets=tokens["bert-offsets"],
                                      token_type_ids=tokens["bert-type-ids"])
        assert bert_vectors is not None
Example #37
    def test_read(self, lazy):
        reader = SnliReader(
            tokenizer=WordTokenizer(word_splitter=BertBasicWordSplitter()),
            token_indexers={
                'bert':
                PretrainedBertIndexer(pretrained_model=self.BERT_VOCAB_PATH)
            },
        )

        instances = reader.read(
            str(self.FIXTURES_ROOT / 'snli_1.0_sample.jsonl'))
        instances = ensure_list(instances)
        example = instances[0]
        tokens = [t.text for t in example.fields['tokens'].tokens]
        label = example.fields['label'].label
        weight = example.fields['weight'].weight
        assert label == 'neutral'
        assert weight == 1
        assert instances[1].fields['weight'].weight == 0.5
        assert instances[2].fields['weight'].weight == 1
        assert tokens == [
            'a', 'person', 'on', 'a', 'horse', 'jumps', 'over', 'a', 'broken',
            'down', 'airplane', '.', '[SEP]', 'a', 'person', 'is', 'training',
            'his', 'horse', 'for', 'a', 'competition', '.'
        ]
        batch = Batch(instances)
        vocab = Vocabulary.from_instances(instances)
        batch.index_instances(vocab)

        padding_lengths = batch.get_padding_lengths()
        tensor_dict = batch.as_tensor_dict(padding_lengths)
        tokens = tensor_dict["tokens"]

        print(tokens['mask'].tolist()[0])
        print(tokens["bert"].tolist()[0])
        print([
            vocab.get_token_from_index(i, "bert")
            for i in tokens["bert"].tolist()[0]
        ])
        print(len(tokens['bert'][0]))
        print(tokens["bert-offsets"].tolist()[0])
        print(tokens['bert-type-ids'].tolist()[0])
Example #38
    def test_max_length(self):
        config = BertConfig(len(self.token_indexer.vocab))
        model = BertModel(config)
        embedder = BertEmbedder(model)

        tokenizer = WordTokenizer(word_splitter=BertBasicWordSplitter())
        sentence = "the " * 1000
        tokens = tokenizer.tokenize(sentence)

        vocab = Vocabulary()

        instance = Instance({"tokens": TextField(tokens, {"bert": self.token_indexer})})

        batch = Batch([instance])
        batch.index_instances(vocab)

        padding_lengths = batch.get_padding_lengths()
        tensor_dict = batch.as_tensor_dict(padding_lengths)
        tokens = tensor_dict["tokens"]
        embedder(tokens["bert"], tokens["bert-offsets"])
Example #39
class DialogQATest(ModelTestCase):
    def setUp(self):
        super().setUp()
        self.set_up_model(self.FIXTURES_ROOT / 'dialog_qa' / 'experiment.json',
                          self.FIXTURES_ROOT / 'data' / 'quac_sample.json')
        self.batch = Batch(self.instances)
        self.batch.index_instances(self.vocab)

    def test_forward_pass_runs_correctly(self):
        training_tensors = self.batch.as_tensor_dict()
        output_dict = self.model(**training_tensors)
        assert "best_span_str" in output_dict and "loss" in output_dict
        assert "followup" in output_dict and "yesno" in output_dict

    def test_model_can_train_save_and_load(self):
        self.ensure_model_can_train_save_and_load(self.param_file,
                                                  tolerance=1e-4)

    def test_batch_predictions_are_consistent(self):
        self.ensure_batch_predictions_are_consistent()
Example #40
    def test_max_length(self):
        config = BertConfig(len(self.token_indexer.vocab))
        model = BertModel(config)
        embedder = BertEmbedder(model)

        tokenizer = WordTokenizer(word_splitter=BertBasicWordSplitter())
        sentence = "the " * 1000
        tokens = tokenizer.tokenize(sentence)

        vocab = Vocabulary()

        instance = Instance({"tokens": TextField(tokens, {"bert": self.token_indexer})})

        batch = Batch([instance])
        batch.index_instances(vocab)

        padding_lengths = batch.get_padding_lengths()
        tensor_dict = batch.as_tensor_dict(padding_lengths)
        tokens = tensor_dict["tokens"]
        embedder(tokens["bert"], tokens["bert-offsets"])
Example #41
    def forward_on_instances(
            self, instances: List[Instance]) -> List[Dict[str, numpy.ndarray]]:
        """
        Takes a list of :class:`~allennlp.data.instance.Instance`s, converts that text into
        arrays using this model's :class:`Vocabulary`, passes those arrays through
        :func:`self.forward()` and :func:`self.decode()` (which by default does nothing)
        and returns the result.  Before returning the result, we convert any
        ``torch.Tensors`` into numpy arrays and separate the
        batched output into a list of individual dicts per instance. Note that typically
        this will be faster on a GPU (and conditionally, on a CPU) than repeated calls to
        :func:`forward_on_instance`.

        Parameters
        ----------
        instances : List[Instance], required
            The instances to run the model on.

        Returns
        -------
        A list of the model's output for each instance.
        """
        with torch.no_grad():
            cuda_device = self._get_prediction_device()
            dataset = Batch(instances)
            dataset.index_instances(self.vocab)
            model_input = dataset.as_tensor_dict(cuda_device=cuda_device)
            outputs = self.decode(self(**model_input))

            instance_separated_output: List[Dict[str, numpy.ndarray]] = [
                {} for _ in dataset.instances
            ]
            for name, output in list(outputs.items()):
                if isinstance(output, torch.Tensor):
                    output = output.detach().cpu().numpy()
                outputs[name] = output
                for instance_output, batch_element in zip(
                        instance_separated_output, output):
                    instance_output[name] = batch_element
            return instance_separated_output
Example #42
def predict(model, dataset_reader, test_file, output_file, cuda_device):
    gold_test_data = load_json(test_file)
    instances = dataset_reader.read(test_file)
    batch = Batch(instances)
    batch.index_instances(model.vocab)
    iterator = BatchIterator()
    iterator._batch_size = 5
    # For long documents, loop over batches of sentences. Keep track of the
    # total length and append onto the end of the predictions for each sentence
    # batch.
    assert len(gold_test_data) == 1
    gold_data = gold_test_data[0]
    predictions = {}
    total_length = 0
    for sents in iterator(batch.instances, num_epochs=1, shuffle=False):
        sents = nn_util.move_to_device(sents, cuda_device)  # Put on GPU.
        sentence_lengths = [
            len(entry["sentence"]) for entry in sents["metadata"]
        ]
        sentence_starts = np.cumsum(sentence_lengths) + total_length
        sentence_starts = np.roll(sentence_starts, 1)
        sentence_starts[0] = total_length
        pred = model(**sents)
        decoded = model.decode(pred)
        if total_length == 0:
            for k, v in decoded.items():
                predictions[decode_names[k]] = cleanup(k, v[decode_fields[k]],
                                                       sentence_starts)
        else:
            for k, v in decoded.items():
                predictions[decode_names[k]] += cleanup(
                    k, v[decode_fields[k]], sentence_starts)
        total_length += sum(sentence_lengths)

    res = {}
    res.update(gold_data)
    res.update(predictions)
    check_lengths(res)
    encoded = json.dumps(res, default=int)
    with open(output_file, "w") as f:
        f.write(encoded + "\n")
Example #43
    def test_padding_for_equal_length_indices(self):
        tokenizer = WordTokenizer(word_splitter=BertBasicWordSplitter())

        #            2   3     5     6   8      9    2   14   12
        sentence = "the quick brown fox jumped over the lazy dog"
        tokens = tokenizer.tokenize(sentence)

        vocab = Vocabulary()

        instance = Instance({"tokens": TextField(tokens, {"bert": self.token_indexer})})

        batch = Batch([instance])
        batch.index_instances(vocab)

        padding_lengths = batch.get_padding_lengths()
        tensor_dict = batch.as_tensor_dict(padding_lengths)
        tokens = tensor_dict["tokens"]

        assert tokens["bert"].tolist() == [[16, 2, 3, 5, 6, 8, 9, 2, 14, 12, 17]]

        assert tokens["bert-offsets"].tolist() == [[1, 2, 3, 4, 5, 6, 7, 8, 9]]
Example #44
    def test_forward_pass_runs_correctly(self):
        batch = Batch(self.instances)
        batch.index_instances(self.vocab)
        training_tensors = batch.as_tensor_dict()
        output_dict = self.model(**training_tensors)

        metrics = self.model.get_metrics(reset=True)
        # We've set up the data such that there's a fake answer that consists of the whole
        # paragraph.  _Any_ valid prediction for that question should produce an F1 of greater than
        # zero, while if we somehow haven't been able to load the evaluation data, or there was an
        # error with using the evaluation script, this will fail.  This makes sure that we've
        # loaded the evaluation data correctly and have hooked things up to the official evaluation
        # script.
        assert metrics['f1'] > 0

        span_start_probs = output_dict['span_start_probs'][0].data.numpy()
        span_end_probs = output_dict['span_end_probs'][0].data.numpy()
        assert_almost_equal(numpy.sum(span_start_probs, -1), 1, decimal=6)
        assert_almost_equal(numpy.sum(span_end_probs, -1), 1, decimal=6)
        span_start, span_end = tuple(output_dict['best_span'][0].data.numpy())
        assert span_start >= 0
        assert span_start <= span_end
        assert span_end < self.instances[0].fields['passage'].sequence_length()
        assert isinstance(output_dict['best_span_str'][0], str)
Ejemplo n.º 45
0
class ModelTestCase(AllenNlpTestCase):
    """
    A subclass of :class:`~allennlp.common.testing.test_case.AllenNlpTestCase`
    with added methods for testing :class:`~allennlp.models.model.Model` subclasses.
    """
    def set_up_model(self, param_file, dataset_file):
        # pylint: disable=attribute-defined-outside-init
        self.param_file = param_file
        params = Params.from_file(self.param_file)

        reader = DatasetReader.from_params(params['dataset_reader'])
        instances = reader.read(dataset_file)
        vocab = Vocabulary.from_instances(instances)
        self.vocab = vocab
        self.instances = instances
        self.model = Model.from_params(self.vocab, params['model'])

        # TODO(joelgrus) get rid of these
        # (a lot of the model tests use them, so they'll have to be changed)
        self.dataset = Batch(self.instances)
        self.dataset.index_instances(self.vocab)

    def ensure_model_can_train_save_and_load(self,
                                             param_file: str,
                                             tolerance: float = 1e-4,
                                             cuda_device: int = -1):
        save_dir = os.path.join(self.TEST_DIR, "save_and_load_test")
        archive_file = os.path.join(save_dir, "model.tar.gz")
        model = train_model_from_file(param_file, save_dir)
        loaded_model = load_archive(archive_file, cuda_device=cuda_device).model
        state_keys = model.state_dict().keys()
        loaded_state_keys = loaded_model.state_dict().keys()
        assert state_keys == loaded_state_keys
        # First we make sure that the state dict (the parameters) are the same for both models.
        for key in state_keys:
            assert_allclose(model.state_dict()[key].cpu().numpy(),
                            loaded_model.state_dict()[key].cpu().numpy(),
                            err_msg=key)
        params = Params.from_file(param_file)
        reader = DatasetReader.from_params(params['dataset_reader'])

        # Need to duplicate params because Iterator.from_params will consume.
        iterator_params = params['iterator']
        iterator_params2 = Params(copy.deepcopy(iterator_params.as_dict()))

        iterator = DataIterator.from_params(iterator_params)
        iterator2 = DataIterator.from_params(iterator_params2)

        # We'll check that even if we index the dataset with each model separately, we still get
        # the same result out.
        model_dataset = reader.read(params['validation_data_path'])
        iterator.index_with(model.vocab)
        model_batch = next(iterator(model_dataset, shuffle=False, cuda_device=cuda_device))

        loaded_dataset = reader.read(params['validation_data_path'])
        iterator2.index_with(loaded_model.vocab)
        loaded_batch = next(iterator2(loaded_dataset, shuffle=False, cuda_device=cuda_device))

        # Check that gradients are None for non-trainable parameters and that
        # trainable parameters receive some gradient.
        self.check_model_computes_gradients_correctly(model, model_batch)

        # The datasets themselves should be identical.
        assert model_batch.keys() == loaded_batch.keys()
        for key in model_batch.keys():
            self.assert_fields_equal(model_batch[key], loaded_batch[key], key, 1e-6)

        # Set eval mode, to turn off things like dropout, then get predictions.
        model.eval()
        loaded_model.eval()
        # Models with stateful RNNs need their states reset to have consistent
        # behavior after loading.
        for model_ in [model, loaded_model]:
            for module in model_.modules():
                if hasattr(module, 'stateful') and module.stateful:
                    module.reset_states()
        model_predictions = model(**model_batch)
        loaded_model_predictions = loaded_model(**loaded_batch)

        # Check loaded model's loss exists and we can compute gradients, for continuing training.
        loaded_model_loss = loaded_model_predictions["loss"]
        assert loaded_model_loss is not None
        loaded_model_loss.backward()

        # Both outputs should have the same keys and the values for these keys should be close.
        for key in model_predictions.keys():
            self.assert_fields_equal(model_predictions[key],
                                     loaded_model_predictions[key],
                                     name=key,
                                     tolerance=tolerance)

        return model, loaded_model

    def assert_fields_equal(self, field1, field2, name: str, tolerance: float = 1e-6) -> None:
        if isinstance(field1, torch.autograd.Variable):
            assert_allclose(field1.data.cpu().numpy(),
                            field2.data.cpu().numpy(),
                            rtol=tolerance,
                            err_msg=name)
        elif isinstance(field1, dict):
            assert field1.keys() == field2.keys()
            for key in field1:
                self.assert_fields_equal(field1[key],
                                         field2[key],
                                         tolerance=tolerance,
                                         name=name + '.' + key)
        elif isinstance(field1, (list, tuple)):
            assert len(field1) == len(field2)
            for i, (subfield1, subfield2) in enumerate(zip(field1, field2)):
                self.assert_fields_equal(subfield1,
                                         subfield2,
                                         tolerance=tolerance,
                                         name=name + f"[{i}]")
        else:
            assert field1 == field2

    @staticmethod
    def check_model_computes_gradients_correctly(model, model_batch):
        model.zero_grad()
        result = model(**model_batch)
        result["loss"].backward()
        has_zero_or_none_grads = {}
        for name, parameter in model.named_parameters():
            zeros = torch.zeros(parameter.size())
            if parameter.requires_grad:

                if parameter.grad is None:
                    has_zero_or_none_grads[name] = "No gradient computed (i.e. parameter.grad is None)"
                # Some parameters will only be partially updated,
                # like embeddings, so we just check that any gradient is non-zero.
                elif (parameter.grad.data.cpu() == zeros).all():
                    has_zero_or_none_grads[name] = f"zeros with shape ({tuple(parameter.grad.size())})"
            else:
                assert parameter.grad is None

        if has_zero_or_none_grads:
            for name, grad in has_zero_or_none_grads.items():
                print(f"Parameter: {name} had incorrect gradient: {grad}")
            raise Exception("Incorrect gradients found. See stdout for more info.")

    def ensure_batch_predictions_are_consistent(self):
        self.model.eval()
        single_predictions = []
        for i, instance in enumerate(self.instances):
            dataset = Batch([instance])
            tensors = dataset.as_tensor_dict(dataset.get_padding_lengths(), for_training=False)
            result = self.model(**tensors)
            single_predictions.append(result)
        full_dataset = Batch(self.instances)
        batch_tensors = full_dataset.as_tensor_dict(full_dataset.get_padding_lengths(), for_training=False)
        batch_predictions = self.model(**batch_tensors)
        for i, instance_predictions in enumerate(single_predictions):
            for key, single_predicted in instance_predictions.items():
                tolerance = 1e-6
                if key == 'loss':
                    # Loss is particularly unstable; we'll just be satisfied if everything else is
                    # close.
                    continue
                single_predicted = single_predicted[0]
                batch_predicted = batch_predictions[key][i]
                if isinstance(single_predicted, torch.autograd.Variable):
                    if single_predicted.size() != batch_predicted.size():
                        slices = tuple(slice(0, size) for size in single_predicted.size())
                        batch_predicted = batch_predicted[slices]
                    assert_allclose(single_predicted.data.numpy(),
                                    batch_predicted.data.numpy(),
                                    atol=tolerance,
                                    err_msg=key)
                else:
                    assert single_predicted == batch_predicted, key
Ejemplo n.º 46
0
    def test_padding_lengths_uses_max_instance_lengths(self):
        dataset = Batch(self.instances)
        dataset.index_instances(self.vocab)
        padding_lengths = dataset.get_padding_lengths()
        assert padding_lengths == {"text1": {"num_tokens": 5, "tokens_length": 5},
                                   "text2": {"num_tokens": 6, "tokens_length": 6}}

    def _get_training_tensors(self):
        dataset = Batch(self.instances)
        dataset.index_instances(self.vocab)
        return dataset.as_tensor_dict()
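A self-contained sketch of the same behavior, using the AllenNLP 0.x names seen elsewhere in these examples (the exact keys in the returned dict vary by version):

from allennlp.data import Instance, Vocabulary
from allennlp.data.dataset import Batch
from allennlp.data.fields import TextField
from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.tokenizers import Token

indexers = {"tokens": SingleIdTokenIndexer()}
short = Instance({"text1": TextField([Token(t) for t in "a b c".split()], indexers)})
longer = Instance({"text1": TextField([Token(t) for t in "a b c d e".split()], indexers)})
dataset = Batch([short, longer])
dataset.index_instances(Vocabulary.from_instances([short, longer]))
print(dataset.get_padding_lengths())   # "num_tokens" is 5, the max over instances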
Ejemplo n.º 48
0
class ModelTestCase(AllenNlpTestCase):
    """
    A subclass of :class:`~allennlp.common.testing.test_case.AllenNlpTestCase`
    with added methods for testing :class:`~allennlp.models.model.Model` subclasses.
    """
    def set_up_model(self, param_file, dataset_file):
        # pylint: disable=attribute-defined-outside-init
        self.param_file = param_file
        params = Params.from_file(self.param_file)

        reader = DatasetReader.from_params(params['dataset_reader'])
        # The dataset reader might be lazy, but a lazy list here breaks some of our tests.
        instances = list(reader.read(dataset_file))
        # Use parameters for vocabulary if they are present in the config file, so that choices like
        # "non_padded_namespaces", "min_count" etc. can be set if needed.
        if 'vocabulary' in params:
            vocab_params = params['vocabulary']
            vocab = Vocabulary.from_params(params=vocab_params, instances=instances)
        else:
            vocab = Vocabulary.from_instances(instances)
        self.vocab = vocab
        self.instances = instances
        self.model = Model.from_params(vocab=self.vocab, params=params['model'])

        # TODO(joelgrus) get rid of these
        # (a lot of the model tests use them, so they'll have to be changed)
        self.dataset = Batch(self.instances)
        self.dataset.index_instances(self.vocab)

    def ensure_model_can_train_save_and_load(self,
                                             param_file: str,
                                             tolerance: float = 1e-4,
                                             cuda_device: int = -1,
                                             gradients_to_ignore: Set[str] = None,
                                             overrides: str = ""):
        """
        Parameters
        ----------
        param_file : ``str``
            Path to a training configuration file that we will use to train the model for this
            test.
        tolerance : ``float``, optional (default=1e-4)
            When comparing model predictions between the originally-trained model and the model
            after saving and loading, we will use this tolerance value (passed as ``rtol`` to
            ``numpy.testing.assert_allclose``).
        cuda_device : ``int``, optional (default=-1)
            The device to run the test on.
        gradients_to_ignore : ``Set[str]``, optional (default=None)
            This test runs a gradient check to make sure that we're actually computing gradients
            for all of the parameters in the model.  If you really want to ignore certain
            parameters when doing that check, you can pass their names here.  This is not
            recommended unless you're `really` sure you don't need to have non-zero gradients for
            those parameters (e.g., some of the beam search / state machine models have
            infrequently-used parameters that are hard to force the model to use in a small test).
        overrides : ``str``, optional (default = "")
            A JSON string that we will use to override values in the input parameter file.
        """
        save_dir = self.TEST_DIR / "save_and_load_test"
        archive_file = save_dir / "model.tar.gz"
        model = train_model_from_file(param_file, save_dir, overrides=overrides)
        loaded_model = load_archive(archive_file, cuda_device=cuda_device).model
        state_keys = model.state_dict().keys()
        loaded_state_keys = loaded_model.state_dict().keys()
        assert state_keys == loaded_state_keys
        # First we make sure that the state dict (the parameters) are the same for both models.
        for key in state_keys:
            assert_allclose(model.state_dict()[key].cpu().numpy(),
                            loaded_model.state_dict()[key].cpu().numpy(),
                            err_msg=key)
        params = Params.from_file(param_file)
        reader = DatasetReader.from_params(params['dataset_reader'])

        # Need to duplicate params because Iterator.from_params will consume.
        iterator_params = params['iterator']
        iterator_params2 = Params(copy.deepcopy(iterator_params.as_dict()))

        iterator = DataIterator.from_params(iterator_params)
        iterator2 = DataIterator.from_params(iterator_params2)

        # We'll check that even if we index the dataset with each model separately, we still get
        # the same result out.
        model_dataset = reader.read(params['validation_data_path'])
        iterator.index_with(model.vocab)
        model_batch = next(iterator(model_dataset, shuffle=False))

        loaded_dataset = reader.read(params['validation_data_path'])
        iterator2.index_with(loaded_model.vocab)
        loaded_batch = next(iterator2(loaded_dataset, shuffle=False))

        # Check that gradients are None for non-trainable parameters and that
        # trainable parameters receive some gradient.
        self.check_model_computes_gradients_correctly(model, model_batch, gradients_to_ignore)

        # The datasets themselves should be identical.
        assert model_batch.keys() == loaded_batch.keys()
        for key in model_batch.keys():
            self.assert_fields_equal(model_batch[key], loaded_batch[key], key, 1e-6)

        # Set eval mode, to turn off things like dropout, then get predictions.
        model.eval()
        loaded_model.eval()
        # Models with stateful RNNs need their states reset to have consistent
        # behavior after loading.
        for model_ in [model, loaded_model]:
            for module in model_.modules():
                if hasattr(module, 'stateful') and module.stateful:
                    module.reset_states()
        model_predictions = model(**model_batch)
        loaded_model_predictions = loaded_model(**loaded_batch)

        # Check loaded model's loss exists and we can compute gradients, for continuing training.
        loaded_model_loss = loaded_model_predictions["loss"]
        assert loaded_model_loss is not None
        loaded_model_loss.backward()

        # Both outputs should have the same keys and the values for these keys should be close.
        for key in model_predictions.keys():
            self.assert_fields_equal(model_predictions[key],
                                     loaded_model_predictions[key],
                                     name=key,
                                     tolerance=tolerance)

        return model, loaded_model

    def assert_fields_equal(self, field1, field2, name: str, tolerance: float = 1e-6) -> None:
        if isinstance(field1, torch.Tensor):
            assert_allclose(field1.detach().cpu().numpy(),
                            field2.detach().cpu().numpy(),
                            rtol=tolerance,
                            err_msg=name)
        elif isinstance(field1, dict):
            assert field1.keys() == field2.keys()
            for key in field1:
                self.assert_fields_equal(field1[key],
                                         field2[key],
                                         tolerance=tolerance,
                                         name=name + '.' + str(key))
        elif isinstance(field1, (list, tuple)):
            assert len(field1) == len(field2)
            for i, (subfield1, subfield2) in enumerate(zip(field1, field2)):
                self.assert_fields_equal(subfield1,
                                         subfield2,
                                         tolerance=tolerance,
                                         name=name + f"[{i}]")
        elif isinstance(field1, (float, int)):
            assert_allclose([field1], [field2], rtol=tolerance, err_msg=name)
        else:
            if field1 != field2:
                for key in field1.__dict__:
                    print(key, getattr(field1, key) == getattr(field2, key))
            assert field1 == field2, f"{name}, {type(field1)}, {type(field2)}"

    @staticmethod
    def check_model_computes_gradients_correctly(model: Model,
                                                 model_batch: Dict[str, Union[Any, Dict[str, Any]]],
                                                 params_to_ignore: Set[str] = None):
        print("Checking gradients")
        model.zero_grad()
        result = model(**model_batch)
        result["loss"].backward()
        has_zero_or_none_grads = {}
        for name, parameter in model.named_parameters():
            zeros = torch.zeros(parameter.size())
            if params_to_ignore and name in params_to_ignore:
                continue
            if parameter.requires_grad:

                if parameter.grad is None:
                    has_zero_or_none_grads[name] = "No gradient computed (i.e. parameter.grad is None)"

                elif parameter.grad.is_sparse or parameter.grad.data.is_sparse:
                    pass

                # Some parameters will only be partially updated,
                # like embeddings, so we just check that any gradient is non-zero.
                elif (parameter.grad.cpu() == zeros).all():
                    has_zero_or_none_grads[name] = f"zeros with shape ({tuple(parameter.grad.size())})"
            else:
                assert parameter.grad is None

        if has_zero_or_none_grads:
            for name, grad in has_zero_or_none_grads.items():
                print(f"Parameter: {name} had incorrect gradient: {grad}")
            raise Exception("Incorrect gradients found. See stdout for more info.")

    def ensure_batch_predictions_are_consistent(self):
        self.model.eval()
        single_predictions = []
        for i, instance in enumerate(self.instances):
            dataset = Batch([instance])
            tensors = dataset.as_tensor_dict(dataset.get_padding_lengths())
            result = self.model(**tensors)
            single_predictions.append(result)
        full_dataset = Batch(self.instances)
        batch_tensors = full_dataset.as_tensor_dict(full_dataset.get_padding_lengths())
        batch_predictions = self.model(**batch_tensors)
        for i, instance_predictions in enumerate(single_predictions):
            for key, single_predicted in instance_predictions.items():
                tolerance = 1e-6
                if 'loss' in key:
                    # Loss is particularly unstable; we'll just be satisfied if everything else is
                    # close.
                    continue
                single_predicted = single_predicted[0]
                batch_predicted = batch_predictions[key][i]
                if isinstance(single_predicted, torch.Tensor):
                    if single_predicted.size() != batch_predicted.size():
                        slices = tuple(slice(0, size) for size in single_predicted.size())
                        batch_predicted = batch_predicted[slices]
                    assert_allclose(single_predicted.data.numpy(),
                                    batch_predicted.data.numpy(),
                                    atol=tolerance,
                                    err_msg=key)
                else:
                    assert single_predicted == batch_predicted, key
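A typical way a concrete model test would use this class (the class name and fixture paths here are hypothetical):

class MyTaggerTest(ModelTestCase):
    def setUp(self):
        super().setUp()
        self.set_up_model("tests/fixtures/my_tagger/experiment.json",   # hypothetical path
                          "tests/fixtures/my_tagger/data.json")         # hypothetical path

    def test_model_can_train_save_and_load(self):
        self.ensure_model_can_train_save_and_load(self.param_file)

    def test_batch_predictions_are_consistent(self):
        self.ensure_batch_predictions_are_consistent()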
Ejemplo n.º 49
0
    ## Vocabulary ##
    vocab = model.vocab 

    """
    ############  Propagate an instance text #############
    """
    instance = dataset_reader.text_to_instance("What kind of test succeeded on its first attempt?", 
                                               "One time I was writing a unit test, and it succeeded on the first attempt.", 
                                               char_spans=[(6, 10)])
    
    print ("Keys instance: ", instance.fields.keys())
    
    # Batch intances and convert to index using the vocabulary.
    instances = [instance]
    dataset = Batch(instances)
    dataset.index_instances(model.vocab)
    
    # Create the index tensor from the vocabulary.
    cuda_device = model._get_prediction_device()
    model_input = dataset.as_tensor_dict(cuda_device=cuda_device)
    
    # Propagate the sample and obtain the loss (since we passed labels)
    outputs = model(**model_input)
    outputs["loss"].requires_grad

Ejemplo n.º 50
0
    ## We tokenize every word. 
    field = get_ELMO_text_field(sentence, indexer, tokenizer)
    instance = Instance({"elmo": field})
    print("Fields in instance: ", instance.fields)
    instances.append(instance)


### Create a batch of the instances 
dataset = Batch(instances)

## Create an empty vocabulary! We do not need to build one from the dataset;
# the character indexer computes the ids itself.
vocab = Vocabulary()

## Index the instances in the batch; ELMo will consume these indices later.
dataset.index_instances(vocab)

"""
IMPORTANT: The ELMO uses just a character vocab in the interface.
It will compute the rest internally!

The ELMO words are padded to length 50 !
"""

character_ids = dataset.as_tensor_dict()['elmo']['character_ids']
print ("Shape of characters ids: ", character_ids.shape)
print ("Batch size: ", character_ids.shape[0])
print ("Maximum num words in batch: ", character_ids.shape[1])
print ("Maximum word length in dictionary: ", character_ids.shape[2])
#character_ids = batch_to_ids(sentences)
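The commented-out batch_to_ids line is AllenNLP's one-call shortcut for all of the above; a minimal sketch (sentences are pre-tokenized word lists):

from allennlp.modules.elmo import batch_to_ids

sentences = [["First", "sentence", "here"], ["Another", "one"]]
character_ids = batch_to_ids(sentences)   # shape (2, 3, 50): batch, words, chars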
Ejemplo n.º 51
0
    if create_video_training:
        pf.create_image_weights_epoch(model, video_fotograms_folder2, i)
        pf.create_Bayesian_analysis_charts_simplified(model, train_dataset, validation_dataset,
                                                      tr_data_loss, val_data_loss, KL_loss,
                                                      video_fotograms_folder4, i+1)

#            output = model(tensor_dict["text_field"], tensor_dict["tags_field"])
#            loss = output["loss"]  # We can get the loss because we gave the labels as
#                                   # input; from it we can compute gradients and everything.

"""
############## Use the trained model ######################
We use an already-implemented predictor that wraps the model together with its data preprocessing.
"""

name_exmaple = "Eat my motherfucking jeans"
name_exmaple = "Carlos Sanchez"
tokens_list = [name_exmaple[i] for i in range(len(name_exmaple))]
Instance_test = reader.generate_instance(tokens_list,None)
batch = Batch([Instance_test])
batch.index_instances(vocab)

padding_lengths = batch.get_padding_lengths()
tensor_dict = batch.as_tensor_dict(padding_lengths)

model.eval()
tag_logits = model(tensor_dict["text_field"])['tag_logits'].detach().cpu().numpy()
tag_ids = np.argmax(tag_logits, axis=-1)
print([model.vocab.get_token_from_index(i, 'tags_country') for i in tag_ids])