def test_read_from_dir(self):
        """Read from multiple files in a directory and check the instances and batched tensors."""
        question_dir = "test_fixtures/vision/gqa/question_dir/"
        instances = list(self.reader.read(question_dir))
        assert len(instances) == 2

        second = instances[1]
        assert len(second.fields) == 6
        assert len(second["question"]) == 10
        expected_tokens = "Does the table below the water look wooden and round?".split(" ")
        assert [token.text for token in second["question"]] == expected_tokens
        assert second["labels"][0].label == "yes"

        batch = Batch(instances)
        batch.index_instances(Vocabulary())
        tensors = batch.as_tensor_dict()

        # Every size starts with (batch size, num boxes (fake)); the trailing
        # entry, when present, is the fake feature size or the 4 box coordinates.
        expected_sizes = (
            ("box_features", (2, 2, 10)),
            ("box_coordinates", (2, 2, 4)),
            ("box_mask", (2, 2)),
        )
        for name, size in expected_sizes:
            assert tensors[name].size() == size
Beispiel #2
0
    def predict_instance(self, instance):
        """
        Run the model on one instance and return its prediction as JSON.

        An instance is an entire document, represented as a list of sentences.

        If the forward pass raises a ``RuntimeError`` (for example when the GPU
        runs out of memory), a warning is issued and the document's own metadata
        is returned as JSON with ``"_FAILED_PREDICTION"`` set to ``True``. This
        way prediction does not grind to a halt on a single failing document.
        """
        model = self._model
        cuda_device = model._get_prediction_device()

        # Try to predict this batch.
        try:
            dataset = Batch([instance])
            dataset.index_instances(model.vocab)
            model_input = util.move_to_device(dataset.as_tensor_dict(),
                                              cuda_device)
            prediction = model.make_output_human_readable(
                model(**model_input)).to_json()
        # If we run out of GPU memory, warn user and indicate that this document failed.
        # This way, prediction doesn't grind to a halt every time we run out of GPU.
        except RuntimeError as err:
            metadata = instance["metadata"].metadata
            doc_key = metadata.doc_key
            # Format the exception itself instead of `err.args[0]`: a RuntimeError
            # raised without arguments would otherwise trigger an IndexError here
            # and turn the graceful skip into a crash.
            msg = (
                f"Encountered a RunTimeError on document {doc_key}. Skipping this example."
                f" Error message:\n{err}.")
            warnings.warn(msg)
            prediction = metadata.to_json()
            prediction["_FAILED_PREDICTION"] = True

        return prediction
 def visualize_instance(self, instance: Instance):
     """
     Entry point of this visualizer: take one instance and display its attention.

     Requirements on ``self._model``:
       * its ``forward`` must accept the ``return_attention=True`` keyword;
       * it must carry the vocabulary the instance should be indexed with.
     """
     log = logging.getLogger(__name__)
     # Index the instance with the model's vocabulary.
     instance.index_fields(self._model.vocab)
     # Pull the two token sequences and the gold label out of the instance.
     as_json = instance2json(instance)
     first_tokens = as_json["sentence1"]
     second_tokens = as_json["sentence2"]
     gold_label = as_json["gold_label"]
     # Run the model once to obtain predictions together with attentions.
     tensor_dict = Batch([instance]).as_tensor_dict()
     output = self._model.forward(**tensor_dict, return_attention=True)
     output = self._model.make_output_human_readable(output)
     first_attention = output["attentions"]["pooler1"][0]
     second_attention = output["attentions"]["pooler2"][0]
     log.setLevel(logging.DEBUG)
     log.info(f"tokens_p are {first_tokens}")
     log.info(f"tokens_h are {second_tokens}")
     log.info(f"the predicted label is {output['predicted_label']}")
     log.info(f"the gold label is {gold_label}")
     # One attention display per sentence.
     show_sequence_attention(first_tokens, first_attention)
     show_sequence_attention(second_tokens, second_attention)
Beispiel #4
0
    def test_from_params(self):
        """
        For every ``*.jsonnet`` config in ``CONFIG_DIR``, build the reader and
        the model from the config and check the output of one forward pass on
        the sample data.
        """
        for config_path in CONFIG_DIR.glob("*.jsonnet"):
            params = Params.from_file(str(config_path),
                                      ext_vars={
                                          "TRAIN_DATA_PATH": "",
                                          "VALID_DATA_PATH": ""
                                      })

            data_reader_params = params["dataset_reader"]
            # The reader class is chosen explicitly below, so drop the "type" key.
            data_reader_params.pop("type")

            reader = openvaccine.CovidReader.from_params(data_reader_params)
            # Materialize the instances: they are iterated twice (vocabulary
            # creation, then batching), which would silently produce an empty
            # batch if `read` hands back a one-shot iterator.
            instances = list(reader.read(PROJECT_ROOT / "data" / "sample.jsonl"))
            vocab = Vocabulary.from_instances(instances)

            batch = Batch(instances)
            batch.index_instances(vocab)

            try:
                model = Model.from_params(params=params["model"], vocab=vocab)
            except Exception as e:
                # Chain the cause so the original traceback is kept in the report.
                raise AssertionError(
                    f"unable to load params from {config_path}, because {e}") from e

            output_dict = model(**batch.as_tensor_dict())

            assert set(output_dict.keys()) == {
                "logits",
                "seq_id",
                "loss",
            }

            assert len(output_dict["logits"].shape) == 3
            assert isinstance(output_dict["seq_id"][0], str)
Beispiel #5
0
    def instances_to_captum_inputs(self, labeled_instances):
        """
        Turn labeled instances into embedded inputs plus the extra forward arguments.

        The instances are batched, indexed with ``self.vocab`` and moved to the
        prediction device; the two text fields named by ``self.field_names`` are
        then embedded with ``self.word_embeddings``. Everything runs under
        ``torch.no_grad()``.

        Returns a 3-tuple:
          * ``(embedded_tokens1, embedded_tokens2)`` - the embedded inputs;
          * ``None``;
          * ``(tokens_mask1, tokens_mask2, label, output_dict)`` where
            ``output_dict`` maps ``"<field>_embedding"`` to each embedding.
        """
        with torch.no_grad():
            cuda_device = self._get_prediction_device()
            batch = Batch(labeled_instances)
            batch.index_instances(self.vocab)
            model_input = util.move_to_device(batch.as_tensor_dict(),
                                              cuda_device)

            key1, key2 = self.field_names
            tokens1 = model_input[key1]
            tokens2 = model_input[key2]
            label = model_input["label"]

            tokens_mask1 = util.get_text_field_mask(tokens1)
            tokens_mask2 = util.get_text_field_mask(tokens2)
            embedded_tokens1 = self.word_embeddings(tokens1)
            embedded_tokens2 = self.word_embeddings(tokens2)

            output_dict = {
                f"{key1}_embedding": embedded_tokens1,
                f"{key2}_embedding": embedded_tokens2,
            }

            return (embedded_tokens1,
                    embedded_tokens2), None, (tokens_mask1, tokens_mask2,
                                              label, output_dict)
Beispiel #6
0
    def test_forward_pass_runs_correctly(self):
        """Run one forward pass and sanity-check metrics, span probabilities and the best span."""
        batch = Batch(self.instances)
        batch.index_instances(self.vocab)
        training_tensors = batch.as_tensor_dict()
        output_dict = self.model(**training_tensors)

        metrics = self.model.get_metrics(reset=True)
        # We've set up the data such that there's a fake answer that consists of the whole
        # paragraph.  _Any_ valid prediction for that question should produce an F1 of greater than
        # zero, while if we somehow haven't been able to load the evaluation data, or there was an
        # error with using the evaluation script, this will fail.  This makes sure that we've
        # loaded the evaluation data correctly and have hooked things up to the official evaluation
        # script.
        assert metrics["per_instance_f1"] > 0

        span_start_probs = output_dict["span_start_probs"][0].data.numpy()
        # Read the end distribution from its own key so the end probabilities are
        # genuinely checked (assumes the model exposes "span_end_probs" next to
        # "span_start_probs" - confirm against the model's output dict).
        span_end_probs = output_dict["span_end_probs"][0].data.numpy()
        assert_almost_equal(numpy.sum(span_start_probs, -1), 1, decimal=6)
        assert_almost_equal(numpy.sum(span_end_probs, -1), 1, decimal=6)
        span_start, span_end = tuple(output_dict["best_span"][0].data.numpy())
        assert span_start >= 0
        assert span_start <= span_end
        assert span_end < self.instances[0].fields[
            "question_with_context"].sequence_length()
        assert isinstance(output_dict["best_span_str"][0], str)
Beispiel #7
0
    def test_forward_pass_runs_correctly(self):
        """
        An ensemble made of two identical copies of a model must produce the
        same results as the model on its own.
        """
        ensemble = BidafEnsemble([self.model] * 2)

        indexed_batch = Batch(self.instances)
        indexed_batch.index_instances(self.vocab)
        tensors = indexed_batch.as_tensor_dict()

        single_output = self.model(**tensors)
        ensemble_output = ensemble(**tensors)

        # The data contains a fake answer spanning the whole paragraph, so _any_
        # valid prediction scores an F1 above zero. A zero here means the
        # evaluation data was not loaded or the official evaluation script is not
        # hooked up correctly.
        f1 = self.model.get_metrics(reset=True)["f1"]
        assert f1 > 0

        assert torch.equal(ensemble_output["best_span"], single_output["best_span"])
        assert ensemble_output["best_span_str"] == single_output["best_span_str"]
Beispiel #8
0
    def forward_on_instances(self, instances: List[Instance],
                             **kwargs) -> List[Dict[str, np.ndarray]]:
        """
        Run the model on ``instances`` and split its outputs per instance.

        Mirrors the original method but forwards ``**kwargs`` to the model call.
        Outputs whose leading size does not match the number of instances cannot
        be split; they are reported through
        ``_maybe_warn_for_unseparable_batches`` and left out of the result.
        """
        expected_size = len(instances)
        with torch.no_grad():
            device = self._get_prediction_device()
            batch = Batch(instances)
            batch.index_instances(self.vocab)
            tensors = util.move_to_device(batch.as_tensor_dict(), device)
            readable = self.make_output_human_readable(self(**tensors, **kwargs))

            per_instance: List[Dict[str, np.ndarray]] = [{} for _ in batch.instances]
            for name, value in list(readable.items()):
                is_tensor = isinstance(value, torch.Tensor)
                if is_tensor and value.dim() == 0:
                    # Promote scalars to one-element tensors so they have a leading size.
                    value = value.unsqueeze(0)

                leading_size = value.size(0) if is_tensor else len(value)
                if leading_size != expected_size:
                    self._maybe_warn_for_unseparable_batches(name)
                    continue

                if is_tensor:
                    value = value.detach().cpu().numpy()
                for target, element in zip(per_instance, value):
                    target[name] = element
            return per_instance
 def load_pos_dataset_vocab(self, pos_param_file):
     """
     Read the data at ``self.pos_data`` with the reader configured under
     ``pos_param_file['dataset_reader']`` and return ``(batch, vocab)``, where
     the batch has been indexed with a vocabulary built from its own instances.
     """
     pos_reader = DatasetReader.from_params(pos_param_file['dataset_reader'])
     data_path = str(self.pos_data)
     pos_instances = list(pos_reader.read(data_path))
     pos_vocab = Vocabulary.from_instances(pos_instances)
     pos_batch = Batch(pos_instances)
     pos_batch.index_instances(pos_vocab)
     return pos_batch, pos_vocab
class DialogQATest(ModelTestCase):
    """Model tests driven by the ``rc/dialog_qa`` fixtures."""

    def setup_method(self):
        """Load the fixture model, index a batch of its instances, and force deterministic algorithms."""
        super().setup_method()
        fixture_dir = FIXTURES_ROOT / "rc" / "dialog_qa"
        self.set_up_model(
            fixture_dir / "experiment.json",
            fixture_dir / "quac_sample.json",
            seed=42,
        )
        self.batch = Batch(self.instances)
        self.batch.index_instances(self.vocab)
        torch.use_deterministic_algorithms(True)

    def teardown_method(self):
        """Undo the deterministic-algorithms switch turned on in ``setup_method``."""
        super().teardown_method()
        torch.use_deterministic_algorithms(False)

    def test_forward_pass_runs_correctly(self):
        """A forward pass must produce every expected output key."""
        output_dict = self.model(**self.batch.as_tensor_dict())
        for expected_key in ("best_span_str", "loss", "followup", "yesno"):
            assert expected_key in output_dict

    def test_model_can_train_save_and_load(self):
        """The model must survive a train / save / load round trip."""
        ignored_gradients = {"_matrix_attention._bias"}
        self.ensure_model_can_train_save_and_load(
            self.param_file,
            tolerance=1e-4,
            gradients_to_ignore=ignored_gradients,
        )

    def test_batch_predictions_are_consistent(self):
        """Delegate to the base-class batch-consistency check."""
        self.ensure_batch_predictions_are_consistent()
Beispiel #11
0
    def test_create_models_from_allennlp_configs(self, config_path):
        """
        Build the reader and the model from ``config_path`` and check that a
        forward pass returns ``"probs"`` shaped (number of instances, number of labels).
        """
        params = Params.from_file(
            str(config_path),
            ext_vars={
                "CLF_TRAIN_DATA_PATH": "",
                "CLF_VALID_DATA_PATH": "",
                "DISCRETIZER_PATH": str(DISCRETIZER_PATH),
                "VOCAB_PATH": str(VOCAB_PATH),
            },
        )

        reader = DatasetReader.from_params(params["dataset_reader"])

        # Materialize the instances: they are iterated twice (vocabulary, batch)
        # and passed to `len()` below, none of which is safe on a one-shot iterator.
        instances = list(reader.read(DATA_PATH))
        vocab = Vocabulary.from_instances(instances)
        num_labels = vocab.get_vocab_size(namespace="labels")

        batch = Batch(instances)
        batch.index_instances(vocab)

        try:
            model = Model.from_params(params=params["model"], vocab=vocab)
        except Exception as e:
            raise AssertionError(f"unable to load params from {config_path}") from e

        output_dict = model(**batch.as_tensor_dict())

        assert "probs" in output_dict
        assert len(output_dict["probs"].shape) == 2
        assert output_dict["probs"].shape[0] == len(instances)
        assert output_dict["probs"].shape[1] == num_labels
 def setup_method(self):
     """Load the ``rc/dialog_qa`` fixture model and keep an indexed batch of its instances."""
     super().setup_method()
     fixture_dir = FIXTURES_ROOT / "rc" / "dialog_qa"
     experiment_file = fixture_dir / "experiment.json"
     sample_file = fixture_dir / "quac_sample.json"
     self.set_up_model(experiment_file, sample_file)
     self.batch = Batch(self.instances)
     self.batch.index_instances(self.vocab)
Beispiel #13
0
def data_to_tensors(
    data: TransactionsData, reader: DatasetReader, vocab: Vocabulary, device: Union[torch.device, int] = -1,
) -> ModelsInput:
    """
    Convert one ``data`` record into model inputs placed on ``device``.

    The record's ``to_dict()`` output is passed to ``reader.text_to_instance``,
    the resulting instance is wrapped in a single-element batch and indexed with
    ``vocab``, and the batch's tensor dict is moved to ``device``.
    """
    instance = reader.text_to_instance(**data.to_dict())
    single_batch = Batch([instance])
    single_batch.index_instances(vocab)
    return move_to_device(single_batch.as_tensor_dict(), device)
 def setup_method(self):
     """Load the ``rc/dialog_qa`` fixture model (seed 42), index a batch, and force deterministic algorithms."""
     super().setup_method()
     fixture_dir = FIXTURES_ROOT / "rc" / "dialog_qa"
     experiment_file = fixture_dir / "experiment.json"
     sample_file = fixture_dir / "quac_sample.json"
     self.set_up_model(experiment_file, sample_file, seed=42)
     self.batch = Batch(self.instances)
     self.batch.index_instances(self.vocab)
     torch.use_deterministic_algorithms(True)
Beispiel #15
0
    def test_train_read(self):
        """Read the Flickr30k fixture and check one instance plus the batched tensor shapes."""
        vision_root = FIXTURES_ROOT / "vision"
        self.reader = Flickr30kReader(
            image_dir=vision_root / "images" / "flickr30k",
            image_loader=TorchImageLoader(),
            image_featurizer=Lazy(NullGridEmbedder),
            data_dir=vision_root / "flickr30k" / "sentences",
            region_detector=Lazy(RandomRegionDetector),
            tokenizer=WhitespaceTokenizer(),
            token_indexers={"tokens": SingleIdTokenIndexer()},
            featurize_captions=False,
            num_potential_hard_negatives=4,
        )

        instances = list(self.reader.read("test_fixtures/vision/flickr30k/test.txt"))
        assert len(instances) == 25

        sample = instances[5]
        assert len(sample.fields) == 5
        assert len(sample["caption"]) == 4
        assert len(sample["caption"][0]) == 12
        # Of the four captions, only the second differs from the first.
        assert sample["caption"][0] != sample["caption"][1]
        assert sample["caption"][0] == sample["caption"][2]
        assert sample["caption"][0] == sample["caption"][3]
        expected_tokens = (
            "girl with brown hair sits on edge of concrete area overlooking water"
        ).split(" ")
        assert [token.text for token in sample["caption"][0]] == expected_tokens

        batch = Batch(instances)
        batch.index_instances(Vocabulary())
        tensors = batch.as_tensor_dict()

        # Box tensors are sized (batch size, num images (3 hard negatives + gold image),
        # num boxes (fake), ...) with a trailing fake feature size or 4 coords where
        # present; the label tensor is sized (batch size).
        expected_sizes = (
            ("box_features", (25, 4, 2, 10)),
            ("box_coordinates", (25, 4, 2, 4)),
            ("box_mask", (25, 4, 2)),
            ("label", (25,)),
        )
        for name, size in expected_sizes:
            assert tensors[name].size() == size
    def explain_prediction(
        self, prediction: Dict[str, numpy.array], instance: Instance, n_steps: int
    ) -> Dict[str, Any]:
        """
        Attach per-token integrated-gradients attributions to ``prediction``.

        The input field is a ListField of TextFields, so one wrapping dimension
        is used for the mask and the embedder, and the attributions are reduced
        over dimension 3 before being paired with each sentence's tokens.

        Returns a copy of ``prediction`` extended with an ``"explain"`` entry that
        maps the forward argument name to, per sentence, a list of
        ``{"token", "attribution"}`` dicts.
        """
        wrapping_dims = 1
        field_name = self.forward_arg_name

        tensor_dict = Batch([instance]).as_tensor_dict()
        integrated_gradients = IntegratedGradients(self._explain_embeddings)

        sentences = []
        for sentence_field in cast(ListField, instance.get(field_name)):
            sentence_tokens = cast(TextField, sentence_field).tokens
            sentences.append([token.text for token in sentence_tokens])

        field_tensors = tensor_dict.get(field_name)
        mask = get_text_field_mask(field_tensors, num_wrapping_dims=wrapping_dims)
        embeddings = self.backbone.embedder.forward(
            field_tensors, num_wrapping_dims=wrapping_dims
        )

        target = vocabulary.index_for_label(
            self.backbone.vocab, prediction.get(self.label_name)
        )
        # The convergence delta is requested but not used.
        scores, _delta = integrated_gradients.attribute(
            embeddings,
            target=target,
            additional_forward_args=mask,
            return_convergence_delta=True,
            n_steps=n_steps,
        )
        scores = scores.sum(dim=3).squeeze(0)
        scores = (scores / torch.norm(scores)).detach().numpy()

        explained = []
        for sentence_tokens, sentence_scores in zip(sentences, scores):
            explained.append(
                [
                    {"token": token, "attribution": score}
                    for token, score in zip(sentence_tokens, sentence_scores)
                ]
            )

        return {**prediction, "explain": {field_name: explained}}
def test_transformer_text_field_batching():
    """Batch transformer text fields of lengths 3, 4 and 0 and check the padded result."""
    id_sequences = (
        torch.IntTensor([1, 2, 3]),
        torch.IntTensor([2, 3, 4, 5]),
        torch.IntTensor(),
    )
    batch = Batch(
        [Instance({"text": TransformerTextField(ids)}) for ids in id_sequences]
    )
    padded = batch.as_tensor_dict(batch.get_padding_lengths())
    text = padded["text"]

    # Everything is padded out to the longest sequence.
    assert text["input_ids"].shape == (3, 4)
    # The 3-token sequence gets one padded, masked-out position at the end.
    assert text["input_ids"][0, -1] == 0
    assert text["attention_mask"][0, -1] == torch.Tensor([False])
    # The empty sequence is nothing but padding.
    assert torch.all(text["input_ids"][-1] == 0)
    assert torch.all(text["attention_mask"][-1] == torch.tensor([False]))
Beispiel #18
0
    def predict_instance(self, instance):
        """
        Predict on a single instance and return the result as JSON.

        An instance is an entire document, represented as a list of sentences.
        """
        model = self._model
        device = model._get_prediction_device()

        single_batch = Batch([instance])
        single_batch.index_instances(model.vocab)
        tensors = util.move_to_device(single_batch.as_tensor_dict(), device)

        raw_output = model(**tensors)
        return model.make_output_human_readable(raw_output).to_json()
Beispiel #19
0
 def __iter__(self):
     """
     Yield one padded tensor dict per minibatch, epoch after epoch.

     Each pass calls ``init_epoch()`` and walks ``self.batches``. Iteration
     stops after one epoch unless ``self.repeat`` is set. When the iterator's
     device is CUDA, the tensors are moved to that device before being yielded.
     """
     while True:
         self.init_epoch()
         for idx, minibatch in enumerate(self.batches):
             # fast-forward if loaded from state
             if self._iterations_this_epoch > idx:
                 continue
             self.iterations += 1
             self._iterations_this_epoch += 1
             if self.sort_within_batch:
                 # NOTE: `rnn.pack_padded_sequence` requires that a minibatch
                 # be sorted by decreasing order, which requires reversing
                 # relative to typical sort keys
                 if self.sort:
                     minibatch.reverse()
                 else:
                     minibatch.sort(key=self.sort_key, reverse=True)
             batch = Batch(minibatch)
             tensors = batch.as_tensor_dict(batch.get_padding_lengths())
             device = self.device
             # `getattr` keeps plain-string devices (e.g. 'cuda', 'cpu') from
             # raising AttributeError on `.type` / `.index`.
             if device == 'cuda' or getattr(device, "type", None) == "cuda":
                 device_index = getattr(device, "index", None)
                 # Move the tensor dict, not the Batch object: `move_to_device`
                 # relocates tensors and containers of tensors, so handing it
                 # the Batch left the yielded tensors where they were created.
                 tensors = move_to_device(
                     tensors, device_index if device_index is not None else 0)
             yield tensors
         if not self.repeat:
             return
Beispiel #20
0
    def test_forward_pass_runs_correctly(self):
        """Run a forward pass and check that both alternatives get picked and accuracy is non-zero."""
        indexed_batch = Batch(self.instances)
        indexed_batch.index_instances(self.vocab)
        output_dict = self.model(**indexed_batch.as_tensor_dict())

        # These checks assume a fair mix of answers: some 0, some 1, some correct,
        # some incorrect. For a completely un-initialized model each check would
        # fail by chance with probability 1/1024, and there are three of them. The
        # model is not completely uninitialized (it contains no random weights),
        # so the asserts are known to pass. The test is still marked flaky because
        # random drop-out could mess things up.
        best_alternative = output_dict["best_alternative"]
        assert best_alternative.min() == 0
        assert best_alternative.max() == 1

        accuracy = self.model.get_metrics(reset=True)["acc"]
        assert accuracy > 0
def test_transformer_text_field_from_huggingface(return_tensors):
    """Build transformer text fields straight from tokenizer output and check the padded batch shape."""
    tokenizer = get_tokenizer("bert-base-cased")
    texts = (
        "Hello, World!",
        "The fox jumped over the fence",
        "Humpty dumpty sat on a wall",
    )

    instances = []
    for text in texts:
        encoded = tokenizer(text, return_tensors=return_tensors)
        instances.append(Instance({"text": TransformerTextField(**encoded)}))

    batch = Batch(instances)
    padded = batch.as_tensor_dict(batch.get_padding_lengths())
    assert padded["text"]["input_ids"].shape == (3, 11)
Beispiel #22
0
    def test_forward_pass_runs_correctly(self):
        """Load the archived model, run it on the training fixture, and check the output keys and batch size."""
        archive = load_archive(self.model_path)
        archived_model = archive.model
        reader = DatasetReader.from_params(archive.config["dataset_reader"])
        train_path = FIXTURES_ROOT / "data" / "train.jsonl"

        indexed_batch = Batch(reader.read(train_path))
        indexed_batch.index_instances(archived_model.vocab)
        output_dict = archived_model(**indexed_batch.as_tensor_dict())

        expected_keys = {"edit_distance", "emb_sequence_a", "emb_sequence_b", "loss"}
        assert set(output_dict.keys()) == expected_keys
        assert output_dict["edit_distance"].shape[0] == 3
    def test_read(self):
        """Read the VQAv2 ``unittest`` split and check the first instance and the batched tensors."""
        from allennlp_models.vision.dataset_readers.vqav2 import VQAv2Reader

        reader = VQAv2Reader(
            image_dir=FIXTURES_ROOT / "vision" / "images" / "vqav2",
            image_loader=TorchImageLoader(),
            image_featurizer=Lazy(NullGridEmbedder),
            region_detector=Lazy(RandomRegionDetector),
            tokenizer=WhitespaceTokenizer(),
            token_indexers={"tokens": SingleIdTokenIndexer()},
        )
        instances = list(reader.read("unittest"))
        assert len(instances) == 3

        first = instances[0]
        assert len(first.fields) == 6
        assert len(first["question"]) == 7
        expected_tokens = "What is this photo taken looking through?".split(" ")
        assert [token.text for token in first["question"]] == expected_tokens
        assert len(first["labels"]) == 5
        label_names = [field.label for field in first["labels"].field_list]
        assert label_names == ["net", "netting", "mesh", "pitcher", "orange"]
        # The first answer carries full weight, the other four a third each.
        expected_weights = torch.tensor([1.0] + [1.0 / 3] * 4)
        assert torch.allclose(first["label_weights"].tensor, expected_weights)

        batch = Batch(instances)
        batch.index_instances(Vocabulary())
        tensors = batch.as_tensor_dict()

        # Every size starts with (batch size, num boxes (fake)); the trailing
        # entry, when present, is the fake feature size or the 4 box coordinates.
        expected_sizes = (
            ("box_features", (3, 2, 10)),
            ("box_coordinates", (3, 2, 4)),
            ("box_mask", (3, 2)),
        )
        for name, size in expected_sizes:
            assert tensors[name].size() == size

        # Nothing should be masked out since the number of fake boxes is the same
        # for each item in the batch.
        assert tensors["box_mask"].all()
Beispiel #24
0
 def __call__(self, doc: Doc) -> Doc:
     """
     Run the model over ``doc`` and return the result of ``prepare_spacy_doc``.

     The doc's sentences are turned into lists of token texts, wrapped into an
     instance by the dataset reader (with the fixed ``doc_key`` ``"test"``),
     batched, indexed, and fed to the model; the JSON prediction is then stored
     on the doc as entity/relation information.
     """
     model = self._model
     device = model._get_prediction_device()
     tokenized_sentences = [
         [token.text for token in sentence] for sentence in doc.sents
     ]
     document = {
         "sentences": tokenized_sentences,
         "doc_key": "test",
         "dataset": self.dataset_name,
     }
     instance = self._dataset_reader.text_to_instance(document)

     single_batch = Batch([instance])
     single_batch.index_instances(model.vocab)
     tensors = util.move_to_device(single_batch.as_tensor_dict(), device)
     readable = model.make_output_human_readable(model(**tensors))
     prediction = readable.to_json()
     # prepare and store ent/relation information to spacy Doc
     return prepare_spacy_doc(doc, prediction)
Beispiel #25
0
    def instances_to_captum_inputs(self, labeled_instances):
        """
        Turn labeled instances into embedded inputs plus the extra forward arguments.

        The instances are batched, indexed with ``self.vocab`` and moved to the
        prediction device; the ``"tokens"`` field is then embedded with
        ``self.word_embeddings``. Everything runs under ``torch.no_grad()``.

        Returns a 3-tuple:
          * ``(embedded_tokens,)`` - the embedded input;
          * ``None`` - no target, because the output is a single scalar;
          * ``(tokens_mask, output_dict)`` where ``output_dict`` holds the
            embedding under ``"embedding"``.
        """
        with torch.no_grad():
            cuda_device = self._get_prediction_device()
            batch = Batch(labeled_instances)
            batch.index_instances(self.vocab)
            model_input = util.move_to_device(batch.as_tensor_dict(),
                                              cuda_device)

            tokens = model_input["tokens"]

            tokens_mask = util.get_text_field_mask(tokens)
            embedded_tokens = self.word_embeddings(tokens)

            output_dict = {"embedding": embedded_tokens}
            # target = None because output is a single scalar
            return (embedded_tokens, ), None, (tokens_mask, output_dict)
Beispiel #26
0
    def explain_prediction(
        self, prediction: Dict[str, numpy.array], instance: Instance, n_steps: int
    ) -> Dict[str, Any]:
        """
        Attach per-token integrated-gradients attributions to ``prediction``.

        The input field is a plain TextField, so no wrapping dimension is used,
        and the attributions are reduced over dimension 2 before being paired
        with the tokens. The target is the vocabulary index of the first entry
        of ``prediction["labels"]``.

        Returns a copy of ``prediction`` extended with an ``"explain"`` entry that
        maps the forward argument name to a list of ``{"token", "attribution"}`` dicts.
        """
        wrapping_dims = 0
        field_name = self.forward_arg_name

        tensor_dict = Batch([instance]).as_tensor_dict()
        integrated_gradients = IntegratedGradients(self._explain_embeddings)

        text_field = cast(TextField, instance.get(field_name))
        tokens = [token.text for token in text_field.tokens]

        field_tensor = tensor_dict.get(field_name)
        mask = get_text_field_mask(field_tensor, num_wrapping_dims=wrapping_dims)
        embeddings = self.backbone.embedder.forward(
            field_tensor, num_wrapping_dims=wrapping_dims
        )

        target = vocabulary.index_for_label(
            self.backbone.vocab, prediction["labels"][0]
        )
        # The convergence delta is requested but not used.
        scores, _delta = integrated_gradients.attribute(
            embeddings,
            n_steps=n_steps,
            target=target,
            additional_forward_args=mask,
            return_convergence_delta=True,
        )
        scores = scores.sum(dim=2).squeeze(0)
        scores = (scores / torch.norm(scores)).detach().numpy()

        explained = [
            {"token": token, "attribution": score}
            for token, score in zip(tokens, scores)
        ]
        return {**prediction, "explain": {field_name: explained}}
Beispiel #27
0
 def test_forward_with_weights(self):
     """Instances created with a ``weight`` must expose it in the tensor dict and run through the model."""
     params = Params.from_file(self.param_file)
     reader: CopyNetDatasetReader = DatasetReader.from_params(
         params["dataset_reader"], serialization_dir=self.TEST_DIR)

     # (source, target, weight) triples.
     weighted_pairs = (
         ("hello hello world", "hello world", 0.9),
         ("hello world", "hello world world", 0.5),
     )
     instances = [
         reader.text_to_instance(source, target, weight=weight)
         for source, target, weight in weighted_pairs
     ]
     for instance in instances:
         reader.apply_token_indexers(instance)

     indexed_batch = Batch(instances)
     indexed_batch.index_instances(self.model.vocab)
     inputs = indexed_batch.as_tensor_dict()
     assert "weight" in inputs
     # Only checks that the forward pass accepts the weights; the output is unused.
     self.model(**inputs)
Beispiel #28
0
def test_metrics(pipeline):
    """Training and validation metrics must be tracked separately and expose every expected name."""

    def accuracy_counts():
        # (training count, validation count) of the "accuracy" metric.
        train_count = pipeline.head._metrics.get_dict()["accuracy"].total_count
        valid_count = pipeline.head._metrics.get_dict(
            is_train=False)["accuracy"].total_count
        return train_count, valid_count

    instance = pipeline.head.featurize(text="test this", label="a")
    batch = Batch([instance])
    batch.index_instances(pipeline.vocab)

    pipeline.head.forward(**batch.as_tensor_dict())
    # validation metric should have never been called
    assert accuracy_counts() == (1, 0)

    train_metrics = pipeline.head.get_metrics(reset=True)

    scores = ("precision", "recall", "fscore")
    expected_metric_names = ["accuracy"]
    for average in ("micro", "macro"):
        for score in scores:
            expected_metric_names.append(f"{average}/{score}")
    for score in scores:
        for label in "abcdef":
            expected_metric_names.append(f"_{score}/{label}")

    for name in expected_metric_names:
        assert name in train_metrics

    pipeline.head.training = False
    pipeline.head.forward(**batch.as_tensor_dict())
    # training metric should have never been called after its reset
    assert accuracy_counts() == (0, 1)

    valid_metrics = pipeline.head.get_metrics()
    for name in expected_metric_names:
        assert name in valid_metrics
Beispiel #29
0
def make_vocab_from_params(
    params: Params, serialization_dir: str, print_statistics: bool = False
) -> Vocabulary:
    """
    Build a vocabulary from ``params`` and save it under ``serialization_dir``.

    # Parameters

    params : `Params`
        Experiment configuration. The `"vocabulary"` and
        `"datasets_for_vocab_creation"` keys are popped from it.
    serialization_dir : `str`
        Directory (created if missing) in which the `vocabulary` sub-directory
        is written.
    print_statistics : `bool`, optional (default = `False`)
        When true, the instances are materialized and dataset and vocabulary
        statistics are printed after the vocabulary has been saved.

    # Returns

    The newly created `Vocabulary`.

    # Raises

    ConfigurationError
        If the `vocabulary` directory already exists and is non-empty, or if a
        name in `datasets_for_vocab_creation` has no matching
        `<name>_data_path` key in `params`.
    """
    vocab_params = params.pop("vocabulary", {})
    os.makedirs(serialization_dir, exist_ok=True)
    vocab_dir = os.path.join(serialization_dir, "vocabulary")

    # `os.listdir` returns a (possibly empty) list and never `None`, so test its
    # truthiness: only a directory that actually contains entries is rejected.
    if os.path.isdir(vocab_dir) and os.listdir(vocab_dir):
        raise ConfigurationError(
            "The 'vocabulary' directory in the provided serialization directory is non-empty"
        )

    datasets_for_vocab_creation: Optional[List[str]] = params.pop(
        "datasets_for_vocab_creation", None
    )
    # Do a quick sanity check here. There's no need to load any datasets if the vocab
    # type is "empty".
    if datasets_for_vocab_creation is None and vocab_params.get("type") in ("empty", "from_files"):
        datasets_for_vocab_creation = []

    datasets: Dict[str, Dataset]
    if datasets_for_vocab_creation is None:
        # If `datasets_for_vocab_creation` was not specified, we'll use all datasets
        # from the config.
        datasets = datasets_from_params(params)
    else:
        for dataset_name in datasets_for_vocab_creation:
            data_path = f"{dataset_name}_data_path"
            if data_path not in params:
                raise ConfigurationError(f"invalid 'datasets_for_vocab_creation' {dataset_name}")
        datasets = datasets_from_params(
            params,
            train=("train" in datasets_for_vocab_creation),
            validation=("validation" in datasets_for_vocab_creation),
            test=("test" in datasets_for_vocab_creation),
        )

    # Lazily chain the instances of every selected dataset.
    instances: Iterable[Instance] = (
        instance
        for key, dataset in datasets.items()
        if datasets_for_vocab_creation is None or key in datasets_for_vocab_creation
        for instance in dataset
    )

    if print_statistics:
        # The instances are needed again after the vocabulary is built, so the
        # generator must be materialized before its first use.
        instances = list(instances)

    vocab = Vocabulary.from_params(vocab_params, instances=instances)

    logger.info(f"writing the vocabulary to {vocab_dir}.")
    vocab.save_to_files(vocab_dir)
    logger.info("done creating vocab")

    if print_statistics:
        dataset = Batch(instances)
        dataset.index_instances(vocab)
        dataset.print_statistics()
        vocab.print_statistics()

    return vocab
Beispiel #30
0
    def test_read(self):
        """Read the VGQA question/answer fixture and check the first instance and the batched tensors."""
        from allennlp_models.vision.dataset_readers.vgqa import VGQAReader

        reader = VGQAReader(
            image_dir=FIXTURES_ROOT / "vision" / "images" / "vgqa",
            image_loader=TorchImageLoader(),
            image_featurizer=Lazy(NullGridEmbedder),
            region_detector=Lazy(RandomRegionDetector),
            tokenizer=WhitespaceTokenizer(),
            token_indexers={"tokens": SingleIdTokenIndexer()},
        )
        fixture_path = "test_fixtures/vision/vgqa/question_answers.json"
        instances = list(reader.read(fixture_path))
        assert len(instances) == 8

        first = instances[0]
        assert len(first.fields) == 6
        assert len(first["question"]) == 5
        expected_tokens = "What is on the curtains?".split(" ")
        assert [token.text for token in first["question"]] == expected_tokens
        assert len(first["labels"]) == 1
        label_names = [field.label for field in first["labels"].field_list]
        assert label_names == ["sailboats"]

        batch = Batch(instances)
        batch.index_instances(Vocabulary())
        tensors = batch.as_tensor_dict()

        # Every size starts with (batch size, num boxes (fake)); the trailing
        # entry, when present, is the fake feature size or the 4 box coordinates.
        expected_sizes = (
            ("box_features", (8, 2, 10)),
            ("box_coordinates", (8, 2, 4)),
            ("box_mask", (8, 2)),
        )
        for name, size in expected_sizes:
            assert tensors[name].size() == size

        # Nothing should be masked out since the number of fake boxes is the same
        # for each item in the batch.
        assert tensors["box_mask"].all()