def test_read_from_dir(self):
    # Test reading from multiple files in a directory
    instances = list(self.reader.read("test_fixtures/vision/gqa/question_dir/"))
    assert len(instances) == 2

    instance = instances[1]
    assert len(instance.fields) == 6
    assert len(instance["question"]) == 10
    question_tokens = [t.text for t in instance["question"]]
    assert question_tokens == [
        "Does",
        "the",
        "table",
        "below",
        "the",
        "water",
        "look",
        "wooden",
        "and",
        "round?",
    ]
    assert instance["labels"][0].label == "yes"

    batch = Batch(instances)
    batch.index_instances(Vocabulary())
    tensors = batch.as_tensor_dict()

    # (batch size, num boxes (fake), num features (fake))
    assert tensors["box_features"].size() == (2, 2, 10)
    # (batch size, num boxes (fake), 4 coords)
    assert tensors["box_coordinates"].size() == (2, 2, 4)
    # (batch size, num boxes (fake))
    assert tensors["box_mask"].size() == (2, 2)
def predict_instance(self, instance):
    """
    An instance is an entire document, represented as a list of sentences.
    """
    model = self._model
    cuda_device = model._get_prediction_device()

    # Try to predict this batch.
    try:
        dataset = Batch([instance])
        dataset.index_instances(model.vocab)
        model_input = util.move_to_device(dataset.as_tensor_dict(), cuda_device)
        prediction = model.make_output_human_readable(model(**model_input)).to_json()
    # If we run out of GPU memory, warn the user and mark this document as failed.
    # This way, prediction doesn't grind to a halt every time we run out of GPU memory.
    except RuntimeError as err:
        metadata = instance["metadata"].metadata  # doc_key, dataset, sentences, message
        doc_key = metadata.doc_key
        msg = (
            f"Encountered a RuntimeError on document {doc_key}. Skipping this example."
            f" Error message:\n{err.args[0]}."
        )
        warnings.warn(msg)
        prediction = metadata.to_json()
        prediction["_FAILED_PREDICTION"] = True

    return prediction
def visualize_instance(self, instance: Instance):
    """
    Main entry point of this visualizer: take an instance and visualize it.

    Requirements:
    - `_model` must support the `return_attention=True` kwarg.
    - `_model` must hold the correct vocabulary.
    """
    logger = logging.getLogger(__name__)

    # index the instance with the model's vocabulary
    instance.index_fields(self._model.vocab)

    # get tokens from the instance
    json_dict = instance2json(instance)
    tokens_p = json_dict["sentence1"]
    tokens_h = json_dict["sentence2"]
    gold_label = json_dict["gold_label"]

    # get predictions and attentions
    batch = Batch([instance])
    batch_tensor = batch.as_tensor_dict()
    ret = self._model.forward(**batch_tensor, return_attention=True)
    ret = self._model.make_output_human_readable(ret)
    pooler_p = ret["attentions"]["pooler1"][0]
    pooler_h = ret["attentions"]["pooler2"][0]

    logger.setLevel(logging.DEBUG)
    logger.info(f"tokens_p are {tokens_p}")
    logger.info(f"tokens_h are {tokens_h}")
    logger.info(f"the predicted label is {ret['predicted_label']}")
    logger.info(f"the gold label is {gold_label}")

    # show_sequence_attention(strlist, att, msg=None)
    show_sequence_attention(tokens_p, pooler_p)
    show_sequence_attention(tokens_h, pooler_h)
def test_from_params(self):
    for config_path in CONFIG_DIR.glob("*.jsonnet"):
        params = Params.from_file(
            str(config_path),
            ext_vars={"TRAIN_DATA_PATH": "", "VALID_DATA_PATH": ""},
        )
        data_reader_params = params["dataset_reader"]
        data_reader_params.pop("type")
        reader = openvaccine.CovidReader.from_params(data_reader_params)
        instances = reader.read(PROJECT_ROOT / "data" / "sample.jsonl")
        vocab = Vocabulary.from_instances(instances)

        batch = Batch(instances)
        batch.index_instances(vocab)

        try:
            model = Model.from_params(params=params["model"], vocab=vocab)
        except Exception as e:
            raise AssertionError(f"unable to load params from {config_path}, because {e}")

        output_dict = model(**batch.as_tensor_dict())
        assert set(output_dict.keys()) == {"logits", "seq_id", "loss"}
        assert len(output_dict["logits"].shape) == 3
        assert isinstance(output_dict["seq_id"][0], str)
def instances_to_captum_inputs(self, labeled_instances):
    batch_size = len(labeled_instances)

    with torch.no_grad():
        cuda_device = self._get_prediction_device()
        batch = Batch(labeled_instances)
        batch.index_instances(self.vocab)
        model_input = util.move_to_device(batch.as_tensor_dict(), cuda_device)

        key1, key2 = self.field_names
        tokens1 = model_input[key1]
        tokens2 = model_input[key2]
        label = model_input["label"]

        tokens_mask1 = util.get_text_field_mask(tokens1)
        tokens_mask2 = util.get_text_field_mask(tokens2)
        embedded_tokens1 = self.word_embeddings(tokens1)
        embedded_tokens2 = self.word_embeddings(tokens2)

        output_dict = {}
        output_dict[f"{key1}_embedding"] = embedded_tokens1
        output_dict[f"{key2}_embedding"] = embedded_tokens2

        return (
            (embedded_tokens1, embedded_tokens2),
            None,
            (tokens_mask1, tokens_mask2, label, output_dict),
        )
def test_forward_pass_runs_correctly(self):
    batch = Batch(self.instances)
    batch.index_instances(self.vocab)
    training_tensors = batch.as_tensor_dict()
    output_dict = self.model(**training_tensors)
    metrics = self.model.get_metrics(reset=True)

    # We've set up the data such that there's a fake answer that consists of the whole
    # paragraph. _Any_ valid prediction for that question should produce an F1 greater than
    # zero, while if we somehow haven't been able to load the evaluation data, or there was
    # an error using the evaluation script, this will fail. This makes sure that we've
    # loaded the evaluation data correctly and have hooked things up to the official
    # evaluation script.
    assert metrics["per_instance_f1"] > 0

    span_start_probs = output_dict["span_start_probs"][0].data.numpy()
    span_end_probs = output_dict["span_end_probs"][0].data.numpy()
    assert_almost_equal(numpy.sum(span_start_probs, -1), 1, decimal=6)
    assert_almost_equal(numpy.sum(span_end_probs, -1), 1, decimal=6)

    span_start, span_end = tuple(output_dict["best_span"][0].data.numpy())
    assert span_start >= 0
    assert span_start <= span_end
    assert span_end < self.instances[0].fields["question_with_context"].sequence_length()
    assert isinstance(output_dict["best_span_str"][0], str)
def test_forward_pass_runs_correctly(self):
    """
    Check to make sure a forward pass on an ensemble of two identical copies of a model
    yields the same results as the model itself.
    """
    bidaf_ensemble = BidafEnsemble([self.model, self.model])

    batch = Batch(self.instances)
    batch.index_instances(self.vocab)
    training_tensors = batch.as_tensor_dict()

    bidaf_output_dict = self.model(**training_tensors)
    ensemble_output_dict = bidaf_ensemble(**training_tensors)
    metrics = self.model.get_metrics(reset=True)

    # We've set up the data such that there's a fake answer that consists of the whole
    # paragraph. _Any_ valid prediction for that question should produce an F1 greater than
    # zero, while if we somehow haven't been able to load the evaluation data, or there was
    # an error using the evaluation script, this will fail. This makes sure that we've
    # loaded the evaluation data correctly and have hooked things up to the official
    # evaluation script.
    assert metrics["f1"] > 0
    assert torch.equal(ensemble_output_dict["best_span"], bidaf_output_dict["best_span"])
    assert ensemble_output_dict["best_span_str"] == bidaf_output_dict["best_span_str"]
def forward_on_instances(
    self, instances: List[Instance], **kwargs
) -> List[Dict[str, np.ndarray]]:
    # An exact copy of the original method, but with support for kwargs.
    batch_size = len(instances)
    with torch.no_grad():
        cuda_device = self._get_prediction_device()
        dataset = Batch(instances)
        dataset.index_instances(self.vocab)
        model_input = util.move_to_device(dataset.as_tensor_dict(), cuda_device)
        outputs = self.make_output_human_readable(self(**model_input, **kwargs))

        instance_separated_output: List[Dict[str, np.ndarray]] = [
            {} for _ in dataset.instances
        ]
        for name, output in list(outputs.items()):
            if isinstance(output, torch.Tensor):
                if output.dim() == 0:
                    output = output.unsqueeze(0)
                if output.size(0) != batch_size:
                    self._maybe_warn_for_unseparable_batches(name)
                    continue
                output = output.detach().cpu().numpy()
            elif len(output) != batch_size:
                self._maybe_warn_for_unseparable_batches(name)
                continue
            for instance_output, batch_element in zip(instance_separated_output, output):
                instance_output[name] = batch_element
        return instance_separated_output
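# A minimal usage sketch for the kwargs-enabled override above. It assumes
# `model` is an instance of the class defining `forward_on_instances` and that
# its forward() accepts a `return_attention` flag; the flag name is purely
# hypothetical -- substitute whatever kwarg your model actually supports.
outputs = model.forward_on_instances(instances, return_attention=True)
for instance_output in outputs:
    # Each element is a Dict[str, np.ndarray] holding one instance's outputs.
    print(sorted(instance_output.keys()))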
def load_pos_dataset_vocab(self, pos_param_file):
    reader = DatasetReader.from_params(pos_param_file["dataset_reader"])
    instances = list(reader.read(str(self.pos_data)))
    vocab = Vocabulary.from_instances(instances)
    dataset = Batch(instances)
    dataset.index_instances(vocab)
    return dataset, vocab
class DialogQATest(ModelTestCase):
    def setup_method(self):
        super().setup_method()
        self.set_up_model(
            FIXTURES_ROOT / "rc" / "dialog_qa" / "experiment.json",
            FIXTURES_ROOT / "rc" / "dialog_qa" / "quac_sample.json",
            seed=42,
        )
        self.batch = Batch(self.instances)
        self.batch.index_instances(self.vocab)
        torch.use_deterministic_algorithms(True)

    def teardown_method(self):
        super().teardown_method()
        torch.use_deterministic_algorithms(False)

    def test_forward_pass_runs_correctly(self):
        training_tensors = self.batch.as_tensor_dict()
        output_dict = self.model(**training_tensors)
        assert "best_span_str" in output_dict and "loss" in output_dict
        assert "followup" in output_dict and "yesno" in output_dict

    def test_model_can_train_save_and_load(self):
        self.ensure_model_can_train_save_and_load(
            self.param_file,
            tolerance=1e-4,
            gradients_to_ignore={"_matrix_attention._bias"},
        )

    def test_batch_predictions_are_consistent(self):
        self.ensure_batch_predictions_are_consistent()
def test_create_models_from_allennlp_configs(self, config_path):
    params = Params.from_file(
        str(config_path),
        ext_vars={
            "CLF_TRAIN_DATA_PATH": "",
            "CLF_VALID_DATA_PATH": "",
            "DISCRETIZER_PATH": str(DISCRETIZER_PATH),
            "VOCAB_PATH": str(VOCAB_PATH),
        },
    )
    reader = DatasetReader.from_params(params["dataset_reader"])
    instances = reader.read(DATA_PATH)
    vocab = Vocabulary.from_instances(instances)
    num_labels = vocab.get_vocab_size(namespace="labels")

    batch = Batch(instances)
    batch.index_instances(vocab)

    try:
        model = Model.from_params(params=params["model"], vocab=vocab)
    except Exception as e:
        raise AssertionError(f"unable to load params from {config_path}") from e

    output_dict = model(**batch.as_tensor_dict())
    assert "probs" in output_dict
    assert len(output_dict["probs"].shape) == 2
    assert output_dict["probs"].shape[0] == len(instances)
    assert output_dict["probs"].shape[1] == num_labels
def setup_method(self):
    super().setup_method()
    self.set_up_model(
        FIXTURES_ROOT / "rc" / "dialog_qa" / "experiment.json",
        FIXTURES_ROOT / "rc" / "dialog_qa" / "quac_sample.json",
    )
    self.batch = Batch(self.instances)
    self.batch.index_instances(self.vocab)
def data_to_tensors(
    data: TransactionsData,
    reader: DatasetReader,
    vocab: Vocabulary,
    device: Union[torch.device, int] = -1,
) -> ModelsInput:
    instances = Batch([reader.text_to_instance(**data.to_dict())])
    instances.index_instances(vocab)
    inputs = instances.as_tensor_dict()
    return move_to_device(inputs, device)
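# Usage sketch for `data_to_tensors`, assuming `reader`, `vocab`, and `model`
# are constructed elsewhere. The `TransactionsData` field values shown here are
# hypothetical; use whatever fields the dataclass actually defines.
data = TransactionsData(transactions=["coffee", "rent"], amounts=[3.5, 900.0])
inputs = data_to_tensors(data, reader, vocab, device=-1)  # device=-1 keeps tensors on CPU
output_dict = model(**inputs)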
def setup_method(self):
    super().setup_method()
    self.set_up_model(
        FIXTURES_ROOT / "rc" / "dialog_qa" / "experiment.json",
        FIXTURES_ROOT / "rc" / "dialog_qa" / "quac_sample.json",
        seed=42,
    )
    self.batch = Batch(self.instances)
    self.batch.index_instances(self.vocab)
    torch.use_deterministic_algorithms(True)
def test_train_read(self):
    self.reader = Flickr30kReader(
        image_dir=FIXTURES_ROOT / "vision" / "images" / "flickr30k",
        image_loader=TorchImageLoader(),
        image_featurizer=Lazy(NullGridEmbedder),
        data_dir=FIXTURES_ROOT / "vision" / "flickr30k" / "sentences",
        region_detector=Lazy(RandomRegionDetector),
        tokenizer=WhitespaceTokenizer(),
        token_indexers={"tokens": SingleIdTokenIndexer()},
        featurize_captions=False,
        num_potential_hard_negatives=4,
    )
    instances = list(self.reader.read("test_fixtures/vision/flickr30k/test.txt"))
    assert len(instances) == 25

    instance = instances[5]
    assert len(instance.fields) == 5
    assert len(instance["caption"]) == 4
    assert len(instance["caption"][0]) == 12  # 16
    assert instance["caption"][0] != instance["caption"][1]
    assert instance["caption"][0] == instance["caption"][2]
    assert instance["caption"][0] == instance["caption"][3]
    question_tokens = [t.text for t in instance["caption"][0]]
    assert question_tokens == [
        "girl",
        "with",
        "brown",
        "hair",
        "sits",
        "on",
        "edge",
        "of",
        "concrete",
        "area",
        "overlooking",
        "water",
    ]

    batch = Batch(instances)
    batch.index_instances(Vocabulary())
    tensors = batch.as_tensor_dict()

    # (batch size, num images (3 hard negatives + gold image), num boxes (fake), num features (fake))
    assert tensors["box_features"].size() == (25, 4, 2, 10)
    # (batch size, num images (3 hard negatives + gold image), num boxes (fake), 4 coords)
    assert tensors["box_coordinates"].size() == (25, 4, 2, 4)
    # (batch size, num images (3 hard negatives + gold image), num boxes (fake))
    assert tensors["box_mask"].size() == (25, 4, 2)
    # (batch size,)
    assert tensors["label"].size() == (25,)
def explain_prediction(
    self, prediction: Dict[str, numpy.array], instance: Instance, n_steps: int
) -> Dict[str, Any]:
    """Here we must apply transformations to manage the shapes of ListField tensors."""
    dataset = Batch([instance])
    input_tokens_ids = dataset.as_tensor_dict()
    ig = IntegratedGradients(self._explain_embeddings)

    num_wrapping_dims = 1

    document_tokens = [
        [token.text for token in cast(TextField, text_field).tokens]
        for text_field in cast(ListField, instance.get(self.forward_arg_name))
    ]
    document_tensors = input_tokens_ids.get(self.forward_arg_name)
    mask = get_text_field_mask(document_tensors, num_wrapping_dims=num_wrapping_dims)
    text_embeddings = self.backbone.embedder.forward(
        document_tensors, num_wrapping_dims=num_wrapping_dims
    )

    label_id = vocabulary.index_for_label(
        self.backbone.vocab, prediction.get(self.label_name)
    )
    attributions, delta = ig.attribute(
        text_embeddings,
        target=label_id,
        additional_forward_args=mask,
        return_convergence_delta=True,
        n_steps=n_steps,
    )
    attributions = attributions.sum(dim=3).squeeze(0)
    attributions = attributions / torch.norm(attributions)
    attributions = attributions.detach().numpy()

    return {
        **prediction,
        "explain": {
            self.forward_arg_name: [
                [
                    {"token": token, "attribution": attribution}
                    for token, attribution in zip(sentence_tokens, sentence_attribution)
                ]
                for sentence_tokens, sentence_attribution in zip(
                    document_tokens, attributions
                )
            ]
        },
    }
def test_transformer_text_field_batching():
    batch = Batch(
        [
            Instance({"text": TransformerTextField(torch.IntTensor([1, 2, 3]))}),
            Instance({"text": TransformerTextField(torch.IntTensor([2, 3, 4, 5]))}),
            Instance({"text": TransformerTextField(torch.IntTensor())}),
        ]
    )
    tensors = batch.as_tensor_dict(batch.get_padding_lengths())
    assert tensors["text"]["input_ids"].shape == (3, 4)
    assert tensors["text"]["input_ids"][0, -1] == 0
    assert tensors["text"]["attention_mask"][0, -1] == torch.tensor([False])
    assert torch.all(tensors["text"]["input_ids"][-1] == 0)
    assert torch.all(tensors["text"]["attention_mask"][-1] == torch.tensor([False]))
def predict_instance(self, instance):
    """
    An instance is an entire document, represented as a list of sentences.
    """
    model = self._model
    cuda_device = model._get_prediction_device()
    dataset = Batch([instance])
    dataset.index_instances(model.vocab)
    model_input = util.move_to_device(dataset.as_tensor_dict(), cuda_device)
    prediction = model.make_output_human_readable(model(**model_input))
    return prediction.to_json()
def __iter__(self):
    while True:
        self.init_epoch()
        for idx, minibatch in enumerate(self.batches):
            # fast-forward if loaded from state
            if self._iterations_this_epoch > idx:
                continue
            self.iterations += 1
            self._iterations_this_epoch += 1
            if self.sort_within_batch:
                # NOTE: `rnn.pack_padded_sequence` requires that a minibatch
                # be sorted by decreasing length, which requires reversing
                # relative to typical sort keys
                if self.sort:
                    minibatch.reverse()
                else:
                    minibatch.sort(key=self.sort_key, reverse=True)
            batch = Batch(minibatch)
            # `self.device` may be a string or a `torch.device`; normalize it
            # so that `.type` and `.index` are safe to access.
            device = torch.device(self.device) if isinstance(self.device, str) else self.device
            if device.type == "cuda":
                batch = move_to_device(
                    batch, device.index if device.index is not None else 0
                )
            yield batch.as_tensor_dict(batch.get_padding_lengths())
        if not self.repeat:
            return
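# Usage sketch: consume a couple of minibatches from the iterator above,
# assuming `iterator` is an instance of the surrounding class constructed with
# `repeat=False`. Values in `tensor_dict` may be tensors or nested dicts of
# tensors, depending on the fields.
for step, tensor_dict in enumerate(iterator):
    if step >= 2:
        break
    print(list(tensor_dict.keys()))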
def test_forward_pass_runs_correctly(self):
    batch = Batch(self.instances)
    batch.index_instances(self.vocab)
    training_tensors = batch.as_tensor_dict()
    output_dict = self.model(**training_tensors)

    # The following asserts assume that we get a fair mix of answers: some 0, some 1, some
    # correct, and some incorrect. If the model were completely uninitialized, the chance of
    # these checks failing randomly would be 1/1024, and there are three of them. But the
    # model is not completely uninitialized (in fact, it contains no random weights), so we
    # know these asserts pass. We still mark the test as flaky because random drop-out
    # could mess things up.
    assert output_dict["best_alternative"].min() == 0
    assert output_dict["best_alternative"].max() == 1
    metrics = self.model.get_metrics(reset=True)
    assert metrics["acc"] > 0
def test_transformer_text_field_from_huggingface(return_tensors):
    tokenizer = get_tokenizer("bert-base-cased")

    batch = Batch(
        [
            Instance(
                {"text": TransformerTextField(**tokenizer(text, return_tensors=return_tensors))}
            )
            for text in [
                "Hello, World!",
                "The fox jumped over the fence",
                "Humpty dumpty sat on a wall",
            ]
        ]
    )
    tensors = batch.as_tensor_dict(batch.get_padding_lengths())
    assert tensors["text"]["input_ids"].shape == (3, 11)
def test_forward_pass_runs_correctly(self):
    archive = load_archive(self.model_path)
    model = archive.model
    reader = DatasetReader.from_params(archive.config["dataset_reader"])
    instances = reader.read(FIXTURES_ROOT / "data" / "train.jsonl")

    batch = Batch(instances)
    batch.index_instances(model.vocab)
    output_dict = model(**batch.as_tensor_dict())
    assert set(output_dict.keys()) == {
        "edit_distance",
        "emb_sequence_a",
        "emb_sequence_b",
        "loss",
    }
    assert output_dict["edit_distance"].shape[0] == 3
def test_read(self):
    from allennlp_models.vision.dataset_readers.vqav2 import VQAv2Reader

    reader = VQAv2Reader(
        image_dir=FIXTURES_ROOT / "vision" / "images" / "vqav2",
        image_loader=TorchImageLoader(),
        image_featurizer=Lazy(NullGridEmbedder),
        region_detector=Lazy(RandomRegionDetector),
        tokenizer=WhitespaceTokenizer(),
        token_indexers={"tokens": SingleIdTokenIndexer()},
    )
    instances = list(reader.read("unittest"))
    assert len(instances) == 3

    instance = instances[0]
    assert len(instance.fields) == 6
    assert len(instance["question"]) == 7
    question_tokens = [t.text for t in instance["question"]]
    assert question_tokens == ["What", "is", "this", "photo", "taken", "looking", "through?"]
    assert len(instance["labels"]) == 5
    labels = [field.label for field in instance["labels"].field_list]
    assert labels == ["net", "netting", "mesh", "pitcher", "orange"]
    assert torch.allclose(
        instance["label_weights"].tensor,
        torch.tensor([1.0, 1.0 / 3, 1.0 / 3, 1.0 / 3, 1.0 / 3]),
    )

    batch = Batch(instances)
    batch.index_instances(Vocabulary())
    tensors = batch.as_tensor_dict()

    # (batch size, num boxes (fake), num features (fake))
    assert tensors["box_features"].size() == (3, 2, 10)
    # (batch size, num boxes (fake), 4 coords)
    assert tensors["box_coordinates"].size() == (3, 2, 4)
    # (batch size, num boxes (fake))
    assert tensors["box_mask"].size() == (3, 2)
    # Nothing should be masked out since the number of fake boxes is the same
    # for each item in the batch.
    assert tensors["box_mask"].all()
def __call__(self, doc: Doc) -> Doc:
    cuda_device = self._model._get_prediction_device()
    sentences = [[tok.text for tok in sent] for sent in doc.sents]
    ins = self._dataset_reader.text_to_instance(
        {
            "sentences": sentences,
            "doc_key": "test",
            "dataset": self.dataset_name,
        }
    )
    dataset = Batch([ins])
    dataset.index_instances(self._model.vocab)
    model_input = util.move_to_device(dataset.as_tensor_dict(), cuda_device)
    prediction = self._model.make_output_human_readable(
        self._model(**model_input)
    ).to_json()
    # prepare and store entity/relation information in the spaCy Doc
    return prepare_spacy_doc(doc, prediction)
def instances_to_captum_inputs(self, labeled_instances):
    batch_size = len(labeled_instances)

    with torch.no_grad():
        cuda_device = self._get_prediction_device()
        batch = Batch(labeled_instances)
        batch.index_instances(self.vocab)
        model_input = util.move_to_device(batch.as_tensor_dict(), cuda_device)

        tokens = model_input["tokens"]
        label = model_input["label"]
        tokens_mask = util.get_text_field_mask(tokens)
        embedded_tokens = self.word_embeddings(tokens)

        output_dict = {}
        output_dict["embedding"] = embedded_tokens

        # target = None because the output is a single scalar
        return (embedded_tokens,), None, (tokens_mask, output_dict)
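# Sketch of how the returned triple feeds Captum's IntegratedGradients,
# assuming `model` is an instance of the class above and `captum_forward` is a
# wrapper (hypothetical name) that runs the rest of the model from embeddings.
from captum.attr import IntegratedGradients

inputs, baselines, additional = model.instances_to_captum_inputs(labeled_instances)
ig = IntegratedGradients(captum_forward)
attributions = ig.attribute(
    inputs,  # (embedded_tokens,)
    baselines=baselines,  # None, so Captum falls back to zero baselines
    additional_forward_args=additional,  # (tokens_mask, output_dict)
    target=None,  # the wrapped forward returns a single scalar per instance
)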
def explain_prediction(
    self, prediction: Dict[str, numpy.array], instance: Instance, n_steps: int
) -> Dict[str, Any]:
    dataset = Batch([instance])
    input_tokens_ids = dataset.as_tensor_dict()
    ig = IntegratedGradients(self._explain_embeddings)

    num_wrapping_dims = 0

    text_tokens = [
        token.text
        for token in cast(TextField, instance.get(self.forward_arg_name)).tokens
    ]
    text_tensor = input_tokens_ids.get(self.forward_arg_name)
    mask = get_text_field_mask(text_tensor, num_wrapping_dims=num_wrapping_dims)
    text_embeddings = self.backbone.embedder.forward(
        text_tensor, num_wrapping_dims=num_wrapping_dims
    )

    label_id = vocabulary.index_for_label(self.backbone.vocab, prediction["labels"][0])
    attributions, delta = ig.attribute(
        text_embeddings,
        n_steps=n_steps,
        target=label_id,
        additional_forward_args=mask,
        return_convergence_delta=True,
    )
    attributions = attributions.sum(dim=2).squeeze(0)
    attributions = attributions / torch.norm(attributions)
    attributions = attributions.detach().numpy()

    return {
        **prediction,
        "explain": {
            self.forward_arg_name: [
                {"token": token, "attribution": attribution}
                for token, attribution in zip(text_tokens, attributions)
            ]
        },
    }
def test_forward_with_weights(self):
    params = Params.from_file(self.param_file)
    reader: CopyNetDatasetReader = DatasetReader.from_params(
        params["dataset_reader"], serialization_dir=self.TEST_DIR
    )
    instances = [
        reader.text_to_instance("hello hello world", "hello world", weight=0.9),
        reader.text_to_instance("hello world", "hello world world", weight=0.5),
    ]
    for instance in instances:
        reader.apply_token_indexers(instance)

    batch = Batch(instances)
    batch.index_instances(self.model.vocab)
    inputs = batch.as_tensor_dict()
    assert "weight" in inputs
    _ = self.model(**inputs)
def test_metrics(pipeline):
    instance = pipeline.head.featurize(text="test this", label="a")
    batch = Batch([instance])
    batch.index_instances(pipeline.vocab)

    pipeline.head.forward(**batch.as_tensor_dict())

    # the validation metric should never have been called
    assert pipeline.head._metrics.get_dict()["accuracy"].total_count == 1
    assert pipeline.head._metrics.get_dict(is_train=False)["accuracy"].total_count == 0

    train_metrics = pipeline.head.get_metrics(reset=True)
    expected_metric_names = (
        ["accuracy"]
        + [
            f"{label}/{metric}"
            for label in ["micro", "macro"]
            for metric in ["precision", "recall", "fscore"]
        ]
        + [
            f"_{metric}/{label}"
            for metric in ["precision", "recall", "fscore"]
            for label in ["a", "b", "c", "d", "e", "f"]
        ]
    )
    assert all(name in train_metrics for name in expected_metric_names)

    pipeline.head.training = False
    pipeline.head.forward(**batch.as_tensor_dict())

    # the training metric should never have been called after its reset
    assert pipeline.head._metrics.get_dict()["accuracy"].total_count == 0
    assert pipeline.head._metrics.get_dict(is_train=False)["accuracy"].total_count == 1

    valid_metrics = pipeline.head.get_metrics()
    assert all(name in valid_metrics for name in expected_metric_names)
def make_vocab_from_params(
    params: Params, serialization_dir: str, print_statistics: bool = False
) -> Vocabulary:
    vocab_params = params.pop("vocabulary", {})
    os.makedirs(serialization_dir, exist_ok=True)
    vocab_dir = os.path.join(serialization_dir, "vocabulary")

    # `os.listdir` always returns a list, so check for non-emptiness (truthiness)
    # rather than `is not None`, which would be True even for an empty directory.
    if os.path.isdir(vocab_dir) and os.listdir(vocab_dir):
        raise ConfigurationError(
            "The 'vocabulary' directory in the provided serialization directory is non-empty"
        )

    datasets_for_vocab_creation: Optional[List[str]] = params.pop(
        "datasets_for_vocab_creation", None
    )

    # Do a quick sanity check here. There's no need to load any datasets if the vocab
    # type is "empty" or "from_files".
    if datasets_for_vocab_creation is None and vocab_params.get("type") in (
        "empty",
        "from_files",
    ):
        datasets_for_vocab_creation = []

    datasets: Dict[str, Dataset]
    if datasets_for_vocab_creation is None:
        # If `datasets_for_vocab_creation` was not specified, we'll use all datasets
        # from the config.
        datasets = datasets_from_params(params)
    else:
        for dataset_name in datasets_for_vocab_creation:
            data_path = f"{dataset_name}_data_path"
            if data_path not in params:
                raise ConfigurationError(
                    f"invalid 'datasets_for_vocab_creation' {dataset_name}"
                )
        datasets = datasets_from_params(
            params,
            train=("train" in datasets_for_vocab_creation),
            validation=("validation" in datasets_for_vocab_creation),
            test=("test" in datasets_for_vocab_creation),
        )

    instances: Iterable[Instance] = (
        instance
        for key, dataset in datasets.items()
        if datasets_for_vocab_creation is None or key in datasets_for_vocab_creation
        for instance in dataset
    )

    if print_statistics:
        instances = list(instances)

    vocab = Vocabulary.from_params(vocab_params, instances=instances)

    logger.info(f"writing the vocabulary to {vocab_dir}.")
    vocab.save_to_files(vocab_dir)
    logger.info("done creating vocab")

    if print_statistics:
        dataset = Batch(instances)
        dataset.index_instances(vocab)
        dataset.print_statistics()
        vocab.print_statistics()

    return vocab
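# Usage sketch, assuming an AllenNLP-style config with the usual keys
# ("dataset_reader", "train_data_path", "vocabulary", ...). The file path and
# serialization directory are hypothetical.
params = Params.from_file("experiment.jsonnet")
vocab = make_vocab_from_params(params, serialization_dir="/tmp/vocab_run", print_statistics=True)
print(vocab.get_vocab_size("tokens"))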
def test_read(self):
    from allennlp_models.vision.dataset_readers.vgqa import VGQAReader

    reader = VGQAReader(
        image_dir=FIXTURES_ROOT / "vision" / "images" / "vgqa",
        image_loader=TorchImageLoader(),
        image_featurizer=Lazy(NullGridEmbedder),
        region_detector=Lazy(RandomRegionDetector),
        tokenizer=WhitespaceTokenizer(),
        token_indexers={"tokens": SingleIdTokenIndexer()},
    )
    instances = list(reader.read("test_fixtures/vision/vgqa/question_answers.json"))
    assert len(instances) == 8

    instance = instances[0]
    assert len(instance.fields) == 6
    assert len(instance["question"]) == 5
    question_tokens = [t.text for t in instance["question"]]
    assert question_tokens == ["What", "is", "on", "the", "curtains?"]
    assert len(instance["labels"]) == 1
    labels = [field.label for field in instance["labels"].field_list]
    assert labels == ["sailboats"]

    batch = Batch(instances)
    batch.index_instances(Vocabulary())
    tensors = batch.as_tensor_dict()

    # (batch size, num boxes (fake), num features (fake))
    assert tensors["box_features"].size() == (8, 2, 10)
    # (batch size, num boxes (fake), 4 coords)
    assert tensors["box_coordinates"].size() == (8, 2, 4)
    # (batch size, num boxes (fake))
    assert tensors["box_mask"].size() == (8, 2)
    # Nothing should be masked out since the number of fake boxes is the same
    # for each item in the batch.
    assert tensors["box_mask"].all()