def test_loading_with_sampler(self):
    """Iterate a two-task loader that uses a weighted sampler and check its batches.

    The loader is built with a round-robin scheduler (batch size 4), sampler
    weights of 1 for task "a" and 2 for task "b", and 9 instances per epoch.
    The test pins the exact label tensors of the three batches produced and
    checks that the iterator is exhausted afterwards.
    """
    multitask_reader = MultiTaskDatasetReader(
        readers={"a": FakeDatasetReaderA(), "b": FakeDatasetReaderB()}
    )
    loader = MultiTaskDataLoader(
        reader=multitask_reader,
        # The fake readers are handed placeholder paths.
        data_path={"a": "ignored", "b": "ignored"},
        scheduler=RoundRobinScheduler(batch_size=4),
        sampler=WeightedSampler({"a": 1, "b": 2}),
        instances_per_epoch=9,
    )

    label_vocab = Vocabulary()
    label_vocab.add_tokens_to_namespace(["A", "B"], "labels")
    loader.index_with(label_vocab)

    batches = iter(loader)
    # 9 instances with a batch size of 4 -> batches of 4, 4 and 1 labels.
    for expected_labels in ([0, 1, 0, 1], [0, 1, 1, 1], [1]):
        current = next(batches)
        assert torch.all(current["label"] == torch.IntTensor(expected_labels))

    with pytest.raises(StopIteration):
        next(batches)
def labeled_json_to_labeled_instances(
        self, json_dict: JsonDict) -> Dict[int, Instance]:
    """Turn a JSON mapping of labeled entries into indexed ``Instance`` objects.

    The keys of ``json_dict`` are converted with ``int`` and visited in
    ascending numeric order. Each entry is read through its ``'words'`` (a
    list of objects with a ``'text'`` key) and ``'tags'`` fields. Consecutive
    entries are grouped into sequences: the length of a sequence is taken from
    the ``'words'`` list of its first entry, and the position passed to the
    tag field is the entry index relative to the start of its sequence.

    Returns
    -------
    A dict mapping each integer key to an instance holding a ``'tokens'``
    field and a ``'tags'`` field, indexed against a vocabulary local to this
    call.
    """
    seq_offset = 0  # integer key at which the current sequence starts
    seq_len = -1  # -1 marks "first entry of a new sequence not seen yet"
    adhoc_vocab = Vocabulary()
    instances = {}
    # Sort by the numeric value of the key while keeping the original string
    # key around for the dictionary lookup.
    for i, str_i in sorted(map((lambda x: (int(x), x)), json_dict.keys())):
        inst_obj = json_dict[str_i]
        if seq_len == -1:
            seq_len = len(inst_obj['words'])
            # NOTE(review): the nesting of the next two statements was
            # reconstructed from whitespace-collapsed text; confirm that the
            # text field and base instance are built once per sequence (and
            # shared by its entries) rather than once per entry.
            text_field = TextField(
                [Token(tok['text']) for tok in inst_obj['words']], {})
            instance = Instance({'tokens': text_field})
        new_instance = instance.duplicate()
        # The last argument is the position of this entry inside its sequence.
        tags_field = ConstructiveSupertagField(
            [json_to_cat(tag) for tag in inst_obj['tags']], text_field,
            [i - seq_offset])
        # Register the labels first so that indexing below can resolve them.
        adhoc_vocab.add_tokens_to_namespace(tags_field.labels, 'labels')
        new_instance.add_field('tags', tags_field)
        new_instance.index_fields(adhoc_vocab)
        instances[i] = new_instance
        # Last entry of the current sequence: move the offset forward and
        # reset the length so the next entry starts a new sequence.
        if i + 1 - seq_offset == seq_len:
            seq_offset += seq_len
            seq_len = -1
    return instances
def extend_labels(vocab: Vocabulary, labels: List[str]):
    """Register additional label strings in the labels namespace of a vocabulary.

    Handy when new labels have to be appended to an existing vocabulary, for
    instance to keep reusing the weights of an already existing classifier.

    Parameters
    ----------
    vocab: `allennlp.data.Vocabulary`
        The vocabulary that receives the labels.
    labels: `List[str]`
        The label strings to add to the existing vocabulary.
    """
    vocab.add_tokens_to_namespace(tokens=labels, namespace=LABELS_NAMESPACE)
def test_model_loads_weights_correctly(self):
    """Check that a ``VqaVilbert`` built from a Hugging Face model shares its weights.

    The text embeddings and the encoder of the model backbone are compared
    against the corresponding modules of the plain Hugging Face transformer,
    using the inverse of each module's default "huggingface" parameter mapping.
    """
    hf_model_name = "epwalsh/bert-xsmall-dummy"

    answer_vocab = Vocabulary()
    answer_vocab.add_tokens_to_namespace(
        ["orange", "net", "netting", "pitcher", "catcher"], "answers"
    )

    vilbert = VqaVilbert.from_huggingface_model_name(
        vocab=answer_vocab,
        model_name=hf_model_name,
        image_feature_dim=2048,
        image_num_hidden_layers=1,
        image_hidden_size=6,
        combined_hidden_size=10,
        pooled_output_dim=7,
        image_intermediate_size=11,
        image_attention_dropout=0.0,
        image_hidden_dropout=0.0,
        image_biattention_id=[0, 1],
        text_biattention_id=[0, 1],
        text_fixed_layer=0,
        image_fixed_layer=0,
        image_num_attention_heads=3,
        combined_num_attention_heads=2,
    )
    hf_transformer = AutoModel.from_pretrained(hf_model_name)

    def inverted_default_mapping(own_module, hf_module):
        # Swap keys and values of the module's default huggingface mapping.
        forward = own_module._construct_default_mapping(hf_module, "huggingface", {})
        return {target: source for source, target in forward.items()}

    # compare embedding parameters
    embeddings_mapping = inverted_default_mapping(
        vilbert.backbone.text_embeddings, hf_transformer.embeddings
    )
    assert_equal_parameters(
        hf_transformer.embeddings,
        vilbert.backbone.text_embeddings,
        mapping=embeddings_mapping,
    )

    # compare encoder parameters
    encoder_mapping = inverted_default_mapping(
        vilbert.backbone.encoder, hf_transformer.encoder
    )
    # We ignore the new parameters for the second modality, since they won't be present
    # in the huggingface model.
    assert_equal_parameters(
        hf_transformer.encoder,
        vilbert.backbone.encoder,
        ignore_missing=True,
        mapping=encoder_mapping,
    )
def test_text_to_instance_with_basic_tokenizer_and_indexer(self):
    """Build a next-token LM instance with the reader defaults and check its tensors.

    Verifies the token texts of the ``tokens`` and ``target_ids`` fields, then
    indexes the instance against a three-word vocabulary and checks the
    resulting ids.
    """
    reader = NextTokenLmReader()
    word_vocab = Vocabulary()
    word_vocab.add_tokens_to_namespace(["This", "is", "a"], "tokens")

    lm_instance = reader.text_to_instance(sentence="This is a", target="This")

    sentence_texts = [token.text for token in lm_instance["tokens"]]
    assert sentence_texts == ["This", "is", "a"]
    target_texts = [token.text for token in lm_instance["target_ids"]]
    assert target_texts == ["This"]

    lm_instance.index_fields(word_vocab)
    padding = lm_instance.get_padding_lengths()
    tensors = lm_instance.as_tensor_dict(padding)
    assert tensors.keys() == {"tokens", "target_ids"}

    # The first added word ("This") is expected at id 2, the others follow.
    sentence_ids = tensors["tokens"]["tokens"].numpy().tolist()
    assert sentence_ids == [2, 3, 4]
    target_ids = tensors["target_ids"]["tokens"].numpy().tolist()
    assert target_ids == [2]
def test_text_to_instance_with_basic_tokenizer_and_indexer(self):
    """Exercise ``NextTokenLmReader.text_to_instance`` with the reader defaults.

    The instance must expose the sentence tokens and the target token, and
    after indexing against a small vocabulary both fields must turn into the
    expected id lists.
    """
    reader = NextTokenLmReader()
    small_vocab = Vocabulary()
    small_vocab.add_tokens_to_namespace(['This', 'is', 'a'], 'tokens')

    built = reader.text_to_instance(sentence='This is a', target='This')

    def texts_of(field_name):
        # Collect the surface strings of the tokens stored in one field.
        return [token.text for token in built[field_name]]

    assert texts_of('tokens') == ['This', 'is', 'a']
    assert texts_of('target_ids') == ['This']

    built.index_fields(small_vocab)
    as_tensors = built.as_tensor_dict(built.get_padding_lengths())
    assert as_tensors.keys() == {'tokens', 'target_ids'}

    def ids_of(field_name):
        # Convert the indexed field tensor to a plain Python list.
        return as_tensors[field_name]['tokens'].numpy().tolist()

    assert ids_of('tokens') == [2, 3, 4]
    assert ids_of('target_ids') == [2]
def test_text_to_instance_with_basic_tokenizer_and_indexer(self):
    """Build a masked-LM instance with the reader defaults and check its tensors.

    The sentence contains a single ``[MASK]`` token at position 3. The test
    checks the token texts, the recorded mask position and the target, then
    indexes the instance and checks the resulting ids.
    """
    words = ["This", "is", "a", "[MASK]", "token", "."]

    reader = MaskedLanguageModelingReader()
    word_vocab = Vocabulary()
    word_vocab.add_tokens_to_namespace(
        ["This", "is", "a", "[MASK]", "token", "."], "tokens"
    )

    mlm_instance = reader.text_to_instance(
        sentence="This is a [MASK] token .", targets=["This"]
    )

    sentence_texts = [token.text for token in mlm_instance["tokens"]]
    assert sentence_texts == words
    mask_indices = [index_field.sequence_index for index_field in mlm_instance["mask_positions"]]
    assert mask_indices == [3]
    target_texts = [token.text for token in mlm_instance["target_ids"]]
    assert target_texts == ["This"]

    mlm_instance.index_fields(word_vocab)
    padding = mlm_instance.get_padding_lengths()
    tensors = mlm_instance.as_tensor_dict(padding)
    assert tensors.keys() == {"tokens", "mask_positions", "target_ids"}

    # The six added words are expected at consecutive ids starting from 2.
    assert tensors["tokens"]["tokens"].numpy().tolist() == [2, 3, 4, 5, 6, 7]
    assert tensors["target_ids"]["tokens"].numpy().tolist() == [2]
    assert tensors["mask_positions"].numpy().tolist() == [[3]]
def test_model_loads_weights_correctly(self):
    """Spot-check that ``VqaVilbert`` copies weights from the Hugging Face model.

    Two weight tensors are compared against the plain Hugging Face
    transformer: the word-embedding matrix and the intermediate dense layer of
    the first encoder layer.
    """
    hf_model_name = "epwalsh/bert-xsmall-dummy"

    answer_vocab = Vocabulary()
    answer_vocab.add_tokens_to_namespace(
        ["orange", "net", "netting", "pitcher", "catcher"], "answers"
    )

    vilbert = VqaVilbert.from_huggingface_model_name(
        vocab=answer_vocab,
        model_name=hf_model_name,
        image_feature_dim=2048,
        image_num_hidden_layers=1,
        image_hidden_size=6,
        combined_hidden_size=10,
        pooled_output_dim=7,
        image_intermediate_size=11,
        image_attention_dropout=0.0,
        image_hidden_dropout=0.0,
        image_biattention_id=[0, 1],
        text_biattention_id=[0, 1],
        text_fixed_layer=0,
        image_fixed_layer=0,
        image_num_attention_heads=3,
        combined_num_attention_heads=2,
    )
    hf_transformer = AutoModel.from_pretrained(hf_model_name)

    # compare embedding parameters
    expected_embeddings = hf_transformer.embeddings.word_embeddings.weight.data
    loaded_embeddings = (
        vilbert.backbone.text_embeddings.embeddings.word_embeddings.weight.data
    )
    assert_allclose(expected_embeddings, loaded_embeddings)

    # compare encoder parameters
    expected_dense = hf_transformer.encoder.layer[0].intermediate.dense.weight.data
    loaded_dense = vilbert.backbone.encoder.layers1[0].intermediate.dense.weight.data
    assert_allclose(expected_dense, loaded_dense)
def build_vocab_fixed_labels(labels: list,
                             instances: Iterable[Instance]) -> Vocabulary:
    """Create a vocabulary whose "labels" namespace is seeded before the data is read.

    The given labels are added to the "labels" namespace first, then the
    vocabulary is extended from ``instances``. The label indices are logged
    right after seeding and again after the extension so the two mappings can
    be compared in the log output.

    Parameters
    ----------
    labels : list
        Label tokens to register up front, in the given order.
    instances : Iterable[Instance]
        Instances used to extend the vocabulary afterwards.

    Returns
    -------
    The populated ``Vocabulary``.
    """
    logger.critical("Building the vocabulary")
    logger.critical("Initializing the labels namespace")
    vocabulary = Vocabulary()
    seeded_ids = vocabulary.add_tokens_to_namespace(labels, namespace="labels")
    logger.critical(f"Mapped them\n{labels}\n{seeded_ids}")

    logger.critical("Initializing the regular namespace")
    vocabulary.extend_from_instances(instances)

    # Look the labels up again to log the mapping after the extension.
    final_ids = []
    for label in labels:
        final_ids.append(vocabulary.get_token_index(label, namespace="labels"))
    logger.critical(f"Mapped them\n{labels}\n{final_ids}")
    return vocabulary
def test_text_to_instance_with_basic_tokenizer_and_indexer(self):
    """Exercise ``MaskedLanguageModelingReader.text_to_instance`` with the reader defaults.

    A sentence with one ``[MASK]`` token is converted to an instance; the test
    checks the token texts, the mask position and the target before and after
    indexing against a six-word vocabulary.
    """
    reader = MaskedLanguageModelingReader()
    small_vocab = Vocabulary()
    small_vocab.add_tokens_to_namespace(
        ['This', 'is', 'a', '[MASK]', 'token', '.'], 'tokens')

    built = reader.text_to_instance(
        sentence='This is a [MASK] token .', targets=['This'])

    sentence_texts = [token.text for token in built['tokens']]
    assert sentence_texts == ['This', 'is', 'a', '[MASK]', 'token', '.']
    mask_indices = [position.sequence_index for position in built['mask_positions']]
    assert mask_indices == [3]
    target_texts = [token.text for token in built['target_ids']]
    assert target_texts == ['This']

    built.index_fields(small_vocab)
    as_tensors = built.as_tensor_dict(built.get_padding_lengths())
    assert as_tensors.keys() == {'tokens', 'mask_positions', 'target_ids'}

    sentence_ids = as_tensors['tokens']['tokens'].numpy().tolist()
    assert sentence_ids == [2, 3, 4, 5, 6, 7]
    target_ids = as_tensors['target_ids']['tokens'].numpy().tolist()
    assert target_ids == [2]
    mask_tensor = as_tensors['mask_positions'].numpy().tolist()
    assert mask_tensor == [[3]]
def test_read(self):
    """Read the visual entailment fixture and check instances and batch tensors.

    Checks the number of instances, the fields of the first instance, and the
    sizes of the box tensors after batching and indexing.
    """
    from allennlp_models.vision.dataset_readers.visual_entailment import VisualEntailmentReader

    images_root = FIXTURES_ROOT / "vision" / "images" / "visual_entailment"
    reader = VisualEntailmentReader(
        image_dir=images_root,
        image_loader=TorchImageLoader(),
        image_featurizer=Lazy(NullGridEmbedder),
        region_detector=Lazy(RandomRegionDetector),
        tokenizer=WhitespaceTokenizer(),
        token_indexers={"tokens": SingleIdTokenIndexer()},
    )

    instances = list(
        reader.read("test_fixtures/vision/visual_entailment/sample_pairs.jsonl")
    )
    assert len(instances) == 16

    first = instances[0]
    assert len(first.fields) == 5
    assert len(first["hypothesis"]) == 4
    hypothesis_words = [token.text for token in first["hypothesis"]]
    assert hypothesis_words == ["A", "toddler", "sleeps", "outside."]
    assert first["labels"].label == "contradiction"

    batch = Batch(instances)
    label_vocab = Vocabulary()
    label_vocab.add_tokens_to_namespace(
        ["entailment", "contradiction", "neutral"], "labels"
    )
    batch.index_instances(label_vocab)
    tensors = batch.as_tensor_dict()

    expected_sizes = (
        # (batch size, num boxes (fake), num features (fake))
        ("box_features", (16, 2, 10)),
        # (batch size, num boxes (fake), 4 coords)
        ("box_coordinates", (16, 2, 4)),
        # (batch_size, num boxes (fake),)
        ("box_mask", (16, 2)),
    )
    for tensor_name, size in expected_sizes:
        assert tensors[tensor_name].size() == size
def predictions_to_labeled_instances(
    self,
    instances: Iterable[Instance],
    outputs: Dict[str, Union[numpy.ndarray, torch.Tensor,
                             Iterable[Union[str, Category]]]]
) -> List[Instance]:
    """Expand each predicted instance into one labeled instance per token position.

    For every input instance, one duplicate is created per position of its
    ``'tokens'`` field. Each duplicate receives a ``'tags'`` field holding the
    predicted tag for that position and a ``'probs'`` field holding the
    matching slice of the predicted probabilities, and is indexed against a
    vocabulary local to this call.

    Parameters
    ----------
    instances : Iterable[Instance]
        Instances that were fed to the model; each must have a ``'tokens'``
        text field.
    outputs : dict
        Model outputs; ``outputs['tags']`` and ``outputs['probs']`` are
        iterated in lockstep with ``instances``. The tags of one instance may
        be ``Category`` objects, strings (parsed with ``CategoryReader``), or
        anything else, which is passed to the first generator's
        ``extract_outputs``.

    Returns
    -------
    A flat list with one new instance per (input instance, token position).
    """
    predicted_tags = outputs['tags']
    predicted_probs = outputs['probs']
    adhoc_vocab = Vocabulary()
    new_instances = []
    cr = CategoryReader()
    gen = self._model.wrapped_model.generators[0]
    for instance, tags, probs in zip(instances, predicted_tags,
                                     predicted_probs):
        text_field: TextField = instance['tokens']
        length = text_field.sequence_length()
        if not length:
            # No positions to expand, so the tags are never inspected.
            continue

        # The representation of ``tags`` does not depend on the position, so
        # classify it once per instance instead of rescanning the whole
        # sequence for every token (which made the loop quadratic).
        tags_are_categories = all(isinstance(tag, Category) for tag in tags)
        tags_are_strings = (not tags_are_categories
                            and all(isinstance(tag, str) for tag in tags))

        for i in range(length):
            new_instance = instance.duplicate()
            # Slices keep the container type so each branch sees a
            # one-element sequence/array for position i.
            if tags_are_categories:
                cat = tags[i:i + 1]
            elif tags_are_strings:
                cat = [cr.read(tag) for tag in tags[i:i + 1]]
            else:
                cat = gen.extract_outputs(
                    numpy.expand_dims(tags[i:i + 1], 0))[0]
            tags_field = ConstructiveSupertagField(cat, text_field, [i])
            # Register the labels first so that indexing below can resolve them.
            adhoc_vocab.add_tokens_to_namespace(tags_field.labels, 'labels')
            new_instance.add_field('tags', tags_field)
            new_instance.add_field('probs', ArrayField(probs[i:i + 1]))
            new_instance.index_fields(adhoc_vocab)
            new_instances.append(new_instance)
    return new_instances
import torch

from allennlp.data import Vocabulary
from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
from allennlp.modules.token_embedders import Embedding

# Location of the pretrained 50-dimensional GloVe vectors used below.
glove_file = "https://allennlp.s3.amazonaws.com/datasets/glove/glove.6B.50d.txt.gz"

# A tiny vocabulary holding the words of the example sentence.
vocab = Vocabulary()
vocab.add_tokens_to_namespace(
    ["This", "is", "some", "text", "."], namespace="token_vocab"
)

# This is what gets created by TextField.as_tensor with a SingleIdTokenIndexer;
# see the exercises above.
token_tensor = {"tokens": {"tokens": torch.LongTensor([1, 3, 2, 1, 4, 3])}}

# This is for embedding each token.
embedding = Embedding(
    vocab=vocab,
    vocab_namespace="token_vocab",
    embedding_dim=50,
    pretrained_file=glove_file,
)

# The embedder key matches the outer key of ``token_tensor``.
embedder = BasicTextFieldEmbedder(token_embedders={"tokens": embedding})

embedded_tokens = embedder(token_tensor)
print(embedded_tokens.size())
import json
import argparse
from allennlp.data import Vocabulary

if __name__ == '__main__':
    # Both command-line options are mandatory string paths.
    parser = argparse.ArgumentParser()
    for option in ('--ontology-path', '--output-path'):
        parser.add_argument(option, type=str, required=True)
    args = parser.parse_args()

    with open(args.ontology_path) as f:
        ontology = json.load(f)

    vocab = Vocabulary()

    # Two fixed entries go into the span label namespace before the ontology ones.
    for fixed_token in ('None', '@@PADDING@@'):
        vocab.add_token_to_namespace(token=fixed_token, namespace='span_labels')

    # Ontology section -> vocabulary namespace that receives its keys.
    for section, target_namespace in (('args', 'span_labels'),
                                      ('events', 'event_labels')):
        vocab.add_tokens_to_namespace(tokens=list(ontology[section].keys()),
                                      namespace=target_namespace)

    vocab.save_to_files(args.output_path)
from allennlp.data.fields import TextField
from allennlp.data.token_indexers import SingleIdTokenIndexer, TokenCharactersIndexer
from allennlp.data.tokenizers import WordTokenizer, CharacterTokenizer
from allennlp.data import Vocabulary

# Word-level tokenization (as opposed to wordpieces or characters).
tokenizer = WordTokenizer()

# Each token is represented by one id looked up in the 'token_vocab' namespace.
token_indexer = SingleIdTokenIndexer(namespace='token_vocab')

# The vocabulary contains exactly the words of the example sentence.
vocab = Vocabulary()
vocab.add_tokens_to_namespace(
    ['This', 'is', 'some', 'text', '.'],
    namespace='token_vocab',
)

text = "This is some text."
tokens = tokenizer.tokenize(text)
print(tokens)

text_field = TextField(tokens, {'tokens': token_indexer})

# The TextField has to be told which Vocabulary to use before the token
# strings can be converted into integer ids.
text_field.index(vocab)

# Tensors are usually built for whole batches, which needs some padding
# computation first. The padding details are not important here.
padding_lengths = text_field.get_padding_lengths()

tensor_dict = text_field.as_tensor(padding_lengths)
print(tensor_dict)
def evaluate_transformers_checkpoint(
    data_path: str,
    model_config_path: str,
    checkpoint_model_name: str,
    checkpoint_tokenizer_name: str,
    batch_size: int,
    cuda_device: int,
    result_save_path: str,
):
    """Evaluate a Hugging Face LUKE entity-typing checkpoint through the AllenNLP model.

    The AllenNLP model is built from ``model_config_path`` with a vocabulary
    taken from the checkpoint, its classifier is replaced by the checkpoint's
    classifier, and the metrics over ``data_path`` are printed and optionally
    dumped as JSON.

    Expected results for ``test.json`` from the Open Entity dataset:
    {'micro_precision': 0.7997806072235107, 'micro_recall': 0.7657563090324402,
    'micro_fscore': 0.7823987007141113}.

    Parameters
    ----------
    data_path : str
        Data path to the input file.
    model_config_path : str
        A config file that defines the model architecture to evaluate.
    checkpoint_model_name : str
        The name of the checkpoint in Hugging Face Model Hub.
    checkpoint_tokenizer_name : str
        This should be the name of the base pre-training model because
        sometimes the tokenizer of the downstream task is not compatible
        with allennlp.
    batch_size : int
        Batch size handed to the data loader.
    cuda_device : int
        Index of the CUDA device to use; a negative value selects the CPU.
    result_save_path : str
        Where to write the metrics as JSON; nothing is written when ``None``.
    """
    import_module_and_submodules("examples_allennlp")

    # The entity marker has to be known to both the tokenizer and the indexer.
    special_token_kwargs = {"additional_special_tokens": [ENT]}
    allennlp_tokenizer = PretrainedTransformerTokenizer(
        model_name=checkpoint_tokenizer_name,
        add_special_tokens=True,
        tokenizer_kwargs=special_token_kwargs,
    )
    allennlp_indexer = PretrainedTransformerIndexer(
        model_name=checkpoint_tokenizer_name,
        tokenizer_kwargs=special_token_kwargs,
    )
    reader = EntityTypingReader(
        tokenizer=allennlp_tokenizer,
        token_indexers={"tokens": allennlp_indexer},
        use_entity_feature=True,
    )

    hf_tokenizer = LukeTokenizer.from_pretrained(checkpoint_model_name)
    hf_model = LukeForEntityClassification.from_pretrained(checkpoint_model_name)

    # Mirror the checkpoint's token vocabulary and label order.
    vocab = Vocabulary()
    vocab.add_transformer_vocab(hf_tokenizer, "tokens")
    id2label = hf_model.config.id2label
    ordered_labels = [id2label[label_id] for label_id in range(len(id2label))]
    vocab.add_tokens_to_namespace(ordered_labels, namespace="labels")

    # read model
    params = Params.from_file(
        model_config_path,
        ext_vars={"TRANSFORMERS_MODEL_NAME": checkpoint_model_name},
    )
    model = Model.from_params(params, vocab=vocab)
    model.classifier = hf_model.classifier
    model.eval()

    # set the GPU device to use
    device = torch.device("cpu" if cuda_device < 0 else f"cuda:{cuda_device}")
    model = model.to(device)

    loader = MultiProcessDataLoader(
        reader, data_path, batch_size=batch_size, shuffle=False
    )
    loader.index_with(model.vocab)

    with torch.no_grad():
        for batch in tqdm.tqdm(loader):
            # The forward output itself is not used; the metrics are read from
            # the model afterwards (presumably accumulated during forward).
            model(**nn_util.move_to_device(batch, device))

    metrics = model.get_metrics(reset=True)
    print(metrics)

    if result_save_path is None:
        return
    with open(result_save_path, "w") as f:
        json.dump(metrics, f)
from allennlp.data.fields import TextField
from allennlp.data.token_indexers import SingleIdTokenIndexer, TokenCharactersIndexer
from allennlp.data.tokenizers import WordTokenizer, CharacterTokenizer
from allennlp.data import Vocabulary

# Splits text into words (instead of wordpieces or characters).
tokenizer = WordTokenizer()

# Represents each token with both an id from a vocabulary and a sequence of characters.
token_indexers = {
    'tokens': SingleIdTokenIndexer(namespace='token_vocab'),
    'token_characters': TokenCharactersIndexer(namespace='character_vocab')
}

# One namespace per indexer: whole words for the single-id indexer and
# individual characters for the character indexer.
vocab = Vocabulary()
vocab.add_tokens_to_namespace(['This', 'is', 'some', 'text', '.'],
                              namespace='token_vocab')
vocab.add_tokens_to_namespace(
    ['T', 'h', 'i', 's', ' ', 'o', 'm', 'e', 't', 'x', '.'],
    namespace='character_vocab')

text = "This is some text."
tokens = tokenizer.tokenize(text)
print(tokens)

# Pass the dict of both indexers defined above. The name `token_indexer`
# (singular) is not defined in this script, so using it raised a NameError.
text_field = TextField(tokens, token_indexers)

# In order to convert the token strings into integer ids, we need to tell the
# TextField what Vocabulary to use.
text_field.index(vocab)

# We typically batch things together when making tensors, which requires some
def evaluate_transformers_checkpoint(
    data_path: str,
    model_config_path: str,
    checkpoint_model_name: str,
    checkpoint_tokenizer_name: str,
    batch_size: int,
    cuda_device: int,
    result_save_path: str,
    prediction_save_path: str,
):
    """Evaluate a Hugging Face LUKE entity-span checkpoint through the AllenNLP model.

    The AllenNLP model is built from ``model_config_path`` with a vocabulary
    taken from the checkpoint, its classifier is replaced by the checkpoint's
    classifier, and the metrics over ``data_path`` are printed and optionally
    dumped as JSON.

    Expected results for CoNLL-2003 NER English test set.
    {'f1': 0.9461946902654867, 'precision': 0.945859872611465,
    'recall': 0.9465297450424929}

    Parameters
    ----------
    data_path : str
        Data path to the input file.
    model_config_path : str
        A config file that defines the model architecture to evaluate.
    checkpoint_model_name : str
        The name of the checkpoint in Hugging Face Model Hub.
    checkpoint_tokenizer_name : str
        This should be the name of the base pre-training model because
        sometimes the tokenizer of the downstream task is not compatible
        with allennlp.
    batch_size : int
        Batch size handed to the data loader.
    cuda_device : int
        Index of the CUDA device to use; a negative value selects the CPU.
    result_save_path : str
        Where to write the metrics as JSON; nothing is written when ``None``.
    prediction_save_path : str
        When not ``None``, stored in the model params under the
        "prediction_save_path" key before the model is constructed.
    """
    import_module_and_submodules("examples_allennlp")

    allennlp_tokenizer = PretrainedTransformerTokenizer(
        model_name=checkpoint_tokenizer_name,
        add_special_tokens=False,
        tokenizer_kwargs={"add_prefix_space": True},
    )
    allennlp_indexer = PretrainedTransformerIndexer(
        model_name=checkpoint_tokenizer_name
    )
    reader = ConllSpanReader(
        tokenizer=allennlp_tokenizer,
        token_indexers={"tokens": allennlp_indexer},
        use_entity_feature=True,
    )

    hf_tokenizer = LukeTokenizer.from_pretrained(checkpoint_model_name)
    hf_model = LukeForEntitySpanClassification.from_pretrained(
        checkpoint_model_name
    )

    # Mirror the checkpoint's token vocabulary and label order.
    vocab = Vocabulary()
    vocab.add_transformer_vocab(hf_tokenizer, "tokens")
    id2label = hf_model.config.id2label
    checkpoint_labels = [id2label[label_id] for label_id in range(len(id2label))]
    # The checkpoint's "NIL" label is registered under the name "O".
    labels = ["O" if label == "NIL" else label for label in checkpoint_labels]
    vocab.add_tokens_to_namespace(labels, namespace="labels")

    # read model
    params = Params.from_file(
        model_config_path,
        ext_vars={"TRANSFORMERS_MODEL_NAME": checkpoint_model_name},
    )
    if prediction_save_path is not None:
        params["prediction_save_path"] = prediction_save_path
    model = Model.from_params(params, vocab=vocab)
    model.classifier = hf_model.classifier
    model.eval()

    # set the GPU device to use
    device = torch.device("cpu" if cuda_device < 0 else f"cuda:{cuda_device}")
    model = model.to(device)

    loader = MultiProcessDataLoader(
        reader, data_path, batch_size=batch_size, shuffle=False
    )
    loader.index_with(model.vocab)

    with torch.no_grad():
        for batch in tqdm.tqdm(loader):
            # The forward output itself is not used; the metrics are read from
            # the model afterwards (presumably accumulated during forward).
            model(**nn_util.move_to_device(batch, device))

    metrics = model.get_metrics(reset=True)
    print(metrics)

    if result_save_path is None:
        return
    with open(result_save_path, "w") as f:
        json.dump(metrics, f)