def _get_expected_vocab(dataset, namespace, model_name):
    """Return the set of tokens expected in `namespace`.

    The result is the union of two token sets, both read from the
    `namespace` entry of a vocabulary's token-to-index mapping:

    * the vocabulary built with `Vocabulary.from_instances(dataset)`, and
    * the vocabulary built with
      `Vocabulary.from_pretrained_transformer(model_name, namespace)`.
    """
    dataset_vocab = Vocabulary.from_instances(dataset)
    # Iterating a mapping yields its keys, i.e. the token strings.
    tokens_from_dataset = set(dataset_vocab._token_to_index[namespace])

    pretrained_vocab = Vocabulary.from_pretrained_transformer(model_name, namespace)
    tokens_from_transformer = set(pretrained_vocab._token_to_index[namespace])

    return tokens_from_dataset | tokens_from_transformer
def test_from_pretrained_transformer(self, model_name):
    """Check a transformer-derived vocabulary against the tokenizer's vocab.

    The `"tokens"` namespace of the vocabulary built by
    `Vocabulary.from_pretrained_transformer` must equal
    `tokenizer.get_vocab()`, both right after construction and after the
    vocabulary has been saved to `self.TEST_DIR / "vocab"` and loaded back.
    """
    from allennlp.common import cached_transformers

    namespace = "tokens"
    save_path = self.TEST_DIR / "vocab"
    tokenizer = cached_transformers.get_tokenizer(model_name)

    # Freshly constructed vocabulary must mirror the tokenizer's mapping.
    original = Vocabulary.from_pretrained_transformer(model_name, namespace=namespace)
    assert original._token_to_index[namespace] == tokenizer.get_vocab()

    # The same must hold after a save/load round trip through disk.
    original.save_to_files(save_path)
    reloaded = Vocabulary.from_files(save_path)
    assert reloaded._token_to_index[namespace] == tokenizer.get_vocab()