コード例 #1
0
ファイル: vocabulary_test.py プロジェクト: himkt/allennlp
 def _get_expected_vocab(dataset, namespace, model_name):
     """Return the set of tokens expected under ``namespace``.

     The result is the union of two token sets:

     * the tokens ``Vocabulary.from_instances`` records under ``namespace``
       for ``dataset``;
     * the tokens ``Vocabulary.from_pretrained_transformer`` records under
       the same namespace for ``model_name``.
     """
     # Iterating a token-to-index mapping yields its keys, i.e. the tokens.
     dataset_vocab = Vocabulary.from_instances(dataset)
     dataset_side = set(dataset_vocab._token_to_index[namespace])

     pretrained_vocab = Vocabulary.from_pretrained_transformer(
         model_name, namespace)
     pretrained_side = set(pretrained_vocab._token_to_index[namespace])

     return dataset_side | pretrained_side
コード例 #2
0
    def test_from_pretrained_transformer(self, model_name):
        """Check a transformer-derived vocabulary against its tokenizer.

        The ``"tokens"`` namespace of the vocabulary built from ``model_name``
        must equal the tokenizer's own vocabulary, both right after
        construction and after being saved to and reloaded from
        ``TEST_DIR / "vocab"``.
        """
        from allennlp.common import cached_transformers

        namespace = "tokens"
        vocab_dir = self.TEST_DIR / "vocab"
        tokenizer = cached_transformers.get_tokenizer(model_name)

        # Freshly built vocabulary must mirror the tokenizer's mapping.
        built = Vocabulary.from_pretrained_transformer(
            model_name, namespace=namespace
        )
        assert built._token_to_index[namespace] == tokenizer.get_vocab()

        # The mapping must be unchanged after a save/load round trip.
        built.save_to_files(vocab_dir)
        reloaded = Vocabulary.from_files(vocab_dir)
        assert reloaded._token_to_index[namespace] == tokenizer.get_vocab()