Esempio n. 1
0
 def test_instantiate(self):
     """Metaspace builds with default or custom arguments and is a PreTokenizer."""
     # Default construction and a custom replacement both yield an object
     default_pretok = Metaspace()
     assert default_pretok is not None
     dashed_pretok = Metaspace(replacement="-")
     assert dashed_pretok is not None

     # An empty replacement string must be rejected with the expected message
     with pytest.raises(Exception, match="replacement must be a character"):
         Metaspace(replacement="")

     prefixed_pretok = Metaspace(add_prefix_space=True)
     assert prefixed_pretok is not None

     # A fresh instance is recognised as a PreTokenizer
     fresh_pretok = Metaspace()
     assert isinstance(fresh_pretok, PreTokenizer)
    def test_can_modify(self):
        """Metaspace attributes reflect constructor arguments and can be reassigned."""
        initial_replacement = "$"
        updated_replacement = "%"
        pretok = Metaspace(replacement=initial_replacement, add_prefix_space=False)

        # Values passed to the constructor are readable back
        assert pretok.replacement == initial_replacement
        assert pretok.add_prefix_space == False

        # Reassign each attribute in turn and check the new value sticks
        pretok.replacement = updated_replacement
        assert pretok.replacement == updated_replacement
        pretok.add_prefix_space = True
        assert pretok.add_prefix_space == True
Esempio n. 3
0
 def test_instantiate(self):
     """Metaspace builds, validates its replacement, and survives a pickle round trip."""
     # Default construction and a custom replacement both yield an object
     default_pretok = Metaspace()
     assert default_pretok is not None
     dashed_pretok = Metaspace(replacement="-")
     assert dashed_pretok is not None

     # An empty replacement string must raise ValueError with the expected message
     with pytest.raises(ValueError, match="expected a string of length 1"):
         Metaspace(replacement="")

     prefixed_pretok = Metaspace(add_prefix_space=True)
     assert prefixed_pretok is not None

     # Type checks against both the base class and the concrete class
     assert isinstance(Metaspace(), PreTokenizer)
     assert isinstance(Metaspace(), Metaspace)

     # Serialising and deserialising gives back a Metaspace instance
     serialized = pickle.dumps(Metaspace())
     restored = pickle.loads(serialized)
     assert isinstance(restored, Metaspace)
Esempio n. 4
0
    def converted(self):
        """Assemble a tokenizer from ``self.proto`` and wrap it in a BaseTokenizer.

        The normalizer, a Metaspace pre-tokenizer and a Metaspace decoder are
        attached in that order; a post-processor is attached only when
        ``self.post_processor`` returns a truthy value.
        """
        assembled = self.tokenizer(self.proto)

        # Tokenizer assemble
        assembled.normalizer = self.normalizer(self.proto)

        # The pre-tokenizer and the decoder share the same Metaspace settings
        metaspace_kwargs = {"replacement": "▁", "add_prefix_space": True}
        assembled.pre_tokenizer = Metaspace(**metaspace_kwargs)
        assembled.decoder = decoders.Metaspace(**metaspace_kwargs)

        processor = self.post_processor(assembled)
        if processor:
            assembled.post_processor = processor

        # TODO what parameters should we give ?
        return BaseTokenizer(assembled, {})