def test_instantiate(self): assert Metaspace() is not None assert Metaspace(replacement="-") is not None with pytest.raises(Exception, match="replacement must be a character"): Metaspace(replacement="") assert Metaspace(add_prefix_space=True) is not None assert isinstance(Metaspace(), PreTokenizer)
def test_can_modify(self): pretok = Metaspace(replacement="$", add_prefix_space=False) assert pretok.replacement == "$" assert pretok.add_prefix_space == False # Modify these pretok.replacement = "%" assert pretok.replacement == "%" pretok.add_prefix_space = True assert pretok.add_prefix_space == True
def test_instantiate(self): assert Metaspace() is not None assert Metaspace(replacement="-") is not None with pytest.raises(ValueError, match="expected a string of length 1"): Metaspace(replacement="") assert Metaspace(add_prefix_space=True) is not None assert isinstance(Metaspace(), PreTokenizer) assert isinstance(Metaspace(), Metaspace) assert isinstance(pickle.loads(pickle.dumps(Metaspace())), Metaspace)
def converted(self): tokenizer = self.tokenizer(self.proto) # Tokenizer assemble tokenizer.normalizer = self.normalizer(self.proto) replacement = "▁" add_prefix_space = True tokenizer.pre_tokenizer = Metaspace(replacement=replacement, add_prefix_space=add_prefix_space) tokenizer.decoder = decoders.Metaspace( replacement=replacement, add_prefix_space=add_prefix_space) post_processor = self.post_processor(tokenizer) if post_processor: tokenizer.post_processor = post_processor # TODO what parameters should we give ? parameters = {} return BaseTokenizer(tokenizer, parameters)