def pre_tokenizer(self, replacement, add_prefix_space):
    # Split on whitespace first, then let Metaspace mark word starts with the
    # replacement character (e.g. "▁"). WhitespaceSplit discards the whitespace
    # itself, so runs of spaces never reach the Metaspace step.
    return pre_tokenizers.Sequence(
        [
            pre_tokenizers.WhitespaceSplit(),
            pre_tokenizers.Metaspace(
                replacement=replacement, add_prefix_space=add_prefix_space
            ),
        ]
    )
def pre_tokenizer(self, replacement, add_prefix_space):
    # Metaspace alone: every space is replaced with the replacement character,
    # and the input is split so that each piece begins at a word boundary.
    return pre_tokenizers.Metaspace(
        replacement=replacement, add_prefix_space=add_prefix_space
    )
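# A minimal sketch of how the two pre-tokenizers above differ, assuming a
# tokenizers release whose Metaspace still accepts `add_prefix_space` (newer
# releases expose `prepend_scheme` instead). The sample string is illustrative.
from tokenizers import pre_tokenizers

metaspace_only = pre_tokenizers.Metaspace(replacement="▁", add_prefix_space=True)
whitespace_then_metaspace = pre_tokenizers.Sequence(
    [
        pre_tokenizers.WhitespaceSplit(),
        pre_tokenizers.Metaspace(replacement="▁", add_prefix_space=True),
    ]
)

text = "Hello  world"  # note the double space
# Metaspace alone keeps every space as its own "▁" piece, so the run of spaces
# survives as an extra token; adding WhitespaceSplit first collapses it.
print(metaspace_only.pre_tokenize_str(text))
print(whitespace_then_metaspace.pre_tokenize_str(text))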
def __init__(
    self,
    vocab: Union[str, List],
    merges: List[Tuple[str, str]],
    bos_token: str = "<s>",
    eos_token: str = "</s>",
    sep_token: str = "</s>",
    cls_token: str = "<s>",
    pad_token: str = "<pad>",
    unk_token: str = "<unk>",
    replacement: str = "▁",
    add_prefix_space: bool = True,
    dropout: Optional[float] = None,
    normalize: bool = True,
):
    # BPE model over a SentencePiece-style vocabulary; unknown pieces are
    # fused into a single <unk> token. Dropout is forwarded to the model so
    # the constructor argument actually takes effect.
    bpe = BPE(
        vocab=vocab,
        merges=merges,
        dropout=dropout,
        unk_token=unk_token,
        fuse_unk=True,
    )
    tokenizer = Tokenizer(bpe)

    # Metaspace on both ends: the pre-tokenizer marks word starts with the
    # replacement character, and the decoder strips it back out.
    tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(
        replacement=replacement,
        add_prefix_space=add_prefix_space,
    )
    tokenizer.decoder = decoders.Metaspace(
        replacement=replacement,
        add_prefix_space=add_prefix_space,
    )
    if normalize:
        tokenizer.normalizer = NFKC()

    parameters = {
        "model": "SentencePieceBPE",
        "unk_token": unk_token,
        "replacement": replacement,
        "add_prefix_space": add_prefix_space,
        "dropout": dropout,
    }
    super().__init__(tokenizer, parameters)

    # Register the special tokens so they are never split by the model.
    bos_token = AddedToken(bos_token, lstrip=False, rstrip=False)
    eos_token = AddedToken(eos_token, lstrip=False, rstrip=False)
    sep_token = AddedToken(sep_token, lstrip=False, rstrip=False)
    cls_token = AddedToken(cls_token, lstrip=False, rstrip=False)
    unk_token = AddedToken(unk_token, lstrip=False, rstrip=False)
    pad_token = AddedToken(pad_token, lstrip=False, rstrip=False)
    self.add_special_tokens(
        [
            bos_token,
            eos_token,
            sep_token,
            cls_token,
            unk_token,
            pad_token,
        ]
    )
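# A hedged usage sketch, assuming the __init__ above belongs to a class named
# SentencePieceBPETokenizer (a hypothetical name inferred from the "model"
# entry in `parameters`) that subclasses tokenizers' BaseTokenizer. The toy
# vocab/merges below are illustrative only, not a trained model.
toy_vocab = {
    "<unk>": 0, "<s>": 1, "</s>": 2, "<pad>": 3,
    "▁": 4, "h": 5, "i": 6, "▁h": 7, "▁hi": 8,
}
toy_merges = [("▁", "h"), ("▁h", "i")]

tok = SentencePieceBPETokenizer(vocab=toy_vocab, merges=toy_merges)
enc = tok.encode("hi")
print(enc.tokens)        # with the toy merges this should collapse to ["▁hi"]
print(tok.decode(enc.ids))  # the Metaspace decoder strips "▁" back to "hi"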