def __init__(self,
             vocab_file: Optional[str] = None,
             merges_file: Optional[str] = None,
             unk_token: Optional[str] = "<unk>",
             suffix: Optional[str] = "</w>",
             dropout: Optional[float] = None):
    """Instantiate a character-level BPE tokenizer.

    Args:
        vocab_file: path to a trained BPE vocabulary; with ``merges_file``,
            loads a trained model, otherwise an empty BPE model is used.
        merges_file: path to the matching BPE merges file.
        unk_token: token substituted for out-of-vocabulary symbols.
        suffix: end-of-word suffix appended by the model and stripped
            by the decoder.
        dropout: optional BPE-dropout probability forwarded to the model.
    """
    # Only load a trained model when both files were supplied.
    if vocab_file is None or merges_file is None:
        tokenizer = Tokenizer(BPE.empty())
    else:
        model = BPE.from_files(vocab_file,
                               merges_file,
                               dropout=dropout,
                               unk_token=unk_token,
                               end_of_word_suffix=suffix)
        tokenizer = Tokenizer(model)

    # Normalization pipeline: NFKC unicode normalization, then lowercasing.
    tokenizer.normalizer = Sequence.new([NFKC.new(), Lowercase.new()])
    tokenizer.pre_tokenizer = pre_tokenizers.Whitespace.new()
    tokenizer.decoder = decoders.BPEDecoder.new(suffix=suffix)

    # Record the configuration for the base class (repr/serialization).
    parameters = {
        "model": "BPE",
        "unk_token": unk_token,
        "suffix": suffix,
        "dropout": dropout,
    }
    super().__init__(tokenizer, parameters)
def __init__(self,
             vocab_file: Optional[str] = None,
             merges_file: Optional[str] = None,
             add_prefix_space: bool = False,
             do_lowercase: bool = False,
             unicode_normalizer: Optional[str] = None,
             continuing_subword_prefix: Optional[str] = None,
             end_of_word_suffix: Optional[str] = None):
    """Instantiate a byte-level BPE tokenizer.

    Args:
        vocab_file: path to a trained BPE vocabulary; with ``merges_file``,
            loads a trained model, otherwise an empty BPE model is used.
        merges_file: path to the matching BPE merges file.
        add_prefix_space: whether the byte-level pre-tokenizer should add
            a leading space to the input.
        do_lowercase: whether to lowercase input during normalization.
        unicode_normalizer: optional name of a unicode normalizer
            (resolved via ``unicode_normalizer_from_str``).
        continuing_subword_prefix: prefix marking subwords that continue
            a word; ``None`` is treated as the empty string.
        end_of_word_suffix: suffix marking word-final subwords; ``None``
            is treated as the empty string.
    """
    if vocab_file is not None and merges_file is not None:
        tokenizer = Tokenizer(
            BPE.from_files(
                vocab_file,
                merges_file,
                continuing_subword_prefix=continuing_subword_prefix or "",
                end_of_word_suffix=end_of_word_suffix or ""))
    else:
        tokenizer = Tokenizer(BPE.empty())

    # Check for Unicode normalization first (before everything else)
    normalizers = []
    if unicode_normalizer:
        normalizers.append(unicode_normalizer_from_str(unicode_normalizer))
    if do_lowercase:
        normalizers.append(Lowercase.new())

    # A single normalizer is attached directly; several are composed
    # into a Sequence (unicode normalization runs before lowercasing).
    if len(normalizers) == 1:
        tokenizer.normalizer = normalizers[0]
    elif len(normalizers) > 1:
        tokenizer.normalizer = Sequence.new(normalizers)

    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel.new(
        add_prefix_space=add_prefix_space)
    tokenizer.decoder = decoders.ByteLevel.new()

    # Fix: record the FULL configuration, matching the char-level BPE
    # __init__ in this file. The original recorded only "model" and
    # "add_prefix_space", silently dropping the other settings from
    # whatever the base class does with `parameters` (repr/serialization).
    parameters = {
        "model": "ByteLevelBPE",
        "add_prefix_space": add_prefix_space,
        "do_lowercase": do_lowercase,
        "unicode_normalizer": unicode_normalizer,
        "continuing_subword_prefix": continuing_subword_prefix,
        "end_of_word_suffix": end_of_word_suffix,
    }
    super().__init__(tokenizer, parameters)