def __init__(self,
             vocab_file: Optional[str] = None,
             merges_file: Optional[str] = None,
             unk_token: Optional[str] = "<unk>",
             suffix: Optional[str] = "</w>",
             dropout: Optional[float] = None):
    if vocab_file is not None and merges_file is not None:
        tokenizer = Tokenizer(
            BPE.from_files(vocab_file,
                           merges_file,
                           dropout=dropout,
                           unk_token=unk_token,
                           end_of_word_suffix=suffix))
    else:
        tokenizer = Tokenizer(BPE.empty())

    tokenizer.normalizer = Sequence.new([NFKC.new(), Lowercase.new()])
    tokenizer.pre_tokenizer = pre_tokenizers.Whitespace.new()
    tokenizer.decoder = decoders.BPEDecoder.new(suffix=suffix)

    parameters = {
        "model": "BPE",
        "unk_token": unk_token,
        "suffix": suffix,
        "dropout": dropout,
    }

    super().__init__(tokenizer, parameters)
def __init__(
    self,
    vocab: Optional[Union[str, Dict[str, int]]] = None,
    merges: Optional[Union[str, Dict[Tuple[int, int], Tuple[int, int]]]] = None,
    unk_token: Union[str, AddedToken] = "<unk>",
    replacement: str = "▁",
    add_prefix_space: bool = True,
    dropout: Optional[float] = None,
):
    if vocab is not None and merges is not None:
        tokenizer = Tokenizer(
            BPE(vocab, merges, dropout=dropout, unk_token=unk_token))
    else:
        tokenizer = Tokenizer(BPE())

    if tokenizer.token_to_id(str(unk_token)) is not None:
        tokenizer.add_special_tokens([str(unk_token)])

    tokenizer.normalizer = NFKC()
    tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(
        replacement=replacement, add_prefix_space=add_prefix_space)
    tokenizer.decoder = decoders.Metaspace(
        replacement=replacement, add_prefix_space=add_prefix_space)

    parameters = {
        "model": "SentencePieceBPE",
        "unk_token": unk_token,
        "replacement": replacement,
        "add_prefix_space": add_prefix_space,
        "dropout": dropout,
    }

    super().__init__(tokenizer, parameters)
def train_tokenizer(args):
    """Train a tokenizer according to the given configuration.

    Arguments:
        args {dictionary} -- arguments object (OmegaConf-style config)
    """
    # Optional morpheme-level pre-tokenization for Korean
    morpheme_func = None
    if args.tokenizer.pretokenizer_type == "khaiii":
        api = KhaiiiApi()
        morpheme_func = api.analyze
    elif args.tokenizer.pretokenizer_type == "mecab":
        mecab = Mecab()
        morpheme_func = mecab.morphs

    # tokenizer-type: one of ["bbpe", "cbpe", "wp"], default "bbpe"
    if args.tokenizer.tokenizer_type == "bbpe":
        # tokenizer = ByteLevelBPETokenizer()
        tokenizer = Tokenizer(BPE())
        # tokenizer.pre_tokenizer = BertPreTokenizer()
        trainer = BpeTrainer(
            special_tokens=omegalist_to_list(args.tokenizer.special_tokens),
            vocab_size=args.tokenizer.vocab_size,
            min_frequency=args.tokenizer.min_frequency,
        )
    elif args.tokenizer.tokenizer_type == "cbpe":
        tokenizer = Tokenizer(BPE())
        # CharDelimiterSplit must be instantiated with a delimiter
        # (a space is assumed here).
        tokenizer.pre_tokenizer = CharDelimiterSplit(" ")
        trainer = BpeTrainer(
            special_tokens=omegalist_to_list(args.tokenizer.special_tokens),
            vocab_size=args.tokenizer.vocab_size,
            min_frequency=args.tokenizer.min_frequency,
        )
    elif args.tokenizer.tokenizer_type == "wp":
        tokenizer = Tokenizer(WordPiece())
        # tokenizer.pre_tokenizer = Whitespace()
        trainer = WordPieceTrainer(
            special_tokens=omegalist_to_list(args.tokenizer.special_tokens),
            vocab_size=args.tokenizer.vocab_size,
            min_frequency=args.tokenizer.min_frequency,
        )

    # train_from_iterator expects the trainer as its second argument
    tokenizer.train_from_iterator(get_pretokenize_generator(morpheme_func),
                                  trainer)
    tokenizer.save(f"../vocab/{args.tokenizer.tokenizer_type}.vocab")

    test_string = "안녕하세요 이것은 테스트입니다. 구름은 하늘에 떠 있고 우리는 여기있어"
    output = tokenizer.encode(test_string)
    print(f"output:{output}")
    print(f"tokens:{output.tokens}")
    print(f"ids   :{output.ids}")
    print(f"offset:{output.offsets}")
    print(f"decode:{tokenizer.decode(output.ids)}")

    datasets = get_datasets(args.tokenizer.data_path)
    for line in datasets:
        print(line)
        break
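# Illustrative only: a minimal OmegaConf config with the fields train_tokenizer
# reads above (tokenizer.pretokenizer_type, tokenizer_type, special_tokens,
# vocab_size, min_frequency, data_path). The exact schema of the original
# project's config is an assumption.
from omegaconf import OmegaConf

example_args = OmegaConf.create({
    "tokenizer": {
        "pretokenizer_type": "mecab",  # "khaiii", "mecab", or anything else to disable
        "tokenizer_type": "bbpe",      # "bbpe", "cbpe", or "wp"
        "special_tokens": ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"],
        "vocab_size": 32000,
        "min_frequency": 2,
        "data_path": "../data/corpus.txt",
    }
})
# train_tokenizer(example_args)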
def test_has_expected_type_and_methods(self):
    tokenizer = Tokenizer(BPE())
    assert type(tokenizer) == Tokenizer
    assert callable(tokenizer.num_special_tokens_to_add)
    assert callable(tokenizer.get_vocab)
    assert callable(tokenizer.get_vocab_size)
    assert callable(tokenizer.enable_truncation)
    assert callable(tokenizer.no_truncation)
    assert callable(tokenizer.enable_padding)
    assert callable(tokenizer.no_padding)
    assert callable(tokenizer.encode)
    assert callable(tokenizer.encode_batch)
    assert callable(tokenizer.decode)
    assert callable(tokenizer.decode_batch)
    assert callable(tokenizer.token_to_id)
    assert callable(tokenizer.id_to_token)
    assert callable(tokenizer.add_tokens)
    assert callable(tokenizer.add_special_tokens)
    assert callable(tokenizer.train)
    assert callable(tokenizer.post_process)
    assert isinstance(tokenizer.model, Model)
    assert tokenizer.normalizer is None
    assert tokenizer.pre_tokenizer is None
    assert tokenizer.post_processor is None
    assert tokenizer.decoder is None
    assert isinstance(pickle.loads(pickle.dumps(Tokenizer(BPE()))), Tokenizer)
def __init__(self,
             vocab_file: Optional[str] = None,
             merges_file: Optional[str] = None,
             unk_token: str = "<unk>",
             replacement: str = "▁",
             add_prefix_space: bool = True,
             dropout: Optional[float] = None):
    if vocab_file is not None and merges_file is not None:
        tokenizer = Tokenizer(
            BPE.from_files(vocab_file,
                           merges_file,
                           dropout=dropout,
                           unk_token=unk_token))
    else:
        tokenizer = Tokenizer(BPE.empty())

    tokenizer.add_special_tokens([unk_token])

    tokenizer.normalizer = NFKC.new()
    tokenizer.pre_tokenizer = pre_tokenizers.Metaspace.new(
        replacement=replacement, add_prefix_space=add_prefix_space)
    tokenizer.decoder = decoders.Metaspace.new(
        replacement=replacement, add_prefix_space=add_prefix_space)

    parameters = {
        "model": "SentencePieceBPE",
        "unk_token": unk_token,
        "replacement": replacement,
        "add_prefix_space": add_prefix_space,
        "dropout": dropout,
    }

    super().__init__(tokenizer, parameters)
def __init__(
    self,
    vocab: Optional[Union[str, Dict[str, int]]] = None,
    merges: Optional[Union[str, Dict[Tuple[int, int], Tuple[int, int]]]] = None,
    add_prefix_space: bool = False,
    lowercase: bool = False,
    dropout: Optional[float] = None,
    unicode_normalizer: Optional[str] = None,
    continuing_subword_prefix: Optional[str] = None,
    end_of_word_suffix: Optional[str] = None,
    trim_offsets: bool = False,
):
    if vocab is not None and merges is not None:
        tokenizer = Tokenizer(
            BPE(
                vocab,
                merges,
                dropout=dropout,
                continuing_subword_prefix=continuing_subword_prefix or "",
                end_of_word_suffix=end_of_word_suffix or "",
            ))
    else:
        tokenizer = Tokenizer(BPE())

    # Check for Unicode normalization first (before everything else)
    normalizers = []

    if unicode_normalizer:
        normalizers += [unicode_normalizer_from_str(unicode_normalizer)]

    if lowercase:
        normalizers += [Lowercase()]

    # Create the normalizer structure
    if len(normalizers) > 0:
        if len(normalizers) > 1:
            tokenizer.normalizer = Sequence(normalizers)
        else:
            tokenizer.normalizer = normalizers[0]

    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(
        add_prefix_space=add_prefix_space)
    tokenizer.decoder = decoders.ByteLevel()
    tokenizer.post_processor = processors.ByteLevel(trim_offsets=trim_offsets)

    parameters = {
        "model": "ByteLevelBPE",
        "add_prefix_space": add_prefix_space,
        "lowercase": lowercase,
        "dropout": dropout,
        "unicode_normalizer": unicode_normalizer,
        "continuing_subword_prefix": continuing_subword_prefix,
        "end_of_word_suffix": end_of_word_suffix,
        "trim_offsets": trim_offsets,
    }

    super().__init__(tokenizer, parameters)
def __init__(
    self,
    vocab_file: Optional[str] = None,
    merges_file: Optional[str] = None,
    unk_token: Union[str, AddedToken] = "<unk>",
    replacement: str = "▁",
    add_prefix_space: bool = True,
    no_consecutive_space: bool = True,
    dropout: Optional[float] = None,
    clean_text: bool = True,
    handle_chinese_chars: bool = True,
    separate_numbers: bool = True,
    strip_accents: bool = True,
    lowercase: bool = True,
    wordpieces_prefix: str = "##",
    special_chars: str = SPECIAL_CHARS,
    zh_norm: bool = True,
):
    if vocab_file is not None and merges_file is not None:
        tokenizer = Tokenizer(
            BPE(vocab_file, merges_file, dropout=dropout, unk_token=unk_token))
    else:
        tokenizer = Tokenizer(BPE())

    if tokenizer.token_to_id(str(unk_token)) is not None:
        tokenizer.add_special_tokens([str(unk_token)])

    tokenizer.normalizer = Sequence([
        NFKC(),
        BertNormalizer(clean_text=clean_text,
                       handle_chinese_chars=handle_chinese_chars,
                       separate_numbers=separate_numbers,
                       strip_accents=strip_accents,
                       lowercase=lowercase,
                       special_chars=special_chars,
                       zh_norm=zh_norm)
    ])
    tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(
        replacement=replacement,
        add_prefix_space=add_prefix_space,
        no_consecutive_space=no_consecutive_space)
    tokenizer.decoder = decoders.Metaspace(
        replacement=replacement,
        add_prefix_space=add_prefix_space,
        no_consecutive_space=no_consecutive_space)

    parameters = {
        "model": "SentencePieceBPE",
        "unk_token": unk_token,
        "replacement": replacement,
        "add_prefix_space": add_prefix_space,
        "no_consecutive_space": no_consecutive_space,
        "dropout": dropout,
    }

    super().__init__(tokenizer, parameters)
def test_instantiate(self, roberta_files):
    assert isinstance(BPE(), Model)
    assert isinstance(BPE(), BPE)
    assert isinstance(BPE(roberta_files["vocab"], roberta_files["merges"]),
                      Model)

    # Each incomplete combination must raise; use separate blocks so that
    # both cases are actually exercised.
    with pytest.raises(ValueError,
                       match="`vocab` and `merges` must be both specified"):
        BPE(vocab=roberta_files["vocab"])
    with pytest.raises(ValueError,
                       match="`vocab` and `merges` must be both specified"):
        BPE(merges=roberta_files["merges"])
def __init__(
    self,
    vocab_file: Optional[str] = None,
    merges_file: Optional[str] = None,
    add_prefix_space: bool = False,
    lowercase: bool = False,
    dropout: Optional[float] = None,
    unicode_normalizer: Optional[str] = None,
    continuing_subword_prefix: Optional[str] = None,
    end_of_word_suffix: Optional[str] = None,
):
    if vocab_file is not None and merges_file is not None:
        tokenizer = Tokenizer(
            BPE.from_files(
                vocab_file,
                merges_file,
                dropout=dropout,
                continuing_subword_prefix=continuing_subword_prefix or "",
                end_of_word_suffix=end_of_word_suffix or "",
            ))
    else:
        tokenizer = Tokenizer(BPE.empty())

    # Check for Unicode normalization first (before everything else)
    normalizers = []

    if unicode_normalizer:
        normalizers += [unicode_normalizer_from_str(unicode_normalizer)]

    if lowercase:
        normalizers += [Lowercase()]

    # Create the normalizer structure
    if len(normalizers) > 0:
        if len(normalizers) > 1:
            tokenizer.normalizer = Sequence(normalizers)
        else:
            tokenizer.normalizer = normalizers[0]

    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(
        add_prefix_space=add_prefix_space)
    tokenizer.decoder = decoders.ByteLevel()

    parameters = {
        "model": "ByteLevelBPE",
        "add_prefix_space": add_prefix_space,
        "lowercase": lowercase,
        "dropout": dropout,
        "unicode_normalizer": unicode_normalizer,
        "continuing_subword_prefix": continuing_subword_prefix,
        "end_of_word_suffix": end_of_word_suffix,
    }

    super().__init__(tokenizer, parameters)
def __init__(
    self,
    load_from: Optional[str] = None,
    vocab_size: int = 10000,
    max_example_len: int = 128,
    batch_size: int = 16,
    num_stopwords: int = 250,
    mask_output_len: int = 4,
):
    self.char_dict: Dict[str, int] = {}
    self.char_rev: Dict[int, str] = {}
    self.token_dict: Dict[str, int] = {}
    self.token_rev: Dict[int, str] = {}
    self.vocab_size = vocab_size
    self.max_example_len = max_example_len
    self.batch_size = batch_size
    self.num_stopwords = num_stopwords
    self.mask_output_len = mask_output_len
    self.tokenizer_fit = False
    self.tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
    self.tokenizer.pre_tokenizer = Whitespace()
    self.tokenizer.normalizer = Sequence([NFD(), Lowercase(), StripAccents()])
    self.tok_trainer = BpeTrainer(special_tokens=["[UNK]", "[MASK]"],
                                  vocab_size=self.vocab_size)
    if load_from:
        self._load(load_from)
def train_tokenizer(langs, dataset, vocab_size):
    """Train a tokenizer on a given list of languages.

    Reserves a special token for each language, which is [LANG] where LANG is
    the language tag. These are assigned to tokens 5, 6, ..., len(langs) + 4.
    """
    # Byte-pair encoding
    tokenizer = Tokenizer(BPE(unk_token='[UNK]'))

    # Trainer
    lang_tokens = ['[' + lang + ']' for lang in langs]
    special_tokens = ['[MASK]', '[CLS]', '[SEP]', '[PAD]', '[UNK]'] + lang_tokens
    trainer = BpeTrainer(special_tokens=special_tokens, vocab_size=vocab_size)

    # Normalize and pre-tokenize
    tokenizer.normalizer = Sequence([NFD(), Lowercase(), StripAccents()])
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
    tokenizer.decoder = decoders.ByteLevel()

    # Create iterator and train
    iterator = _MultilingualIterator(dataset, langs)
    tokenizer.train_from_iterator(iterator, trainer)

    # Post-process start/end tokens
    tokenizer.post_processor = TemplateProcessing(
        single="[CLS] $A [SEP]",
        pair="[CLS] $A [SEP] $B:1 [SEP]:1",
        special_tokens=[
            ("[CLS]", tokenizer.token_to_id("[CLS]")),
            ("[SEP]", tokenizer.token_to_id("[SEP]")),
        ],
    )
    return tokenizer
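# Illustrative usage only: the dataset layout consumed by _MultilingualIterator
# is project-specific; here it is assumed to be a mapping from language tag to
# a list of raw sentences.
example_dataset = {
    "en": ["the cat sat on the mat", "tokenizers are fast"],
    "de": ["die katze saß auf der matte", "tokenisierer sind schnell"],
}
multilingual_tok = train_tokenizer(["en", "de"], example_dataset, vocab_size=1000)

# A reserved language token can then be prepended to mark the source language.
encoded = multilingual_tok.encode("[en] the cat sat on the mat")
print(encoded.tokens)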
def __init__(self):
    self.tokenizer = Tokenizer(BPE())
    self.tokenizer.normalizer = Sequence([NFKC()])
    self.tokenizer.pre_tokenizer = ByteLevel()
    self.tokenizer.decoder = ByteLevelDecoder()
def test_add_tokens(self):
    tokenizer = Tokenizer(BPE())
    added = tokenizer.add_tokens(["my", "name", "is", "john"])
    assert added == 4

    added = tokenizer.add_tokens([AddedToken("the"), AddedToken("quick", rstrip=True)])
    assert added == 2
def from_file(
    vocab_filename: str,
    merges_filename: Union[str, None],
    **kwargs,
):
    vocab, merges = BPE.read_file(vocab_filename, merges_filename)
    return BrainBertTokenizer(vocab, merges, **kwargs)
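# Illustrative usage only: assuming from_file is exposed as a static/class
# method on BrainBertTokenizer and that vocab.json / merges.txt were produced
# by an earlier BPE training run.
tokenizer = BrainBertTokenizer.from_file("vocab.json", "merges.txt")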
def __init__(
    self,
    path_src,
    path_tgt,
    path_tokenizer,
    path_root: Optional[str] = '',
):
    self.path_src = path_root + path_src
    self.path_tgt = path_root + path_tgt
    self.len = 0
    self.max_len = 512
    self.tokenizer = Tokenizer(
        BPE(
            path_root + path_tokenizer + 'vocab.json',
            path_root + path_tokenizer + 'merges.txt',
        ))
    self.tokenizer.normalizer = Sequence([NFKC(), Lowercase()])

    with open(self.path_src, 'r+') as f:
        lines_src = f.readlines()
    with open(self.path_tgt, 'r+') as f:
        lines_tgt = f.readlines()

    self.len = len(lines_src)
    self.example = list(zip(lines_src, lines_tgt))
def __init__(self, vocab_size=25000, min_freq=5, lang="en",
             files=[None, None]) -> None:
    """
    Args:
        vocab_size: (int) target vocabulary size
        min_freq: (int) minimum frequency for a pair to be merged
        lang: (str) language tag, e.g. "en"
        files: (List[str]) ["vocab.json", "merge.txt"] to load an existing model
    """
    super(BPETokenizer, self).__init__()
    self.tokenizer = Tokenizer(BPE(files[0], files[1]))
    self.lang = lang
    self.trainer = BpeTrainer(vocab_size=vocab_size,
                              min_frequency=min_freq,
                              special_tokens=["[PAD]", "[SEP]"],
                              initial_alphabet=ByteLevel.alphabet())
    # https://huggingface.co/docs/tokenizers/python/latest/components.html#normalizers
    self.tokenizer.normalizer = Sequence([NFKC(), Lowercase()])
    # https://huggingface.co/docs/tokenizers/python/latest/components.html#pre-tokenizers
    self.tokenizer.pre_tokenizer = ByteLevel()
    self.tokenizer.decoder = ByteLevelDecoder()
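# Illustrative usage only: assuming BPETokenizer exposes the wrapped
# `tokenizer` and `trainer` attributes shown above, training a fresh model
# from a plain-text corpus (hypothetical corpus.txt) would look roughly like:
bpe = BPETokenizer(vocab_size=25000, min_freq=5, lang="en")
bpe.tokenizer.train(["corpus.txt"], trainer=bpe.trainer)
print(bpe.tokenizer.encode("hello world").tokens)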
def converted(self) -> Tokenizer:
    ot = self.original_tokenizer
    vocab = ot.encoder
    merges = list(ot.bpe_ranks.keys())

    tokenizer = Tokenizer(
        BPE(
            vocab=vocab,
            merges=merges,
            dropout=None,
            continuing_subword_prefix="",
            end_of_word_suffix="",
            fuse_unk=False,
        ))

    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(
        add_prefix_space=ot.add_prefix_space)
    tokenizer.decoder = decoders.ByteLevel()
    tokenizer.post_processor = processors.RobertaProcessing(
        sep=(ot.sep_token, ot.sep_token_id),
        cls=(ot.cls_token, ot.cls_token_id),
        add_prefix_space=ot.add_prefix_space,
        trim_offsets=True,  # True by default on Roberta (historical)
    )

    return tokenizer
def test_encode_add_special_tokens(self, roberta_files):
    with pytest.deprecated_call():
        tokenizer = Tokenizer(
            BPE(roberta_files["vocab"], roberta_files["merges"]))
    tokenizer.add_special_tokens(["<s>", "</s>"])
    tokenizer.pre_tokenizer = ByteLevel(add_prefix_space=True)
    tokenizer.post_processor = RobertaProcessing(
        ("</s>", tokenizer.token_to_id("</s>")),
        ("<s>", tokenizer.token_to_id("<s>")),
    )

    # Can encode with special tokens
    output_with_specials = tokenizer.encode("My name is John",
                                            add_special_tokens=True)
    assert output_with_specials.tokens == [
        "<s>", "ĠMy", "Ġname", "Ġis", "ĠJohn", "</s>"
    ]

    # Can encode without special tokens
    output_without_specials = tokenizer.encode("My name is John",
                                               add_special_tokens=False)
    assert output_without_specials.tokens == ["ĠMy", "Ġname", "Ġis", "ĠJohn"]
def converted(self) -> Tokenizer:
    tokenizer_info_str = "#version:"
    token_suffix = "</w>"

    vocab = self.original_tokenizer.encoder
    merges = list(self.original_tokenizer.bpe_ranks.keys())
    if tokenizer_info_str in merges[0][0]:
        merges = merges[1:]

    tokenizer = Tokenizer(
        BPE(
            vocab,
            merges,
            dropout=None,
            unk_token=self.original_tokenizer.unk_token,
            end_of_word_suffix=token_suffix,
        ))

    tokenizer.normalizer = normalizers.BertNormalizer(lowercase=False,
                                                      strip_accents=False)
    tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()
    tokenizer.decoder = decoders.BPEDecoder(suffix=token_suffix)
    tokenizer.post_processor = processors.BertProcessing(
        sep=(self.original_tokenizer.sep_token,
             self.original_tokenizer.sep_token_id),
        cls=(self.original_tokenizer.cls_token,
             self.original_tokenizer.cls_token_id),
    )

    return tokenizer
def test_encode(self):
    tokenizer = Tokenizer(BPE())
    tokenizer.add_tokens(["my", "name", "is", "john", "pair"])

    # Can encode single sequence
    output = tokenizer.encode("my name is john")
    assert output.tokens == ["my", "name", "is", "john"]
    assert type(output.ids) == list
    assert type(output.type_ids) == list
    assert type(output.offsets) == list
    with pytest.warns(DeprecationWarning):
        assert type(output.words) == list
    assert type(output.word_ids) == list
    assert type(output.special_tokens_mask) == list
    assert type(output.attention_mask) == list
    assert type(output.overflowing) == list

    # Can encode a pair of sequences
    output = tokenizer.encode("my name is john", "pair")
    assert output.tokens == ["my", "name", "is", "john", "pair"]
    assert isinstance(pickle.loads(pickle.dumps(output)), Encoding)

    # Can encode a single pre-tokenized sequence
    output = tokenizer.encode(["my", "name", "is", "john"], is_pretokenized=True)
    assert output.tokens == ["my", "name", "is", "john"]

    # Can encode a batch with both a single sequence and a pair of sequences
    output = tokenizer.encode_batch(["my name is john", ("my name is john", "pair")])
    assert len(output) == 2
def converted(self) -> Tokenizer:
    ot = self.original_tokenizer
    vocab = ot.encoder
    merges = list(ot.bpe_ranks.keys())

    tokenizer = Tokenizer(
        BPE(
            vocab=vocab,
            merges=merges,
            dropout=None,
            continuing_subword_prefix="",
            end_of_word_suffix="",
            fuse_unk=False,
        ))

    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(
        add_prefix_space=ot.add_prefix_space)
    tokenizer.decoder = decoders.ByteLevel()
    tokenizer.post_processor = processors.TemplateProcessing(
        single="[CLS]:0 $A:0 [SEP]:0",
        pair="[CLS]:0 $A:0 [SEP]:0 $B:0 [SEP]:0",
        special_tokens=[
            ("[CLS]", self.original_tokenizer.convert_tokens_to_ids("[CLS]")),
            ("[SEP]", self.original_tokenizer.convert_tokens_to_ids("[SEP]")),
        ],
    )

    return tokenizer
def test_normalize(self):
    tokenizer = Tokenizer(BPE())
    tokenizer.add_tokens(["my", "name", "is", "john", "pair"])
    tokenizer.normalizer = Lowercase()

    output = tokenizer.normalize("My Name Is John")
    assert output == "my name is john"
def train_tokenizer(lang, dataset, vocab_size):
    # Byte-pair encoding
    tokenizer = Tokenizer(BPE(unk_token='[UNK]'))

    # Trainer
    trainer = BpeTrainer(
        special_tokens=['[MASK]', '[CLS]', '[SEP]', '[PAD]', '[UNK]'],
        vocab_size=vocab_size)

    # Pre-tokenize on whitespace
    tokenizer.pre_tokenizer = Whitespace()

    # Train
    tokenizer.train_from_iterator(dataset[lang], trainer)

    # Post-process start/end tokens
    tokenizer.post_processor = TemplateProcessing(
        single="[CLS] $A [SEP]",
        pair="[CLS] $A [SEP] $B:1 [SEP]:1",
        special_tokens=[
            ("[CLS]", tokenizer.token_to_id("[CLS]")),
            ("[SEP]", tokenizer.token_to_id("[SEP]")),
        ],
    )
    return tokenizer
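# Illustrative usage only: dataset is assumed to be a mapping from a language
# tag to an iterable of raw sentences, matching the dataset[lang] access above.
example_dataset = {"en": ["the quick brown fox", "jumps over the lazy dog"]}
en_tok = train_tokenizer("en", example_dataset, vocab_size=500)
enc = en_tok.encode("the quick brown fox")
print(enc.tokens)  # starts with [CLS] and ends with [SEP] via the template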
def __init__(
    self,
    vocab_file: Optional[str] = None,
    merges_file: Optional[str] = None,
    unk_token: Optional[str] = "<unk>",
    suffix: Optional[str] = "</w>",
    dropout: Optional[float] = None,
    unicode_normalizer: Optional[str] = None,
):
    if vocab_file is not None and merges_file is not None:
        tokenizer = Tokenizer(
            BPE.from_files(vocab_file,
                           merges_file,
                           dropout=dropout,
                           unk_token=unk_token,
                           end_of_word_suffix=suffix))
    else:
        tokenizer = Tokenizer(BPE.empty())

    # Check for Unicode normalization first (before everything else)
    normalizers = []

    if unicode_normalizer:
        normalizers += [unicode_normalizer_from_str(unicode_normalizer)]

    # OpenAI normalization is the same as Bert
    normalizers += [BertNormalizer()]

    # Create the normalizer structure
    if len(normalizers) > 0:
        if len(normalizers) > 1:
            tokenizer.normalizer = Sequence(normalizers)
        else:
            tokenizer.normalizer = normalizers[0]

    tokenizer.pre_tokenizer = BertPreTokenizer()
    tokenizer.decoder = BPEDecoder(suffix=suffix)

    parameters = {
        "model": "BPE",
        "unk_token": unk_token,
        "suffix": suffix,
        "dropout": dropout,
    }

    super().__init__(tokenizer, parameters)
def configure(self):
    self.testing_file = self.get_value_from_config('testing_file')
    self.vocab_file = self.get_value_from_config('vocab_file')
    self.merges_file = self.get_value_from_config('merges_file')
    self.max_seq_length = int(self.get_value_from_config('max_seq_length'))
    self.tokenizer = Tokenizer(BPE(str(self.vocab_file), str(self.merges_file)))
    self.tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
    self.tokenizer.decoder = decoders.ByteLevel()
def test_strip_accents(self):
    tokenizer = Tokenizer(BPE.empty())
    tokenizer.normalizer = BertNormalizer(strip_accents=True,
                                          lowercase=False,
                                          handle_chinese_chars=False,
                                          clean_text=False)

    output = tokenizer.normalize("Héllò")
    assert output == "Hello"
def test_can_modify(self):
    model = BPE(
        dropout=0.5,
        unk_token="[UNK]",
        continuing_subword_prefix="__prefix__",
        end_of_word_suffix="__suffix__",
        fuse_unk=False,
    )

    assert model.dropout == 0.5
    assert model.unk_token == "[UNK]"
    assert model.continuing_subword_prefix == "__prefix__"
    assert model.end_of_word_suffix == "__suffix__"
    assert model.fuse_unk == False

    # Modify these
    model.dropout = 0.1
    assert pytest.approx(model.dropout) == 0.1
    model.unk_token = "<unk>"
    assert model.unk_token == "<unk>"
    model.continuing_subword_prefix = None
    assert model.continuing_subword_prefix == None
    model.end_of_word_suffix = "suff"
    assert model.end_of_word_suffix == "suff"
    model.fuse_unk = True
    assert model.fuse_unk == True
def generate_tokenizer(equations, output, vocab_size):
    from tokenizers import Tokenizer, pre_tokenizers
    from tokenizers.models import BPE
    from tokenizers.trainers import BpeTrainer

    tokenizer = Tokenizer(BPE())
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
    trainer = BpeTrainer(special_tokens=["[PAD]", "[BOS]", "[EOS]"],
                         vocab_size=vocab_size,
                         show_progress=True)
    # Note: recent versions of tokenizers expect train(files, trainer)
    # rather than train(trainer, files).
    tokenizer.train(trainer, equations)
    tokenizer.save(path=output, pretty=False)
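# Illustrative usage only: equations is assumed to be a list of plain-text
# files containing one LaTeX equation per line (hypothetical file names).
# generate_tokenizer(["equations_train.txt"], "tokenizer.json", vocab_size=8000)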
def test_processing(self):
    tokenizer = Tokenizer(BPE())
    tokenizer.add_special_tokens(["[SEP]", "[CLS]"])
    tokenizer.add_tokens(["my", "name", "is", "john", "pair"])
    tokenizer.post_processor = BertProcessing(("[SEP]", 0), ("[CLS]", 1))

    output = tokenizer.encode("my name", "pair")
    assert output.tokens == ["[CLS]", "my", "name", "[SEP]", "pair", "[SEP]"]
    assert output.ids == [1, 2, 3, 0, 6, 0]
def test_processing(self):
    tokenizer = Tokenizer(BPE())
    tokenizer.add_special_tokens(["<s>", "</s>"])
    tokenizer.add_tokens(["my", "name", "is", "john", "pair"])
    tokenizer.post_processor = RobertaProcessing(("</s>", 1), ("<s>", 0))

    output = tokenizer.encode("my name", "pair")
    assert output.tokens == ["<s>", "my", "name", "</s>", "</s>", "pair", "</s>"]
    assert output.ids == [0, 2, 3, 1, 1, 6, 1]