def __init__(self,
             vocab_file: Optional[str] = None,
             merges_file: Optional[str] = None,
             unk_token: Optional[str] = "<unk>",
             suffix: Optional[str] = "</w>",
             dropout: Optional[float] = None):
    if vocab_file is not None and merges_file is not None:
        tokenizer = Tokenizer(
            BPE.from_files(vocab_file,
                           merges_file,
                           dropout=dropout,
                           unk_token=unk_token,
                           end_of_word_suffix=suffix))
    else:
        tokenizer = Tokenizer(BPE.empty())

    tokenizer.normalizer = Sequence.new([NFKC.new(), Lowercase.new()])
    tokenizer.pre_tokenizer = pre_tokenizers.Whitespace.new()
    tokenizer.decoder = decoders.BPEDecoder.new(suffix=suffix)

    parameters = {
        "model": "BPE",
        "unk_token": unk_token,
        "suffix": suffix,
        "dropout": dropout,
    }

    super().__init__(tokenizer, parameters)
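# A minimal usage sketch for the constructor above. Assumptions: it belongs to a
# character-level BPE tokenizer class (called CharBPETokenizer here) exported by
# the `tokenizers` package, and "vocab.json" / "merges.txt" are existing BPE
# files; the class name and paths are illustrative, not confirmed by this snippet.
from tokenizers import CharBPETokenizer

tokenizer = CharBPETokenizer(vocab_file="vocab.json", merges_file="merges.txt")
# Text is NFKC-normalized and lowercased, split on whitespace, then BPE-encoded;
# word-final subwords carry the "</w>" suffix.
encoding = tokenizer.encode("Hello, world!")
print(encoding.tokens)
print(encoding.ids)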
def __init__(self,
             vocab_file: Optional[str] = None,
             merges_file: Optional[str] = None,
             unk_token: str = "<unk>",
             replacement: str = "▁",
             add_prefix_space: bool = True,
             dropout: Optional[float] = None):
    if vocab_file is not None and merges_file is not None:
        tokenizer = Tokenizer(
            BPE.from_files(vocab_file,
                           merges_file,
                           dropout=dropout,
                           unk_token=unk_token))
    else:
        tokenizer = Tokenizer(BPE.empty())

    tokenizer.add_special_tokens([unk_token])

    tokenizer.normalizer = NFKC.new()
    tokenizer.pre_tokenizer = pre_tokenizers.Metaspace.new(
        replacement=replacement, add_prefix_space=add_prefix_space)
    tokenizer.decoder = decoders.Metaspace.new(
        replacement=replacement, add_prefix_space=add_prefix_space)

    parameters = {
        "model": "SentencePieceBPE",
        "unk_token": unk_token,
        "replacement": replacement,
        "add_prefix_space": add_prefix_space,
        "dropout": dropout,
    }

    super().__init__(tokenizer, parameters)
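# A minimal usage sketch, assuming the constructor above belongs to a
# SentencePiece-style BPE class (called SentencePieceBPETokenizer here, matching
# the "SentencePieceBPE" model name in `parameters`) and that the vocab/merges
# files exist; the class name and paths are illustrative.
from tokenizers import SentencePieceBPETokenizer

tokenizer = SentencePieceBPETokenizer(vocab_file="vocab.json",
                                      merges_file="merges.txt")
# The Metaspace pre-tokenizer replaces spaces with "▁" before BPE is applied,
# and the Metaspace decoder reverses that replacement when decoding.
encoding = tokenizer.encode("Hello world")
print(encoding.tokens)
print(tokenizer.decode(encoding.ids))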
def __init__(
    self,
    vocab_file: Optional[str] = None,
    merges_file: Optional[str] = None,
    add_prefix_space: bool = False,
    lowercase: bool = False,
    dropout: Optional[float] = None,
    unicode_normalizer: Optional[str] = None,
    continuing_subword_prefix: Optional[str] = None,
    end_of_word_suffix: Optional[str] = None,
    trim_offsets: bool = False,
):
    if vocab_file is not None and merges_file is not None:
        tokenizer = Tokenizer(
            BPE.from_files(
                vocab_file,
                merges_file,
                dropout=dropout,
                continuing_subword_prefix=continuing_subword_prefix or "",
                end_of_word_suffix=end_of_word_suffix or "",
            ))
    else:
        tokenizer = Tokenizer(BPE.empty())

    # Check for Unicode normalization first (before everything else)
    normalizers = []

    if unicode_normalizer:
        normalizers += [unicode_normalizer_from_str(unicode_normalizer)]

    if lowercase:
        normalizers += [Lowercase()]

    # Create the normalizer structure
    if len(normalizers) > 0:
        if len(normalizers) > 1:
            tokenizer.normalizer = Sequence(normalizers)
        else:
            tokenizer.normalizer = normalizers[0]

    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(
        add_prefix_space=add_prefix_space)
    tokenizer.decoder = decoders.ByteLevel()
    tokenizer.post_processor = processors.ByteLevel(trim_offsets=trim_offsets)

    parameters = {
        "model": "ByteLevelBPE",
        "add_prefix_space": add_prefix_space,
        "lowercase": lowercase,
        "dropout": dropout,
        "unicode_normalizer": unicode_normalizer,
        "continuing_subword_prefix": continuing_subword_prefix,
        "end_of_word_suffix": end_of_word_suffix,
        "trim_offsets": trim_offsets,
    }

    super().__init__(tokenizer, parameters)
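# A minimal usage sketch, assuming the constructor above belongs to a byte-level
# BPE class (called ByteLevelBPETokenizer here, matching the "ByteLevelBPE"
# model name in `parameters`) and that GPT-2-style vocab/merges files exist;
# the class name and paths are illustrative.
from tokenizers import ByteLevelBPETokenizer

tokenizer = ByteLevelBPETokenizer(vocab_file="vocab.json",
                                  merges_file="merges.txt",
                                  add_prefix_space=True,
                                  lowercase=False,
                                  trim_offsets=True)
encoding = tokenizer.encode("My name is John")
# With trim_offsets=True the ByteLevel post-processor removes the leading
# whitespace byte ("Ġ") from the reported character offsets.
print(encoding.tokens)
print(encoding.offsets)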
def test_processing(self, roberta_files):
    tokenizer = Tokenizer(
        BPE.from_files(roberta_files["vocab"], roberta_files["merges"]))
    tokenizer.pre_tokenizer = ByteLevelPreTokenizer(add_prefix_space=True)

    # Keeps original offsets
    output = tokenizer.encode("My name is John")
    assert output.tokens == ["ĠMy", "Ġname", "Ġis", "ĠJohn"]
    assert output.offsets == [(0, 2), (2, 7), (7, 10), (10, 15)]

    # Trims offsets when activated
    tokenizer.post_processor = ByteLevel(trim_offsets=True)
    output = tokenizer.encode("My name is John")
    assert output.tokens == ["ĠMy", "Ġname", "Ġis", "ĠJohn"]
    assert output.offsets == [(0, 2), (3, 7), (8, 10), (11, 15)]
def slow_train():
    tokenizer, trainer = TestQuicktour.get_tokenizer_trainer()

    # START train
    files = [
        f"data/wikitext-103-raw/wiki.{split}.raw"
        for split in ["test", "train", "valid"]
    ]
    tokenizer.train(trainer, files)
    # END train
    # START reload_model
    files = tokenizer.model.save("data", "wiki")
    tokenizer.model = BPE.from_files(*files, unk_token="[UNK]")
    # END reload_model
    # START save
    tokenizer.save("data/tokenizer-wiki.json")
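# A brief sketch of reloading the full tokenizer saved above. Assumption: this
# `tokenizers` version exposes Tokenizer.from_file, and the path simply mirrors
# the one used in the save step; the input sentence is illustrative.
from tokenizers import Tokenizer

tokenizer = Tokenizer.from_file("data/tokenizer-wiki.json")
output = tokenizer.encode("Hello, how are you?")
print(output.tokens)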
def __init__(
    self,
    vocab_file: Optional[str] = None,
    merges_file: Optional[str] = None,
    unk_token: Optional[str] = "<unk>",
    suffix: Optional[str] = "</w>",
    dropout: Optional[float] = None,
    unicode_normalizer: Optional[str] = None,
):
    if vocab_file is not None and merges_file is not None:
        tokenizer = Tokenizer(
            BPE.from_files(vocab_file,
                           merges_file,
                           dropout=dropout,
                           unk_token=unk_token,
                           end_of_word_suffix=suffix))
    else:
        tokenizer = Tokenizer(BPE.empty())

    # Check for Unicode normalization first (before everything else)
    normalizers = []

    if unicode_normalizer:
        normalizers += [unicode_normalizer_from_str(unicode_normalizer)]

    # OpenAI normalization is the same as Bert
    normalizers += [BertNormalizer()]

    # Create the normalizer structure
    if len(normalizers) > 0:
        if len(normalizers) > 1:
            tokenizer.normalizer = Sequence(normalizers)
        else:
            tokenizer.normalizer = normalizers[0]

    tokenizer.pre_tokenizer = BertPreTokenizer()
    tokenizer.decoder = BPEDecoder(suffix=suffix)

    parameters = {
        "model": "BPE",
        "unk_token": unk_token,
        "suffix": suffix,
        "dropout": dropout,
    }

    super().__init__(tokenizer, parameters)
def __init__(self,
             vocab_file: Optional[str] = None,
             merges_file: Optional[str] = None,
             add_prefix_space: bool = False,
             do_lowercase: bool = False,
             unicode_normalizer: Optional[str] = None,
             continuing_subword_prefix: Optional[str] = None,
             end_of_word_suffix: Optional[str] = None):
    if vocab_file is not None and merges_file is not None:
        tokenizer = Tokenizer(
            BPE.from_files(
                vocab_file,
                merges_file,
                continuing_subword_prefix=continuing_subword_prefix or "",
                end_of_word_suffix=end_of_word_suffix or ""))
    else:
        tokenizer = Tokenizer(BPE.empty())

    # Check for Unicode normalization first (before everything else)
    normalizers = []

    if unicode_normalizer:
        normalizers += [unicode_normalizer_from_str(unicode_normalizer)]

    if do_lowercase:
        normalizers += [Lowercase.new()]

    # Create the normalizer structure
    if len(normalizers) > 0:
        if len(normalizers) > 1:
            tokenizer.normalizer = Sequence.new(normalizers)
        else:
            tokenizer.normalizer = normalizers[0]

    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel.new(
        add_prefix_space=add_prefix_space)
    tokenizer.decoder = decoders.ByteLevel.new()

    parameters = {
        "model": "ByteLevelBPE",
        "add_prefix_space": add_prefix_space,
    }

    super().__init__(tokenizer, parameters)
def __init__(self,
             vocab_file: Optional[str] = None,
             merges_file: Optional[str] = None,
             add_prefix_space: bool = False):
    if vocab_file is not None and merges_file is not None:
        tokenizer = Tokenizer(BPE.from_files(vocab_file, merges_file))
    else:
        tokenizer = Tokenizer(BPE.empty())

    tokenizer.normalizer = NFKC.new()
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel.new(
        add_prefix_space=add_prefix_space)
    tokenizer.decoder = decoders.ByteLevel.new()

    parameters = {
        "model": "ByteLevelBPE",
        "add_prefix_space": add_prefix_space,
    }

    super().__init__(tokenizer, parameters)
def test_encode_add_special_tokens(self, roberta_files):
    tokenizer = Tokenizer(
        BPE.from_files(roberta_files["vocab"], roberta_files["merges"]))
    tokenizer.add_special_tokens(["<s>", "</s>"])

    tokenizer.pre_tokenizer = ByteLevel(add_prefix_space=True)
    tokenizer.post_processor = RobertaProcessing(
        ("</s>", tokenizer.token_to_id("</s>")),
        ("<s>", tokenizer.token_to_id("<s>")),
    )

    # Can encode with special tokens
    output_with_specials = tokenizer.encode("My name is John",
                                            add_special_tokens=True)
    assert output_with_specials.tokens == [
        "<s>", "ĠMy", "Ġname", "Ġis", "ĠJohn", "</s>"
    ]

    # Can encode without special tokens
    output_without_specials = tokenizer.encode("My name is John",
                                               add_special_tokens=False)
    assert output_without_specials.tokens == ["ĠMy", "Ġname", "Ġis", "ĠJohn"]
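# An illustrative follow-up (not part of the original test file): with the same
# setup, RobertaProcessing also wraps sentence pairs in Roberta's special-token
# pattern. The exact subword tokens depend on the vocabulary, so only the
# special-token structure is checked in this sketch.
def test_encode_pair_sketch(self, roberta_files):
    tokenizer = Tokenizer(
        BPE.from_files(roberta_files["vocab"], roberta_files["merges"]))
    tokenizer.add_special_tokens(["<s>", "</s>"])

    tokenizer.pre_tokenizer = ByteLevel(add_prefix_space=True)
    tokenizer.post_processor = RobertaProcessing(
        ("</s>", tokenizer.token_to_id("</s>")),
        ("<s>", tokenizer.token_to_id("<s>")),
    )

    # Pair encoding starts with <s> and ends with </s>
    output = tokenizer.encode("My name is John", "I work here",
                              add_special_tokens=True)
    assert output.tokens[0] == "<s>"
    assert output.tokens[-1] == "</s>"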
In the face of ambiguity, refuse the temptation to guess.
There should be one-- and preferably only one --obvious way to do it.
Although that way may not be obvious at first unless you're Dutch.
Now is better than never.
Although never is often better than *right* now.
If the implementation is hard to explain, it's a bad idea.
If the implementation is easy to explain, it may be a good idea.
Namespaces are one honking great idea -- let's do more of those!
""".split("\n")

if args.type == "gpt2":
    print("Running GPT-2 tokenizer")
    tok_p = GPT2Tokenizer.from_pretrained('gpt2')

    # Create a Tokenizer using BPE
    tok_r = Tokenizer(BPE.from_files(args.vocab, args.merges))
    # Use ByteLevel PreTokenizer
    tok_r.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
    # Use ByteLevel Decoder
    tok_r.decoder = decoders.ByteLevel()
elif args.type == "bert":
    print("Running Bert tokenizer")
    tok_p = BertTokenizer.from_pretrained(args.vocab)

    tok_r = Tokenizer(
        WordPiece.from_files(args.vocab,
                             unk_token="[UNK]",
                             max_input_chars_per_word=100))
    tok_r.normalizer = BertNormalizer(
        clean_text=True,
        handle_chinese_chars=True,
def test_instantiate(self, roberta_files):
    assert isinstance(BPE.empty(), Model)
    assert isinstance(
        BPE.from_files(roberta_files["vocab"], roberta_files["merges"]),
        Model)