def train(
    self,
    files: Union[str, List[str]],
    vocab_size: int = 30000,
    min_frequency: int = 2,
    limit_alphabet: int = 1000,
    initial_alphabet: List[str] = [],
    special_tokens: List[Union[str, AddedToken]] = [
        "[PAD]",
        "[UNK]",
        "[CLS]",
        "[SEP]",
        "[MASK]",
    ],
    show_progress: bool = True,
    wordpieces_prefix: str = "##",
):
    """
    Train the model using the given files
    """

    trainer = trainers.WordPieceTrainer(
        vocab_size=vocab_size,
        min_frequency=min_frequency,
        limit_alphabet=limit_alphabet,
        initial_alphabet=initial_alphabet,
        special_tokens=special_tokens,
        show_progress=show_progress,
        continuing_subword_prefix=wordpieces_prefix,
    )
    if isinstance(files, str):
        files = [files]
    self._tokenizer.train(trainer, files)
def train_from_iterator(
    self,
    iterator: Union[Iterator[str], Iterator[Iterator[str]]],
    vocab_size: int = 30000,
    min_frequency: int = 2,
    limit_alphabet: int = 1000,
    initial_alphabet: List[str] = [],
    special_tokens: List[Union[str, AddedToken]] = [
        "[PAD]",
        "[UNK]",
        "[CLS]",
        "[SEP]",
        "[MASK]",
    ],
    show_progress: bool = True,
    wordpieces_prefix: str = "##",
):
    """
    Train the model using the given iterator
    """

    trainer = trainers.WordPieceTrainer(
        vocab_size=vocab_size,
        min_frequency=min_frequency,
        limit_alphabet=limit_alphabet,
        initial_alphabet=initial_alphabet,
        special_tokens=special_tokens,
        show_progress=show_progress,
        continuing_subword_prefix=wordpieces_prefix,
    )
    self._tokenizer.train_from_iterator(iterator, trainer=trainer)
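# Usage sketch (an assumption, not part of the original source): the two methods
# above read like the train / train_from_iterator helpers of an implementation
# such as tokenizers.BertWordPieceTokenizer, so calling them might look like this.
# The corpus path "corpus.txt" is a placeholder.
from tokenizers import BertWordPieceTokenizer

bert_wp = BertWordPieceTokenizer()
bert_wp.train(files=["corpus.txt"], vocab_size=30000, min_frequency=2)
# Or, equivalently, feed raw text batches instead of files:
bert_wp.train_from_iterator(["first line of text", "second line of text"], vocab_size=30000)
print(bert_wp.encode("hello world").tokens)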
from typing import Tuple

import datasets
import tensorflow as tf
import tokenizers
from nltk.tokenize import sent_tokenize as sent_tokenizer  # assumption: NLTK's sentence splitter (requires the punkt data)
from tokenizers import decoders, models, normalizers, pre_tokenizers, processors, trainers


def train_tokenizer() -> Tuple[tokenizers.Tokenizer, tf.data.Dataset]:
    tokenizer = tokenizers.Tokenizer(models.WordPiece(unk_token="<unk>"))
    tokenizer.decoder = decoders.WordPiece()
    tokenizer.normalizer = normalizers.Sequence([
        normalizers.NFD(),  # NFD unicode normalizer
        normalizers.Lowercase(),
        normalizers.StripAccents()
    ])
    tokenizer.pre_tokenizer = tokenizers.pre_tokenizers.Sequence([
        pre_tokenizers.Whitespace(),
        pre_tokenizers.Digits(individual_digits=False)
    ])
    tokenizer.post_processor = processors.TemplateProcessing(
        single="$A </s>",
        pair="$A </s> [SEP] <s> $B:1",
        special_tokens=[("[SEP]", 1), ("<s>", 2), ("</s>", 3)])

    dataset = datasets.load_dataset("wikitext", "wikitext-103-raw-v1", split="test")

    def batch_iterator(batch_size=1000):
        for i in range(0, len(dataset), batch_size):
            yield dataset[i:i + batch_size]["text"]

    tokenizer.train_from_iterator(
        batch_iterator(),
        trainer=trainers.WordPieceTrainer(
            vocab_size=10000,
            special_tokens=["<unk>", "[SEP]", "<s>", "</s>"]))

    def generator():
        for record in dataset:
            if record['text'].strip() != '':
                for sentence in sent_tokenizer(record['text']):
                    yield sentence

    data = tf.data.Dataset.from_generator(
        generator,
        output_signature=tf.TensorSpec(shape=(), dtype=tf.string))  # each element is one scalar sentence string
    data = data.map(tf.strings.strip,
                    num_parallel_calls=tf.data.experimental.AUTOTUNE)
    return tokenizer, data
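# Usage sketch (an assumption, not part of the original snippet): consume the
# returned tokenizer and tf.data pipeline together, encoding a few sentences.
if __name__ == "__main__":
    tokenizer, data = train_tokenizer()
    for sentence in data.take(3):
        encoding = tokenizer.encode(sentence.numpy().decode("utf-8"))
        print(encoding.tokens, encoding.ids)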
def test_can_modify(self):
    trainer = trainers.WordPieceTrainer(
        vocab_size=12345,
        min_frequency=12,
        show_progress=False,
        special_tokens=["1", "2"],
        limit_alphabet=13,
        initial_alphabet=["a", "b", "c"],
        continuing_subword_prefix="pref",
        end_of_word_suffix="suf",
    )

    assert trainer.vocab_size == 12345
    assert trainer.min_frequency == 12
    assert trainer.show_progress == False
    assert trainer.special_tokens == [
        AddedToken("1"),
        AddedToken("2"),
    ]
    assert trainer.limit_alphabet == 13
    assert sorted(trainer.initial_alphabet) == ["a", "b", "c"]
    assert trainer.continuing_subword_prefix == "pref"
    assert trainer.end_of_word_suffix == "suf"

    # Modify these
    trainer.vocab_size = 20000
    assert trainer.vocab_size == 20000
    trainer.min_frequency = 1
    assert trainer.min_frequency == 1
    trainer.show_progress = True
    assert trainer.show_progress == True
    trainer.special_tokens = []
    assert trainer.special_tokens == []
    trainer.limit_alphabet = None
    assert trainer.limit_alphabet == None
    trainer.initial_alphabet = ["d", "z"]
    assert sorted(trainer.initial_alphabet) == ["d", "z"]
    trainer.continuing_subword_prefix = None
    assert trainer.continuing_subword_prefix == None
    trainer.end_of_word_suffix = None
    assert trainer.end_of_word_suffix == None
from typing import Callable, Tuple

import datasets
import tokenizers
from tokenizers import decoders, models, normalizers, pre_tokenizers, processors, trainers


def train_tokenizer() -> Tuple[tokenizers.Tokenizer, Callable]:
    tokenizer = tokenizers.Tokenizer(models.WordPiece(unk_token="<unk>"))
    tokenizer.decoder = decoders.WordPiece()
    tokenizer.normalizer = normalizers.Sequence([
        normalizers.NFD(),  # NFD unicode normalizer
        normalizers.Lowercase(),
        normalizers.StripAccents()
    ])
    tokenizer.pre_tokenizer = tokenizers.pre_tokenizers.Sequence([
        pre_tokenizers.Whitespace(),
        pre_tokenizers.Digits(individual_digits=False)
    ])
    tokenizer.post_processor = processors.TemplateProcessing(
        single="$A </s>",
        pair="$A </s> [SEP] <s> $B:1",
        special_tokens=[("[SEP]", 1), ("<s>", 2), ("</s>", 3)])

    # dataset = datasets.load_dataset("wikitext", "wikitext-103-raw-v1", split="train+test+validation")
    dataset = datasets.load_dataset("wikitext", "wikitext-103-raw-v1", split="validation")

    def batch_iterator(batch_size=1000):
        for i in range(0, len(dataset), batch_size):
            yield dataset[i:i + batch_size]["text"]

    tokenizer.train_from_iterator(
        batch_iterator(),
        trainer=trainers.WordPieceTrainer(
            vocab_size=10000,
            special_tokens=["<unk>", "[SEP]", "<s>", "</s>"]))

    def generator():
        for record in dataset:
            if record['text'].strip() != '':
                yield record['text']

    return tokenizer, generator
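# Usage sketch (an assumption, not part of the original snippet): exercise the
# returned tokenizer, including the pair template "$A </s> [SEP] <s> $B:1".
tokenizer, generator = train_tokenizer()
first_text = next(generator())
single = tokenizer.encode(first_text)
pair = tokenizer.encode("first segment", "second segment")
print(single.tokens)   # ends with "</s>" per the single-sequence template
print(pair.tokens)     # "</s> [SEP] <s>" inserted between the two segments
print(pair.type_ids)   # tokens of the second segment carry type id 1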
from pathlib import Path

from tokenizers import Tokenizer, trainers
from tokenizers.models import WordLevel
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.processors import TemplateProcessing
from transformers import BertForMaskedLM
from transformers import BertTokenizerFast
from transformers import BertConfig
import ipdb
import os

os.environ["CUDA_VISIBLE_DEVICES"] = "2"

uid_task_id_sequence_path = 'data/feature_sequence/uid_task_id.txt'
paths = [str(x) for x in Path(".").glob('data/feature_sequence/*.txt')]

tokenizer = Tokenizer(WordLevel())
tokenizer.pre_tokenizer = Whitespace()

# trainer = trainers.BpeTrainer(
trainer = trainers.WordPieceTrainer(
    special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
# Note: the trainer is passed as the first positional argument here (older tokenizers API).
tokenizer.train(trainer, [uid_task_id_sequence_path])

tokenizer.post_processor = TemplateProcessing(
    single="[CLS] $A [SEP]",
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens=[
        ("[CLS]", tokenizer.token_to_id("[CLS]")),
        ("[SEP]", tokenizer.token_to_id("[SEP]")),
    ],
)

# tokenizer.save_model("tmp")
tokenizer.model.save('data/bert_and_tokenizer', 'uid_task_id')

# tokenizer = ByteLevelBPETokenizer(
#     "./tmp/vocab.json",
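# Follow-up sketch (an assumption, not part of the original script): model.save above
# only writes the WordLevel vocabulary file; persisting the full tokenizer (pre-tokenizer
# and post-processor included) and reloading it might look like this. The JSON filename
# is a placeholder.
tokenizer.save('data/bert_and_tokenizer/uid_task_id-tokenizer.json')
reloaded = Tokenizer.from_file('data/bert_and_tokenizer/uid_task_id-tokenizer.json')
print(reloaded.encode("example sequence").tokens)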