from tokenizers import BertWordPieceTokenizer
from tokenizers.normalizers import BertNormalizer, Replace, Sequence
from tokenizers.pre_tokenizers import Whitespace
from transformers import PreTrainedTokenizerFast

# `words` (an iterable of word strings) and `decoder_tokenizer_path` are
# assumed to be defined earlier in the script.

# Start vocabulary with all standard special tokens. (PAD=0!)
vocab = {}
for special_token in ["[PAD]", "[CLS]", "[SEP]", "[UNK]", "[MASK]", "[BOS]", "[EOS]"]:
    vocab[special_token] = len(vocab)

# Add other words - if not already present.
for w in words:
    if w not in vocab:
        vocab[w] = len(vocab)
print(vocab)

# New tokenizer.
init_tokenizer = BertWordPieceTokenizer(vocab=vocab)
# Split parentheses into standalone tokens before the standard BERT normalization.
init_tokenizer.normalizer = Sequence([Replace("(", " ( "), Replace(")", " ) "), BertNormalizer()])
init_tokenizer.pre_tokenizer = Whitespace()
#init_tokenizer.pad_token_id = vocab["[PAD]"]
#print("Created tokenizer: ", init_tokenizer)

# Save the created tokenizer.
init_tokenizer.save(decoder_tokenizer_path)
print("Tokenizer saved to: ", decoder_tokenizer_path)

# Load from tokenizer file.
tokenizer = PreTrainedTokenizerFast(tokenizer_file=decoder_tokenizer_path)
tokenizer.add_special_tokens({'pad_token': '[PAD]', 'cls_token': '[CLS]', 'sep_token': '[SEP]',
                              'unk_token': '[UNK]', 'mask_token': '[MASK]',
                              'bos_token': '[BOS]', 'eos_token': '[EOS]'})

print(f"\nLoaded tokenizer vocabulary ({len(tokenizer.get_vocab())}):\n" + "-" * 50)
for k, v in tokenizer.get_vocab().items():
    print(k, v)
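# A quick smoke test of the loaded tokenizer -- a minimal sketch; the sample
# string below is made up for illustration. It shows the Replace rules above
# splitting parentheses into standalone tokens before WordPiece is applied.
encoding = tokenizer("max ( a , b )")
print(encoding.tokens())      # parentheses appear as separate tokens
print(encoding["input_ids"])  # ids taken from the vocabulary built above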
from sacremoses import MosesPunctNormalizer, MosesTokenizer
from tokenizers import BertWordPieceTokenizer


class MosesPreTokenizer:
    # Only pre_tokenize appeared in the original snippet; this constructor is a
    # minimal assumption inferred from the attributes pre_tokenize uses.
    def __init__(self, lang, do_lowercase):
        self.moses_tokenizer = MosesTokenizer(lang=lang)
        self.mpn = MosesPunctNormalizer(lang=lang)
        self.do_lowercase = do_lowercase

    def pre_tokenize(self, text):
        # Normalize punctuation, optionally lowercase, then tokenize with Moses.
        return self.moses_tokenizer.tokenize(
            self.mpn.normalize(text.lower() if self.do_lowercase else text))


if __name__ == '__main__':
    lang = 'fr'
    clean_text = False
    handle_chinese_chars = True
    strip_accents = False
    lowercase = True
    vocab_size = 30000
    min_frequency = 2
    spt = ["<s>", "<pad>", "</s>", "<unk>", "<mask>",
           "[UNK]", "[SEP]", "[CLS]", "[PAD]", "[MASK]"]

    if lang == "fr":
        train_data = "../.data/wmt19_de_fr/train.fr"
    elif lang == "en":
        train_data = "../.data/wmt19_en_de/train.en"
    else:
        raise ValueError("Undefined language {}".format(lang))

    tokenizer = BertWordPieceTokenizer(clean_text=clean_text,
                                       lowercase=lowercase,
                                       handle_chinese_chars=handle_chinese_chars,
                                       strip_accents=strip_accents)
    # NOTE: recent versions of the tokenizers library expect custom pre-tokenizers
    # to be wrapped with tokenizers.pre_tokenizers.PreTokenizer.custom(...).
    tokenizer.pre_tokenizer = MosesPreTokenizer(lang, lowercase)

    # Customize training.
    print("Starting to train ...")
    tokenizer.train(files=train_data,
                    vocab_size=vocab_size,
                    show_progress=True,
                    min_frequency=min_frequency,
                    special_tokens=spt)

    # Save files to disk.
    tokenizer.save_model(".", "moses-pre-tokenized-wmt-uncased-{}".format(lang))
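# A hedged usage sketch: reload the saved WordPiece vocabulary and encode a
# made-up French sentence. The filename follows the "<prefix>-vocab.txt"
# convention used by save_model; note that the Moses pre-tokenization step is
# not stored in the vocab file and would have to be re-applied at inference.
loaded = BertWordPieceTokenizer("moses-pre-tokenized-wmt-uncased-fr-vocab.txt",
                                lowercase=True)
enc = loaded.encode("le chat dort sur le canapé .")
print(enc.tokens)  # WordPiece tokens, with "##" continuation prefixes
print(enc.ids)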