from argparse import ArgumentParser
from pathlib import Path

from tokenizers import ByteLevelBPETokenizer
from tokenizers.pre_tokenizers import ByteLevel

# TOKENIZER_PATH and config (providing config.vocab_size) are expected to be
# defined at module level.


def main():
    parser = ArgumentParser(description="Train a tokenizer on text files.")
    parser.add_argument("text_dir", nargs="?",
                        help="Path to the directory containing the text files (any .txt file).")
    parser.add_argument("-t", "--tokenizer_path", default=TOKENIZER_PATH,
                        help="Path to the directory where the trained tokenizer is saved.")
    args = parser.parse_args()

    text_dir = args.text_dir
    tokenizer_path = args.tokenizer_path

    if Path(tokenizer_path).exists():
        # Collect every .txt file under text_dir, recursively.
        paths = [str(x) for x in Path(text_dir).glob("**/*.txt")]

        tokenizer = ByteLevelBPETokenizer()
        # ByteLevelBPETokenizer already uses byte-level pre-tokenization by default.
        tokenizer.pre_tokenizer = ByteLevel()
        tokenizer.train(
            files=paths,
            vocab_size=config.vocab_size,
            min_frequency=2,
            special_tokens=[
                "<s>",
                "<pad>",
                "</s>",
                "<unk>",  # probably not needed if using ByteLevel pretokenization
                "<mask>",
            ],
        )
        # Writes vocab.json and merges.txt into tokenizer_path.
        tokenizer.save_model(tokenizer_path)
    else:
        print(f"{tokenizer_path} does not exist, so the tokenizer cannot be saved. "
              "Create the directory first and re-run the command.")
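# A minimal sketch of how the script above might be invoked; the script file
# name and directory layout are assumptions for illustration, not part of the
# original code. The output directory must already exist so save_model() can
# write vocab.json and merges.txt into it:
#
#   python train_tokenizer.py ./data/texts -t ./tokenizer
if __name__ == "__main__":
    main()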
from tokenizers import ByteLevelBPETokenizer
from tokenizers.pre_tokenizers import Whitespace
from transformers import GPT2Tokenizer

# SPECIAL_TOKENS, OTHER_TOKENS and the CARD_* / UNK token constants are
# expected to be defined elsewhere in this module.


def tokenize_cards(files=['./dataset/cards_train.txt', './dataset/cards_val.txt'],
                   output_dir='./tokenizer'):
    """Train a byte-level BPE tokenizer on the card dataset and save it."""
    tokenizer = ByteLevelBPETokenizer()
    tokenizer.pre_tokenizer = Whitespace()
    tokenizer.train(files=files, special_tokens=SPECIAL_TOKENS + OTHER_TOKENS)
    # Writes vocab.json and merges.txt into output_dir.
    tokenizer.save_model(output_dir)
def load_tokenizer(vocab='./tokenizer/vocab.json', merges='./tokenizer/merges.txt',
                   gpt=False, load_from=None):
    """Load either a transformers GPT2Tokenizer or a raw ByteLevelBPETokenizer."""
    if gpt:
        if load_from:
            tokenizer = GPT2Tokenizer.from_pretrained(load_from)
        else:
            tokenizer = GPT2Tokenizer(
                vocab,
                merges,
                bos_token=CARD_BEGIN,
                eos_token=CARD_END,
                sep_token=CARD_END,
                unk_token=UNK,
                pad_token=CARD_PAD,
                mask_token=CARD_MASK,
                padding_side="left",
            )
    else:
        tokenizer = ByteLevelBPETokenizer(vocab, merges)
        tokenizer.add_special_tokens(SPECIAL_TOKENS + OTHER_TOKENS)
        tokenizer.mask_token = CARD_MASK
        tokenizer.pre_tokenizer = Whitespace()
    return tokenizer
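# A minimal usage sketch for the two helpers above, assuming the dataset files
# and the ./tokenizer output directory already exist and that the token
# constants are defined in this module; it is illustrative only, not part of
# the original pipeline.
if __name__ == "__main__":
    tokenize_cards()                           # writes vocab.json / merges.txt to ./tokenizer
    gpt_tokenizer = load_tokenizer(gpt=True)   # transformers GPT2Tokenizer built from those files
    raw_tokenizer = load_tokenizer()           # raw tokenizers ByteLevelBPETokenizer
    print(gpt_tokenizer.pad_token, raw_tokenizer.get_vocab_size())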
import os
from pathlib import Path

from tokenizers import ByteLevelBPETokenizer
from tokenizers.pre_tokenizers import Whitespace


def get_french_vocab(model_name):
    # The French corpus is expected at <repo root>/Datasets/corpora/fr/text,
    # organised as one level of sub-directories containing plain-text files.
    root = Path(os.getcwd()).parent.parent.parent
    french_corpus = "Datasets/corpora/fr/text"
    fr_corpus_path = os.path.join(root, french_corpus)

    files = []
    for dir_ in os.listdir(fr_corpus_path):
        fr_corpus_dir = os.path.join(fr_corpus_path, dir_)
        for text_file in os.listdir(fr_corpus_dir):
            text_file = os.path.join(fr_corpus_dir, text_file)
            files.append(text_file)

    tokenizer = ByteLevelBPETokenizer(add_prefix_space=True)
    tokenizer.pre_tokenizer = Whitespace()
    tokenizer.train(files,
                    vocab_size=20000,
                    min_frequency=2,
                    show_progress=True,
                    special_tokens=["<sos>", "<pad>", "<eos>", "<unk>"])

    # Quick sanity check on a French sentence ("it's the best of French sentences").
    print(tokenizer.encode("c'est la meilleure des phrases françaises").tokens)
    tokenizer.save(model_name)
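# A minimal sketch of training and then reloading the saved tokenizer; the file
# name "fr_bpe_tokenizer.json" is an illustrative choice, not one from the
# original code. save() serializes the full tokenizer to a single JSON file,
# which Tokenizer.from_file() can load back.
from tokenizers import Tokenizer

if __name__ == "__main__":
    get_french_vocab("fr_bpe_tokenizer.json")
    reloaded = Tokenizer.from_file("fr_bpe_tokenizer.json")
    print(reloaded.encode("c'est la meilleure des phrases françaises").tokens)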