def train(args):
    tokenizer = CharBPETokenizer()
    tokenizer.train([args.corpus], vocab_size=1000)
    tokenizer.save("src/dev_scripts/tokenizer.json")
class HuggingFaceTokenizer:
    def __init__(self, cache_dir, max_length=None, vocab_size=400):
        self.vocab_size = vocab_size
        self.max_length = max_length
        self.cache_dir = cache_dir
        self.name = "%d-%s" % (vocab_size, max_length)
        self.tokenizer = None

        vocab = os.path.join(self.cache_dir, self.name + '-vocab.json')
        merges = os.path.join(self.cache_dir, self.name + '-merges.txt')
        if os.path.exists(vocab) and os.path.exists(merges):
            self.tokenizer = CharBPETokenizer(vocab, merges, lowercase=True)
            print('Using cached HuggingFaceTokenizer')

    def build(self, texts):
        if self.tokenizer is not None:
            return
        tmp_file = tempfile.NamedTemporaryFile()
        with open(tmp_file.name, "w") as f:
            f.write(' '.join(texts).lower())
        self.tokenizer = CharBPETokenizer(lowercase=True)
        self.tokenizer.train(
            [tmp_file.name],
            vocab_size=self.vocab_size,
            special_tokens=[
                NUL_token,
                PAD_token,
                BOS_token,
                UNK_token,
            ],
        )
        os.makedirs(self.cache_dir, exist_ok=True)
        # pre-0.8 tokenizers API: writes <name>-vocab.json / <name>-merges.txt
        self.tokenizer.save(self.cache_dir, self.name)

    def encode(self, text):
        token_ids = self.tokenizer.encode(text.lower()).ids
        token_ids = token_ids[:self.max_length]
        return token_ids

    def decode(self, tokens, skip_special_tokens=True):
        # NOTE: the custom special tokens are not always stripped even with
        # skip_special_tokens=True; filtering ids manually beforehand
        # (e.g. [t for t in tokens if t > 3]) is a possible workaround.
        text = self.tokenizer.decode(
            tokens,
            skip_special_tokens=skip_special_tokens,
        )
        return text

    def decode_plus(self, token_batch):
        sentences = []
        for tokens in token_batch:
            sentences.append(self.decode(tokens))
        return sentences
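# Usage sketch for HuggingFaceTokenizer (not part of the original class).
# The special-token constants are defined elsewhere in the project; the
# values below are placeholder assumptions.
NUL_token, PAD_token, BOS_token, UNK_token = "<nul>", "<pad>", "<bos>", "<unk>"

tok = HuggingFaceTokenizer(cache_dir="cache/tokenizers", max_length=64)
tok.build(["a small corpus of example sentences",
           "another example sentence"])
ids = tok.encode("another example sentence")  # ids, truncated to max_length
print(ids)
print(tok.decode(ids))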
def train():
    """Train a lowercase CharBPE tokenizer on the THYME corpus."""
    base = os.environ['DATA_ROOT']
    corpus_path = base + 'Thyme/Text/train+dev+test/*'
    files = glob.glob(corpus_path)

    tokenizer = CharBPETokenizer(lowercase=True)
    tokenizer.train(
        files=files,
        vocab_size=10000,
        min_frequency=3,
        show_progress=True)
    tokenizer.save('.', name='thyme-tokenizer')
def train_subword_tokenizer(size, special_tokens, path):
    """Train a subword (BPE) tokenizer for subword encoding.

    ref: https://github.com/huggingface/tokenizers

    Args:
        size: target vocabulary size.
        special_tokens: list of special tokens; only the first three are
            used, plus "<unk>".
        path: directory containing the training corpus (corpus_all.txt).
    """
    tokenizer = CharBPETokenizer()
    tokenizer.train(
        [path + "/corpus_all.txt"],
        vocab_size=size,
        min_frequency=2,
        show_progress=True,
        special_tokens=special_tokens[:3] + ["<unk>"],
    )
    tokenizer.save(path, "bpe")
def create_tokenizer_imdb(data_path, file_name, vocab_size):
    tokenizer = CharBPETokenizer()
    tokenizer.train(
        os.path.join(data_path, file_name),
        vocab_size=vocab_size,
        min_frequency=2,
        show_progress=True,
        special_tokens=["[CLS]", "[PAD]", "[MASK]", "[UNK]", "[SEP]"])

    print("[CLS]: {}, [PAD]: {}, [MASK]: {}, [UNK]: {}, [SEP]: {}".format(
        tokenizer.token_to_id("[CLS]"),
        tokenizer.token_to_id("[PAD]"),
        tokenizer.token_to_id("[MASK]"),
        tokenizer.token_to_id("[UNK]"),
        tokenizer.token_to_id("[SEP]")))

    tokenizer.save(data_path, "tokenizer")
def create_tokenizer(data_path, vocab_size):
    chunk_files = [f for f in os.listdir(data_path)
                   if "uncased_chunk" in f][:20]

    tokenizer = CharBPETokenizer()
    tokenizer.train(
        [os.path.join(data_path, f) for f in chunk_files],
        vocab_size=vocab_size,
        min_frequency=2,
        show_progress=True,
        special_tokens=["[CLS]", "[PAD]", "[MASK]", "[UNK]", "[SEP]"])

    print("[CLS]: {}, [PAD]: {}, [MASK]: {}, [UNK]: {}, [SEP]: {}".format(
        tokenizer.token_to_id("[CLS]"),
        tokenizer.token_to_id("[PAD]"),
        tokenizer.token_to_id("[MASK]"),
        tokenizer.token_to_id("[UNK]"),
        tokenizer.token_to_id("[SEP]")))

    tokenizer.save(data_path, "tokenizer")
class BPETokenizer:
    def __init__(self, text_list, vocab_size, lazy=False):
        if not lazy:
            self.tokenizer = CharBPETokenizer()
            self.tokenizer.train(
                text_list,
                vocab_size=vocab_size,
                special_tokens=[PAD, BOS, EOS, "<unk>"])
            self.tokenizer.add_special_tokens([PAD, BOS, EOS])
        else:
            self.tokenizer = None

    def tokens_to_ids(self, tokens):
        return [self.tokenizer.token_to_id(t) for t in tokens]

    def ids_to_tokens(self, ids):
        return [self.tokenizer.id_to_token(i) for i in ids]

    def encode(self, text):
        encodes = self.tokenizer.encode(text)
        return encodes.ids

    def decode(self, ids, skip_special=True):
        return self.tokenizer.decode(ids, skip_special_tokens=skip_special)

    def save(self, path, file_name):
        self.tokenizer.save(path, file_name)

    @classmethod
    def load(cls, vocab, merges):
        tkz = cls(None, None, lazy=True)
        tkz.tokenizer = CharBPETokenizer(vocab, merges)
        tkz.tokenizer.add_special_tokens([PAD, BOS, EOS])
        return tkz

    def __len__(self):
        return self.tokenizer.get_vocab_size()
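# Round-trip sketch for BPETokenizer (not in the original). PAD/BOS/EOS and
# corpus.txt are placeholder assumptions; the real constants live elsewhere.
PAD, BOS, EOS = "<pad>", "<bos>", "<eos>"

bpe = BPETokenizer(["corpus.txt"], vocab_size=1000)
ids = bpe.encode("hello world")
print(bpe.decode(ids))

bpe.save(".", "demo")  # pre-0.8 API: writes demo-vocab.json / demo-merges.txt
restored = BPETokenizer.load("demo-vocab.json", "demo-merges.txt")
print(restored.decode(ids))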
def main():
    batch_size = 4
    vocab_size = 16384
    max_source_length = 1024
    max_target_length = 1024
    num_workers = 3

    dataset = nlp.load_dataset("iwslt2017.py", "nl-en")

    # Train tokenizer
    tokenizer_filename = "tokenizer.json"
    if os.path.exists(tokenizer_filename):
        tokenizer = Tokenizer.from_file(tokenizer_filename)
    else:
        data_filename = "whole_data.txt"
        with open(data_filename, "w") as f:
            for item in dataset["train"]:
                f.write(item["source"] + "\n")
                f.write(item["target"] + "\n\n")
        tokenizer = CharBPETokenizer()
        tokenizer.train([data_filename], vocab_size=vocab_size)
        pad_token = AddedToken("[PAD]", lstrip=False, rstrip=False)
        tokenizer.add_tokens([pad_token])
        tokenizer.save(tokenizer_filename)
    tokenizer.pad_token_id = vocab_size

    # Loaders
    train_dataset = Seq2SeqDataset(tokenizer, dataset["train"],
                                   max_source_length, max_target_length)
    val_dataset = Seq2SeqDataset(tokenizer, dataset["validation"],
                                 max_source_length, max_target_length)
    train_loader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
        collate_fn=train_dataset.collate_fn,
        num_workers=num_workers,
    )
    val_loader = DataLoader(
        val_dataset,
        batch_size=batch_size,
        collate_fn=val_dataset.collate_fn,
        num_workers=num_workers,
    )

    # Train model
    config = BartConfig(
        vocab_size=vocab_size + 1,  # +1 for the added [PAD] token
        d_model=1024,
        encoder_ffn_dim=1024,
        encoder_layers=6,
        encoder_attention_heads=4,
        decoder_ffn_dim=1024,
        decoder_layers=6,
        decoder_attention_heads=4,
    )
    model = BartForConditionalGeneration(config)
    translator = Translate(model, tokenizer)
    trainer = pl.Trainer(gpus=1)
    trainer.fit(translator, train_loader, val_loader)
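# Sketch (not in the original main()): instead of stashing pad_token_id on
# the tokenizer object by hand, the tokenizers API can pad whole batches
# itself, using the [PAD] token added during training.
tokenizer.enable_padding(
    pad_id=tokenizer.token_to_id("[PAD]"),
    pad_token="[PAD]",
)
batch = tokenizer.encode_batch(["a short source",
                                "a somewhat longer source sentence"])
print([len(enc.ids) for enc in batch])  # equal lengths after padding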
import argparse

from tokenizers import CharBPETokenizer

parser = argparse.ArgumentParser()
parser.add_argument("--corpus", help="Path to text training corpus",
                    default="/home/benet/IRI/How2Sign/metadata/metadata.txt")
parser.add_argument("--saveto", help="Path where to save the model",
                    default="steps/tokenizer.json")
parser.add_argument("--size", help="Number of tokens / vocabulary size",
                    type=int, default=1000)

if __name__ == '__main__':
    args = parser.parse_args()
    tokenizer = CharBPETokenizer()
    tokenizer.train([args.corpus], vocab_size=args.size)
    tokenizer.save(args.saveto)
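# Reload sketch (tokenizers >= 0.8): save(path) above writes a single JSON
# file, which Tokenizer.from_file reads back; the default --saveto path is
# assumed here.
from tokenizers import Tokenizer

tokenizer = Tokenizer.from_file("steps/tokenizer.json")
print(tokenizer.encode("hello world").tokens)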
# Initialize the tokenizer (default arguments)
tokenizer = CharBPETokenizer()

# And then train
tokenizer.train(
    files,
    vocab_size=args.vocab_size,
    min_frequency=2,
    show_progress=True,
    special_tokens=['<unk>'],
    suffix='</w>',
    limit_alphabet=args.limit_alphabet,
)

# Save the files (pre-0.8 API: writes <name>-vocab.json / <name>-merges.txt)
tokenizer.save(args.out, args.name)

# Restoring model from learned vocab/merges
tokenizer = CharBPETokenizer(
    join(args.out, '{}-vocab.json'.format(args.name)),
    join(args.out, '{}-merges.txt'.format(args.name)),
)

# Test encoding
logger.info(
    'Tokens and their ids from CharBPETokenizer with GFP protein sequence: \n'
    'MSKGEE LFTGVVPILVELDGDVNGHKFSVSGEGEG DAT'
)
encoded = tokenizer.encode('MSKGEE LFTGVVPILVELDGDVNGHKFSVSGEGEG DAT')
logger.info(encoded.tokens)
logger.info(encoded.ids)

logger.info('done!')
# coding:utf-8
from pathlib import Path

from tokenizers import CharBPETokenizer

# Initialize a tokenizer
tokenizer = CharBPETokenizer()

# Then train it!
tokenizer.train(["./data/wiki_sunyang.txt"])

# And you can use it
encoded = tokenizer.encode(
    "In 2012, Sun became the first Chinese man to win an Olympic gold medal in swimming."
)
# print(encoded.tokens)

# And finally save it somewhere
# NOTE: with tokenizers < 0.8 this writes vocab.json/merges.txt into the
# directory; newer versions expect a single-file path such as "tokenizer.json".
saved_path = Path("./saved_tokenizer/wiki_sunyang")
saved_path.mkdir(exist_ok=True, parents=True)
tokenizer.save(str(saved_path))
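# Optional sanity check (not in the original script): inspect the encoding
# and round-trip it through decode.
print(encoded.tokens)
print(encoded.ids)
print(tokenizer.decode(encoded.ids))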
f.write(t + '\n')

tokenizer = CharBPETokenizer(lowercase=True)
tokenizer.train(
    ["raw_corpus.txt"],
    vocab_size=1000,
    min_frequency=2,
    special_tokens=[
        "<blank>",
        "<bos>",
        "<unk>",
    ],
)

# os.makedirs('./BPE-1000', exist_ok=True)
# name='' yields -vocab.json / -merges.txt (pre-0.8 API)
tokenizer.save('./BPE-1000', '')

tokenizer = CharBPETokenizer('./BPE-1000/-vocab.json',
                             './BPE-1000/-merges.txt')

# with open('.test.pkl', 'w') as f:
#     pickle.dump(tokenizer, f)

tokenizer = HuggingFaceTokenizer()
print(
    tokenizer.encode(
        'might have a solution it might take a long time nobody'))
print(
    tokenizer.decode(
        tokenizer.encode(
            'might have a solution it might take a long time nobody'),
    ))
def train(args):
    tokenizer = CharBPETokenizer()
    tokenizer.train([args.corpus], vocab_size=args.size)
    tokenizer.save(args.output_file)
from tokenizers import CharBPETokenizer

if __name__ == "__main__":
    # Initialize a tokenizer
    tokenizer = CharBPETokenizer()

    # Then train it!
    tokenizer.train(
        [
            "data\\train.txt",
            "D:/数据/wikitext-2-raw-v1/wikitext-2-raw/wiki.train.raw",
            "D:/数据/webtext2019zh/web_text_raw.txt",
        ],
        vocab_size=30000,
        min_frequency=2,
        special_tokens=['<UNK>', '<BOS>', '<EOS>', '<PAD>', '<CLS>', '<SEP>'])

    # Now, let's use it:
    encoded = tokenizer.encode("I can feel the magic, can you?")

    # And finally save it somewhere
    tokenizer.save("./", "bpe.tokenizer.json")
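# Reload sketch, assuming the pre-0.8 two-argument save above, which writes
# <name>-vocab.json / <name>-merges.txt rather than a single JSON file.
restored = CharBPETokenizer(
    "./bpe.tokenizer.json-vocab.json",
    "./bpe.tokenizer.json-merges.txt",
)
print(restored.encode("I can feel the magic, can you?").tokens)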