import os
import tempfile

from tokenizers import CharBPETokenizer

# NUL_token, PAD_token, BOS_token and UNK_token are special-token string
# constants defined elsewhere in the module.


class HuggingFaceTokenizer:
    """Thin wrapper around CharBPETokenizer with on-disk caching."""

    def __init__(self, cache_dir, max_length=None, vocab_size=400):
        self.vocab_size = vocab_size
        self.max_length = max_length
        self.cache_dir = cache_dir
        self.name = "%d-%s" % (vocab_size, max_length)
        self.tokenizer = None

        # Reuse a previously trained tokenizer if its files are already cached.
        vocab = os.path.join(self.cache_dir, self.name + '-vocab.json')
        merges = os.path.join(self.cache_dir, self.name + '-merges.txt')
        if os.path.exists(vocab) and os.path.exists(merges):
            self.tokenizer = CharBPETokenizer(vocab, merges, lowercase=True)
            print('Using cached HuggingFaceTokenizer')

    def build(self, texts):
        """Train the BPE tokenizer on `texts` unless a cached one was loaded."""
        if self.tokenizer is not None:
            return

        # The trainer expects file paths, so dump the corpus to a temp file.
        tmp_file = tempfile.NamedTemporaryFile()
        with open(tmp_file.name, "w") as f:
            f.write(' '.join(texts).lower())

        self.tokenizer = CharBPETokenizer(lowercase=True)
        self.tokenizer.train(
            [tmp_file.name],
            vocab_size=self.vocab_size,
            special_tokens=[
                NUL_token,
                PAD_token,
                BOS_token,
                UNK_token,
            ],
        )

        os.makedirs(self.cache_dir, exist_ok=True)
        self.tokenizer.save(self.cache_dir, self.name)

    def encode(self, text):
        token_ids = self.tokenizer.encode(text.lower()).ids
        token_ids = token_ids[:self.max_length]
        return token_ids

    def decode(self, tokens, skip_special_tokens=True):
        # Note: special tokens can still leak into the output even with
        # skip_special_tokens=True; filtering the ids manually (e.g. keeping
        # only token > 3) is a possible workaround.
        text = self.tokenizer.decode(
            tokens,
            skip_special_tokens=skip_special_tokens,
        )
        return text

    def decode_plus(self, token_batch):
        sentences = []
        for tokens in token_batch:
            sentences.append(self.decode(tokens))
        return sentences
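# A minimal usage sketch for the wrapper above. The corpus, cache directory
# and sample sentence are hypothetical, and the *_token constants are assumed
# to be plain strings defined at module level.
if __name__ == '__main__':
    sample_texts = ['the patient was seen today', 'no acute distress noted']
    tok = HuggingFaceTokenizer(cache_dir='./bpe-cache', max_length=32, vocab_size=400)
    tok.build(sample_texts)  # trains and caches, or reuses the cached vocab/merges
    ids = tok.encode('the patient was seen today')
    print('ids:', ids)
    print('text:', tok.decode(ids))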
def test():
    """Test trained tokenizer"""
    tokenizer = CharBPETokenizer('./thyme-tokenizer-vocab.json',
                                 './thyme-tokenizer-merges.txt')

    vocab = tokenizer.get_vocab()
    print('vocab size:', len(vocab))

    encoded = tokenizer.encode('patient dr. who diagnosed with brain abc')
    encoded.pad(15)

    print('encoded:', encoded.ids)
    print('decoded:', tokenizer.decode(encoded.ids))
    print(encoded.tokens)
    print(encoded.attention_mask)
# PAD, BOS and EOS are special-token string constants defined elsewhere.


class BPETokenizer:
    def __init__(self, text_list, vocab_size, lazy=False):
        # `text_list` is a list of text file paths to train on; `lazy=True`
        # defers construction until a trained tokenizer is attached (see load).
        if not lazy:
            self.tokenizer = CharBPETokenizer()
            self.tokenizer.train(text_list,
                                 vocab_size=vocab_size,
                                 special_tokens=[PAD, BOS, EOS, "<unk>"])
            self.tokenizer.add_special_tokens([PAD, BOS, EOS])
        else:
            self.tokenizer = None

    def tokens_to_ids(self, tokens):
        return [self.tokenizer.token_to_id(t) for t in tokens]

    def ids_to_tokens(self, ids):
        return [self.tokenizer.id_to_token(i) for i in ids]

    def encode(self, text):
        encodes = self.tokenizer.encode(text)
        return encodes.ids

    def decode(self, ids, skip_special=True):
        return self.tokenizer.decode(ids, skip_special_tokens=skip_special)

    def save(self, path, file_name):
        self.tokenizer.save(path, file_name)

    @classmethod
    def load(cls, vocab, merges):
        # Rebuild a tokenizer from previously saved vocab/merges files.
        tkz = cls(None, None, lazy=True)
        tkz.tokenizer = CharBPETokenizer(vocab, merges)
        tkz.tokenizer.add_special_tokens([PAD, BOS, EOS])
        return tkz

    def __len__(self):
        return self.tokenizer.get_vocab_size()
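# Hypothetical round-trip sketch for BPETokenizer: train on a made-up corpus
# file, save with the older tokenizers save(dir, name) convention used above,
# then reload via the classmethod. The file names and paths are assumptions.
if __name__ == '__main__':
    bpe = BPETokenizer(['./corpus.txt'], vocab_size=1000)
    print('vocab size:', len(bpe))
    ids = bpe.encode('my name is john')
    print(ids, '->', bpe.decode(ids))

    bpe.save('./bpe-1000', 'demo')
    reloaded = BPETokenizer.load('./bpe-1000/demo-vocab.json',
                                 './bpe-1000/demo-merges.txt')
    print(reloaded.decode(ids))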
def test_decoding(self, openai_files):
    tokenizer = CharBPETokenizer(openai_files["vocab"],
                                 openai_files["merges"],
                                 lowercase=True)
    decoded = tokenizer.decode(tokenizer.encode("my name is john").ids)
    assert decoded == "my name is john"
    # Tail end of a tokenizer.train(...) call; the opening of the call is not
    # included in this snippet.
    special_tokens=[
        "<blank>",
        "<bos>",
        "<unk>",
    ],
)

# os.makedirs('./BPE-1000', exist_ok=True)
tokenizer.save('./BPE-1000', '')
tokenizer = CharBPETokenizer('./BPE-1000/-vocab.json',
                             './BPE-1000/-merges.txt')

# with open('.test.pkl', 'w') as f:
#     pickle.dump(tokenizer, f)

# NOTE: as defined above, HuggingFaceTokenizer requires a cache_dir argument,
# so this call would need one as written.
tokenizer = HuggingFaceTokenizer()
print(
    tokenizer.encode(
        'might have a solution it might take a long time nobody'))
print(
    tokenizer.decode(
        tokenizer.encode(
            'might have a solution it might take a long time nobody'),
    ))

# transforms = torchaudio.transforms.MFCC(n_mfcc=40)
# concat = ConcatFeature()
# waveform = transforms(data)
# print(waveform.shape)
# waveform = concat(waveform)
# print(waveform[:, -1])