def __init__(self, path_to_text_file: str, tokenizer_in: Tokenizer, tokenizer_out: Tokenizer,
             max_sequence_length: int, sep: str, **kwargs):
    logger.info("Processing file: {}".format(path_to_text_file))
    self.pad_token_in = tokenizer_in.get_vocab()['<PAD>']
    self.pad_token_out = tokenizer_out.get_vocab()['<PAD>']
    self.max_sequence_length = max_sequence_length

    with open(path_to_text_file, "r") as file:
        texts = file.readlines()
    texts = list(map(lambda x: x.split(sep), texts))
    texts = list(map(lambda x: x[0:2], texts))

    self.texts_in = []
    self.texts_in_length = []
    self.texts_out = []
    for i in tqdm(range(len(texts)), desc="Tokenization...."):
        text_in_ids = tokenizer_in.encode(texts[i][0]).ids
        texts_out_ids = tokenizer_out.encode(texts[i][1]).ids
        # Keep only non-empty pairs where both sides fit within max_sequence_length
        # (the original condition only compared the output length against the limit).
        if 0 < len(text_in_ids) <= max_sequence_length and len(texts_out_ids) <= max_sequence_length:
            self.texts_in.append(text_in_ids)
            self.texts_in_length.append(len(text_in_ids))
            self.texts_out.append(texts_out_ids)
    logger.info("# Texts: {}".format(len(self.texts_in)))
def test_encode_add_special_tokens(self, roberta_files):
    with pytest.deprecated_call():
        tokenizer = Tokenizer(BPE(roberta_files["vocab"], roberta_files["merges"]))
    tokenizer.add_special_tokens(["<s>", "</s>"])
    tokenizer.pre_tokenizer = ByteLevel(add_prefix_space=True)
    tokenizer.post_processor = RobertaProcessing(
        ("</s>", tokenizer.token_to_id("</s>")),
        ("<s>", tokenizer.token_to_id("<s>")),
    )

    # Can encode with special tokens
    output_with_specials = tokenizer.encode("My name is John", add_special_tokens=True)
    assert output_with_specials.tokens == ["<s>", "ĠMy", "Ġname", "Ġis", "ĠJohn", "</s>"]

    # Can encode without special tokens
    output_without_specials = tokenizer.encode("My name is John", add_special_tokens=False)
    assert output_without_specials.tokens == ["ĠMy", "Ġname", "Ġis", "ĠJohn"]
def test_encode(self):
    tokenizer = Tokenizer(BPE())
    tokenizer.add_tokens(["my", "name", "is", "john", "pair"])

    # Can encode single sequence
    output = tokenizer.encode("my name is john")
    assert output.tokens == ["my", "name", "is", "john"]
    assert type(output.ids) == list
    assert type(output.type_ids) == list
    assert type(output.offsets) == list
    with pytest.warns(DeprecationWarning):
        assert type(output.words) == list
    assert type(output.word_ids) == list
    assert type(output.special_tokens_mask) == list
    assert type(output.attention_mask) == list
    assert type(output.overflowing) == list

    # Can encode a pair of sequences
    output = tokenizer.encode("my name is john", "pair")
    assert output.tokens == ["my", "name", "is", "john", "pair"]
    assert isinstance(pickle.loads(pickle.dumps(output)), Encoding)

    # Can encode a single pre-tokenized sequence
    output = tokenizer.encode(["my", "name", "is", "john"], is_pretokenized=True)
    assert output.tokens == ["my", "name", "is", "john"]

    # Can encode a batch with both a single sequence and a pair of sequences
    output = tokenizer.encode_batch(["my name is john", ("my name is john", "pair")])
    assert len(output) == 2
def test_roberta_parity(self):
    tokenizer = Tokenizer(BPE())
    tokenizer.add_special_tokens(["<s>", "</s>"])
    tokenizer.add_tokens(["my", "name", "is", "john", "pair"])
    tokenizer.post_processor = RobertaProcessing(("</s>", 1), ("<s>", 0))

    original = tokenizer.encode("my name is john", "pair")
    tokenizer.post_processor = self.get_roberta()
    template = tokenizer.encode("my name is john", "pair")
    assert original.ids == template.ids
def test_bert_parity(self):
    tokenizer = Tokenizer(BPE())
    tokenizer.add_special_tokens(["[SEP]", "[CLS]"])
    tokenizer.add_tokens(["my", "name", "is", "john", "pair"])
    tokenizer.post_processor = BertProcessing(("[SEP]", 0), ("[CLS]", 1))

    original = tokenizer.encode("my name", "pair")
    tokenizer.post_processor = self.get_bert()
    template = tokenizer.encode("my name", "pair")
    assert original.ids == template.ids
def test_truncation(self):
    tokenizer = Tokenizer(BPE())
    tokenizer.add_tokens(["my", "name", "is", "john", "pair"])
    tokenizer.enable_truncation(2)

    # Can truncate single sequences
    output = tokenizer.encode("my name is john")
    assert output.tokens == ["my", "name"]

    # Can truncate pair sequences as well
    output = tokenizer.encode("my name is john", "pair")
    assert output.tokens == ["my", "pair"]
def test_processing(self, roberta_files):
    tokenizer = Tokenizer(BPE.from_files(roberta_files["vocab"], roberta_files["merges"]))
    tokenizer.pre_tokenizer = ByteLevelPreTokenizer(add_prefix_space=True)

    # Keeps original offsets
    output = tokenizer.encode("My name is John")
    assert output.tokens == ["ĠMy", "Ġname", "Ġis", "ĠJohn"]
    assert output.offsets == [(0, 2), (2, 7), (7, 10), (10, 15)]

    # Trims offsets when activated
    tokenizer.post_processor = ByteLevel(trim_offsets=True)
    output = tokenizer.encode("My name is John")
    assert output.tokens == ["ĠMy", "Ġname", "Ġis", "ĠJohn"]
    assert output.offsets == [(0, 2), (3, 7), (8, 10), (11, 15)]
def test_post_process(self):
    tokenizer = Tokenizer(BPE())
    tokenizer.add_tokens(["my", "name", "is", "john", "pair"])
    tokenizer.enable_truncation(2)
    tokenizer.enable_padding(length=4)

    encoding = tokenizer.encode("my name is john")
    pair_encoding = tokenizer.encode("pair")

    # Can post process a single encoding
    output = tokenizer.post_process(encoding)
    assert output.tokens == ["my", "name", "[PAD]", "[PAD]"]

    # Can post process a pair of encodings
    output = tokenizer.post_process(encoding, pair_encoding)
    assert output.tokens == ["my", "pair", "[PAD]", "[PAD]"]
def train_tokenizer(args):
    """Train a tokenizer according to the given config.

    Arguments:
        args {dictionary} -- arguments object
    """
    # Tokenizer train
    morpheme_func = None
    if args.tokenizer.pretokenizer_type == "khaiii":
        api = KhaiiiApi()
        morpheme_func = api.analyze
    elif args.tokenizer.pretokenizer_type == "mecab":
        mecab = Mecab()
        morpheme_func = mecab.morphs

    # tokenizer-type: choices=["bbpe", "cbpe", "wp"], default="bbpe"
    if args.tokenizer.tokenizer_type == "bbpe":
        # tokenizer = ByteLevelBPETokenizer()
        tokenizer = Tokenizer(BPE())
        # tokenizer.pre_tokenizer = BertPreTokenizer()
        trainer = BpeTrainer(
            special_tokens=omegalist_to_list(args.tokenizer.special_tokens),
            vocab_size=args.tokenizer.vocab_size,
            min_frequency=args.tokenizer.min_frequency,
        )
    elif args.tokenizer.tokenizer_type == "cbpe":
        tokenizer = Tokenizer(BPE())
        # NOTE: CharDelimiterSplit must be instantiated with a delimiter; a space is assumed here.
        tokenizer.pre_tokenizer = CharDelimiterSplit(" ")
        trainer = BpeTrainer(
            special_tokens=omegalist_to_list(args.tokenizer.special_tokens),
            vocab_size=args.tokenizer.vocab_size,
            min_frequency=args.tokenizer.min_frequency,
        )
    elif args.tokenizer.tokenizer_type == "wp":
        tokenizer = Tokenizer(WordPiece())
        # tokenizer.pre_tokenizer = Whitespace()
        trainer = WordPieceTrainer(
            special_tokens=omegalist_to_list(args.tokenizer.special_tokens),
            vocab_size=args.tokenizer.vocab_size,
            min_frequency=args.tokenizer.min_frequency,
        )

    # The trainer built above must be passed explicitly, otherwise it is ignored.
    tokenizer.train_from_iterator(get_pretokenize_generator(morpheme_func), trainer=trainer)
    tokenizer.save(f"../vocab/{args.tokenizer.tokenizer_type}.vocab")

    test_string = "안녕하세요 이것은 테스트입니다. 구름은 하늘에 떠 있고 우리는 여기있어"
    output = tokenizer.encode(test_string)
    print(f"output:{output}")
    print(f"tokens:{output.tokens}")
    print(f"ids   :{output.ids}")
    print(f"offset:{output.offsets}")
    print(f"decode:{tokenizer.decode(output.ids)}")

    datasets = get_datasets(args.tokenizer.data_path)
    for line in datasets:
        print(line)
        break
def gen(tokenizer_tgt: Tokenizer, model: GPT2LMHeadModel, device, prompt=None, n=10,
        tokenizer_eng=None, token_id_map=[], cfg={}):
    input_ids = None
    if prompt is not None and prompt.strip() != '':
        prompt = prompt.strip()
        if type(tokenizer_tgt) == Tokenizer:
            ids = [model.config.bos_token_id] + tokenizer_tgt.encode(prompt, None).ids
        else:
            ids = tokenizer_tgt.encode(prompt)
        input_ids = torch.LongTensor(ids).unsqueeze(0).to(device)

    for _ in range(max(n // 5, 1)):
        m = min(5, n)
        batch_ids = model.generate(input_ids=input_ids,
                                   num_return_sequences=m,
                                   max_length=200,
                                   do_sample=True,
                                   top_k=10,
                                   top_p=0.9,
                                   temperature=2.0,
                                   repetition_penalty=10.0,
                                   num_beams=10,
                                   pad_token_id=cfg['pad_token_id'],
                                   bos_token_id=cfg['bos_token_id'],
                                   eos_token_id=cfg['eos_token_id'],
                                   no_repeat_ngram_size=4)
        for i in range(m):
            ids_tgt = batch_ids[i].flatten().tolist()
            txt_tgt = tokenizer_tgt.decode(ids_tgt, skip_special_tokens=True).strip()
            if tokenizer_eng is not None:
                ids_eng = [token_id_map[i] for i in ids_tgt if i not in [1, 2]]
                txt_eng = tokenizer_eng.decode(ids_eng, skip_special_tokens=True).strip()
                yield txt_tgt, txt_eng
                continue
            yield txt_tgt
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser((DataTrainingArguments, CustomOthersArguments))
    (data_args, custom_args) = parser.parse_args_into_dataclasses()

    train_files = list(sorted(glob.glob(f'{data_args.train_dir}/*.{data_args.ext}')))
    validation_files = list(sorted(glob.glob(f'{data_args.eval_dir}/*.{data_args.ext}')))
    additional_special_tokens = ADDITIONAL_SPECIAL_TOKENS

    pre_tokenizer_func = PRE_TOKENIZERS_MAP.get(custom_args.pre_tokenizer_type, None)
    if pre_tokenizer_func is None:
        raise NotImplementedError
    elif custom_args.pre_tokenizer_type == 'sefr_cut':
        raise ValueError('sefr_cut is slow; use fake_sefr_cut with sefr_cut_pre_tokenizer instead')

    if not os.path.exists(custom_args.output_file) or custom_args.overwrite_output_file:
        trainer = WordLevelTrainer(pre_tokenize_func=pre_tokenizer_func,
                                   vocab_size=custom_args.vocab_size,
                                   vocab_min_freq=custom_args.vocab_min_freq,
                                   input_files=train_files,
                                   additional_special_tokens=additional_special_tokens)
        trainer.count_parallel()
        trainer.save_vocab(custom_args.output_file)

    if custom_args.pre_tokenizer_type == 'fake_sefr_cut':
        custom_pre_tokenizer = pre_tokenizers.PreTokenizer.custom(
            FakeSefrCustomTokenizer(PRE_TOKENIZERS_MAP['fake_sefr_cut_keep_split_token']))
    else:
        custom_pre_tokenizer = pre_tokenizers.PreTokenizer.custom(
            CustomPreTokenizer(pre_tokenizer_func))

    tokenizer = Tokenizer(models.WordLevel.from_file(custom_args.output_file, unk_token='<unk>'))
    tokenizer.pre_tokenizer = custom_pre_tokenizer

    if custom_args.debug:
        print('Tokenize following text.')
        texts = ['<s>โรนัลโดเขาได้เล่นกับทีม</s>', 'โปรตุเกสมีโรนัลโด',
                 'โรนัลโดเขาได้เล่นกับทีม\nโปรตุเกสมีโรนัลโด']
        ids = [e.ids for e in tokenizer.encode_batch(texts)]
        decoded_texts = tokenizer.decode_batch(ids)
        decoded_texts = [text.replace(' ', '') for text in decoded_texts]
        for text, i, decoded_text in zip(texts, ids, decoded_texts):
            print('Text: ', text, '>>', 'Tokenized: ', i, '>>', 'Decoded: ', decoded_text)

        with open(validation_files[0], 'r') as f:
            while True:
                line = f.readline()
                if line:
                    line = line.strip()
                    if len(line) > 0 and not line.isspace():
                        encoded = tokenizer.encode(line)
                        decoded = tokenizer.decode(encoded.ids).replace(' ', '')
                        print('Text: ', line, '>>', encoded.ids, '>>', decoded)
                else:
                    break
def test_processing(self):
    tokenizer = Tokenizer(BPE())
    tokenizer.add_special_tokens(["<s>", "</s>"])
    tokenizer.add_tokens(["my", "name", "is", "john", "pair"])
    tokenizer.post_processor = RobertaProcessing(("</s>", 1), ("<s>", 0))

    output = tokenizer.encode("my name", "pair")
    assert output.tokens == ["<s>", "my", "name", "</s>", "</s>", "pair", "</s>"]
    assert output.ids == [0, 2, 3, 1, 1, 6, 1]
def test_processing(self):
    tokenizer = Tokenizer(BPE())
    tokenizer.add_special_tokens(["[SEP]", "[CLS]"])
    tokenizer.add_tokens(["my", "name", "is", "john", "pair"])
    tokenizer.post_processor = BertProcessing(("[SEP]", 0), ("[CLS]", 1))

    output = tokenizer.encode("my name", "pair")
    assert output.tokens == ["[CLS]", "my", "name", "[SEP]", "pair", "[SEP]"]
    assert output.ids == [1, 2, 3, 0, 6, 0]
def test_padding(self):
    tokenizer = Tokenizer(BPE())
    tokenizer.add_tokens(["my", "name", "is", "john", "pair"])

    # By default it does nothing when encoding a single sequence
    tokenizer.enable_padding()
    output = tokenizer.encode("my name")
    assert output.tokens == ["my", "name"]

    # Can pad to the longest in a batch
    output = tokenizer.encode_batch(["my name", "my name is john"])
    assert all([len(encoding) == 4 for encoding in output])

    # Can pad to the specified max length otherwise
    tokenizer.enable_padding(max_length=4)
    output = tokenizer.encode("my name")
    assert output.tokens == ["my", "name", "[PAD]", "[PAD]"]
    output = tokenizer.encode("my name", "pair")
    assert output.tokens == ["my", "name", "pair", "[PAD]"]
class TextDataset(Dataset):
    def __init__(
        self,
        path_src,
        path_tgt,
        path_tokenizer,
        path_root: Optional[str] = '',
    ):
        self.path_src = path_root + path_src
        self.path_tgt = path_root + path_tgt
        self.len = 0
        self.max_len = 512
        self.tokenizer = Tokenizer(
            BPE(
                path_root + path_tokenizer + 'vocab.json',
                path_root + path_tokenizer + 'merges.txt',
            ))
        self.tokenizer.normalizer = Sequence([NFKC(), Lowercase()])

        with open(self.path_src, 'r+') as f:
            lines_src = f.readlines()
        with open(self.path_tgt, 'r+') as f:
            lines_tgt = f.readlines()
        self.len = len(lines_src)
        self.example = list(zip(lines_src, lines_tgt))

    def _encode(self, src_line, tgt_line):
        src = self.tokenizer.encode(str(src_line)).ids
        tgt = self.tokenizer.encode(str(tgt_line)).ids
        if len(src) > self.max_len:
            self.max_len = len(src)
        if len(tgt) > self.max_len:
            self.max_len = len(tgt)
        return torch.tensor(src), torch.tensor(tgt), len(src), len(tgt)

    def __len__(self):
        return self.len

    def __getitem__(self, i):
        return self._encode(*self.example[i])

    @staticmethod
    def pad_collate(batch):
        (x, y, x_len, y_len) = zip(*batch)
        x_pad = pad_sequence(x, batch_first=True, padding_value=0)
        y_pad = pad_sequence(y, batch_first=True, padding_value=0)
        return x_pad, y_pad, x_len, y_len
def tokenize_corpus(src_dir, dst_dir, tokenizer: Tokenizer, force):
    for i, doc_path in enumerate(sorted(src_dir.glob('*.txt')), start=1):
        cat = doc_path.name.replace('.txt', '')
        dst_path = dst_dir / f'{cat}.npy'
        print(f'[{i:>3,}] {cat} ({dst_path})')
        if dst_path.exists() and not force:
            print(f' > destination path {dst_path} already exists. skipping')
            continue

        token_ids = []
        print(f' > reading {doc_path}')
        if cat == 'full':
            n_lines = 500_000
            print(f'reading in chunks of {n_lines:,} lines')
            with open(doc_path) as f:
                lines = []
                for line in f:
                    lines.append(line)
                    if len(lines) >= n_lines:
                        print(f' > tokenizing {len(lines):,} lines')
                        token_ids.extend(tokenizer.encode(''.join(lines)).ids)
                        lines = []
                if len(lines) > 0:
                    print(f' > tokenizing {len(lines):,} lines')
                    token_ids.extend(tokenizer.encode(''.join(lines)).ids)
            token_ids = np.array(token_ids)
        else:
            with open(doc_path) as f:
                txt = f.read()
            print(' > tokenizing')
            token_ids = np.array(tokenizer.encode(txt).ids)

        print(f' > saving to {dst_path}')
        np.save(dst_path, token_ids)
class BPETokenizer(object):
    def __init__(self, vocab_size=25000, min_freq=5, lang="en", files=[None, None]) -> None:
        """
        Args:
            vocab_size: (int)
            min_freq: minimum frequency
            lang:
            files: (List[str]) ["vocab.json", "merge.txt"]
        """
        super(BPETokenizer, self).__init__()
        self.tokenizer = Tokenizer(BPE(files[0], files[1]))
        self.lang = lang
        self.trainer = BpeTrainer(vocab_size=vocab_size,
                                  min_frequency=min_freq,
                                  special_tokens=["[PAD]", "[SEP]"],
                                  initial_alphabet=ByteLevel.alphabet())
        # https://huggingface.co/docs/tokenizers/python/latest/components.html#normalizers
        self.tokenizer.normalizer = Sequence([NFKC(), Lowercase()])
        # https://huggingface.co/docs/tokenizers/python/latest/components.html#pre-tokenizers
        self.tokenizer.pre_tokenizer = ByteLevel()
        self.tokenizer.decoder = ByteLevelDecoder()

    def train(self, files=None) -> None:
        if files is None:
            # files looks like: ["test.txt", "train.txt", "valid.txt"]
            files = [
                f"data/wikitext-103-raw/wiki.{split}.raw"
                for split in ["test", "train", "valid"]
            ]
        self.tokenizer.train(files, self.trainer)

    def save(self) -> None:
        self.tokenizer.model.save(f"data/tokenizer/{self.lang}")

    def encode(self, input: Union[str, List[str], Tuple[str]]) -> Encoding:
        return self.tokenizer.encode(input)

    def decode(self, input: Encoding) -> str:
        # Note: type(input) == Encoding
        return self.tokenizer.decode(input.ids)
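# A minimal usage sketch (not part of the original source) for the BPETokenizer wrapper
# above. The training corpus and output directory are the defaults hard-coded in train()
# and save(), and are assumed to exist on disk before running.
if __name__ == "__main__":
    bpe = BPETokenizer(vocab_size=25000, min_freq=5, lang="en")
    bpe.train()    # trains on the default wikitext-103-raw splits listed in train()
    bpe.save()     # writes the BPE model files under data/tokenizer/en
    encoding = bpe.encode("Hello world, this is a round-trip test.")
    print(encoding.tokens)
    print(bpe.decode(encoding))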
def __init__(self, tokenizer: Tokenizer, args, file_paths: str, block_size=512):
    assert all([os.path.isfile(file_path) for file_path in file_paths])
    block_size = block_size - 2  # Reduce by 2 to account for [CLS] and [SEP] tokens

    directory, filename = os.path.split(file_paths[0])
    cached_features_file = os.path.join(
        directory, args.model_type + "_cached_lm_" + str(block_size) + "_" + Path(filename).stem)

    if os.path.exists(cached_features_file) and not args.overwrite_cache:
        logger.info("Loading features from cached file %s", cached_features_file)
        with open(cached_features_file, "rb") as handle:
            self.examples = pickle.load(handle)
    else:
        logger.info("Reading dataset at %s", file_paths)
        text = []
        for file_path in file_paths:
            with open(file_path, encoding="utf-8") as f:
                text += f.readlines()

        logger.info("Creating features from dataset file at %s", directory)
        # Get all token IDs except [CLS] and [SEP] and flat-map the IDs
        tokenized_text = [
            t for tokenized in tokenizer.encode_batch(text) for t in tokenized.ids[1:-1]
        ]
        cls_token, sep_token = tokenizer.encode('').ids

        self.examples = []
        for i in range(0, len(tokenized_text) - block_size + 1, block_size):
            # Truncate in blocks of block_size
            self.examples.append([cls_token] + tokenized_text[i:i + block_size] + [sep_token])
        # Note that we are losing the last truncated example here for the sake of simplicity (no padding)
        # If your dataset is small, first you should look for a bigger one :-) and second you
        # can change this behavior by adding (model specific) padding.

        logger.info("Saving features into cached file %s", cached_features_file)
        Path(cached_features_file).parent.mkdir(exist_ok=True, parents=True)
        with open(cached_features_file, "wb") as handle:
            pickle.dump(self.examples, handle, protocol=pickle.HIGHEST_PROTOCOL)
def test_encode(self):
    tokenizer = Tokenizer(BPE())
    tokenizer.add_tokens(["my", "name", "is", "john", "pair"])

    # Can encode single sequence
    output = tokenizer.encode("my name is john")
    assert output.tokens == ["my", "name", "is", "john"]
    assert type(output.ids) == list
    assert type(output.type_ids) == list
    assert type(output.offsets) == list
    assert type(output.words) == list
    assert type(output.special_tokens_mask) == list
    assert type(output.attention_mask) == list
    assert type(output.overflowing) == list

    # Can encode a pair of sequences
    output = tokenizer.encode("my name is john", "pair")
    assert output.tokens == ["my", "name", "is", "john", "pair"]

    # Can encode a batch with both a single sequence and a pair of sequences
    output = tokenizer.encode_batch(["my name is john", ("my name is john", "pair")])
    assert len(output) == 2
def test_train_with_special_tokens(self):
    filename = "tests/data/dummy-unigram-special_tokens-train.txt"
    with open(filename, "w") as f:
        f.write(
            """
[CLS] The Zen of Python, by Tim Peters [SEP]
[CLS] Beautiful is better than ugly. [SEP]
[CLS] Explicit is better than implicit. [SEP]
[CLS] Simple is better than complex. [SEP]
[CLS] Complex is better than complicated. [SEP]
[CLS] Flat is better than nested. [SEP]
[CLS] Sparse is better than dense. [SEP]
[CLS] Readability counts. [SEP]
[CLS] Special cases aren't special enough to break the rules. [SEP]
[CLS] Although practicality beats purity. [SEP]
[CLS] Errors should never pass silently. [SEP]
[CLS] Unless explicitly silenced. [SEP]
[CLS] In the face of ambiguity, refuse the temptation to guess. [SEP]
[CLS] There should be one-- and preferably only one --obvious way to do it. [SEP]
[CLS] Although that way may not be obvious at first unless you're Dutch. [SEP]
[CLS] Now is better than never. [SEP]
[CLS] Although never is often better than *right* now. [SEP]
[CLS] If the implementation is hard to explain, it's a bad idea. [SEP]
[CLS] If the implementation is easy to explain, it may be a good idea. [SEP]
[CLS] Namespaces are one honking great idea -- let's do more of those! [SEP]
"""
        )

    tokenizer = Tokenizer(models.Unigram())
    trainer = trainers.UnigramTrainer(
        show_progress=False,
        special_tokens=["[PAD]", "[SEP]", "[CLS]"],
        unk_token="[UNK]",
    )
    tokenizer.train([filename], trainer=trainer)

    assert tokenizer.encode("[CLS] This is a test [SEP]").tokens == [
        "[CLS]",
        " T",
        "h",
        "i",
        "s",
        " is ",
        "a",
        " ",
        "te",
        "s",
        "t ",
        "[SEP]",
    ]
def test_truncation(self):
    tokenizer = Tokenizer(BPE())
    tokenizer.add_tokens(["my", "name", "is", "john", "pair"])
    tokenizer.enable_truncation(2)

    # Can truncate single sequences
    output = tokenizer.encode("my name is john")
    assert output.tokens == ["my", "name"]

    # Can truncate pair sequences as well
    output = tokenizer.encode("my name is john", "pair")
    assert output.tokens == ["my", "pair"]

    # Can get the params and give them to enable_truncation
    trunc = tokenizer.truncation
    tokenizer.enable_truncation(**trunc)

    # Left truncation direction
    tokenizer.enable_truncation(2, direction="left")
    output = tokenizer.encode("my name is john")
    assert output.tokens == ["is", "john"]
    output = tokenizer.encode("my name is john", "pair")
    assert output.tokens == ["john", "pair"]
def wordpiece_tokenize(line):
    tokenizer = Tokenizer(WordPiece(wordpiece_dict3))
    tokenizer.enable_padding(length=200)
    tokenizer.enable_truncation(max_length=200)
    tokenizer.pre_tokenizer = WhitespaceSplit()
    tokenizer.post_processor = TemplateProcessing(
        single="[CLS] $A [SEP]",
        pair="[CLS] $A [SEP] $B:1 [SEP]:1",
        special_tokens=[
            ("[CLS]", 1),
            ("[SEP]", 2),
        ],
    )
    output = tokenizer.encode(line)
    return output.ids
def main(args):
    if args.do_train:
        # Initialize a tokenizer
        files = get_smi_files(args.training_files)
        print("Training BPE tokenizer using the following files:{}".format(files))
        tokenizer = Tokenizer(models.BPE(unk_token="<unk>"))
        tokenizer.enable_padding(pad_id=args.vocab_size + 2,
                                 pad_token="<pad>",
                                 length=args.pad_len)
        tokenizer.enable_truncation(max_length=args.pad_len, strategy='only_first')
        tokenizer.normalizer = Sequence([NFKC()])
        tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
        tokenizer.decoder = decoders.ByteLevel()
        tokenizer.post_processor = processors.ByteLevel(trim_offsets=True)

        # Train the tokenizer
        trainer = trainers.BpeTrainer(show_progress=True,
                                      vocab_size=args.vocab_size,
                                      min_frequency=args.min_frequency)
        tokenizer.train(files, trainer=trainer)
        tokenizer.add_tokens(["<start>", "<end>"])
        tokenizer.save(os.path.join('tokenizers', args.tokenizer_name), pretty=True)
        print("Trained vocab size: {}".format(tokenizer.get_vocab_size()))

    if args.do_test:
        # Test the tokenizer
        tokenizer = Tokenizer.from_file(os.path.join('tokenizers', args.tokenizer_name))
        print("Testing with SMILES String: {}".format(args.test_string))
        encoding = tokenizer.encode(args.test_string)
        print("Encoded string: {}".format(encoding.tokens))
        print(encoding.ids)
        decoded = tokenizer.decode(encoding.ids)
        print("Decoded string: {}".format(decoded))
class LitTokenizer:
    def __init__(self, padding=False, truncation=False, max_length=None, lower=False, lang=None):
        super().__init__()
        self.UNK_WORD = '[UNK]'
        self.PAD_WORD = '[PAD]'
        self.MASK_WORD = '[MASK]'
        self.SOS_WORD = '[SOS]'
        self.EOS_WORD = '[EOS]'
        self.special_tokens = [
            self.UNK_WORD, self.PAD_WORD, self.MASK_WORD, self.SOS_WORD, self.EOS_WORD
        ]

        # Define tokenizer
        self.tokenizer = None
        self.configure_tokenizers(padding, truncation, max_length, lower)

        # Other
        self.lang = lang

    def get_vocab_size(self):
        return self.tokenizer.get_vocab_size()

    def configure_tokenizers(self, padding, truncation, max_length, lower):
        # Settings
        pad_length = None
        if padding in {True, "longest"}:
            pass
        elif padding in {"max_length"}:
            pad_length = max_length
        elif padding in {False, "do_not_pad"}:
            pass
        else:
            raise ValueError("Unknown padding type")

        # SRC tokenizer
        tok_normalizers = [NFD(), Strip()]
        if lower:
            tok_normalizers += [Lowercase()]

        self.tokenizer = Tokenizer(tok_model())  # unk_token=... not working
        self.tokenizer.add_special_tokens(self.special_tokens)
        self.tokenizer.pre_tokenizer = pre_tokenizers.Sequence([WhitespaceSplit()])
        self.tokenizer.normalizer = normalizers.Sequence(tok_normalizers)  # StripAccents requires NFD
        self.tokenizer.decoder = tok_decoder()

        # Define template (needed for the sos/eos tokens)
        basic_template = TemplateProcessing(
            single=f"{self.SOS_WORD} $A {self.EOS_WORD}",
            pair=f"{self.SOS_WORD} $A {self.EOS_WORD} {self.SOS_WORD} $B {self.EOS_WORD}",
            special_tokens=[
                (self.SOS_WORD, self.tokenizer.token_to_id(self.SOS_WORD)),
                (self.EOS_WORD, self.tokenizer.token_to_id(self.EOS_WORD)),
            ],
        )
        self.tokenizer.post_processor = basic_template

        if padding:
            self.tokenizer.enable_padding(pad_id=self.tokenizer.token_to_id(self.PAD_WORD),
                                          pad_token=self.PAD_WORD,
                                          length=pad_length)
        if truncation:
            self.tokenizer.enable_truncation(max_length, stride=0, strategy='longest_first')

    def load_vocab(self, vocab, merges):
        vocab, merges = tok_model.read_file(vocab, merges)
        self.tokenizer.model = tok_model(vocab, merges)

    def train_vocab(self, files, vocab_size=32000, min_frequency=3):
        # Train trainer
        trainer = tok_trainer(vocab_size=vocab_size, min_frequency=min_frequency)
        self.tokenizer.train(files, trainer)

    def save_vocab(self, output_dir, prefix):
        self.tokenizer.model.save(output_dir, prefix)

    def pad(self, examples, keys=None):
        pad_idx = self.special_tokens.index(self.PAD_WORD)

        # Keys to modify
        if not keys:
            keys = list(examples[0].keys())

        d = {}
        for k in keys:
            # Collect same-type items (list of IDs, list of masks,...)
            d[k] = [x[k] for x in examples]

            # Get max length (value to pad)
            max_length = max([x.shape[-1] for x in d[k]])

            # Apply padding
            for i, x in enumerate(examples):
                unpadded_t = x[k]
                if k == "ids":
                    tmp = torch.full((max_length,), fill_value=pad_idx,
                                     device=unpadded_t.device)  # All padding
                elif k == "attention_mask":
                    tmp = torch.full((max_length,), fill_value=0,
                                     device=unpadded_t.device)  # No attention mask
                else:
                    raise TypeError("Unknown key")
                tmp[:unpadded_t.shape[-1]] = unpadded_t
                d[k][i] = tmp
        return d

    def encode(self, x):
        return self.tokenizer.encode(x)

    def decode(self, x):
        if isinstance(x, torch.Tensor):
            assert len(x.shape) == 2
            x = x.detach().cpu().numpy()
        return [self.tokenizer.decode(x_i) for x_i in x]
char_map = {k: v + 1 for k, v in bpe_vocab.items() if len(k) == 1}
print(f"Char map size: {len(char_map)}\n")

MAX_LEN_OF_WORD = max([len(w) for w in bpe_vocab])
print(f"Max length of word: {MAX_LEN_OF_WORD}\n")

if ZERO_PAD:
    word_map = {
        k: [char_map[c] for c in k] + [0] * (MAX_LEN_OF_WORD - len(k))
        for k in bpe_vocab
    }
else:
    word_map = {k: [char_map[c] for c in k] for k in bpe_vocab}

name_bpe_words = {n: tokenizer.encode(w).tokens for n, w in name_words.items()}
MAX_LEN_OF_MENTION = max([len(v) for v in name_bpe_words.values()])
print(f"Max length of names: {MAX_LEN_OF_MENTION}\n")

gen_data = GenData(name_bpe_words, word_map, bpe_vocab)
data_sets = gen_data(data)
# train_data, test_data, dev_data = [
#     dict(dt) for dt in list(gen_data(data).values())]
train_data, test_data, dev_data = [
    data_sets[k] for k in ['train', "test", "dev"]
]

global_var = {
    "MAX_LEN_OF_MENTION": MAX_LEN_OF_MENTION,
    "MAX_LEN_OF_WORD": MAX_LEN_OF_WORD
}
class MLMPreprocessor:
    def __init__(
        self,
        load_from: str = None,
        vocab_size: int = 10000,
        max_example_len: int = 128,
        batch_size: int = 16,
        num_stopwords: int = 250,
        mask_output_len: int = 4,
    ):
        self.char_dict: Dict[str, int] = {}
        self.char_rev: Dict[int, str] = {}
        self.token_dict: Dict[str, int] = {}
        self.token_rev: Dict[int, str] = {}
        self.vocab_size = vocab_size
        self.max_example_len = max_example_len
        self.batch_size = batch_size
        self.num_stopwords = num_stopwords
        self.mask_output_len = mask_output_len
        self.tokenizer_fit = False
        self.tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
        self.tokenizer.pre_tokenizer = Whitespace()
        self.tokenizer.normalizer = Sequence([NFD(), Lowercase(), StripAccents()])
        self.tok_trainer = BpeTrainer(special_tokens=["[UNK]", "[MASK]"],
                                      vocab_size=self.vocab_size)
        if load_from:
            self._load(load_from)

    def fit(self, data: List[str], min_char_freq: int = 1, progbar: bool = True):
        """
        Create a character-level dictionary based on a list of strings
        """
        if not self.tokenizer_fit:
            self.tokenizer.train_from_iterator(data, trainer=self.tok_trainer)

        char_counter: Counter = Counter()
        token_counter: Counter = Counter()
        iterator_: Iterable = data
        if progbar:
            iterator_ = tqdm(data)
        for example in iterator_:
            chars = Counter(example)
            for char, char_count in chars.items():
                try:
                    char_counter[char] += char_count
                except KeyError:
                    char_counter[char] = char_count

        counts = [k for k, v in char_counter.items() if v >= min_char_freq]
        self.char_rev = {0: "", 1: "?", 2: "?", 3: ""}
        for c in sorted(counts):
            n = len(self.char_rev)
            self.char_rev[n] = c
            self.char_dict[c] = n

    def scrub(self, token):
        """
        Normalize a token by removing punctuation. Used to build a vocabulary
        and to choose tokens to mask during pretraining.
        """
        token = token.lower()
        while len(token) > 0 and token[0] in PUNCT:
            token = token[1:]
        while len(token) > 0 and token[-1] in PUNCT:
            token = token[:-1]
        token = re.sub(r"\d", "#", token)
        return token

    def tokenize(self, string_to_tokenize: str) -> Encoding:
        return self.tokenizer.encode(string_to_tokenize)

    def string_to_array(self, string_in, length, padding_pre=False):
        # truncate
        if padding_pre:
            s = string_in[-length:]
        else:
            s = string_in[:length]

        # map char -> int and zero-pad to the requested length
        mapped = np.ones((len(s)))
        for n, char in enumerate(s):
            try:
                mapped[n] = self.char_dict[char]
            except KeyError:
                pass
        # mapped = [self.char_dict.get(x, 1) for x in s]
        if padding_pre:
            r = np.pad(mapped, (length - len(s), 0), "constant", constant_values=(0, 0))
        else:
            r = np.pad(mapped, (0, length - len(s)), "constant", constant_values=(0, 0))
        return r

    def string_to_example(self, example, return_example=False, allow_null_examples=False):
        # simple tokenization
        sp = [tok.strip() for tok in example.split(" ") if tok.strip() != ""]
        # normalize to see what we can replace
        normed = [self.scrub(tok) for tok in sp]
        # see which tokens are in the vocabulary
        replaceable_tokens = [(t, i) for i, t in enumerate(normed) if t in self.token_dict]
        assert (
            len(sp) >= 2
        ), "minimum length of an example is 2 tokens (white-space delimited)"
        assert (
            len(replaceable_tokens) > 0 or allow_null_examples
        ), f"called string_to_example on string with no tokens that are in the vocabulary and allow_null_examples=True\n{example}"
        if len(replaceable_tokens) == 0 and allow_null_examples:
            return None

        # choose a token to replace
        rep_ind = np.random.randint(0, len(replaceable_tokens))
        rep, rep_ind = replaceable_tokens[rep_ind]
        # get the index of the token for the output
        mask_ind = self.token_dict[rep]
        label_array = np.zeros(self.vocab_size)
        label_array[mask_ind] = 1

        # piece the masked input back together
        left = " ".join(sp[:rep_ind])
        right = " ".join(sp[rep_ind + 1:]) if len(sp) > rep_ind + 1 else ""
        left_len = len(left)
        right_len = len(right)
        thresh = (self.max_example_len - self.mask_output_len - 2) // 2
        right_diff = thresh - left_len if left_len < thresh else 0
        left_diff = thresh - right_len if right_len < thresh else 0
        left_sub = left[-(thresh + left_diff):]
        right_sub = right[:(thresh + right_diff)]
        combo = left_sub + " " + "?" * 4 + " " + right_sub
        encoded = self.string_to_array(combo, self.max_example_len)

        ret_val = [encoded, label_array]
        if return_example:
            ret_val += [combo]
        return ret_val

    def strings_to_examples(self, strings):
        enc = np.zeros((len(strings), self.max_example_len))
        labels = np.zeros((len(strings), self.vocab_size))
        for n in range(len(strings)):
            enc[n], labels[n] = self.string_to_example(strings[n])
        return [enc, labels]

    def examples_generator(self, strings):
        assert len(strings) >= 1
        while True:
            ind = 0
            while ind < len(strings):
                yield self.strings_to_examples(strings[ind:ind + self.batch_size])
                ind += self.batch_size

    def save(self, path):
        """
        Write a Preprocessor object to a .JSON config
        """
        config = {
            "char_rev": self.char_rev,
            "char_dict": self.char_dict,
            "max_example_len": self.max_example_len,
        }
        with open(os.path.join(path, "cclm_config.json"), "w") as f:
            json.dump(config, f)

    def _load(self, path):
        """
        Load a Preprocessor object from disk
        """
        with open(path, "rb") as f:
            result = json.load(f)
        for key, value in result.items():
            setattr(self, key, value)
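# A small, hypothetical smoke test for MLMPreprocessor (not part of the original source).
# The corpus below is a stand-in, and only methods defined above are exercised.
if __name__ == "__main__":
    corpus = [
        "the quick brown fox jumps over the lazy dog",
        "pack my box with five dozen liquor jugs",
    ]
    prep = MLMPreprocessor(vocab_size=1000)
    prep.fit(corpus, progbar=False)           # trains the BPE tokenizer and builds the char dictionary
    print(prep.tokenize(corpus[0]).tokens)    # tokenizers Encoding for the first example
    print(prep.string_to_array(corpus[0], prep.max_example_len).shape)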
# https://github.com/huggingface/tokenizers/tree/master/bindings/python#train-a-new-tokenizer
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, trainers, processors

# Initialize a tokenizer
tokenizer = Tokenizer(models.BPE())

# Customize pre-tokenization and decoding
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)  # TODO True
tokenizer.decoder = decoders.ByteLevel()
tokenizer.post_processor = processors.ByteLevel(trim_offsets=True)

trainer = trainers.BpeTrainer(vocab_size=10000, min_frequency=2)
tokenizer.train(['bar'], trainer=trainer)

encoded = tokenizer.encode(seq)
print(encoded.tokens)

# TODO: Use a clustered set of proteins like UniRef50
# -- https://www.uniprot.org/help/uniref
# TODO: Use an LSTM to train on sequences, then freeze early layers and add
# classification backend, retrain.

# https://github.com/huggingface/tokenizers/tree/master/bindings/python
# https://github.com/huggingface/tokenizers/tree/master/bindings/python#provided-tokenizers
from tokenizers import CharBPETokenizer

tokenizer = CharBPETokenizer(bert_normalizer=False)
tokenizer.train(['./bar'], vocab_size=1000, min_frequency=2)
# tokenizer.encode(seq).tokens
class SentencePieceBPETokenizer:
    """Custom SentencePiece tokenizer"""

    unk_token = '<unk>'
    pad_token = '<pad>'

    def __init__(self,
                 vocab: Dict[str, int] = None,
                 merges: List[Tuple[str, str]] = None,
                 dropout: float = None,
                 max_length: Optional[int] = 64) -> None:
        """Constructor

        Args:
            vocab (Dict[str, int]): A dictionary of string keys and their ids.
            merges (List[Tuple[str, str]]): A list of pairs of tokens.
            dropout (float): BPE dropout
            max_length (int, optional): The max length at which to truncate.
                Defaults to `64`.
        """
        self.tokenizer = Tokenizer(
            BPE(vocab, merges, dropout=dropout, unk_token=self.unk_token))
        self.tokenizer.normalizer = BertNormalizer()  # noqa
        self.tokenizer.pre_tokenizer = pre_tokenizers.Metaspace()  # noqa
        self.tokenizer.decoder = decoders.Metaspace()  # noqa
        self.tokenizer.add_special_tokens([self.pad_token, self.unk_token])
        self.tokenizer.enable_padding(pad_token=self.pad_token)
        self.tokenizer.enable_truncation(max_length)

    @classmethod
    def train(cls,
              dataset: Sequence[str],
              vocab_size: int = 1000,
              min_frequency: int = 2,
              dropout: float = 0.0,
              max_length: Optional[int] = 64) -> 'SentencePieceBPETokenizer':
        instance = cls(dropout=dropout, max_length=max_length)
        trainer = trainers.BpeTrainer(
            vocab_size=vocab_size,
            min_frequency=min_frequency,
            special_tokens=[cls.pad_token, cls.unk_token])
        instance.tokenizer.train_from_iterator(dataset, trainer=trainer)
        instance.tokenizer.model.dropout = None
        return instance

    @property
    def vocab_size(self):
        return len(self.tokenizer.get_vocab())

    def serialize(self):
        return self.tokenizer.to_str()

    @classmethod
    def deserialize(cls, s: str) -> 'SentencePieceBPETokenizer':
        tokenizer = cls()
        tokenizer.tokenizer = Tokenizer.from_str(s)
        return tokenizer

    def encode(self, text: str) -> Dict[str, Any]:
        encoding = self.tokenizer.encode(text)
        outputs = {
            'ids': torch.tensor(encoding.ids),
            'mask': torch.tensor(encoding.attention_mask),
            'spans': encoding.offsets,
        }
        return outputs

    def encode_batch(self, batch: List[str]):
        encodings = self.tokenizer.encode_batch(batch)
        outputs = {
            'ids': torch.tensor([e.ids for e in encodings]),
            'mask': torch.tensor([e.attention_mask for e in encodings]),
            'spans': [e.offsets for e in encodings],
        }
        return outputs
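# A hedged usage sketch for SentencePieceBPETokenizer (not part of the original source).
# The sentences are placeholders, and dropout is passed explicitly as a value the
# underlying BPE model is known to accept.
if __name__ == "__main__":
    sentences = ["hello there, how are you?", "the weather is nice today"]
    sp_tok = SentencePieceBPETokenizer.train(sentences, vocab_size=200,
                                             min_frequency=1, dropout=0.1)
    out = sp_tok.encode("hello, nice weather")
    print(out['ids'].shape, out['spans'])

    # Round-trip through the string serialization
    restored = SentencePieceBPETokenizer.deserialize(sp_tok.serialize())
    print(restored.vocab_size)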
# instantiate trainer
trainer = BpeTrainer(
    special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
    min_frequency=2)

# get files
files = [os.path.join(args.data_dir, f"{f}-sentences.txt")]

# train tokenizer
tokenizer.train(files=files, trainer=trainer)

# save tokenizer config file
tokenizer.save(os.path.join(args.save_dir, f"tokenizer-{f}.json"))

# load trained tokenizers
for f in ['ewe-fon', "ewe", "fon"]:
    print(f'Using {f} tokenizer : \n')
    try:
        tokenizer = Tokenizer.from_file(
            os.path.join(args.save_dir, f"tokenizer-{f}.json"))
        output = tokenizer.encode(
            "Gbadanu tɛgbɛ ɔ, Noah tuun ɖɔ e nɔ cɛ emi")
        print(output.tokens)
        print(output.ids)
        print(output.offsets[9])
    except Exception as ex:
        print(ex)
        print("\n")
bert_tokenizer.train_from_iterator(sentences, trainer=trainer)
if serialize_path:
    bert_tokenizer.save(serialize_path)
return bert_tokenizer

ids = bert_tokenizer.encode(sentences[10]).ids
bert_tokenizer.decode(ids)

from tokenizers import Tokenizer, models, normalizers, pre_tokenizers, decoders, trainers

tokenizer = Tokenizer(models.Unigram())
tokenizer.normalizer = normalizers.NFKC()
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
tokenizer.decoder = decoders.ByteLevel()  # `decoder` (singular) is the attribute exposed by Tokenizer

trainer = trainers.UnigramTrainer(
    vocab_size=20000,
    initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
    special_tokens=["<PAD>", "<BOS>", "<EOS>"],
)
tokenizer.train_from_iterator(sentences, trainer=trainer)

tokenizer.encode(sentences[4]).ids
tokenizer.decode(tokenizer.encode(sentences[4]).ids)

tokenizer.save('bert_out/test2')
# Note: `save_pretrained` belongs to the transformers tokenizer wrappers, not the plain
# `tokenizers.Tokenizer`; kept as in the original, but it will fail on a raw Tokenizer.
tokenizer.save_pretrained('bert_out/test')