def main(args):
    """Train and/or smoke-test a byte-level BPE tokenizer.

    Both phases are optional and independent; when both flags are set the
    tokenizer is trained, saved, and then reloaded from disk for the test.

    Args:
        args: Parsed options object. Attributes read when ``do_train`` is
            set: ``training_files``, ``vocab_size``, ``pad_len``,
            ``min_frequency``, ``tokenizer_name``. Attributes read when
            ``do_test`` is set: ``tokenizer_name``, ``test_string``.
    """
    if args.do_train:
        # Initialize a tokenizer
        files = get_smi_files(args.training_files)
        print(f"Training BPE tokenizer using the following files:{files}")
        tokenizer = Tokenizer(models.BPE(unk_token="<unk>"))
        # NOTE(review): "<pad>" is never added to the vocabulary; the id
        # vocab_size + 2 presumably targets the slot right after the
        # "<start>"/"<end>" tokens added below, which only holds when the
        # trained vocabulary reaches exactly args.vocab_size -- confirm.
        tokenizer.enable_padding(pad_id=args.vocab_size + 2,
                                 pad_token="<pad>",
                                 length=args.pad_len)
        tokenizer.enable_truncation(max_length=args.pad_len,
                                    strategy='only_first')
        tokenizer.normalizer = Sequence([NFKC()])
        tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(
            add_prefix_space=False)
        tokenizer.decoder = decoders.ByteLevel()
        tokenizer.post_processor = processors.ByteLevel(trim_offsets=True)

        # Train the tokenizer
        trainer = trainers.BpeTrainer(show_progress=True,
                                      vocab_size=args.vocab_size,
                                      min_frequency=args.min_frequency)
        tokenizer.train(files, trainer=trainer)
        tokenizer.add_tokens(["<start>", "<end>"])

        # Create the output directory up front so a finished training run is
        # not lost to a missing 'tokenizers/' folder at save time.
        save_path = os.path.join('tokenizers', args.tokenizer_name)
        os.makedirs(os.path.dirname(save_path), exist_ok=True)
        tokenizer.save(save_path, pretty=True)
        print(f"Trained vocab size: {tokenizer.get_vocab_size()}")

    if args.do_test:
        # Test the tokenizer: reload it from disk and round-trip one string
        tokenizer = Tokenizer.from_file(
            os.path.join('tokenizers', args.tokenizer_name))
        print(f"Testing with SMILES String: {args.test_string}")
        encoding = tokenizer.encode(args.test_string)
        print(f"Encoded string: {encoding.tokens}")
        print(encoding.ids)
        decoded = tokenizer.decode(encoding.ids)
        print(f"Decoded string: {decoded}")
Beispiel #2
0
    bert_tokenizer.train_from_iterator(sentences, trainer=trainer)
    if serialize_path:
        bert_tokenizer.save(serialize_path)
    return bert_tokenizer



# Round-trip check: encode the sentence at index 10 to token ids, then decode
# the ids back to text. `bert_tokenizer` and `sentences` are defined outside
# this fragment. The decode result is not stored (notebook-style expression).
ids = bert_tokenizer.encode(sentences[10]).ids
bert_tokenizer.decode(ids)


from tokenizers import Tokenizer, models, normalizers, pre_tokenizers, decoders, trainers

# Build a Unigram tokenizer with NFKC normalization and byte-level
# pre-tokenization.
tokenizer = Tokenizer(models.Unigram())
tokenizer.normalizer = normalizers.NFKC()
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
# The attribute is `decoder` (singular). Assigning to `decoders` does not
# install a decoder, so decode() would not undo the byte-level mapping.
tokenizer.decoder = decoders.ByteLevel()

trainer = trainers.UnigramTrainer(
    vocab_size=20000,
    initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
    special_tokens=["<PAD>", "<BOS>", "<EOS>"],
)

# `sentences` is defined outside this fragment.
tokenizer.train_from_iterator(sentences, trainer=trainer)

# Round-trip the sentence at index 4 (notebook-style expressions; the results
# are not stored).
tokenizer.encode(sentences[4]).ids
tokenizer.decode(tokenizer.encode(sentences[4]).ids)
tokenizer.save('bert_out/test2')

# NOTE(review): `tokenizers.Tokenizer` does not appear to provide
# `save_pretrained` (that method belongs to the transformers tokenizer
# classes); this call presumably raises AttributeError -- confirm and either
# drop it or wrap the tokenizer in a transformers fast tokenizer first.
tokenizer.save_pretrained('bert_out/test')
Beispiel #3
0
class LitTokenizer:
    """Thin wrapper around a ``tokenizers.Tokenizer`` with fixed special tokens.

    The wrapped tokenizer is built from the module-level ``tok_model``,
    ``tok_decoder`` and ``tok_trainer`` factories, normalizes text with
    NFD + Strip (optionally lowercasing), splits on whitespace, and wraps every
    encoded sequence in ``[SOS] ... [EOS]`` through a template post-processor.
    """

    def __init__(self,
                 padding=False,
                 truncation=False,
                 max_length=None,
                 lower=False,
                 lang=None):
        """Create and configure the underlying tokenizer.

        Args:
            padding: ``False``/``"do_not_pad"`` (no padding),
                ``True``/``"longest"`` (pad to the longest item in a batch) or
                ``"max_length"`` (pad to ``max_length``).
            truncation: When truthy, truncate to ``max_length``.
            max_length: Length used for ``"max_length"`` padding and for
                truncation.
            lower: When truthy, add a lowercasing normalizer.
            lang: Stored as ``self.lang``; not used by this class itself.

        Raises:
            ValueError: If ``padding`` is not one of the supported values.
        """
        super().__init__()
        self.UNK_WORD = '[UNK]'
        self.PAD_WORD = '[PAD]'
        self.MASK_WORD = '[MASK]'
        self.SOS_WORD = '[SOS]'
        self.EOS_WORD = '[EOS]'
        self.special_tokens = [
            self.UNK_WORD, self.PAD_WORD, self.MASK_WORD, self.SOS_WORD,
            self.EOS_WORD
        ]

        # Define tokenizer
        self.tokenizer = None
        self.configure_tokenizers(padding, truncation, max_length, lower)

        # Other
        self.lang = lang

    def get_vocab_size(self):
        """Return the vocabulary size reported by the wrapped tokenizer."""
        return self.tokenizer.get_vocab_size()

    @staticmethod
    def _resolve_padding(padding, max_length):
        """Translate a padding option into ``(enabled, pad_length)``.

        ``pad_length`` is ``None`` when padding should follow the longest item
        in the batch.

        Raises:
            ValueError: If ``padding`` is not a supported option.
        """
        if padding in ("max_length", ):
            return True, max_length
        if padding in (True, "longest"):
            return True, None
        if padding in (False, "do_not_pad"):
            # "do_not_pad" is a truthy string, so it must be mapped to
            # "disabled" explicitly rather than tested with `if padding`.
            return False, None
        raise ValueError("Unknown padding type")

    def configure_tokenizers(self, padding, truncation, max_length, lower):
        """Build ``self.tokenizer`` and apply padding/truncation settings.

        Args:
            padding: Padding option, see ``__init__``.
            truncation: When truthy, enable truncation to ``max_length``.
            max_length: Target length for ``"max_length"`` padding and for
                truncation.
            lower: When truthy, append a ``Lowercase`` normalizer.

        Raises:
            ValueError: If ``padding`` is not a supported option.
        """
        # Settings (validated before anything is constructed)
        pad_enabled, pad_length = self._resolve_padding(padding, max_length)

        # SRC tokenizer
        tok_normalizers = [NFD(), Strip()]
        if lower:
            tok_normalizers += [Lowercase()]

        self.tokenizer = Tokenizer(tok_model())  # unk_token=... not working
        self.tokenizer.add_special_tokens(self.special_tokens)
        self.tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
            [WhitespaceSplit()])
        self.tokenizer.normalizer = normalizers.Sequence(
            tok_normalizers)  # StripAccents requires NFD
        self.tokenizer.decoder = tok_decoder()

        # Define template (Needed for the sos/eos tokens)
        basic_template = TemplateProcessing(
            single=f"{self.SOS_WORD} $A {self.EOS_WORD}",
            pair=
            f"{self.SOS_WORD} $A {self.EOS_WORD} {self.SOS_WORD} $B {self.EOS_WORD}",
            special_tokens=[
                (self.SOS_WORD, self.tokenizer.token_to_id(self.SOS_WORD)),
                (self.EOS_WORD, self.tokenizer.token_to_id(self.EOS_WORD))
            ],
        )
        self.tokenizer.post_processor = basic_template

        if pad_enabled:
            self.tokenizer.enable_padding(pad_id=self.tokenizer.token_to_id(
                self.PAD_WORD),
                                          pad_token=self.PAD_WORD,
                                          length=pad_length)
        if truncation:
            self.tokenizer.enable_truncation(max_length,
                                             stride=0,
                                             strategy='longest_first')

    def load_vocab(self, vocab, merges):
        """Replace the tokenizer's model with one read from vocab/merges files."""
        vocab, merges = tok_model.read_file(vocab, merges)
        self.tokenizer.model = tok_model(vocab, merges)

    def train_vocab(self, files, vocab_size=32000, min_frequency=3):
        """Train the wrapped tokenizer on ``files`` with a ``tok_trainer``."""
        # Train trainer
        trainer = tok_trainer(vocab_size=vocab_size,
                              min_frequency=min_frequency)
        self.tokenizer.train(files, trainer)

    def save_vocab(self, output_dir, prefix):
        """Save the tokenizer's model via ``model.save(output_dir, prefix)``."""
        self.tokenizer.model.save(output_dir, prefix)

    def pad(self, examples, keys=None):
        """Right-pad per-example tensors to a common length, key by key.

        Args:
            examples: Sequence of mappings from key to a tensor whose last
                dimension is the sequence length.
            keys: Keys to pad; defaults to the keys of the first example.
                Only ``"ids"`` and ``"attention_mask"`` are supported.

        Returns:
            dict mapping each key to a list of padded tensors, one per
            example. ``"ids"`` is filled with the pad index and
            ``"attention_mask"`` with 0. Each padded tensor keeps the dtype
            and device of its input.

        Raises:
            TypeError: If a key other than ``"ids"``/``"attention_mask"`` is
                requested.
        """
        # NOTE(review): the pad id is taken from the position of PAD_WORD in
        # special_tokens, which assumes the special tokens occupy ids in list
        # order -- confirm against tokenizer.token_to_id(PAD_WORD).
        pad_idx = self.special_tokens.index(self.PAD_WORD)
        fill_values = {"ids": pad_idx, "attention_mask": 0}

        # Nothing to pad: return an empty result instead of failing on
        # examples[0] / max() of an empty sequence.
        if not examples:
            return {k: [] for k in keys} if keys else {}

        # Keys to modify
        if not keys:
            keys = list(examples[0].keys())

        padded = {}
        for k in keys:
            if k not in fill_values:
                raise TypeError("Unknown key")

            # Collect same-type items (list of IDs, list of masks,...)
            tensors = [x[k] for x in examples]

            # Get max length (value to pad)
            target_len = max(t.shape[-1] for t in tensors)

            # Apply padding
            column = []
            for t in tensors:
                # Pass dtype explicitly so the result does not depend on
                # torch.full's fill-value inference and matches the input.
                buf = torch.full((target_len, ),
                                 fill_value=fill_values[k],
                                 dtype=t.dtype,
                                 device=t.device)
                buf[:t.shape[-1]] = t
                column.append(buf)
            padded[k] = column
        return padded

    def encode(self, x):
        """Encode ``x`` with the wrapped tokenizer and return its encoding."""
        return self.tokenizer.encode(x)

    def decode(self, x):
        """Decode each row of ``x`` into a string.

        Args:
            x: A 2-D ``torch.Tensor`` (one row of ids per sequence) or any
                iterable of id sequences.

        Returns:
            list of decoded strings, one per row.

        Raises:
            ValueError: If ``x`` is a tensor that is not 2-dimensional.
        """
        if isinstance(x, torch.Tensor):
            # Explicit check instead of `assert`, which is stripped under -O.
            if x.dim() != 2:
                raise ValueError(
                    f"decode expects a 2-D tensor, got shape {tuple(x.shape)}")
            # Plain Python ints are what the tokenizer's decode() expects.
            x = x.detach().cpu().tolist()
        return [self.tokenizer.decode(x_i) for x_i in x]
# Demo script: plug Python-defined pre-tokenizer/decoder objects into a BPE
# tokenizer. `args`, `GoodCustom` and `BadCustom` are defined outside this
# fragment.
tokenizer = Tokenizer(models.BPE.from_files(args.vocab, args.merges))

# Test the good custom classes
# The same GoodCustom instance backs both the pre-tokenizer and the decoder.
good_custom = GoodCustom()
good_pretok = pre_tokenizers.PreTokenizer.custom(good_custom)
good_decoder = decoders.Decoder.custom(good_custom)

tokenizer.pre_tokenizer = good_pretok
tokenizer.decoder = good_decoder

# Encode a sample string, print ids/tokens/offsets, then decode the ids back.
print("Tokenization will work with good custom:")
encoding = tokenizer.encode("Hey friend!")
print(f"IDS: {encoding.ids}")
print(f"TOKENS: {encoding.tokens}")
print(f"OFFSETS: {encoding.offsets}")
decoded = tokenizer.decode(encoding.ids)
print(f"DECODED: {decoded}")

# Now test with the bad custom classes
bad_custom = BadCustom()
bad_pretok = pre_tokenizers.PreTokenizer.custom(bad_custom)
bad_decoder = decoders.Decoder.custom(bad_custom)

tokenizer.pre_tokenizer = bad_pretok
tokenizer.decoder = bad_decoder
# NOTE(review): the bare `except:` also catches BaseException subclasses;
# presumably deliberate so that any failure raised from the custom hooks is
# reported here -- confirm before narrowing it to `except Exception:`.
try:
    encoding = tokenizer.encode("Hey friend!")
except:
    print("Bad tokenizer didn't work")

Beispiel #5
0
# Save the tokenizer's model under PATH (PATH is defined outside this fragment).
tokenizer.model.save(str(PATH))

# %%
# Reload the saved model when needed (it can be used seamlessly together with
# transformers).
# Note: in practice the tokenizer has to be rebuilt the same way it was set up
# for training before the model is loaded into it.
tokenizer.model = BPE(
    vocab=str(PATH / 'vocab.json'),
    merges=str(PATH / 'merges.txt'),
)

# %%
# Encode / decode
encoded = tokenizer.encode("This is a simple input to be tokenized.")
print(f"Encoded string: {encoded.tokens}")

decoded = tokenizer.decode(encoded.ids)
print(f"Decoded string: {decoded}")

# %%
from tokenizers import ByteLevelBPETokenizer
# The tokenizers library ships high-level wrappers for some classic
# tokenization algorithms; for example, `ByteLevelBPETokenizer` can be used
# to rewrite the code above in a simpler way.
tokenizer = ByteLevelBPETokenizer()
tokenizer.train(
    files=['data/big.txt'],
    vocab_size=25000,
    show_progress=True,
)

# Output directory for the trained model; created only when it is missing.
SAVE_PATH = Path('tokenizers')
PATH = SAVE_PATH / 'bytelevel-bpe-tokenizer-model'
if not PATH.exists():
    PATH.mkdir(parents=True, exist_ok=True)