def test_get_vocab_size(self):
    tokenizer = Tokenizer(BPE())
    tokenizer.add_tokens(["my", "name", "is", "john", "pair"])

    # Can retrieve vocab's size with added tokens
    size = tokenizer.get_vocab_size(with_added_tokens=True)
    assert size == 5

    # Can retrieve vocab's size without added tokens
    size = tokenizer.get_vocab_size(with_added_tokens=False)
    assert size == 0
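
For comparison, a minimal sketch (not part of the original test) showing the same split through Tokenizer.get_vocab: the five added tokens live outside the BPE model, so they are only counted when with_added_tokens=True.

from tokenizers import Tokenizer
from tokenizers.models import BPE

tokenizer = Tokenizer(BPE())
tokenizer.add_tokens(["my", "name", "is", "john", "pair"])

# Added tokens are stored next to (not inside) the model vocabulary.
print(len(tokenizer.get_vocab(with_added_tokens=True)))   # 5
print(len(tokenizer.get_vocab(with_added_tokens=False)))  # 0 (the BPE model is untrained)
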
import os

from tokenizers import Tokenizer, decoders, models, pre_tokenizers, processors, trainers
from tokenizers.normalizers import NFKC, Sequence


def main(args):
    if args.do_train:
        # Initialize a tokenizer; get_smi_files() is defined elsewhere in the original script
        files = get_smi_files(args.training_files)
        print("Training BPE tokenizer using the following files:{}".format(
            files))
        tokenizer = Tokenizer(models.BPE(unk_token="<unk>"))
        tokenizer.enable_padding(pad_id=args.vocab_size + 2,
                                 pad_token="<pad>",
                                 length=args.pad_len)
        tokenizer.enable_truncation(max_length=args.pad_len,
                                    strategy='only_first')
        tokenizer.normalizer = Sequence([NFKC()])
        tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(
            add_prefix_space=False)
        tokenizer.decoder = decoders.ByteLevel()
        tokenizer.post_processor = processors.ByteLevel(trim_offsets=True)
        # Train the tokenizer
        trainer = trainers.BpeTrainer(show_progress=True,
                                      vocab_size=args.vocab_size,
                                      min_frequency=args.min_frequency)
        tokenizer.train(files, trainer=trainer)
        tokenizer.add_tokens(["<start>", "<end>"])
        tokenizer.save(os.path.join('tokenizers', args.tokenizer_name),
                       pretty=True)
        print("Trained vocab size: {}".format(tokenizer.get_vocab_size()))

    if args.do_test:
        # Test the tokenizer
        tokenizer = Tokenizer.from_file(
            os.path.join('tokenizers', args.tokenizer_name))
        print("Testing with SMILES String: {}".format(args.test_string))
        encoding = tokenizer.encode(args.test_string)
        print("Encoded string: {}".format(encoding.tokens))
        print(encoding.ids)
        decoded = tokenizer.decode(encoding.ids)
        print("Decoded string: {}".format(decoded))
Example #3
with open(f"{proc_path}/names.txt", "w") as f:
    f.write("\n".join(list(name_words.values())))
    # f.write("\n".join(words))

tokenizer = Tokenizer(BPE())
tokenizer.normalizer = Sequence([
    # NFKC(),
    Lowercase()
])
tokenizer.pre_tokenizer = ByteLevel()
tokenizer.decoder = ByteLevelDecoder()
trainer = BpeTrainer(vocab_size=int(vocab_size), show_progress=True)
tokenizer.train([f"{proc_path}/names.txt"], trainer=trainer)

print("Trained vocab size: {}".format(tokenizer.get_vocab_size()))

# Save the trained BPE model (vocab.json and merges.txt) to proc_path
tokenizer.model.save(proc_path)

# Reload the model from the files that were just written
tokenizer.model = BPE.from_file(f'{proc_path}/vocab.json',
                                f'{proc_path}/merges.txt')

with open(f"{proc_path}/vocab.json", "r") as f:
    bpe_vocab = json.load(f)

# Reverse mapping: token id -> token string
bpe_vocab_idx = {v: k for k, v in bpe_vocab.items()}

# Single-character tokens only, with ids shifted by 1 so that 0 stays free (e.g. for padding)
char_map = {k: v + 1 for k, v in bpe_vocab.items() if len(k) == 1}
print(f"Char map size: {len(char_map)}\n")

MAX_LEN_OF_WORD = max([len(w) for w in bpe_vocab])
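
A small hypothetical helper (not in the source) illustrating one way char_map and MAX_LEN_OF_WORD could be used together: encoding a word character by character, with 0 left free by the +1 shift above and used here for padding.

def word_to_char_ids(word, max_len=MAX_LEN_OF_WORD):
    # Unknown characters fall back to 0, the same value used for padding.
    ids = [char_map.get(c, 0) for c in word[:max_len]]
    return ids + [0] * (max_len - len(ids))

print(word_to_char_ids("anna"))
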