def test_get_set_components(self):
        toki = Tokenizer(models.BPE())
        toki.normalizer = normalizers.NFC()
        toki.pre_tokenizer = pre_tokenizers.ByteLevel()
        toki.post_processor = processors.BertProcessing(("A", 0), ("B", 1))
        toki.decoder = decoders.ByteLevel()

        tokenizer = BaseTokenizer(toki)

        assert isinstance(tokenizer.model, models.BPE)
        assert isinstance(tokenizer.normalizer, normalizers.NFC)
        assert isinstance(tokenizer.pre_tokenizer, pre_tokenizers.ByteLevel)
        assert isinstance(tokenizer.post_processor, processors.BertProcessing)
        assert isinstance(tokenizer.decoder, decoders.ByteLevel)

        tokenizer.model = models.Unigram()
        assert isinstance(tokenizer.model, models.Unigram)
        tokenizer.normalizer = normalizers.NFD()
        assert isinstance(tokenizer.normalizer, normalizers.NFD)
        tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
        assert isinstance(tokenizer.pre_tokenizer, pre_tokenizers.Whitespace)
        tokenizer.post_processor = processors.ByteLevel()
        assert isinstance(tokenizer.post_processor, processors.ByteLevel)
        tokenizer.decoder = decoders.WordPiece()
        assert isinstance(tokenizer.decoder, decoders.WordPiece)
Ejemplo n.º 2
0
    def test_train_with_special_tokens(self):
        filename = "tests/data/dummy-unigram-special_tokens-train.txt"
        with open(filename, "w") as f:
            f.write(
                """
[CLS] The Zen of Python, by Tim Peters [SEP]
[CLS] Beautiful is better than ugly. [SEP]
[CLS] Explicit is better than implicit. [SEP]
[CLS] Simple is better than complex. [SEP]
[CLS] Complex is better than complicated. [SEP]
[CLS] Flat is better than nested. [SEP]
[CLS] Sparse is better than dense. [SEP]
[CLS] Readability counts. [SEP]
[CLS] Special cases aren't special enough to break the rules. [SEP]
[CLS] Although practicality beats purity. [SEP]
[CLS] Errors should never pass silently. [SEP]
[CLS] Unless explicitly silenced. [SEP]
[CLS] In the face of ambiguity, refuse the temptation to guess. [SEP]
[CLS] There should be one-- and preferably only one --obvious way to do it. [SEP]
[CLS] Although that way may not be obvious at first unless you're Dutch. [SEP]
[CLS] Now is better than never. [SEP]
[CLS] Although never is often better than *right* now. [SEP]
[CLS] If the implementation is hard to explain, it's a bad idea. [SEP]
[CLS] If the implementation is easy to explain, it may be a good idea. [SEP]
[CLS] Namespaces are one honking great idea -- let's do more of those! [SEP]
            """
            )

        tokenizer = Tokenizer(models.Unigram())
        trainer = trainers.UnigramTrainer(
            show_progress=False, special_tokens=["[PAD]", "[SEP]", "[CLS]"], unk_token="[UNK]"
        )

        tokenizer.train([filename], trainer=trainer)

        assert tokenizer.encode("[CLS] This is a test [SEP]").tokens == [
            "[CLS]",
            " T",
            "h",
            "i",
            "s",
            " is ",
            "a",
            " ",
            "te",
            "s",
            "t ",
            "[SEP]",
        ]
    def get_tokenizer_trainer():
        # START init_tokenizer_trainer
        from tokenizers import Tokenizer, models, normalizers, pre_tokenizers, decoders, trainers

        tokenizer = Tokenizer(models.Unigram())
        tokenizer.normalizer = normalizers.NFKC()
        tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
        tokenizer.decoders = decoders.ByteLevel()

        trainer = trainers.UnigramTrainer(
            vocab_size=20000,
            initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
            special_tokens=["<PAD>", "<BOS>", "<EOS>"],
        )
        # END init_tokenizer_trainer
        trainer.show_progress = False

        return tokenizer, trainer
Ejemplo n.º 4
0
        special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]
    )
    bert_tokenizer.train_from_iterator(sentences, trainer=trainer)
    if serialize_path:
        bert_tokenizer.save(serialize_path)
    return bert_tokenizer



ids = bert_tokenizer.encode(sentences[10]).ids
bert_tokenizer.decode(ids)


from tokenizers import Tokenizer, models, normalizers, pre_tokenizers, decoders, trainers

tokenizer = Tokenizer(models.Unigram())
tokenizer.normalizer = normalizers.NFKC()
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
tokenizer.decoders = decoders.ByteLevel()

trainer = trainers.UnigramTrainer(
    vocab_size=20000,
    initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
    special_tokens=["<PAD>", "<BOS>", "<EOS>"],
)

tokenizer.train_from_iterator(sentences, trainer=trainer)
tokenizer.encode(sentences[4]).ids
tokenizer.decode(tokenizer.encode(sentences[4]).ids)
tokenizer.save('bert_out/test2')