def __init__(self, vocab_file, do_lower_case=True, vocab_override=None):
    super().__init__()
    self.vocab_file = vocab_file
    self.do_lower_case = do_lower_case
    # Load the vocabulary from disk unless an in-memory mapping is supplied.
    if vocab_override is None:
        self.vocab = tokenization.load_vocab(vocab_file)
    else:
        self.vocab = vocab_override
    # Reverse mapping from ids back to tokens.
    self.inv_vocab = {v: k for k, v in self.vocab.items()}
    # Two-stage pipeline: basic tokenization (whitespace, punctuation,
    # optional lower-casing) followed by WordPiece.
    self.basic_tokenizer = tokenization.BasicTokenizer(
        do_lower_case=do_lower_case)
    self.wordpiece_tokenizer = tokenization.WordpieceTokenizer(
        vocab=self.vocab)
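A brief usage sketch for the constructor above. It assumes the __init__ belongs to a FullTokenizer-style class (the class name FullTokenizer is an assumption, not confirmed by the snippet); the point is that vocab_override lets callers inject an in-memory vocabulary instead of reading one from disk.

# Hypothetical usage; `FullTokenizer` is an assumed name for the class
# that owns the __init__ above.
vocab_override = {"[UNK]": 0, "[CLS]": 1, "low": 2, "##er": 3}
tokenizer = FullTokenizer(vocab_file=None, vocab_override=vocab_override)
# With vocab_override set, vocab_file is stored but never read.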
Example #2
    def test_wordpiece_tokenizer(self):
        vocab_tokens = [
            "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un",
            "runn", "##ing"
        ]
        vocab = {token: i for i, token in enumerate(vocab_tokens)}
        tokenizer = tokenization.WordpieceTokenizer(vocab=vocab)

        # An empty string yields no tokens.
        self.assertAllEqual(tokenizer.tokenize(""), [])

        # Known words are split greedily into the longest matching pieces.
        self.assertAllEqual(tokenizer.tokenize("unwanted running"),
                            ["un", "##want", "##ed", "runn", "##ing"])

        # A word with any unmatchable remainder maps entirely to [UNK].
        self.assertAllEqual(tokenizer.tokenize("unwantedX running"),
                            ["[UNK]", "runn", "##ing"])