    def test_tftext_sentencepiece_tokenizer_bos_eos(self):
        """Check that the new tokenizer produces the same result as the tftext one with bos and eos."""
        tftext_sp = tensorflow_text.SentencepieceTokenizer(
            self.sentencepiece_model, add_bos=True, add_eos=True)
        opt_sp = sentencepiece_tokenizer.SentencepieceTokenizer(
            self.sentencepiece_model, add_bos=True, add_eos=True)
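
        # Both tokenizers are configured to prepend the model's BOS id and
        # append its EOS id to each row of token ids, so their outputs
        # should match element for element.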

        input_text = [
            u" ", u"to be or not to be", u"ignored by length text1",
            u"ignored by length text2"
        ]
        tftext_tokenized = tftext_sp.tokenize(input_text)
        opt_tokenized = opt_sp.tokenize(input_text)
        self.assertAllEqual(tftext_tokenized, opt_tokenized)

    def test_tftext_sentencepiece_detokenizer(self):
        """Check that the new tokenizer produces the same result as the tftext one."""
        tftext_sp = tensorflow_text.SentencepieceTokenizer(
            self.sentencepiece_model)
        opt_sp = sentencepiece_tokenizer.SentencepieceTokenizer(
            self.sentencepiece_model)

        input_text = [
            u" ", u"to be or not to be", u"ignored by length text1",
            u"ignored by length text2"
        ]
        tftext_tokenized = tftext_sp.tokenize(input_text)

        # Detokenize the tf.text token ids with both implementations; the
        # optimized detokenizer should reconstruct identical strings.
        tftext_detokenized = tftext_sp.detokenize(tftext_tokenized)
        opt_detokenized = opt_sp.detokenize(tftext_tokenized)
        self.assertAllEqual(tftext_detokenized, opt_detokenized)

    def benchmarkTokenizer(self):
        sp_model = _GetSentencepieceModel()
        test_text = [
            "This week we celebrate the casts and creatives who have come together"
            " to bring us our favorite.",
            "More Stacks products demonstrated commitment to excellent support.",
            "Test, test, test."
        ]

        tftext_sp = tensorflow_text.SentencepieceTokenizer(sp_model)
        opt_sp = sentencepiece_tokenizer.SentencepieceTokenizer(sp_model)
        iter_number = 1000

        # Time the optimized tokenizer.
        start = time.time()
        for _ in range(iter_number):
            _ = opt_sp.tokenize(test_text)
        self.report_benchmark(iters=iter_number,
                              wall_time=time.time() - start,
                              name="opt")

        # Time the reference tf.text tokenizer for comparison.
        start = time.time()
        for _ in range(iter_number):
            _ = tftext_sp.tokenize(test_text)
        self.report_benchmark(iters=iter_number,
                              wall_time=time.time() - start,
                              name="tf.text")

# The original fragment omits the class declaration; tf.keras.layers.Layer is assumed.
class TokenizerLayer(tf.keras.layers.Layer):

    def __init__(self, sentencepiece_model, **kwargs):
        super(TokenizerLayer, self).__init__(**kwargs)
        self.sp = sentencepiece_tokenizer.SentencepieceTokenizer(
            sentencepiece_model)
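
    # A minimal `call` sketch, assuming the layer simply tokenizes its string
    # inputs with the wrapped SentencepieceTokenizer; the original fragment
    # ends at __init__.
    def call(self, inputs):
        return self.sp.tokenize(inputs)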