def test_tftext_sentencepiece_tokenizer_bos_eos(self):
  """Checks that the new tokenizer matches the tftext one with bos and eos."""
  tftext_sp = tensorflow_text.SentencepieceTokenizer(
      self.sentencepiece_model, add_bos=True, add_eos=True)
  opt_sp = sentencepiece_tokenizer.SentencepieceTokenizer(
      self.sentencepiece_model, add_bos=True, add_eos=True)
  input_text = [
      u" ",
      u"to be or not to be",
      u"ignored by length text1",
      u"ignored by length text2",
  ]
  tftext_tokenized = tftext_sp.tokenize(input_text)
  opt_tokenized = opt_sp.tokenize(input_text)
  self.assertAllEqual(tftext_tokenized, opt_tokenized)
def test_tftext_sentencepiece_detokenizer(self):
  """Checks that the new detokenizer matches the tftext one."""
  tftext_sp = tensorflow_text.SentencepieceTokenizer(
      self.sentencepiece_model)
  opt_sp = sentencepiece_tokenizer.SentencepieceTokenizer(
      self.sentencepiece_model)
  input_text = [
      u" ",
      u"to be or not to be",
      u"ignored by length text1",
      u"ignored by length text2",
  ]
  tftext_tokenized = tftext_sp.tokenize(input_text)
  # Detokenize the same token ids with both implementations and compare.
  tftext_detokenized = tftext_sp.detokenize(tftext_tokenized)
  opt_detokenized = opt_sp.detokenize(tftext_tokenized)
  self.assertAllEqual(tftext_detokenized, opt_detokenized)
def benchmarkTokenizer(self):
  sp_model = _GetSentencepieceModel()
  test_text = [
      "This week we celebrate the casts and creatives who have come together"
      " to bring us our favorite.",
      "More Stacks products demonstrated commitment to excellent support.",
      "Test, test, test.",
  ]
  tftext_sp = tensorflow_text.SentencepieceTokenizer(sp_model)
  opt_sp = sentencepiece_tokenizer.SentencepieceTokenizer(sp_model)
  iter_number = 1000

  # Time the optimized tokenizer.
  start = time.time()
  for _ in range(iter_number):
    _ = opt_sp.tokenize(test_text)
  self.report_benchmark(
      iters=iter_number, wall_time=time.time() - start, name="opt")

  # Time the tf.text tokenizer on the same inputs.
  start = time.time()
  for _ in range(iter_number):
    _ = tftext_sp.tokenize(test_text)
  self.report_benchmark(
      iters=iter_number, wall_time=time.time() - start, name="tf.text")
def __init__(self, sentencepiece_model, **kwargs):
  super(TokenizerLayer, self).__init__(**kwargs)
  self.sp = sentencepiece_tokenizer.SentencepieceTokenizer(
      sentencepiece_model)
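

# A usage sketch (hypothetical, not part of the original file): TokenizerLayer
# can be dropped into a Keras model that consumes string tensors, so the
# optimized tokenizer runs inside the graph. This assumes the layer's call()
# tokenizes its string input and that `tensorflow` is imported as `tf`
# elsewhere in this file; the helper name below is illustrative only.
def _build_tokenizer_model(sentencepiece_model):
  inputs = tf.keras.Input(shape=(), dtype=tf.string)
  outputs = TokenizerLayer(sentencepiece_model)(inputs)
  return tf.keras.Model(inputs, outputs)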