Example #1
def testKeepWhitespace(self):
  test_value = constant_op.constant([
      b'\'Black Panther,\' \t \xe2\x80\x98A Star Is Born\xe2\x80\x98 among AFI Awards honorees',
      b' .Ok.   Go  '
  ])
  expected_tokens = [[
      b'\'', b'Black', b' ', b'Panther', b',\'', b' \t ', b'\xe2\x80\x98',
      b'A', b' ', b'Star', b' ', b'Is', b' ', b'Born', b'\xe2\x80\x98', b' ',
      b'among', b' ', b'AFI', b' ', b'Awards', b' ', b'honorees'
  ], [b' ', b'.', b'Ok', b'.', b'   ', b'Go', b'  ']]
  expected_offset_starts = [
      [0, 1, 6, 7, 14, 16, 19, 22, 23, 24, 28, 29, 31, 32, 36, 39, 40,
       45, 46, 49, 50, 56, 57],
      [0, 1, 2, 4, 5, 8, 10]]
  expected_offset_limits = [
      [1, 6, 7, 14, 16, 19, 22, 23, 24, 28, 29, 31, 32, 36, 39, 40,
       45, 46, 49, 50, 56, 57, 65],
      [1, 2, 4, 5, 8, 10, 12]]
  self.tokenizer = UnicodeScriptTokenizer(keep_whitespace=True)
  tokens = self.tokenizer.tokenize(test_value)
  self.assertRaggedEqual(tokens, expected_tokens)
  (tokens, starts, limits) = (
      self.tokenizer.tokenize_with_offsets(test_value))
  self.assertRaggedEqual(tokens, expected_tokens)
  self.assertRaggedEqual(starts, expected_offset_starts)
  self.assertRaggedEqual(limits, expected_offset_limits)
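
The start/limit values in this test are byte offsets into the UTF-8 input: the curly quote b'\xe2\x80\x98' occupies three bytes, which is why the last limit is 65 rather than the character count. Since they are byte offsets, every token can be recovered from the original string with tf.strings.substr. A minimal sketch, assuming the standard tensorflow_text import path:

import tensorflow as tf
import tensorflow_text as text

tokenizer = text.UnicodeScriptTokenizer(keep_whitespace=True)
s = tf.constant(b' .Ok.   Go  ')
tokens, starts, limits = tokenizer.tokenize_with_offsets(s)
# tf.strings.substr broadcasts the scalar string against the offset
# vectors; each recovered slice equals the corresponding token.
recovered = tf.strings.substr(s, starts, limits - starts)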
Example #2
  def tokenize(self, text_input):
    """Performs basic word tokenization for BERT.

    Args:
      text_input: A `Tensor` or `RaggedTensor` of untokenized UTF-8 strings.

    Returns:
      A `RaggedTensor` of tokenized strings from text_input.
    """
    # lowercase and strip accents (if option is set)
    if self._lower_case:
      text_input = case_fold_utf8(text_input)
      text_input = normalize_utf8(text_input, "NFD")
      text_input = string_ops.regex_replace(text_input, r"\p{Mn}", "")
    else:
      # utf8 normalization
      if self._normalization_form is not None:
        text_input = normalize_utf8(text_input, self._normalization_form)

    # replace control and format characters with spaces
    text_input = string_ops.regex_replace(text_input, r"\p{Cc}|\p{Cf}", " ")

    # For Chinese and emoji characters, tokenize by Unicode codepoints
    unicode_tokenizer = UnicodeScriptTokenizer(
        keep_whitespace=self._keep_whitespace)
    script_tokenized = unicode_tokenizer.tokenize(text_input)

    # Decide, per token, whether it should be further split into
    # individual codepoints (per the comment above: CJK and emoji).
    split_cond = self._should_split(script_tokenized)

    # Split every token into codepoints, then keep the per-codepoint
    # split where split_cond holds and the original whole token elsewhere.
    unicode_char_split = ragged_string_ops.unicode_split(
        script_tokenized, "UTF-8")
    unicode_split_tokens = array_ops.where(
        array_ops.squeeze(split_cond),
        x=unicode_char_split.values,
        y=array_ops.expand_dims(script_tokenized.values, axis=1))
    final_tokens = script_tokenized.with_flat_values(unicode_split_tokens)
    return final_tokens.merge_dims(-2, -1)
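
A minimal usage sketch for the method above. The class name BasicTokenizer, the import path, and the lower_case flag are assumptions inferred from the attributes the method reads (self._lower_case, self._keep_whitespace, self._normalization_form), not confirmed by this snippet:

import tensorflow as tf
# Assumed location of the class this method belongs to.
from tensorflow_text.python.ops.bert_tokenizer import BasicTokenizer

tokenizer = BasicTokenizer(lower_case=True)
tokens = tokenizer.tokenize(tf.constant([u'Héllo, 你好!']))
# Case-folded, accents stripped, and each CJK character split into its
# own token, roughly:
# [[b'hello', b',', b'\xe4\xbd\xa0', b'\xe5\xa5\xbd', b'!']]
print(tokens.to_list())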
Example #3
def setUp(self):
  super(UnicodeScriptTokenizerOpTest, self).setUp()
  self.tokenizer = UnicodeScriptTokenizer()
Example #4
class UnicodeScriptTokenizerOpTest(ragged_test_util.RaggedTensorTestCase):

  def setUp(self):
    super(UnicodeScriptTokenizerOpTest, self).setUp()
    self.tokenizer = UnicodeScriptTokenizer()

  def testRequireParams(self):
    with self.cached_session():
      with self.assertRaises(TypeError):
        self.tokenizer.tokenize()

  def testScalar(self):
    test_value = constant_op.constant(b'I love Flume!')
    expected_tokens = [b'I', b'love', b'Flume', b'!']
    expected_offset_starts = [0, 2, 7, 12]
    expected_offset_limits = [1, 6, 12, 13]
    tokens = self.tokenizer.tokenize(test_value)
    self.assertRaggedEqual(tokens, expected_tokens)
    (tokens, starts, limits) = (
        self.tokenizer.tokenize_with_offsets(test_value))
    self.assertRaggedEqual(tokens, expected_tokens)
    self.assertRaggedEqual(starts, expected_offset_starts)
    self.assertRaggedEqual(limits, expected_offset_limits)

  def testVectorSingleValue(self):
    test_value = constant_op.constant([b'I love Flume!'])
    expected_tokens = [[b'I', b'love', b'Flume', b'!']]
    expected_offset_starts = [[0, 2, 7, 12]]
    expected_offset_limits = [[1, 6, 12, 13]]
    tokens = self.tokenizer.tokenize(test_value)
    self.assertRaggedEqual(tokens, expected_tokens)
    (tokens, starts, limits) = (
        self.tokenizer.tokenize_with_offsets(test_value))
    self.assertRaggedEqual(tokens, expected_tokens)
    self.assertRaggedEqual(starts, expected_offset_starts)
    self.assertRaggedEqual(limits, expected_offset_limits)

  def testVector(self):
    test_value = constant_op.constant([b'I love Flume!', b'Good day'])
    expected_tokens = [[b'I', b'love', b'Flume', b'!'], [b'Good', b'day']]
    expected_offset_starts = [[0, 2, 7, 12], [0, 5]]
    expected_offset_limits = [[1, 6, 12, 13], [4, 8]]
    tokens = self.tokenizer.tokenize(test_value)
    self.assertRaggedEqual(tokens, expected_tokens)
    (tokens, starts, limits) = (
        self.tokenizer.tokenize_with_offsets(test_value))
    self.assertRaggedEqual(tokens, expected_tokens)
    self.assertRaggedEqual(starts, expected_offset_starts)
    self.assertRaggedEqual(limits, expected_offset_limits)

  def testMatrix(self):
    test_value = constant_op.constant([[b'I love Flume!', b'Good day'],
                                       [b'I don\'t want', b'no scrubs']])
    expected_tokens = [[[b'I', b'love', b'Flume', b'!'], [b'Good', b'day']],
                       [[b'I', b'don', b'\'', b't', b'want'],
                        [b'no', b'scrubs']]]
    expected_offset_starts = [[[0, 2, 7, 12], [0, 5]],
                              [[0, 2, 5, 6, 8], [0, 3]]]
    expected_offset_limits = [[[1, 6, 12, 13], [4, 8]],
                              [[1, 5, 6, 7, 12], [2, 9]]]
    tokens = self.tokenizer.tokenize(test_value)
    self.assertRaggedEqual(tokens, expected_tokens)
    (tokens, starts, limits) = (
        self.tokenizer.tokenize_with_offsets(test_value))
    self.assertRaggedEqual(tokens, expected_tokens)
    self.assertRaggedEqual(starts, expected_offset_starts)
    self.assertRaggedEqual(limits, expected_offset_limits)

  def testMatrixRagged(self):
    test_value = ragged_factory_ops.constant([[b'I love Flume!'],
                                              [b'I don\'t want', b'no scrubs']])
    expected_tokens = [[[b'I', b'love', b'Flume', b'!']],
                       [[b'I', b'don', b'\'', b't', b'want'],
                        [b'no', b'scrubs']]]
    expected_offset_starts = [[[0, 2, 7, 12]],
                              [[0, 2, 5, 6, 8], [0, 3]]]
    expected_offset_limits = [[[1, 6, 12, 13]],
                              [[1, 5, 6, 7, 12], [2, 9]]]
    tokens = self.tokenizer.tokenize(test_value)
    self.assertRaggedEqual(tokens, expected_tokens)
    (tokens, starts, limits) = (
        self.tokenizer.tokenize_with_offsets(test_value))
    self.assertRaggedEqual(tokens, expected_tokens)
    self.assertRaggedEqual(starts, expected_offset_starts)
    self.assertRaggedEqual(limits, expected_offset_limits)

  def test3DimMatrix(self):
    test_value = constant_op.constant([[[b'I love Flume!', b'Good day'],
                                        [b'I don\'t want', b'no scrubs']],
                                       [[b'I love Zhu!', b'Good night'],
                                        [b'A scrub is', b'a guy']]])
    expected_tokens = [[[[b'I', b'love', b'Flume', b'!'], [b'Good', b'day']],
                        [[b'I', b'don', b'\'', b't', b'want'],
                         [b'no', b'scrubs']]],
                       [[[b'I', b'love', b'Zhu', b'!'], [b'Good', b'night']],
                        [[b'A', b'scrub', b'is'], [b'a', b'guy']]]]
    expected_offset_starts = [[[[0, 2, 7, 12], [0, 5]],
                               [[0, 2, 5, 6, 8], [0, 3]]],
                              [[[0, 2, 7, 10], [0, 5]],
                               [[0, 2, 8], [0, 2]]]]
    expected_offset_limits = [[[[1, 6, 12, 13], [4, 8]],
                               [[1, 5, 6, 7, 12], [2, 9]]],
                              [[[1, 6, 10, 11], [4, 10]],
                               [[1, 7, 10], [1, 5]]]]
    tokens = self.tokenizer.tokenize(test_value)
    self.assertRaggedEqual(tokens, expected_tokens)
    (tokens, starts, limits) = (
        self.tokenizer.tokenize_with_offsets(test_value))
    self.assertRaggedEqual(tokens, expected_tokens)
    self.assertRaggedEqual(starts, expected_offset_starts)
    self.assertRaggedEqual(limits, expected_offset_limits)

  def test3DimMatrixRagged(self):
    test_value = ragged_factory_ops.constant([[[b'I love Flume!'],
                                               [b'I don\'t want',
                                                b'no scrubs']],
                                              [[b'I love Zhu!',
                                                b'Good night']]])
    expected_tokens = [[[[b'I', b'love', b'Flume', b'!']],
                        [[b'I', b'don', b'\'', b't', b'want'],
                         [b'no', b'scrubs']]],
                       [[[b'I', b'love', b'Zhu', b'!'], [b'Good', b'night']]]]
    expected_offset_starts = [[[[0, 2, 7, 12]],
                               [[0, 2, 5, 6, 8], [0, 3]]],
                              [[[0, 2, 7, 10], [0, 5]]]]
    expected_offset_limits = [[[[1, 6, 12, 13]],
                               [[1, 5, 6, 7, 12], [2, 9]]],
                              [[[1, 6, 10, 11], [4, 10]]]]
    tokens = self.tokenizer.tokenize(test_value)
    self.assertRaggedEqual(tokens, expected_tokens)
    (tokens, starts, limits) = (
        self.tokenizer.tokenize_with_offsets(test_value))
    self.assertRaggedEqual(tokens, expected_tokens)
    self.assertRaggedEqual(starts, expected_offset_starts)
    self.assertRaggedEqual(limits, expected_offset_limits)

  def testInternationalization(self):
    test_value = constant_op.constant([u"J'adore la灯".encode('utf8'),
                                       u'¡Escríbeme!'.encode('utf8')])
    expected_tokens = [[b'J', b"'", b'adore', b'la', u'灯'.encode('utf8')],
                       [u'¡'.encode('utf8'), u'Escríbeme'.encode('utf8'), b'!']]
    expected_offset_starts = [[0, 1, 2, 8, 10], [0, 2, 12]]
    expected_offset_limits = [[1, 2, 7, 10, 13], [2, 12, 13]]
    tokens = self.tokenizer.tokenize(test_value)
    self.assertRaggedEqual(tokens, expected_tokens)
    (tokens, starts, limits) = (
        self.tokenizer.tokenize_with_offsets(test_value))
    self.assertRaggedEqual(tokens, expected_tokens)
    self.assertRaggedEqual(starts, expected_offset_starts)
    self.assertRaggedEqual(limits, expected_offset_limits)

  def testSpaceBoundaries(self):
    test_value = constant_op.constant([b' Hook em! ', b' .Ok.   Go  '])
    expected_tokens = [[b'Hook', b'em', b'!'], [b'.', b'Ok', b'.', b'Go']]
    expected_offset_starts = [[1, 6, 8], [1, 2, 4, 8]]
    expected_offset_limits = [[5, 8, 9], [2, 4, 5, 10]]
    tokens = self.tokenizer.tokenize(test_value)
    self.assertRaggedEqual(tokens, expected_tokens)
    (tokens, starts, limits) = (
        self.tokenizer.tokenize_with_offsets(test_value))
    self.assertRaggedEqual(tokens, expected_tokens)
    self.assertRaggedEqual(starts, expected_offset_starts)
    self.assertRaggedEqual(limits, expected_offset_limits)

  def testKeepWhitespace(self):
    test_value = constant_op.constant([
        b'\'Black Panther,\' \t \xe2\x80\x98A Star Is Born\xe2\x80\x98 among AFI Awards honorees',
        b' .Ok.   Go  '
    ])
    expected_tokens = [[
        b'\'', b'Black', b' ', b'Panther', b',\'', b' \t ', b'\xe2\x80\x98',
        b'A', b' ', b'Star', b' ', b'Is', b' ', b'Born', b'\xe2\x80\x98', b' ',
        b'among', b' ', b'AFI', b' ', b'Awards', b' ', b'honorees'
    ], [b' ', b'.', b'Ok', b'.', b'   ', b'Go', b'  ']]
    expected_offset_starts = [
        [0, 1, 6, 7, 14, 16, 19, 22, 23, 24, 28, 29, 31, 32, 36, 39, 40,
         45, 46, 49, 50, 56, 57],
        [0, 1, 2, 4, 5, 8, 10]]
    expected_offset_limits = [
        [1, 6, 7, 14, 16, 19, 22, 23, 24, 28, 29, 31, 32, 36, 39, 40,
         45, 46, 49, 50, 56, 57, 65],
        [1, 2, 4, 5, 8, 10, 12]]
    self.tokenizer = UnicodeScriptTokenizer(keep_whitespace=True)
    tokens = self.tokenizer.tokenize(test_value)
    self.assertRaggedEqual(tokens, expected_tokens)
    (tokens, starts, limits) = (
        self.tokenizer.tokenize_with_offsets(test_value))
    self.assertRaggedEqual(tokens, expected_tokens)
    self.assertRaggedEqual(starts, expected_offset_starts)
    self.assertRaggedEqual(limits, expected_offset_limits)

  def testOnlySpaces(self):
    test_value = constant_op.constant([b' ', b'     '])
    expected_tokens = [[], []]
    expected_offset_starts = [[], []]
    expected_offset_limits = [[], []]
    tokens = self.tokenizer.tokenize(test_value)
    self.assertRaggedEqual(tokens, expected_tokens)
    (tokens, starts, limits) = (
        self.tokenizer.tokenize_with_offsets(test_value))
    self.assertRaggedEqual(tokens, expected_tokens)
    self.assertRaggedEqual(starts, expected_offset_starts)
    self.assertRaggedEqual(limits, expected_offset_limits)

  def testWhitespaceCharacters(self):
    test_value = constant_op.constant([b'things:\tcarpet\rdesk\nlamp'])
    expected_tokens = [[b'things', b':', b'carpet', b'desk', b'lamp']]
    expected_offset_starts = [[0, 6, 8, 15, 20]]
    expected_offset_limits = [[6, 7, 14, 19, 24]]
    tokens = self.tokenizer.tokenize(test_value)
    self.assertRaggedEqual(tokens, expected_tokens)
    (tokens, starts, limits) = (
        self.tokenizer.tokenize_with_offsets(test_value))
    self.assertRaggedEqual(tokens, expected_tokens)
    self.assertRaggedEqual(starts, expected_offset_starts)
    self.assertRaggedEqual(limits, expected_offset_limits)

  def testEmptyStringSingle(self):
    test_value = constant_op.constant([b''])
    expected_tokens = [[]]
    expected_offset_starts = [[]]
    expected_offset_limits = [[]]
    tokens = self.tokenizer.tokenize(test_value)
    self.assertRaggedEqual(tokens, expected_tokens)
    (tokens, starts, limits) = (
        self.tokenizer.tokenize_with_offsets(test_value))
    self.assertRaggedEqual(tokens, expected_tokens)
    self.assertRaggedEqual(starts, expected_offset_starts)
    self.assertRaggedEqual(limits, expected_offset_limits)

  def testEmptyString(self):
    test_value = constant_op.constant(
        [b'', b'I love Flume!', b'', b'O hai', b''])
    expected_tokens = [[], [b'I', b'love', b'Flume', b'!'], [], [b'O', b'hai'],
                       []]
    expected_offset_starts = [[], [0, 2, 7, 12], [], [0, 2], []]
    expected_offset_limits = [[], [1, 6, 12, 13], [], [1, 5], []]
    tokens = self.tokenizer.tokenize(test_value)
    self.assertRaggedEqual(tokens, expected_tokens)
    (tokens, starts, limits) = (
        self.tokenizer.tokenize_with_offsets(test_value))
    self.assertRaggedEqual(tokens, expected_tokens)
    self.assertRaggedEqual(starts, expected_offset_starts)
    self.assertRaggedEqual(limits, expected_offset_limits)

  def testEmptyDimensions(self):
    test_value = ragged_factory_ops.constant(
        [[[b'I love Flume!', b'Good day. . .'], []], [],
         [[b'I love Zhu!', b'Good night'], [b'A scrub is', b'a guy']]])
    expected_tokens = [[[[b'I', b'love', b'Flume', b'!'],
                         [b'Good', b'day', b'...']], []], [],
                       [[[b'I', b'love', b'Zhu', b'!'], [b'Good', b'night']],
                        [[b'A', b'scrub', b'is'], [b'a', b'guy']]]]
    expected_offset_starts = [[[[0, 2, 7, 12], [0, 5, 8]],
                               []],
                              [],
                              [[[0, 2, 7, 10], [0, 5]],
                               [[0, 2, 8], [0, 2]]]]
    expected_offset_limits = [[[[1, 6, 12, 13], [4, 8, 13]],
                               []],
                              [],
                              [[[1, 6, 10, 11], [4, 10]],
                               [[1, 7, 10], [1, 5]]]]
    tokens = self.tokenizer.tokenize(test_value)
    self.assertRaggedEqual(tokens, expected_tokens)
    (tokens, starts, limits) = (
        self.tokenizer.tokenize_with_offsets(test_value))
    self.assertRaggedEqual(tokens, expected_tokens)
    self.assertRaggedEqual(starts, expected_offset_starts)
    self.assertRaggedEqual(limits, expected_offset_limits)
Example #5
def setUp(self):
    self.tokenizer = UnicodeScriptTokenizer()
Example #6
class UnicodeScriptTokenizerOpTest(ragged_test_util.RaggedTensorTestCase):
    def setUp(self):
        self.tokenizer = UnicodeScriptTokenizer()

    def testRequireParams(self):
        with self.cached_session():
            with self.assertRaises(TypeError):
                self.tokenizer.tokenize()

    def testScalar(self):
        with self.cached_session():
            with self.assertRaises(ValueError):
                self.tokenizer.tokenize('I love Flume!')

    def testVectorSingleValue(self):
        test_value = constant_op.constant(['I love Flume!'])
        expected_tokens = [['I', 'love', 'Flume', '!']]
        expected_offset_starts = [[0, 2, 7, 12]]
        expected_offset_limits = [[1, 6, 12, 13]]
        tokens = self.tokenizer.tokenize(test_value)
        self.assertRaggedEqual(tokens, expected_tokens)
        (tokens, starts,
         limits) = (self.tokenizer.tokenize_with_offsets(test_value))
        self.assertRaggedEqual(tokens, expected_tokens)
        self.assertRaggedEqual(starts, expected_offset_starts)
        self.assertRaggedEqual(limits, expected_offset_limits)

    def testVector(self):
        test_value = constant_op.constant(['I love Flume!', 'Good day'])
        expected_tokens = [['I', 'love', 'Flume', '!'], ['Good', 'day']]
        expected_offset_starts = [[0, 2, 7, 12], [0, 5]]
        expected_offset_limits = [[1, 6, 12, 13], [4, 8]]
        tokens = self.tokenizer.tokenize(test_value)
        self.assertRaggedEqual(tokens, expected_tokens)
        (tokens, starts,
         limits) = (self.tokenizer.tokenize_with_offsets(test_value))
        self.assertRaggedEqual(tokens, expected_tokens)
        self.assertRaggedEqual(starts, expected_offset_starts)
        self.assertRaggedEqual(limits, expected_offset_limits)

    def testMatrix(self):
        test_value = constant_op.constant([['I love Flume!', 'Good day'],
                                           ['I don\'t want', 'no scrubs']])
        expected_tokens = [[['I', 'love', 'Flume', '!'], ['Good', 'day']],
                           [['I', 'don', '\'', 't', 'want'], ['no', 'scrubs']]]
        expected_offset_starts = [[[0, 2, 7, 12], [0, 5]],
                                  [[0, 2, 5, 6, 8], [0, 3]]]
        expected_offset_limits = [[[1, 6, 12, 13], [4, 8]],
                                  [[1, 5, 6, 7, 12], [2, 9]]]
        tokens = self.tokenizer.tokenize(test_value)
        self.assertRaggedEqual(tokens, expected_tokens)
        (tokens, starts,
         limits) = (self.tokenizer.tokenize_with_offsets(test_value))
        self.assertRaggedEqual(tokens, expected_tokens)
        self.assertRaggedEqual(starts, expected_offset_starts)
        self.assertRaggedEqual(limits, expected_offset_limits)

    def testMatrixRagged(self):
        test_value = ragged_factory_ops.constant(
            [['I love Flume!'], ['I don\'t want', 'no scrubs']])
        expected_tokens = [[['I', 'love', 'Flume', '!']],
                           [['I', 'don', '\'', 't', 'want'], ['no', 'scrubs']]]
        expected_offset_starts = [[[0, 2, 7, 12]], [[0, 2, 5, 6, 8], [0, 3]]]
        expected_offset_limits = [[[1, 6, 12, 13]], [[1, 5, 6, 7, 12], [2, 9]]]
        tokens = self.tokenizer.tokenize(test_value)
        self.assertRaggedEqual(tokens, expected_tokens)
        (tokens, starts,
         limits) = (self.tokenizer.tokenize_with_offsets(test_value))
        self.assertRaggedEqual(tokens, expected_tokens)
        self.assertRaggedEqual(starts, expected_offset_starts)
        self.assertRaggedEqual(limits, expected_offset_limits)

    def test3DimMatrix(self):
        test_value = constant_op.constant([[['I love Flume!', 'Good day'],
                                            ['I don\'t want', 'no scrubs']],
                                           [['I love Zhu!', 'Good night'],
                                            ['A scrub is', 'a guy']]])
        expected_tokens = [[[['I', 'love', 'Flume', '!'], ['Good', 'day']],
                            [['I', 'don', '\'', 't', 'want'],
                             ['no', 'scrubs']]],
                           [[['I', 'love', 'Zhu', '!'], ['Good', 'night']],
                            [['A', 'scrub', 'is'], ['a', 'guy']]]]
        expected_offset_starts = [[[[0, 2, 7, 12], [0, 5]],
                                   [[0, 2, 5, 6, 8], [0, 3]]],
                                  [[[0, 2, 7, 10], [0, 5]],
                                   [[0, 2, 8], [0, 2]]]]
        expected_offset_limits = [[[[1, 6, 12, 13], [4, 8]],
                                   [[1, 5, 6, 7, 12], [2, 9]]],
                                  [[[1, 6, 10, 11], [4, 10]],
                                   [[1, 7, 10], [1, 5]]]]
        tokens = self.tokenizer.tokenize(test_value)
        self.assertRaggedEqual(tokens, expected_tokens)
        (tokens, starts,
         limits) = (self.tokenizer.tokenize_with_offsets(test_value))
        self.assertRaggedEqual(tokens, expected_tokens)
        self.assertRaggedEqual(starts, expected_offset_starts)
        self.assertRaggedEqual(limits, expected_offset_limits)

    def test3DimMatrixRagged(self):
        test_value = ragged_factory_ops.constant(
            [[['I love Flume!'], ['I don\'t want', 'no scrubs']],
             [['I love Zhu!', 'Good night']]])
        expected_tokens = [[[['I', 'love', 'Flume', '!']],
                            [['I', 'don', '\'', 't', 'want'],
                             ['no', 'scrubs']]],
                           [[['I', 'love', 'Zhu', '!'], ['Good', 'night']]]]
        expected_offset_starts = [[[[0, 2, 7, 12]], [[0, 2, 5, 6, 8], [0, 3]]],
                                  [[[0, 2, 7, 10], [0, 5]]]]
        expected_offset_limits = [[[[1, 6, 12, 13]],
                                   [[1, 5, 6, 7, 12], [2, 9]]],
                                  [[[1, 6, 10, 11], [4, 10]]]]
        tokens = self.tokenizer.tokenize(test_value)
        self.assertRaggedEqual(tokens, expected_tokens)
        (tokens, starts,
         limits) = (self.tokenizer.tokenize_with_offsets(test_value))
        self.assertRaggedEqual(tokens, expected_tokens)
        self.assertRaggedEqual(starts, expected_offset_starts)
        self.assertRaggedEqual(limits, expected_offset_limits)

    def testInternationalization(self):
        test_value = constant_op.constant(
            [u"J'adore la灯".encode('utf8'), u'¡Escríbeme!'.encode('utf8')])
        expected_tokens = [['J', "'", 'adore', 'la', u'灯'.encode('utf8')],
                           [u'¡'.encode('utf8'), u'Escríbeme'.encode('utf8'),
                            '!']]
        expected_offset_starts = [[0, 1, 2, 8, 10], [0, 2, 12]]
        expected_offset_limits = [[1, 2, 7, 10, 13], [2, 12, 13]]
        tokens = self.tokenizer.tokenize(test_value)
        self.assertRaggedEqual(tokens, expected_tokens)
        (tokens, starts,
         limits) = (self.tokenizer.tokenize_with_offsets(test_value))
        self.assertRaggedEqual(tokens, expected_tokens)
        self.assertRaggedEqual(starts, expected_offset_starts)
        self.assertRaggedEqual(limits, expected_offset_limits)

    def testSpaceBoundaries(self):
        test_value = constant_op.constant([' Hook em! ', ' .Ok.   Go  '])
        expected_tokens = [['Hook', 'em', '!'], ['.', 'Ok', '.', 'Go']]
        expected_offset_starts = [[1, 6, 8], [1, 2, 4, 8]]
        expected_offset_limits = [[5, 8, 9], [2, 4, 5, 10]]
        tokens = self.tokenizer.tokenize(test_value)
        self.assertRaggedEqual(tokens, expected_tokens)
        (tokens, starts,
         limits) = (self.tokenizer.tokenize_with_offsets(test_value))
        self.assertRaggedEqual(tokens, expected_tokens)
        self.assertRaggedEqual(starts, expected_offset_starts)
        self.assertRaggedEqual(limits, expected_offset_limits)

    def testOnlySpaces(self):
        test_value = constant_op.constant([' ', '     '])
        expected_tokens = [[], []]
        expected_offset_starts = [[], []]
        expected_offset_limits = [[], []]
        tokens = self.tokenizer.tokenize(test_value)
        self.assertRaggedEqual(tokens, expected_tokens)
        (tokens, starts,
         limits) = (self.tokenizer.tokenize_with_offsets(test_value))
        self.assertRaggedEqual(tokens, expected_tokens)
        self.assertRaggedEqual(starts, expected_offset_starts)
        self.assertRaggedEqual(limits, expected_offset_limits)

    def testWhitespaceCharacters(self):
        test_value = constant_op.constant(['things:\tcarpet\rdesk\nlamp'])
        expected_tokens = [['things', ':', 'carpet', 'desk', 'lamp']]
        expected_offset_starts = [[0, 6, 8, 15, 20]]
        expected_offset_limits = [[6, 7, 14, 19, 24]]
        tokens = self.tokenizer.tokenize(test_value)
        self.assertRaggedEqual(tokens, expected_tokens)
        (tokens, starts,
         limits) = (self.tokenizer.tokenize_with_offsets(test_value))
        self.assertRaggedEqual(tokens, expected_tokens)
        self.assertRaggedEqual(starts, expected_offset_starts)
        self.assertRaggedEqual(limits, expected_offset_limits)

    def testEmptyStringSingle(self):
        test_value = constant_op.constant([''])
        expected_tokens = [[]]
        expected_offset_starts = [[]]
        expected_offset_limits = [[]]
        tokens = self.tokenizer.tokenize(test_value)
        self.assertRaggedEqual(tokens, expected_tokens)
        (tokens, starts,
         limits) = (self.tokenizer.tokenize_with_offsets(test_value))
        self.assertRaggedEqual(tokens, expected_tokens)
        self.assertRaggedEqual(starts, expected_offset_starts)
        self.assertRaggedEqual(limits, expected_offset_limits)

    def testEmptyString(self):
        test_value = constant_op.constant(
            ['', 'I love Flume!', '', 'O hai', ''])
        expected_tokens = [[], ['I', 'love', 'Flume', '!'], [], ['O', 'hai'],
                           []]
        expected_offset_starts = [[], [0, 2, 7, 12], [], [0, 2], []]
        expected_offset_limits = [[], [1, 6, 12, 13], [], [1, 5], []]
        tokens = self.tokenizer.tokenize(test_value)
        self.assertRaggedEqual(tokens, expected_tokens)
        (tokens, starts,
         limits) = (self.tokenizer.tokenize_with_offsets(test_value))
        self.assertRaggedEqual(tokens, expected_tokens)
        self.assertRaggedEqual(starts, expected_offset_starts)
        self.assertRaggedEqual(limits, expected_offset_limits)

    def testEmptyDimensions(self):
        test_value = ragged_factory_ops.constant(
            [[['I love Flume!', 'Good day. . .'], []], [],
             [['I love Zhu!', 'Good night'], ['A scrub is', 'a guy']]])
        expected_tokens = [[[['I', 'love', 'Flume', '!'],
                             ['Good', 'day', '...']], []], [],
                           [[['I', 'love', 'Zhu', '!'], ['Good', 'night']],
                            [['A', 'scrub', 'is'], ['a', 'guy']]]]
        expected_offset_starts = [[[[0, 2, 7, 12], [0, 5, 8]], []], [],
                                  [[[0, 2, 7, 10], [0, 5]],
                                   [[0, 2, 8], [0, 2]]]]
        expected_offset_limits = [[[[1, 6, 12, 13], [4, 8, 13]], []], [],
                                  [[[1, 6, 10, 11], [4, 10]],
                                   [[1, 7, 10], [1, 5]]]]
        tokens = self.tokenizer.tokenize(test_value)
        self.assertRaggedEqual(tokens, expected_tokens)
        (tokens, starts,
         limits) = (self.tokenizer.tokenize_with_offsets(test_value))
        self.assertRaggedEqual(tokens, expected_tokens)
        self.assertRaggedEqual(starts, expected_offset_starts)
        self.assertRaggedEqual(limits, expected_offset_limits)