def _whitespace_tokenize_with_offsets_encode_decode_wrapper(
      self, input_tensor):
    """Whitespace-tokenizes a rank-1 tensor of UTF-8 strings.

    Decodes each string into codepoints, tokenizes in codepoint space, then
    re-encodes the tokens to UTF-8 and maps the codepoint offsets back to
    byte offsets within the original strings.

    Args:
      input_tensor: The single dimensional Tensor to tokenize.

    Returns:
      Tuple of RaggedTensors of tokenized text and byte offsets, with shapes
      [num_strings, (num_tokens or num_offsets)].
    """
    # Decode to codepoints; also get the byte offset at which each codepoint
    # starts within its source string.
    codepoints, starts = ragged_string_ops.unicode_decode_with_offsets(
        input_tensor, "UTF-8")
    # The limit offset of codepoint i is the start offset of codepoint i+1;
    # for the last codepoint it is the total byte length of the string.
    total_bytes = math_ops.cast(
        array_ops.expand_dims(string_ops.string_length(input_tensor), 1),
        dtypes.int64)
    limits = array_ops.concat([starts[:, 1:], total_bytes], 1)

    # Tokenize in codepoint space.
    (token_codepoints, token_starts, token_limits) = (
        self._whitespace_tokenize_codepoints_with_offsets(codepoints))

    # Re-encode the tokens and translate codepoint offsets into byte offsets.
    # Limit offsets are exclusive, hence the -1 before the gather.
    tokens = ragged_string_ops.unicode_encode(token_codepoints, "UTF-8")
    byte_starts = array_ops.batch_gather(starts, token_starts)
    byte_limits = array_ops.batch_gather(
        limits, math_ops.subtract(token_limits, [1]))
    return (tokens, byte_starts, byte_limits)
 def testVectorDecodeWithOffset(self):
   # Mixed multi-byte (3 bytes per CJK char) and single-byte ASCII input.
   input_text = constant_op.constant([u"仅今年前".encode("utf-8"), b"hello"])
   codepoints, offsets = ragged_string_ops.unicode_decode_with_offsets(
       input_text, "utf-8")
   self.assertAllEqual(codepoints,
                       [[ord(c) for c in u"仅今年前"],
                        [ord(c) for c in u"hello"]])
   self.assertAllEqual(offsets, [[0, 3, 6, 9], [0, 1, 2, 3, 4]])
    def tokenize_with_offsets(self, input):  # pylint: disable=redefined-builtin
        """Tokenizes a tensor of UTF-8 strings to Unicode characters.

        Returned token tensors are of integer type.

        Args:
          input: A `RaggedTensor` or `Tensor` of UTF-8 strings with any shape.

        Returns:
          A tuple `(tokens, start_offsets, end_offsets)` where:

            * `tokens`: A `RaggedTensor` of codepoints (integer type).
            * `start_offsets`: A `RaggedTensor` of the tokens' starting byte
              offsets.
            * `end_offsets`: A `RaggedTensor` of the tokens' ending byte
              offsets.
        """
        with ops.name_scope(None, "UnicodeCharTokenize", [input]):
            input_tensor = ragged_tensor.convert_to_tensor_or_ragged_tensor(
                input)
            codepoints, start_offsets = (
                ragged_string_ops.unicode_decode_with_offsets(
                    input_tensor, "UTF-8"))
            # Byte length of every input string, with a trailing axis so it
            # can be concatenated after the shifted start offsets below.
            lengths = math_ops.cast(
                array_ops.expand_dims(string_ops.string_length(input_tensor),
                                      -1), dtypes.int64)
            # Drop the lengths of empty strings: they yield no tokens, so
            # they must contribute no end offset either.
            last_ends = ragged_array_ops.boolean_mask(lengths, lengths > 0)
            # End offset of char i is the start offset of char i+1; the final
            # char's end offset is the string's byte length.
            end_offsets = array_ops.concat(
                [start_offsets[..., 1:], last_ends], -1)
            return codepoints, start_offsets, end_offsets
 def testErrorModesWithOffsets(self,
                               expected=None,
                               expected_offsets=None,
                               **args):
   # The op returns a (codepoints, offsets) pair; unpack it directly.
   codepoints, offsets = ragged_string_ops.unicode_decode_with_offsets(**args)
   self.assertAllEqual(codepoints, expected)
   self.assertAllEqual(offsets, expected_offsets)
 def testErrorModesWithOffsets(self,
                               expected=None,
                               expected_offsets=None,
                               **args):
   # Forward the parameterized args straight through to the op under test.
   actual_chars, actual_offsets = (
       ragged_string_ops.unicode_decode_with_offsets(**args))
   self.assertRaggedEqual(actual_chars, expected)
   self.assertRaggedEqual(actual_offsets, expected_offsets)
 def testVectorDecodeWithOffset(self):
   # Expected codepoints for a 3-byte-per-char CJK string and ASCII "hello".
   expected_chars = [list(map(ord, u"仅今年前")), list(map(ord, u"hello"))]
   text = constant_op.constant([u"仅今年前".encode("utf-8"), b"hello"])
   chars, starts = ragged_string_ops.unicode_decode_with_offsets(text, "utf-8")
   self.assertRaggedEqual(chars, expected_chars)
   self.assertRaggedEqual(starts, [[0, 3, 6, 9], [0, 1, 2, 3, 4]])
 def testDecodeWithOffsetsWithDifferentEncodings(self, encoding, texts):
   # Encode the fixture texts in the parameterized encoding, then check the
   # decode round-trips both codepoints and offsets.
   encoded = constant_op.constant(_nested_encode(texts, encoding))
   codepoints, offsets = ragged_string_ops.unicode_decode_with_offsets(
       encoded, encoding)
   self.assertAllEqual(_nested_codepoints(texts), codepoints)
   self.assertAllEqual(_nested_offsets(texts, encoding), offsets)
 def testDecodeWithOffsetsWithDifferentEncodings(self, encoding, texts):
   input_tensor = constant_op.constant(_nested_encode(texts, encoding))
   decoded = ragged_string_ops.unicode_decode_with_offsets(
       input_tensor, encoding)
   # Compare (codepoints, offsets) against expectations computed from the
   # original (unencoded) texts.
   expected = (_nested_codepoints(texts), _nested_offsets(texts, encoding))
   self.assertRaggedEqual(expected[0], decoded[0])
   self.assertRaggedEqual(expected[1], decoded[1])
 def testBasicDecodeWithOffsets(self, texts, ragged_rank=None):
   # Build a (possibly ragged) byte-string input at the requested rank.
   encoded = ragged_factory_ops.constant_value(
       _nested_encode(texts, "UTF-8"), ragged_rank=ragged_rank, dtype=bytes)
   codepoints, offsets = ragged_string_ops.unicode_decode_with_offsets(
       encoded, "UTF-8")
   self.assertAllEqual(_nested_codepoints(texts), codepoints)
   self.assertAllEqual(_nested_offsets(texts, "UTF-8"), offsets)
 def testBasicDecodeWithOffsets(self, texts, ragged_rank=None):
   # Compute expectations up front, then decode the encoded fixture.
   expected_codepoints = _nested_codepoints(texts)
   expected_offsets = _nested_offsets(texts, "UTF-8")
   input_tensor = ragged_factory_ops.constant_value(
       _nested_encode(texts, "UTF-8"), ragged_rank=ragged_rank, dtype=bytes)
   codepoints, offsets = ragged_string_ops.unicode_decode_with_offsets(
       input_tensor, "UTF-8")
   self.assertRaggedEqual(expected_codepoints, codepoints)
   self.assertRaggedEqual(expected_offsets, offsets)
 def testDocstringExamples(self):
   # Inputs from the op docstring: a Latin-1 string and a single emoji.
   texts = [s.encode("utf8") for s in [u"G\xf6\xf6dnight", u"\U0001f60a"]]
   expected = [[71, 246, 246, 100, 110, 105, 103, 104, 116], [128522]]
   codepoints1 = ragged_string_ops.unicode_decode(texts, "UTF-8")
   codepoints2, offsets = ragged_string_ops.unicode_decode_with_offsets(
       texts, "UTF-8")
   # Both decode ops must agree on the codepoints.
   self.assertAllEqual(codepoints1, expected)
   self.assertAllEqual(codepoints2, expected)
   self.assertAllEqual(offsets, [[0, 1, 3, 5, 6, 7, 8, 9, 10], [0]])
 def testDocstringExamples(self):
   texts = [s.encode("utf8") for s in (u"G\xf6\xf6dnight", u"\U0001f60a")]
   codepoints1 = ragged_string_ops.unicode_decode(texts, "UTF-8")
   codepoints2, offsets = ragged_string_ops.unicode_decode_with_offsets(
       texts, "UTF-8")
   # unicode_decode and unicode_decode_with_offsets must yield identical
   # codepoints for the same input.
   expected_codepoints = [[71, 246, 246, 100, 110, 105, 103, 104, 116],
                          [128522]]
   self.assertRaggedEqual(codepoints1, expected_codepoints)
   self.assertRaggedEqual(codepoints2, expected_codepoints)
   self.assertRaggedEqual(offsets, [[0, 1, 3, 5, 6, 7, 8, 9, 10], [0]])
 def testScalarDecodeWithOffset(self):
   # Scalar (rank-0) input: result is a flat list of codepoints/offsets.
   encoded = constant_op.constant(u"仅今年前".encode("utf-8"))
   codepoints, offsets = ragged_string_ops.unicode_decode_with_offsets(
       encoded, "utf-8")
   self.assertAllEqual(codepoints, [ord(c) for c in u"仅今年前"])
   self.assertAllEqual(offsets, [0, 3, 6, 9])
 def testScalarDecodeWithOffset(self):
   text = constant_op.constant(u"仅今年前".encode("utf-8"))
   # Keep the (codepoints, offsets) pair and index it at the asserts.
   decoded = ragged_string_ops.unicode_decode_with_offsets(text, "utf-8")
   self.assertAllEqual(decoded[0], list(map(ord, u"仅今年前")))
   self.assertAllEqual(decoded[1], [0, 3, 6, 9])