def testScalar(self):
   """Verifies that unicode_encode raises ValueError for scalar inputs."""
   # A plain Python integer.
   with self.cached_session(), self.assertRaises(ValueError):
     ragged_string_ops.unicode_encode(72, "UTF-8")
   # A scalar constant tensor.
   with self.cached_session(), self.assertRaises(ValueError):
     scalar_tensor = constant_op.constant(72)
     ragged_string_ops.unicode_encode(scalar_tensor, "UTF-8")
    def testReplaceErrors(self, encoding):
        """Checks the "replace" error mode of unicode_encode for `encoding`.

        The input mixes valid code points with 2147483647 and -1, which the
        expectations below treat as code points that get replaced.
        """
        codepoints_with_errors = [72, 101, 2147483647, -1, 111]
        # Each case: (positional args after `encoding`, expected text).
        cases = (
            # Explicit "replace" mode with the default replacement char.
            (("replace",), u"He\U0000fffd\U0000fffdo"),
            # Custom replacement character (111).
            (("replace", 111), u"Heooo"),
            # No error mode given: verifies "replace" is the default.
            ((), u"He\U0000fffd\U0000fffdo"),
        )
        for extra_args, expected_text in cases:
            codepoints = np.array(codepoints_with_errors, np.int32)
            expected_bytes = expected_text.encode(encoding)
            encoded = ragged_string_ops.unicode_encode(
                codepoints, encoding, *extra_args)
            self.assertRaggedEqual(encoded, expected_bytes)

        # The replacement char itself must be within range: 1114112 is
        # expected to fail when the op is evaluated.
        codepoints = np.array(codepoints_with_errors, np.int32)
        out_of_range_op = ragged_string_ops.unicode_encode(
            codepoints, encoding, "replace", 1114112)
        with self.assertRaises(errors.InvalidArgumentError):
            self.evaluate(out_of_range_op)
    def testVector(self, encoding):
        """Encodes 1-D code point vectors and compares with str.encode."""
        # Each case: (code points, text they are expected to spell).
        cases = (
            ([72, 101, 108, 108, 111], u"Hello"),
            ([72, 101, 195, 195, 128516], u"He\xc3\xc3\U0001f604"),
            # Single character strings.
            ([72], u"H"),
            ([128516], u"\U0001f604"),
        )
        for codepoints, text in cases:
            vector = np.array(codepoints, np.int32)
            expected_bytes = text.encode(encoding)
            encoded = ragged_string_ops.unicode_encode(vector, encoding)
            self.assertRaggedEqual(encoded, expected_bytes)
    def testVector(self, encoding):
        """Encodes 1-D code point vectors and compares with str.encode."""

        def check(codepoints, text):
            # Build the input, the reference bytes, then the op, and compare.
            vector = np.array(codepoints, np.int32)
            expected_bytes = text.encode(encoding)
            encoded = ragged_string_ops.unicode_encode(vector, encoding)
            self.assertAllEqual(encoded, expected_bytes)

        check([ord(ch) for ch in 'Hello'], u"Hello")
        check([ord('H'), ord('e'), 0xC3, 0xC3, 0x1F604],
              u"He\xc3\xc3\U0001f604")

        # Single character strings.
        check([ord('H')], u"H")
        check([0x1F604], u"\U0001f604")
  def testVector(self, encoding):
    """Encodes 1-D code point vectors; each result must be a bytes object."""
    # Each sample: (code points, text they are expected to spell).
    samples = [
        ([72, 101, 108, 108, 111], u"Hello"),
        ([72, 101, 195, 195, 128516], u"He\xc3\xc3\U0001f604"),
        # Single character strings.
        ([72], u"H"),
        ([128516], u"\U0001f604"),
    ]
    for codepoints, text in samples:
      vector = np.array(codepoints, np.int32)
      expected_bytes = text.encode(encoding)
      encode_op = ragged_string_ops.unicode_encode(vector, encoding)
      with self.cached_session():
        actual = encode_op.eval()
        self.assertIsInstance(actual, bytes)
        self.assertAllEqual(actual, expected_bytes)
 def testScalar(self):
   """Verifies that unicode_encode raises ValueError for scalar inputs."""
   # Factories are invoked inside the session/assertRaises contexts so the
   # constant tensor is created at the same point as a direct call would.
   scalar_factories = (
       lambda: 72,
       lambda: constant_op.constant(72),
   )
   for make_scalar in scalar_factories:
     with self.cached_session():
       with self.assertRaises(ValueError):
         ragged_string_ops.unicode_encode(make_scalar(), "UTF-8")
 def testStrictErrors(self, encoding):
     """Checks that "strict" mode raises InvalidArgumentError on bad input."""
     bad_codepoints = np.array([72, 101, 2147483647, -1, 111], np.int32)
     with self.cached_session() as session, self.assertRaises(
             errors.InvalidArgumentError):
         strict_op = ragged_string_ops.unicode_encode(
             bad_codepoints, encoding, "strict")
         session.run(strict_op)
 def testStrictErrors(self, encoding):
   """Checks that "strict" mode raises InvalidArgumentError on bad input."""
   # 0x7FFFFFFF and -1 are the entries expected to trigger the error.
   codepoints = [ord('H'), ord('e'), 0x7FFFFFFF, -1, ord('o')]
   bad_input = np.array(codepoints, np.int32)
   with self.cached_session() as session:
     with self.assertRaises(errors.InvalidArgumentError):
       strict_op = ragged_string_ops.unicode_encode(
           bad_input, encoding, "strict")
       session.run(strict_op)
Exemple #9
0
 def testRaggedMatrixWithMultiDimensionInnerValues(self, encoding):
     """Encodes a ragged tensor built from multi-dimensional inner values.

     The input is assembled from a dense 3-D constant and two row-splits
     vectors; the encoded result is expected to have ragged_rank 2.
     """

     def enc(text):
         return text.encode(encoding)

     inner_values = constant_op.constant([
         [[72, 101, 108, 108, 111], [87, 111, 114, 108, 100]],
         [[102, 105, 120, 101, 100], [119, 111, 114, 100, 115]],
         [[72, 121, 112, 101, 114], [99, 117, 98, 101, 46]],
     ])
     nested_splits = [
         constant_op.constant(splits, dtype=np.int64)
         for splits in ([0, 2, 3], [0, 1, 1, 3])
     ]
     ragged_input = ragged_factory_ops.from_nested_row_splits(
         inner_values, nested_splits)
     expected = [
         [[[enc(u"Hello"), enc(u"World")]], []],
         [[[enc(u"fixed"), enc(u"words")],
           [enc(u"Hyper"), enc(u"cube.")]]],
     ]
     encode_op = ragged_string_ops.unicode_encode(ragged_input, encoding)
     with self.cached_session():
         actual = encode_op.eval()
         self.assertEqual(encode_op.ragged_rank, 2)
         self.assertAllEqual(actual.tolist(), expected)
         # The remaining checks look at the internal representation only;
         # the value itself has already been verified above.
         self.assertAllEqual(len(actual.nested_row_splits),
                             len(nested_splits))
         self.assertEqual(encode_op.inner_values.shape.ndims,
                          inner_values.shape.ndims - 1)
 def testIgnoreErrors(self, encoding):
   """Checks that "ignore" mode drops the code points it cannot encode."""
   codepoints = [ord('H'), ord('e'), 0x7FFFFFFF, -1, ord('o')]
   noisy_input = np.array(codepoints, np.int32)
   # Only H, e and o are expected to survive.
   expected_bytes = u"Heo".encode(encoding)
   encoded = ragged_string_ops.unicode_encode(
       noisy_input, encoding, "ignore")
   self.assertAllEqual(encoded, expected_bytes)
 def testRaggedMatrixWithMultiDimensionInnerValues(self, encoding):
   """Encodes a ragged tensor built from multi-dimensional inner values.

   A dense 3-D constant is wrapped with two levels of row splits; the
   encoded result is expected to have ragged_rank 2.
   """
   encode = lambda text: text.encode(encoding)

   dense_values = constant_op.constant([
       [[72, 101, 108, 108, 111], [87, 111, 114, 108, 100]],
       [[102, 105, 120, 101, 100], [119, 111, 114, 100, 115]],
       [[72, 121, 112, 101, 114], [99, 117, 98, 101, 46]],
   ])
   outer_splits = constant_op.constant([0, 2, 3], dtype=np.int64)
   inner_splits = constant_op.constant([0, 1, 1, 3], dtype=np.int64)
   row_splits = [outer_splits, inner_splits]
   ragged_input = ragged_factory_ops.from_nested_row_splits(
       dense_values, row_splits)
   expected = [
       [[[encode(u"Hello"), encode(u"World")]], []],
       [[[encode(u"fixed"), encode(u"words")],
         [encode(u"Hyper"), encode(u"cube.")]]],
   ]
   encode_op = ragged_string_ops.unicode_encode(ragged_input, encoding)
   with self.cached_session():
     actual = encode_op.eval()
     self.assertEqual(encode_op.ragged_rank, 2)
     self.assertAllEqual(actual.tolist(), expected)
     # Internal-representation checks; not strictly required because the
     # value has already been verified above.
     self.assertAllEqual(len(actual.nested_row_splits), len(row_splits))
     self.assertEqual(encode_op.inner_values.shape.ndims,
                      dense_values.shape.ndims - 1)
  def _whitespace_tokenize_with_offsets_encode_decode_wrapper(
      self, input_tensor):
    """Tokenizes a tensor of UTF-8 strings with rank of 1.

    The strings are decoded to codepoints, tokenized with
    `_whitespace_tokenize_codepoints_with_offsets`, and the resulting tokens
    are re-encoded as UTF-8. Codepoint offsets are translated to byte offsets.

    Args:
      input_tensor: The single dimensional Tensor to tokenize.

    Returns:
      Tuple of RaggedTensors of tokenized text and byte offsets, with shapes
      [num_strings, (num_tokens or num_offsets)].
    """
    # Decode the strings; the second result holds per-codepoint byte offsets.
    decoded = ragged_string_ops.unicode_decode_with_offsets(
        input_tensor, "UTF-8")
    codepoints, byte_start_offsets = decoded

    # Limit offsets: every start offset but the first, followed by the value
    # string_length reports for the whole string.
    following_starts = byte_start_offsets[:, 1:]
    string_lengths = math_ops.cast(
        array_ops.expand_dims(string_ops.string_length(input_tensor), 1),
        dtypes.int64)
    byte_limit_offsets = array_ops.concat(
        [following_starts, string_lengths], 1)

    # Tokenize in codepoint space.
    tokenized = self._whitespace_tokenize_codepoints_with_offsets(codepoints)
    codepoint_tokens, codepoint_start_offsets, codepoint_limit_offsets = (
        tokenized)

    # Encode the tokens and map codepoint offsets back to byte offsets.
    encoded_tokens = ragged_string_ops.unicode_encode(
        codepoint_tokens, "UTF-8")
    token_byte_starts = array_ops.batch_gather(
        byte_start_offsets, codepoint_start_offsets)
    # Limit offsets are shifted down by one before the lookup (presumably
    # because they are exclusive -- confirm against the tokenizer helper).
    last_codepoint_indices = math_ops.subtract(codepoint_limit_offsets, [1])
    token_byte_limits = array_ops.batch_gather(
        byte_limit_offsets, last_codepoint_indices)
    return (encoded_tokens, token_byte_starts, token_byte_limits)
 def testMatrix(self, encoding):
   """Encodes a 2-D code point array; expects one string per row."""
   rows = [[72, 0x1F604, 108, 108, 111], [87, 0x1F604, 114, 108, 100]]
   matrix = np.array(rows, np.int32)
   expected = [
       text.encode(encoding)
       for text in (u"H\U0001f604llo", u"W\U0001f604rld")
   ]
   encoded = ragged_string_ops.unicode_encode(matrix, encoding)
   self.assertAllEqual(encoded, expected)
Exemple #14
0
 def testRaggedMatrix(self, encoding):
   """Encodes a ragged matrix whose rows have different lengths."""
   rows = [[72, 195, 108, 108, 111], [87, 128516, 114, 108, 100, 46]]
   ragged_input = ragged_factory_ops.constant(rows, np.int32)
   expected = [
       text.encode(encoding)
       for text in (u"H\xc3llo", u"W\U0001f604rld.")
   ]
   encoded = ragged_string_ops.unicode_encode(ragged_input, encoding)
   self.assertAllEqual(encoded, expected)
 def test4DimRaggedMatrix(self, encoding):
   """Encodes a 4-level nested ragged input, including an empty row."""
   enc = lambda text: text.encode(encoding)

   nested_codepoints = [
       [[[72, 101, 108, 108, 111], [87, 111, 114, 108, 100]]],
       [[[]], [[72, 121, 112, 101]]],
   ]
   ragged_input = ragged_factory_ops.constant(nested_codepoints, np.int32)
   # The empty innermost row is expected to encode to an empty string.
   expected = [
       [[enc(u"Hello"), enc(u"World")]],
       [[enc(u"")], [enc(u"Hype")]],
   ]
   encoded = ragged_string_ops.unicode_encode(ragged_input, encoding)
   self.assertAllEqual(encoded, expected)
 def test3DimMatrixWithRagged2ndAnd3rdDim(self, encoding):
   """Encodes a 3-level ragged input that contains an empty middle entry."""
   nested_codepoints = [
       [[72, 101, 108, 108, 111], [87, 111, 114, 108, 100, 46]],
       [],
       [[0x1F604]],
   ]
   ragged_input = ragged_factory_ops.constant(nested_codepoints, np.int32)
   words = [[u"Hello", u"World."], [], [u"\U0001f604"]]
   expected = [[word.encode(encoding) for word in row] for row in words]
   encoded = ragged_string_ops.unicode_encode(ragged_input, encoding)
   self.assertAllEqual(encoded, expected)
 def testRaggedMatrix(self, encoding):
   """Encodes a ragged matrix whose rows have different lengths."""
   first_row = [ord('H'), 0xC3, ord('l'), ord('l'), ord('o')]
   second_row = [ord('W'), 0x1F604, ord('r'), ord('l'), ord('d'), ord('.')]
   ragged_input = ragged_factory_ops.constant(
       [first_row, second_row], np.int32)
   expected = [
       text.encode(encoding)
       for text in (u"H\xc3llo", u"W\U0001f604rld.")
   ]
   encoded = ragged_string_ops.unicode_encode(ragged_input, encoding)
   self.assertAllEqual(encoded, expected)
 def testIgnoreErrors(self, encoding):
     """Checks that "ignore" mode drops the code points it cannot encode."""
     noisy_input = np.array([72, 101, 2147483647, -1, 111], np.int32)
     # Only code points 72, 101 and 111 are expected to survive.
     expected_bytes = u"Heo".encode(encoding)
     ignore_op = ragged_string_ops.unicode_encode(
         noisy_input, encoding, "ignore")
     with self.cached_session() as session:
         actual = session.run(ignore_op)
         self.assertIsInstance(actual, bytes)
         self.assertAllEqual(actual, expected_bytes)
 def testIgnoreErrors(self, encoding):
   """Checks that "ignore" mode drops the code points it cannot encode."""
   codepoints = [72, 101, 2147483647, -1, 111]
   noisy_input = np.array(codepoints, np.int32)
   expected_bytes = u"Heo".encode(encoding)
   ignore_op = ragged_string_ops.unicode_encode(
       noisy_input, encoding, "ignore")
   with self.cached_session():
     actual = ignore_op.eval()
     # A 1-D input is expected to produce a single bytes value.
     self.assertIsInstance(actual, bytes)
     self.assertAllEqual(actual, expected_bytes)
 def test3DimMatrix(self, encoding):
   """Encodes a dense 3-D code point tensor into a 2-D grid of strings."""
   codepoint_grid = [
       [[72, 101, 108, 108, 111], [87, 111, 114, 108, 100]],
       [[102, 105, 120, 101, 100], [119, 111, 114, 100, 115]],
       [[72, 121, 112, 101, 114], [99, 117, 98, 101, 46]],
   ]
   dense_input = constant_op.constant(codepoint_grid, np.int32)
   words = [
       [u"Hello", u"World"],
       [u"fixed", u"words"],
       [u"Hyper", u"cube."],
   ]
   expected = [[word.encode(encoding) for word in row] for row in words]
   encoded = ragged_string_ops.unicode_encode(dense_input, encoding)
   self.assertAllEqual(encoded, expected)
 def testRaggedMatrix(self, encoding):
   """Encodes a ragged matrix; the result op must be a plain Tensor."""
   rows = [[72, 195, 108, 108, 111], [87, 128516, 114, 108, 100, 46]]
   ragged_input = ragged_factory_ops.constant(rows, np.int32)
   expected = [
       text.encode(encoding)
       for text in (u"H\xc3llo", u"W\U0001f604rld.")
   ]
   encode_op = ragged_string_ops.unicode_encode(ragged_input, encoding)
   with self.cached_session():
     actual = encode_op.eval()
     self.assertIsInstance(encode_op, ops.Tensor)
     self.assertAllEqual(actual, expected)
 def test3DimMatrixWithRagged2ndAnd3rdDim(self, encoding):
   """Encodes a 3-level ragged input; expects a result with ragged_rank 1."""
   nested_codepoints = [
       [[72, 101, 108, 108, 111], [87, 111, 114, 108, 100, 46]],
       [],
       [[128516]],
   ]
   ragged_input = ragged_factory_ops.constant(nested_codepoints, np.int32)
   words = [[u"Hello", u"World."], [], [u"\U0001f604"]]
   expected = [[word.encode(encoding) for word in row] for row in words]
   encode_op = ragged_string_ops.unicode_encode(ragged_input, encoding)
   with self.cached_session():
     actual = encode_op.eval()
     self.assertEqual(encode_op.ragged_rank, 1)
     self.assertAllEqual(actual.tolist(), expected)
 def test4DimRaggedMatrix(self, encoding):
   """Encodes a 4-level ragged input; expects a result with ragged_rank 2."""
   enc = lambda text: text.encode(encoding)

   nested_codepoints = [
       [[[72, 101, 108, 108, 111], [87, 111, 114, 108, 100]]],
       [[[]], [[72, 121, 112, 101]]],
   ]
   ragged_input = ragged_factory_ops.constant(nested_codepoints, np.int32)
   # The empty innermost row is expected to encode to an empty string.
   expected = [
       [[enc(u"Hello"), enc(u"World")]],
       [[enc(u"")], [enc(u"Hype")]],
   ]
   encode_op = ragged_string_ops.unicode_encode(ragged_input, encoding)
   with self.cached_session():
     actual = encode_op.eval()
     self.assertEqual(encode_op.ragged_rank, 2)
     self.assertAllEqual(actual.tolist(), expected)
 def test3DimMatrixWithRagged3rdDim(self, encoding):
   """Encodes a 3-level input whose innermost rows vary in length."""
   nested_codepoints = [
       [[72, 101, 108, 108, 111], [87, 111, 114, 108, 100, 46]],
       [[68, 111, 110, 39, 116], [119, 195, 114, 114, 121, 44, 32, 98, 101]],
       [[0x1F604], []],
   ]
   ragged_input = ragged_factory_ops.constant(nested_codepoints, np.int32)
   # The trailing empty row is expected to encode to an empty string.
   words = [
       [u"Hello", u"World."],
       [u"Don't", u"w\xc3rry, be"],
       [u"\U0001f604", u""],
   ]
   expected = [[word.encode(encoding) for word in row] for row in words]
   encoded = ragged_string_ops.unicode_encode(ragged_input, encoding)
   self.assertAllEqual(encoded, expected)
Exemple #25
0
 def testRaggedMatrix(self, encoding):
     """Encodes a ragged matrix; the result op must be a plain Tensor."""

     def enc(text):
         return text.encode(encoding)

     short_row = [72, 195, 108, 108, 111]
     long_row = [87, 128516, 114, 108, 100, 46]
     ragged_input = ragged_factory_ops.constant(
         [short_row, long_row], np.int32)
     expected = [enc(u"H\xc3llo"), enc(u"W\U0001f604rld.")]
     encode_op = ragged_string_ops.unicode_encode(ragged_input, encoding)
     with self.cached_session():
         actual = encode_op.eval()
         self.assertIsInstance(encode_op, ops.Tensor)
         self.assertAllEqual(actual, expected)
Exemple #26
0
 def test3DimMatrixWithRagged2ndAnd3rdDim(self, encoding):
     """Encodes a 3-level ragged input; expects a result with ragged_rank 1."""

     def enc(text):
         return text.encode(encoding)

     nested_codepoints = [
         [[72, 101, 108, 108, 111], [87, 111, 114, 108, 100, 46]],
         [],
         [[128516]],
     ]
     ragged_input = ragged_factory_ops.constant(nested_codepoints, np.int32)
     expected = [
         [enc(u"Hello"), enc(u"World.")],
         [],
         [enc(u"\U0001f604")],
     ]
     encode_op = ragged_string_ops.unicode_encode(ragged_input, encoding)
     with self.cached_session():
         actual = encode_op.eval()
         self.assertEqual(encode_op.ragged_rank, 1)
         self.assertAllEqual(actual.tolist(), expected)
Exemple #27
0
 def test4DimRaggedMatrix(self, encoding):
     """Encodes a 4-level ragged input; expects a result with ragged_rank 2."""
     nested_codepoints = [
         [[[72, 101, 108, 108, 111], [87, 111, 114, 108, 100]]],
         [[[]], [[72, 121, 112, 101]]],
     ]
     ragged_input = ragged_factory_ops.constant(nested_codepoints, np.int32)
     # Same nesting as the input minus the innermost (code point) level.
     words = [[[u"Hello", u"World"]], [[u""], [u"Hype"]]]
     expected = [
         [[word.encode(encoding) for word in row] for row in plane]
         for plane in words
     ]
     encode_op = ragged_string_ops.unicode_encode(ragged_input, encoding)
     with self.cached_session():
         actual = encode_op.eval()
         self.assertEqual(encode_op.ragged_rank, 2)
         self.assertAllEqual(actual.tolist(), expected)
 def test3DimMatrix(self, encoding):
   """Encodes a dense 3-D tensor; the result op must be a plain Tensor."""
   enc = lambda text: text.encode(encoding)

   codepoint_grid = [
       [[72, 101, 108, 108, 111], [87, 111, 114, 108, 100]],
       [[102, 105, 120, 101, 100], [119, 111, 114, 100, 115]],
       [[72, 121, 112, 101, 114], [99, 117, 98, 101, 46]],
   ]
   dense_input = constant_op.constant(codepoint_grid, np.int32)
   expected = [
       [enc(u"Hello"), enc(u"World")],
       [enc(u"fixed"), enc(u"words")],
       [enc(u"Hyper"), enc(u"cube.")],
   ]
   encode_op = ragged_string_ops.unicode_encode(dense_input, encoding)
   with self.cached_session():
     actual = encode_op.eval()
     self.assertIsInstance(encode_op, ops.Tensor)
     self.assertAllEqual(actual, expected)
Exemple #29
0
    def testReplaceErrors(self, encoding):
        """Checks the "replace" error mode of unicode_encode for `encoding`.

        The input mixes valid code points with 2147483647 and -1, which the
        expectations below treat as code points that get replaced.
        """
        codepoints_with_errors = [72, 101, 2147483647, -1, 111]
        # Each case: (positional args after `encoding`, expected text).
        cases = (
            # Explicit "replace" mode with the default replacement char.
            (("replace",), u"He\U0000fffd\U0000fffdo"),
            # Custom replacement character (111).
            (("replace", 111), u"Heooo"),
            # No error mode given: verifies "replace" is the default.
            ((), u"He\U0000fffd\U0000fffdo"),
        )
        for extra_args, expected_text in cases:
            codepoints = np.array(codepoints_with_errors, np.int32)
            expected_bytes = expected_text.encode(encoding)
            encode_op = ragged_string_ops.unicode_encode(
                codepoints, encoding, *extra_args)
            with self.cached_session():
                actual = encode_op.eval()
                self.assertIsInstance(actual, bytes)
                self.assertAllEqual(actual, expected_bytes)

        # The replacement char itself must be within range: 1114112 is
        # expected to fail when the op is evaluated.
        codepoints = np.array(codepoints_with_errors, np.int32)
        out_of_range_op = ragged_string_ops.unicode_encode(
            codepoints, encoding, "replace", 1114112)
        with self.cached_session():
            with self.assertRaises(errors.InvalidArgumentError):
                out_of_range_op.eval()
Exemple #30
0
    def testVector(self, encoding):
        """Encodes 1-D code point vectors; each result must be bytes."""

        def check(codepoints, text):
            # Build the input, the reference bytes and the op, then evaluate.
            vector = np.array(codepoints, np.int32)
            expected_bytes = text.encode(encoding)
            encode_op = ragged_string_ops.unicode_encode(vector, encoding)
            with self.cached_session():
                actual = encode_op.eval()
                self.assertIsInstance(actual, bytes)
                self.assertAllEqual(actual, expected_bytes)

        check([72, 101, 108, 108, 111], u"Hello")
        check([72, 101, 195, 195, 128516], u"He\xc3\xc3\U0001f604")

        # Single character strings.
        check([72], u"H")
        check([128516], u"\U0001f604")
Exemple #31
0
  def detokenize(self, input, name=None):  # pylint: disable=redefined-builtin
    """Detokenizes input codepoints (integers) to UTF-8 strings.

    Args:
      input: A `RaggedTensor` or `Tensor` of codepoints (ints) with a rank of at
        least 1.
      name: Optional name for the name scope the op is created in. When
        `None`, the default scope name "UnicodeCharTokenize" is used.

    Returns:
      A N-1 dimensional string tensor of the detokenized text.
    """
    # Honor the caller-supplied `name`; the default scope name only applies
    # when `name` is None.
    with ops.name_scope(name, "UnicodeCharTokenize", [input, self]):
      input_tensor = ragged_tensor.convert_to_tensor_or_ragged_tensor(input)
      return ragged_string_ops.unicode_encode(input_tensor, "UTF-8")
  def testReplaceErrors(self, encoding):
    """Checks the "replace" error mode of unicode_encode for `encoding`."""
    codepoints_with_errors = [72, 101, 2147483647, -1, 111]

    def check(expected_text, *extra_args):
      # `extra_args` are the positional arguments that follow `encoding`.
      codepoints = np.array(codepoints_with_errors, np.int32)
      expected_bytes = expected_text.encode(encoding)
      encode_op = ragged_string_ops.unicode_encode(
          codepoints, encoding, *extra_args)
      with self.cached_session():
        actual = encode_op.eval()
        self.assertIsInstance(actual, bytes)
        self.assertAllEqual(actual, expected_bytes)

    # Explicit "replace" mode with the default replacement character.
    check(u"He\U0000fffd\U0000fffdo", "replace")
    # Custom replacement character.
    check(u"Heooo", "replace", 111)
    # No error mode given: verifies "replace" is the default.
    check(u"He\U0000fffd\U0000fffdo")

    # The replacement char itself must be within range: 1114112 is expected
    # to fail when the op is evaluated.
    codepoints = np.array(codepoints_with_errors, np.int32)
    out_of_range_op = ragged_string_ops.unicode_encode(
        codepoints, encoding, "replace", 1114112)
    with self.cached_session():
      with self.assertRaises(errors.InvalidArgumentError):
        out_of_range_op.eval()
 def testRequireParams(self):
   """Checks that calls missing required arguments raise TypeError."""
   # No arguments at all.
   with self.cached_session(), self.assertRaises(TypeError):
     ragged_string_ops.unicode_encode()
   # A single positional argument.
   with self.cached_session(), self.assertRaises(TypeError):
     ragged_string_ops.unicode_encode(72)
   # Only an `encoding` keyword argument.
   with self.cached_session(), self.assertRaises(TypeError):
     ragged_string_ops.unicode_encode(encoding="UTF-8")
 def testRequireParams(self):
     """Checks that calls missing required arguments raise TypeError."""
     # No arguments at all.
     with self.cached_session(), self.assertRaises(TypeError):
         ragged_string_ops.unicode_encode()  # pylint: disable=no-value-for-parameter
     # A single positional argument.
     with self.cached_session(), self.assertRaises(TypeError):
         ragged_string_ops.unicode_encode(72)  # pylint: disable=no-value-for-parameter
     # Only an `encoding` keyword argument.
     with self.cached_session(), self.assertRaises(TypeError):
         ragged_string_ops.unicode_encode(encoding="UTF-8")  # pylint: disable=no-value-for-parameter,unexpected-keyword-arg
 def testRequireParams(self):
   """Checks that calls missing required arguments raise TypeError."""
   # Each entry: (positional args, keyword args) of an incomplete call.
   incomplete_calls = (
       ((), {}),
       ((72,), {}),
       ((), {"encoding": "UTF-8"}),
   )
   for args, kwargs in incomplete_calls:
     with self.cached_session():
       with self.assertRaises(TypeError):
         ragged_string_ops.unicode_encode(*args, **kwargs)
Exemple #36
0
 def test4DimMatrix(self, encoding):
     """Encodes a dense 4-D tensor; the result op must be a plain Tensor."""
     codepoint_grid = [
         [[[72, 101, 108, 108, 111]], [[87, 111, 114, 108, 100]]],
         [[[102, 105, 120, 101, 100]], [[119, 111, 114, 100, 115]]],
         [[[72, 121, 112, 101, 114]], [[99, 117, 98, 101, 46]]],
     ]
     dense_input = constant_op.constant(codepoint_grid, np.int32)
     # Same nesting as the input minus the innermost (code point) level.
     words = [
         [[u"Hello"], [u"World"]],
         [[u"fixed"], [u"words"]],
         [[u"Hyper"], [u"cube."]],
     ]
     expected = [
         [[word.encode(encoding) for word in row] for row in plane]
         for plane in words
     ]
     encode_op = ragged_string_ops.unicode_encode(dense_input, encoding)
     with self.cached_session():
         actual = encode_op.eval()
         self.assertIsInstance(encode_op, ops.Tensor)
         self.assertAllEqual(actual, expected)
 def test3DimMatrixWithRagged2ndDim(self, encoding):
   """Encodes a 3-level input with 2, 1 and 3 rows per outer entry."""
   nested_codepoints = [
       [[72, 101, 108, 108, 111], [87, 111, 114, 108, 100]],
       [[102, 105, 120, 101, 100]],
       [[72, 121, 112, 101, 114], [119, 111, 114, 100, 115],
        [99, 117, 98, 101, 46]],
   ]
   ragged_input = ragged_factory_ops.constant(nested_codepoints, np.int32)
   words = [
       [u"Hello", u"World"],
       [u"fixed"],
       [u"Hyper", u"words", u"cube."],
   ]
   expected = [[word.encode(encoding) for word in row] for row in words]
   encode_op = ragged_string_ops.unicode_encode(ragged_input, encoding)
   with self.cached_session():
     actual = encode_op.eval()
     self.assertEqual(encode_op.ragged_rank, 1)
     self.assertAllEqual(actual.tolist(), expected)
 def testRaggedMatrixWithMultiDimensionInnerValues(self, encoding):
   """Encodes a ragged tensor built from multi-dimensional flat values.

   A dense 3-D constant is wrapped with two levels of row splits via
   RaggedTensor.from_nested_row_splits before being encoded.
   """
   enc = lambda text: text.encode(encoding)

   flat_values = constant_op.constant([
       [[72, 101, 108, 108, 111], [87, 111, 114, 108, 100]],
       [[102, 105, 120, 101, 100], [119, 111, 114, 100, 115]],
       [[72, 121, 112, 101, 114], [99, 117, 98, 101, 46]],
   ])
   outer_splits = constant_op.constant([0, 2, 3], dtype=np.int64)
   inner_splits = constant_op.constant([0, 1, 1, 3], dtype=np.int64)
   ragged_input = ragged_tensor.RaggedTensor.from_nested_row_splits(
       flat_values, [outer_splits, inner_splits])
   expected = [
       [[[enc(u"Hello"), enc(u"World")]], []],
       [[[enc(u"fixed"), enc(u"words")],
         [enc(u"Hyper"), enc(u"cube.")]]],
   ]
   encoded = ragged_string_ops.unicode_encode(ragged_input, encoding)
   self.assertAllEqual(encoded, expected)
    def testReplaceErrors(self, encoding):
        """Checks the "replace" error mode of unicode_encode for `encoding`."""
        codepoints_with_errors = [
            ord('H'), ord('e'), 0x7FFFFFFF, -1, ord('o')
        ]
        # Each case: (code points, positional args after `encoding`,
        # expected text).
        cases = (
            # Explicit "replace" mode with the default replacement char.
            (codepoints_with_errors, ("replace",),
             u"He\U0000fffd\U0000fffdo"),
            # Custom replacement character.
            (codepoints_with_errors, ("replace", ord('o')), u"Heooo"),
            # No error mode given: verifies "replace" is the default.
            (codepoints_with_errors, (), u"He\U0000fffd\U0000fffdo"),
            # Non-default replacement with an unpaired surrogate.
            ([0xD801], ("replace", 0x41), u"A"),
            # A noncharacter code point.
            ([0x1FFFF], ("replace", 0x41), u"A"),
        )
        for codepoints, extra_args, expected_text in cases:
            vector = np.array(codepoints, np.int32)
            expected_bytes = expected_text.encode(encoding)
            encoded = ragged_string_ops.unicode_encode(
                vector, encoding, *extra_args)
            self.assertAllEqual(encoded, expected_bytes)

        # The replacement char itself must be within range: 0x110000 is
        # expected to fail when the op is evaluated.
        vector = np.array(codepoints_with_errors, np.int32)
        out_of_range_op = ragged_string_ops.unicode_encode(
            vector, encoding, "replace", 0x110000)
        with self.assertRaises(errors.InvalidArgumentError):
            self.evaluate(out_of_range_op)
Exemple #40
0
 def test3DimMatrixWithRagged2ndDim(self, encoding):
     """Encodes a 3-level input with 2, 1 and 3 rows per outer entry."""

     def enc(text):
         return text.encode(encoding)

     nested_codepoints = [
         [[72, 101, 108, 108, 111], [87, 111, 114, 108, 100]],
         [[102, 105, 120, 101, 100]],
         [[72, 121, 112, 101, 114], [119, 111, 114, 100, 115],
          [99, 117, 98, 101, 46]],
     ]
     ragged_input = ragged_factory_ops.constant(nested_codepoints, np.int32)
     expected = [
         [enc(u"Hello"), enc(u"World")],
         [enc(u"fixed")],
         [enc(u"Hyper"), enc(u"words"), enc(u"cube.")],
     ]
     encode_op = ragged_string_ops.unicode_encode(ragged_input, encoding)
     with self.cached_session():
         actual = encode_op.eval()
         self.assertEqual(encode_op.ragged_rank, 1)
         self.assertAllEqual(actual.tolist(), expected)
 def testStrictErrors(self, encoding):
   """Checks that "strict" mode raises InvalidArgumentError on bad input."""
   invalid_codepoints = np.array([72, 101, 2147483647, -1, 111], np.int32)
   with self.cached_session(), self.assertRaises(
       errors.InvalidArgumentError):
     strict_op = ragged_string_ops.unicode_encode(
         invalid_codepoints, encoding, "strict")
     strict_op.eval()
 def f(v):
     """Returns the result of unicode_encode applied to `v` with "UTF-8"."""
     utf8_encoded = ragged_string_ops.unicode_encode(v, "UTF-8")
     return utf8_encoded