def testDecodeWithSparseOutput(self, texts, expected):
  """Checks `unicode_decode` output after conversion with `to_sparse()`.

  Args:
    texts: Input handed to `_nested_encode` with encoding "UTF-8"; the
      encoded value is wrapped in a numpy array of dtype `bytes`.
    expected: Object whose `indices`, `values` and `dense_shape` attributes
      are compared against the same attributes of the decoded result.
  """
  encoded = _nested_encode(texts, "UTF-8")
  input_tensor = np.array(encoded, dtype=bytes)
  decoded = ragged_string_ops.unicode_decode(input_tensor, "UTF-8")
  result = decoded.to_sparse()

  self.assertIsInstance(result, sparse_tensor.SparseTensor)
  # Components are checked in a fixed order so the first failure reported
  # is always the earliest mismatching component.
  for component in ("indices", "values", "dense_shape"):
    self.assertAllEqual(
        getattr(expected, component), getattr(result, component))
def testUnknownRankError(self):
  """Expects a ValueError when decoding a placeholder created without a shape.

  Returns immediately, without asserting anything, when executing eagerly.
  """
  if context.executing_eagerly():
    return

  unranked_input = array_ops.placeholder(dtypes.string)
  with self.assertRaisesRegex(
      ValueError, "Rank of `input` must be statically known."):
    # Both the op construction and its evaluation stay inside the context
    # manager, so an error raised at either point satisfies the assertion.
    decoded = ragged_string_ops.unicode_decode(
        unranked_input, input_encoding="UTF-8")
    self.evaluate(decoded)
def testUnknownRankError(self):
  """Expects a ValueError when decoding a placeholder created without a shape.

  The check is only performed outside eager execution; when executing
  eagerly the test returns without asserting anything.

  Raises:
    AssertionError: If `unicode_decode` (or its evaluation) does not raise
      a `ValueError` whose text matches the expected message.
  """
  if context.executing_eagerly():
    return
  s = array_ops.placeholder(dtypes.string)
  message = "Rank of `input` must be statically known."
  # `assertRaisesRegexp` is a deprecated alias that was removed from
  # `unittest` in Python 3.12; `assertRaisesRegex` is the supported name.
  with self.assertRaisesRegex(ValueError, message):
    self.evaluate(ragged_string_ops.unicode_decode(s, input_encoding="UTF-8"))
def testDecodeWithPaddedOutput(self, texts, expected, ragged_rank=None):
  """Checks `unicode_decode` output after `to_tensor(default_value=-1)`.

  Args:
    texts: Input handed to `_nested_encode` with encoding "UTF-8".
    expected: Value the padded decode result must equal.
    ragged_rank: Forwarded to `ragged_factory_ops.constant_value` when the
      encoded input is built. Defaults to None.
  """
  encoded = _nested_encode(texts, "UTF-8")
  input_tensor = ragged_factory_ops.constant_value(
      encoded, ragged_rank=ragged_rank, dtype=bytes)
  decoded = ragged_string_ops.unicode_decode(input_tensor, "UTF-8")
  # -1 is the fill value passed to `to_tensor` for the dense conversion.
  padded = decoded.to_tensor(default_value=-1)
  self.assertAllEqual(expected, padded)
def testBasicDecode(self, texts, ragged_rank=None):
  """Decodes UTF-8 encoded `texts` and compares with `_nested_codepoints`.

  Args:
    texts: Input handed both to `_nested_encode` (to build the op input)
      and to `_nested_codepoints` (to build the expected value).
    ragged_rank: Forwarded to `ragged_factory_ops.constant_value`.
      Defaults to None.
  """
  encoded = _nested_encode(texts, "UTF-8")
  input_tensor = ragged_factory_ops.constant_value(
      encoded, ragged_rank=ragged_rank, dtype=bytes)
  decoded = ragged_string_ops.unicode_decode(input_tensor, "UTF-8")
  self.assertAllEqual(_nested_codepoints(texts), decoded)
def testDocstringExamples(self):
  """Checks decoded codepoints and offsets for two fixed sample strings.

  Both `unicode_decode` and `unicode_decode_with_offsets` are run on the
  same UTF-8 encoded inputs and must yield the same codepoints.
  """
  samples = [u"G\xf6\xf6dnight", u"\U0001f60a"]
  texts = [sample.encode("utf8") for sample in samples]
  expected_codepoints = [
      [71, 246, 246, 100, 110, 105, 103, 104, 116],
      [128522],
  ]
  expected_offsets = [[0, 1, 3, 5, 6, 7, 8, 9, 10], [0]]

  codepoints1 = ragged_string_ops.unicode_decode(texts, "UTF-8")
  decoded_with_offsets = ragged_string_ops.unicode_decode_with_offsets(
      texts, "UTF-8")
  codepoints2, offsets = decoded_with_offsets

  self.assertAllEqual(codepoints1, expected_codepoints)
  self.assertAllEqual(codepoints2, expected_codepoints)
  self.assertAllEqual(offsets, expected_offsets)
def testDocstringExamples(self):
  """Checks decoded codepoints and offsets for two fixed sample strings.

  Both `unicode_decode` and `unicode_decode_with_offsets` are run on the
  same UTF-8 encoded inputs and must yield the same codepoints.
  """
  samples = [u"G\xf6\xf6dnight", u"\U0001f60a"]
  texts = [sample.encode("utf8") for sample in samples]
  expected_codepoints = [
      [71, 246, 246, 100, 110, 105, 103, 104, 116],
      [128522],
  ]
  expected_offsets = [[0, 1, 3, 5, 6, 7, 8, 9, 10], [0]]

  codepoints1 = ragged_string_ops.unicode_decode(texts, "UTF-8")
  decoded_with_offsets = ragged_string_ops.unicode_decode_with_offsets(
      texts, "UTF-8")
  codepoints2, offsets = decoded_with_offsets

  self.assertRaggedEqual(codepoints1, expected_codepoints)
  self.assertRaggedEqual(codepoints2, expected_codepoints)
  self.assertRaggedEqual(offsets, expected_offsets)
def _should_split(self, script_tokenized):
  """Builds the per-token condition used to decide whether to split.

  Args:
    script_tokenized: Object exposing `flat_values`; those values are
      decoded as UTF-8 strings (presumably a ragged tensor of tokens --
      confirm against the caller).

  Returns:
    The element-wise OR of three conditions computed from the flat tokens:
    `self._is_cjk` applied to the script ids kept by the `[:, :1]` slice,
    the `HAS_EMOJI` word shape, and the `IS_PUNCT_OR_SYMBOL` word shape.
  """
  tokens = script_tokenized.flat_values

  # Script ids are computed for every decoded codepoint, then sliced with
  # `[:, :1]` so at most the leading one per token is kept.
  codepoints = ragged_string_ops.unicode_decode(tokens, "UTF-8")
  leading_script_ids = string_ops.unicode_script(codepoints)[:, :1]
  is_cjk = self._is_cjk(leading_script_ids.flat_values)

  shapes = wordshape_ops.WordShape
  is_emoji = wordshape_ops.wordshape(tokens, shapes.HAS_EMOJI)
  is_punct = wordshape_ops.wordshape(tokens, shapes.IS_PUNCT_OR_SYMBOL)

  return is_cjk | is_emoji | is_punct
def testVectorDecode(self):
  """Decodes a two-element string constant and checks the codepoints."""
  encoded_inputs = [u"仅今年前".encode("utf-8"), b"hello"]
  text = constant_op.constant(encoded_inputs)
  chars = ragged_string_ops.unicode_decode(text, "utf-8")
  # Expected value: one list of `ord` codepoints per input string.
  expected_chars = [
      [ord(c) for c in word] for word in (u"仅今年前", u"hello")
  ]
  self.assertRaggedEqual(chars, expected_chars)
def testScalarDecode(self):
  """Decodes a single UTF-8 encoded string constant and checks codepoints."""
  word = u"仅今年前"
  text = constant_op.constant(word.encode("utf-8"))
  chars = ragged_string_ops.unicode_decode(text, "utf-8")
  expected_chars = [ord(c) for c in word]
  self.assertAllEqual(chars, expected_chars)
def testDecodeWithDifferentEncodings(self, encoding, texts):
  """Round-trips `texts` through `encoding` and checks decoded codepoints.

  Args:
    encoding: Encoding name used both by `_nested_encode` to build the
      input and by `unicode_decode` to decode it.
    texts: Input handed to `_nested_codepoints` (expected value) and to
      `_nested_encode` (op input).
  """
  expected = _nested_codepoints(texts)
  encoded = _nested_encode(texts, encoding)
  decoded = ragged_string_ops.unicode_decode(
      constant_op.constant(encoded), encoding)
  self.assertAllEqual(expected, decoded)
def testExceptions(self, exception=None, message=None, **args):
  """Expects `unicode_decode(**args)` to raise `exception` matching `message`.

  Args:
    exception: Exception class handed to `assertRaisesRegex`.
    message: Regular expression the exception text must match.
    **args: Keyword arguments forwarded unchanged to `unicode_decode`.
  """
  with self.assertRaisesRegex(exception, message):
    # Build and evaluate inside the context manager so an error raised at
    # either step satisfies the assertion.
    decoded = ragged_string_ops.unicode_decode(**args)
    self.evaluate(decoded)
def testErrorModes(self, expected=None, **args):
  """Checks that `unicode_decode(**args)` produces `expected`.

  Args:
    expected: Value the decode result is compared against.
    **args: Keyword arguments forwarded unchanged to `unicode_decode`.
  """
  decoded = ragged_string_ops.unicode_decode(**args)
  self.assertRaggedEqual(expected, decoded)
def testErrorModes(self, expected=None, **args):
  """Checks that `unicode_decode(**args)` produces `expected`.

  Args:
    expected: Value the decode result is compared against.
    **args: Keyword arguments forwarded unchanged to `unicode_decode`.
  """
  decoded = ragged_string_ops.unicode_decode(**args)
  self.assertAllEqual(expected, decoded)
def testDecodeWithDifferentEncodings(self, encoding, texts):
  """Round-trips `texts` through `encoding` and checks decoded codepoints.

  Args:
    encoding: Encoding name used both by `_nested_encode` to build the
      input and by `unicode_decode` to decode it.
    texts: Input handed to `_nested_codepoints` (expected value) and to
      `_nested_encode` (op input).
  """
  expected = _nested_codepoints(texts)
  encoded = _nested_encode(texts, encoding)
  decoded = ragged_string_ops.unicode_decode(
      constant_op.constant(encoded), encoding)
  self.assertRaggedEqual(expected, decoded)
def testVectorDecode(self):
  """Decodes a two-element string constant and checks the codepoints."""
  encoded_inputs = [u"仅今年前".encode("utf-8"), b"hello"]
  text = constant_op.constant(encoded_inputs)
  chars = ragged_string_ops.unicode_decode(text, "utf-8")
  # Expected value: one list of `ord` codepoints per input string.
  expected_chars = [
      [ord(c) for c in word] for word in (u"仅今年前", u"hello")
  ]
  self.assertAllEqual(chars, expected_chars)
def testExceptions(self, exception=None, message=None, **args):
  """Expects `unicode_decode(**args)` to raise `exception` matching `message`.

  Args:
    exception: Exception class the call is expected to raise.
    message: Regular expression the exception text must match.
    **args: Keyword arguments forwarded unchanged to `unicode_decode`.

  Raises:
    AssertionError: If no matching exception is raised while building or
      evaluating the decode op.
  """
  # `assertRaisesRegexp` is a deprecated alias that was removed from
  # `unittest` in Python 3.12; `assertRaisesRegex` is the supported name.
  with self.assertRaisesRegex(exception, message):
    self.evaluate(ragged_string_ops.unicode_decode(**args))
def testBasicDecode(self, texts, ragged_rank=None):
  """Decodes UTF-8 encoded `texts` and compares with `_nested_codepoints`.

  Args:
    texts: Input handed both to `_nested_encode` (to build the op input)
      and to `_nested_codepoints` (to build the expected value).
    ragged_rank: Forwarded to `ragged_factory_ops.constant_value`.
      Defaults to None.
  """
  encoded = _nested_encode(texts, "UTF-8")
  input_tensor = ragged_factory_ops.constant_value(
      encoded, ragged_rank=ragged_rank, dtype=bytes)
  decoded = ragged_string_ops.unicode_decode(input_tensor, "UTF-8")
  self.assertRaggedEqual(_nested_codepoints(texts), decoded)