Beispiel #1
0
 def testDecodeWithSparseOutput(self, texts, expected):
   """Decodes UTF-8 `texts` and compares the sparse result to `expected`.

   Args:
     texts: Input handed to `_nested_encode` with encoding "UTF-8".
     expected: Object exposing `indices`, `values` and `dense_shape`.
   """
   encoded = _nested_encode(texts, "UTF-8")
   decoded = ragged_string_ops.unicode_decode(
       np.array(encoded, dtype=bytes), "UTF-8")
   sparse_result = decoded.to_sparse()
   self.assertIsInstance(sparse_result, sparse_tensor.SparseTensor)
   # Check the three sparse components in a fixed order.
   for component in ("indices", "values", "dense_shape"):
     self.assertAllEqual(
         getattr(expected, component), getattr(sparse_result, component))
Beispiel #2
0
 def testUnknownRankError(self):
   """Checks that decoding a shapeless string placeholder raises ValueError.

   Nothing is asserted when executing eagerly.
   """
   if not context.executing_eagerly():
     shapeless_input = array_ops.placeholder(dtypes.string)
     expected_error = "Rank of `input` must be statically known."
     with self.assertRaisesRegex(ValueError, expected_error):
       decoded = ragged_string_ops.unicode_decode(
           shapeless_input, input_encoding="UTF-8")
       self.evaluate(decoded)
 def testDecodeWithSparseOutput(self, texts, expected):
   """Verifies the `to_sparse()` form of decoded UTF-8 `texts`.

   Args:
     texts: Input handed to `_nested_encode` with encoding "UTF-8".
     expected: Object whose `indices`, `values` and `dense_shape` are
       compared against the decoded sparse tensor.
   """
   byte_array = np.array(_nested_encode(texts, "UTF-8"), dtype=bytes)
   ragged_codepoints = ragged_string_ops.unicode_decode(byte_array, "UTF-8")
   as_sparse = ragged_codepoints.to_sparse()

   self.assertIsInstance(as_sparse, sparse_tensor.SparseTensor)
   self.assertAllEqual(expected.indices, as_sparse.indices)
   self.assertAllEqual(expected.values, as_sparse.values)
   self.assertAllEqual(expected.dense_shape, as_sparse.dense_shape)
 def testUnknownRankError(self):
   """Checks that decoding a shapeless string placeholder raises ValueError.

   The check only runs when not executing eagerly; in eager mode the test
   returns without asserting anything.
   """
   if context.executing_eagerly():
     return
   s = array_ops.placeholder(dtypes.string)
   message = "Rank of `input` must be statically known."
   # `assertRaisesRegexp` is a deprecated alias that was removed in
   # Python 3.12; `assertRaisesRegex` is the supported spelling.
   with self.assertRaisesRegex(ValueError, message):
     self.evaluate(ragged_string_ops.unicode_decode(s, input_encoding="UTF-8"))
Beispiel #5
0
 def testDecodeWithPaddedOutput(self, texts, expected, ragged_rank=None):
   """Decodes UTF-8 `texts` and compares the padded dense tensor to `expected`.

   Args:
     texts: Input handed to `_nested_encode` with encoding "UTF-8".
     expected: Value the padded result must equal.
     ragged_rank: Forwarded to `ragged_factory_ops.constant_value`.
   """
   encoded = _nested_encode(texts, "UTF-8")
   input_tensor = ragged_factory_ops.constant_value(
       encoded, ragged_rank=ragged_rank, dtype=bytes)
   decoded = ragged_string_ops.unicode_decode(input_tensor, "UTF-8")
   # Missing positions are filled with -1 when densifying.
   padded = decoded.to_tensor(default_value=-1)
   self.assertAllEqual(expected, padded)
Beispiel #6
0
 def testBasicDecode(self, texts, ragged_rank=None):
   """Decodes UTF-8 `texts` and compares against `_nested_codepoints(texts)`.

   Args:
     texts: Input handed to `_nested_encode` and `_nested_codepoints`.
     ragged_rank: Forwarded to `ragged_factory_ops.constant_value`.
   """
   encoded = _nested_encode(texts, "UTF-8")
   input_tensor = ragged_factory_ops.constant_value(
       encoded, ragged_rank=ragged_rank, dtype=bytes)
   decoded = ragged_string_ops.unicode_decode(input_tensor, "UTF-8")
   self.assertAllEqual(_nested_codepoints(texts), decoded)
Beispiel #7
0
 def testDocstringExamples(self):
   """Checks `unicode_decode` and `unicode_decode_with_offsets` on two strings."""
   texts = [
       u"G\xf6\xf6dnight".encode("utf8"),
       u"\U0001f60a".encode("utf8"),
   ]
   expected_codepoints = [
       [71, 246, 246, 100, 110, 105, 103, 104, 116],
       [128522],
   ]
   expected_offsets = [[0, 1, 3, 5, 6, 7, 8, 9, 10], [0]]

   codepoints1 = ragged_string_ops.unicode_decode(texts, "UTF-8")
   decoded_with_offsets = ragged_string_ops.unicode_decode_with_offsets(
       texts, "UTF-8")
   codepoints2, offsets = decoded_with_offsets

   # Both ops must produce the same codepoints for the same input.
   self.assertAllEqual(codepoints1, expected_codepoints)
   self.assertAllEqual(codepoints2, expected_codepoints)
   self.assertAllEqual(offsets, expected_offsets)
 def testDocstringExamples(self):
   """Runs both decode ops on two sample strings and checks their outputs."""
   samples = [u"G\xf6\xf6dnight", u"\U0001f60a"]
   texts = [sample.encode("utf8") for sample in samples]

   plain_codepoints = ragged_string_ops.unicode_decode(texts, "UTF-8")
   offset_codepoints, offsets = (
       ragged_string_ops.unicode_decode_with_offsets(texts, "UTF-8"))

   want_codepoints = [[71, 246, 246, 100, 110, 105, 103, 104, 116], [128522]]
   # The same codepoints are expected from both ops.
   self.assertRaggedEqual(plain_codepoints, want_codepoints)
   self.assertRaggedEqual(offset_codepoints, want_codepoints)
   self.assertRaggedEqual(offsets, [[0, 1, 3, 5, 6, 7, 8, 9, 10], [0]])
Beispiel #9
0
  def _should_split(self, script_tokenized):
    """Returns the split condition for the flat tokens of `script_tokenized`.

    The condition is the element-wise OR of three checks on
    `script_tokenized.flat_values`: the `_is_cjk` result for the leading
    script id, the HAS_EMOJI wordshape and the IS_PUNCT_OR_SYMBOL wordshape.

    Args:
      script_tokenized: Object exposing `flat_values` holding the tokens.

    Returns:
      `is_cjk | is_emoji | is_punct` computed over the flat tokens.
    """
    flat_tokens = script_tokenized.flat_values

    # Keep at most the first script id of every token.
    codepoints = ragged_string_ops.unicode_decode(flat_tokens, "UTF-8")
    leading_script_ids = string_ops.unicode_script(codepoints)[:, :1]
    is_cjk = self._is_cjk(leading_script_ids.flat_values)

    shapes = wordshape_ops.WordShape
    is_emoji = wordshape_ops.wordshape(flat_tokens, shapes.HAS_EMOJI)
    is_punct = wordshape_ops.wordshape(flat_tokens, shapes.IS_PUNCT_OR_SYMBOL)

    return is_cjk | is_emoji | is_punct
 def testVectorDecode(self):
   """Decodes a two-element string vector and checks each row's codepoints."""
   text = constant_op.constant([u"仅今年前".encode("utf-8"), b"hello"])
   decoded = ragged_string_ops.unicode_decode(text, "utf-8")
   # One row of codepoints per input string.
   want = [[ord(ch) for ch in word] for word in (u"仅今年前", u"hello")]
   self.assertRaggedEqual(decoded, want)
 def testDecodeWithPaddedOutput(self, texts, expected, ragged_rank=None):
   """Checks the dense, -1-padded form of decoded UTF-8 `texts`.

   Args:
     texts: Input handed to `_nested_encode` with encoding "UTF-8".
     expected: Value the padded tensor is compared against.
     ragged_rank: Forwarded to `ragged_factory_ops.constant_value`.
   """
   utf8_bytes = _nested_encode(texts, "UTF-8")
   ragged_input = ragged_factory_ops.constant_value(
       utf8_bytes, ragged_rank=ragged_rank, dtype=bytes)
   codepoints = ragged_string_ops.unicode_decode(ragged_input, "UTF-8")
   dense = codepoints.to_tensor(default_value=-1)
   self.assertAllEqual(expected, dense)
Beispiel #12
0
 def testScalarDecode(self):
   """Decodes one UTF-8 string constant and checks its codepoints."""
   sample = u"仅今年前"
   encoded = constant_op.constant(sample.encode("utf-8"))
   decoded = ragged_string_ops.unicode_decode(encoded, "utf-8")
   want = [ord(ch) for ch in sample]
   self.assertAllEqual(decoded, want)
Beispiel #13
0
 def testDecodeWithDifferentEncodings(self, encoding, texts):
   """Encodes `texts` with `encoding`, decodes them and checks the codepoints.

   Args:
     encoding: Encoding name used for both `_nested_encode` and the decode op.
     texts: Input handed to `_nested_encode` and `_nested_codepoints`.
   """
   want = _nested_codepoints(texts)
   encoded = _nested_encode(texts, encoding)
   decoded = ragged_string_ops.unicode_decode(
       constant_op.constant(encoded), encoding)
   self.assertAllEqual(want, decoded)
Beispiel #14
0
 def testExceptions(self, exception=None, message=None, **args):
   """Checks that `unicode_decode(**args)` raises `exception` matching `message`.

   Args:
     exception: Exception type expected from building or evaluating the op.
     message: Regular expression the exception text must match.
     **args: Keyword arguments forwarded to `unicode_decode`.
   """
   with self.assertRaisesRegex(exception, message):
     decoded = ragged_string_ops.unicode_decode(**args)
     self.evaluate(decoded)
 def testErrorModes(self, expected=None, **args):
   """Checks that `unicode_decode(**args)` yields `expected`.

   Args:
     expected: Value the decoded result is compared against.
     **args: Keyword arguments forwarded to `unicode_decode`.
   """
   self.assertRaggedEqual(expected, ragged_string_ops.unicode_decode(**args))
Beispiel #16
0
 def testErrorModes(self, expected=None, **args):
   """Decodes with the given keyword arguments and compares to `expected`.

   Args:
     expected: Value the decoded result must equal.
     **args: Keyword arguments forwarded to `unicode_decode`.
   """
   decoded = ragged_string_ops.unicode_decode(**args)
   self.assertAllEqual(
       expected,
       decoded,
   )
 def testDecodeWithDifferentEncodings(self, encoding, texts):
   """Round-trips `texts` through `encoding` and checks the decoded codepoints.

   Args:
     encoding: Encoding name passed to `_nested_encode` and `unicode_decode`.
     texts: Input handed to `_nested_encode` and `_nested_codepoints`.
   """
   codepoints_wanted = _nested_codepoints(texts)
   encoded_constant = constant_op.constant(_nested_encode(texts, encoding))
   codepoints_got = ragged_string_ops.unicode_decode(encoded_constant, encoding)
   self.assertRaggedEqual(codepoints_wanted, codepoints_got)
Beispiel #18
0
 def testVectorDecode(self):
   """Decodes a vector of two UTF-8 strings and checks the codepoints."""
   first = u"仅今年前"
   text = constant_op.constant([first.encode("utf-8"), b"hello"])
   chars = ragged_string_ops.unicode_decode(text, "utf-8")
   first_row = [ord(ch) for ch in first]
   second_row = [ord(ch) for ch in u"hello"]
   self.assertAllEqual(chars, [first_row, second_row])
 def testExceptions(self, exception=None, message=None, **args):
   """Checks that `unicode_decode(**args)` raises `exception` matching `message`.

   Args:
     exception: Exception type expected from building or evaluating the op.
     message: Regular expression the exception text must match.
     **args: Keyword arguments forwarded to `unicode_decode`.
   """
   # `assertRaisesRegexp` is a deprecated alias that was removed in
   # Python 3.12; `assertRaisesRegex` is the supported spelling.
   with self.assertRaisesRegex(exception, message):
     self.evaluate(ragged_string_ops.unicode_decode(**args))
 def testBasicDecode(self, texts, ragged_rank=None):
   """Checks that decoding UTF-8 `texts` matches `_nested_codepoints(texts)`.

   Args:
     texts: Input handed to `_nested_encode` and `_nested_codepoints`.
     ragged_rank: Forwarded to `ragged_factory_ops.constant_value`.
   """
   utf8_bytes = _nested_encode(texts, "UTF-8")
   ragged_input = ragged_factory_ops.constant_value(
       utf8_bytes,
       ragged_rank=ragged_rank,
       dtype=bytes)
   got = ragged_string_ops.unicode_decode(ragged_input, "UTF-8")
   want = _nested_codepoints(texts)
   self.assertRaggedEqual(want, got)
 def testScalarDecode(self):
   """Checks the codepoints decoded from a single UTF-8 string constant."""
   utf8_bytes = u"仅今年前".encode("utf-8")
   chars = ragged_string_ops.unicode_decode(
       constant_op.constant(utf8_bytes), "utf-8")
   self.assertAllEqual(chars, list(map(ord, u"仅今年前")))