def testSplitWithSparseOutput(self, texts, expected):
  """Checks `unicode_split(...).to_sparse()` against `expected`.

  Args:
    texts: Input handed to `_nested_encode` (as UTF-8) before splitting.
    expected: Object exposing `indices`, `values` and `dense_shape`, compared
      component-wise with the produced `SparseTensor`.
  """
  encoded = np.array(_nested_encode(texts, "UTF-8"), dtype=bytes)
  split = ragged_string_ops.unicode_split(encoded, "UTF-8")
  sparse = split.to_sparse()
  self.assertIsInstance(sparse, sparse_tensor.SparseTensor)
  # Same three comparisons, in the same order, as separate assertions would do.
  for component in ("indices", "values", "dense_shape"):
    self.assertAllEqual(getattr(expected, component),
                        getattr(sparse, component))
def testSplitWithSparseOutput(self, texts, expected):
  """Verifies the sparse conversion of a `unicode_split` result.

  Args:
    texts: Input handed to `_nested_encode` (as UTF-8) before splitting.
    expected: Object whose `indices`, `values` and `dense_shape` attributes
      are compared with those of the resulting `SparseTensor`.
  """
  utf8_bytes = _nested_encode(texts, "UTF-8")
  as_array = np.array(utf8_bytes, dtype=bytes)
  actual = ragged_string_ops.unicode_split(as_array, "UTF-8").to_sparse()

  # The conversion must yield a real SparseTensor before fields are compared.
  self.assertIsInstance(actual, sparse_tensor.SparseTensor)

  want_indices, got_indices = expected.indices, actual.indices
  self.assertAllEqual(want_indices, got_indices)
  want_values, got_values = expected.values, actual.values
  self.assertAllEqual(want_values, got_values)
  want_shape, got_shape = expected.dense_shape, actual.dense_shape
  self.assertAllEqual(want_shape, got_shape)
def testSplitWithPaddedOutput(self, texts, expected, ragged_rank=None):
  """Checks the dense, padded form of a `unicode_split` result.

  Args:
    texts: Input handed to `_nested_encode` (as UTF-8).
    expected: Value converted with `np.array(..., dtype=bytes)` and compared
      against the padded tensor.
    ragged_rank: Forwarded to `ragged_factory_ops.constant_value`.
  """
  encoded = _nested_encode(texts, "UTF-8")
  ragged_input = ragged_factory_ops.constant_value(
      encoded, ragged_rank=ragged_rank, dtype=bytes)
  chars = ragged_string_ops.unicode_split(ragged_input, "UTF-8")
  # Missing positions are filled with the empty string.
  padded = chars.to_tensor(default_value="")
  expected_array = np.array(expected, dtype=bytes)
  self.assertAllEqual(expected_array, padded)
def testBasicSplit(self, texts, ragged_rank=None):
  """Compares `unicode_split` with the `_nested_splitchars` reference.

  Args:
    texts: Input handed to both `_nested_encode` and `_nested_splitchars`
      (UTF-8 in both cases).
    ragged_rank: Forwarded to `ragged_factory_ops.constant_value`.
  """
  encoded = _nested_encode(texts, "UTF-8")
  ragged_input = ragged_factory_ops.constant_value(
      encoded, ragged_rank=ragged_rank, dtype=bytes)
  actual = ragged_string_ops.unicode_split(ragged_input, "UTF-8")
  reference = _nested_splitchars(texts, "UTF-8")
  self.assertAllEqual(reference, actual)
def testDocstringExamples(self):
  """Runs `unicode_split` and `unicode_split_with_offsets` on two samples.

  Both ops must return the same per-character byte strings, and the offsets
  op must additionally report the listed offsets.
  """
  texts = [u"G\xf6\xf6dnight".encode("utf8"), u"\U0001f60a".encode("utf8")]

  # Per-character UTF-8 byte strings expected from both ops.
  expected_chars = [
      [b"G", b"\xc3\xb6", b"\xc3\xb6", b"d", b"n", b"i", b"g", b"h", b"t"],
      [b"\xf0\x9f\x98\x8a"],
  ]
  expected_offsets = [[0, 1, 3, 5, 6, 7, 8, 9, 10], [0]]

  chars_only = ragged_string_ops.unicode_split(texts, "UTF-8")
  chars_with_offsets, offsets = ragged_string_ops.unicode_split_with_offsets(
      texts, "UTF-8")

  self.assertAllEqual(chars_only, expected_chars)
  self.assertAllEqual(chars_with_offsets, expected_chars)
  self.assertAllEqual(offsets, expected_offsets)
def testDocstringExamples(self):
  """Runs `unicode_split` and `unicode_split_with_offsets` on two samples.

  Both ops must return the same per-character byte strings, and the offsets
  op must additionally report the listed offsets.
  """
  samples = (u"G\xf6\xf6dnight", u"\U0001f60a")
  texts = [sample.encode("utf8") for sample in samples]

  chars_only = ragged_string_ops.unicode_split(texts, "UTF-8")
  chars_with_offsets, offsets = ragged_string_ops.unicode_split_with_offsets(
      texts, "UTF-8")

  # Per-character UTF-8 byte strings expected from both ops.
  expected_chars = [
      [b"G", b"\xc3\xb6", b"\xc3\xb6", b"d", b"n", b"i", b"g", b"h", b"t"],
      [b"\xf0\x9f\x98\x8a"],
  ]
  self.assertRaggedEqual(chars_only, expected_chars)
  self.assertRaggedEqual(chars_with_offsets, expected_chars)
  self.assertRaggedEqual(offsets, [[0, 1, 3, 5, 6, 7, 8, 9, 10], [0]])
def tokenize(self, text_input):
  """Performs basic word tokenization for BERT.

  The text is optionally case-folded and accent-stripped (or only
  normalized), control/format characters are replaced by spaces, and the
  result is tokenized with a `UnicodeScriptTokenizer`. Tokens selected by
  `self._should_split` are replaced by their individual UTF-8 characters.

  Args:
    text_input: A `Tensor` or `RaggedTensor` of untokenized UTF-8 strings.

  Returns:
    A `RaggedTensor` of tokenized strings from text_input.
  """
  if self._lower_case:
    # Lowercase and strip accents: case-fold, decompose with NFD, then drop
    # every character matching \p{Mn}.
    folded = case_fold_utf8(text_input)
    decomposed = normalize_utf8(folded, "NFD")
    text_input = string_ops.regex_replace(decomposed, r"\p{Mn}", "")
  elif self._normalization_form is not None:
    # No lowercasing requested: apply only the configured UTF-8 normalization.
    text_input = normalize_utf8(text_input, self._normalization_form)

  # Strip out control characters (categories Cc and Cf) by turning them into
  # spaces.
  text_input = string_ops.regex_replace(text_input, r"\p{Cc}|\p{Cf}", " ")

  # For chinese and emoji characters, tokenize by unicode codepoints.
  script_tokenized = UnicodeScriptTokenizer(
      keep_whitespace=self._keep_whitespace).tokenize(text_input)
  should_split = self._should_split(script_tokenized)
  per_char = ragged_string_ops.unicode_split(script_tokenized, "UTF-8")

  # Where the condition holds take the per-character pieces; elsewhere keep
  # the original token, given an extra axis so both branches line up.
  condition = array_ops.squeeze(should_split)
  whole_tokens = array_ops.expand_dims(script_tokenized.values, axis=1)
  selected = array_ops.where(condition, x=per_char.values, y=whole_tokens)

  # Re-attach the outer row structure, then fold the per-token axis into the
  # token axis.
  regrouped = script_tokenized.with_flat_values(selected)
  return regrouped.merge_dims(-2, -1)
def testScalarSplit(self):
  """Splits one scalar UTF-8 string and expects one entry per character."""
  sample = u"仅今年前"
  text = constant_op.constant(sample.encode("UTF-8"))
  chars = ragged_string_ops.unicode_split(text, "UTF-8")
  # Reference: each Python character of the sample, encoded on its own.
  per_char = [ch.encode("UTF-8") for ch in sample]
  self.assertAllEqual(chars, per_char)
def testSplitWithDifferentEncodings(self, encoding, texts):
  """Compares `unicode_split` with `_nested_splitchars` for `encoding`.

  Args:
    encoding: Encoding name passed to the helpers and to `unicode_split`.
    texts: Input handed to `_nested_splitchars` and `_nested_encode`.
  """
  reference = _nested_splitchars(texts, encoding)
  encoded = _nested_encode(texts, encoding)
  input_tensor = constant_op.constant(encoded)
  actual = ragged_string_ops.unicode_split(input_tensor, encoding)
  self.assertAllEqual(reference, actual)
def testExceptions(self, exception=None, message=None, **args):
  """Expects `unicode_split(**args)` to raise `exception` matching `message`.

  Args:
    exception: Exception type expected while building or evaluating the op.
    message: Regular expression the exception text must match.
    **args: Keyword arguments forwarded to `unicode_split`.
  """
  with self.assertRaisesRegex(exception, message):
    # Both op construction and evaluation sit inside the context manager, so
    # an error from either one satisfies the assertion.
    split = ragged_string_ops.unicode_split(**args)
    self.evaluate(split)
def testBasicSplit(self, texts, ragged_rank=None):
  """Compares `unicode_split` with the `_nested_splitchars` reference.

  Args:
    texts: Input handed to both `_nested_encode` and `_nested_splitchars`
      (UTF-8 in both cases).
    ragged_rank: Forwarded to `ragged_factory_ops.constant_value`.
  """
  utf8_texts = _nested_encode(texts, "UTF-8")
  ragged_input = ragged_factory_ops.constant_value(
      utf8_texts, ragged_rank=ragged_rank, dtype=bytes)
  split_chars = ragged_string_ops.unicode_split(ragged_input, "UTF-8")
  reference = _nested_splitchars(texts, "UTF-8")
  self.assertRaggedEqual(reference, split_chars)
def testErrorModes(self, expected=None, **args):
  """Checks that `unicode_split(**args)` produces `expected`.

  Args:
    expected: Value the split result is compared with.
    **args: Keyword arguments forwarded to `unicode_split`.
  """
  self.assertAllEqual(expected, ragged_string_ops.unicode_split(**args))
def testScalarSplit(self):
  """Splits one scalar UTF-8 string and expects one entry per character."""
  sample = u"仅今年前"
  encoded_sample = sample.encode("UTF-8")
  split = ragged_string_ops.unicode_split(
      constant_op.constant(encoded_sample), "UTF-8")
  # Reference: each Python character of the sample, encoded on its own.
  expected_chars = [char.encode("UTF-8") for char in sample]
  self.assertAllEqual(split, expected_chars)
def testVectorSplit(self):
  """Splits a two-element string vector and checks each row's characters."""
  first, second = u"仅今年前", u"hello"
  text = constant_op.constant([first.encode("UTF-8"), b"hello"])
  chars = ragged_string_ops.unicode_split(text, "UTF-8")
  # One row per input string, one UTF-8 encoded entry per character.
  expected_chars = [
      [ch.encode("UTF-8") for ch in word] for word in (first, second)
  ]
  self.assertAllEqual(chars, expected_chars)
def testSplitWithPaddedOutput(self, texts, expected, ragged_rank=None):
  """Verifies `unicode_split(...).to_tensor(default_value="")`.

  Args:
    texts: Input handed to `_nested_encode` (as UTF-8).
    expected: Value converted with `np.array(..., dtype=bytes)` and compared
      against the padded tensor.
    ragged_rank: Forwarded to `ragged_factory_ops.constant_value`.
  """
  utf8_texts = _nested_encode(texts, "UTF-8")
  source = ragged_factory_ops.constant_value(
      utf8_texts, ragged_rank=ragged_rank, dtype=bytes)
  split_chars = ragged_string_ops.unicode_split(source, "UTF-8")
  # Densify, filling missing positions with the empty string.
  dense = split_chars.to_tensor(default_value="")
  self.assertAllEqual(np.array(expected, dtype=bytes), dense)
def testErrorModes(self, expected=None, **args):
  """Checks that `unicode_split(**args)` produces `expected`.

  Args:
    expected: Value the split result is compared with.
    **args: Keyword arguments forwarded to `unicode_split`.
  """
  self.assertRaggedEqual(expected, ragged_string_ops.unicode_split(**args))
def testVectorSplit(self):
  """Splits a two-element string vector and checks each row's characters."""
  cjk_sample = u"仅今年前"
  text = constant_op.constant([cjk_sample.encode("UTF-8"), b"hello"])
  chars = ragged_string_ops.unicode_split(text, "UTF-8")

  # Build the reference row by row: one UTF-8 encoded entry per character.
  cjk_row = [ch.encode("UTF-8") for ch in cjk_sample]
  ascii_row = [ch.encode("UTF-8") for ch in u"hello"]
  self.assertRaggedEqual(chars, [cjk_row, ascii_row])
def testExceptions(self, exception=None, message=None, **args):
  """Expects `unicode_split(**args)` to raise `exception` matching `message`.

  Args:
    exception: Exception type expected while building or evaluating the op.
    message: Regular expression the exception text must match.
    **args: Keyword arguments forwarded to `unicode_split`.
  """
  # `assertRaisesRegexp` is a deprecated unittest alias that was removed in
  # Python 3.12; `assertRaisesRegex` is the supported spelling.
  with self.assertRaisesRegex(exception, message):
    self.evaluate(ragged_string_ops.unicode_split(**args))
def testSplitWithDifferentEncodings(self, encoding, texts):
  """Compares `unicode_split` with `_nested_splitchars` for `encoding`.

  Args:
    encoding: Encoding name passed to the helpers and to `unicode_split`.
    texts: Input handed to `_nested_splitchars` and `_nested_encode`.
  """
  reference = _nested_splitchars(texts, encoding)
  encoded_input = constant_op.constant(_nested_encode(texts, encoding))
  split_chars = ragged_string_ops.unicode_split(encoded_input, encoding)
  self.assertRaggedEqual(reference, split_chars)