def test_normalize_nfd(self):
  txt = [u"\u1e9b\u0323"]
  expected = [
      u"\u017f\u0323\u0307".encode("utf-8"),
  ]
  self.assertAllEqual(expected, normalize_ops.normalize_utf8(txt, "NFD"))
  self.assertAllEqual(expected, normalize_ops.normalize_utf8(txt, "nfd"))
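# Supplementary sketch (not part of the original suite): the expected bytes
# above can be cross-checked against Python's own `unicodedata` module. Under
# NFD, U+1E9B decomposes to U+017F U+0307, and canonical ordering then moves
# the dot below (U+0323, combining class 220) before the dot above (U+0307,
# combining class 230).
def test_normalize_nfd_matches_unicodedata(self):
  import unicodedata
  decomposed = unicodedata.normalize("NFD", u"\u1e9b\u0323")
  self.assertEqual(u"\u017f\u0323\u0307", decomposed)
  self.assertAllEqual([decomposed.encode("utf-8")],
                      normalize_ops.normalize_utf8([u"\u1e9b\u0323"], "NFD"))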
def tokenize_with_offsets(self, text_input):
  """Performs basic word tokenization for BERT, returning tokens and offsets.

  Args:
    text_input: A `Tensor` or `RaggedTensor` of untokenized UTF-8 strings.

  Returns:
    A tuple of `RaggedTensor`s: the tokenized strings from `text_input`,
    followed by the begin and end byte offsets of each token within the
    input strings.
  """
  # Lowercase and strip accents (if the option is set).
  if self._lower_case:
    text_input = case_fold_utf8(text_input)
    text_input = normalize_utf8(text_input, "NFD")
    text_input = string_ops.regex_replace(text_input, r"\p{Mn}", "")
  else:
    # UTF-8 normalization.
    if self._normalization_form is not None:
      text_input = normalize_utf8(text_input, self._normalization_form)

  # Strip out control characters.
  text_input = string_ops.regex_replace(text_input, r"\p{Cc}|\p{Cf}", " ")
  return regex_split_ops.regex_split_with_offsets(
      text_input, _DELIM_REGEX_PATTERN, self._keep_delim_regex_pattern,
      "BertBasicTokenizer")
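# Usage sketch (illustrative; assumes `tokenizer` is an instance of the
# enclosing basic-tokenizer class):
#
#   tokens, starts, ends = tokenizer.tokenize_with_offsets(
#       tf.constant(["hello there"]))
#   # tokens -> <tf.RaggedTensor [[b'hello', b'there']]>
#   # starts -> [[0, 6]], ends -> [[5, 11]]  (byte offsets into the input)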
def test_normalize_nfkc_ragged(self):
  txt = ragged_factory_ops.constant([[[u"\u1e9b\u0323 \ufb01"], []],
                                     [[u"\u1e9b\u0323", u"\ufb01"]]])
  expected = [[[u"ṩ fi".encode("utf-8")], []],
              [[u"ṩ".encode("utf-8"), b"fi"]]]
  self.assertRaggedEqual(expected,
                         normalize_ops.normalize_utf8(txt, "NFKC"))
  self.assertRaggedEqual(expected,
                         normalize_ops.normalize_utf8(txt, "nfkc"))
def test_normalize_nfc(self):
  txt = [
      u"\u1e9b\u0323",
  ]
  expected = [
      u"\u1e9b\u0323".encode("utf-8"),
  ]
  self.assertAllEqual(expected, normalize_ops.normalize_utf8(txt, "NFC"))
  self.assertAllEqual(expected, normalize_ops.normalize_utf8(txt, "nfc"))
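# Note (supplementary): NFC leaves this input unchanged because composition
# recombines U+017F + U+0307 back into U+1E9B, while the dot below (U+0323)
# has no precomposed pairing with the long s.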
def test_normalize_nfkc_batch(self):
  txt = [
      u"\u1e9b\u0323",
      u"\ufb01",
  ]
  expected = [
      b"\xe1\xb9\xa9",
      b"fi",
  ]
  self.assertAllEqual(expected, normalize_ops.normalize_utf8(txt, u"NFKC"))
  self.assertAllEqual(expected, normalize_ops.normalize_utf8(txt, u"nfkc"))
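# Supplementary sketch (not part of the original suite): the raw byte
# literals above can be verified with `unicodedata`. NFKC both applies
# compatibility mappings (long s -> s, fi ligature -> "fi") and composes,
# so U+1E9B U+0323 becomes U+1E69 (ṩ), which is b"\xe1\xb9\xa9" in UTF-8.
def test_normalize_nfkc_matches_unicodedata(self):
  import unicodedata
  self.assertEqual(
      b"\xe1\xb9\xa9",
      unicodedata.normalize("NFKC", u"\u1e9b\u0323").encode("utf-8"))
  self.assertEqual(
      b"fi", unicodedata.normalize("NFKC", u"\ufb01").encode("utf-8"))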
def tokenize(self, text_input):
  """Performs basic word tokenization for BERT.

  Args:
    text_input: A `Tensor` or `RaggedTensor` of untokenized UTF-8 strings.

  Returns:
    A `RaggedTensor` of tokenized strings from `text_input`.
  """
  # Lowercase and strip accents (if the option is set).
  if self._lower_case:
    text_input = case_fold_utf8(text_input)
    text_input = normalize_utf8(text_input, "NFD")
    text_input = string_ops.regex_replace(text_input, r"\p{Mn}", "")
  else:
    # UTF-8 normalization.
    if self._normalization_form is not None:
      text_input = normalize_utf8(text_input, self._normalization_form)

  # Strip out control characters.
  text_input = string_ops.regex_replace(text_input, r"\p{Cc}|\p{Cf}", " ")

  # For Chinese and emoji characters, tokenize by unicode codepoints.
  unicode_tokenizer = UnicodeScriptTokenizer(
      keep_whitespace=self._keep_whitespace)
  script_tokenized = unicode_tokenizer.tokenize(text_input)
  split_cond = self._should_split(script_tokenized)
  unicode_char_split = ragged_string_ops.unicode_split(
      script_tokenized, "UTF-8")
  unicode_split_tokens = array_ops.where(
      array_ops.squeeze(split_cond),
      y=array_ops.expand_dims(script_tokenized.values, axis=1),
      x=unicode_char_split.values)
  final_tokens = script_tokenized.with_flat_values(unicode_split_tokens)
  return final_tokens.merge_dims(-2, -1)
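# Usage sketch (illustrative; assumes `tokenizer` is an instance of the
# enclosing basic-tokenizer class with default options). Scripts that the
# `_should_split` predicate flags (e.g. CJK) are split per codepoint:
#
#   tokenizer.tokenize(tf.constant([u"hello 你好"]))
#   # -> something like <tf.RaggedTensor [[b'hello', b'\xe4\xbd\xa0',
#   #                                      b'\xe5\xa5\xbd']]>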
def test_unknown_normalization_form(self):
  with self.assertRaises(errors.InvalidArgumentError):
    bomb = normalize_ops.normalize_utf8(["cant readme", "wont read me"],
                                        "cantfindme")
    self.evaluate(bomb)
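# Note (supplementary): the accepted normalization forms are the four
# standard Unicode ones, "NFC", "NFD", "NFKC", and "NFKD", matched
# case-insensitively (see the lowercase variants in the tests above);
# anything else raises `InvalidArgumentError`, as exercised here.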
def lower_case(self, text_input):
  """Lower-cases the `text_input`."""
  text_input = case_fold_utf8(text_input)
  text_input = normalize_utf8(text_input, "NFD")
  text_input = string_ops.regex_replace(text_input, r"\p{Mn}", "")
  return text_input
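# Usage sketch (illustrative): the NFD pass splits off combining marks so
# the `\p{Mn}` replace can strip accents after case folding:
#
#   tokenizer.lower_case(tf.constant([u"Héllo"]))
#   # -> tensor equal to [b"hello"]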